[AMDGPU] Disable generic DAG combines at -O0 to preserve debuggability. (#176304)
Disable generic DAG combines for AMDGPU at -O0 via disableGenericCombines() to preserve instructions that users may want to set breakpoints on during debugging.

Assisted-by: Cursor / Claude Opus 4.6
This commit is contained in:
parent
c5363f2166
commit
6bf794a02a
@ -15,7 +15,7 @@
|
||||
// RUN: -disable-llvm-passes -o - %s \
|
||||
// RUN: | FileCheck -check-prefixes=COMMON,NV-ON %s
|
||||
// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
|
||||
// RUN: -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \
|
||||
// RUN: -O1 -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \
|
||||
// RUN: | FileCheck -check-prefixes=COMMON,AMD-ON %s
|
||||
// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
|
||||
// RUN: -O3 -o - %s \
|
||||
@ -44,7 +44,7 @@
|
||||
// RUN: -ffp-contract=fast -disable-llvm-passes -o - %s \
|
||||
// RUN: | FileCheck -check-prefixes=COMMON,NV-ON %s
|
||||
// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
|
||||
// RUN: -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \
|
||||
// RUN: -O1 -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \
|
||||
// RUN: -ffp-contract=fast \
|
||||
// RUN: | FileCheck -check-prefixes=COMMON,AMD-ON %s
|
||||
// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
|
||||
@ -79,7 +79,7 @@
|
||||
// RUN: -ffp-contract=fast-honor-pragmas -disable-llvm-passes -o - %s \
|
||||
// RUN: | FileCheck -check-prefixes=COMMON,NV-ON %s
|
||||
// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
|
||||
// RUN: -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \
|
||||
// RUN: -O1 -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \
|
||||
// RUN: -ffp-contract=fast-honor-pragmas \
|
||||
// RUN: | FileCheck -check-prefixes=COMMON,AMD-ON %s
|
||||
// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
|
||||
@ -117,7 +117,7 @@
|
||||
// RUN: -ffp-contract=on -disable-llvm-passes -o - %s \
|
||||
// RUN: | FileCheck -check-prefixes=COMMON,NV-ON %s
|
||||
// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -S \
|
||||
// RUN: -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \
|
||||
// RUN: -O1 -target-cpu gfx906 -disable-llvm-passes -o - -x hip %s \
|
||||
// RUN: -ffp-contract=on \
|
||||
// RUN: | FileCheck -check-prefixes=COMMON,AMD-ON %s
|
||||
// RUN: %clang_cc1 -fcuda-is-device -triple nvptx-nvidia-cuda -S \
|
||||
@ -180,7 +180,7 @@ __host__ __device__ float func(float a, float b, float c) { return a + b * c; }
|
||||
// COMMON-LABEL: _Z4funcfff
|
||||
// NV-ON: fma.rn.f32
|
||||
// NV-ON-NEXT: st.param.b32
|
||||
// AMD-ON: v_fmac_f32_e64
|
||||
// AMD-ON: v_fmac_f32_e32
|
||||
// AMD-ON-NEXT: s_setpc_b64
|
||||
|
||||
// NV-OFF: mul.rn.f32
|
||||
|
||||
@ -50,6 +50,13 @@ public:
|
||||
|
||||
~AMDGPUSelectionDAGInfo() override;
|
||||
|
||||
bool disableGenericCombines(CodeGenOptLevel OptLevel) const override {
|
||||
// Returns true only at CodeGenOptLevel::None (-O0), turning off the generic
|
||||
// DAG combines there to preserve debuggability: combines such as constant
|
||||
// reassociation would eliminate intermediate instructions users step through.
|
||||
return OptLevel == CodeGenOptLevel::None;
|
||||
}
|
||||
|
||||
const char *getTargetNodeName(unsigned Opcode) const override;
|
||||
|
||||
void verifyTargetNode(const SelectionDAG &DAG,
|
||||
|
||||
@ -458,18 +458,15 @@ define i64 @optnone_atomicrmw_add_i64_expand(i64 %val) #1 {
|
||||
; GFX908-NEXT: s_add_u32 s6, s6, global@rel32@lo+4
|
||||
; GFX908-NEXT: s_addc_u32 s7, s7, global@rel32@hi+12
|
||||
; GFX908-NEXT: s_cmp_eq_u32 s7, s5
|
||||
; GFX908-NEXT: s_cselect_b64 s[4:5], -1, 0
|
||||
; GFX908-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5]
|
||||
; GFX908-NEXT: s_cselect_b64 s[6:7], -1, 0
|
||||
; GFX908-NEXT: s_mov_b64 s[4:5], -1
|
||||
; GFX908-NEXT: s_mov_b32 s6, 1
|
||||
; GFX908-NEXT: v_cmp_ne_u32_e64 s[6:7], v2, s6
|
||||
; GFX908-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5]
|
||||
; GFX908-NEXT: s_and_b64 vcc, exec, s[6:7]
|
||||
; GFX908-NEXT: ; implicit-def: $vgpr3_vgpr4
|
||||
; GFX908-NEXT: s_cbranch_vccnz .LBB4_3
|
||||
; GFX908-NEXT: .LBB4_1: ; %Flow
|
||||
; GFX908-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5]
|
||||
; GFX908-NEXT: s_mov_b32 s4, 1
|
||||
; GFX908-NEXT: v_cmp_ne_u32_e64 s[4:5], v2, s4
|
||||
; GFX908-NEXT: s_mov_b64 s[6:7], -1
|
||||
; GFX908-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7]
|
||||
; GFX908-NEXT: s_and_b64 vcc, exec, s[4:5]
|
||||
; GFX908-NEXT: s_cbranch_vccnz .LBB4_4
|
||||
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.private
|
||||
@ -507,18 +504,15 @@ define i64 @optnone_atomicrmw_add_i64_expand(i64 %val) #1 {
|
||||
; GFX90A-NEXT: s_add_u32 s6, s6, global@rel32@lo+4
|
||||
; GFX90A-NEXT: s_addc_u32 s7, s7, global@rel32@hi+12
|
||||
; GFX90A-NEXT: s_cmp_eq_u32 s7, s5
|
||||
; GFX90A-NEXT: s_cselect_b64 s[4:5], -1, 0
|
||||
; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5]
|
||||
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], -1
|
||||
; GFX90A-NEXT: s_mov_b32 s6, 1
|
||||
; GFX90A-NEXT: v_cmp_ne_u32_e64 s[6:7], v2, s6
|
||||
; GFX90A-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5]
|
||||
; GFX90A-NEXT: s_and_b64 vcc, exec, s[6:7]
|
||||
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
|
||||
; GFX90A-NEXT: s_cbranch_vccnz .LBB4_3
|
||||
; GFX90A-NEXT: .LBB4_1: ; %Flow
|
||||
; GFX90A-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5]
|
||||
; GFX90A-NEXT: s_mov_b32 s4, 1
|
||||
; GFX90A-NEXT: v_cmp_ne_u32_e64 s[4:5], v4, s4
|
||||
; GFX90A-NEXT: s_mov_b64 s[6:7], -1
|
||||
; GFX90A-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7]
|
||||
; GFX90A-NEXT: s_and_b64 vcc, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_vccnz .LBB4_4
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.private
|
||||
@ -556,23 +550,19 @@ define i64 @optnone_atomicrmw_add_i64_expand(i64 %val) #1 {
|
||||
; GFX942-NEXT: s_add_u32 s2, s2, global@rel32@lo+4
|
||||
; GFX942-NEXT: s_addc_u32 s3, s3, global@rel32@hi+12
|
||||
; GFX942-NEXT: s_cmp_eq_u32 s3, s1
|
||||
; GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; GFX942-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
|
||||
; GFX942-NEXT: s_cselect_b64 s[2:3], -1, 0
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], -1
|
||||
; GFX942-NEXT: s_mov_b32 s2, 1
|
||||
; GFX942-NEXT: v_cmp_ne_u32_e64 s[2:3], v2, s2
|
||||
; GFX942-NEXT: s_xor_b64 s[2:3], s[2:3], s[0:1]
|
||||
; GFX942-NEXT: s_and_b64 vcc, exec, s[2:3]
|
||||
; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3
|
||||
; GFX942-NEXT: s_cbranch_vccnz .LBB4_3
|
||||
; GFX942-NEXT: .LBB4_1: ; %Flow
|
||||
; GFX942-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1]
|
||||
; GFX942-NEXT: s_mov_b32 s0, 1
|
||||
; GFX942-NEXT: v_cmp_ne_u32_e64 s[0:1], v4, s0
|
||||
; GFX942-NEXT: s_mov_b64 s[2:3], -1
|
||||
; GFX942-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX942-NEXT: s_and_b64 vcc, exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_vccnz .LBB4_4
|
||||
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.private
|
||||
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX942-NEXT: s_nop 1
|
||||
; GFX942-NEXT: scratch_load_dwordx2 v[2:3], off, s0
|
||||
; GFX942-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 0, v[0:1]
|
||||
@ -603,18 +593,15 @@ define i64 @optnone_atomicrmw_add_i64_expand(i64 %val) #1 {
|
||||
; GFX1100-NEXT: s_add_u32 s2, s2, global@rel32@lo+4
|
||||
; GFX1100-NEXT: s_addc_u32 s3, s3, global@rel32@hi+12
|
||||
; GFX1100-NEXT: s_cmp_eq_u32 s3, s1
|
||||
; GFX1100-NEXT: s_cselect_b32 s0, -1, 0
|
||||
; GFX1100-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
|
||||
; GFX1100-NEXT: s_cselect_b32 s1, -1, 0
|
||||
; GFX1100-NEXT: s_mov_b32 s0, -1
|
||||
; GFX1100-NEXT: s_mov_b32 s1, 1
|
||||
; GFX1100-NEXT: v_cmp_ne_u32_e64 s1, v2, s1
|
||||
; GFX1100-NEXT: s_xor_b32 s1, s1, s0
|
||||
; GFX1100-NEXT: s_and_b32 vcc_lo, exec_lo, s1
|
||||
; GFX1100-NEXT: ; implicit-def: $vgpr3_vgpr4
|
||||
; GFX1100-NEXT: s_cbranch_vccnz .LBB4_3
|
||||
; GFX1100-NEXT: .LBB4_1: ; %Flow
|
||||
; GFX1100-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
|
||||
; GFX1100-NEXT: s_mov_b32 s0, 1
|
||||
; GFX1100-NEXT: v_cmp_ne_u32_e64 s0, v2, s0
|
||||
; GFX1100-NEXT: s_mov_b32 s1, -1
|
||||
; GFX1100-NEXT: s_xor_b32 s0, s0, s1
|
||||
; GFX1100-NEXT: s_and_b32 vcc_lo, exec_lo, s0
|
||||
; GFX1100-NEXT: s_cbranch_vccnz .LBB4_4
|
||||
; GFX1100-NEXT: ; %bb.2: ; %atomicrmw.private
|
||||
@ -658,23 +645,20 @@ define i64 @optnone_atomicrmw_add_i64_expand(i64 %val) #1 {
|
||||
; GFX1200-NEXT: s_add_co_ci_u32 s3, s3, global@rel32@hi+24
|
||||
; GFX1200-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GFX1200-NEXT: s_cmp_eq_u32 s3, s1
|
||||
; GFX1200-NEXT: s_cselect_b32 s0, -1, 0
|
||||
; GFX1200-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GFX1200-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
|
||||
; GFX1200-NEXT: s_cselect_b32 s1, -1, 0
|
||||
; GFX1200-NEXT: s_mov_b32 s0, -1
|
||||
; GFX1200-NEXT: s_mov_b32 s1, 1
|
||||
; GFX1200-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GFX1200-NEXT: v_cmp_ne_u32_e64 s1, v2, s1
|
||||
; GFX1200-NEXT: s_xor_b32 s1, s1, s0
|
||||
; GFX1200-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GFX1200-NEXT: s_and_b32 vcc_lo, exec_lo, s1
|
||||
; GFX1200-NEXT: ; implicit-def: $vgpr3_vgpr4
|
||||
; GFX1200-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GFX1200-NEXT: s_cbranch_vccnz .LBB4_3
|
||||
; GFX1200-NEXT: .LBB4_1: ; %Flow
|
||||
; GFX1200-NEXT: s_mov_b32 s1, -1
|
||||
; GFX1200-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GFX1200-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
|
||||
; GFX1200-NEXT: s_mov_b32 s0, 1
|
||||
; GFX1200-NEXT: s_xor_b32 s0, s0, s1
|
||||
; GFX1200-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GFX1200-NEXT: v_cmp_ne_u32_e64 s0, v2, s0
|
||||
; GFX1200-NEXT: s_and_b32 vcc_lo, exec_lo, s0
|
||||
; GFX1200-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GFX1200-NEXT: s_cbranch_vccnz .LBB4_4
|
||||
@ -722,11 +706,9 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 {
|
||||
; GFX908-NEXT: s_add_u32 s6, s6, global@rel32@lo+4
|
||||
; GFX908-NEXT: s_addc_u32 s7, s7, global@rel32@hi+12
|
||||
; GFX908-NEXT: s_cmp_eq_u32 s7, s5
|
||||
; GFX908-NEXT: s_cselect_b64 s[4:5], -1, 0
|
||||
; GFX908-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5]
|
||||
; GFX908-NEXT: s_cselect_b64 s[6:7], -1, 0
|
||||
; GFX908-NEXT: s_mov_b64 s[4:5], -1
|
||||
; GFX908-NEXT: s_mov_b32 s6, 1
|
||||
; GFX908-NEXT: v_cmp_ne_u32_e64 s[6:7], v2, s6
|
||||
; GFX908-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5]
|
||||
; GFX908-NEXT: s_and_b64 vcc, exec, s[6:7]
|
||||
; GFX908-NEXT: ; implicit-def: $vgpr3_vgpr4
|
||||
; GFX908-NEXT: s_cbranch_vccnz .LBB5_2
|
||||
@ -789,18 +771,15 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 {
|
||||
; GFX90A-NEXT: s_add_u32 s6, s6, global@rel32@lo+4
|
||||
; GFX90A-NEXT: s_addc_u32 s7, s7, global@rel32@hi+12
|
||||
; GFX90A-NEXT: s_cmp_eq_u32 s7, s5
|
||||
; GFX90A-NEXT: s_cselect_b64 s[4:5], -1, 0
|
||||
; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5]
|
||||
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], -1
|
||||
; GFX90A-NEXT: s_mov_b32 s6, 1
|
||||
; GFX90A-NEXT: v_cmp_ne_u32_e64 s[6:7], v2, s6
|
||||
; GFX90A-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5]
|
||||
; GFX90A-NEXT: s_and_b64 vcc, exec, s[6:7]
|
||||
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
|
||||
; GFX90A-NEXT: s_cbranch_vccnz .LBB5_3
|
||||
; GFX90A-NEXT: .LBB5_1: ; %Flow4
|
||||
; GFX90A-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5]
|
||||
; GFX90A-NEXT: s_mov_b32 s4, 1
|
||||
; GFX90A-NEXT: v_cmp_ne_u32_e64 s[4:5], v4, s4
|
||||
; GFX90A-NEXT: s_mov_b64 s[6:7], -1
|
||||
; GFX90A-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7]
|
||||
; GFX90A-NEXT: s_and_b64 vcc, exec, s[4:5]
|
||||
; GFX90A-NEXT: s_cbranch_vccnz .LBB5_10
|
||||
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.shared
|
||||
@ -812,11 +791,9 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 {
|
||||
; GFX90A-NEXT: s_add_u32 s6, s6, global@rel32@lo+4
|
||||
; GFX90A-NEXT: s_addc_u32 s7, s7, global@rel32@hi+12
|
||||
; GFX90A-NEXT: s_cmp_eq_u32 s7, s5
|
||||
; GFX90A-NEXT: s_cselect_b64 s[4:5], -1, 0
|
||||
; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5]
|
||||
; GFX90A-NEXT: s_cselect_b64 s[6:7], -1, 0
|
||||
; GFX90A-NEXT: s_mov_b64 s[4:5], -1
|
||||
; GFX90A-NEXT: s_mov_b32 s6, 1
|
||||
; GFX90A-NEXT: v_cmp_ne_u32_e64 s[6:7], v2, s6
|
||||
; GFX90A-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5]
|
||||
; GFX90A-NEXT: s_and_b64 vcc, exec, s[6:7]
|
||||
; GFX90A-NEXT: ; implicit-def: $vgpr2_vgpr3
|
||||
; GFX90A-NEXT: s_cbranch_vccnz .LBB5_5
|
||||
@ -881,18 +858,15 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 {
|
||||
; GFX942-NEXT: s_add_u32 s2, s2, global@rel32@lo+4
|
||||
; GFX942-NEXT: s_addc_u32 s3, s3, global@rel32@hi+12
|
||||
; GFX942-NEXT: s_cmp_eq_u32 s3, s1
|
||||
; GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; GFX942-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
|
||||
; GFX942-NEXT: s_cselect_b64 s[2:3], -1, 0
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], -1
|
||||
; GFX942-NEXT: s_mov_b32 s2, 1
|
||||
; GFX942-NEXT: v_cmp_ne_u32_e64 s[2:3], v2, s2
|
||||
; GFX942-NEXT: s_xor_b64 s[2:3], s[2:3], s[0:1]
|
||||
; GFX942-NEXT: s_and_b64 vcc, exec, s[2:3]
|
||||
; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3
|
||||
; GFX942-NEXT: s_cbranch_vccnz .LBB5_3
|
||||
; GFX942-NEXT: .LBB5_1: ; %Flow4
|
||||
; GFX942-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1]
|
||||
; GFX942-NEXT: s_mov_b32 s0, 1
|
||||
; GFX942-NEXT: v_cmp_ne_u32_e64 s[0:1], v4, s0
|
||||
; GFX942-NEXT: s_mov_b64 s[2:3], -1
|
||||
; GFX942-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GFX942-NEXT: s_and_b64 vcc, exec, s[0:1]
|
||||
; GFX942-NEXT: s_cbranch_vccnz .LBB5_10
|
||||
; GFX942-NEXT: ; %bb.2: ; %atomicrmw.shared
|
||||
@ -904,11 +878,9 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 {
|
||||
; GFX942-NEXT: s_add_u32 s2, s2, global@rel32@lo+4
|
||||
; GFX942-NEXT: s_addc_u32 s3, s3, global@rel32@hi+12
|
||||
; GFX942-NEXT: s_cmp_eq_u32 s3, s1
|
||||
; GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; GFX942-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
|
||||
; GFX942-NEXT: s_cselect_b64 s[2:3], -1, 0
|
||||
; GFX942-NEXT: s_mov_b64 s[0:1], -1
|
||||
; GFX942-NEXT: s_mov_b32 s2, 1
|
||||
; GFX942-NEXT: v_cmp_ne_u32_e64 s[2:3], v2, s2
|
||||
; GFX942-NEXT: s_xor_b64 s[2:3], s[2:3], s[0:1]
|
||||
; GFX942-NEXT: s_and_b64 vcc, exec, s[2:3]
|
||||
; GFX942-NEXT: ; implicit-def: $vgpr2_vgpr3
|
||||
; GFX942-NEXT: s_cbranch_vccnz .LBB5_5
|
||||
@ -971,11 +943,9 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 {
|
||||
; GFX1100-NEXT: s_add_u32 s2, s2, global@rel32@lo+4
|
||||
; GFX1100-NEXT: s_addc_u32 s3, s3, global@rel32@hi+12
|
||||
; GFX1100-NEXT: s_cmp_eq_u32 s3, s1
|
||||
; GFX1100-NEXT: s_cselect_b32 s0, -1, 0
|
||||
; GFX1100-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
|
||||
; GFX1100-NEXT: s_cselect_b32 s1, -1, 0
|
||||
; GFX1100-NEXT: s_mov_b32 s0, -1
|
||||
; GFX1100-NEXT: s_mov_b32 s1, 1
|
||||
; GFX1100-NEXT: v_cmp_ne_u32_e64 s1, v2, s1
|
||||
; GFX1100-NEXT: s_xor_b32 s1, s1, s0
|
||||
; GFX1100-NEXT: s_and_b32 vcc_lo, exec_lo, s1
|
||||
; GFX1100-NEXT: ; implicit-def: $vgpr3_vgpr4
|
||||
; GFX1100-NEXT: s_cbranch_vccnz .LBB5_2
|
||||
@ -1043,13 +1013,11 @@ define double @optnone_atomicrmw_fadd_f64_expand(double %val) #1 {
|
||||
; GFX1200-NEXT: s_add_co_ci_u32 s3, s3, global@rel32@hi+24
|
||||
; GFX1200-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GFX1200-NEXT: s_cmp_eq_u32 s3, s1
|
||||
; GFX1200-NEXT: s_cselect_b32 s0, -1, 0
|
||||
; GFX1200-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GFX1200-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
|
||||
; GFX1200-NEXT: s_cselect_b32 s1, -1, 0
|
||||
; GFX1200-NEXT: s_mov_b32 s0, -1
|
||||
; GFX1200-NEXT: s_mov_b32 s1, 1
|
||||
; GFX1200-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GFX1200-NEXT: v_cmp_ne_u32_e64 s1, v2, s1
|
||||
; GFX1200-NEXT: s_xor_b32 s1, s1, s0
|
||||
; GFX1200-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GFX1200-NEXT: s_and_b32 vcc_lo, exec_lo, s1
|
||||
; GFX1200-NEXT: ; implicit-def: $vgpr3_vgpr4
|
||||
; GFX1200-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
|
||||
@ -8,7 +8,8 @@
|
||||
; GCN-LABEL: {{^}}test_branch:
|
||||
; GCNNOOPT: v_writelane_b32
|
||||
; GCNNOOPT: v_writelane_b32
|
||||
; GCN: s_cbranch_scc1 [[END:.LBB[0-9]+_[0-9]+]]
|
||||
; GCNNOOPT: s_cbranch_vccnz [[END:.LBB[0-9]+_[0-9]+]]
|
||||
; GCNOPT: s_cbranch_scc1 [[END:.LBB[0-9]+_[0-9]+]]
|
||||
|
||||
; GCNNOOPT: v_readlane_b32
|
||||
; GCNNOOPT: v_readlane_b32
|
||||
@ -31,11 +32,11 @@ end:
|
||||
|
||||
; GCN-LABEL: {{^}}test_brcc_i1:
|
||||
; GCN: s_load_{{dword|b32}} [[VAL:s[0-9]+]]
|
||||
; GCNNOOPT: s_mov_b32 [[ONE:s[0-9]+]], 1{{$}}
|
||||
; GCNNOOPT: s_and_b32 s{{[0-9]+}}, [[VAL]], [[ONE]]
|
||||
; GCNNOOPT: s_and_b32 s{{[0-9]+}}, 1, [[VAL]]
|
||||
; GCNOPT: s_bitcmp0_b32 [[VAL]], 0
|
||||
; GCNNOOPT: s_cmp_eq_u32
|
||||
; GCN: s_cbranch_scc1 [[END:.LBB[0-9]+_[0-9]+]]
|
||||
; GCNNOOPT: s_cmp_{{eq|lg}}_u32
|
||||
; GCNNOOPT: s_cbranch_vccnz [[END:.LBB[0-9]+_[0-9]+]]
|
||||
; GCNOPT: s_cbranch_scc1 [[END:.LBB[0-9]+_[0-9]+]]
|
||||
|
||||
; GCN: buffer_store_{{dword|b32}}
|
||||
|
||||
|
||||
@ -10,8 +10,8 @@ define i32 @prolog_spill(i32 %arg0, i32 %arg1, i32 %arg2) {
|
||||
; REGALLOC-NEXT: {{ $}}
|
||||
; REGALLOC-NEXT: SI_SPILL_V32_SAVE killed $vgpr2, %stack.4, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5)
|
||||
; REGALLOC-NEXT: SI_SPILL_V32_SAVE killed $vgpr1, %stack.3, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5)
|
||||
; REGALLOC-NEXT: renamable $sgpr4 = S_MOV_B32 49
|
||||
; REGALLOC-NEXT: renamable $sgpr4_sgpr5 = V_CMP_GT_I32_e64 killed $vgpr0, killed $sgpr4, implicit $exec
|
||||
; REGALLOC-NEXT: renamable $sgpr4 = S_MOV_B32 50
|
||||
; REGALLOC-NEXT: renamable $sgpr4_sgpr5 = V_CMP_GE_I32_e64 killed $vgpr0, killed $sgpr4, implicit $exec
|
||||
; REGALLOC-NEXT: renamable $vgpr0 = IMPLICIT_DEF
|
||||
; REGALLOC-NEXT: renamable $sgpr6_sgpr7 = COPY $exec, implicit-def $exec
|
||||
; REGALLOC-NEXT: renamable $sgpr4_sgpr5 = S_AND_B64 renamable $sgpr6_sgpr7, killed renamable $sgpr4_sgpr5, implicit-def dead $scc
|
||||
|
||||
@ -40,6 +40,7 @@ define amdgpu_kernel void @test_loop(ptr addrspace(3) %ptr, i32 %n) nounwind {
|
||||
; GCN_DBG-NEXT: s_add_u32 s12, s12, s11
|
||||
; GCN_DBG-NEXT: s_addc_u32 s13, s13, 0
|
||||
; GCN_DBG-NEXT: s_load_dword s0, s[4:5], 0x9
|
||||
; GCN_DBG-NEXT: s_load_dword s1, s[4:5], 0xa
|
||||
; GCN_DBG-NEXT: ; implicit-def: $vgpr2 : SGPR spill to VGPR lane
|
||||
; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 0
|
||||
@ -47,13 +48,16 @@ define amdgpu_kernel void @test_loop(ptr addrspace(3) %ptr, i32 %n) nounwind {
|
||||
; GCN_DBG-NEXT: s_mov_b32 s0, 0
|
||||
; GCN_DBG-NEXT: s_mov_b32 s2, -1
|
||||
; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN_DBG-NEXT: s_cmp_lg_u32 s1, s2
|
||||
; GCN_DBG-NEXT: s_cmp_eq_u32 s1, s2
|
||||
; GCN_DBG-NEXT: s_cselect_b64 s[2:3], -1, 0
|
||||
; GCN_DBG-NEXT: s_mov_b64 s[4:5], -1
|
||||
; GCN_DBG-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5]
|
||||
; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3]
|
||||
; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1
|
||||
; GCN_DBG-NEXT: s_mov_b64 s[6:7], exec
|
||||
; GCN_DBG-NEXT: s_mov_b64 exec, -1
|
||||
; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1
|
||||
; GCN_DBG-NEXT: buffer_store_dword v2, off, s[12:15], 0 ; 4-byte Folded Spill
|
||||
; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; GCN_DBG-NEXT: s_cbranch_scc1 .LBB0_2
|
||||
; GCN_DBG-NEXT: s_cbranch_vccnz .LBB0_2
|
||||
; GCN_DBG-NEXT: ; %bb.1: ; %for.exit
|
||||
; GCN_DBG-NEXT: s_endpgm
|
||||
; GCN_DBG-NEXT: .LBB0_2: ; %for.body
|
||||
@ -64,11 +68,11 @@ define amdgpu_kernel void @test_loop(ptr addrspace(3) %ptr, i32 %n) nounwind {
|
||||
; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; GCN_DBG-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN_DBG-NEXT: v_readlane_b32 s0, v2, 1
|
||||
; GCN_DBG-NEXT: v_readlane_b32 s2, v2, 0
|
||||
; GCN_DBG-NEXT: s_mov_b32 s1, 2
|
||||
; GCN_DBG-NEXT: s_lshl_b32 s1, s0, s1
|
||||
; GCN_DBG-NEXT: s_add_i32 s1, s1, s2
|
||||
; GCN_DBG-NEXT: s_mov_b32 s2, 0x80
|
||||
; GCN_DBG-NEXT: v_readlane_b32 s1, v2, 0
|
||||
; GCN_DBG-NEXT: s_mov_b32 s2, 32
|
||||
; GCN_DBG-NEXT: s_add_i32 s2, s0, s2
|
||||
; GCN_DBG-NEXT: s_mov_b32 s3, 2
|
||||
; GCN_DBG-NEXT: s_lshl_b32 s2, s2, s3
|
||||
; GCN_DBG-NEXT: s_add_i32 s1, s1, s2
|
||||
; GCN_DBG-NEXT: s_mov_b32 m0, -1
|
||||
; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1
|
||||
@ -141,6 +145,7 @@ define amdgpu_kernel void @loop_const_true(ptr addrspace(3) %ptr, i32 %n) nounwi
|
||||
; GCN_DBG-NEXT: s_add_u32 s12, s12, s11
|
||||
; GCN_DBG-NEXT: s_addc_u32 s13, s13, 0
|
||||
; GCN_DBG-NEXT: s_load_dword s0, s[4:5], 0x9
|
||||
; GCN_DBG-NEXT: s_load_dword s1, s[4:5], 0xa
|
||||
; GCN_DBG-NEXT: ; implicit-def: $vgpr2 : SGPR spill to VGPR lane
|
||||
; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 0
|
||||
@ -160,11 +165,11 @@ define amdgpu_kernel void @loop_const_true(ptr addrspace(3) %ptr, i32 %n) nounwi
|
||||
; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; GCN_DBG-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN_DBG-NEXT: v_readlane_b32 s0, v2, 1
|
||||
; GCN_DBG-NEXT: v_readlane_b32 s2, v2, 0
|
||||
; GCN_DBG-NEXT: s_mov_b32 s1, 2
|
||||
; GCN_DBG-NEXT: s_lshl_b32 s1, s0, s1
|
||||
; GCN_DBG-NEXT: s_add_i32 s1, s1, s2
|
||||
; GCN_DBG-NEXT: s_mov_b32 s2, 0x80
|
||||
; GCN_DBG-NEXT: v_readlane_b32 s1, v2, 0
|
||||
; GCN_DBG-NEXT: s_mov_b32 s2, 32
|
||||
; GCN_DBG-NEXT: s_add_i32 s2, s0, s2
|
||||
; GCN_DBG-NEXT: s_mov_b32 s3, 2
|
||||
; GCN_DBG-NEXT: s_lshl_b32 s2, s2, s3
|
||||
; GCN_DBG-NEXT: s_add_i32 s1, s1, s2
|
||||
; GCN_DBG-NEXT: s_mov_b32 m0, -1
|
||||
; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1
|
||||
@ -224,6 +229,7 @@ define amdgpu_kernel void @loop_const_false(ptr addrspace(3) %ptr, i32 %n) nounw
|
||||
; GCN_DBG-NEXT: s_add_u32 s12, s12, s11
|
||||
; GCN_DBG-NEXT: s_addc_u32 s13, s13, 0
|
||||
; GCN_DBG-NEXT: s_load_dword s0, s[4:5], 0x9
|
||||
; GCN_DBG-NEXT: s_load_dword s1, s[4:5], 0xa
|
||||
; GCN_DBG-NEXT: ; implicit-def: $vgpr2 : SGPR spill to VGPR lane
|
||||
; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 0
|
||||
@ -243,11 +249,11 @@ define amdgpu_kernel void @loop_const_false(ptr addrspace(3) %ptr, i32 %n) nounw
|
||||
; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; GCN_DBG-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN_DBG-NEXT: v_readlane_b32 s0, v2, 1
|
||||
; GCN_DBG-NEXT: v_readlane_b32 s2, v2, 0
|
||||
; GCN_DBG-NEXT: s_mov_b32 s1, 2
|
||||
; GCN_DBG-NEXT: s_lshl_b32 s1, s0, s1
|
||||
; GCN_DBG-NEXT: s_add_i32 s1, s1, s2
|
||||
; GCN_DBG-NEXT: s_mov_b32 s2, 0x80
|
||||
; GCN_DBG-NEXT: v_readlane_b32 s1, v2, 0
|
||||
; GCN_DBG-NEXT: s_mov_b32 s2, 32
|
||||
; GCN_DBG-NEXT: s_add_i32 s2, s0, s2
|
||||
; GCN_DBG-NEXT: s_mov_b32 s3, 2
|
||||
; GCN_DBG-NEXT: s_lshl_b32 s2, s2, s3
|
||||
; GCN_DBG-NEXT: s_add_i32 s1, s1, s2
|
||||
; GCN_DBG-NEXT: s_mov_b32 m0, -1
|
||||
; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1
|
||||
@ -308,6 +314,7 @@ define amdgpu_kernel void @loop_const_undef(ptr addrspace(3) %ptr, i32 %n) nounw
|
||||
; GCN_DBG-NEXT: s_add_u32 s12, s12, s11
|
||||
; GCN_DBG-NEXT: s_addc_u32 s13, s13, 0
|
||||
; GCN_DBG-NEXT: s_load_dword s0, s[4:5], 0x9
|
||||
; GCN_DBG-NEXT: s_load_dword s1, s[4:5], 0xa
|
||||
; GCN_DBG-NEXT: ; implicit-def: $vgpr2 : SGPR spill to VGPR lane
|
||||
; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 0
|
||||
@ -327,11 +334,11 @@ define amdgpu_kernel void @loop_const_undef(ptr addrspace(3) %ptr, i32 %n) nounw
|
||||
; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; GCN_DBG-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN_DBG-NEXT: v_readlane_b32 s0, v2, 1
|
||||
; GCN_DBG-NEXT: v_readlane_b32 s2, v2, 0
|
||||
; GCN_DBG-NEXT: s_mov_b32 s1, 2
|
||||
; GCN_DBG-NEXT: s_lshl_b32 s1, s0, s1
|
||||
; GCN_DBG-NEXT: s_add_i32 s1, s1, s2
|
||||
; GCN_DBG-NEXT: s_mov_b32 s2, 0x80
|
||||
; GCN_DBG-NEXT: v_readlane_b32 s1, v2, 0
|
||||
; GCN_DBG-NEXT: s_mov_b32 s2, 32
|
||||
; GCN_DBG-NEXT: s_add_i32 s2, s0, s2
|
||||
; GCN_DBG-NEXT: s_mov_b32 s3, 2
|
||||
; GCN_DBG-NEXT: s_lshl_b32 s2, s2, s3
|
||||
; GCN_DBG-NEXT: s_add_i32 s1, s1, s2
|
||||
; GCN_DBG-NEXT: s_mov_b32 m0, -1
|
||||
; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1
|
||||
@ -407,6 +414,7 @@ define amdgpu_kernel void @loop_arg_0(ptr addrspace(3) %ptr, i32 %n) nounwind {
|
||||
; GCN_DBG-NEXT: s_add_u32 s12, s12, s11
|
||||
; GCN_DBG-NEXT: s_addc_u32 s13, s13, 0
|
||||
; GCN_DBG-NEXT: s_load_dword s0, s[4:5], 0x9
|
||||
; GCN_DBG-NEXT: s_load_dword s1, s[4:5], 0xa
|
||||
; GCN_DBG-NEXT: ; implicit-def: $vgpr2 : SGPR spill to VGPR lane
|
||||
; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 0
|
||||
@ -440,11 +448,11 @@ define amdgpu_kernel void @loop_arg_0(ptr addrspace(3) %ptr, i32 %n) nounwind {
|
||||
; GCN_DBG-NEXT: v_readlane_b32 s0, v2, 3
|
||||
; GCN_DBG-NEXT: v_readlane_b32 s2, v2, 1
|
||||
; GCN_DBG-NEXT: v_readlane_b32 s3, v2, 2
|
||||
; GCN_DBG-NEXT: v_readlane_b32 s4, v2, 0
|
||||
; GCN_DBG-NEXT: s_mov_b32 s1, 2
|
||||
; GCN_DBG-NEXT: s_lshl_b32 s1, s0, s1
|
||||
; GCN_DBG-NEXT: s_add_i32 s1, s1, s4
|
||||
; GCN_DBG-NEXT: s_mov_b32 s4, 0x80
|
||||
; GCN_DBG-NEXT: v_readlane_b32 s1, v2, 0
|
||||
; GCN_DBG-NEXT: s_mov_b32 s4, 32
|
||||
; GCN_DBG-NEXT: s_add_i32 s4, s0, s4
|
||||
; GCN_DBG-NEXT: s_mov_b32 s5, 2
|
||||
; GCN_DBG-NEXT: s_lshl_b32 s4, s4, s5
|
||||
; GCN_DBG-NEXT: s_add_i32 s1, s1, s4
|
||||
; GCN_DBG-NEXT: s_mov_b32 m0, -1
|
||||
; GCN_DBG-NEXT: v_mov_b32_e32 v0, s1
|
||||
|
||||
@ -49,6 +49,8 @@ define amdgpu_kernel void @simple_nested_if(ptr addrspace(1) nocapture %arg) {
|
||||
; GCN-O0-NEXT: s_add_u32 s12, s12, s11
|
||||
; GCN-O0-NEXT: s_addc_u32 s13, s13, 0
|
||||
; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
||||
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
||||
; GCN-O0-NEXT: ; implicit-def: $vgpr4 : SGPR spill to VGPR lane
|
||||
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-O0-NEXT: v_writelane_b32 v4, s0, 0
|
||||
@ -218,6 +220,8 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a
|
||||
; GCN-O0-NEXT: s_add_u32 s12, s12, s11
|
||||
; GCN-O0-NEXT: s_addc_u32 s13, s13, 0
|
||||
; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
||||
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
||||
; GCN-O0-NEXT: ; implicit-def: $vgpr4 : SGPR spill to VGPR lane
|
||||
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-O0-NEXT: v_writelane_b32 v4, s0, 0
|
||||
@ -422,6 +426,8 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) {
|
||||
; GCN-O0-NEXT: s_addc_u32 s13, s13, 0
|
||||
; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
||||
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
||||
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[0:1]
|
||||
; GCN-O0-NEXT: ; implicit-def: $vgpr4 : SGPR spill to VGPR lane
|
||||
; GCN-O0-NEXT: v_writelane_b32 v4, s2, 0
|
||||
@ -434,12 +440,12 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) {
|
||||
; GCN-O0-NEXT: s_mov_b32 s5, s2
|
||||
; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
|
||||
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
|
||||
; GCN-O0-NEXT: s_mov_b32 s4, 2
|
||||
; GCN-O0-NEXT: v_lshlrev_b32_e64 v2, s4, v0
|
||||
; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v0
|
||||
; GCN-O0-NEXT: s_waitcnt expcnt(0)
|
||||
; GCN-O0-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
|
||||
; GCN-O0-NEXT: v_mov_b32_e32 v3, v1
|
||||
; GCN-O0-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GCN-O0-NEXT: v_mov_b32_e32 v2, v3
|
||||
; GCN-O0-NEXT: s_mov_b32 s4, 2
|
||||
; GCN-O0-NEXT: v_lshl_b64 v[2:3], v[1:2], s4
|
||||
; GCN-O0-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GCN-O0-NEXT: buffer_store_dword v1, v[2:3], s[0:3], 0 addr64
|
||||
; GCN-O0-NEXT: s_mov_b32 s0, 1
|
||||
@ -660,36 +666,39 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
|
||||
; GCN-O0-NEXT: s_mov_b32 s15, 0xe8f000
|
||||
; GCN-O0-NEXT: s_add_u32 s12, s12, s11
|
||||
; GCN-O0-NEXT: s_addc_u32 s13, s13, 0
|
||||
; GCN-O0-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
|
||||
; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
||||
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
||||
; GCN-O0-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GCN-O0-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
|
||||
; GCN-O0-NEXT: s_mov_b32 s0, 2
|
||||
; GCN-O0-NEXT: v_lshlrev_b32_e64 v2, s0, v0
|
||||
; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v0
|
||||
; GCN-O0-NEXT: s_waitcnt expcnt(0)
|
||||
; GCN-O0-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
|
||||
; GCN-O0-NEXT: v_mov_b32_e32 v3, v1
|
||||
; GCN-O0-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GCN-O0-NEXT: v_mov_b32_e32 v2, v3
|
||||
; GCN-O0-NEXT: s_mov_b32 s2, 2
|
||||
; GCN-O0-NEXT: v_lshl_b64 v[2:3], v[1:2], s2
|
||||
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-O0-NEXT: s_mov_b32 s2, s4
|
||||
; GCN-O0-NEXT: s_mov_b32 s2, s0
|
||||
; GCN-O0-NEXT: v_mov_b32_e32 v1, v2
|
||||
; GCN-O0-NEXT: s_mov_b32 s1, s5
|
||||
; GCN-O0-NEXT: s_mov_b32 s4, s1
|
||||
; GCN-O0-NEXT: v_mov_b32_e32 v5, v3
|
||||
; GCN-O0-NEXT: v_add_i32_e64 v4, s[2:3], s2, v1
|
||||
; GCN-O0-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GCN-O0-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GCN-O0-NEXT: v_addc_u32_e64 v1, s[2:3], v1, v5, s[2:3]
|
||||
; GCN-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
|
||||
; GCN-O0-NEXT: v_mov_b32_e32 v5, v1
|
||||
; GCN-O0-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
|
||||
; GCN-O0-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
|
||||
; GCN-O0-NEXT: s_mov_b32 s1, 0xf000
|
||||
; GCN-O0-NEXT: s_mov_b32 s2, 0
|
||||
; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
|
||||
; GCN-O0-NEXT: s_mov_b32 s3, s1
|
||||
; GCN-O0-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7
|
||||
; GCN-O0-NEXT: s_mov_b64 s[6:7], s[2:3]
|
||||
; GCN-O0-NEXT: s_mov_b32 s2, 0xf000
|
||||
; GCN-O0-NEXT: s_mov_b32 s4, 0
|
||||
; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
|
||||
; GCN-O0-NEXT: s_mov_b32 s5, s2
|
||||
; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3
|
||||
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5]
|
||||
; GCN-O0-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GCN-O0-NEXT: buffer_store_dword v1, v[2:3], s[4:7], 0 addr64
|
||||
; GCN-O0-NEXT: v_cmp_lt_u32_e64 s[0:1], v0, s0
|
||||
; GCN-O0-NEXT: buffer_store_dword v1, v[2:3], s[0:3], 0 addr64
|
||||
; GCN-O0-NEXT: s_mov_b32 s0, 1
|
||||
; GCN-O0-NEXT: v_cmp_le_u32_e64 s[0:1], v0, s0
|
||||
; GCN-O0-NEXT: s_mov_b64 s[2:3], exec
|
||||
; GCN-O0-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1]
|
||||
; GCN-O0-NEXT: s_xor_b64 s[2:3], s[0:1], s[2:3]
|
||||
@ -905,6 +914,8 @@ define amdgpu_kernel void @s_endpgm_unsafe_barrier(ptr addrspace(1) nocapture %a
|
||||
; GCN-O0-NEXT: s_add_u32 s12, s12, s11
|
||||
; GCN-O0-NEXT: s_addc_u32 s13, s13, 0
|
||||
; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
||||
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
||||
; GCN-O0-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane
|
||||
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-O0-NEXT: v_writelane_b32 v3, s0, 0
|
||||
@ -1059,15 +1070,15 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
|
||||
; GCN-O0-NEXT: v_writelane_b32 v7, s7, 1
|
||||
; GCN-O0-NEXT: v_writelane_b32 v7, s4, 2
|
||||
; GCN-O0-NEXT: v_writelane_b32 v7, s5, 3
|
||||
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
|
||||
; GCN-O0-NEXT: s_or_saveexec_b64 s[12:13], -1
|
||||
; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
|
||||
; GCN-O0-NEXT: s_mov_b64 exec, s[12:13]
|
||||
; GCN-O0-NEXT: .LBB5_1: ; %bb1
|
||||
; GCN-O0-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
|
||||
; GCN-O0-NEXT: s_or_saveexec_b64 s[12:13], -1
|
||||
; GCN-O0-NEXT: s_waitcnt expcnt(0)
|
||||
; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
|
||||
; GCN-O0-NEXT: s_mov_b64 exec, s[12:13]
|
||||
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-O0-NEXT: v_readlane_b32 s8, v7, 2
|
||||
; GCN-O0-NEXT: v_readlane_b32 s9, v7, 3
|
||||
@ -1087,26 +1098,27 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
|
||||
; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5]
|
||||
; GCN-O0-NEXT: v_writelane_b32 v7, s6, 2
|
||||
; GCN-O0-NEXT: v_writelane_b32 v7, s7, 3
|
||||
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
|
||||
; GCN-O0-NEXT: s_or_saveexec_b64 s[12:13], -1
|
||||
; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
|
||||
; GCN-O0-NEXT: s_mov_b64 exec, s[12:13]
|
||||
; GCN-O0-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GCN-O0-NEXT: s_cbranch_execnz .LBB5_1
|
||||
; GCN-O0-NEXT: ; %bb.2: ; %bb2
|
||||
; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1
|
||||
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
|
||||
; GCN-O0-NEXT: s_or_saveexec_b64 s[12:13], -1
|
||||
; GCN-O0-NEXT: s_waitcnt expcnt(0)
|
||||
; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
|
||||
; GCN-O0-NEXT: s_mov_b64 exec, s[12:13]
|
||||
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-O0-NEXT: v_readlane_b32 s4, v7, 6
|
||||
; GCN-O0-NEXT: v_readlane_b32 s5, v7, 7
|
||||
; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5]
|
||||
; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
|
||||
; GCN-O0-NEXT: s_mov_b32 s6, 0
|
||||
; GCN-O0-NEXT: s_mov_b32 s4, 0
|
||||
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[4:5], v0, s6
|
||||
; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6
|
||||
; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s4
|
||||
; GCN-O0-NEXT: s_mov_b64 s[4:5], -1
|
||||
; GCN-O0-NEXT: s_xor_b64 s[4:5], s[6:7], s[4:5]
|
||||
; GCN-O0-NEXT: v_writelane_b32 v7, s4, 8
|
||||
; GCN-O0-NEXT: v_writelane_b32 v7, s5, 9
|
||||
; GCN-O0-NEXT: s_mov_b32 s4, 0
|
||||
@ -1125,18 +1137,18 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
|
||||
; GCN-O0-NEXT: s_mov_b64 s[4:5], exec
|
||||
; GCN-O0-NEXT: v_writelane_b32 v7, s4, 10
|
||||
; GCN-O0-NEXT: v_writelane_b32 v7, s5, 11
|
||||
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
|
||||
; GCN-O0-NEXT: s_or_saveexec_b64 s[12:13], -1
|
||||
; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
|
||||
; GCN-O0-NEXT: s_mov_b64 exec, s[12:13]
|
||||
; GCN-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
|
||||
; GCN-O0-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GCN-O0-NEXT: s_cbranch_execz .LBB5_5
|
||||
; GCN-O0-NEXT: ; %bb.3: ; %bb4
|
||||
; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1
|
||||
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
|
||||
; GCN-O0-NEXT: s_or_saveexec_b64 s[12:13], -1
|
||||
; GCN-O0-NEXT: s_waitcnt expcnt(0)
|
||||
; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
|
||||
; GCN-O0-NEXT: s_mov_b64 exec, s[12:13]
|
||||
; GCN-O0-NEXT: ; implicit-def: $sgpr4
|
||||
; GCN-O0-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GCN-O0-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen
|
||||
@ -1158,25 +1170,17 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
|
||||
; GCN-O0-NEXT: s_mov_b64 s[4:5], exec
|
||||
; GCN-O0-NEXT: v_writelane_b32 v7, s4, 12
|
||||
; GCN-O0-NEXT: v_writelane_b32 v7, s5, 13
|
||||
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
|
||||
; GCN-O0-NEXT: s_or_saveexec_b64 s[12:13], -1
|
||||
; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
|
||||
; GCN-O0-NEXT: s_mov_b64 exec, s[12:13]
|
||||
; GCN-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
|
||||
; GCN-O0-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GCN-O0-NEXT: s_cbranch_execz .LBB5_6
|
||||
; GCN-O0-NEXT: ; %bb.4: ; %bb8
|
||||
; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1
|
||||
; GCN-O0-NEXT: s_mov_b32 s10, 0
|
||||
; GCN-O0-NEXT: ; implicit-def: $sgpr4
|
||||
; GCN-O0-NEXT: ; implicit-def: $sgpr5
|
||||
; GCN-O0-NEXT: ; implicit-def: $sgpr9
|
||||
; GCN-O0-NEXT: ; implicit-def: $sgpr5
|
||||
; GCN-O0-NEXT: ; implicit-def: $sgpr8
|
||||
; GCN-O0-NEXT: ; implicit-def: $sgpr5
|
||||
; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5_sgpr6_sgpr7
|
||||
; GCN-O0-NEXT: s_mov_b32 s5, s10
|
||||
; GCN-O0-NEXT: s_mov_b32 s6, s9
|
||||
; GCN-O0-NEXT: s_mov_b32 s7, s8
|
||||
; GCN-O0-NEXT: s_mov_b32 s8, 0
|
||||
; GCN-O0-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7
|
||||
; GCN-O0-NEXT: s_mov_b32 s5, s8
|
||||
; GCN-O0-NEXT: s_waitcnt expcnt(1)
|
||||
; GCN-O0-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GCN-O0-NEXT: v_mov_b32_e32 v1, s5
|
||||
@ -1189,10 +1193,10 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
|
||||
; GCN-O0-NEXT: s_branch .LBB5_6
|
||||
; GCN-O0-NEXT: .LBB5_5: ; %Flow2
|
||||
; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1
|
||||
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
|
||||
; GCN-O0-NEXT: s_or_saveexec_b64 s[12:13], -1
|
||||
; GCN-O0-NEXT: s_waitcnt expcnt(0)
|
||||
; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
|
||||
; GCN-O0-NEXT: s_mov_b64 exec, s[12:13]
|
||||
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-O0-NEXT: v_readlane_b32 s4, v7, 10
|
||||
; GCN-O0-NEXT: v_readlane_b32 s5, v7, 11
|
||||
@ -1212,10 +1216,10 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
|
||||
; GCN-O0-NEXT: s_branch .LBB5_7
|
||||
; GCN-O0-NEXT: .LBB5_6: ; %Flow
|
||||
; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1
|
||||
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
|
||||
; GCN-O0-NEXT: s_or_saveexec_b64 s[12:13], -1
|
||||
; GCN-O0-NEXT: s_waitcnt expcnt(0)
|
||||
; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
|
||||
; GCN-O0-NEXT: s_mov_b64 exec, s[12:13]
|
||||
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-O0-NEXT: v_readlane_b32 s4, v7, 12
|
||||
; GCN-O0-NEXT: v_readlane_b32 s5, v7, 13
|
||||
@ -1235,9 +1239,9 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
|
||||
; GCN-O0-NEXT: s_branch .LBB5_5
|
||||
; GCN-O0-NEXT: .LBB5_7: ; %bb10
|
||||
; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1
|
||||
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
|
||||
; GCN-O0-NEXT: s_or_saveexec_b64 s[12:13], -1
|
||||
; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
|
||||
; GCN-O0-NEXT: s_mov_b64 exec, s[12:13]
|
||||
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-O0-NEXT: v_readlane_b32 s6, v7, 8
|
||||
; GCN-O0-NEXT: v_readlane_b32 s7, v7, 9
|
||||
@ -1247,32 +1251,32 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
|
||||
; GCN-O0-NEXT: s_mov_b64 s[4:5], exec
|
||||
; GCN-O0-NEXT: v_writelane_b32 v7, s4, 16
|
||||
; GCN-O0-NEXT: v_writelane_b32 v7, s5, 17
|
||||
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
|
||||
; GCN-O0-NEXT: s_or_saveexec_b64 s[12:13], -1
|
||||
; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
|
||||
; GCN-O0-NEXT: s_mov_b64 exec, s[12:13]
|
||||
; GCN-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
|
||||
; GCN-O0-NEXT: s_mov_b64 exec, s[4:5]
|
||||
; GCN-O0-NEXT: s_cbranch_execz .LBB5_9
|
||||
; GCN-O0-NEXT: ; %bb.8: ; %Flow1
|
||||
; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1
|
||||
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
|
||||
; GCN-O0-NEXT: s_or_saveexec_b64 s[12:13], -1
|
||||
; GCN-O0-NEXT: s_waitcnt expcnt(0)
|
||||
; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
|
||||
; GCN-O0-NEXT: s_mov_b64 exec, s[12:13]
|
||||
; GCN-O0-NEXT: s_mov_b64 s[4:5], 0
|
||||
; GCN-O0-NEXT: s_xor_b64 s[4:5], exec, -1
|
||||
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-O0-NEXT: v_writelane_b32 v7, s4, 14
|
||||
; GCN-O0-NEXT: v_writelane_b32 v7, s5, 15
|
||||
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
|
||||
; GCN-O0-NEXT: s_or_saveexec_b64 s[12:13], -1
|
||||
; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
|
||||
; GCN-O0-NEXT: s_mov_b64 exec, s[12:13]
|
||||
; GCN-O0-NEXT: .LBB5_9: ; %Flow3
|
||||
; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1
|
||||
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
|
||||
; GCN-O0-NEXT: s_or_saveexec_b64 s[12:13], -1
|
||||
; GCN-O0-NEXT: s_waitcnt expcnt(0)
|
||||
; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
|
||||
; GCN-O0-NEXT: s_mov_b64 exec, s[12:13]
|
||||
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-O0-NEXT: v_readlane_b32 s8, v7, 16
|
||||
; GCN-O0-NEXT: v_readlane_b32 s9, v7, 17
|
||||
@ -1296,9 +1300,9 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
|
||||
; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5]
|
||||
; GCN-O0-NEXT: v_writelane_b32 v7, s6, 18
|
||||
; GCN-O0-NEXT: v_writelane_b32 v7, s7, 19
|
||||
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
|
||||
; GCN-O0-NEXT: s_or_saveexec_b64 s[12:13], -1
|
||||
; GCN-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 ; 4-byte Folded Spill
|
||||
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
|
||||
; GCN-O0-NEXT: s_mov_b64 exec, s[12:13]
|
||||
; GCN-O0-NEXT: s_waitcnt vmcnt(4)
|
||||
; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
|
||||
; GCN-O0-NEXT: s_waitcnt vmcnt(4)
|
||||
@ -1310,10 +1314,10 @@ define void @scc_liveness(i32 %arg) local_unnamed_addr #0 {
|
||||
; GCN-O0-NEXT: s_andn2_b64 exec, exec, s[4:5]
|
||||
; GCN-O0-NEXT: s_cbranch_execnz .LBB5_1
|
||||
; GCN-O0-NEXT: ; %bb.10: ; %bb12
|
||||
; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1
|
||||
; GCN-O0-NEXT: s_or_saveexec_b64 s[12:13], -1
|
||||
; GCN-O0-NEXT: s_waitcnt expcnt(4)
|
||||
; GCN-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 ; 4-byte Folded Reload
|
||||
; GCN-O0-NEXT: s_mov_b64 exec, s[14:15]
|
||||
; GCN-O0-NEXT: s_mov_b64 exec, s[12:13]
|
||||
; GCN-O0-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-O0-NEXT: v_readlane_b32 s4, v7, 18
|
||||
; GCN-O0-NEXT: v_readlane_b32 s5, v7, 19
|
||||
|
||||
@ -110,12 +110,12 @@ endif:
|
||||
; GCN: [[LOOP:.LBB[0-9]+_[0-9]+]]:
|
||||
; GCN: buffer_load_dword v[[VAL_LOOP_RELOAD:[0-9]+]], off, s[0:3], 0 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload
|
||||
; GCN: v_sub_i32_e32 v[[VAL_LOOP_RELOAD]], vcc, v[[VAL_LOOP_RELOAD]], v{{[0-9]+}}
|
||||
; GCN: s_cmp_lg_u32
|
||||
; GCN: s_cmp_{{lg|eq}}_u32
|
||||
; VMEM: buffer_store_dword
|
||||
; VMEM: buffer_store_dword
|
||||
; VMEM: buffer_store_dword
|
||||
; GCN: buffer_store_dword v[[VAL_LOOP_RELOAD]], off, s[0:3], 0 offset:{{[0-9]+}} ; 4-byte Folded Spill
|
||||
; GCN-NEXT: s_cbranch_scc1 [[LOOP]]
|
||||
; GCN-NEXT: s_cbranch_vccnz [[LOOP]]
|
||||
|
||||
; GCN: buffer_store_dword v[[VAL_LOOP_RELOAD]], off, s[0:3], 0 offset:[[VAL_SUB_OFFSET:[0-9]+]] ; 4-byte Folded Spill
|
||||
|
||||
|
||||
@ -8,16 +8,18 @@ define i32 @divergent_lshr_and_cmp(i32 %x) {
|
||||
; GCN-NEXT: liveins: $vgpr0
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GCN-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 2, [[COPY]], implicit $exec
|
||||
; GCN-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 killed [[V_AND_B32_e64_]], 0, implicit $exec
|
||||
; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2
|
||||
; GCN-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], killed [[S_MOV_B32_]], implicit $exec
|
||||
; GCN-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
|
||||
; GCN-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 killed [[V_AND_B32_e64_]], killed [[S_MOV_B32_1]], implicit $exec
|
||||
; GCN-NEXT: [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_NE_U32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
|
||||
; GCN-NEXT: S_BRANCH %bb.1
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: bb.1.out.true:
|
||||
; GCN-NEXT: successors: %bb.2(0x80000000)
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2
|
||||
; GCN-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 killed [[S_MOV_B32_]], [[COPY]], implicit $exec
|
||||
; GCN-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 2
|
||||
; GCN-NEXT: [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 killed [[S_MOV_B32_2]], [[COPY]], implicit $exec
|
||||
; GCN-NEXT: S_BRANCH %bb.2
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: bb.2.UnifiedReturnBlock:
|
||||
@ -45,40 +47,52 @@ define amdgpu_kernel void @uniform_opt_lshr_and_cmp(ptr addrspace(1) %out, i32 %
|
||||
; GCN-NEXT: liveins: $sgpr4_sgpr5
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
|
||||
; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4)
|
||||
; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 11, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
|
||||
; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM]]
|
||||
; GCN-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s64) from constant-pool + 36, align 4, addrspace 4)
|
||||
; GCN-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 11, 0 :: (dereferenceable invariant load (s32) from constant-pool + 44, addrspace 4)
|
||||
; GCN-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 9, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4)
|
||||
; GCN-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]](p4), 11, 0 :: (dereferenceable invariant load (s32) from %ir.x.kernarg.offset, addrspace 4)
|
||||
; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_64 = COPY [[S_LOAD_DWORDX2_IMM1]]
|
||||
; GCN-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2
|
||||
; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORD_IMM]], killed [[S_MOV_B32_]], implicit-def dead $scc
|
||||
; GCN-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 2, [[S_LOAD_DWORD_IMM]], implicit-def dead $scc
|
||||
; GCN-NEXT: S_CMP_LG_U32 killed [[S_AND_B32_1]], 0, implicit-def $scc
|
||||
; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 killed [[S_LOAD_DWORD_IMM1]], killed [[S_MOV_B32_]], implicit-def dead $scc
|
||||
; GCN-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
|
||||
; GCN-NEXT: S_CMP_LG_U32 killed [[S_AND_B32_]], killed [[S_MOV_B32_1]], implicit-def $scc
|
||||
; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_64 = COPY $scc
|
||||
; GCN-NEXT: [[COPY3:%[0-9]+]]:sreg_64_xexec = COPY [[COPY2]]
|
||||
; GCN-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0
|
||||
; GCN-NEXT: S_CMP_EQ_U32 killed [[S_AND_B32_]], killed [[S_MOV_B32_1]], implicit-def $scc
|
||||
; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc
|
||||
; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
|
||||
; GCN-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64 = S_XOR_B64 [[COPY2]], killed [[S_MOV_B64_]], implicit-def dead $scc
|
||||
; GCN-NEXT: [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 $exec, killed [[S_XOR_B64_]], implicit-def dead $scc
|
||||
; GCN-NEXT: $vcc = COPY [[S_AND_B64_]]
|
||||
; GCN-NEXT: S_CBRANCH_VCCNZ %bb.2, implicit $vcc
|
||||
; GCN-NEXT: S_BRANCH %bb.1
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: bb.1.out.true:
|
||||
; GCN-NEXT: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
|
||||
; GCN-NEXT: [[S_XOR_B64_:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[COPY3]], killed [[S_MOV_B64_]], implicit-def dead $scc
|
||||
; GCN-NEXT: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 -1
|
||||
; GCN-NEXT: [[S_XOR_B64_1:%[0-9]+]]:sreg_64_xexec = S_XOR_B64 [[COPY3]], killed [[S_MOV_B64_1]], implicit-def dead $scc
|
||||
; GCN-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[COPY1]].sub1
|
||||
; GCN-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY1]].sub0
|
||||
; GCN-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
|
||||
; GCN-NEXT: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
|
||||
; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY5]], %subreg.sub0, killed [[COPY4]], %subreg.sub1, killed [[S_MOV_B32_3]], %subreg.sub2, killed [[S_MOV_B32_2]], %subreg.sub3
|
||||
; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[S_XOR_B64_]], implicit $exec
|
||||
; GCN-NEXT: BUFFER_STORE_BYTE_OFFSET killed [[V_CNDMASK_B32_e64_]], killed [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.out.load, addrspace 1)
|
||||
; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, killed [[S_XOR_B64_1]], implicit $exec
|
||||
; GCN-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 1
|
||||
; GCN-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[V_CNDMASK_B32_e64_]]
|
||||
; GCN-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 killed [[COPY6]], killed [[S_MOV_B32_4]], implicit-def dead $scc
|
||||
; GCN-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_AND_B32_1]]
|
||||
; GCN-NEXT: BUFFER_STORE_BYTE_OFFSET killed [[COPY7]], killed [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.out.load, addrspace 1)
|
||||
; GCN-NEXT: S_ENDPGM 0
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: bb.2.out.else:
|
||||
; GCN-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY1]].sub1
|
||||
; GCN-NEXT: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY1]].sub0
|
||||
; GCN-NEXT: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
|
||||
; GCN-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
|
||||
; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY7]], %subreg.sub0, killed [[COPY6]], %subreg.sub1, killed [[S_MOV_B32_5]], %subreg.sub2, killed [[S_MOV_B32_4]], %subreg.sub3
|
||||
; GCN-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[COPY3]], implicit $exec
|
||||
; GCN-NEXT: BUFFER_STORE_BYTE_OFFSET killed [[V_CNDMASK_B32_e64_1]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.out.load, addrspace 1)
|
||||
; GCN-NEXT: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY1]].sub1
|
||||
; GCN-NEXT: [[COPY9:%[0-9]+]]:sreg_32 = COPY [[COPY1]].sub0
|
||||
; GCN-NEXT: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 61440
|
||||
; GCN-NEXT: [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
|
||||
; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE killed [[COPY9]], %subreg.sub0, killed [[COPY8]], %subreg.sub1, killed [[S_MOV_B32_6]], %subreg.sub2, killed [[S_MOV_B32_5]], %subreg.sub3
|
||||
; GCN-NEXT: [[V_CNDMASK_B32_e64_1:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, [[COPY3]], implicit $exec
|
||||
; GCN-NEXT: [[S_MOV_B32_7:%[0-9]+]]:sreg_32 = S_MOV_B32 1
|
||||
; GCN-NEXT: [[COPY10:%[0-9]+]]:sreg_32 = COPY [[V_CNDMASK_B32_e64_1]]
|
||||
; GCN-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32 = S_AND_B32 killed [[COPY10]], killed [[S_MOV_B32_7]], implicit-def dead $scc
|
||||
; GCN-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_AND_B32_2]]
|
||||
; GCN-NEXT: BUFFER_STORE_BYTE_OFFSET killed [[COPY11]], killed [[REG_SEQUENCE1]], 0, 0, 0, 0, implicit $exec :: (store (s8) into %ir.out.load, addrspace 1)
|
||||
; GCN-NEXT: S_ENDPGM 0
|
||||
entry:
|
||||
%0 = and i32 %x, 2
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -9,6 +9,106 @@ define amdgpu_kernel void @issue155902(i64 %arg, i64 %arg1, i64 %arg2, i64 %arg3
|
||||
; GFX950-NEXT: ; implicit-def: $vgpr2 : SGPR spill to VGPR lane
|
||||
; GFX950-NEXT: v_writelane_b32 v2, s33, 0
|
||||
; GFX950-NEXT: s_mov_b64 s[2:3], s[4:5]
|
||||
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x188
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x180
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x178
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x170
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x168
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x160
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x158
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x150
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x148
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x140
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x138
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x130
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x128
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x120
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x118
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x110
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x108
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x100
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf8
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xf0
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xe8
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xe0
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd8
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd0
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xc8
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xc0
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb8
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb0
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xa8
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xa0
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x98
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x90
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x88
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x80
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x78
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x70
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x68
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x60
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x58
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x50
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x48
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x40
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x8
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x10
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x18
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x20
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x28
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x30
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x38
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
|
||||
; GFX950-NEXT: s_load_dwordx2 vcc, s[2:3], 0x8
|
||||
; GFX950-NEXT: s_load_dwordx2 s[98:99], s[2:3], 0x10
|
||||
@ -237,6 +337,106 @@ define amdgpu_kernel void @issue155902_fp(i64 %arg, i64 %arg1, i64 %arg2, i64 %a
|
||||
; GFX950-NEXT: ; implicit-def: $vgpr2 : SGPR spill to VGPR lane
|
||||
; GFX950-NEXT: v_writelane_b32 v2, s0, 0
|
||||
; GFX950-NEXT: s_mov_b64 s[2:3], s[4:5]
|
||||
; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x188
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x180
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x178
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x170
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x168
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x160
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x158
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x150
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x148
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x140
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x138
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x130
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x128
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x120
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x118
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x110
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x108
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x100
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xf8
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xf0
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xe8
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xe0
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd8
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd0
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xc8
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xc0
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xb8
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xb0
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xa8
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xa0
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x98
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x90
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x88
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x80
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x78
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x70
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x68
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x60
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x58
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x50
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x48
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x40
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x8
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x10
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x18
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x20
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x28
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x30
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x38
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
|
||||
; GFX950-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX950-NEXT: v_writelane_b32 v2, s4, 1
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -14,6 +14,7 @@ define amdgpu_kernel void @extract_w_offset_vgpr(ptr addrspace(1) %out) {
|
||||
; GCN-NEXT: liveins: $vgpr0, $sgpr4_sgpr5
|
||||
; GCN-NEXT: {{ $}}
|
||||
; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY killed $vgpr0
|
||||
; GCN-NEXT: dead early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from constant-pool + 36, align 4, addrspace 4)
|
||||
; GCN-NEXT: early-clobber renamable $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM_ec killed renamable $sgpr4_sgpr5, 36, 0 :: (dereferenceable invariant load (s64) from %ir.out.kernarg.offset, align 4, addrspace 4)
|
||||
; GCN-NEXT: renamable $sgpr6 = COPY renamable $sgpr1
|
||||
; GCN-NEXT: renamable $sgpr0 = COPY renamable $sgpr0, implicit killed $sgpr0_sgpr1
|
||||
|
||||
@ -8,10 +8,13 @@ define <2 x i64> @f1() #0 {
|
||||
; GFX11-LABEL: f1:
|
||||
; GFX11: ; %bb.0:
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX11-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX11-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX11-NEXT: v_mov_b32_e32 v3, 0
|
||||
; GFX11-NEXT: s_mov_b64 s[0:1], 0
|
||||
; GFX11-NEXT: s_mov_b32 s2, s0
|
||||
; GFX11-NEXT: s_mov_b32 s3, s1
|
||||
; GFX11-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX11-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX11-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX11-NEXT: v_mov_b32_e32 v3, s3
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
||||
ret <2 x i64> zeroinitializer
|
||||
}
|
||||
@ -20,7 +23,7 @@ define void @f0() {
|
||||
; GFX11-LABEL: f0:
|
||||
; GFX11: ; %bb.0: ; %bb
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX11-NEXT: s_mov_b32 s2, s33
|
||||
; GFX11-NEXT: s_mov_b32 s16, s33
|
||||
; GFX11-NEXT: s_mov_b32 s33, s32
|
||||
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
|
||||
; GFX11-NEXT: scratch_store_b32 off, v4, s33 ; 4-byte Folded Spill
|
||||
@ -41,7 +44,7 @@ define void @f0() {
|
||||
; GFX11-NEXT: s_xor_saveexec_b32 s0, -1
|
||||
; GFX11-NEXT: scratch_load_b32 v4, off, s33 ; 4-byte Folded Reload
|
||||
; GFX11-NEXT: s_mov_b32 exec_lo, s0
|
||||
; GFX11-NEXT: s_mov_b32 s33, s2
|
||||
; GFX11-NEXT: s_mov_b32 s33, s16
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
||||
bb:
|
||||
@ -52,9 +55,9 @@ bb:
|
||||
define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg4, i1 %arg5, ptr %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10, i1 %arg11) {
|
||||
; GFX11-LABEL: f2:
|
||||
; GFX11: ; %bb.0: ; %bb
|
||||
; GFX11-NEXT: s_mov_b64 s[16:17], s[4:5]
|
||||
; GFX11-NEXT: s_mov_b64 s[18:19], s[4:5]
|
||||
; GFX11-NEXT: v_mov_b32_e32 v31, v0
|
||||
; GFX11-NEXT: s_load_b32 s19, s[16:17], 0x24
|
||||
; GFX11-NEXT: s_load_b32 s24, s[18:19], 0x24
|
||||
; GFX11-NEXT: s_mov_b32 s12, s13
|
||||
; GFX11-NEXT: s_mov_b64 s[10:11], s[6:7]
|
||||
; GFX11-NEXT: s_mov_b64 s[6:7], s[2:3]
|
||||
@ -62,34 +65,34 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
|
||||
; GFX11-NEXT: s_mov_b64 s[4:5], s[0:1]
|
||||
; GFX11-NEXT: s_mov_b32 s20, 0
|
||||
; GFX11-NEXT: s_mov_b32 s0, -1
|
||||
; GFX11-NEXT: s_mov_b32 s3, exec_lo
|
||||
; GFX11-NEXT: s_mov_b32 s17, exec_lo
|
||||
; GFX11-NEXT: s_mov_b32 s32, 0
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: v_mul_lo_u32 v0, s19, v0
|
||||
; GFX11-NEXT: v_mul_lo_u32 v0, s24, v0
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
||||
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
|
||||
; GFX11-NEXT: s_cbranch_execz .LBB2_13
|
||||
; GFX11-NEXT: ; %bb.1: ; %bb14
|
||||
; GFX11-NEXT: s_load_b128 s[20:23], s[16:17], 0x2c
|
||||
; GFX11-NEXT: s_mov_b32 s18, 0
|
||||
; GFX11-NEXT: s_load_b128 s[20:23], s[18:19], 0x2c
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: s_bitcmp1_b32 s21, 0
|
||||
; GFX11-NEXT: s_cselect_b32 s24, -1, 0
|
||||
; GFX11-NEXT: s_cselect_b32 s25, -1, 0
|
||||
; GFX11-NEXT: s_bitcmp0_b32 s21, 0
|
||||
; GFX11-NEXT: s_mov_b32 s21, 0
|
||||
; GFX11-NEXT: s_cbranch_scc0 .LBB2_3
|
||||
; GFX11-NEXT: ; %bb.2: ; %bb15
|
||||
; GFX11-NEXT: s_add_u32 s8, s16, 0x58
|
||||
; GFX11-NEXT: s_addc_u32 s9, s17, 0
|
||||
; GFX11-NEXT: s_add_u32 s8, s18, 0x58
|
||||
; GFX11-NEXT: s_addc_u32 s9, s19, 0
|
||||
; GFX11-NEXT: s_getpc_b64 s[0:1]
|
||||
; GFX11-NEXT: s_add_u32 s0, s0, f0@gotpcrel32@lo+4
|
||||
; GFX11-NEXT: s_addc_u32 s1, s1, f0@gotpcrel32@hi+12
|
||||
; GFX11-NEXT: s_mov_b32 s13, s14
|
||||
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
|
||||
; GFX11-NEXT: s_mov_b32 s21, s14
|
||||
; GFX11-NEXT: s_mov_b32 s26, s14
|
||||
; GFX11-NEXT: s_mov_b32 s14, s15
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
|
||||
; GFX11-NEXT: s_mov_b32 s14, s21
|
||||
; GFX11-NEXT: s_mov_b32 s14, s26
|
||||
; GFX11-NEXT: s_mov_b32 s2, -1
|
||||
; GFX11-NEXT: s_cbranch_execz .LBB2_4
|
||||
; GFX11-NEXT: s_branch .LBB2_12
|
||||
@ -98,18 +101,18 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
|
||||
; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0
|
||||
; GFX11-NEXT: s_cbranch_vccnz .LBB2_12
|
||||
; GFX11-NEXT: .LBB2_4: ; %bb16
|
||||
; GFX11-NEXT: s_load_b32 s0, s[16:17], 0x54
|
||||
; GFX11-NEXT: s_load_b32 s0, s[18:19], 0x54
|
||||
; GFX11-NEXT: s_bitcmp1_b32 s23, 0
|
||||
; GFX11-NEXT: s_cselect_b32 s9, -1, 0
|
||||
; GFX11-NEXT: s_cselect_b32 s8, -1, 0
|
||||
; GFX11-NEXT: s_and_b32 s1, s23, 1
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: s_bitcmp1_b32 s0, 0
|
||||
; GFX11-NEXT: s_mov_b32 s0, -1
|
||||
; GFX11-NEXT: s_cselect_b32 s8, -1, 0
|
||||
; GFX11-NEXT: s_cselect_b32 s3, -1, 0
|
||||
; GFX11-NEXT: s_cmp_eq_u32 s1, 0
|
||||
; GFX11-NEXT: s_cbranch_scc0 .LBB2_8
|
||||
; GFX11-NEXT: ; %bb.5: ; %bb18.preheader
|
||||
; GFX11-NEXT: s_load_b128 s[28:31], s[16:17], 0x44
|
||||
; GFX11-NEXT: s_load_b128 s[28:31], s[18:19], 0x44
|
||||
; GFX11-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: s_mul_hi_u32 s1, s29, s28
|
||||
@ -123,11 +126,11 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
|
||||
; GFX11-NEXT: s_mul_i32 s0, s0, s22
|
||||
; GFX11-NEXT: s_mul_i32 s0, s0, s20
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: s_or_b32 s0, s19, s0
|
||||
; GFX11-NEXT: s_lshl_b64 s[20:21], s[0:1], 1
|
||||
; GFX11-NEXT: s_or_b32 s0, s24, s0
|
||||
; GFX11-NEXT: s_lshl_b64 s[22:23], s[0:1], 1
|
||||
; GFX11-NEXT: s_mov_b32 s0, s1
|
||||
; GFX11-NEXT: global_load_u16 v1, v0, s[20:21]
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s24
|
||||
; GFX11-NEXT: global_load_u16 v1, v0, s[22:23]
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s25
|
||||
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
|
||||
@ -136,28 +139,28 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
|
||||
; GFX11-NEXT: .LBB2_6: ; %bb18
|
||||
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-NEXT: s_and_b32 s1, 0xffff, s1
|
||||
; GFX11-NEXT: s_cselect_b32 s13, -1, 0
|
||||
; GFX11-NEXT: s_cselect_b32 s9, -1, 0
|
||||
; GFX11-NEXT: v_readfirstlane_b32 s1, v0
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s13
|
||||
; GFX11-NEXT: s_and_b32 s13, s8, s13
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s9
|
||||
; GFX11-NEXT: s_and_b32 s9, s3, s9
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
||||
; GFX11-NEXT: s_and_b32 s13, s13, exec_lo
|
||||
; GFX11-NEXT: v_readfirstlane_b32 s19, v2
|
||||
; GFX11-NEXT: s_cselect_b32 s1, s19, s1
|
||||
; GFX11-NEXT: s_and_b32 s9, s9, exec_lo
|
||||
; GFX11-NEXT: v_readfirstlane_b32 s13, v2
|
||||
; GFX11-NEXT: s_cselect_b32 s1, s13, s1
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: s_and_b32 s1, s1, 1
|
||||
; GFX11-NEXT: s_and_b32 s13, 0xffff, s0
|
||||
; GFX11-NEXT: s_cselect_b32 s13, -1, 0
|
||||
; GFX11-NEXT: s_and_b32 s20, s9, exec_lo
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s13
|
||||
; GFX11-NEXT: v_readfirstlane_b32 s13, v1
|
||||
; GFX11-NEXT: s_and_b32 s9, 0xffff, s0
|
||||
; GFX11-NEXT: s_cselect_b32 s9, -1, 0
|
||||
; GFX11-NEXT: s_and_b32 s16, s8, exec_lo
|
||||
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s9
|
||||
; GFX11-NEXT: v_readfirstlane_b32 s9, v1
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: v_readfirstlane_b32 s19, v2
|
||||
; GFX11-NEXT: s_cselect_b32 s13, s19, s13
|
||||
; GFX11-NEXT: s_bitcmp1_b32 s13, 0
|
||||
; GFX11-NEXT: s_cselect_b32 s13, 0x100, 0
|
||||
; GFX11-NEXT: v_readfirstlane_b32 s13, v2
|
||||
; GFX11-NEXT: s_cselect_b32 s9, s13, s9
|
||||
; GFX11-NEXT: s_bitcmp1_b32 s9, 0
|
||||
; GFX11-NEXT: s_cselect_b32 s9, 0x100, 0
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: s_or_b32 s0, s13, s0
|
||||
; GFX11-NEXT: s_or_b32 s0, s9, s0
|
||||
; GFX11-NEXT: s_cbranch_vccz .LBB2_6
|
||||
; GFX11-NEXT: ; %bb.7: ; %Flow
|
||||
; GFX11-NEXT: s_mov_b32 s0, 0
|
||||
@ -166,24 +169,24 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
|
||||
; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s0
|
||||
; GFX11-NEXT: s_cbranch_vccz .LBB2_12
|
||||
; GFX11-NEXT: ; %bb.9:
|
||||
; GFX11-NEXT: s_xor_b32 s0, s8, -1
|
||||
; GFX11-NEXT: s_xor_b32 s0, s3, -1
|
||||
; GFX11-NEXT: .LBB2_10: ; %bb17
|
||||
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s0
|
||||
; GFX11-NEXT: s_cbranch_vccz .LBB2_10
|
||||
; GFX11-NEXT: ; %bb.11: ; %Flow6
|
||||
; GFX11-NEXT: s_mov_b32 s18, -1
|
||||
; GFX11-NEXT: s_mov_b32 s21, -1
|
||||
; GFX11-NEXT: .LBB2_12: ; %Flow11
|
||||
; GFX11-NEXT: s_and_b32 s20, s2, exec_lo
|
||||
; GFX11-NEXT: s_or_not1_b32 s0, s18, exec_lo
|
||||
; GFX11-NEXT: s_or_not1_b32 s0, s21, exec_lo
|
||||
; GFX11-NEXT: .LBB2_13: ; %Flow9
|
||||
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s3
|
||||
; GFX11-NEXT: s_and_saveexec_b32 s3, s0
|
||||
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s17
|
||||
; GFX11-NEXT: s_and_saveexec_b32 s17, s0
|
||||
; GFX11-NEXT: s_cbranch_execz .LBB2_15
|
||||
; GFX11-NEXT: ; %bb.14: ; %bb43
|
||||
; GFX11-NEXT: s_add_u32 s8, s16, 0x58
|
||||
; GFX11-NEXT: s_addc_u32 s9, s17, 0
|
||||
; GFX11-NEXT: s_add_u32 s8, s18, 0x58
|
||||
; GFX11-NEXT: s_addc_u32 s9, s19, 0
|
||||
; GFX11-NEXT: s_getpc_b64 s[0:1]
|
||||
; GFX11-NEXT: s_add_u32 s0, s0, f0@gotpcrel32@lo+4
|
||||
; GFX11-NEXT: s_addc_u32 s1, s1, f0@gotpcrel32@hi+12
|
||||
@ -194,7 +197,7 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
|
||||
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
|
||||
; GFX11-NEXT: s_or_b32 s20, s20, exec_lo
|
||||
; GFX11-NEXT: .LBB2_15: ; %Flow14
|
||||
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s3
|
||||
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s17
|
||||
; GFX11-NEXT: s_and_saveexec_b32 s0, s20
|
||||
; GFX11-NEXT: ; %bb.16: ; %UnifiedUnreachableBlock
|
||||
; GFX11-NEXT: ; divergent unreachable
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -21,11 +21,18 @@ define amdgpu_kernel void @test_kernel(i32 %val) #0 {
|
||||
; CHECK-NEXT: s_mov_b64 s[16:17], s[8:9]
|
||||
; CHECK-NEXT: s_load_dword s8, s[16:17], 0x0
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CHECK-NEXT: s_load_dword s8, s[16:17], 0x0
|
||||
; CHECK-NEXT: s_mov_b32 s9, 0
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CHECK-NEXT: s_cmp_lg_u32 s8, s9
|
||||
; CHECK-NEXT: s_cselect_b64 s[8:9], -1, 0
|
||||
; CHECK-NEXT: v_writelane_b32 v40, s8, 1
|
||||
; CHECK-NEXT: v_writelane_b32 v40, s9, 2
|
||||
; CHECK-NEXT: ;;#ASMSTART
|
||||
; CHECK-NEXT: ; def vgpr10
|
||||
; CHECK-NEXT: ;;#ASMEND
|
||||
; CHECK-NEXT: s_add_i32 s8, s33, 0x100100
|
||||
; CHECK-NEXT: s_add_i32 s8, s33, 0x100000
|
||||
; CHECK-NEXT: s_nop 2
|
||||
; CHECK-NEXT: buffer_store_dword v10, off, s[0:3], s8 ; 4-byte Folded Spill
|
||||
; CHECK-NEXT: s_mov_b64 s[18:19], 8
|
||||
; CHECK-NEXT: s_mov_b32 s8, s16
|
||||
@ -55,15 +62,12 @@ define amdgpu_kernel void @test_kernel(i32 %val) #0 {
|
||||
; CHECK-NEXT: v_mov_b32_e32 v0, s18
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
|
||||
; CHECK-NEXT: s_add_i32 s4, s33, 0x100100
|
||||
; CHECK-NEXT: buffer_load_dword v10, off, s[0:3], s4 ; 4-byte Folded Reload
|
||||
; CHECK-NEXT: v_readlane_b32 s4, v40, 1
|
||||
; CHECK-NEXT: s_mov_b32 s5, 0
|
||||
; CHECK-NEXT: s_cmp_eq_u32 s4, s5
|
||||
; CHECK-NEXT: v_mov_b32_e32 v0, 0x4000
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
||||
; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], s33 offen ; 4-byte Folded Spill
|
||||
; CHECK-NEXT: s_cbranch_scc1 .LBB0_2
|
||||
; CHECK-NEXT: v_readlane_b32 s5, v40, 2
|
||||
; CHECK-NEXT: s_mov_b64 s[6:7], -1
|
||||
; CHECK-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7]
|
||||
; CHECK-NEXT: s_and_b64 vcc, exec, s[4:5]
|
||||
; CHECK-NEXT: s_cbranch_vccnz .LBB0_2
|
||||
; CHECK-NEXT: ; %bb.1: ; %store
|
||||
; CHECK-NEXT: s_add_i32 s4, s33, 0x100000
|
||||
; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s4 ; 4-byte Folded Reload
|
||||
|
||||
@ -7,6 +7,7 @@ define amdgpu_kernel void @gws_barrier_offset0(i32 %val) #0 {
|
||||
; MIR: bb.0 (%ir-block.0):
|
||||
; MIR-NEXT: liveins: $sgpr8_sgpr9
|
||||
; MIR-NEXT: {{ $}}
|
||||
; MIR-NEXT: dead renamable $sgpr4 = S_LOAD_DWORD_IMM renamable $sgpr8_sgpr9, 0, 0 :: (dereferenceable invariant load (s32) from constant-pool, align 16, addrspace 4)
|
||||
; MIR-NEXT: renamable $sgpr4 = S_LOAD_DWORD_IMM killed renamable $sgpr8_sgpr9, 0, 0 :: (dereferenceable invariant load (s32) from %ir.val.kernarg.offset, align 16, addrspace 4)
|
||||
; MIR-NEXT: $m0 = S_MOV_B32 0
|
||||
; MIR-NEXT: $vgpr0 = V_MOV_B32_e32 killed $sgpr4, implicit $exec, implicit $exec
|
||||
|
||||
@ -25,6 +25,12 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) {
|
||||
; GFX8-NOOPT: ; %bb.0:
|
||||
; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5]
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x30
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -118,6 +124,12 @@ define amdgpu_kernel void @dpp_test_bc(ptr addrspace(1) %out, i32 %in1, i32 %in2
|
||||
; GFX8-NOOPT: ; %bb.0:
|
||||
; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5]
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x30
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -215,21 +227,26 @@ define weak_odr amdgpu_kernel void @dpp_test1(ptr %arg) local_unnamed_addr {
|
||||
;
|
||||
; GFX8-NOOPT-LABEL: dpp_test1:
|
||||
; GFX8-NOOPT: ; %bb.0: ; %bb
|
||||
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
|
||||
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v3, 0
|
||||
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, v3
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_mov_b32 s0, 2
|
||||
; GFX8-NOOPT-NEXT: v_lshlrev_b32_e64 v3, s0, v0
|
||||
; GFX8-NOOPT-NEXT: v_lshlrev_b32_e64 v3, s0, v2
|
||||
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX8-NOOPT-NEXT: v_add_u32_e64 v3, s[4:5], v2, v3
|
||||
; GFX8-NOOPT-NEXT: s_mov_b32 m0, -1
|
||||
; GFX8-NOOPT-NEXT: ds_read_b32 v0, v3
|
||||
; GFX8-NOOPT-NEXT: ds_read_b32 v3, v3
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_barrier
|
||||
; GFX8-NOOPT-NEXT: v_add_u32_e64 v1, s[0:1], v0, v0
|
||||
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX8-NOOPT-NEXT: v_add_u32_e64 v3, s[4:5], v3, v3
|
||||
; GFX8-NOOPT-NEXT: s_nop 1
|
||||
; GFX8-NOOPT-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
|
||||
; GFX8-NOOPT-NEXT: v_add_u32_e64 v2, s[0:1], v0, v1
|
||||
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX8-NOOPT-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec
|
||||
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v4, v0
|
||||
; GFX8-NOOPT-NEXT: v_mov_b32_dpp v2, v3 quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
|
||||
; GFX8-NOOPT-NEXT: v_add_u32_e64 v2, s[4:5], v2, v3
|
||||
; GFX8-NOOPT-NEXT: v_lshlrev_b64 v[3:4], s0, v[0:1]
|
||||
; GFX8-NOOPT-NEXT: s_mov_b32 s0, s2
|
||||
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX8-NOOPT-NEXT: s_mov_b32 s2, s3
|
||||
@ -358,14 +375,19 @@ define amdgpu_kernel void @update_dppi64_test(ptr addrspace(1) %arg, i64 %in1, i
|
||||
; GFX8-NOOPT-LABEL: update_dppi64_test:
|
||||
; GFX8-NOOPT: ; %bb.0:
|
||||
; GFX8-NOOPT-NEXT: s_mov_b64 s[0:1], s[4:5]
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_mov_b32 s2, 3
|
||||
; GFX8-NOOPT-NEXT: v_lshlrev_b32_e64 v1, s2, v0
|
||||
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX8-NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
|
||||
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX8-NOOPT-NEXT: v_ashrrev_i32_e64 v2, 31, v0
|
||||
; GFX8-NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
|
||||
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, v2
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_mov_b32 s2, 3
|
||||
; GFX8-NOOPT-NEXT: v_lshlrev_b64 v[1:2], s2, v[0:1]
|
||||
; GFX8-NOOPT-NEXT: s_mov_b32 s2, s4
|
||||
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, v1
|
||||
; GFX8-NOOPT-NEXT: s_mov_b32 s4, s5
|
||||
@ -484,14 +506,19 @@ define amdgpu_kernel void @update_dppf64_test(ptr addrspace(1) %arg, double %in1
|
||||
; GFX8-NOOPT-LABEL: update_dppf64_test:
|
||||
; GFX8-NOOPT: ; %bb.0:
|
||||
; GFX8-NOOPT-NEXT: s_mov_b64 s[0:1], s[4:5]
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_mov_b32 s2, 3
|
||||
; GFX8-NOOPT-NEXT: v_lshlrev_b32_e64 v1, s2, v0
|
||||
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX8-NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
|
||||
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX8-NOOPT-NEXT: v_ashrrev_i32_e64 v2, 31, v0
|
||||
; GFX8-NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
|
||||
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, v2
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_mov_b32 s2, 3
|
||||
; GFX8-NOOPT-NEXT: v_lshlrev_b64 v[1:2], s2, v[0:1]
|
||||
; GFX8-NOOPT-NEXT: s_mov_b32 s2, s4
|
||||
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, v1
|
||||
; GFX8-NOOPT-NEXT: s_mov_b32 s4, s5
|
||||
@ -610,14 +637,19 @@ define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32>
|
||||
; GFX8-NOOPT-LABEL: update_dppv2i32_test:
|
||||
; GFX8-NOOPT: ; %bb.0:
|
||||
; GFX8-NOOPT-NEXT: s_mov_b64 s[0:1], s[4:5]
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_mov_b32 s2, 3
|
||||
; GFX8-NOOPT-NEXT: v_lshlrev_b32_e64 v1, s2, v0
|
||||
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX8-NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
|
||||
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX8-NOOPT-NEXT: v_ashrrev_i32_e64 v2, 31, v0
|
||||
; GFX8-NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
|
||||
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, v2
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_mov_b32 s2, 3
|
||||
; GFX8-NOOPT-NEXT: v_lshlrev_b64 v[1:2], s2, v[0:1]
|
||||
; GFX8-NOOPT-NEXT: s_mov_b32 s2, s4
|
||||
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, v1
|
||||
; GFX8-NOOPT-NEXT: s_mov_b32 s4, s5
|
||||
@ -736,14 +768,19 @@ define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x floa
|
||||
; GFX8-NOOPT-LABEL: update_dppv2f32_test:
|
||||
; GFX8-NOOPT: ; %bb.0:
|
||||
; GFX8-NOOPT-NEXT: s_mov_b64 s[0:1], s[4:5]
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_mov_b32 s2, 3
|
||||
; GFX8-NOOPT-NEXT: v_lshlrev_b32_e64 v1, s2, v0
|
||||
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX8-NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
|
||||
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX8-NOOPT-NEXT: v_ashrrev_i32_e64 v2, 31, v0
|
||||
; GFX8-NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
|
||||
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, v2
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_mov_b32 s2, 3
|
||||
; GFX8-NOOPT-NEXT: v_lshlrev_b64 v[1:2], s2, v[0:1]
|
||||
; GFX8-NOOPT-NEXT: s_mov_b32 s2, s4
|
||||
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, v1
|
||||
; GFX8-NOOPT-NEXT: s_mov_b32 s4, s5
|
||||
@ -862,14 +899,19 @@ define amdgpu_kernel void @update_dpp_p0_test(ptr addrspace(1) %arg, ptr %in1, p
|
||||
; GFX8-NOOPT-LABEL: update_dpp_p0_test:
|
||||
; GFX8-NOOPT: ; %bb.0:
|
||||
; GFX8-NOOPT-NEXT: s_mov_b64 s[0:1], s[4:5]
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_mov_b32 s2, 3
|
||||
; GFX8-NOOPT-NEXT: v_lshlrev_b32_e64 v1, s2, v0
|
||||
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX8-NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
|
||||
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX8-NOOPT-NEXT: v_ashrrev_i32_e64 v2, 31, v0
|
||||
; GFX8-NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
|
||||
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, v2
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_mov_b32 s2, 3
|
||||
; GFX8-NOOPT-NEXT: v_lshlrev_b64 v[1:2], s2, v[0:1]
|
||||
; GFX8-NOOPT-NEXT: s_mov_b32 s2, s4
|
||||
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, v1
|
||||
; GFX8-NOOPT-NEXT: s_mov_b32 s4, s5
|
||||
@ -985,6 +1027,12 @@ define amdgpu_kernel void @update_dpp_p3_test(ptr addrspace(3) %arg, ptr addrspa
|
||||
;
|
||||
; GFX8-NOOPT-LABEL: update_dpp_p3_test:
|
||||
; GFX8-NOOPT: ; %bb.0:
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[4:5], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[4:5], 0x28
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s1, s[4:5], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[4:5], 0x28
|
||||
; GFX8-NOOPT-NEXT: s_mov_b32 s2, 2
|
||||
@ -1092,6 +1140,12 @@ define amdgpu_kernel void @update_dpp_p5_test(ptr addrspace(5) %arg, ptr addrspa
|
||||
; GFX8-NOOPT-NEXT: s_mov_b32 s91, 0xe80000
|
||||
; GFX8-NOOPT-NEXT: s_add_u32 s88, s88, s11
|
||||
; GFX8-NOOPT-NEXT: s_addc_u32 s89, s89, 0
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[4:5], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[4:5], 0x28
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s1, s[4:5], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[4:5], 0x28
|
||||
; GFX8-NOOPT-NEXT: s_mov_b32 s2, 2
|
||||
@ -1194,13 +1248,16 @@ define amdgpu_kernel void @update_dppi64_imm_old_test(ptr addrspace(1) %arg, i64
|
||||
;
|
||||
; GFX8-NOOPT-LABEL: update_dppi64_imm_old_test:
|
||||
; GFX8-NOOPT: ; %bb.0:
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_mov_b32 s0, 3
|
||||
; GFX8-NOOPT-NEXT: v_lshlrev_b32_e64 v1, s0, v0
|
||||
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX8-NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
|
||||
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
|
||||
; GFX8-NOOPT-NEXT: v_ashrrev_i32_e64 v2, 31, v0
|
||||
; GFX8-NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
|
||||
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, v2
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_mov_b32 s0, 3
|
||||
; GFX8-NOOPT-NEXT: v_lshlrev_b64 v[1:2], s0, v[0:1]
|
||||
; GFX8-NOOPT-NEXT: s_mov_b32 s0, s2
|
||||
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, v1
|
||||
; GFX8-NOOPT-NEXT: s_mov_b32 s2, s3
|
||||
@ -1320,13 +1377,16 @@ define amdgpu_kernel void @update_dppf64_imm_old_test(ptr addrspace(1) %arg, dou
|
||||
;
|
||||
; GFX8-NOOPT-LABEL: update_dppf64_imm_old_test:
|
||||
; GFX8-NOOPT: ; %bb.0:
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_mov_b32 s0, 3
|
||||
; GFX8-NOOPT-NEXT: v_lshlrev_b32_e64 v1, s0, v0
|
||||
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX8-NOOPT-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
|
||||
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
|
||||
; GFX8-NOOPT-NEXT: v_ashrrev_i32_e64 v2, 31, v0
|
||||
; GFX8-NOOPT-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
|
||||
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v1, v2
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_mov_b32 s0, 3
|
||||
; GFX8-NOOPT-NEXT: v_lshlrev_b64 v[1:2], s0, v[0:1]
|
||||
; GFX8-NOOPT-NEXT: s_mov_b32 s0, s2
|
||||
; GFX8-NOOPT-NEXT: v_mov_b32_e32 v0, v1
|
||||
; GFX8-NOOPT-NEXT: s_mov_b32 s2, s3
|
||||
@ -1447,6 +1507,10 @@ define amdgpu_kernel void @update_dppi64_imm_src_test(ptr addrspace(1) %out, i64
|
||||
; GFX8-NOOPT-LABEL: update_dppi64_imm_src_test:
|
||||
; GFX8-NOOPT: ; %bb.0:
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1
|
||||
@ -1568,6 +1632,10 @@ define amdgpu_kernel void @update_dppf64_imm_src_test(ptr addrspace(1) %out, dou
|
||||
; GFX8-NOOPT-LABEL: update_dppf64_imm_src_test:
|
||||
; GFX8-NOOPT: ; %bb.0:
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_mov_b32 s8, s1
|
||||
@ -1686,6 +1754,12 @@ define amdgpu_kernel void @dpp_test_f32(ptr addrspace(1) %out, float %in1, float
|
||||
; GFX8-NOOPT: ; %bb.0:
|
||||
; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5]
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x30
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -1779,6 +1853,12 @@ define amdgpu_kernel void @dpp_test_f32_imm_comb1(ptr addrspace(1) %out, float %
|
||||
; GFX8-NOOPT: ; %bb.0:
|
||||
; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5]
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x30
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -1872,6 +1952,12 @@ define amdgpu_kernel void @dpp_test_f32_imm_comb2(ptr addrspace(1) %out, float %
|
||||
; GFX8-NOOPT: ; %bb.0:
|
||||
; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5]
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x30
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -1965,6 +2051,12 @@ define amdgpu_kernel void @dpp_test_f32_imm_comb3(ptr addrspace(1) %out, float %
|
||||
; GFX8-NOOPT: ; %bb.0:
|
||||
; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5]
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x30
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -2058,6 +2150,12 @@ define amdgpu_kernel void @dpp_test_f32_imm_comb4(ptr addrspace(1) %out, float %
|
||||
; GFX8-NOOPT: ; %bb.0:
|
||||
; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5]
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x30
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -2151,6 +2249,12 @@ define amdgpu_kernel void @dpp_test_f32_imm_comb5(ptr addrspace(1) %out, float %
|
||||
; GFX8-NOOPT: ; %bb.0:
|
||||
; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5]
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x30
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -2244,6 +2348,12 @@ define amdgpu_kernel void @dpp_test_f32_imm_comb6(ptr addrspace(1) %out, float %
|
||||
; GFX8-NOOPT: ; %bb.0:
|
||||
; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5]
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x30
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -2338,6 +2448,12 @@ define amdgpu_kernel void @dpp_test_f32_imm_comb7(ptr addrspace(1) %out, float %
|
||||
; GFX8-NOOPT: ; %bb.0:
|
||||
; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5]
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x30
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -2431,6 +2547,12 @@ define amdgpu_kernel void @dpp_test_f32_imm_comb8(ptr addrspace(1) %out, float %
|
||||
; GFX8-NOOPT: ; %bb.0:
|
||||
; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5]
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x30
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -2524,6 +2646,12 @@ define amdgpu_kernel void @dpp_test_v2i16(ptr addrspace(1) %out, <2 x i16> %in1,
|
||||
; GFX8-NOOPT: ; %bb.0:
|
||||
; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5]
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x30
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -2617,6 +2745,12 @@ define amdgpu_kernel void @dpp_test_v2i16_imm_comb1(ptr addrspace(1) %out, <2 x
|
||||
; GFX8-NOOPT: ; %bb.0:
|
||||
; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5]
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x30
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -2710,6 +2844,12 @@ define amdgpu_kernel void @dpp_test_v2i16_imm_comb2(ptr addrspace(1) %out, <2 x
|
||||
; GFX8-NOOPT: ; %bb.0:
|
||||
; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5]
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x30
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -2803,6 +2943,12 @@ define amdgpu_kernel void @dpp_test_v2i16_imm_comb3(ptr addrspace(1) %out, <2 x
|
||||
; GFX8-NOOPT: ; %bb.0:
|
||||
; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5]
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x30
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -2896,6 +3042,12 @@ define amdgpu_kernel void @dpp_test_v2i16_imm_comb4(ptr addrspace(1) %out, <2 x
|
||||
; GFX8-NOOPT: ; %bb.0:
|
||||
; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5]
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x30
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -2989,6 +3141,12 @@ define amdgpu_kernel void @dpp_test_v2i16_imm_comb5(ptr addrspace(1) %out, <2 x
|
||||
; GFX8-NOOPT: ; %bb.0:
|
||||
; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5]
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x30
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -3082,6 +3240,12 @@ define amdgpu_kernel void @dpp_test_v2i16_imm_comb6(ptr addrspace(1) %out, <2 x
|
||||
; GFX8-NOOPT: ; %bb.0:
|
||||
; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5]
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x30
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -3175,6 +3339,12 @@ define amdgpu_kernel void @dpp_test_v2i16_imm_comb7(ptr addrspace(1) %out, <2 x
|
||||
; GFX8-NOOPT: ; %bb.0:
|
||||
; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5]
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x30
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -3268,6 +3438,12 @@ define amdgpu_kernel void @dpp_test_v2i16_imm_comb8(ptr addrspace(1) %out, <2 x
|
||||
; GFX8-NOOPT: ; %bb.0:
|
||||
; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5]
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x30
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -3361,6 +3537,12 @@ define amdgpu_kernel void @dpp_test_v2f16(ptr addrspace(1) %out, <2 x half> %in1
|
||||
; GFX8-NOOPT: ; %bb.0:
|
||||
; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5]
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x30
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -3454,6 +3636,12 @@ define amdgpu_kernel void @dpp_test_v2f16_imm_comb1(ptr addrspace(1) %out, <2 x
|
||||
; GFX8-NOOPT: ; %bb.0:
|
||||
; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5]
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x30
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -3547,6 +3735,12 @@ define amdgpu_kernel void @dpp_test_v2f16_imm_comb2(ptr addrspace(1) %out, <2 x
|
||||
; GFX8-NOOPT: ; %bb.0:
|
||||
; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5]
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x30
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -3640,6 +3834,12 @@ define amdgpu_kernel void @dpp_test_v2f16_imm_comb3(ptr addrspace(1) %out, <2 x
|
||||
; GFX8-NOOPT: ; %bb.0:
|
||||
; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5]
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x30
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -3733,6 +3933,12 @@ define amdgpu_kernel void @dpp_test_v2f16_imm_comb4(ptr addrspace(1) %out, <2 x
|
||||
; GFX8-NOOPT: ; %bb.0:
|
||||
; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5]
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x30
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -3826,6 +4032,12 @@ define amdgpu_kernel void @dpp_test_v2f16_imm_comb5(ptr addrspace(1) %out, <2 x
|
||||
; GFX8-NOOPT: ; %bb.0:
|
||||
; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5]
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x30
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -3919,6 +4131,12 @@ define amdgpu_kernel void @dpp_test_v2f16_imm_comb6(ptr addrspace(1) %out, <2 x
|
||||
; GFX8-NOOPT: ; %bb.0:
|
||||
; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5]
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x30
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -4012,6 +4230,12 @@ define amdgpu_kernel void @dpp_test_v2f16_imm_comb7(ptr addrspace(1) %out, <2 x
|
||||
; GFX8-NOOPT: ; %bb.0:
|
||||
; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5]
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x30
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -4105,6 +4329,12 @@ define amdgpu_kernel void @dpp_test_v2f16_imm_comb8(ptr addrspace(1) %out, <2 x
|
||||
; GFX8-NOOPT: ; %bb.0:
|
||||
; GFX8-NOOPT-NEXT: s_mov_b64 s[2:3], s[4:5]
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s0, s[2:3], 0x30
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s5, s[2:3], 0x2c
|
||||
; GFX8-NOOPT-NEXT: s_load_dword s4, s[2:3], 0x30
|
||||
; GFX8-NOOPT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
|
||||
@ -4,9 +4,9 @@
|
||||
|
||||
; GCN-LABEL: {{^}}test_debug_value:
|
||||
; NOOPT: .loc 1 1 42 prologue_end ; /tmp/test_debug_value.cl:1:42
|
||||
; NOOPT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
|
||||
; NOOPT-NEXT: .Ltmp
|
||||
; NOOPT-NEXT: ;DEBUG_VALUE: test_debug_value:globalptr_arg <- $sgpr4_sgpr5
|
||||
; NOOPT: s_load_dwordx2 s[4:5], s[8:9], 0x0
|
||||
; NOOPT: .Ltmp
|
||||
; NOOPT: ;DEBUG_VALUE: test_debug_value:globalptr_arg <- $sgpr4_sgpr5
|
||||
|
||||
; GCN: flat_store_dword
|
||||
; GCN: s_endpgm
|
||||
|
||||
@ -7,6 +7,9 @@
|
||||
define amdgpu_kernel void @load_constant_v3i64(ptr addrspace(1) %dst, ptr addrspace(4) %src) #0 {
|
||||
; CHECK-LABEL: load_constant_v3i64:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8
|
||||
; CHECK-NEXT: v_mov_b32_e32 v4, 0
|
||||
; CHECK-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
|
||||
; CHECK-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
|
||||
@ -32,7 +35,11 @@ define amdgpu_kernel void @load_constant_v3i64(ptr addrspace(1) %dst, ptr addrsp
|
||||
define amdgpu_kernel void @load_global_v3i64(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 {
|
||||
; CHECK-LABEL: load_global_v3i64:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8
|
||||
; CHECK-NEXT: v_mov_b32_e32 v6, 0
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
|
||||
; CHECK-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x8
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -50,6 +57,9 @@ define amdgpu_kernel void @load_global_v3i64(ptr addrspace(1) %dst, ptr addrspac
|
||||
define amdgpu_kernel void @load_global_v3i64_invariant(ptr addrspace(1) %dst, ptr addrspace(1) %src) #0 {
|
||||
; CHECK-LABEL: load_global_v3i64_invariant:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8
|
||||
; CHECK-NEXT: v_mov_b32_e32 v4, 0
|
||||
; CHECK-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
|
||||
; CHECK-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -6,12 +6,16 @@
|
||||
define amdgpu_kernel void @flat_last_use_load_0(ptr %in, ptr %out) {
|
||||
; GFX12-LABEL: flat_last_use_load_0:
|
||||
; GFX12: ; %bb.0: ; %entry
|
||||
; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
|
||||
; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
|
||||
; GFX12-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
|
||||
; GFX12-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX12-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX12-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX12-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_LU
|
||||
; GFX12-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX12-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
@ -21,12 +25,16 @@ define amdgpu_kernel void @flat_last_use_load_0(ptr %in, ptr %out) {
|
||||
; GFX1250-LABEL: flat_last_use_load_0:
|
||||
; GFX1250: ; %bb.0: ; %entry
|
||||
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_LU
|
||||
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1]
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
entry:
|
||||
@ -39,17 +47,20 @@ define amdgpu_kernel void @flat_last_use_load_1(ptr %in, ptr %out) {
|
||||
; GFX12-LABEL: flat_last_use_load_1:
|
||||
; GFX12: ; %bb.0: ; %entry
|
||||
; GFX12-NEXT: s_mov_b64 s[0:1], s[4:5]
|
||||
; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
|
||||
; GFX12-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
|
||||
; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
|
||||
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
|
||||
; GFX12-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-NEXT: s_mov_b32 s2, 0x3ff
|
||||
; GFX12-NEXT: v_and_b32_e64 v0, v0, s2
|
||||
; GFX12-NEXT: v_ashrrev_i32_e64 v2, 31, v0
|
||||
; GFX12-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
|
||||
; GFX12-NEXT: v_mov_b32_e32 v1, v2
|
||||
; GFX12-NEXT: s_mov_b32 s2, 2
|
||||
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GFX12-NEXT: v_lshlrev_b32_e64 v1, s2, v0
|
||||
; GFX12-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX12-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
|
||||
; GFX12-NEXT: v_mov_b32_e32 v2, v0
|
||||
; GFX12-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-NEXT: v_lshlrev_b64_e64 v[1:2], s2, v[0:1]
|
||||
; GFX12-NEXT: s_mov_b32 s3, s4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v0, v1
|
||||
; GFX12-NEXT: s_mov_b32 s2, s5
|
||||
@ -71,15 +82,19 @@ define amdgpu_kernel void @flat_last_use_load_1(ptr %in, ptr %out) {
|
||||
; GFX1250: ; %bb.0: ; %entry
|
||||
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX1250-NEXT: s_wait_xcnt 0x0
|
||||
; GFX1250-NEXT: s_mov_b32 s4, 0x3ff
|
||||
; GFX1250-NEXT: v_and_b32_e64 v1, v1, s4
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: flat_load_b32 v1, v1, s[2:3] scale_offset th:TH_LOAD_LU
|
||||
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1]
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
entry:
|
||||
@ -93,15 +108,19 @@ entry:
|
||||
define amdgpu_kernel void @flat_last_use_and_volatile_load(ptr %in, ptr %out) {
|
||||
; GFX12-LABEL: flat_last_use_and_volatile_load:
|
||||
; GFX12: ; %bb.0: ; %entry
|
||||
; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
|
||||
; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
|
||||
; GFX12-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
|
||||
; GFX12-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX12-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX12-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX12-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_BYPASS scope:SCOPE_SYS
|
||||
; GFX12-NEXT: s_wait_bvhcnt 0x0
|
||||
; GFX12-NEXT: s_wait_samplecnt 0x0
|
||||
; GFX12-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX12-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX12-NEXT: s_wait_dscnt 0x0
|
||||
@ -111,13 +130,17 @@ define amdgpu_kernel void @flat_last_use_and_volatile_load(ptr %in, ptr %out) {
|
||||
; GFX1250-LABEL: flat_last_use_and_volatile_load:
|
||||
; GFX1250: ; %bb.0: ; %entry
|
||||
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: s_wait_xcnt 0x0
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_BYPASS scope:SCOPE_SYS
|
||||
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1]
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
entry:
|
||||
@ -129,12 +152,16 @@ entry:
|
||||
define amdgpu_kernel void @flat_last_use_and_nontemporal_load(ptr %in, ptr %out) {
|
||||
; GFX12-LABEL: flat_last_use_and_nontemporal_load:
|
||||
; GFX12: ; %bb.0: ; %entry
|
||||
; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
|
||||
; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
|
||||
; GFX12-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
|
||||
; GFX12-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX12-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX12-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX12-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_LU
|
||||
; GFX12-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX12-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
@ -144,12 +171,16 @@ define amdgpu_kernel void @flat_last_use_and_nontemporal_load(ptr %in, ptr %out)
|
||||
; GFX1250-LABEL: flat_last_use_and_nontemporal_load:
|
||||
; GFX1250: ; %bb.0: ; %entry
|
||||
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: flat_load_b32 v1, v0, s[2:3] th:TH_LOAD_LU
|
||||
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: flat_store_b32 v0, v1, s[0:1]
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
entry:
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -6,10 +6,13 @@
|
||||
define amdgpu_kernel void @global_last_use_load_0(ptr addrspace(1) %in, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: global_last_use_load_0:
|
||||
; GFX12: ; %bb.0: ; %entry
|
||||
; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
|
||||
; GFX12-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
|
||||
; GFX12-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX12-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX12-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
|
||||
; GFX12-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-NEXT: v_mov_b32_e32 v1, s2
|
||||
@ -19,10 +22,13 @@ define amdgpu_kernel void @global_last_use_load_0(ptr addrspace(1) %in, ptr addr
|
||||
; GFX1250-LABEL: global_last_use_load_0:
|
||||
; GFX1250: ; %bb.0: ; %entry
|
||||
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v1, s2
|
||||
@ -37,17 +43,34 @@ entry:
|
||||
define amdgpu_kernel void @global_last_use_load_1(ptr addrspace(1) %in, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: global_last_use_load_1:
|
||||
; GFX12: ; %bb.0: ; %entry
|
||||
; GFX12-NEXT: s_mov_b64 s[0:1], s[4:5]
|
||||
; GFX12-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
|
||||
; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX12-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX12-NEXT: s_mov_b32 s4, 0x3ff
|
||||
; GFX12-NEXT: v_and_b32_e64 v1, v1, s4
|
||||
; GFX12-NEXT: s_mov_b32 s4, 2
|
||||
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GFX12-NEXT: v_lshlrev_b32_e64 v1, s4, v1
|
||||
; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
|
||||
; GFX12-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-NEXT: global_load_b32 v1, v1, s[2:3] th:TH_LOAD_LU
|
||||
; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
|
||||
; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
|
||||
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
|
||||
; GFX12-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX12-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-NEXT: s_mov_b32 s2, 0x3ff
|
||||
; GFX12-NEXT: v_and_b32_e64 v1, v1, s2
|
||||
; GFX12-NEXT: v_ashrrev_i32_e64 v3, 31, v1
|
||||
; GFX12-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
|
||||
; GFX12-NEXT: v_mov_b32_e32 v2, v3
|
||||
; GFX12-NEXT: s_mov_b32 s2, 2
|
||||
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GFX12-NEXT: v_lshlrev_b64_e64 v[2:3], s2, v[1:2]
|
||||
; GFX12-NEXT: s_mov_b32 s3, s4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v1, v2
|
||||
; GFX12-NEXT: s_mov_b32 s2, s5
|
||||
; GFX12-NEXT: v_mov_b32_e32 v2, v3
|
||||
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GFX12-NEXT: v_add_co_u32 v1, s3, s3, v1
|
||||
; GFX12-NEXT: s_wait_alu depctr_va_sdst(0)
|
||||
; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s2, s2, v2, s3
|
||||
; GFX12-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
|
||||
; GFX12-NEXT: v_mov_b32_e32 v2, v3
|
||||
; GFX12-NEXT: global_load_b32 v1, v[1:2], off th:TH_LOAD_LU
|
||||
; GFX12-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
|
||||
; GFX12-NEXT: s_endpgm
|
||||
@ -56,15 +79,19 @@ define amdgpu_kernel void @global_last_use_load_1(ptr addrspace(1) %in, ptr addr
|
||||
; GFX1250: ; %bb.0: ; %entry
|
||||
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX1250-NEXT: s_wait_xcnt 0x0
|
||||
; GFX1250-NEXT: s_mov_b32 s4, 0x3ff
|
||||
; GFX1250-NEXT: v_and_b32_e64 v1, v1, s4
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: global_load_b32 v1, v1, s[2:3] scale_offset th:TH_LOAD_LU
|
||||
; GFX1250-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
entry:
|
||||
@ -78,27 +105,35 @@ entry:
|
||||
define amdgpu_kernel void @global_last_use_and_volatile_load(ptr addrspace(1) %in, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: global_last_use_and_volatile_load:
|
||||
; GFX12: ; %bb.0: ; %entry
|
||||
; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
|
||||
; GFX12-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX12-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
|
||||
; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX12-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_BYPASS scope:SCOPE_SYS
|
||||
; GFX12-NEXT: s_wait_bvhcnt 0x0
|
||||
; GFX12-NEXT: s_wait_samplecnt 0x0
|
||||
; GFX12-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
|
||||
; GFX12-NEXT: s_endpgm
|
||||
;
|
||||
; GFX1250-LABEL: global_last_use_and_volatile_load:
|
||||
; GFX1250: ; %bb.0: ; %entry
|
||||
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: s_wait_xcnt 0x0
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: global_load_b32 v1, v0, s[2:3] th:TH_LOAD_BYPASS scope:SCOPE_SYS
|
||||
; GFX1250-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
entry:
|
||||
@ -110,17 +145,34 @@ entry:
|
||||
define amdgpu_kernel void @global_last_use_and_nontemporal_load(ptr addrspace(1) %in, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: global_last_use_and_nontemporal_load:
|
||||
; GFX12: ; %bb.0: ; %entry
|
||||
; GFX12-NEXT: s_mov_b64 s[0:1], s[4:5]
|
||||
; GFX12-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
|
||||
; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX12-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX12-NEXT: s_mov_b32 s4, 0x3ff
|
||||
; GFX12-NEXT: v_and_b32_e64 v1, v1, s4
|
||||
; GFX12-NEXT: s_mov_b32 s4, 2
|
||||
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GFX12-NEXT: v_lshlrev_b32_e64 v1, s4, v1
|
||||
; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
|
||||
; GFX12-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-NEXT: global_load_b32 v1, v1, s[2:3] th:TH_LOAD_LU
|
||||
; GFX12-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
|
||||
; GFX12-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
|
||||
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
|
||||
; GFX12-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX12-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-NEXT: s_mov_b32 s2, 0x3ff
|
||||
; GFX12-NEXT: v_and_b32_e64 v1, v1, s2
|
||||
; GFX12-NEXT: v_ashrrev_i32_e64 v3, 31, v1
|
||||
; GFX12-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
|
||||
; GFX12-NEXT: v_mov_b32_e32 v2, v3
|
||||
; GFX12-NEXT: s_mov_b32 s2, 2
|
||||
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GFX12-NEXT: v_lshlrev_b64_e64 v[2:3], s2, v[1:2]
|
||||
; GFX12-NEXT: s_mov_b32 s3, s4
|
||||
; GFX12-NEXT: v_mov_b32_e32 v1, v2
|
||||
; GFX12-NEXT: s_mov_b32 s2, s5
|
||||
; GFX12-NEXT: v_mov_b32_e32 v2, v3
|
||||
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GFX12-NEXT: v_add_co_u32 v1, s3, s3, v1
|
||||
; GFX12-NEXT: s_wait_alu depctr_va_sdst(0)
|
||||
; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s2, s2, v2, s3
|
||||
; GFX12-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
|
||||
; GFX12-NEXT: v_mov_b32_e32 v2, v3
|
||||
; GFX12-NEXT: global_load_b32 v1, v[1:2], off th:TH_LOAD_LU
|
||||
; GFX12-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
|
||||
; GFX12-NEXT: s_endpgm
|
||||
@ -129,15 +181,19 @@ define amdgpu_kernel void @global_last_use_and_nontemporal_load(ptr addrspace(1)
|
||||
; GFX1250: ; %bb.0: ; %entry
|
||||
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX1250-NEXT: s_wait_xcnt 0x0
|
||||
; GFX1250-NEXT: s_mov_b32 s4, 0x3ff
|
||||
; GFX1250-NEXT: v_and_b32_e64 v1, v1, s4
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: global_load_b32 v1, v1, s[2:3] scale_offset th:TH_LOAD_LU
|
||||
; GFX1250-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
entry:
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -19,6 +19,7 @@ define amdgpu_kernel void @local_nontemporal_load_0(
|
||||
; GFX6: ; %bb.0: ; %entry
|
||||
; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9]
|
||||
; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0
|
||||
; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2
|
||||
; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr8
|
||||
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -42,24 +43,30 @@ define amdgpu_kernel void @local_nontemporal_load_0(
|
||||
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
|
||||
; GFX7-NEXT: s_add_i32 s12, s12, s17
|
||||
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
||||
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
|
||||
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
|
||||
; GFX7-NEXT: s_mov_b32 m0, -1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s6
|
||||
; GFX7-NEXT: ds_read_b32 v2, v0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-WGP-LABEL: local_nontemporal_load_0:
|
||||
; GFX10-WGP: ; %bb.0: ; %entry
|
||||
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
|
||||
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6
|
||||
; GFX10-WGP-NEXT: ds_read_b32 v1, v1
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -68,10 +75,13 @@ define amdgpu_kernel void @local_nontemporal_load_0(
|
||||
;
|
||||
; GFX10-CU-LABEL: local_nontemporal_load_0:
|
||||
; GFX10-CU: ; %bb.0: ; %entry
|
||||
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
|
||||
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
|
||||
; GFX10-CU-NEXT: ds_read_b32 v1, v1
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -81,6 +91,9 @@ define amdgpu_kernel void @local_nontemporal_load_0(
|
||||
; SKIP-CACHE-INV-LABEL: local_nontemporal_load_0:
|
||||
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5]
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -101,10 +114,13 @@ define amdgpu_kernel void @local_nontemporal_load_0(
|
||||
;
|
||||
; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_load_0:
|
||||
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
|
||||
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
|
||||
; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v1
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -113,10 +129,13 @@ define amdgpu_kernel void @local_nontemporal_load_0(
|
||||
;
|
||||
; GFX90A-TGSPLIT-LABEL: local_nontemporal_load_0:
|
||||
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
||||
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
|
||||
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
|
||||
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0
|
||||
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
|
||||
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
|
||||
; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v1
|
||||
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -125,10 +144,13 @@ define amdgpu_kernel void @local_nontemporal_load_0(
|
||||
;
|
||||
; GFX942-NOTTGSPLIT-LABEL: local_nontemporal_load_0:
|
||||
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
|
||||
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
|
||||
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
||||
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
|
||||
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
||||
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v1
|
||||
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -137,10 +159,13 @@ define amdgpu_kernel void @local_nontemporal_load_0(
|
||||
;
|
||||
; GFX942-TGSPLIT-LABEL: local_nontemporal_load_0:
|
||||
; GFX942-TGSPLIT: ; %bb.0: ; %entry
|
||||
; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
|
||||
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
||||
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
|
||||
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
||||
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v1
|
||||
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -149,10 +174,13 @@ define amdgpu_kernel void @local_nontemporal_load_0(
|
||||
;
|
||||
; GFX11-WGP-LABEL: local_nontemporal_load_0:
|
||||
; GFX11-WGP: ; %bb.0: ; %entry
|
||||
; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
|
||||
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
|
||||
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX11-WGP-NEXT: ds_load_b32 v1, v1
|
||||
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -161,10 +189,13 @@ define amdgpu_kernel void @local_nontemporal_load_0(
|
||||
;
|
||||
; GFX11-CU-LABEL: local_nontemporal_load_0:
|
||||
; GFX11-CU: ; %bb.0: ; %entry
|
||||
; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
|
||||
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
|
||||
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX11-CU-NEXT: ds_load_b32 v1, v1
|
||||
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -173,38 +204,50 @@ define amdgpu_kernel void @local_nontemporal_load_0(
|
||||
;
|
||||
; GFX12-WGP-LABEL: local_nontemporal_load_0:
|
||||
; GFX12-WGP: ; %bb.0: ; %entry
|
||||
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
|
||||
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
|
||||
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX12-WGP-NEXT: ds_load_b32 v1, v1
|
||||
; GFX12-WGP-NEXT: s_wait_dscnt 0x0
|
||||
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
|
||||
; GFX12-WGP-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-CU-LABEL: local_nontemporal_load_0:
|
||||
; GFX12-CU: ; %bb.0: ; %entry
|
||||
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
|
||||
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
|
||||
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX12-CU-NEXT: ds_load_b32 v1, v1
|
||||
; GFX12-CU-NEXT: s_wait_dscnt 0x0
|
||||
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
|
||||
; GFX12-CU-NEXT: s_endpgm
|
||||
;
|
||||
; GFX1250-LABEL: local_nontemporal_load_0:
|
||||
; GFX1250: ; %bb.0: ; %entry
|
||||
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX1250-NEXT: ds_load_b32 v1, v1
|
||||
; GFX1250-NEXT: s_wait_dscnt 0x0
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
ptr addrspace(3) %in, ptr addrspace(1) %out) {
|
||||
@ -219,6 +262,7 @@ define amdgpu_kernel void @local_nontemporal_load_1(
|
||||
; GFX6: ; %bb.0: ; %entry
|
||||
; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9]
|
||||
; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0
|
||||
; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2
|
||||
; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr8
|
||||
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -244,27 +288,33 @@ define amdgpu_kernel void @local_nontemporal_load_1(
|
||||
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
|
||||
; GFX7-NEXT: s_add_i32 s12, s12, s17
|
||||
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
||||
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
|
||||
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
|
||||
; GFX7-NEXT: s_mov_b32 s7, 2
|
||||
; GFX7-NEXT: v_lshlrev_b32_e64 v0, s7, v0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_add_i32_e64 v0, s[6:7], s6, v0
|
||||
; GFX7-NEXT: s_mov_b32 m0, -1
|
||||
; GFX7-NEXT: ds_read_b32 v2, v0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-WGP-LABEL: local_nontemporal_load_1:
|
||||
; GFX10-WGP: ; %bb.0: ; %entry
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
|
||||
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s6
|
||||
; GFX10-WGP-NEXT: ds_read_b32 v1, v1
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -274,10 +324,13 @@ define amdgpu_kernel void @local_nontemporal_load_1(
|
||||
; GFX10-CU-LABEL: local_nontemporal_load_1:
|
||||
; GFX10-CU: ; %bb.0: ; %entry
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
|
||||
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s6
|
||||
; GFX10-CU-NEXT: ds_read_b32 v1, v1
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -287,6 +340,9 @@ define amdgpu_kernel void @local_nontemporal_load_1(
|
||||
; SKIP-CACHE-INV-LABEL: local_nontemporal_load_1:
|
||||
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5]
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -310,12 +366,15 @@ define amdgpu_kernel void @local_nontemporal_load_1(
|
||||
; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_load_1:
|
||||
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
||||
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
|
||||
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b32 s7, 0x3ff
|
||||
; GFX90A-NOTTGSPLIT-NEXT: v_and_b32_e64 v1, v1, s7
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v1, 2, s6
|
||||
; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v1
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -325,12 +384,15 @@ define amdgpu_kernel void @local_nontemporal_load_1(
|
||||
; GFX90A-TGSPLIT-LABEL: local_nontemporal_load_1:
|
||||
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
||||
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
|
||||
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
|
||||
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0
|
||||
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
|
||||
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX90A-TGSPLIT-NEXT: s_mov_b32 s7, 0x3ff
|
||||
; GFX90A-TGSPLIT-NEXT: v_and_b32_e64 v1, v1, s7
|
||||
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v1, v1, 2, s6
|
||||
; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v1
|
||||
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -340,12 +402,15 @@ define amdgpu_kernel void @local_nontemporal_load_1(
|
||||
; GFX942-NOTTGSPLIT-LABEL: local_nontemporal_load_1:
|
||||
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
|
||||
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
|
||||
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
||||
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
|
||||
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
||||
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX942-NOTTGSPLIT-NEXT: s_mov_b32 s3, 0x3ff
|
||||
; GFX942-NOTTGSPLIT-NEXT: v_and_b32_e64 v1, v1, s3
|
||||
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX942-NOTTGSPLIT-NEXT: v_lshl_add_u32 v1, v1, 2, s2
|
||||
; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v1
|
||||
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -355,12 +420,15 @@ define amdgpu_kernel void @local_nontemporal_load_1(
|
||||
; GFX942-TGSPLIT-LABEL: local_nontemporal_load_1:
|
||||
; GFX942-TGSPLIT: ; %bb.0: ; %entry
|
||||
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
|
||||
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
||||
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
|
||||
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
||||
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX942-TGSPLIT-NEXT: s_mov_b32 s3, 0x3ff
|
||||
; GFX942-TGSPLIT-NEXT: v_and_b32_e64 v1, v1, s3
|
||||
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX942-TGSPLIT-NEXT: v_lshl_add_u32 v1, v1, 2, s2
|
||||
; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v1
|
||||
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -370,12 +438,15 @@ define amdgpu_kernel void @local_nontemporal_load_1(
|
||||
; GFX11-WGP-LABEL: local_nontemporal_load_1:
|
||||
; GFX11-WGP: ; %bb.0: ; %entry
|
||||
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
|
||||
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
|
||||
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX11-WGP-NEXT: s_mov_b32 s3, 0x3ff
|
||||
; GFX11-WGP-NEXT: v_and_b32_e64 v1, v1, s3
|
||||
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s2
|
||||
; GFX11-WGP-NEXT: ds_load_b32 v1, v1
|
||||
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -385,12 +456,15 @@ define amdgpu_kernel void @local_nontemporal_load_1(
|
||||
; GFX11-CU-LABEL: local_nontemporal_load_1:
|
||||
; GFX11-CU: ; %bb.0: ; %entry
|
||||
; GFX11-CU-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
|
||||
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
|
||||
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX11-CU-NEXT: s_mov_b32 s3, 0x3ff
|
||||
; GFX11-CU-NEXT: v_and_b32_e64 v1, v1, s3
|
||||
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s2
|
||||
; GFX11-CU-NEXT: ds_load_b32 v1, v1
|
||||
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -400,30 +474,38 @@ define amdgpu_kernel void @local_nontemporal_load_1(
|
||||
; GFX12-WGP-LABEL: local_nontemporal_load_1:
|
||||
; GFX12-WGP: ; %bb.0: ; %entry
|
||||
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
|
||||
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
|
||||
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX12-WGP-NEXT: s_mov_b32 s3, 0x3ff
|
||||
; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s3
|
||||
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s2
|
||||
; GFX12-WGP-NEXT: ds_load_b32 v1, v1
|
||||
; GFX12-WGP-NEXT: s_wait_dscnt 0x0
|
||||
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
|
||||
; GFX12-WGP-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-CU-LABEL: local_nontemporal_load_1:
|
||||
; GFX12-CU: ; %bb.0: ; %entry
|
||||
; GFX12-CU-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
|
||||
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
|
||||
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX12-CU-NEXT: s_mov_b32 s3, 0x3ff
|
||||
; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s3
|
||||
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s2
|
||||
; GFX12-CU-NEXT: ds_load_b32 v1, v1
|
||||
; GFX12-CU-NEXT: s_wait_dscnt 0x0
|
||||
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
|
||||
; GFX12-CU-NEXT: s_endpgm
|
||||
;
|
||||
@ -431,15 +513,19 @@ define amdgpu_kernel void @local_nontemporal_load_1(
|
||||
; GFX1250: ; %bb.0: ; %entry
|
||||
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX1250-NEXT: s_mov_b32 s3, 0x3ff
|
||||
; GFX1250-NEXT: v_and_b32_e64 v1, v1, s3
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: v_lshl_add_u32 v1, v1, 2, s2
|
||||
; GFX1250-NEXT: ds_load_b32 v1, v1
|
||||
; GFX1250-NEXT: s_wait_dscnt 0x0
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
ptr addrspace(3) %in, ptr addrspace(1) %out) {
|
||||
@ -454,6 +540,8 @@ entry:
|
||||
define amdgpu_kernel void @local_nontemporal_store_0(
|
||||
; GFX6-LABEL: local_nontemporal_store_0:
|
||||
; GFX6: ; %bb.0: ; %entry
|
||||
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2
|
||||
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5
|
||||
; GFX6-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
|
||||
@ -468,6 +556,9 @@ define amdgpu_kernel void @local_nontemporal_store_0(
|
||||
;
|
||||
; GFX7-LABEL: local_nontemporal_store_0:
|
||||
; GFX7: ; %bb.0: ; %entry
|
||||
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
|
||||
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
|
||||
; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -481,6 +572,9 @@ define amdgpu_kernel void @local_nontemporal_store_0(
|
||||
;
|
||||
; GFX10-WGP-LABEL: local_nontemporal_store_0:
|
||||
; GFX10-WGP: ; %bb.0: ; %entry
|
||||
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
|
||||
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
|
||||
; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -493,6 +587,9 @@ define amdgpu_kernel void @local_nontemporal_store_0(
|
||||
;
|
||||
; GFX10-CU-LABEL: local_nontemporal_store_0:
|
||||
; GFX10-CU: ; %bb.0: ; %entry
|
||||
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
|
||||
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
|
||||
; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -505,6 +602,9 @@ define amdgpu_kernel void @local_nontemporal_store_0(
|
||||
;
|
||||
; SKIP-CACHE-INV-LABEL: local_nontemporal_store_0:
|
||||
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -518,6 +618,9 @@ define amdgpu_kernel void @local_nontemporal_store_0(
|
||||
;
|
||||
; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_store_0:
|
||||
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -530,6 +633,9 @@ define amdgpu_kernel void @local_nontemporal_store_0(
|
||||
;
|
||||
; GFX90A-TGSPLIT-LABEL: local_nontemporal_store_0:
|
||||
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
||||
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
|
||||
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
|
||||
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
|
||||
; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8
|
||||
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -542,6 +648,9 @@ define amdgpu_kernel void @local_nontemporal_store_0(
|
||||
;
|
||||
; GFX942-NOTTGSPLIT-LABEL: local_nontemporal_store_0:
|
||||
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
|
||||
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8
|
||||
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
|
||||
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8
|
||||
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -554,6 +663,9 @@ define amdgpu_kernel void @local_nontemporal_store_0(
|
||||
;
|
||||
; GFX942-TGSPLIT-LABEL: local_nontemporal_store_0:
|
||||
; GFX942-TGSPLIT: ; %bb.0: ; %entry
|
||||
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8
|
||||
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
|
||||
; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8
|
||||
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -566,6 +678,9 @@ define amdgpu_kernel void @local_nontemporal_store_0(
|
||||
;
|
||||
; GFX11-WGP-LABEL: local_nontemporal_store_0:
|
||||
; GFX11-WGP: ; %bb.0: ; %entry
|
||||
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
|
||||
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
|
||||
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
|
||||
; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
|
||||
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -578,6 +693,9 @@ define amdgpu_kernel void @local_nontemporal_store_0(
|
||||
;
|
||||
; GFX11-CU-LABEL: local_nontemporal_store_0:
|
||||
; GFX11-CU: ; %bb.0: ; %entry
|
||||
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
|
||||
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
|
||||
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
|
||||
; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
|
||||
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -590,6 +708,9 @@ define amdgpu_kernel void @local_nontemporal_store_0(
|
||||
;
|
||||
; GFX12-WGP-LABEL: local_nontemporal_store_0:
|
||||
; GFX12-WGP: ; %bb.0: ; %entry
|
||||
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
|
||||
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
|
||||
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
|
||||
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
|
||||
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
|
||||
@ -602,6 +723,9 @@ define amdgpu_kernel void @local_nontemporal_store_0(
|
||||
;
|
||||
; GFX12-CU-LABEL: local_nontemporal_store_0:
|
||||
; GFX12-CU: ; %bb.0: ; %entry
|
||||
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
|
||||
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
|
||||
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
|
||||
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
|
||||
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
|
||||
@ -615,6 +739,9 @@ define amdgpu_kernel void @local_nontemporal_store_0(
|
||||
; GFX1250-LABEL: local_nontemporal_store_0:
|
||||
; GFX1250: ; %bb.0: ; %entry
|
||||
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
@ -634,6 +761,8 @@ entry:
|
||||
define amdgpu_kernel void @local_nontemporal_store_1(
|
||||
; GFX6-LABEL: local_nontemporal_store_1:
|
||||
; GFX6: ; %bb.0: ; %entry
|
||||
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: s_load_dword s5, s[8:9], 0x2
|
||||
; GFX6-NEXT: ; kill: def $sgpr4 killed $sgpr5
|
||||
; GFX6-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
|
||||
@ -650,6 +779,9 @@ define amdgpu_kernel void @local_nontemporal_store_1(
|
||||
;
|
||||
; GFX7-LABEL: local_nontemporal_store_1:
|
||||
; GFX7: ; %bb.0: ; %entry
|
||||
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
|
||||
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
|
||||
; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -665,6 +797,9 @@ define amdgpu_kernel void @local_nontemporal_store_1(
|
||||
;
|
||||
; GFX10-WGP-LABEL: local_nontemporal_store_1:
|
||||
; GFX10-WGP: ; %bb.0: ; %entry
|
||||
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
|
||||
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
|
||||
; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -677,6 +812,9 @@ define amdgpu_kernel void @local_nontemporal_store_1(
|
||||
;
|
||||
; GFX10-CU-LABEL: local_nontemporal_store_1:
|
||||
; GFX10-CU: ; %bb.0: ; %entry
|
||||
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
|
||||
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
|
||||
; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -689,6 +827,9 @@ define amdgpu_kernel void @local_nontemporal_store_1(
|
||||
;
|
||||
; SKIP-CACHE-INV-LABEL: local_nontemporal_store_1:
|
||||
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -704,6 +845,9 @@ define amdgpu_kernel void @local_nontemporal_store_1(
|
||||
;
|
||||
; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_store_1:
|
||||
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -718,6 +862,9 @@ define amdgpu_kernel void @local_nontemporal_store_1(
|
||||
;
|
||||
; GFX90A-TGSPLIT-LABEL: local_nontemporal_store_1:
|
||||
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
||||
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
|
||||
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
|
||||
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
|
||||
; GFX90A-TGSPLIT-NEXT: s_load_dword s5, s[8:9], 0x8
|
||||
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -732,6 +879,9 @@ define amdgpu_kernel void @local_nontemporal_store_1(
|
||||
;
|
||||
; GFX942-NOTTGSPLIT-LABEL: local_nontemporal_store_1:
|
||||
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
|
||||
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8
|
||||
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
|
||||
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8
|
||||
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -746,6 +896,9 @@ define amdgpu_kernel void @local_nontemporal_store_1(
|
||||
;
|
||||
; GFX942-TGSPLIT-LABEL: local_nontemporal_store_1:
|
||||
; GFX942-TGSPLIT: ; %bb.0: ; %entry
|
||||
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8
|
||||
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
|
||||
; GFX942-TGSPLIT-NEXT: s_load_dword s1, s[4:5], 0x8
|
||||
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -760,6 +913,9 @@ define amdgpu_kernel void @local_nontemporal_store_1(
|
||||
;
|
||||
; GFX11-WGP-LABEL: local_nontemporal_store_1:
|
||||
; GFX11-WGP: ; %bb.0: ; %entry
|
||||
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
|
||||
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
|
||||
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
|
||||
; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
|
||||
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -774,6 +930,9 @@ define amdgpu_kernel void @local_nontemporal_store_1(
|
||||
;
|
||||
; GFX11-CU-LABEL: local_nontemporal_store_1:
|
||||
; GFX11-CU: ; %bb.0: ; %entry
|
||||
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
|
||||
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
|
||||
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
|
||||
; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
|
||||
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -788,6 +947,9 @@ define amdgpu_kernel void @local_nontemporal_store_1(
|
||||
;
|
||||
; GFX12-WGP-LABEL: local_nontemporal_store_1:
|
||||
; GFX12-WGP: ; %bb.0: ; %entry
|
||||
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
|
||||
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
|
||||
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
|
||||
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
|
||||
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
|
||||
@ -802,6 +964,9 @@ define amdgpu_kernel void @local_nontemporal_store_1(
|
||||
;
|
||||
; GFX12-CU-LABEL: local_nontemporal_store_1:
|
||||
; GFX12-CU: ; %bb.0: ; %entry
|
||||
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
|
||||
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
|
||||
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
|
||||
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
|
||||
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
|
||||
@ -817,6 +982,9 @@ define amdgpu_kernel void @local_nontemporal_store_1(
|
||||
; GFX1250-LABEL: local_nontemporal_store_1:
|
||||
; GFX1250: ; %bb.0: ; %entry
|
||||
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
@ -843,6 +1011,7 @@ define amdgpu_kernel void @local_nontemporal_volatile_load(
|
||||
; GFX6: ; %bb.0: ; %entry
|
||||
; GFX6-NEXT: s_mov_b64 s[4:5], s[8:9]
|
||||
; GFX6-NEXT: s_load_dword s8, s[4:5], 0x0
|
||||
; GFX6-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x2
|
||||
; GFX6-NEXT: ; kill: def $sgpr6 killed $sgpr8
|
||||
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -866,24 +1035,30 @@ define amdgpu_kernel void @local_nontemporal_volatile_load(
|
||||
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
|
||||
; GFX7-NEXT: s_add_i32 s12, s12, s17
|
||||
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
||||
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
|
||||
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
|
||||
; GFX7-NEXT: s_mov_b32 m0, -1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s6
|
||||
; GFX7-NEXT: ds_read_b32 v2, v0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-WGP-LABEL: local_nontemporal_volatile_load:
|
||||
; GFX10-WGP: ; %bb.0: ; %entry
|
||||
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
|
||||
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6
|
||||
; GFX10-WGP-NEXT: ds_read_b32 v1, v1
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -892,10 +1067,13 @@ define amdgpu_kernel void @local_nontemporal_volatile_load(
|
||||
;
|
||||
; GFX10-CU-LABEL: local_nontemporal_volatile_load:
|
||||
; GFX10-CU: ; %bb.0: ; %entry
|
||||
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
|
||||
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
|
||||
; GFX10-CU-NEXT: ds_read_b32 v1, v1
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -905,6 +1083,9 @@ define amdgpu_kernel void @local_nontemporal_volatile_load(
|
||||
; SKIP-CACHE-INV-LABEL: local_nontemporal_volatile_load:
|
||||
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5]
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -925,10 +1106,13 @@ define amdgpu_kernel void @local_nontemporal_volatile_load(
|
||||
;
|
||||
; GFX90A-NOTTGSPLIT-LABEL: local_nontemporal_volatile_load:
|
||||
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
|
||||
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s6
|
||||
; GFX90A-NOTTGSPLIT-NEXT: ds_read_b32 v1, v1
|
||||
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -937,10 +1121,13 @@ define amdgpu_kernel void @local_nontemporal_volatile_load(
|
||||
;
|
||||
; GFX90A-TGSPLIT-LABEL: local_nontemporal_volatile_load:
|
||||
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
|
||||
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
|
||||
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
|
||||
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0
|
||||
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
|
||||
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s6
|
||||
; GFX90A-TGSPLIT-NEXT: ds_read_b32 v1, v1
|
||||
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -949,10 +1136,13 @@ define amdgpu_kernel void @local_nontemporal_volatile_load(
|
||||
;
|
||||
; GFX942-NOTTGSPLIT-LABEL: local_nontemporal_volatile_load:
|
||||
; GFX942-NOTTGSPLIT: ; %bb.0: ; %entry
|
||||
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
|
||||
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
||||
; GFX942-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
|
||||
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX942-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
||||
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX942-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX942-NOTTGSPLIT-NEXT: ds_read_b32 v1, v1
|
||||
; GFX942-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -961,10 +1151,13 @@ define amdgpu_kernel void @local_nontemporal_volatile_load(
|
||||
;
|
||||
; GFX942-TGSPLIT-LABEL: local_nontemporal_volatile_load:
|
||||
; GFX942-TGSPLIT: ; %bb.0: ; %entry
|
||||
; GFX942-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
|
||||
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
||||
; GFX942-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x0
|
||||
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX942-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
||||
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX942-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX942-TGSPLIT-NEXT: ds_read_b32 v1, v1
|
||||
; GFX942-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -973,10 +1166,13 @@ define amdgpu_kernel void @local_nontemporal_volatile_load(
|
||||
;
|
||||
; GFX11-WGP-LABEL: local_nontemporal_volatile_load:
|
||||
; GFX11-WGP: ; %bb.0: ; %entry
|
||||
; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
|
||||
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
|
||||
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX11-WGP-NEXT: ds_load_b32 v1, v1
|
||||
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -985,10 +1181,13 @@ define amdgpu_kernel void @local_nontemporal_volatile_load(
|
||||
;
|
||||
; GFX11-CU-LABEL: local_nontemporal_volatile_load:
|
||||
; GFX11-CU: ; %bb.0: ; %entry
|
||||
; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
|
||||
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
|
||||
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX11-CU-NEXT: ds_load_b32 v1, v1
|
||||
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -997,38 +1196,50 @@ define amdgpu_kernel void @local_nontemporal_volatile_load(
|
||||
;
|
||||
; GFX12-WGP-LABEL: local_nontemporal_volatile_load:
|
||||
; GFX12-WGP: ; %bb.0: ; %entry
|
||||
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
|
||||
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
|
||||
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX12-WGP-NEXT: ds_load_b32 v1, v1
|
||||
; GFX12-WGP-NEXT: s_wait_dscnt 0x0
|
||||
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
|
||||
; GFX12-WGP-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-CU-LABEL: local_nontemporal_volatile_load:
|
||||
; GFX12-CU: ; %bb.0: ; %entry
|
||||
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
|
||||
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
|
||||
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX12-CU-NEXT: ds_load_b32 v1, v1
|
||||
; GFX12-CU-NEXT: s_wait_dscnt 0x0
|
||||
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
|
||||
; GFX12-CU-NEXT: s_endpgm
|
||||
;
|
||||
; GFX1250-LABEL: local_nontemporal_volatile_load:
|
||||
; GFX1250: ; %bb.0: ; %entry
|
||||
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX1250-NEXT: ds_load_b32 v1, v1
|
||||
; GFX1250-NEXT: s_wait_dscnt 0x0
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
ptr addrspace(3) %in, ptr addrspace(1) %out) {
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -15,6 +15,7 @@ define amdgpu_kernel void @local_volatile_load_0(
|
||||
; GFX6: ; %bb.0: ; %entry
|
||||
; GFX6-NEXT: s_mov_b64 s[0:1], s[4:5]
|
||||
; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9
|
||||
; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb
|
||||
; GFX6-NEXT: ; kill: def $sgpr2 killed $sgpr4
|
||||
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -38,24 +39,30 @@ define amdgpu_kernel void @local_volatile_load_0(
|
||||
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
|
||||
; GFX7-NEXT: s_add_i32 s12, s12, s17
|
||||
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
||||
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
|
||||
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
|
||||
; GFX7-NEXT: s_mov_b32 m0, -1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s6
|
||||
; GFX7-NEXT: ds_read_b32 v2, v0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-WGP-LABEL: local_volatile_load_0:
|
||||
; GFX10-WGP: ; %bb.0: ; %entry
|
||||
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
|
||||
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6
|
||||
; GFX10-WGP-NEXT: ds_read_b32 v1, v1
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -64,10 +71,13 @@ define amdgpu_kernel void @local_volatile_load_0(
|
||||
;
|
||||
; GFX10-CU-LABEL: local_volatile_load_0:
|
||||
; GFX10-CU: ; %bb.0: ; %entry
|
||||
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
|
||||
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
|
||||
; GFX10-CU-NEXT: ds_read_b32 v1, v1
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -77,6 +87,9 @@ define amdgpu_kernel void @local_volatile_load_0(
|
||||
; SKIP-CACHE-INV-LABEL: local_volatile_load_0:
|
||||
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5]
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -97,10 +110,13 @@ define amdgpu_kernel void @local_volatile_load_0(
|
||||
;
|
||||
; GFX11-WGP-LABEL: local_volatile_load_0:
|
||||
; GFX11-WGP: ; %bb.0: ; %entry
|
||||
; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
|
||||
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
|
||||
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX11-WGP-NEXT: ds_load_b32 v1, v1
|
||||
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -109,10 +125,13 @@ define amdgpu_kernel void @local_volatile_load_0(
|
||||
;
|
||||
; GFX11-CU-LABEL: local_volatile_load_0:
|
||||
; GFX11-CU: ; %bb.0: ; %entry
|
||||
; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
|
||||
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
|
||||
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX11-CU-NEXT: ds_load_b32 v1, v1
|
||||
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -121,38 +140,50 @@ define amdgpu_kernel void @local_volatile_load_0(
|
||||
;
|
||||
; GFX12-WGP-LABEL: local_volatile_load_0:
|
||||
; GFX12-WGP: ; %bb.0: ; %entry
|
||||
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
|
||||
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
|
||||
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX12-WGP-NEXT: ds_load_b32 v1, v1
|
||||
; GFX12-WGP-NEXT: s_wait_dscnt 0x0
|
||||
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
|
||||
; GFX12-WGP-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-CU-LABEL: local_volatile_load_0:
|
||||
; GFX12-CU: ; %bb.0: ; %entry
|
||||
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
|
||||
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
|
||||
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-CU-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX12-CU-NEXT: ds_load_b32 v1, v1
|
||||
; GFX12-CU-NEXT: s_wait_dscnt 0x0
|
||||
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
|
||||
; GFX12-CU-NEXT: s_endpgm
|
||||
;
|
||||
; GFX1250-LABEL: local_volatile_load_0:
|
||||
; GFX1250: ; %bb.0: ; %entry
|
||||
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX1250-NEXT: ds_load_b32 v1, v1
|
||||
; GFX1250-NEXT: s_wait_dscnt 0x0
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
ptr addrspace(3) %in, ptr addrspace(1) %out) {
|
||||
@ -167,6 +198,7 @@ define amdgpu_kernel void @local_volatile_load_1(
|
||||
; GFX6: ; %bb.0: ; %entry
|
||||
; GFX6-NEXT: s_mov_b64 s[0:1], s[4:5]
|
||||
; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9
|
||||
; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb
|
||||
; GFX6-NEXT: ; kill: def $sgpr2 killed $sgpr4
|
||||
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -192,27 +224,33 @@ define amdgpu_kernel void @local_volatile_load_1(
|
||||
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
|
||||
; GFX7-NEXT: s_add_i32 s12, s12, s17
|
||||
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
||||
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
|
||||
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
|
||||
; GFX7-NEXT: s_mov_b32 s7, 2
|
||||
; GFX7-NEXT: v_lshlrev_b32_e64 v0, s7, v0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_add_i32_e64 v0, s[6:7], s6, v0
|
||||
; GFX7-NEXT: s_mov_b32 m0, -1
|
||||
; GFX7-NEXT: ds_read_b32 v2, v0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-WGP-LABEL: local_volatile_load_1:
|
||||
; GFX10-WGP: ; %bb.0: ; %entry
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
|
||||
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s6
|
||||
; GFX10-WGP-NEXT: ds_read_b32 v1, v1
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -222,10 +260,13 @@ define amdgpu_kernel void @local_volatile_load_1(
|
||||
; GFX10-CU-LABEL: local_volatile_load_1:
|
||||
; GFX10-CU: ; %bb.0: ; %entry
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
|
||||
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s6
|
||||
; GFX10-CU-NEXT: ds_read_b32 v1, v1
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -235,6 +276,9 @@ define amdgpu_kernel void @local_volatile_load_1(
|
||||
; SKIP-CACHE-INV-LABEL: local_volatile_load_1:
|
||||
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5]
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -258,12 +302,15 @@ define amdgpu_kernel void @local_volatile_load_1(
|
||||
; GFX11-WGP-LABEL: local_volatile_load_1:
|
||||
; GFX11-WGP: ; %bb.0: ; %entry
|
||||
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
|
||||
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
|
||||
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX11-WGP-NEXT: s_mov_b32 s3, 0x3ff
|
||||
; GFX11-WGP-NEXT: v_and_b32_e64 v1, v1, s3
|
||||
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s2
|
||||
; GFX11-WGP-NEXT: ds_load_b32 v1, v1
|
||||
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -273,12 +320,15 @@ define amdgpu_kernel void @local_volatile_load_1(
|
||||
; GFX11-CU-LABEL: local_volatile_load_1:
|
||||
; GFX11-CU: ; %bb.0: ; %entry
|
||||
; GFX11-CU-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
|
||||
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
|
||||
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX11-CU-NEXT: s_mov_b32 s3, 0x3ff
|
||||
; GFX11-CU-NEXT: v_and_b32_e64 v1, v1, s3
|
||||
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s2
|
||||
; GFX11-CU-NEXT: ds_load_b32 v1, v1
|
||||
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -288,30 +338,38 @@ define amdgpu_kernel void @local_volatile_load_1(
|
||||
; GFX12-WGP-LABEL: local_volatile_load_1:
|
||||
; GFX12-WGP: ; %bb.0: ; %entry
|
||||
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
|
||||
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
|
||||
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX12-WGP-NEXT: s_mov_b32 s3, 0x3ff
|
||||
; GFX12-WGP-NEXT: v_and_b32_e64 v1, v1, s3
|
||||
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s2
|
||||
; GFX12-WGP-NEXT: ds_load_b32 v1, v1
|
||||
; GFX12-WGP-NEXT: s_wait_dscnt 0x0
|
||||
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
|
||||
; GFX12-WGP-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-CU-LABEL: local_volatile_load_1:
|
||||
; GFX12-CU: ; %bb.0: ; %entry
|
||||
; GFX12-CU-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
|
||||
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
|
||||
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX12-CU-NEXT: s_mov_b32 s3, 0x3ff
|
||||
; GFX12-CU-NEXT: v_and_b32_e64 v1, v1, s3
|
||||
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s2
|
||||
; GFX12-CU-NEXT: ds_load_b32 v1, v1
|
||||
; GFX12-CU-NEXT: s_wait_dscnt 0x0
|
||||
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
|
||||
; GFX12-CU-NEXT: s_endpgm
|
||||
;
|
||||
@ -319,15 +377,19 @@ define amdgpu_kernel void @local_volatile_load_1(
|
||||
; GFX1250: ; %bb.0: ; %entry
|
||||
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX1250-NEXT: s_mov_b32 s3, 0x3ff
|
||||
; GFX1250-NEXT: v_and_b32_e64 v1, v1, s3
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: v_lshl_add_u32 v1, v1, 2, s2
|
||||
; GFX1250-NEXT: ds_load_b32 v1, v1
|
||||
; GFX1250-NEXT: s_wait_dscnt 0x0
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
ptr addrspace(3) %in, ptr addrspace(1) %out) {
|
||||
@ -342,6 +404,8 @@ entry:
|
||||
define amdgpu_kernel void @local_volatile_store_0(
|
||||
; GFX6-LABEL: local_volatile_store_0:
|
||||
; GFX6: ; %bb.0: ; %entry
|
||||
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: s_load_dword s1, s[4:5], 0xb
|
||||
; GFX6-NEXT: ; kill: def $sgpr0 killed $sgpr1
|
||||
; GFX6-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9
|
||||
@ -356,6 +420,9 @@ define amdgpu_kernel void @local_volatile_store_0(
|
||||
;
|
||||
; GFX7-LABEL: local_volatile_store_0:
|
||||
; GFX7: ; %bb.0: ; %entry
|
||||
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
|
||||
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
|
||||
; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -369,6 +436,9 @@ define amdgpu_kernel void @local_volatile_store_0(
|
||||
;
|
||||
; GFX10-WGP-LABEL: local_volatile_store_0:
|
||||
; GFX10-WGP: ; %bb.0: ; %entry
|
||||
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
|
||||
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
|
||||
; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -381,6 +451,9 @@ define amdgpu_kernel void @local_volatile_store_0(
|
||||
;
|
||||
; GFX10-CU-LABEL: local_volatile_store_0:
|
||||
; GFX10-CU: ; %bb.0: ; %entry
|
||||
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
|
||||
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
|
||||
; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -393,6 +466,9 @@ define amdgpu_kernel void @local_volatile_store_0(
|
||||
;
|
||||
; SKIP-CACHE-INV-LABEL: local_volatile_store_0:
|
||||
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -406,6 +482,9 @@ define amdgpu_kernel void @local_volatile_store_0(
|
||||
;
|
||||
; GFX11-WGP-LABEL: local_volatile_store_0:
|
||||
; GFX11-WGP: ; %bb.0: ; %entry
|
||||
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
|
||||
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
|
||||
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
|
||||
; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
|
||||
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -418,6 +497,9 @@ define amdgpu_kernel void @local_volatile_store_0(
|
||||
;
|
||||
; GFX11-CU-LABEL: local_volatile_store_0:
|
||||
; GFX11-CU: ; %bb.0: ; %entry
|
||||
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
|
||||
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
|
||||
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
|
||||
; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
|
||||
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -430,6 +512,9 @@ define amdgpu_kernel void @local_volatile_store_0(
|
||||
;
|
||||
; GFX12-WGP-LABEL: local_volatile_store_0:
|
||||
; GFX12-WGP: ; %bb.0: ; %entry
|
||||
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
|
||||
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
|
||||
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
|
||||
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
|
||||
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
|
||||
@ -442,6 +527,9 @@ define amdgpu_kernel void @local_volatile_store_0(
|
||||
;
|
||||
; GFX12-CU-LABEL: local_volatile_store_0:
|
||||
; GFX12-CU: ; %bb.0: ; %entry
|
||||
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
|
||||
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
|
||||
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
|
||||
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
|
||||
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
|
||||
@ -455,6 +543,9 @@ define amdgpu_kernel void @local_volatile_store_0(
|
||||
; GFX1250-LABEL: local_volatile_store_0:
|
||||
; GFX1250: ; %bb.0: ; %entry
|
||||
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
@ -474,6 +565,8 @@ entry:
|
||||
define amdgpu_kernel void @local_volatile_store_1(
|
||||
; GFX6-LABEL: local_volatile_store_1:
|
||||
; GFX6: ; %bb.0: ; %entry
|
||||
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: s_load_dword s1, s[4:5], 0xb
|
||||
; GFX6-NEXT: ; kill: def $sgpr0 killed $sgpr1
|
||||
; GFX6-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9
|
||||
@ -490,6 +583,9 @@ define amdgpu_kernel void @local_volatile_store_1(
|
||||
;
|
||||
; GFX7-LABEL: local_volatile_store_1:
|
||||
; GFX7: ; %bb.0: ; %entry
|
||||
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
|
||||
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
|
||||
; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -505,6 +601,9 @@ define amdgpu_kernel void @local_volatile_store_1(
|
||||
;
|
||||
; GFX10-WGP-LABEL: local_volatile_store_1:
|
||||
; GFX10-WGP: ; %bb.0: ; %entry
|
||||
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
|
||||
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
|
||||
; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -517,6 +616,9 @@ define amdgpu_kernel void @local_volatile_store_1(
|
||||
;
|
||||
; GFX10-CU-LABEL: local_volatile_store_1:
|
||||
; GFX10-CU: ; %bb.0: ; %entry
|
||||
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
|
||||
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
|
||||
; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -529,6 +631,9 @@ define amdgpu_kernel void @local_volatile_store_1(
|
||||
;
|
||||
; SKIP-CACHE-INV-LABEL: local_volatile_store_1:
|
||||
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -544,6 +649,9 @@ define amdgpu_kernel void @local_volatile_store_1(
|
||||
;
|
||||
; GFX11-WGP-LABEL: local_volatile_store_1:
|
||||
; GFX11-WGP: ; %bb.0: ; %entry
|
||||
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
|
||||
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
|
||||
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
|
||||
; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
|
||||
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -558,6 +666,9 @@ define amdgpu_kernel void @local_volatile_store_1(
|
||||
;
|
||||
; GFX11-CU-LABEL: local_volatile_store_1:
|
||||
; GFX11-CU: ; %bb.0: ; %entry
|
||||
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
|
||||
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
|
||||
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
|
||||
; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
|
||||
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -572,6 +683,9 @@ define amdgpu_kernel void @local_volatile_store_1(
|
||||
;
|
||||
; GFX12-WGP-LABEL: local_volatile_store_1:
|
||||
; GFX12-WGP: ; %bb.0: ; %entry
|
||||
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
|
||||
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
|
||||
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
|
||||
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
|
||||
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
|
||||
@ -586,6 +700,9 @@ define amdgpu_kernel void @local_volatile_store_1(
|
||||
;
|
||||
; GFX12-CU-LABEL: local_volatile_store_1:
|
||||
; GFX12-CU: ; %bb.0: ; %entry
|
||||
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
|
||||
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
|
||||
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
|
||||
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
|
||||
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
|
||||
@ -601,6 +718,9 @@ define amdgpu_kernel void @local_volatile_store_1(
|
||||
; GFX1250-LABEL: local_volatile_store_1:
|
||||
; GFX1250: ; %bb.0: ; %entry
|
||||
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
@ -641,10 +761,13 @@ define amdgpu_kernel void @local_volatile_workgroup_acquire_load(
|
||||
;
|
||||
; GFX7-LABEL: local_volatile_workgroup_acquire_load:
|
||||
; GFX7: ; %bb.0: ; %entry
|
||||
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
|
||||
; GFX7-NEXT: s_load_dword s5, s[8:9], 0x0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
|
||||
; GFX7-NEXT: s_mov_b32 m0, -1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s5
|
||||
; GFX7-NEXT: ds_read_b32 v1, v0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -655,9 +778,12 @@ define amdgpu_kernel void @local_volatile_workgroup_acquire_load(
|
||||
;
|
||||
; GFX10-WGP-LABEL: local_volatile_workgroup_acquire_load:
|
||||
; GFX10-WGP: ; %bb.0: ; %entry
|
||||
; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
|
||||
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
|
||||
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
|
||||
; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
|
||||
; GFX10-WGP-NEXT: ds_read_b32 v1, v0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -668,9 +794,12 @@ define amdgpu_kernel void @local_volatile_workgroup_acquire_load(
|
||||
;
|
||||
; GFX10-CU-LABEL: local_volatile_workgroup_acquire_load:
|
||||
; GFX10-CU: ; %bb.0: ; %entry
|
||||
; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
|
||||
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
|
||||
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
|
||||
; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
|
||||
; GFX10-CU-NEXT: ds_read_b32 v1, v0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -680,10 +809,13 @@ define amdgpu_kernel void @local_volatile_workgroup_acquire_load(
|
||||
;
|
||||
; SKIP-CACHE-INV-LABEL: local_volatile_workgroup_acquire_load:
|
||||
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x0
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
|
||||
; SKIP-CACHE-INV-NEXT: ds_read_b32 v1, v0
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -694,9 +826,12 @@ define amdgpu_kernel void @local_volatile_workgroup_acquire_load(
|
||||
;
|
||||
; GFX11-WGP-LABEL: local_volatile_workgroup_acquire_load:
|
||||
; GFX11-WGP: ; %bb.0: ; %entry
|
||||
; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
|
||||
; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
|
||||
; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
|
||||
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
|
||||
; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
|
||||
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
|
||||
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX11-WGP-NEXT: ds_load_b32 v1, v0
|
||||
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -707,9 +842,12 @@ define amdgpu_kernel void @local_volatile_workgroup_acquire_load(
|
||||
;
|
||||
; GFX11-CU-LABEL: local_volatile_workgroup_acquire_load:
|
||||
; GFX11-CU: ; %bb.0: ; %entry
|
||||
; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
|
||||
; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
|
||||
; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
|
||||
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
|
||||
; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
|
||||
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
|
||||
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX11-CU-NEXT: ds_load_b32 v1, v0
|
||||
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -719,25 +857,33 @@ define amdgpu_kernel void @local_volatile_workgroup_acquire_load(
|
||||
;
|
||||
; GFX12-WGP-LABEL: local_volatile_workgroup_acquire_load:
|
||||
; GFX12-WGP: ; %bb.0: ; %entry
|
||||
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
|
||||
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
|
||||
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
|
||||
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
|
||||
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x0
|
||||
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
|
||||
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX12-WGP-NEXT: ds_load_b32 v1, v0
|
||||
; GFX12-WGP-NEXT: s_wait_dscnt 0x0
|
||||
; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
|
||||
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX12-WGP-NEXT: ds_store_b32 v0, v1
|
||||
; GFX12-WGP-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-CU-LABEL: local_volatile_workgroup_acquire_load:
|
||||
; GFX12-CU: ; %bb.0: ; %entry
|
||||
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
|
||||
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
|
||||
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
|
||||
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
|
||||
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x0
|
||||
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
|
||||
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX12-CU-NEXT: ds_load_b32 v1, v0
|
||||
; GFX12-CU-NEXT: s_wait_dscnt 0x0
|
||||
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX12-CU-NEXT: ds_store_b32 v0, v1
|
||||
; GFX12-CU-NEXT: s_endpgm
|
||||
@ -745,12 +891,16 @@ define amdgpu_kernel void @local_volatile_workgroup_acquire_load(
|
||||
; GFX1250-LABEL: local_volatile_workgroup_acquire_load:
|
||||
; GFX1250: ; %bb.0: ; %entry
|
||||
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv
|
||||
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv
|
||||
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX1250-NEXT: ds_load_b32 v1, v0
|
||||
; GFX1250-NEXT: s_wait_dscnt 0x0
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX1250-NEXT: ds_store_b32 v0, v1
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
@ -764,12 +914,14 @@ entry:
|
||||
define amdgpu_kernel void @local_volatile_workgroup_release_store(
|
||||
; GFX6-LABEL: local_volatile_workgroup_release_store:
|
||||
; GFX6: ; %bb.0: ; %entry
|
||||
; GFX6-NEXT: s_load_dword s0, s[4:5], 0x9
|
||||
; GFX6-NEXT: s_load_dword s1, s[4:5], 0xa
|
||||
; GFX6-NEXT: ; kill: def $sgpr0 killed $sgpr1
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: s_load_dword s0, s[4:5], 0x9
|
||||
; GFX6-NEXT: s_mov_b32 m0, -1
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: ds_write_b32 v0, v1
|
||||
@ -778,6 +930,10 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store(
|
||||
; GFX7-LABEL: local_volatile_workgroup_release_store:
|
||||
; GFX7: ; %bb.0: ; %entry
|
||||
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
|
||||
; GFX7-NEXT: s_load_dword s5, s[8:9], 0x1
|
||||
; GFX7-NEXT: s_mov_b32 m0, -1
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -790,6 +946,10 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store(
|
||||
; GFX10-WGP-LABEL: local_volatile_workgroup_release_store:
|
||||
; GFX10-WGP: ; %bb.0: ; %entry
|
||||
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x4
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
|
||||
; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x4
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
|
||||
@ -802,6 +962,10 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store(
|
||||
; GFX10-CU-LABEL: local_volatile_workgroup_release_store:
|
||||
; GFX10-CU: ; %bb.0: ; %entry
|
||||
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x4
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
|
||||
; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x4
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
|
||||
@ -814,6 +978,10 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store(
|
||||
; SKIP-CACHE-INV-LABEL: local_volatile_workgroup_release_store:
|
||||
; SKIP-CACHE-INV: ; %bb.0: ; %entry
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x1
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x1
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -826,6 +994,10 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store(
|
||||
; GFX11-WGP-LABEL: local_volatile_workgroup_release_store:
|
||||
; GFX11-WGP: ; %bb.0: ; %entry
|
||||
; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
|
||||
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
|
||||
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
|
||||
; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
|
||||
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
|
||||
@ -838,6 +1010,10 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store(
|
||||
; GFX11-CU-LABEL: local_volatile_workgroup_release_store:
|
||||
; GFX11-CU: ; %bb.0: ; %entry
|
||||
; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
|
||||
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
|
||||
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
|
||||
; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
|
||||
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
|
||||
@ -850,6 +1026,10 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store(
|
||||
; GFX12-WGP-LABEL: local_volatile_workgroup_release_store:
|
||||
; GFX12-WGP: ; %bb.0: ; %entry
|
||||
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
|
||||
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x4
|
||||
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
|
||||
; GFX12-WGP-NEXT: s_load_b32 s1, s[4:5], 0x4
|
||||
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
|
||||
@ -864,6 +1044,10 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store(
|
||||
; GFX12-CU-LABEL: local_volatile_workgroup_release_store:
|
||||
; GFX12-CU: ; %bb.0: ; %entry
|
||||
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
|
||||
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x4
|
||||
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
|
||||
; GFX12-CU-NEXT: s_load_b32 s1, s[4:5], 0x4
|
||||
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
|
||||
@ -879,6 +1063,10 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store(
|
||||
; GFX1250: ; %bb.0: ; %entry
|
||||
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x4 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x4 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v0, s1
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -6,24 +6,32 @@
|
||||
define amdgpu_kernel void @private_last_use_load_0(ptr addrspace(5) %in, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: private_last_use_load_0:
|
||||
; GFX12: ; %bb.0: ; %entry
|
||||
; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x0
|
||||
; GFX12-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX12-NEXT: s_load_b32 s2, s[4:5], 0x0
|
||||
; GFX12-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX12-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX12-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-NEXT: scratch_load_b32 v1, off, s2 th:TH_LOAD_LU
|
||||
; GFX12-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
|
||||
; GFX12-NEXT: s_endpgm
|
||||
;
|
||||
; GFX1250-LABEL: private_last_use_load_0:
|
||||
; GFX1250: ; %bb.0: ; %entry
|
||||
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: scratch_load_b32 v1, off, s2 th:TH_LOAD_LU
|
||||
; GFX1250-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
entry:
|
||||
@ -36,7 +44,11 @@ define amdgpu_kernel void @private_last_use_load_1(ptr addrspace(5) %in, ptr add
|
||||
; GFX12-LABEL: private_last_use_load_1:
|
||||
; GFX12: ; %bb.0: ; %entry
|
||||
; GFX12-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x0
|
||||
; GFX12-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX12-NEXT: s_load_b32 s2, s[4:5], 0x0
|
||||
; GFX12-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX12-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX12-NEXT: s_mov_b32 s3, 0x3ff
|
||||
@ -44,9 +56,9 @@ define amdgpu_kernel void @private_last_use_load_1(ptr addrspace(5) %in, ptr add
|
||||
; GFX12-NEXT: s_mov_b32 s3, 2
|
||||
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GFX12-NEXT: v_lshlrev_b32_e64 v1, s3, v1
|
||||
; GFX12-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-NEXT: scratch_load_b32 v1, v1, s2 th:TH_LOAD_LU
|
||||
; GFX12-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
|
||||
; GFX12-NEXT: s_endpgm
|
||||
;
|
||||
@ -54,14 +66,18 @@ define amdgpu_kernel void @private_last_use_load_1(ptr addrspace(5) %in, ptr add
|
||||
; GFX1250: ; %bb.0: ; %entry
|
||||
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX1250-NEXT: s_mov_b32 s3, 0x3ff
|
||||
; GFX1250-NEXT: v_and_b32_e64 v1, v1, s3
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: scratch_load_b32 v1, v1, s2 scale_offset th:TH_LOAD_LU
|
||||
; GFX1250-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
entry:
|
||||
@ -75,27 +91,35 @@ entry:
|
||||
define amdgpu_kernel void @private_last_use_and_volatile_load(ptr addrspace(5) %in, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: private_last_use_and_volatile_load:
|
||||
; GFX12: ; %bb.0: ; %entry
|
||||
; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x0
|
||||
; GFX12-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX12-NEXT: s_load_b32 s2, s[4:5], 0x0
|
||||
; GFX12-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX12-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX12-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-NEXT: scratch_load_b32 v1, off, s2 th:TH_LOAD_BYPASS scope:SCOPE_SYS
|
||||
; GFX12-NEXT: s_wait_bvhcnt 0x0
|
||||
; GFX12-NEXT: s_wait_samplecnt 0x0
|
||||
; GFX12-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
|
||||
; GFX12-NEXT: s_endpgm
|
||||
;
|
||||
; GFX1250-LABEL: private_last_use_and_volatile_load:
|
||||
; GFX1250: ; %bb.0: ; %entry
|
||||
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX1250-NEXT: s_wait_xcnt 0x0
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: scratch_load_b32 v1, off, s2 th:TH_LOAD_BYPASS scope:SCOPE_SYS
|
||||
; GFX1250-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
entry:
|
||||
@ -107,24 +131,32 @@ entry:
|
||||
define amdgpu_kernel void @private_last_use_and_nontemporal_load(ptr addrspace(5) %in, ptr addrspace(1) %out) {
|
||||
; GFX12-LABEL: private_last_use_and_nontemporal_load:
|
||||
; GFX12: ; %bb.0: ; %entry
|
||||
; GFX12-NEXT: s_load_b32 s0, s[4:5], 0x0
|
||||
; GFX12-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX12-NEXT: s_load_b32 s2, s[4:5], 0x0
|
||||
; GFX12-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX12-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX12-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-NEXT: scratch_load_b32 v1, off, s2 th:TH_LOAD_LU
|
||||
; GFX12-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
|
||||
; GFX12-NEXT: s_endpgm
|
||||
;
|
||||
; GFX1250-LABEL: private_last_use_and_nontemporal_load:
|
||||
; GFX1250: ; %bb.0: ; %entry
|
||||
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: scratch_load_b32 v1, off, s2 th:TH_LOAD_LU
|
||||
; GFX1250-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
entry:
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -20,6 +20,9 @@ define amdgpu_kernel void @private_volatile_load_0(
|
||||
; GFX6-NEXT: s_add_u32 s12, s12, s11
|
||||
; GFX6-NEXT: s_addc_u32 s13, s13, 0
|
||||
; GFX6-NEXT: s_mov_b64 s[0:1], s[4:5]
|
||||
; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb
|
||||
; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9
|
||||
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -44,12 +47,15 @@ define amdgpu_kernel void @private_volatile_load_0(
|
||||
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
||||
; GFX7-NEXT: s_add_u32 s0, s0, s17
|
||||
; GFX7-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0
|
||||
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
|
||||
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
|
||||
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s6
|
||||
; GFX7-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen glc
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
||||
@ -59,13 +65,16 @@ define amdgpu_kernel void @private_volatile_load_0(
|
||||
; GFX10-WGP: ; %bb.0: ; %entry
|
||||
; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
|
||||
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
|
||||
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s6
|
||||
; GFX10-WGP-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen glc dlc
|
||||
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
@ -73,13 +82,16 @@ define amdgpu_kernel void @private_volatile_load_0(
|
||||
; GFX10-CU: ; %bb.0: ; %entry
|
||||
; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
|
||||
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
|
||||
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, s6
|
||||
; GFX10-CU-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen glc dlc
|
||||
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
@ -92,6 +104,9 @@ define amdgpu_kernel void @private_volatile_load_0(
|
||||
; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
|
||||
; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5]
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -111,62 +126,80 @@ define amdgpu_kernel void @private_volatile_load_0(
|
||||
;
|
||||
; GFX11-WGP-LABEL: private_volatile_load_0:
|
||||
; GFX11-WGP: ; %bb.0: ; %entry
|
||||
; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
|
||||
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
|
||||
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-WGP-NEXT: scratch_load_b32 v1, off, s2 glc dlc
|
||||
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
|
||||
; GFX11-WGP-NEXT: s_endpgm
|
||||
;
|
||||
; GFX11-CU-LABEL: private_volatile_load_0:
|
||||
; GFX11-CU: ; %bb.0: ; %entry
|
||||
; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
|
||||
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
|
||||
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-CU-NEXT: scratch_load_b32 v1, off, s2 glc dlc
|
||||
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
|
||||
; GFX11-CU-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-WGP-LABEL: private_volatile_load_0:
|
||||
; GFX12-WGP: ; %bb.0: ; %entry
|
||||
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
|
||||
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
|
||||
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-WGP-NEXT: scratch_load_b32 v1, off, s2 scope:SCOPE_SYS
|
||||
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
|
||||
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
|
||||
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
|
||||
; GFX12-WGP-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-CU-LABEL: private_volatile_load_0:
|
||||
; GFX12-CU: ; %bb.0: ; %entry
|
||||
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
|
||||
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
|
||||
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-CU-NEXT: scratch_load_b32 v1, off, s2 scope:SCOPE_SYS
|
||||
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
|
||||
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
|
||||
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
|
||||
; GFX12-CU-NEXT: s_endpgm
|
||||
;
|
||||
; GFX1250-LABEL: private_volatile_load_0:
|
||||
; GFX1250: ; %bb.0: ; %entry
|
||||
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX1250-NEXT: s_wait_xcnt 0x0
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: scratch_load_b32 v1, off, s2 scope:SCOPE_SYS
|
||||
; GFX1250-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
ptr addrspace(5) %in, ptr addrspace(1) %out) {
|
||||
@ -186,6 +219,9 @@ define amdgpu_kernel void @private_volatile_load_1(
|
||||
; GFX6-NEXT: s_add_u32 s12, s12, s11
|
||||
; GFX6-NEXT: s_addc_u32 s13, s13, 0
|
||||
; GFX6-NEXT: s_mov_b64 s[0:1], s[4:5]
|
||||
; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb
|
||||
; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9
|
||||
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -212,14 +248,17 @@ define amdgpu_kernel void @private_volatile_load_1(
|
||||
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
||||
; GFX7-NEXT: s_add_u32 s0, s0, s17
|
||||
; GFX7-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
|
||||
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
|
||||
; GFX7-NEXT: s_mov_b32 s7, 2
|
||||
; GFX7-NEXT: v_lshlrev_b32_e64 v0, s7, v0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_add_i32_e64 v0, s[6:7], s6, v0
|
||||
; GFX7-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen glc
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX7-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GFX7-NEXT: flat_store_dword v[0:1], v2
|
||||
@ -230,13 +269,16 @@ define amdgpu_kernel void @private_volatile_load_1(
|
||||
; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
|
||||
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
|
||||
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s6
|
||||
; GFX10-WGP-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen glc dlc
|
||||
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: global_store_dword v0, v1, s[4:5]
|
||||
; GFX10-WGP-NEXT: s_endpgm
|
||||
;
|
||||
@ -245,13 +287,16 @@ define amdgpu_kernel void @private_volatile_load_1(
|
||||
; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
|
||||
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
|
||||
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s6
|
||||
; GFX10-CU-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen glc dlc
|
||||
; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: global_store_dword v0, v1, s[4:5]
|
||||
; GFX10-CU-NEXT: s_endpgm
|
||||
;
|
||||
@ -264,6 +309,9 @@ define amdgpu_kernel void @private_volatile_load_1(
|
||||
; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
|
||||
; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
|
||||
; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5]
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x0
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -286,37 +334,47 @@ define amdgpu_kernel void @private_volatile_load_1(
|
||||
; GFX11-WGP-LABEL: private_volatile_load_1:
|
||||
; GFX11-WGP: ; %bb.0: ; %entry
|
||||
; GFX11-WGP-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
|
||||
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
|
||||
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX11-WGP-NEXT: s_mov_b32 s3, 0x3ff
|
||||
; GFX11-WGP-NEXT: v_and_b32_e64 v1, v1, s3
|
||||
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-WGP-NEXT: v_lshl_add_u32 v1, v1, 2, s2
|
||||
; GFX11-WGP-NEXT: scratch_load_b32 v1, v1, off glc dlc
|
||||
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
|
||||
; GFX11-WGP-NEXT: s_endpgm
|
||||
;
|
||||
; GFX11-CU-LABEL: private_volatile_load_1:
|
||||
; GFX11-CU: ; %bb.0: ; %entry
|
||||
; GFX11-CU-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
|
||||
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
|
||||
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX11-CU-NEXT: s_mov_b32 s3, 0x3ff
|
||||
; GFX11-CU-NEXT: v_and_b32_e64 v1, v1, s3
|
||||
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-CU-NEXT: v_lshl_add_u32 v1, v1, 2, s2
|
||||
; GFX11-CU-NEXT: scratch_load_b32 v1, v1, off glc dlc
|
||||
; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
|
||||
; GFX11-CU-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-WGP-LABEL: private_volatile_load_1:
|
||||
; GFX12-WGP: ; %bb.0: ; %entry
|
||||
; GFX12-WGP-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
|
||||
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x0
|
||||
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX12-WGP-NEXT: s_mov_b32 s3, 0x3ff
|
||||
@ -324,18 +382,22 @@ define amdgpu_kernel void @private_volatile_load_1(
|
||||
; GFX12-WGP-NEXT: s_mov_b32 s3, 2
|
||||
; GFX12-WGP-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GFX12-WGP-NEXT: v_lshlrev_b32_e64 v1, s3, v1
|
||||
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-WGP-NEXT: scratch_load_b32 v1, v1, s2 scope:SCOPE_SYS
|
||||
; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
|
||||
; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
|
||||
; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
|
||||
; GFX12-WGP-NEXT: s_endpgm
|
||||
;
|
||||
; GFX12-CU-LABEL: private_volatile_load_1:
|
||||
; GFX12-CU: ; %bb.0: ; %entry
|
||||
; GFX12-CU-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
|
||||
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x0
|
||||
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; GFX12-CU-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX12-CU-NEXT: s_mov_b32 s3, 0x3ff
|
||||
@ -343,11 +405,11 @@ define amdgpu_kernel void @private_volatile_load_1(
|
||||
; GFX12-CU-NEXT: s_mov_b32 s3, 2
|
||||
; GFX12-CU-NEXT: s_wait_alu depctr_sa_sdst(0)
|
||||
; GFX12-CU-NEXT: v_lshlrev_b32_e64 v1, s3, v1
|
||||
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-CU-NEXT: scratch_load_b32 v1, v1, s2 scope:SCOPE_SYS
|
||||
; GFX12-CU-NEXT: s_wait_bvhcnt 0x0
|
||||
; GFX12-CU-NEXT: s_wait_samplecnt 0x0
|
||||
; GFX12-CU-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-CU-NEXT: global_store_b32 v0, v1, s[0:1]
|
||||
; GFX12-CU-NEXT: s_endpgm
|
||||
;
|
||||
@ -355,15 +417,19 @@ define amdgpu_kernel void @private_volatile_load_1(
|
||||
; GFX1250: ; %bb.0: ; %entry
|
||||
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v1, v0
|
||||
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX1250-NEXT: s_mov_b32 s3, 0x3ff
|
||||
; GFX1250-NEXT: v_and_b32_e64 v1, v1, s3
|
||||
; GFX1250-NEXT: s_wait_xcnt 0x0
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: scratch_load_b32 v1, v1, s2 scale_offset scope:SCOPE_SYS
|
||||
; GFX1250-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
ptr addrspace(5) %in, ptr addrspace(1) %out) {
|
||||
@ -384,9 +450,12 @@ define amdgpu_kernel void @private_volatile_store_0(
|
||||
; GFX6-NEXT: s_mov_b32 s15, 0xe8f000
|
||||
; GFX6-NEXT: s_add_u32 s12, s12, s11
|
||||
; GFX6-NEXT: s_addc_u32 s13, s13, 0
|
||||
; GFX6-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9
|
||||
; GFX6-NEXT: s_load_dword s0, s[4:5], 0xb
|
||||
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: s_load_dword s0, s[4:5], 0xb
|
||||
; GFX6-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: s_load_dword s0, s[4:5], 0xb
|
||||
; GFX6-NEXT: s_load_dword s1, s[2:3], 0x0
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s1
|
||||
@ -399,9 +468,12 @@ define amdgpu_kernel void @private_volatile_store_0(
|
||||
; GFX7: ; %bb.0: ; %entry
|
||||
; GFX7-NEXT: s_add_u32 s0, s0, s17
|
||||
; GFX7-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
|
||||
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
|
||||
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
|
||||
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
|
||||
; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: v_mov_b32_e32 v0, s5
|
||||
@ -414,9 +486,12 @@ define amdgpu_kernel void @private_volatile_store_0(
|
||||
; GFX10-WGP: ; %bb.0: ; %entry
|
||||
; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
|
||||
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
|
||||
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
|
||||
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
|
||||
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
|
||||
; GFX10-WGP-NEXT: s_load_dword s5, s[6:7], 0x0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s5
|
||||
@ -429,9 +504,12 @@ define amdgpu_kernel void @private_volatile_store_0(
|
||||
; GFX10-CU: ; %bb.0: ; %entry
|
||||
; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
|
||||
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
|
||||
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
|
||||
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
|
||||
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
|
||||
; GFX10-CU-NEXT: s_load_dword s5, s[6:7], 0x0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: v_mov_b32_e32 v0, s5
|
||||
@ -448,9 +526,12 @@ define amdgpu_kernel void @private_volatile_store_0(
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
|
||||
; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
|
||||
@ -461,9 +542,12 @@ define amdgpu_kernel void @private_volatile_store_0(
|
||||
;
|
||||
; GFX11-WGP-LABEL: private_volatile_store_0:
|
||||
; GFX11-WGP: ; %bb.0: ; %entry
|
||||
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
|
||||
; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
|
||||
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
|
||||
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
|
||||
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
|
||||
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
|
||||
; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0
|
||||
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
|
||||
@ -473,9 +557,12 @@ define amdgpu_kernel void @private_volatile_store_0(
|
||||
;
|
||||
; GFX11-CU-LABEL: private_volatile_store_0:
|
||||
; GFX11-CU: ; %bb.0: ; %entry
|
||||
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
|
||||
; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
|
||||
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
|
||||
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
|
||||
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
|
||||
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
|
||||
; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0
|
||||
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
|
||||
@ -485,9 +572,12 @@ define amdgpu_kernel void @private_volatile_store_0(
|
||||
;
|
||||
; GFX12-WGP-LABEL: private_volatile_store_0:
|
||||
; GFX12-WGP: ; %bb.0: ; %entry
|
||||
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
|
||||
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
|
||||
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
|
||||
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
|
||||
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
|
||||
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
|
||||
; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0
|
||||
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s1
|
||||
@ -502,9 +592,12 @@ define amdgpu_kernel void @private_volatile_store_0(
|
||||
;
|
||||
; GFX12-CU-LABEL: private_volatile_store_0:
|
||||
; GFX12-CU: ; %bb.0: ; %entry
|
||||
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
|
||||
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
|
||||
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
|
||||
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
|
||||
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
|
||||
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
|
||||
; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0
|
||||
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-CU-NEXT: v_mov_b32_e32 v0, s1
|
||||
@ -520,9 +613,12 @@ define amdgpu_kernel void @private_volatile_store_0(
|
||||
; GFX1250-LABEL: private_volatile_store_0:
|
||||
; GFX1250: ; %bb.0: ; %entry
|
||||
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: s_load_b32 s1, s[2:3], 0x0
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v0, s1
|
||||
@ -546,6 +642,9 @@ define amdgpu_kernel void @private_volatile_store_1(
|
||||
; GFX6-NEXT: s_mov_b32 s15, 0xe8f000
|
||||
; GFX6-NEXT: s_add_u32 s12, s12, s11
|
||||
; GFX6-NEXT: s_addc_u32 s13, s13, 0
|
||||
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: s_load_dword s0, s[4:5], 0xb
|
||||
; GFX6-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x9
|
||||
; GFX6-NEXT: s_load_dword s1, s[4:5], 0xb
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -563,6 +662,9 @@ define amdgpu_kernel void @private_volatile_store_1(
|
||||
; GFX7: ; %bb.0: ; %entry
|
||||
; GFX7-NEXT: s_add_u32 s0, s0, s17
|
||||
; GFX7-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
|
||||
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
|
||||
; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2
|
||||
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -580,6 +682,9 @@ define amdgpu_kernel void @private_volatile_store_1(
|
||||
; GFX10-WGP: ; %bb.0: ; %entry
|
||||
; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
|
||||
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
|
||||
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
|
||||
; GFX10-WGP-NEXT: s_load_dword s5, s[8:9], 0x8
|
||||
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -595,6 +700,9 @@ define amdgpu_kernel void @private_volatile_store_1(
|
||||
; GFX10-CU: ; %bb.0: ; %entry
|
||||
; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
|
||||
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
|
||||
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
|
||||
; GFX10-CU-NEXT: s_load_dword s5, s[8:9], 0x8
|
||||
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -614,6 +722,9 @@ define amdgpu_kernel void @private_volatile_store_1(
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_add_u32 s12, s12, s11
|
||||
; SKIP-CACHE-INV-NEXT: s_addc_u32 s13, s13, 0
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
|
||||
; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[4:5], 0x2
|
||||
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -629,6 +740,9 @@ define amdgpu_kernel void @private_volatile_store_1(
|
||||
;
|
||||
; GFX11-WGP-LABEL: private_volatile_store_1:
|
||||
; GFX11-WGP: ; %bb.0: ; %entry
|
||||
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
|
||||
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
|
||||
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
|
||||
; GFX11-WGP-NEXT: s_load_b32 s1, s[4:5], 0x8
|
||||
; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -644,6 +758,9 @@ define amdgpu_kernel void @private_volatile_store_1(
|
||||
;
|
||||
; GFX11-CU-LABEL: private_volatile_store_1:
|
||||
; GFX11-CU: ; %bb.0: ; %entry
|
||||
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
|
||||
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
|
||||
; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
|
||||
; GFX11-CU-NEXT: s_load_b32 s1, s[4:5], 0x8
|
||||
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -659,9 +776,12 @@ define amdgpu_kernel void @private_volatile_store_1(
|
||||
;
|
||||
; GFX12-WGP-LABEL: private_volatile_store_1:
|
||||
; GFX12-WGP: ; %bb.0: ; %entry
|
||||
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
|
||||
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
|
||||
; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
|
||||
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
|
||||
; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
|
||||
; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
|
||||
; GFX12-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0
|
||||
; GFX12-WGP-NEXT: s_mov_b32 s2, 0x3ff
|
||||
; GFX12-WGP-NEXT: v_and_b32_e64 v0, v0, s2
|
||||
@ -681,9 +801,12 @@ define amdgpu_kernel void @private_volatile_store_1(
|
||||
;
|
||||
; GFX12-CU-LABEL: private_volatile_store_1:
|
||||
; GFX12-CU: ; %bb.0: ; %entry
|
||||
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
|
||||
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
|
||||
; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
|
||||
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
|
||||
; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
|
||||
; GFX12-CU-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
|
||||
; GFX12-CU-NEXT: s_load_b32 s1, s[2:3], 0x0
|
||||
; GFX12-CU-NEXT: s_mov_b32 s2, 0x3ff
|
||||
; GFX12-CU-NEXT: v_and_b32_e64 v0, v0, s2
|
||||
@ -704,9 +827,12 @@ define amdgpu_kernel void @private_volatile_store_1(
|
||||
; GFX1250-LABEL: private_volatile_store_1:
|
||||
; GFX1250: ; %bb.0: ; %entry
|
||||
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: s_load_b32 s1, s[2:3], 0x0
|
||||
; GFX1250-NEXT: s_wait_xcnt 0x0
|
||||
; GFX1250-NEXT: s_mov_b32 s2, 0x3ff
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -17,6 +17,10 @@ define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(ptr addrspace(1) %out,
|
||||
; GCN-NEXT: s_mov_b32 s95, 0xe8f000
|
||||
; GCN-NEXT: s_add_u32 s92, s92, s11
|
||||
; GCN-NEXT: s_addc_u32 s93, s93, 0
|
||||
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_load_dword s0, s[4:5], 0xb
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_load_dword s0, s[4:5], 0xb
|
||||
; GCN-NEXT: ;;#ASMSTART
|
||||
; GCN-NEXT: ; def s[4:11]
|
||||
@ -219,8 +223,12 @@ define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(ptr addrspace(1) %out,
|
||||
; GCN-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; GCN-NEXT: s_mov_b32 s1, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_cmp_lg_u32 s0, s1
|
||||
; GCN-NEXT: s_cbranch_scc1 .LBB0_2
|
||||
; GCN-NEXT: s_cmp_eq_u32 s0, s1
|
||||
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; GCN-NEXT: s_mov_b64 s[2:3], -1
|
||||
; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GCN-NEXT: s_and_b64 vcc, exec, s[0:1]
|
||||
; GCN-NEXT: s_cbranch_vccnz .LBB0_2
|
||||
; GCN-NEXT: ; %bb.1: ; %bb0
|
||||
; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1
|
||||
; GCN-NEXT: buffer_load_dword v0, off, s[92:95], 0 offset:4 ; 4-byte Folded Reload
|
||||
@ -478,6 +486,10 @@ define amdgpu_kernel void @split_sgpr_spill_2_vgprs(ptr addrspace(1) %out, i32 %
|
||||
; GCN-NEXT: s_mov_b32 s55, 0xe8f000
|
||||
; GCN-NEXT: s_add_u32 s52, s52, s11
|
||||
; GCN-NEXT: s_addc_u32 s53, s53, 0
|
||||
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_load_dword s0, s[4:5], 0xb
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_load_dword s0, s[4:5], 0xb
|
||||
; GCN-NEXT: ;;#ASMSTART
|
||||
; GCN-NEXT: ; def s[4:19]
|
||||
@ -581,8 +593,12 @@ define amdgpu_kernel void @split_sgpr_spill_2_vgprs(ptr addrspace(1) %out, i32 %
|
||||
; GCN-NEXT: s_mov_b64 exec, s[28:29]
|
||||
; GCN-NEXT: s_mov_b32 s1, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_cmp_lg_u32 s0, s1
|
||||
; GCN-NEXT: s_cbranch_scc1 .LBB1_2
|
||||
; GCN-NEXT: s_cmp_eq_u32 s0, s1
|
||||
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; GCN-NEXT: s_mov_b64 s[2:3], -1
|
||||
; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GCN-NEXT: s_and_b64 vcc, exec, s[0:1]
|
||||
; GCN-NEXT: s_cbranch_vccnz .LBB1_2
|
||||
; GCN-NEXT: ; %bb.1: ; %bb0
|
||||
; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1
|
||||
; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 ; 4-byte Folded Reload
|
||||
@ -721,6 +737,10 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 %
|
||||
; GCN-NEXT: s_mov_b32 s55, 0xe8f000
|
||||
; GCN-NEXT: s_add_u32 s52, s52, s11
|
||||
; GCN-NEXT: s_addc_u32 s53, s53, 0
|
||||
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_load_dword s0, s[4:5], 0xb
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_load_dword s0, s[4:5], 0xb
|
||||
; GCN-NEXT: ;;#ASMSTART
|
||||
; GCN-NEXT: ;;#ASMEND
|
||||
@ -825,8 +845,12 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 %
|
||||
; GCN-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; GCN-NEXT: s_mov_b32 s1, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_cmp_lg_u32 s0, s1
|
||||
; GCN-NEXT: s_cbranch_scc1 .LBB2_2
|
||||
; GCN-NEXT: s_cmp_eq_u32 s0, s1
|
||||
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; GCN-NEXT: s_mov_b64 s[2:3], -1
|
||||
; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GCN-NEXT: s_and_b64 vcc, exec, s[0:1]
|
||||
; GCN-NEXT: s_cbranch_vccnz .LBB2_2
|
||||
; GCN-NEXT: ; %bb.1: ; %bb0
|
||||
; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1
|
||||
; GCN-NEXT: buffer_load_dword v31, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload
|
||||
@ -958,6 +982,8 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill_live_v0(i32 %in) #1 {
|
||||
; GCN-NEXT: s_add_u32 s52, s52, s11
|
||||
; GCN-NEXT: s_addc_u32 s53, s53, 0
|
||||
; GCN-NEXT: s_load_dword s0, s[4:5], 0x9
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_load_dword s0, s[4:5], 0x9
|
||||
; GCN-NEXT: ;;#ASMSTART
|
||||
; GCN-NEXT: ;;#ASMEND
|
||||
; GCN-NEXT: ;;#ASMSTART
|
||||
@ -1061,8 +1087,12 @@ define amdgpu_kernel void @no_vgprs_last_sgpr_spill_live_v0(i32 %in) #1 {
|
||||
; GCN-NEXT: s_mov_b64 exec, s[34:35]
|
||||
; GCN-NEXT: s_mov_b32 s1, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_cmp_lg_u32 s0, s1
|
||||
; GCN-NEXT: s_cbranch_scc1 .LBB3_2
|
||||
; GCN-NEXT: s_cmp_eq_u32 s0, s1
|
||||
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
|
||||
; GCN-NEXT: s_mov_b64 s[2:3], -1
|
||||
; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
|
||||
; GCN-NEXT: s_and_b64 vcc, exec, s[0:1]
|
||||
; GCN-NEXT: s_cbranch_vccnz .LBB3_2
|
||||
; GCN-NEXT: ; %bb.1: ; %bb0
|
||||
; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1
|
||||
; GCN-NEXT: buffer_load_dword v31, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -7,15 +7,23 @@ define amdgpu_kernel void @respect_optnone(double %arg0, double %arg1, ptr addrs
|
||||
; CHECK-LABEL: respect_optnone:
|
||||
; CHECK: ; %bb.0: ; %bb
|
||||
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
||||
; CHECK-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
|
||||
; CHECK-NEXT: s_nop 0
|
||||
; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10
|
||||
; CHECK-NEXT: s_mov_b32 s6, 0x3ff
|
||||
; CHECK-NEXT: v_and_b32_e64 v0, v0, s6
|
||||
; CHECK-NEXT: v_ashrrev_i32_e64 v1, 31, v0
|
||||
; CHECK-NEXT: s_mov_b32 s6, 3
|
||||
; CHECK-NEXT: v_lshlrev_b32_e64 v0, s6, v0
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CHECK-NEXT: global_load_dwordx2 v[0:1], v0, s[4:5]
|
||||
; CHECK-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
|
||||
; CHECK-NEXT: v_lshl_add_u64 v[0:1], v[0:1], s6, v[2:3]
|
||||
; CHECK-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
|
||||
; CHECK-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
|
||||
; CHECK-NEXT: v_mov_b64_e32 v[4:5], s[2:3]
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
||||
|
||||
@ -1,9 +1,9 @@
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=GCN,OPT %s
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=fiji -O0 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
|
||||
|
||||
; GCN-LABEL: {{^}}scalar_to_vector_i16:
|
||||
; GCN: v_mov_b32_e32 [[V:v[0-9]+]], 42
|
||||
; GCN: buffer_store_short [[V]],
|
||||
; OPT: v_mov_b32_e32 [[V:v[0-9]+]], 42
|
||||
; OPT: buffer_store_short [[V]],
|
||||
define void @scalar_to_vector_i16() {
|
||||
%tmp = load <2 x i16>, ptr addrspace(5) poison
|
||||
%tmp1 = insertelement <2 x i16> %tmp, i16 42, i64 0
|
||||
@ -12,8 +12,8 @@ define void @scalar_to_vector_i16() {
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}scalar_to_vector_f16:
|
||||
; GCN: v_mov_b32_e32 [[V:v[0-9]+]], 0x3c00
|
||||
; GCN: buffer_store_short [[V]],
|
||||
; OPT: v_mov_b32_e32 [[V:v[0-9]+]], 0x3c00
|
||||
; OPT: buffer_store_short [[V]],
|
||||
define void @scalar_to_vector_f16() {
|
||||
%tmp = load <2 x half>, ptr addrspace(5) poison
|
||||
%tmp1 = insertelement <2 x half> %tmp, half 1.0, i64 0
|
||||
|
||||
@ -21,6 +21,7 @@ define void @phi_vec1half_to_f32_with_const_folding(ptr addrspace(1) %dst) #0 {
|
||||
; CHECK-NEXT: s_mov_b32 s9, s10
|
||||
; CHECK-NEXT: v_mov_b32_e32 v2, s4
|
||||
; CHECK-NEXT: buffer_store_short v2, v[0:1], s[8:11], 0 addr64 offset:2
|
||||
; CHECK-NEXT: buffer_store_short v0, v[0:1], s[8:11], 0 addr64
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
||||
; CHECK-NEXT: s_setpc_b64 s[30:31]
|
||||
entry:
|
||||
@ -53,6 +54,7 @@ define void @phi_vec1half_to_f32(ptr addrspace(1) %src, ptr addrspace(1) %dst) #
|
||||
; CHECK-NEXT: s_mov_b32 s5, s6
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
||||
; CHECK-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 offset:2
|
||||
; CHECK-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
||||
; CHECK-NEXT: s_setpc_b64 s[30:31]
|
||||
entry:
|
||||
@ -76,8 +78,10 @@ define void @phi_vec1bf16_to_f32(ptr addrspace(1) %src, ptr addrspace(1) %dst) #
|
||||
; CHECK-NEXT: s_mov_b32 s4, s6
|
||||
; CHECK-NEXT: s_mov_b32 s5, s6
|
||||
; CHECK-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
|
||||
; CHECK-NEXT: s_mov_b32 s4, 16
|
||||
; CHECK-NEXT: s_mov_b32 s4, 0xffff
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
||||
; CHECK-NEXT: v_and_b32_e64 v0, v0, s4
|
||||
; CHECK-NEXT: s_mov_b32 s4, 16
|
||||
; CHECK-NEXT: v_lshlrev_b32_e64 v0, s4, v0
|
||||
; CHECK-NEXT: ; %bb.1: ; %bb
|
||||
; CHECK-NEXT: v_mul_f32_e64 v0, 1.0, v0
|
||||
@ -88,6 +92,7 @@ define void @phi_vec1bf16_to_f32(ptr addrspace(1) %src, ptr addrspace(1) %dst) #
|
||||
; CHECK-NEXT: s_mov_b32 s4, s6
|
||||
; CHECK-NEXT: s_mov_b32 s5, s6
|
||||
; CHECK-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64 offset:2
|
||||
; CHECK-NEXT: buffer_store_short v0, v[2:3], s[4:7], 0 addr64
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
||||
; CHECK-NEXT: s_setpc_b64 s[30:31]
|
||||
entry:
|
||||
@ -116,6 +121,7 @@ define void @phi_vec1bf16_to_f32_with_const_folding(ptr addrspace(1) %dst) #0 {
|
||||
; CHECK-NEXT: s_mov_b32 s4, s6
|
||||
; CHECK-NEXT: s_mov_b32 s5, s6
|
||||
; CHECK-NEXT: buffer_store_short v2, v[0:1], s[4:7], 0 addr64 offset:2
|
||||
; CHECK-NEXT: buffer_store_short v0, v[0:1], s[4:7], 0 addr64
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
||||
; CHECK-NEXT: s_setpc_b64 s[30:31]
|
||||
entry:
|
||||
|
||||
@ -9,6 +9,10 @@ define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %ou
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_add_u32 s0, s0, s17
|
||||
; GCN-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_load_dword s4, s[8:9], 0x2
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_load_dword s4, s[8:9], 0x2
|
||||
; GCN-NEXT: ;;#ASMSTART
|
||||
; GCN-NEXT: ;;#ASMEND
|
||||
@ -109,8 +113,12 @@ define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %ou
|
||||
; GCN-NEXT: s_mov_b64 exec, s[24:25]
|
||||
; GCN-NEXT: s_mov_b32 s5, 0
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_cmp_lg_u32 s4, s5
|
||||
; GCN-NEXT: s_cbranch_scc1 .LBB0_2
|
||||
; GCN-NEXT: s_cmp_eq_u32 s4, s5
|
||||
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
|
||||
; GCN-NEXT: s_mov_b64 s[6:7], -1
|
||||
; GCN-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7]
|
||||
; GCN-NEXT: s_and_b64 vcc, exec, s[4:5]
|
||||
; GCN-NEXT: s_cbranch_vccnz .LBB0_2
|
||||
; GCN-NEXT: ; %bb.1: ; %bb0
|
||||
; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1
|
||||
; GCN-NEXT: buffer_load_dword v23, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload
|
||||
|
||||
@ -726,16 +726,20 @@ define void @spill_sgpr_with_sgpr_uses() #0 {
|
||||
; GCN-NEXT: s_mov_b32 s5, s4
|
||||
; GCN-NEXT: ; implicit-def: $vgpr254 : SGPR spill to VGPR lane
|
||||
; GCN-NEXT: v_writelane_b32 v254, s5, 0
|
||||
; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1
|
||||
; GCN-NEXT: s_or_saveexec_b64 s[10:11], -1
|
||||
; GCN-NEXT: buffer_store_dword v254, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill
|
||||
; GCN-NEXT: s_mov_b64 exec, s[8:9]
|
||||
; GCN-NEXT: s_mov_b64 exec, s[10:11]
|
||||
; GCN-NEXT: s_mov_b32 s5, 0
|
||||
; GCN-NEXT: s_cmp_lg_u32 s4, s5
|
||||
; GCN-NEXT: s_cbranch_scc1 .LBB3_2
|
||||
; GCN-NEXT: s_cmp_eq_u32 s4, s5
|
||||
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
|
||||
; GCN-NEXT: s_mov_b64 s[6:7], -1
|
||||
; GCN-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7]
|
||||
; GCN-NEXT: s_and_b64 vcc, exec, s[4:5]
|
||||
; GCN-NEXT: s_cbranch_vccnz .LBB3_2
|
||||
; GCN-NEXT: ; %bb.1: ; %bb0
|
||||
; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1
|
||||
; GCN-NEXT: s_or_saveexec_b64 s[10:11], -1
|
||||
; GCN-NEXT: buffer_load_dword v254, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload
|
||||
; GCN-NEXT: s_mov_b64 exec, s[8:9]
|
||||
; GCN-NEXT: s_mov_b64 exec, s[10:11]
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: v_readlane_b32 s4, v254, 0
|
||||
; GCN-NEXT: ;;#ASMSTART
|
||||
|
||||
@ -7,43 +7,86 @@ declare <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.f16(<8 x half>, <16 x half
|
||||
define amdgpu_kernel void @test_smfmac_f32_32x32x32_f16__vgpr(ptr addrspace(1) %arg, <8 x half> %a, <16 x half> %b, i32 %idx) #0 {
|
||||
; CHECK-LABEL: test_smfmac_f32_32x32x32_f16__vgpr:
|
||||
; CHECK: ; %bb.0: ; %bb
|
||||
; CHECK-NEXT: s_mov_b64 s[2:3], s[4:5]
|
||||
; CHECK-NEXT: v_mov_b32_e32 v1, v0
|
||||
; CHECK-NEXT: v_mov_b32_e32 v0, 0
|
||||
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
|
||||
; CHECK-NEXT: s_load_dwordx4 s[12:15], s[2:3], 0x34
|
||||
; CHECK-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x44
|
||||
; CHECK-NEXT: s_nop 0
|
||||
; CHECK-NEXT: s_load_dword s2, s[2:3], 0x64
|
||||
; CHECK-NEXT: s_mov_b32 s3, 0x3ff
|
||||
; CHECK-NEXT: v_and_b32_e64 v1, v1, s3
|
||||
; CHECK-NEXT: s_mov_b32 s3, 6
|
||||
; CHECK-NEXT: v_lshlrev_b32_e64 v8, s3, v1
|
||||
; CHECK-NEXT: s_load_dword s0, s[4:5], 0x64
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CHECK-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] offset:48
|
||||
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
|
||||
; CHECK-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
|
||||
; CHECK-NEXT: v_mov_b32_e32 v0, 0
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; CHECK-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x34
|
||||
; CHECK-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44
|
||||
; CHECK-NEXT: s_load_dword s2, s[4:5], 0x64
|
||||
; CHECK-NEXT: s_mov_b32 s3, 0x3ff
|
||||
; CHECK-NEXT: v_and_b32_e64 v2, v1, s3
|
||||
; CHECK-NEXT: v_ashrrev_i32_e64 v1, 31, v2
|
||||
; CHECK-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
|
||||
; CHECK-NEXT: v_mov_b32_e32 v3, v1
|
||||
; CHECK-NEXT: s_mov_b32 s3, 6
|
||||
; CHECK-NEXT: v_lshlrev_b64 v[2:3], s3, v[2:3]
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CHECK-NEXT: v_lshl_add_u64 v[2:3], s[0:1], 0, v[2:3]
|
||||
; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
||||
; CHECK-NEXT: v_mov_b32_e32 v1, v7
|
||||
; CHECK-NEXT: v_mov_b32_e32 v2, v6
|
||||
; CHECK-NEXT: v_mov_b32_e32 v3, v5
|
||||
; CHECK-NEXT: v_mov_b32_e32 v12, v6
|
||||
; CHECK-NEXT: v_mov_b32_e32 v13, v5
|
||||
; CHECK-NEXT: v_mov_b32_e32 v14, v4
|
||||
; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
||||
; CHECK-NEXT: v_mov_b32_e32 v15, v7
|
||||
; CHECK-NEXT: v_mov_b32_e32 v16, v6
|
||||
; CHECK-NEXT: v_mov_b32_e32 v17, v5
|
||||
; CHECK-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5_vgpr6_vgpr7 killed $exec
|
||||
; CHECK-NEXT: global_load_dwordx4 v[10:13], v8, s[0:1] offset:32
|
||||
; CHECK-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 killed $exec
|
||||
; CHECK-NEXT: v_mov_b32_e32 v5, v17
|
||||
; CHECK-NEXT: v_mov_b32_e32 v6, v16
|
||||
; CHECK-NEXT: v_mov_b32_e32 v7, v15
|
||||
; CHECK-NEXT: v_mov_b32_e32 v8, v14
|
||||
; CHECK-NEXT: v_mov_b32_e32 v9, v13
|
||||
; CHECK-NEXT: v_mov_b32_e32 v10, v12
|
||||
; CHECK-NEXT: v_mov_b32_e32 v11, v1
|
||||
; CHECK-NEXT: v_mov_b32_e32 v25, v11
|
||||
; CHECK-NEXT: v_mov_b32_e32 v26, v10
|
||||
; CHECK-NEXT: v_mov_b32_e32 v27, v9
|
||||
; CHECK-NEXT: v_mov_b32_e32 v28, v8
|
||||
; CHECK-NEXT: v_mov_b32_e32 v29, v7
|
||||
; CHECK-NEXT: v_mov_b32_e32 v30, v6
|
||||
; CHECK-NEXT: v_mov_b32_e32 v31, v5
|
||||
; CHECK-NEXT: v_mov_b32_e32 v8, v4
|
||||
; CHECK-NEXT: global_load_dwordx4 v[10:13], v[2:3], off offset:32
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
||||
; CHECK-NEXT: v_mov_b32_e32 v5, v13
|
||||
; CHECK-NEXT: v_mov_b32_e32 v6, v12
|
||||
; CHECK-NEXT: v_mov_b32_e32 v7, v11
|
||||
; CHECK-NEXT: ; kill: def $vgpr10 killed $vgpr10 killed $vgpr10_vgpr11_vgpr12_vgpr13 killed $exec
|
||||
; CHECK-NEXT: s_mov_b64 s[6:7], 32
|
||||
; CHECK-NEXT: v_lshl_add_u64 v[2:3], v[2:3], 0, s[6:7]
|
||||
; CHECK-NEXT: global_load_dwordx4 v[12:15], v[2:3], off offset:16
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
||||
; CHECK-NEXT: v_mov_b32_e32 v1, v15
|
||||
; CHECK-NEXT: v_mov_b32_e32 v2, v14
|
||||
; CHECK-NEXT: v_mov_b32_e32 v3, v13
|
||||
; CHECK-NEXT: v_mov_b32_e32 v4, v12
|
||||
; CHECK-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 killed $exec
|
||||
; CHECK-NEXT: v_mov_b32_e32 v11, v7
|
||||
; CHECK-NEXT: v_mov_b32_e32 v12, v6
|
||||
; CHECK-NEXT: v_mov_b32_e32 v13, v5
|
||||
; CHECK-NEXT: v_mov_b32_e32 v14, v4
|
||||
; CHECK-NEXT: v_mov_b32_e32 v15, v3
|
||||
; CHECK-NEXT: v_mov_b32_e32 v16, v2
|
||||
; CHECK-NEXT: v_mov_b32_e32 v17, v1
|
||||
; CHECK-NEXT: v_mov_b32_e32 v1, v17
|
||||
; CHECK-NEXT: v_mov_b32_e32 v2, v16
|
||||
; CHECK-NEXT: v_mov_b32_e32 v3, v15
|
||||
; CHECK-NEXT: v_mov_b32_e32 v4, v14
|
||||
; CHECK-NEXT: v_mov_b32_e32 v5, v13
|
||||
; CHECK-NEXT: v_mov_b32_e32 v6, v12
|
||||
; CHECK-NEXT: v_mov_b32_e32 v7, v11
|
||||
; CHECK-NEXT: v_mov_b32_e32 v24, v10
|
||||
; CHECK-NEXT: global_load_dwordx4 v[10:13], v8, s[0:1] offset:16
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
||||
; CHECK-NEXT: v_mov_b32_e32 v25, v13
|
||||
; CHECK-NEXT: v_mov_b32_e32 v26, v12
|
||||
; CHECK-NEXT: v_mov_b32_e32 v27, v11
|
||||
; CHECK-NEXT: v_mov_b32_e32 v28, v10
|
||||
; CHECK-NEXT: global_load_dwordx4 v[8:11], v8, s[0:1]
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
||||
; CHECK-NEXT: v_mov_b32_e32 v29, v11
|
||||
; CHECK-NEXT: v_mov_b32_e32 v30, v10
|
||||
; CHECK-NEXT: v_mov_b32_e32 v31, v9
|
||||
; CHECK-NEXT: ; kill: def $vgpr8 killed $vgpr8 killed $vgpr8_vgpr9_vgpr10_vgpr11 killed $exec
|
||||
; CHECK-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 killed $exec
|
||||
; CHECK-NEXT: v_mov_b32_e32 v9, v31
|
||||
; CHECK-NEXT: v_mov_b32_e32 v10, v30
|
||||
@ -60,12 +103,12 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_f16__vgpr(ptr addrspace(1) %
|
||||
; CHECK-NEXT: v_mov_b32_e32 v21, v3
|
||||
; CHECK-NEXT: v_mov_b32_e32 v22, v2
|
||||
; CHECK-NEXT: v_mov_b32_e32 v23, v1
|
||||
; CHECK-NEXT: v_mov_b64_e32 v[2:3], s[12:13]
|
||||
; CHECK-NEXT: v_mov_b64_e32 v[4:5], s[14:15]
|
||||
; CHECK-NEXT: v_mov_b64_e32 v[30:31], s[10:11]
|
||||
; CHECK-NEXT: v_mov_b64_e32 v[28:29], s[8:9]
|
||||
; CHECK-NEXT: v_mov_b64_e32 v[26:27], s[6:7]
|
||||
; CHECK-NEXT: v_mov_b64_e32 v[24:25], s[4:5]
|
||||
; CHECK-NEXT: v_mov_b64_e32 v[2:3], s[16:17]
|
||||
; CHECK-NEXT: v_mov_b64_e32 v[4:5], s[18:19]
|
||||
; CHECK-NEXT: v_mov_b64_e32 v[30:31], s[14:15]
|
||||
; CHECK-NEXT: v_mov_b64_e32 v[28:29], s[12:13]
|
||||
; CHECK-NEXT: v_mov_b64_e32 v[26:27], s[10:11]
|
||||
; CHECK-NEXT: v_mov_b64_e32 v[24:25], s[8:9]
|
||||
; CHECK-NEXT: v_mov_b32_e32 v1, s2
|
||||
; CHECK-NEXT: s_nop 1
|
||||
; CHECK-NEXT: v_smfmac_f32_32x32x32_f16 v[8:23], v[2:5], v[24:31], v1 cbsz:1 abid:2
|
||||
@ -78,7 +121,15 @@ define amdgpu_kernel void @test_smfmac_f32_32x32x32_f16__vgpr(ptr addrspace(1) %
|
||||
; CHECK-NEXT: v_mov_b32_e32 v3, v7
|
||||
; CHECK-NEXT: v_mov_b32_e32 v4, v6
|
||||
; CHECK-NEXT: v_mov_b32_e32 v5, v1
|
||||
; CHECK-NEXT: global_store_dwordx4 v0, v[2:5], s[0:1] offset:48
|
||||
; CHECK-NEXT: s_mov_b32 s2, s0
|
||||
; CHECK-NEXT: s_mov_b32 s3, s1
|
||||
; CHECK-NEXT: s_mov_b32 s5, s6
|
||||
; CHECK-NEXT: s_mov_b32 s4, s7
|
||||
; CHECK-NEXT: s_add_u32 s2, s2, s5
|
||||
; CHECK-NEXT: s_addc_u32 s4, s3, s4
|
||||
; CHECK-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
|
||||
; CHECK-NEXT: s_mov_b32 s3, s4
|
||||
; CHECK-NEXT: global_store_dwordx4 v0, v[2:5], s[2:3] offset:16
|
||||
; CHECK-NEXT: v_mov_b32_e32 v1, v19
|
||||
; CHECK-NEXT: v_mov_b32_e32 v6, v18
|
||||
; CHECK-NEXT: v_mov_b32_e32 v7, v17
|
||||
|
||||
@ -2,10 +2,9 @@
|
||||
; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -debug-only=branch-relaxation < %s 2>&1 | FileCheck --check-prefix=GFX10 %s
|
||||
|
||||
; GFX10: Basic blocks after relaxation
|
||||
; GFX10: %bb.0 offset=00000000 size=0x1c
|
||||
; GFX10: %bb.0 offset=00000000 size=0x28
|
||||
|
||||
; Each instruction in the following kernel is 4 bytes in size,
|
||||
; except s_load_b32 which is 8 bytes in size. Hence, 0x1c bytes in total.
|
||||
; At -O0 without DAG combines, more instructions are generated.
|
||||
define amdgpu_kernel void @test_sopk_size(i32 %var.mode) {
|
||||
; GFX10-LABEL: test_sopk_size:
|
||||
; GFX10: ; %bb.0:
|
||||
|
||||
@ -20,7 +20,7 @@
|
||||
; TOVMEM: buffer_store_dword [[SPILL_VREG]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 ; 4-byte Folded Spill
|
||||
; TOVMEM: s_mov_b64 exec, [[COPY_EXEC]]
|
||||
|
||||
; GCN: s_cbranch_scc1 [[ENDIF:.LBB[0-9]+_[0-9]+]]
|
||||
; GCN: s_cbranch_vccnz [[ENDIF:.LBB[0-9]+_[0-9]+]]
|
||||
|
||||
; GCN: [[ENDIF]]:
|
||||
; TOVGPR: v_readlane_b32 [[M0_RESTORE:s[0-9]+]], [[SPILL_VREG]], [[M0_LANE]]
|
||||
|
||||
@ -25,15 +25,18 @@ define void @test() {
|
||||
; CHECK-NEXT: v_readfirstlane_b32 s6, v0
|
||||
; CHECK-NEXT: s_mov_b64 s[4:5], -1
|
||||
; CHECK-NEXT: s_mov_b32 s7, 0
|
||||
; CHECK-NEXT: s_cmp_eq_u32 s6, s7
|
||||
; CHECK-NEXT: s_cmp_lg_u32 s6, s7
|
||||
; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0
|
||||
; CHECK-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5]
|
||||
; CHECK-NEXT: s_and_b64 vcc, exec, s[6:7]
|
||||
; CHECK-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane
|
||||
; CHECK-NEXT: v_writelane_b32 v1, s4, 0
|
||||
; CHECK-NEXT: v_writelane_b32 v1, s5, 1
|
||||
; CHECK-NEXT: s_mov_b64 s[10:11], exec
|
||||
; CHECK-NEXT: s_mov_b64 exec, -1
|
||||
; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1
|
||||
; CHECK-NEXT: s_nop 0
|
||||
; CHECK-NEXT: v_accvgpr_write_b32 a0, v1 ; Reload Reuse
|
||||
; CHECK-NEXT: s_mov_b64 exec, s[10:11]
|
||||
; CHECK-NEXT: s_cbranch_scc1 .LBB0_5
|
||||
; CHECK-NEXT: s_cbranch_vccnz .LBB0_5
|
||||
; CHECK-NEXT: ; %bb.4: ; %bb.4
|
||||
; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1
|
||||
; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1
|
||||
@ -54,9 +57,8 @@ define void @test() {
|
||||
; CHECK-NEXT: s_mov_b64 exec, s[10:11]
|
||||
; CHECK-NEXT: v_readlane_b32 s4, v1, 0
|
||||
; CHECK-NEXT: v_readlane_b32 s5, v1, 1
|
||||
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
|
||||
; CHECK-NEXT: s_mov_b32 s4, 1
|
||||
; CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], v0, s4
|
||||
; CHECK-NEXT: s_mov_b64 s[6:7], -1
|
||||
; CHECK-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7]
|
||||
; CHECK-NEXT: s_and_b64 vcc, exec, s[4:5]
|
||||
; CHECK-NEXT: s_cbranch_vccnz .LBB0_1
|
||||
; CHECK-NEXT: ; %bb.6: ; %bb.5
|
||||
|
||||
@ -5,14 +5,14 @@
|
||||
|
||||
; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 0
|
||||
; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 1
|
||||
; VGPR: s_cbranch_scc1
|
||||
; VGPR: s_cbranch_vccnz
|
||||
|
||||
; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 0
|
||||
; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 1
|
||||
|
||||
|
||||
; VMEM: buffer_store_dword
|
||||
; VMEM: s_cbranch_scc1
|
||||
; VMEM: s_cbranch_vccnz
|
||||
|
||||
; VMEM: buffer_load_dword
|
||||
define amdgpu_kernel void @spill_sgpr_x2(ptr addrspace(1) %out, i32 %in) #0 {
|
||||
@ -33,7 +33,7 @@ ret:
|
||||
; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 0
|
||||
; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 1
|
||||
; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 2
|
||||
; VGPR: s_cbranch_scc1
|
||||
; VGPR: s_cbranch_vccnz
|
||||
|
||||
; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 0
|
||||
; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 1
|
||||
@ -41,7 +41,7 @@ ret:
|
||||
|
||||
|
||||
; VMEM: buffer_store_dword
|
||||
; VMEM: s_cbranch_scc1
|
||||
; VMEM: s_cbranch_vccnz
|
||||
|
||||
; VMEM: buffer_load_dword
|
||||
define amdgpu_kernel void @spill_sgpr_x3(ptr addrspace(1) %out, i32 %in) #0 {
|
||||
@ -63,7 +63,7 @@ ret:
|
||||
; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 1
|
||||
; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 2
|
||||
; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 3
|
||||
; VGPR: s_cbranch_scc1
|
||||
; VGPR: s_cbranch_vccnz
|
||||
|
||||
; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 0
|
||||
; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 1
|
||||
@ -72,7 +72,7 @@ ret:
|
||||
|
||||
|
||||
; VMEM: buffer_store_dword
|
||||
; VMEM: s_cbranch_scc1
|
||||
; VMEM: s_cbranch_vccnz
|
||||
|
||||
; VMEM: buffer_load_dword
|
||||
define amdgpu_kernel void @spill_sgpr_x4(ptr addrspace(1) %out, i32 %in) #0 {
|
||||
@ -95,7 +95,7 @@ ret:
|
||||
; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 2
|
||||
; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 3
|
||||
; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 4
|
||||
; VGPR: s_cbranch_scc1
|
||||
; VGPR: s_cbranch_vccnz
|
||||
|
||||
; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 0
|
||||
; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 1
|
||||
@ -105,7 +105,7 @@ ret:
|
||||
|
||||
|
||||
; VMEM: buffer_store_dword
|
||||
; VMEM: s_cbranch_scc1
|
||||
; VMEM: s_cbranch_vccnz
|
||||
|
||||
; VMEM: buffer_load_dword
|
||||
define amdgpu_kernel void @spill_sgpr_x5(ptr addrspace(1) %out, i32 %in) #0 {
|
||||
@ -131,7 +131,7 @@ ret:
|
||||
; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 5
|
||||
; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 6
|
||||
; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 7
|
||||
; VGPR: s_cbranch_scc1
|
||||
; VGPR: s_cbranch_vccnz
|
||||
|
||||
; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 0
|
||||
; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 1
|
||||
@ -143,7 +143,7 @@ ret:
|
||||
; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 7
|
||||
|
||||
; VMEM: buffer_store_dword
|
||||
; VMEM: s_cbranch_scc1
|
||||
; VMEM: s_cbranch_vccnz
|
||||
|
||||
; VMEM: buffer_load_dword
|
||||
define amdgpu_kernel void @spill_sgpr_x8(ptr addrspace(1) %out, i32 %in) #0 {
|
||||
@ -177,7 +177,7 @@ ret:
|
||||
; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 13
|
||||
; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 14
|
||||
; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 15
|
||||
; VGPR: s_cbranch_scc1
|
||||
; VGPR: s_cbranch_vccnz
|
||||
|
||||
; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 0
|
||||
; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 1
|
||||
@ -197,7 +197,7 @@ ret:
|
||||
; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 15
|
||||
|
||||
; VMEM: buffer_store_dword
|
||||
; VMEM: s_cbranch_scc1
|
||||
; VMEM: s_cbranch_vccnz
|
||||
|
||||
; VMEM: buffer_load_dword
|
||||
define amdgpu_kernel void @spill_sgpr_x16(ptr addrspace(1) %out, i32 %in) #0 {
|
||||
@ -247,7 +247,7 @@ ret:
|
||||
; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 29
|
||||
; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 30
|
||||
; VGPR: v_writelane_b32 v{{[0-9]+}}, s{{[0-9]+}}, 31
|
||||
; VGPR: s_cbranch_scc1
|
||||
; VGPR: s_cbranch_vccnz
|
||||
|
||||
; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 0
|
||||
; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 1
|
||||
@ -283,7 +283,7 @@ ret:
|
||||
; VGPR: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 31
|
||||
|
||||
; VMEM: buffer_store_dword
|
||||
; VMEM: s_cbranch_scc1
|
||||
; VMEM: s_cbranch_vccnz
|
||||
|
||||
; VMEM: buffer_load_dword
|
||||
define amdgpu_kernel void @spill_sgpr_x32(ptr addrspace(1) %out, i32 %in) #0 {
|
||||
|
||||
@ -10,10 +10,39 @@ declare ptr addrspace(5) @llvm.stacksave.p5()
|
||||
declare void @llvm.stackrestore.p5(ptr addrspace(5))
|
||||
|
||||
define hidden void @stack_passed_argument([32 x i32], i32) {
|
||||
; GCN-LABEL: stack_passed_argument:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GCN-NEXT: s_setpc_b64 s[30:31]
|
||||
; WAVE32-OPT-LABEL: stack_passed_argument:
|
||||
; WAVE32-OPT: ; %bb.0:
|
||||
; WAVE32-OPT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; WAVE32-OPT-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; WAVE64-OPT-LABEL: stack_passed_argument:
|
||||
; WAVE64-OPT: ; %bb.0:
|
||||
; WAVE64-OPT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; WAVE64-OPT-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; WAVE32-O0-LABEL: stack_passed_argument:
|
||||
; WAVE32-O0: ; %bb.0:
|
||||
; WAVE32-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; WAVE32-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4
|
||||
; WAVE32-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32
|
||||
; WAVE32-O0-NEXT: s_waitcnt vmcnt(0)
|
||||
; WAVE32-O0-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; WAVE64-O0-LABEL: stack_passed_argument:
|
||||
; WAVE64-O0: ; %bb.0:
|
||||
; WAVE64-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; WAVE64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4
|
||||
; WAVE64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32
|
||||
; WAVE64-O0-NEXT: s_waitcnt vmcnt(0)
|
||||
; WAVE64-O0-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; WAVE32-WWM-PREALLOC-LABEL: stack_passed_argument:
|
||||
; WAVE32-WWM-PREALLOC: ; %bb.0:
|
||||
; WAVE32-WWM-PREALLOC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; WAVE32-WWM-PREALLOC-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4
|
||||
; WAVE32-WWM-PREALLOC-NEXT: buffer_load_dword v0, off, s[0:3], s32
|
||||
; WAVE32-WWM-PREALLOC-NEXT: s_waitcnt vmcnt(0)
|
||||
; WAVE32-WWM-PREALLOC-NEXT: s_setpc_b64 s[30:31]
|
||||
ret void
|
||||
}
|
||||
|
||||
@ -810,6 +839,8 @@ define amdgpu_kernel void @kernel_stacksave_sgpr(ptr addrspace(5) %stack) {
|
||||
; WAVE32-O0: ; %bb.0:
|
||||
; WAVE32-O0-NEXT: s_load_dword s0, s[4:5], 0x0
|
||||
; WAVE32-O0-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; WAVE32-O0-NEXT: s_load_dword s0, s[4:5], 0x0
|
||||
; WAVE32-O0-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; WAVE32-O0-NEXT: s_mov_b32 s1, s0
|
||||
; WAVE32-O0-NEXT: ;;#ASMSTART
|
||||
; WAVE32-O0-NEXT: ; use s1
|
||||
@ -822,6 +853,8 @@ define amdgpu_kernel void @kernel_stacksave_sgpr(ptr addrspace(5) %stack) {
|
||||
; WAVE64-O0: ; %bb.0:
|
||||
; WAVE64-O0-NEXT: s_load_dword s0, s[4:5], 0x0
|
||||
; WAVE64-O0-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; WAVE64-O0-NEXT: s_load_dword s0, s[4:5], 0x0
|
||||
; WAVE64-O0-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; WAVE64-O0-NEXT: s_mov_b32 s1, s0
|
||||
; WAVE64-O0-NEXT: ;;#ASMSTART
|
||||
; WAVE64-O0-NEXT: ; use s1
|
||||
@ -834,6 +867,8 @@ define amdgpu_kernel void @kernel_stacksave_sgpr(ptr addrspace(5) %stack) {
|
||||
; WAVE32-WWM-PREALLOC: ; %bb.0:
|
||||
; WAVE32-WWM-PREALLOC-NEXT: s_load_dword s0, s[4:5], 0x0
|
||||
; WAVE32-WWM-PREALLOC-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; WAVE32-WWM-PREALLOC-NEXT: s_load_dword s0, s[4:5], 0x0
|
||||
; WAVE32-WWM-PREALLOC-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s1, s0
|
||||
; WAVE32-WWM-PREALLOC-NEXT: ;;#ASMSTART
|
||||
; WAVE32-WWM-PREALLOC-NEXT: ; use s1
|
||||
@ -950,6 +985,9 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects
|
||||
; WAVE32-O0-NEXT: s_mov_b32 s15, s32
|
||||
; WAVE32-O0-NEXT: v_mov_b32_e32 v3, 17
|
||||
; WAVE32-O0-NEXT: buffer_store_dword v3, off, s[20:23], s15 offset:4
|
||||
; WAVE32-O0-NEXT: ; implicit-def: $sgpr16
|
||||
; WAVE32-O0-NEXT: v_mov_b32_e32 v3, s16
|
||||
; WAVE32-O0-NEXT: buffer_store_dword v3, off, s[20:23], s15
|
||||
; WAVE32-O0-NEXT: s_mov_b32 s15, stack_passed_argument@abs32@hi
|
||||
; WAVE32-O0-NEXT: s_mov_b32 s16, stack_passed_argument@abs32@lo
|
||||
; WAVE32-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17
|
||||
@ -1060,6 +1098,9 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects
|
||||
; WAVE64-O0-NEXT: s_mov_b32 s15, s32
|
||||
; WAVE64-O0-NEXT: v_mov_b32_e32 v3, 17
|
||||
; WAVE64-O0-NEXT: buffer_store_dword v3, off, s[24:27], s15 offset:4
|
||||
; WAVE64-O0-NEXT: ; implicit-def: $sgpr16
|
||||
; WAVE64-O0-NEXT: v_mov_b32_e32 v3, s16
|
||||
; WAVE64-O0-NEXT: buffer_store_dword v3, off, s[24:27], s15
|
||||
; WAVE64-O0-NEXT: s_mov_b32 s15, stack_passed_argument@abs32@hi
|
||||
; WAVE64-O0-NEXT: s_mov_b32 s16, stack_passed_argument@abs32@lo
|
||||
; WAVE64-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17
|
||||
@ -1171,6 +1212,9 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects
|
||||
; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s15, s32
|
||||
; WAVE32-WWM-PREALLOC-NEXT: v_mov_b32_e32 v3, 17
|
||||
; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v3, off, s[20:23], s15 offset:4
|
||||
; WAVE32-WWM-PREALLOC-NEXT: ; implicit-def: $sgpr16
|
||||
; WAVE32-WWM-PREALLOC-NEXT: v_mov_b32_e32 v3, s16
|
||||
; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v3, off, s[20:23], s15
|
||||
; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s15, stack_passed_argument@abs32@hi
|
||||
; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s16, stack_passed_argument@abs32@lo
|
||||
; WAVE32-WWM-PREALLOC-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17
|
||||
@ -1356,6 +1400,9 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() {
|
||||
; WAVE32-O0-NEXT: s_mov_b32 s16, s32
|
||||
; WAVE32-O0-NEXT: v_mov_b32_e32 v0, 17
|
||||
; WAVE32-O0-NEXT: buffer_store_dword v0, off, s[0:3], s16 offset:4
|
||||
; WAVE32-O0-NEXT: ; implicit-def: $sgpr17
|
||||
; WAVE32-O0-NEXT: v_mov_b32_e32 v0, s17
|
||||
; WAVE32-O0-NEXT: buffer_store_dword v0, off, s[0:3], s16
|
||||
; WAVE32-O0-NEXT: s_mov_b32 s18, stack_passed_argument@abs32@hi
|
||||
; WAVE32-O0-NEXT: s_mov_b32 s16, stack_passed_argument@abs32@lo
|
||||
; WAVE32-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17
|
||||
@ -1467,6 +1514,9 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() {
|
||||
; WAVE64-O0-NEXT: s_mov_b32 s16, s32
|
||||
; WAVE64-O0-NEXT: v_mov_b32_e32 v0, 17
|
||||
; WAVE64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s16 offset:4
|
||||
; WAVE64-O0-NEXT: ; implicit-def: $sgpr17
|
||||
; WAVE64-O0-NEXT: v_mov_b32_e32 v0, s17
|
||||
; WAVE64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s16
|
||||
; WAVE64-O0-NEXT: s_mov_b32 s18, stack_passed_argument@abs32@hi
|
||||
; WAVE64-O0-NEXT: s_mov_b32 s16, stack_passed_argument@abs32@lo
|
||||
; WAVE64-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17
|
||||
@ -1578,6 +1628,9 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() {
|
||||
; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s16, s32
|
||||
; WAVE32-WWM-PREALLOC-NEXT: v_mov_b32_e32 v0, 17
|
||||
; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v0, off, s[0:3], s16 offset:4
|
||||
; WAVE32-WWM-PREALLOC-NEXT: ; implicit-def: $sgpr17
|
||||
; WAVE32-WWM-PREALLOC-NEXT: v_mov_b32_e32 v0, s17
|
||||
; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v0, off, s[0:3], s16
|
||||
; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s18, stack_passed_argument@abs32@hi
|
||||
; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s16, stack_passed_argument@abs32@lo
|
||||
; WAVE32-WWM-PREALLOC-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17
|
||||
@ -1674,5 +1727,6 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() {
|
||||
|
||||
|
||||
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
|
||||
; GCN: {{.*}}
|
||||
; WAVE32: {{.*}}
|
||||
; WAVE64: {{.*}}
|
||||
|
||||
@ -79,6 +79,8 @@ define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) {
|
||||
; HSA-TRAP-GFX1100-O0-LABEL: trap:
|
||||
; HSA-TRAP-GFX1100-O0: ; %bb.0:
|
||||
; HSA-TRAP-GFX1100-O0-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
|
||||
; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; HSA-TRAP-GFX1100-O0-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
|
||||
; HSA-TRAP-GFX1100-O0-NEXT: v_mov_b32_e32 v0, 0
|
||||
; HSA-TRAP-GFX1100-O0-NEXT: v_mov_b32_e32 v1, 1
|
||||
; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt lgkmcnt(0)
|
||||
@ -212,6 +214,8 @@ define amdgpu_kernel void @non_entry_trap(ptr addrspace(1) nocapture readonly %a
|
||||
; HSA-TRAP-GFX1100-O0: ; %bb.0: ; %entry
|
||||
; HSA-TRAP-GFX1100-O0-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
|
||||
; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; HSA-TRAP-GFX1100-O0-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
|
||||
; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; HSA-TRAP-GFX1100-O0-NEXT: s_mov_b64 s[2:3], s[0:1]
|
||||
; HSA-TRAP-GFX1100-O0-NEXT: ; implicit-def: $vgpr2 : SGPR spill to VGPR lane
|
||||
; HSA-TRAP-GFX1100-O0-NEXT: v_writelane_b32 v2, s2, 0
|
||||
@ -355,8 +359,12 @@ define amdgpu_kernel void @trap_with_use_after(ptr addrspace(1) %arg0, ptr addrs
|
||||
;
|
||||
; HSA-TRAP-GFX1100-O0-LABEL: trap_with_use_after:
|
||||
; HSA-TRAP-GFX1100-O0: ; %bb.0:
|
||||
; HSA-TRAP-GFX1100-O0-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
|
||||
; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; HSA-TRAP-GFX1100-O0-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
|
||||
; HSA-TRAP-GFX1100-O0-NEXT: v_mov_b32_e32 v0, 0
|
||||
; HSA-TRAP-GFX1100-O0-NEXT: scratch_store_b32 off, v0, off offset:8 ; 4-byte Folded Spill
|
||||
; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; HSA-TRAP-GFX1100-O0-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
|
||||
; HSA-TRAP-GFX1100-O0-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
|
||||
; HSA-TRAP-GFX1100-O0-NEXT: ; implicit-def: $vgpr2 : SGPR spill to VGPR lane
|
||||
@ -477,6 +485,8 @@ define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0)
|
||||
; HSA-TRAP-GFX1100-O0-LABEL: debugtrap:
|
||||
; HSA-TRAP-GFX1100-O0: ; %bb.0:
|
||||
; HSA-TRAP-GFX1100-O0-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
|
||||
; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; HSA-TRAP-GFX1100-O0-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
|
||||
; HSA-TRAP-GFX1100-O0-NEXT: v_mov_b32_e32 v0, 0
|
||||
; HSA-TRAP-GFX1100-O0-NEXT: v_mov_b32_e32 v1, 1
|
||||
; HSA-TRAP-GFX1100-O0-NEXT: s_waitcnt lgkmcnt(0)
|
||||
|
||||
@ -558,8 +558,8 @@ define amdgpu_kernel void @k256_w1_asm() #2561 {
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}use512vgprs_codegen:
|
||||
; GFX1250: NumVgprs: 512
|
||||
; GFX1250: VGPRBlocks: 31
|
||||
; GFX1250: NumVgprs: 482
|
||||
; GFX1250: VGPRBlocks: 30
|
||||
define amdgpu_kernel void @use512vgprs_codegen(ptr %p) #2561 {
|
||||
%r0 = load volatile <512 x float>, ptr %p, align 1
|
||||
store volatile <512 x float> %r0, ptr %p
|
||||
@ -567,8 +567,8 @@ define amdgpu_kernel void @use512vgprs_codegen(ptr %p) #2561 {
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}use1024vgprs_codegen:
|
||||
; GFX1250: NumVgprs: 1024
|
||||
; GFX1250: VGPRBlocks: 63
|
||||
; GFX1250: NumVgprs: 998
|
||||
; GFX1250: VGPRBlocks: 62
|
||||
define amdgpu_kernel void @use1024vgprs_codegen(ptr %p) #1281 {
|
||||
%r0 = load volatile <1024 x float>, ptr %p, align 1
|
||||
store volatile <1024 x float> %r0, ptr %p
|
||||
|
||||
@ -46,7 +46,9 @@ define amdgpu_kernel void @__omp_offloading_16_dd2df_main_l9() {
|
||||
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
||||
; CHECK-NEXT: v_and_b32_e64 v0, s4, v0
|
||||
; CHECK-NEXT: s_mov_b32 s4, 0
|
||||
; CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], v0, s4
|
||||
; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v0, s4
|
||||
; CHECK-NEXT: s_mov_b64 s[6:7], -1
|
||||
; CHECK-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7]
|
||||
; CHECK-NEXT: s_and_b64 vcc, exec, s[4:5]
|
||||
; CHECK-NEXT: s_cbranch_vccnz .LBB0_4
|
||||
; CHECK-NEXT: ; %bb.3: ; %bb201
|
||||
|
||||
@ -23,6 +23,8 @@ define protected amdgpu_kernel void @kern(ptr %addr) !llvm.amdgcn.lds.kernel.id
|
||||
; CHECK-NEXT: v_readlane_b32 s14, v40, 0
|
||||
; CHECK-NEXT: s_mov_b64 s[16:17], s[8:9]
|
||||
; CHECK-NEXT: s_load_dwordx2 s[8:9], s[16:17], 0x0
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CHECK-NEXT: s_load_dwordx2 s[8:9], s[16:17], 0x0
|
||||
; CHECK-NEXT: v_mov_b32_e32 v5, 42
|
||||
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CHECK-NEXT: v_mov_b32_e32 v3, s8
|
||||
|
||||
@ -7,6 +7,8 @@ define amdgpu_kernel void @single_atomic_rmw(ptr addrspace(1) %ptr) {
|
||||
; GFX1250: ; %bb.0:
|
||||
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v1, 1
|
||||
; GFX1250-NEXT: s_wait_loadcnt 0x0
|
||||
@ -32,6 +34,8 @@ define amdgpu_kernel void @atomic_rmw_back_to_back(ptr addrspace(1) %ptr) {
|
||||
; GFX1250: ; %bb.0:
|
||||
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v1, 1
|
||||
; GFX1250-NEXT: s_wait_loadcnt 0x0
|
||||
@ -80,7 +84,13 @@ define amdgpu_kernel void @atomic_rmw_with_alu(ptr addrspace(1) %ptr, i32 %a, i3
|
||||
; GFX1250-LABEL: atomic_rmw_with_alu:
|
||||
; GFX1250: ; %bb.0:
|
||||
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0xc nv
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0xc nv
|
||||
@ -135,7 +145,11 @@ define amdgpu_kernel void @atomic_rmw_broken_by_global_load(ptr addrspace(1) %pt
|
||||
; GFX1250-LABEL: atomic_rmw_broken_by_global_load:
|
||||
; GFX1250: ; %bb.0:
|
||||
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v1, 1
|
||||
@ -187,7 +201,11 @@ define amdgpu_kernel void @atomic_rmw_broken_by_global_store(ptr addrspace(1) %p
|
||||
; GFX1250-LABEL: atomic_rmw_broken_by_global_store:
|
||||
; GFX1250: ; %bb.0:
|
||||
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v1, 1
|
||||
@ -242,7 +260,11 @@ define amdgpu_kernel void @atomic_rmw_broken_by_flat_load(ptr addrspace(1) %ptr,
|
||||
; GFX1250-LABEL: atomic_rmw_broken_by_flat_load:
|
||||
; GFX1250: ; %bb.0:
|
||||
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v1, 1
|
||||
@ -293,7 +315,11 @@ define amdgpu_kernel void @atomic_rmw_broken_by_flat_store(ptr addrspace(1) %ptr
|
||||
; GFX1250-LABEL: atomic_rmw_broken_by_flat_store:
|
||||
; GFX1250: ; %bb.0:
|
||||
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v1, 1
|
||||
@ -347,7 +373,11 @@ define amdgpu_kernel void @atomic_rmw_broken_by_smem_load(ptr addrspace(1) %ptr,
|
||||
; GFX1250-LABEL: atomic_rmw_broken_by_smem_load:
|
||||
; GFX1250: ; %bb.0:
|
||||
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v1, 1
|
||||
@ -401,7 +431,11 @@ define amdgpu_kernel void @atomic_rmw_broken_by_atomic_store(ptr addrspace(1) %p
|
||||
; GFX1250-LABEL: atomic_rmw_broken_by_atomic_store:
|
||||
; GFX1250: ; %bb.0:
|
||||
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v1, 1
|
||||
@ -462,7 +496,11 @@ define amdgpu_kernel void @atomic_rmw_with_lds_load(ptr addrspace(1) %ptr, ptr a
|
||||
; GFX1250-LABEL: atomic_rmw_with_lds_load:
|
||||
; GFX1250: ; %bb.0:
|
||||
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v1, 1
|
||||
@ -514,7 +552,11 @@ define amdgpu_kernel void @atomic_rmw_with_lds_store(ptr addrspace(1) %ptr, ptr
|
||||
; GFX1250-LABEL: atomic_rmw_with_lds_store:
|
||||
; GFX1250: ; %bb.0:
|
||||
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v1, 1
|
||||
@ -568,7 +610,11 @@ define amdgpu_kernel void @atomic_rmw_with_flat_lds_load(ptr addrspace(1) %ptr,
|
||||
; GFX1250-LABEL: atomic_rmw_with_flat_lds_load:
|
||||
; GFX1250: ; %bb.0:
|
||||
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v1, 1
|
||||
@ -593,14 +639,17 @@ define amdgpu_kernel void @atomic_rmw_with_flat_lds_load(ptr addrspace(1) %ptr,
|
||||
; GFX1250-NEXT: s_wait_storecnt 0x0
|
||||
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
|
||||
; GFX1250-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX1250-NEXT: s_mov_b32 s3, -1
|
||||
; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
|
||||
; GFX1250-NEXT: s_cselect_b32 s5, -1, 0
|
||||
; GFX1250-NEXT: s_mov_b64 s[6:7], 0
|
||||
; GFX1250-NEXT: s_mov_b32 s4, s7
|
||||
; GFX1250-NEXT: s_mov_b64 s[8:9], src_shared_base
|
||||
; GFX1250-NEXT: s_mov_b32 s3, s9
|
||||
; GFX1250-NEXT: s_mov_b32 s5, -1
|
||||
; GFX1250-NEXT: s_cmp_lg_u32 s2, s5
|
||||
; GFX1250-NEXT: s_and_b32 s8, s5, exec_lo
|
||||
; GFX1250-NEXT: s_cselect_b32 s4, s3, s4
|
||||
; GFX1250-NEXT: s_mov_b32 s3, s6
|
||||
; GFX1250-NEXT: s_and_b32 s5, s5, exec_lo
|
||||
; GFX1250-NEXT: s_cselect_b32 s2, s2, s3
|
||||
; GFX1250-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
|
||||
; GFX1250-NEXT: s_mov_b32 s3, s4
|
||||
@ -631,7 +680,11 @@ define amdgpu_kernel void @atomic_rmw_with_flat_lds_store(ptr addrspace(1) %ptr,
|
||||
; GFX1250-LABEL: atomic_rmw_with_flat_lds_store:
|
||||
; GFX1250: ; %bb.0:
|
||||
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_load_b32 s2, s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v1, 1
|
||||
@ -656,14 +709,17 @@ define amdgpu_kernel void @atomic_rmw_with_flat_lds_store(ptr addrspace(1) %ptr,
|
||||
; GFX1250-NEXT: s_wait_storecnt 0x0
|
||||
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
|
||||
; GFX1250-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX1250-NEXT: s_mov_b32 s3, -1
|
||||
; GFX1250-NEXT: s_cmp_lg_u32 s2, s3
|
||||
; GFX1250-NEXT: s_cselect_b32 s5, -1, 0
|
||||
; GFX1250-NEXT: s_mov_b64 s[6:7], 0
|
||||
; GFX1250-NEXT: s_mov_b32 s4, s7
|
||||
; GFX1250-NEXT: s_mov_b64 s[8:9], src_shared_base
|
||||
; GFX1250-NEXT: s_mov_b32 s3, s9
|
||||
; GFX1250-NEXT: s_mov_b32 s5, -1
|
||||
; GFX1250-NEXT: s_cmp_lg_u32 s2, s5
|
||||
; GFX1250-NEXT: s_and_b32 s8, s5, exec_lo
|
||||
; GFX1250-NEXT: s_cselect_b32 s4, s3, s4
|
||||
; GFX1250-NEXT: s_mov_b32 s3, s6
|
||||
; GFX1250-NEXT: s_and_b32 s5, s5, exec_lo
|
||||
; GFX1250-NEXT: s_cselect_b32 s2, s2, s3
|
||||
; GFX1250-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
|
||||
; GFX1250-NEXT: s_mov_b32 s3, s4
|
||||
@ -698,7 +754,13 @@ define amdgpu_kernel void @atomic_rmw_borken_by_async_lds_copy(ptr addrspace(1)
|
||||
; GFX1250: ; %bb.0:
|
||||
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GFX1250-NEXT: s_mov_b64 s[2:3], s[4:5]
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[2:3], 0x8 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b32 s0, s[2:3], 0x10 nv
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 nv
|
||||
; GFX1250-NEXT: s_load_b64 s[4:5], s[2:3], 0x8 nv
|
||||
; GFX1250-NEXT: s_wait_xcnt 0x0
|
||||
@ -758,7 +820,11 @@ define amdgpu_kernel void @multiple_atomic_rmw_blocks(ptr addrspace(1) %ptr1, pt
|
||||
; GFX1250-LABEL: multiple_atomic_rmw_blocks:
|
||||
; GFX1250: ; %bb.0:
|
||||
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v1, 1
|
||||
@ -823,6 +889,8 @@ define amdgpu_kernel void @different_atomic_ops(ptr addrspace(1) %ptr) {
|
||||
; GFX1250: ; %bb.0:
|
||||
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX1250-NEXT: v_mov_b32_e32 v1, 1
|
||||
; GFX1250-NEXT: s_wait_loadcnt 0x0
|
||||
@ -883,9 +951,13 @@ define amdgpu_kernel void @atomic_rmw_across_basic_blocks(ptr addrspace(1) %ptr,
|
||||
; GFX1250-LABEL: atomic_rmw_across_basic_blocks:
|
||||
; GFX1250: ; %bb.0: ; %entry
|
||||
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: s_wait_xcnt 0x0
|
||||
; GFX1250-NEXT: s_mov_b64 s[4:5], s[2:3]
|
||||
; GFX1250-NEXT: ; implicit-def: $vgpr2 : SGPR spill to VGPR lane
|
||||
; GFX1250-NEXT: v_writelane_b32 v2, s4, 0
|
||||
@ -917,8 +989,13 @@ define amdgpu_kernel void @atomic_rmw_across_basic_blocks(ptr addrspace(1) %ptr,
|
||||
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
|
||||
; GFX1250-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX1250-NEXT: s_mov_b32 s1, 0
|
||||
; GFX1250-NEXT: s_cmp_lg_u32 s0, s1
|
||||
; GFX1250-NEXT: s_cbranch_scc1 .LBB16_2
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_cmp_eq_u32 s0, s1
|
||||
; GFX1250-NEXT: s_cselect_b32 s0, -1, 0
|
||||
; GFX1250-NEXT: s_mov_b32 s1, -1
|
||||
; GFX1250-NEXT: s_xor_b32 s0, s0, s1
|
||||
; GFX1250-NEXT: s_and_b32 vcc_lo, exec_lo, s0
|
||||
; GFX1250-NEXT: s_cbranch_vccnz .LBB16_2
|
||||
; GFX1250-NEXT: ; %bb.1: ; %then
|
||||
; GFX1250-NEXT: s_or_saveexec_b32 s6, -1
|
||||
; GFX1250-NEXT: scratch_load_b32 v2, off, off nv ; 4-byte Folded Reload
|
||||
@ -963,6 +1040,9 @@ define amdgpu_kernel void @atomic_rmw_in_loop(ptr addrspace(1) %ptr, i32 %n) {
|
||||
; GFX1250-LABEL: atomic_rmw_in_loop:
|
||||
; GFX1250: ; %bb.0: ; %entry
|
||||
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: ; implicit-def: $vgpr2 : SGPR spill to VGPR lane
|
||||
@ -1011,14 +1091,17 @@ define amdgpu_kernel void @atomic_rmw_in_loop(ptr addrspace(1) %ptr, i32 %n) {
|
||||
; GFX1250-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX1250-NEXT: s_mov_b32 s2, 1
|
||||
; GFX1250-NEXT: s_add_co_i32 s0, s0, s2
|
||||
; GFX1250-NEXT: s_cmp_lt_u32 s0, s1
|
||||
; GFX1250-NEXT: s_cmp_ge_u32 s0, s1
|
||||
; GFX1250-NEXT: s_cselect_b32 s1, -1, 0
|
||||
; GFX1250-NEXT: s_mov_b32 s2, -1
|
||||
; GFX1250-NEXT: s_xor_b32 s1, s1, s2
|
||||
; GFX1250-NEXT: s_and_b32 vcc_lo, exec_lo, s1
|
||||
; GFX1250-NEXT: v_writelane_b32 v2, s0, 3
|
||||
; GFX1250-NEXT: s_mov_b32 s6, exec_lo
|
||||
; GFX1250-NEXT: s_mov_b32 exec_lo, -1
|
||||
; GFX1250-NEXT: s_or_saveexec_b32 s6, -1
|
||||
; GFX1250-NEXT: scratch_store_b32 off, v2, off nv ; 4-byte Folded Spill
|
||||
; GFX1250-NEXT: s_wait_xcnt 0x0
|
||||
; GFX1250-NEXT: s_mov_b32 exec_lo, s6
|
||||
; GFX1250-NEXT: s_cbranch_scc1 .LBB17_1
|
||||
; GFX1250-NEXT: s_cbranch_vccnz .LBB17_1
|
||||
; GFX1250-NEXT: ; %bb.2: ; %exit
|
||||
; GFX1250-NEXT: s_endpgm
|
||||
entry:
|
||||
@ -1043,6 +1126,9 @@ define amdgpu_kernel void @atomic_rmw_with_branch(ptr addrspace(1) %ptr, i32 %co
|
||||
; GFX1250-LABEL: atomic_rmw_with_branch:
|
||||
; GFX1250: ; %bb.0: ; %entry
|
||||
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b32 s0, s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_load_b32 s1, s[4:5], 0x8 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
@ -1089,9 +1175,8 @@ define amdgpu_kernel void @atomic_rmw_with_branch(ptr addrspace(1) %ptr, i32 %co
|
||||
; GFX1250-NEXT: s_mov_b32 exec_lo, s6
|
||||
; GFX1250-NEXT: s_wait_loadcnt 0x0
|
||||
; GFX1250-NEXT: v_readlane_b32 s0, v2, 2
|
||||
; GFX1250-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
|
||||
; GFX1250-NEXT: s_mov_b32 s0, 1
|
||||
; GFX1250-NEXT: v_cmp_ne_u32_e64 s0, v0, s0
|
||||
; GFX1250-NEXT: s_mov_b32 s1, -1
|
||||
; GFX1250-NEXT: s_xor_b32 s0, s0, s1
|
||||
; GFX1250-NEXT: s_and_b32 vcc_lo, exec_lo, s0
|
||||
; GFX1250-NEXT: s_cbranch_vccnz .LBB18_4
|
||||
; GFX1250-NEXT: ; %bb.2: ; %bb1
|
||||
@ -1194,6 +1279,8 @@ define amdgpu_kernel void @atomic_rmw_fallthrough(ptr addrspace(1) %ptr) {
|
||||
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 nv
|
||||
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
||||
; GFX1250-NEXT: s_mov_b64 s[2:3], s[0:1]
|
||||
; GFX1250-NEXT: ; implicit-def: $vgpr2 : SGPR spill to VGPR lane
|
||||
; GFX1250-NEXT: v_writelane_b32 v2, s2, 0
|
||||
|
||||
@ -53,7 +53,7 @@ define amdgpu_gfx void @strict_wwm_no_cfg(ptr addrspace(8) inreg %tmp14) {
|
||||
; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41]
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0
|
||||
; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[40:41], v3, v4
|
||||
; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[40:41]
|
||||
; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[40:41]
|
||||
; GFX9-O0-NEXT: s_mov_b32 s35, 1
|
||||
; GFX9-O0-NEXT: v_lshlrev_b32_e64 v3, s35, v3
|
||||
; GFX9-O0-NEXT: s_mov_b32 s35, 2
|
||||
@ -220,7 +220,7 @@ define amdgpu_gfx void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg)
|
||||
; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
|
||||
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[36:37], v0, v3
|
||||
; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[36:37]
|
||||
; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[36:37]
|
||||
; GFX9-O0-NEXT: s_mov_b32 s36, 1
|
||||
; GFX9-O0-NEXT: v_lshlrev_b32_e64 v0, s36, v0
|
||||
; GFX9-O0-NEXT: s_mov_b32 s36, 2
|
||||
@ -533,9 +533,9 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
|
||||
; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
|
||||
; GFX9-O0-NEXT: s_nop 0
|
||||
; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
|
||||
; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
|
||||
; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
|
||||
; GFX9-O0-NEXT: s_nop 0
|
||||
; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
|
||||
; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
|
||||
; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
|
||||
; GFX9-O0-NEXT: s_nop 0
|
||||
; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
|
||||
@ -590,9 +590,9 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
|
||||
; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v0, s[34:35]
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v3
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8
|
||||
; GFX9-O0-NEXT: s_mov_b32 s34, 32
|
||||
; GFX9-O0-NEXT: v_lshrrev_b64 v[4:5], s34, v[8:9]
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v2, v4
|
||||
; GFX9-O0-NEXT: v_lshrrev_b64 v[3:4], s34, v[8:9]
|
||||
; GFX9-O0-NEXT: s_getpc_b64 s[34:35]
|
||||
; GFX9-O0-NEXT: s_add_u32 s34, s34, strict_wwm_called_i64@gotpcrel32@lo+4
|
||||
; GFX9-O0-NEXT: s_addc_u32 s35, s35, strict_wwm_called_i64@gotpcrel32@hi+12
|
||||
@ -601,8 +601,8 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
|
||||
; GFX9-O0-NEXT: s_mov_b64 s[36:37], s[0:1]
|
||||
; GFX9-O0-NEXT: s_mov_b64 s[0:1], s[36:37]
|
||||
; GFX9-O0-NEXT: s_mov_b64 s[2:3], s[38:39]
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
|
||||
; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[34:35]
|
||||
; GFX9-O0-NEXT: v_readlane_b32 s34, v11, 4
|
||||
@ -630,8 +630,8 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
|
||||
; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
|
||||
; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
|
||||
; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
|
||||
; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
|
||||
; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
|
||||
; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
|
||||
; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
|
||||
; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
|
||||
; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
|
||||
; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
|
||||
@ -731,8 +731,11 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
|
||||
; GFX9-O0-NEXT: buffer_load_dwordx4 v[11:14], v0, s[36:39], s34 offen
|
||||
; GFX9-O0-NEXT: buffer_load_dwordx2 v[4:5], v0, s[36:39], s34 offen offset:16
|
||||
; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v12
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v12
|
||||
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v8
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7
|
||||
; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1
|
||||
; GFX9-O0-NEXT: s_mov_b32 s35, 0x7fffffff
|
||||
; GFX9-O0-NEXT: s_mov_b32 s44, -1
|
||||
@ -741,7 +744,10 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
|
||||
; GFX9-O0-NEXT: s_mov_b32 s42, s45
|
||||
; GFX9-O0-NEXT: ; implicit-def: $sgpr46_sgpr47
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s42
|
||||
; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[40:41]
|
||||
; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[40:41]
|
||||
; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41]
|
||||
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
|
||||
; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1
|
||||
; GFX9-O0-NEXT: s_mov_b32 s35, s44
|
||||
; GFX9-O0-NEXT: ; implicit-def: $sgpr44_sgpr45
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s35
|
||||
@ -750,12 +756,18 @@ define amdgpu_gfx void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %in
|
||||
; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41]
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v3
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v14
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v13
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14
|
||||
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v8
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7
|
||||
; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1
|
||||
; GFX9-O0-NEXT: ; implicit-def: $sgpr44_sgpr45
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s42
|
||||
; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[40:41]
|
||||
; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[40:41]
|
||||
; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41]
|
||||
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
|
||||
; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1
|
||||
; GFX9-O0-NEXT: ; implicit-def: $sgpr44_sgpr45
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s35
|
||||
; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[40:41]
|
||||
@ -1056,20 +1068,28 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt
|
||||
; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:4
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v10
|
||||
; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
|
||||
; GFX9-O0-NEXT: s_mov_b32 s34, 8
|
||||
; GFX9-O0-NEXT: v_add_u32_e64 v10, v1, s34
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v9
|
||||
; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:12
|
||||
; GFX9-O0-NEXT: buffer_store_dword v0, v10, s[0:3], 0 offen offset:4
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v8
|
||||
; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:8
|
||||
; GFX9-O0-NEXT: s_mov_b32 s34, 16
|
||||
; GFX9-O0-NEXT: v_add_u32_e64 v8, v1, s34
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7
|
||||
; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:20
|
||||
; GFX9-O0-NEXT: buffer_store_dword v0, v8, s[0:3], 0 offen offset:4
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6
|
||||
; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
|
||||
; GFX9-O0-NEXT: s_mov_b32 s34, 24
|
||||
; GFX9-O0-NEXT: v_add_u32_e64 v6, v1, s34
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v5
|
||||
; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:28
|
||||
; GFX9-O0-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen offset:4
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4
|
||||
; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:24
|
||||
; GFX9-O0-NEXT: s_mov_b32 s34, 32
|
||||
; GFX9-O0-NEXT: v_add_u32_e64 v4, v1, s34
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v3
|
||||
; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:36
|
||||
; GFX9-O0-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen offset:4
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:32
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4
|
||||
|
||||
@ -49,7 +49,7 @@ define amdgpu_cs void @no_cfg(ptr addrspace(8) inreg %tmp14) {
|
||||
; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0
|
||||
; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v3, v4
|
||||
; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[6:7]
|
||||
; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[6:7]
|
||||
; GFX9-O0-NEXT: s_mov_b32 s5, 1
|
||||
; GFX9-O0-NEXT: v_lshlrev_b32_e64 v3, s5, v3
|
||||
; GFX9-O0-NEXT: s_mov_b32 s5, 2
|
||||
@ -197,7 +197,7 @@ define amdgpu_cs void @cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
|
||||
; GFX9-O0-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload
|
||||
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], v0, v3
|
||||
; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
|
||||
; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
|
||||
; GFX9-O0-NEXT: s_mov_b32 s4, 1
|
||||
; GFX9-O0-NEXT: v_lshlrev_b32_e64 v0, s4, v0
|
||||
; GFX9-O0-NEXT: s_mov_b32 s4, 2
|
||||
@ -330,12 +330,20 @@ define amdgpu_kernel void @call(ptr addrspace(8) %tmp14, i32 %arg) {
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v0
|
||||
; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3]
|
||||
; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24
|
||||
; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c
|
||||
; GFX9-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
|
||||
; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-O0-NEXT: s_load_dword s2, s[0:1], 0x34
|
||||
; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-O0-NEXT: s_mov_b32 s2, 36
|
||||
; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], s2 offset:0x8
|
||||
; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-O0-NEXT: s_mov_b32 s3, s9
|
||||
; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
|
||||
; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24
|
||||
; GFX9-O0-NEXT: s_load_dword s2, s[0:1], 0x34
|
||||
; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-O0-NEXT: s_mov_b32 s9, s17
|
||||
; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17
|
||||
; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19
|
||||
@ -555,12 +563,20 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) %tmp14, i64 %arg) {
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0
|
||||
; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3]
|
||||
; GFX9-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
|
||||
; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
|
||||
; GFX9-O0-NEXT: s_mov_b32 s8, 36
|
||||
; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], s8 offset:0x8
|
||||
; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-O0-NEXT: s_mov_b32 s8, s3
|
||||
; GFX9-O0-NEXT: s_mov_b32 s9, s2
|
||||
; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24
|
||||
; GFX9-O0-NEXT: s_load_dwordx2 s[18:19], s[0:1], 0x2c
|
||||
; GFX9-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
|
||||
; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-O0-NEXT: s_mov_b32 s8, s19
|
||||
; GFX9-O0-NEXT: s_mov_b32 s9, s18
|
||||
; GFX9-O0-NEXT: s_mov_b32 s15, s17
|
||||
; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17
|
||||
; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19
|
||||
@ -591,9 +607,10 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) %tmp14, i64 %arg) {
|
||||
; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v0, s[2:3]
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v6
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v9
|
||||
; GFX9-O0-NEXT: s_mov_b32 s2, 32
|
||||
; GFX9-O0-NEXT: v_lshrrev_b64 v[11:12], s2, v[9:10]
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v11
|
||||
; GFX9-O0-NEXT: s_mov_b64 s[8:9], 60
|
||||
; GFX9-O0-NEXT: s_mov_b32 s2, s0
|
||||
; GFX9-O0-NEXT: s_mov_b32 s0, s1
|
||||
@ -616,8 +633,8 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) %tmp14, i64 %arg) {
|
||||
; GFX9-O0-NEXT: v_or3_b32 v3, v5, v4, v3
|
||||
; GFX9-O0-NEXT: ; implicit-def: $sgpr15
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v31, v3
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7
|
||||
; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[16:17]
|
||||
; GFX9-O0-NEXT: v_readlane_b32 s0, v8, 4
|
||||
@ -723,8 +740,11 @@ define amdgpu_cs void @_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %index) {
|
||||
; GFX9-O0-NEXT: buffer_load_dwordx4 v[11:14], v0, s[0:3], s4 offen
|
||||
; GFX9-O0-NEXT: buffer_load_dwordx2 v[4:5], v0, s[0:3], s4 offen offset:16
|
||||
; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v12
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v12
|
||||
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v8
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7
|
||||
; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
|
||||
; GFX9-O0-NEXT: s_mov_b32 s5, 0x7fffffff
|
||||
; GFX9-O0-NEXT: s_mov_b32 s10, -1
|
||||
@ -733,7 +753,10 @@ define amdgpu_cs void @_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %index) {
|
||||
; GFX9-O0-NEXT: s_mov_b32 s8, s11
|
||||
; GFX9-O0-NEXT: ; implicit-def: $sgpr12_sgpr13
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8
|
||||
; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[6:7]
|
||||
; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[6:7]
|
||||
; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
|
||||
; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
|
||||
; GFX9-O0-NEXT: s_mov_b32 s5, s10
|
||||
; GFX9-O0-NEXT: ; implicit-def: $sgpr10_sgpr11
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5
|
||||
@ -742,12 +765,18 @@ define amdgpu_cs void @_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %index) {
|
||||
; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v3
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v14
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v13
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14
|
||||
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v8
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7
|
||||
; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
|
||||
; GFX9-O0-NEXT: ; implicit-def: $sgpr10_sgpr11
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8
|
||||
; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[6:7]
|
||||
; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[6:7]
|
||||
; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
|
||||
; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
|
||||
; GFX9-O0-NEXT: ; implicit-def: $sgpr10_sgpr11
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5
|
||||
; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[6:7]
|
||||
@ -879,7 +908,7 @@ define amdgpu_cs void @strict_wwm_no_cfg(ptr addrspace(8) inreg %tmp14) {
|
||||
; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0
|
||||
; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v3, v4
|
||||
; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[6:7]
|
||||
; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[6:7]
|
||||
; GFX9-O0-NEXT: s_mov_b32 s5, 1
|
||||
; GFX9-O0-NEXT: v_lshlrev_b32_e64 v3, s5, v3
|
||||
; GFX9-O0-NEXT: s_mov_b32 s5, 2
|
||||
@ -1027,7 +1056,7 @@ define amdgpu_cs void @strict_wwm_cfg(ptr addrspace(8) inreg %tmp14, i32 %arg) {
|
||||
; GFX9-O0-NEXT: buffer_load_dword v3, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload
|
||||
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[4:5], v0, v3
|
||||
; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
|
||||
; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
|
||||
; GFX9-O0-NEXT: s_mov_b32 s4, 1
|
||||
; GFX9-O0-NEXT: v_lshlrev_b32_e64 v0, s4, v0
|
||||
; GFX9-O0-NEXT: s_mov_b32 s4, 2
|
||||
@ -1160,12 +1189,20 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) %tmp14, i32 %arg) {
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v0
|
||||
; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3]
|
||||
; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24
|
||||
; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c
|
||||
; GFX9-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
|
||||
; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-O0-NEXT: s_load_dword s2, s[0:1], 0x34
|
||||
; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-O0-NEXT: s_mov_b32 s2, 36
|
||||
; GFX9-O0-NEXT: s_load_dwordx2 s[8:9], s[0:1], s2 offset:0x8
|
||||
; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-O0-NEXT: s_mov_b32 s3, s9
|
||||
; GFX9-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
|
||||
; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24
|
||||
; GFX9-O0-NEXT: s_load_dword s2, s[0:1], 0x34
|
||||
; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-O0-NEXT: s_mov_b32 s9, s17
|
||||
; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17
|
||||
; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19
|
||||
@ -1385,12 +1422,20 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) %tmp14, i64 %arg
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0
|
||||
; GFX9-O0-NEXT: s_mov_b64 exec, s[2:3]
|
||||
; GFX9-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
|
||||
; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
|
||||
; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
|
||||
; GFX9-O0-NEXT: s_mov_b32 s8, 36
|
||||
; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], s8 offset:0x8
|
||||
; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-O0-NEXT: s_mov_b32 s8, s3
|
||||
; GFX9-O0-NEXT: s_mov_b32 s9, s2
|
||||
; GFX9-O0-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24
|
||||
; GFX9-O0-NEXT: s_load_dwordx2 s[18:19], s[0:1], 0x2c
|
||||
; GFX9-O0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
|
||||
; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-O0-NEXT: s_mov_b32 s8, s19
|
||||
; GFX9-O0-NEXT: s_mov_b32 s9, s18
|
||||
; GFX9-O0-NEXT: s_mov_b32 s15, s17
|
||||
; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17
|
||||
; GFX9-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17_sgpr18_sgpr19
|
||||
@ -1421,9 +1466,10 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) %tmp14, i64 %arg
|
||||
; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v0, s[2:3]
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v6
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v9
|
||||
; GFX9-O0-NEXT: s_mov_b32 s2, 32
|
||||
; GFX9-O0-NEXT: v_lshrrev_b64 v[11:12], s2, v[9:10]
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v11
|
||||
; GFX9-O0-NEXT: s_mov_b64 s[8:9], 60
|
||||
; GFX9-O0-NEXT: s_mov_b32 s2, s0
|
||||
; GFX9-O0-NEXT: s_mov_b32 s0, s1
|
||||
@ -1446,8 +1492,8 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) %tmp14, i64 %arg
|
||||
; GFX9-O0-NEXT: v_or3_b32 v3, v5, v4, v3
|
||||
; GFX9-O0-NEXT: ; implicit-def: $sgpr15
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v31, v3
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7
|
||||
; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[16:17]
|
||||
; GFX9-O0-NEXT: v_readlane_b32 s0, v8, 4
|
||||
@ -1553,8 +1599,11 @@ define amdgpu_cs void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %ind
|
||||
; GFX9-O0-NEXT: buffer_load_dwordx4 v[11:14], v0, s[0:3], s4 offen
|
||||
; GFX9-O0-NEXT: buffer_load_dwordx2 v[4:5], v0, s[0:3], s4 offen offset:16
|
||||
; GFX9-O0-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v12
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v12
|
||||
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v8
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7
|
||||
; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
|
||||
; GFX9-O0-NEXT: s_mov_b32 s5, 0x7fffffff
|
||||
; GFX9-O0-NEXT: s_mov_b32 s10, -1
|
||||
@ -1563,7 +1612,10 @@ define amdgpu_cs void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %ind
|
||||
; GFX9-O0-NEXT: s_mov_b32 s8, s11
|
||||
; GFX9-O0-NEXT: ; implicit-def: $sgpr12_sgpr13
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8
|
||||
; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[6:7]
|
||||
; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[6:7]
|
||||
; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
|
||||
; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
|
||||
; GFX9-O0-NEXT: s_mov_b32 s5, s10
|
||||
; GFX9-O0-NEXT: ; implicit-def: $sgpr10_sgpr11
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5
|
||||
@ -1572,12 +1624,18 @@ define amdgpu_cs void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %ind
|
||||
; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v10, v3
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v14
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v6, v13
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14
|
||||
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v7, v8
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7
|
||||
; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
|
||||
; GFX9-O0-NEXT: ; implicit-def: $sgpr10_sgpr11
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8
|
||||
; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[6:7]
|
||||
; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[6:7]
|
||||
; GFX9-O0-NEXT: s_mov_b64 exec, s[6:7]
|
||||
; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
|
||||
; GFX9-O0-NEXT: s_or_saveexec_b64 s[6:7], -1
|
||||
; GFX9-O0-NEXT: ; implicit-def: $sgpr10_sgpr11
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s5
|
||||
; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[6:7]
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
; RUN: llc -O0 -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
|
||||
|
||||
; GCN-LABEL: {{^}}zext_grp_size_128:
|
||||
; GCN-NOT: and_b32
|
||||
; O2-NOT: and_b32
|
||||
define amdgpu_kernel void @zext_grp_size_128(ptr addrspace(1) nocapture %arg) #0 {
|
||||
bb:
|
||||
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
@ -20,7 +20,7 @@ bb:
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}zext_grp_size_32x4x1:
|
||||
; GCN-NOT: and_b32
|
||||
; O2-NOT: and_b32
|
||||
define amdgpu_kernel void @zext_grp_size_32x4x1(ptr addrspace(1) nocapture %arg) #0 !reqd_work_group_size !0 {
|
||||
bb:
|
||||
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
@ -38,7 +38,7 @@ bb:
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}zext_grp_size_1x1x1:
|
||||
; GCN-NOT: and_b32
|
||||
; O2-NOT: and_b32
|
||||
|
||||
; When EarlyCSE is not run this call produces a range max with 0 active bits,
|
||||
; which is a special case as an AssertZext from width 0 is invalid.
|
||||
@ -50,7 +50,7 @@ define amdgpu_kernel void @zext_grp_size_1x1x1(ptr addrspace(1) nocapture %arg)
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}zext_grp_size_512:
|
||||
; GCN-NOT: and_b32
|
||||
; O2-NOT: and_b32
|
||||
define amdgpu_kernel void @zext_grp_size_512(ptr addrspace(1) nocapture %arg) #1 {
|
||||
bb:
|
||||
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user