llvm-project/llvm/test/CodeGen/AMDGPU/minimummaximum.ll
Brox Chen 9d7e1d92db
[AMDGPU][True16] added Pre-RA hint to improve copy elimination (#103366)
The allocation order of 16-bit registers is vgpr0lo16, vgpr0hi16,
vgpr1lo16, vgpr1hi16, vgpr2lo16, and so on. We prefer (essentially
require) that allocation order because it uses the minimum number of
registers. But when 16-bit data passes between 16-bit and 32-bit
instructions, you get lots of COPYs.

This patch teaches the compiler that a COPY of a 16-bit value from a
32-bit register to a lo-half 16-bit register is free, whereas a COPY to a
hi-half 16-bit register is not.

This might later be improved by coalescing with additional cases, perhaps
as an alternative to the RA hints. For now, this solution is upstreamed
first.
2025-03-12 16:12:58 -04:00

208 lines
8.5 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,SDAG,SDAG-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,SDAG,SDAG-FAKE16 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GISEL,GISEL-TRUE16 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GISEL,GISEL-FAKE16 %s
; Computes minimum(maximum(%a, %b), %c) on plain (non-inreg) f32 arguments.
; The expected output is checked under the shared GFX12 prefix, i.e. it is the
; same for every configuration in the RUN lines: one fused
; v_maximumminimum_f32 instead of a separate maximum and minimum.
define amdgpu_ps float @test_minmax_f32(float %a, float %b, float %c) {
; GFX12-LABEL: test_minmax_f32:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_maximumminimum_f32 v0, v0, v1, v2
; GFX12-NEXT: ; return to shader part epilog
%max = call float @llvm.maximum.f32(float %a, float %b)
%minmax = call float @llvm.minimum.f32(float %max, float %c)
ret float %minmax
}
; Scalar variant of the f32 min-of-max pattern: all arguments are marked inreg
; and the expected code reads them from s0-s4. The result is stored through the
; addrspace(1) pointer %out rather than returned.
; The checks expect no fused instruction here: s_maximum_f32 followed by
; s_minimum_f32, then a copy of the result into a VGPR and a global_store_b32.
; SelectionDAG and GlobalISel have separate check blocks because the expected
; pointer register pair (s[4:5] vs s[6:7]) and the instruction order differ.
define amdgpu_ps void @s_test_minmax_f32(float inreg %a, float inreg %b, float inreg %c, ptr addrspace(1) inreg %out) {
; SDAG-LABEL: s_test_minmax_f32:
; SDAG: ; %bb.0:
; SDAG-NEXT: s_maximum_f32 s0, s0, s1
; SDAG-NEXT: s_mov_b32 s5, s4
; SDAG-NEXT: s_mov_b32 s4, s3
; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_3)
; SDAG-NEXT: s_minimum_f32 s0, s0, s2
; SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
; SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
; SDAG-NEXT: s_endpgm
;
; GISEL-LABEL: s_test_minmax_f32:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_maximum_f32 s0, s0, s1
; GISEL-NEXT: s_mov_b32 s6, s3
; GISEL-NEXT: s_mov_b32 s7, s4
; GISEL-NEXT: v_mov_b32_e32 v1, 0
; GISEL-NEXT: s_minimum_f32 s0, s0, s2
; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
; GISEL-NEXT: v_mov_b32_e32 v0, s0
; GISEL-NEXT: global_store_b32 v1, v0, s[6:7]
; GISEL-NEXT: s_endpgm
%smax = call float @llvm.maximum.f32(float %a, float %b)
%sminmax = call float @llvm.minimum.f32(float %smax, float %c)
store float %sminmax, ptr addrspace(1) %out
ret void
}
; Same as the non-commuted f32 min-of-max test, but the outer minimum takes its
; operands in swapped order: minimum(%c, maximum(%a, %b)).
; The expected output is unchanged: a single v_maximumminimum_f32 with the
; operands in the order v0, v1, v2.
define amdgpu_ps float @test_minmax_commuted_f32(float %a, float %b, float %c) {
; GFX12-LABEL: test_minmax_commuted_f32:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_maximumminimum_f32 v0, v0, v1, v2
; GFX12-NEXT: ; return to shader part epilog
%max = call float @llvm.maximum.f32(float %a, float %b)
%minmax = call float @llvm.minimum.f32(float %c, float %max)
ret float %minmax
}
; Computes maximum(minimum(%a, %b), %c) on plain (non-inreg) f32 arguments.
; The checks expect a single fused v_minimummaximum_f32 in every configuration
; (shared GFX12 prefix).
define amdgpu_ps float @test_maxmin_f32(float %a, float %b, float %c) {
; GFX12-LABEL: test_maxmin_f32:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_minimummaximum_f32 v0, v0, v1, v2
; GFX12-NEXT: ; return to shader part epilog
%min = call float @llvm.minimum.f32(float %a, float %b)
%maxmin = call float @llvm.maximum.f32(float %min, float %c)
ret float %maxmin
}
; Same as the non-commuted f32 max-of-min test, but the outer maximum takes its
; operands in swapped order: maximum(%c, minimum(%a, %b)).
; The expected output is unchanged: a single v_minimummaximum_f32 with the
; operands in the order v0, v1, v2.
define amdgpu_ps float @test_maxmin_commuted_f32(float %a, float %b, float %c) {
; GFX12-LABEL: test_maxmin_commuted_f32:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_minimummaximum_f32 v0, v0, v1, v2
; GFX12-NEXT: ; return to shader part epilog
%min = call float @llvm.minimum.f32(float %a, float %b)
%maxmin = call float @llvm.maximum.f32(float %c, float %min)
ret float %maxmin
}
; Half-precision version of the min-of-max pattern:
; minimum(maximum(%a, %b), %c) on plain (non-inreg) half arguments.
; Every configuration expects a single fused v_maximumminimum_f16. The four
; check blocks differ only in operand spelling: the +real-true16 runs name the
; low 16-bit halves (v0.l, v1.l, v2.l), the -real-true16 runs name the full
; registers (v0, v1, v2).
define amdgpu_ps half @test_minmax_f16(half %a, half %b, half %c) {
; SDAG-TRUE16-LABEL: test_minmax_f16:
; SDAG-TRUE16: ; %bb.0:
; SDAG-TRUE16-NEXT: v_maximumminimum_f16 v0.l, v0.l, v1.l, v2.l
; SDAG-TRUE16-NEXT: ; return to shader part epilog
;
; SDAG-FAKE16-LABEL: test_minmax_f16:
; SDAG-FAKE16: ; %bb.0:
; SDAG-FAKE16-NEXT: v_maximumminimum_f16 v0, v0, v1, v2
; SDAG-FAKE16-NEXT: ; return to shader part epilog
;
; GISEL-TRUE16-LABEL: test_minmax_f16:
; GISEL-TRUE16: ; %bb.0:
; GISEL-TRUE16-NEXT: v_maximumminimum_f16 v0.l, v0.l, v1.l, v2.l
; GISEL-TRUE16-NEXT: ; return to shader part epilog
;
; GISEL-FAKE16-LABEL: test_minmax_f16:
; GISEL-FAKE16: ; %bb.0:
; GISEL-FAKE16-NEXT: v_maximumminimum_f16 v0, v0, v1, v2
; GISEL-FAKE16-NEXT: ; return to shader part epilog
%max = call half @llvm.maximum.f16(half %a, half %b)
%minmax = call half @llvm.minimum.f16(half %max, half %c)
ret half %minmax
}
; Half-precision min-of-max with the outer minimum's operands swapped:
; minimum(%c, maximum(%a, %b)).
; The expected output matches the non-commuted f16 test: one fused
; v_maximumminimum_f16, spelled with .l halves in the +real-true16 runs and
; with full register names in the -real-true16 runs.
define amdgpu_ps half @test_minmax_commuted_f16(half %a, half %b, half %c) {
; SDAG-TRUE16-LABEL: test_minmax_commuted_f16:
; SDAG-TRUE16: ; %bb.0:
; SDAG-TRUE16-NEXT: v_maximumminimum_f16 v0.l, v0.l, v1.l, v2.l
; SDAG-TRUE16-NEXT: ; return to shader part epilog
;
; SDAG-FAKE16-LABEL: test_minmax_commuted_f16:
; SDAG-FAKE16: ; %bb.0:
; SDAG-FAKE16-NEXT: v_maximumminimum_f16 v0, v0, v1, v2
; SDAG-FAKE16-NEXT: ; return to shader part epilog
;
; GISEL-TRUE16-LABEL: test_minmax_commuted_f16:
; GISEL-TRUE16: ; %bb.0:
; GISEL-TRUE16-NEXT: v_maximumminimum_f16 v0.l, v0.l, v1.l, v2.l
; GISEL-TRUE16-NEXT: ; return to shader part epilog
;
; GISEL-FAKE16-LABEL: test_minmax_commuted_f16:
; GISEL-FAKE16: ; %bb.0:
; GISEL-FAKE16-NEXT: v_maximumminimum_f16 v0, v0, v1, v2
; GISEL-FAKE16-NEXT: ; return to shader part epilog
%max = call half @llvm.maximum.f16(half %a, half %b)
%minmax = call half @llvm.minimum.f16(half %c, half %max)
ret half %minmax
}
; Half-precision max-of-min with the outer maximum's operands swapped:
; maximum(%c, minimum(%a, %b)).
; Every configuration expects a single fused v_minimummaximum_f16, spelled with
; .l halves in the +real-true16 runs and with full register names in the
; -real-true16 runs.
; NOTE(review): unlike the f32 group, no non-commuted test_maxmin_f16 is visible
; in this file - consider adding one and regenerating the checks.
define amdgpu_ps half @test_maxmin_commuted_f16(half %a, half %b, half %c) {
; SDAG-TRUE16-LABEL: test_maxmin_commuted_f16:
; SDAG-TRUE16: ; %bb.0:
; SDAG-TRUE16-NEXT: v_minimummaximum_f16 v0.l, v0.l, v1.l, v2.l
; SDAG-TRUE16-NEXT: ; return to shader part epilog
;
; SDAG-FAKE16-LABEL: test_maxmin_commuted_f16:
; SDAG-FAKE16: ; %bb.0:
; SDAG-FAKE16-NEXT: v_minimummaximum_f16 v0, v0, v1, v2
; SDAG-FAKE16-NEXT: ; return to shader part epilog
;
; GISEL-TRUE16-LABEL: test_maxmin_commuted_f16:
; GISEL-TRUE16: ; %bb.0:
; GISEL-TRUE16-NEXT: v_minimummaximum_f16 v0.l, v0.l, v1.l, v2.l
; GISEL-TRUE16-NEXT: ; return to shader part epilog
;
; GISEL-FAKE16-LABEL: test_maxmin_commuted_f16:
; GISEL-FAKE16: ; %bb.0:
; GISEL-FAKE16-NEXT: v_minimummaximum_f16 v0, v0, v1, v2
; GISEL-FAKE16-NEXT: ; return to shader part epilog
%min = call half @llvm.minimum.f16(half %a, half %b)
%maxmin = call half @llvm.maximum.f16(half %c, half %min)
ret half %maxmin
}
; Scalar variant of the f16 min-of-max pattern: all arguments are marked inreg
; and the expected code reads them from s0-s4. The result is stored through the
; addrspace(1) pointer %out rather than returned.
; No fused instruction is expected: each configuration checks s_maximum_f16
; followed by s_minimum_f16, a copy of the result into a VGPR, and a
; global_store_b16.
; The four check blocks differ in how the result is copied (v_mov_b16_e32 into
; v0.l for the +real-true16 runs; v_mov_b32 or v_dual_mov_b32 for the
; -real-true16 runs), in the pointer register pair (s[4:5] for SelectionDAG,
; s[6:7] for GlobalISel), and in instruction order.
define amdgpu_ps void @s_test_minmax_f16(half inreg %a, half inreg %b, half inreg %c, ptr addrspace(1) inreg %out) {
; SDAG-TRUE16-LABEL: s_test_minmax_f16:
; SDAG-TRUE16: ; %bb.0:
; SDAG-TRUE16-NEXT: s_maximum_f16 s0, s0, s1
; SDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; SDAG-TRUE16-NEXT: s_mov_b32 s5, s4
; SDAG-TRUE16-NEXT: s_mov_b32 s4, s3
; SDAG-TRUE16-NEXT: s_minimum_f16 s0, s0, s2
; SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
; SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0
; SDAG-TRUE16-NEXT: global_store_b16 v1, v0, s[4:5]
; SDAG-TRUE16-NEXT: s_endpgm
;
; SDAG-FAKE16-LABEL: s_test_minmax_f16:
; SDAG-FAKE16: ; %bb.0:
; SDAG-FAKE16-NEXT: s_maximum_f16 s0, s0, s1
; SDAG-FAKE16-NEXT: s_mov_b32 s5, s4
; SDAG-FAKE16-NEXT: s_mov_b32 s4, s3
; SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_3)
; SDAG-FAKE16-NEXT: s_minimum_f16 s0, s0, s2
; SDAG-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
; SDAG-FAKE16-NEXT: global_store_b16 v0, v1, s[4:5]
; SDAG-FAKE16-NEXT: s_endpgm
;
; GISEL-TRUE16-LABEL: s_test_minmax_f16:
; GISEL-TRUE16: ; %bb.0:
; GISEL-TRUE16-NEXT: s_maximum_f16 s0, s0, s1
; GISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GISEL-TRUE16-NEXT: s_mov_b32 s6, s3
; GISEL-TRUE16-NEXT: s_mov_b32 s7, s4
; GISEL-TRUE16-NEXT: s_minimum_f16 s0, s0, s2
; GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
; GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0
; GISEL-TRUE16-NEXT: global_store_b16 v1, v0, s[6:7]
; GISEL-TRUE16-NEXT: s_endpgm
;
; GISEL-FAKE16-LABEL: s_test_minmax_f16:
; GISEL-FAKE16: ; %bb.0:
; GISEL-FAKE16-NEXT: s_maximum_f16 s0, s0, s1
; GISEL-FAKE16-NEXT: s_mov_b32 s6, s3
; GISEL-FAKE16-NEXT: s_mov_b32 s7, s4
; GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0
; GISEL-FAKE16-NEXT: s_minimum_f16 s0, s0, s2
; GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_3)
; GISEL-FAKE16-NEXT: v_mov_b32_e32 v0, s0
; GISEL-FAKE16-NEXT: global_store_b16 v1, v0, s[6:7]
; GISEL-FAKE16-NEXT: s_endpgm
%smax = call half @llvm.maximum.f16(half %a, half %b)
%sminmax = call half @llvm.minimum.f16(half %smax, half %c)
store half %sminmax, ptr addrspace(1) %out
ret void
}
; Declarations of the llvm.minimum / llvm.maximum intrinsics, in their f16 and
; f32 forms, that the test functions in this file call.
declare half @llvm.minimum.f16(half, half)
declare half @llvm.maximum.f16(half, half)
declare float @llvm.minimum.f32(float, float)
declare float @llvm.maximum.f32(float, float)