Alex MacLean 76c9bfefa4
[NVPTX] Remove Float register classes (#140487)
These classes are redundant, as the untyped "Int" classes can be used
for all float operations. This change is intended to be as minimal as
possible and leaves the many potential simplifications and refactors
this exposes as future work.
2025-05-21 11:33:57 -07:00

117 lines
4.1 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_30 -fp-contract=fast | FileCheck %s --check-prefixes=CHECK,FAST
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_30 | FileCheck %s --check-prefixes=CHECK,DEFAULT
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_30 -fp-contract=fast | %ptxas-verify %}
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_30 | %ptxas-verify %}
;; Module-level target triple. Each llc command line above also passes
;; -mtriple=nvptx64 explicitly.
target triple = "nvptx64-unknown-cuda"
;; Make sure we are generating proper instruction sequences for fused ops.
;; If fusion is allowed, we try to form fma.rn at the PTX level, and emit
;; add.f32 otherwise. Without an explicit rounding mode on add.f32, ptxas
;; is free to fuse with a multiply if it is able. If fusion is not allowed,
;; we do not form fma.rn at the PTX level and explicitly generate add.rn
;; for all adds to prevent ptxas from fusing the ops.
;; t0: returns (%a * %b) + %c, built from an fmul and an fadd that carry no
;; fast-math flags. The expected PTX differs between the two llc invocations:
;; with -fp-contract=fast a single fma.rn.f32 is expected, while the run
;; without that option expects a separate mul.rn.f32 followed by add.rn.f32.
define float @t0(float %a, float %b, float %c) {
; FAST-LABEL: t0(
; FAST: {
; FAST-NEXT: .reg .b32 %r<5>;
; FAST-EMPTY:
; FAST-NEXT: // %bb.0:
; FAST-NEXT: ld.param.b32 %r1, [t0_param_0];
; FAST-NEXT: ld.param.b32 %r2, [t0_param_1];
; FAST-NEXT: ld.param.b32 %r3, [t0_param_2];
; FAST-NEXT: fma.rn.f32 %r4, %r1, %r2, %r3;
; FAST-NEXT: st.param.b32 [func_retval0], %r4;
; FAST-NEXT: ret;
;
; DEFAULT-LABEL: t0(
; DEFAULT: {
; DEFAULT-NEXT: .reg .b32 %r<6>;
; DEFAULT-EMPTY:
; DEFAULT-NEXT: // %bb.0:
; DEFAULT-NEXT: ld.param.b32 %r1, [t0_param_0];
; DEFAULT-NEXT: ld.param.b32 %r2, [t0_param_1];
; DEFAULT-NEXT: mul.rn.f32 %r3, %r1, %r2;
; DEFAULT-NEXT: ld.param.b32 %r4, [t0_param_2];
; DEFAULT-NEXT: add.rn.f32 %r5, %r3, %r4;
; DEFAULT-NEXT: st.param.b32 [func_retval0], %r5;
; DEFAULT-NEXT: ret;
;; The multiply result feeds the add directly; this pair is the fusion
;; candidate the expectations above are about.
%v0 = fmul float %a, %b
%v1 = fadd float %v0, %c
ret float %v1
}
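;; We cannot form an fma here. When fusion is not allowed, make sure we
;; explicitly emit add.rn.f32 (and the other .rn forms) to prevent ptxas from
;; fusing this with anything else; when fusion is allowed, the forms without
;; an explicit rounding mode are emitted instead.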
;; t1: returns (%a + %b) * (%a - %b) using fadd, fsub and fmul without any
;; fast-math flags. With -fp-contract=fast the expected PTX uses add.f32,
;; sub.f32 and mul.f32 (no rounding modifier); the run without that option
;; expects add.rn.f32, sub.rn.f32 and mul.rn.f32.
define float @t1(float %a, float %b) {
; FAST-LABEL: t1(
; FAST: {
; FAST-NEXT: .reg .b32 %r<6>;
; FAST-EMPTY:
; FAST-NEXT: // %bb.0:
; FAST-NEXT: ld.param.b32 %r1, [t1_param_0];
; FAST-NEXT: ld.param.b32 %r2, [t1_param_1];
; FAST-NEXT: add.f32 %r3, %r1, %r2;
; FAST-NEXT: sub.f32 %r4, %r1, %r2;
; FAST-NEXT: mul.f32 %r5, %r3, %r4;
; FAST-NEXT: st.param.b32 [func_retval0], %r5;
; FAST-NEXT: ret;
;
; DEFAULT-LABEL: t1(
; DEFAULT: {
; DEFAULT-NEXT: .reg .b32 %r<6>;
; DEFAULT-EMPTY:
; DEFAULT-NEXT: // %bb.0:
; DEFAULT-NEXT: ld.param.b32 %r1, [t1_param_0];
; DEFAULT-NEXT: ld.param.b32 %r2, [t1_param_1];
; DEFAULT-NEXT: add.rn.f32 %r3, %r1, %r2;
; DEFAULT-NEXT: sub.rn.f32 %r4, %r1, %r2;
; DEFAULT-NEXT: mul.rn.f32 %r5, %r3, %r4;
; DEFAULT-NEXT: st.param.b32 [func_retval0], %r5;
; DEFAULT-NEXT: ret;
;; The multiply is the final operation here: it consumes the add and the
;; subtract rather than feeding one.
%v1 = fadd float %a, %b
%v2 = fsub float %a, %b
%v3 = fmul float %v1, %v2
ret float %v3
}
;; Make sure we generate the non ".rn" version when the "contract" flag is
;; present on the instructions
;; t2: same (%a + %b) * (%a - %b) shape as t1, but every floating-point
;; instruction carries the "contract" fast-math flag. The expectations use the
;; prefix shared by both llc invocations, so add.f32, sub.f32 and mul.f32
;; (no rounding modifier) are expected with and without -fp-contract=fast.
define float @t2(float %a, float %b) {
; CHECK-LABEL: t2(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<6>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [t2_param_0];
; CHECK-NEXT: ld.param.b32 %r2, [t2_param_1];
; CHECK-NEXT: add.f32 %r3, %r1, %r2;
; CHECK-NEXT: sub.f32 %r4, %r1, %r2;
; CHECK-NEXT: mul.f32 %r5, %r3, %r4;
; CHECK-NEXT: st.param.b32 [func_retval0], %r5;
; CHECK-NEXT: ret;
;; "contract" is present on all three operations, not just the multiply.
%v1 = fadd contract float %a, %b
%v2 = fsub contract float %a, %b
%v3 = fmul contract float %v1, %v2
ret float %v3
}
;; Make sure we always fold to fma when the "contract" flag is present
;; t3: same (%a * %b) + %c shape as t0, but both the fmul and the fadd carry
;; the "contract" fast-math flag. The expectations use the prefix shared by
;; both llc invocations, so a single fma.rn.f32 is expected with and without
;; -fp-contract=fast.
define float @t3(float %a, float %b, float %c) {
; CHECK-LABEL: t3(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b32 %r1, [t3_param_0];
; CHECK-NEXT: ld.param.b32 %r2, [t3_param_1];
; CHECK-NEXT: ld.param.b32 %r3, [t3_param_2];
; CHECK-NEXT: fma.rn.f32 %r4, %r1, %r2, %r3;
; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
; CHECK-NEXT: ret;
;; Flagged multiply feeding a flagged add: the pair expected to become the fma.
%v0 = fmul contract float %a, %b
%v1 = fadd contract float %v0, %c
ret float %v1
}