; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_30 -fp-contract=fast | FileCheck %s --check-prefixes=CHECK,FAST
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_30 | FileCheck %s --check-prefixes=CHECK,DEFAULT
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_30 -fp-contract=fast | %ptxas-verify %}
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_30 | %ptxas-verify %}

target triple = "nvptx64-unknown-cuda"

;; Make sure we are generating proper instruction sequences for fused ops
;; If fusion is allowed, we try to form fma.rn at the PTX level, and emit
;; add.f32 otherwise. Without an explicit rounding mode on add.f32, ptxas
;; is free to fuse with a multiply if it is able. If fusion is not allowed,
;; we do not form fma.rn at the PTX level and explicitly generate add.rn
;; for all adds to prevent ptxas from fusing the ops.

define float @t0(float %a, float %b, float %c) {
; FAST-LABEL: t0(
; FAST:       {
; FAST-NEXT:    .reg .b32 %r<5>;
; FAST-EMPTY:
; FAST-NEXT:  // %bb.0:
; FAST-NEXT:    ld.param.b32 %r1, [t0_param_0];
; FAST-NEXT:    ld.param.b32 %r2, [t0_param_1];
; FAST-NEXT:    ld.param.b32 %r3, [t0_param_2];
; FAST-NEXT:    fma.rn.f32 %r4, %r1, %r2, %r3;
; FAST-NEXT:    st.param.b32 [func_retval0], %r4;
; FAST-NEXT:    ret;
;
; DEFAULT-LABEL: t0(
; DEFAULT:       {
; DEFAULT-NEXT:    .reg .b32 %r<6>;
; DEFAULT-EMPTY:
; DEFAULT-NEXT:  // %bb.0:
; DEFAULT-NEXT:    ld.param.b32 %r1, [t0_param_0];
; DEFAULT-NEXT:    ld.param.b32 %r2, [t0_param_1];
; DEFAULT-NEXT:    mul.rn.f32 %r3, %r1, %r2;
; DEFAULT-NEXT:    ld.param.b32 %r4, [t0_param_2];
; DEFAULT-NEXT:    add.rn.f32 %r5, %r3, %r4;
; DEFAULT-NEXT:    st.param.b32 [func_retval0], %r5;
; DEFAULT-NEXT:    ret;
  %v0 = fmul float %a, %b
  %v1 = fadd float %v0, %c
  ret float %v1
}

;; We cannot form an fma here, but make sure we explicitly emit add.rn.f32
;; to prevent ptxas from fusing this with anything else.
define float @t1(float %a, float %b) {
; FAST-LABEL: t1(
; FAST:       {
; FAST-NEXT:    .reg .b32 %r<6>;
; FAST-EMPTY:
; FAST-NEXT:  // %bb.0:
; FAST-NEXT:    ld.param.b32 %r1, [t1_param_0];
; FAST-NEXT:    ld.param.b32 %r2, [t1_param_1];
; FAST-NEXT:    add.f32 %r3, %r1, %r2;
; FAST-NEXT:    sub.f32 %r4, %r1, %r2;
; FAST-NEXT:    mul.f32 %r5, %r3, %r4;
; FAST-NEXT:    st.param.b32 [func_retval0], %r5;
; FAST-NEXT:    ret;
;
; DEFAULT-LABEL: t1(
; DEFAULT:       {
; DEFAULT-NEXT:    .reg .b32 %r<6>;
; DEFAULT-EMPTY:
; DEFAULT-NEXT:  // %bb.0:
; DEFAULT-NEXT:    ld.param.b32 %r1, [t1_param_0];
; DEFAULT-NEXT:    ld.param.b32 %r2, [t1_param_1];
; DEFAULT-NEXT:    add.rn.f32 %r3, %r1, %r2;
; DEFAULT-NEXT:    sub.rn.f32 %r4, %r1, %r2;
; DEFAULT-NEXT:    mul.rn.f32 %r5, %r3, %r4;
; DEFAULT-NEXT:    st.param.b32 [func_retval0], %r5;
; DEFAULT-NEXT:    ret;
  %v1 = fadd float %a, %b
  %v2 = fsub float %a, %b
  %v3 = fmul float %v1, %v2
  ret float %v3
}

;; Make sure we generate the non ".rn" version when the "contract" flag is
;; present on the instructions

define float @t2(float %a, float %b) {
; CHECK-LABEL: t2(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<6>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b32 %r1, [t2_param_0];
; CHECK-NEXT:    ld.param.b32 %r2, [t2_param_1];
; CHECK-NEXT:    add.f32 %r3, %r1, %r2;
; CHECK-NEXT:    sub.f32 %r4, %r1, %r2;
; CHECK-NEXT:    mul.f32 %r5, %r3, %r4;
; CHECK-NEXT:    st.param.b32 [func_retval0], %r5;
; CHECK-NEXT:    ret;
  %v1 = fadd contract float %a, %b
  %v2 = fsub contract float %a, %b
  %v3 = fmul contract float %v1, %v2
  ret float %v3
}

;; Make sure we always fold to fma when the "contract" flag is present

define float @t3(float %a, float %b, float %c) {
; CHECK-LABEL: t3(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<5>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0:
; CHECK-NEXT:    ld.param.b32 %r1, [t3_param_0];
; CHECK-NEXT:    ld.param.b32 %r2, [t3_param_1];
; CHECK-NEXT:    ld.param.b32 %r3, [t3_param_2];
; CHECK-NEXT:    fma.rn.f32 %r4, %r1, %r2, %r3;
; CHECK-NEXT:    st.param.b32 [func_retval0], %r4;
; CHECK-NEXT:    ret;
  %v0 = fmul contract float %a, %b
  %v1 = fadd contract float %v0, %c
  ret float %v1
}