
Lower `fadd`, `fsub`, `fmul`, and `fma` to the f32x2 variants introduced in PTX 8.6 for sm_100+. Add a new register class for v2f32, represented as a b64 register in PTX. As a consequence, other vector operations such as loads and stores are lowered as .b64 instead of .v2.b32 where appropriate. Also update the test cases to use autogenerated assertions (utils/update_llc_test_checks.py).
61 lines
1.9 KiB
LLVM
61 lines
1.9 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %}

; Both invocations above compile this file for nvptx64 / sm_20. The first one
; matches the generated PTX against the assertions embedded in each function;
; the second one is guarded by the `ptxas` feature and pipes the same PTX into
; %ptxas-verify.

; foo: multiplies a <2 x float> argument by itself lane-wise and returns the
; sum of lanes 0 and 1. The expected PTX loads the parameter as two b32
; registers, squares each lane with a scalar mul.rn.f32, and combines them
; with a scalar add.rn.f32.
define float @foo(<2 x float> %a) {
; CHECK-LABEL: foo(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<6>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [foo_param_0];
; CHECK-NEXT: mul.rn.f32 %r3, %r2, %r2;
; CHECK-NEXT: mul.rn.f32 %r4, %r1, %r1;
; CHECK-NEXT: add.rn.f32 %r5, %r4, %r3;
; CHECK-NEXT: st.param.b32 [func_retval0], %r5;
; CHECK-NEXT: ret;
  %t1 = fmul <2 x float> %a, %a
  %t2 = extractelement <2 x float> %t1, i32 0
  %t3 = extractelement <2 x float> %t1, i32 1
  %t4 = fadd float %t2, %t3
  ret float %t4
}

; bar: same computation as a 2-lane square-and-sum, but on a <4 x float>
; argument of which only lanes 0 and 1 are extracted. The expected PTX still
; loads all four lanes with ld.param.v4.b32, yet contains only two scalar
; mul.rn.f32 instructions (for the two lanes that are actually used) and one
; add.rn.f32.
define float @bar(<4 x float> %a) {
; CHECK-LABEL: bar(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<8>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [bar_param_0];
; CHECK-NEXT: mul.rn.f32 %r5, %r2, %r2;
; CHECK-NEXT: mul.rn.f32 %r6, %r1, %r1;
; CHECK-NEXT: add.rn.f32 %r7, %r6, %r5;
; CHECK-NEXT: st.param.b32 [func_retval0], %r7;
; CHECK-NEXT: ret;
  %t1 = fmul <4 x float> %a, %a
  %t2 = extractelement <4 x float> %t1, i32 0
  %t3 = extractelement <4 x float> %t1, i32 1
  %t4 = fadd float %t2, %t3
  ret float %t4
}

; baz: multiplies a <4 x float> argument by itself lane-wise and returns the
; whole vector. The expected PTX loads the four lanes with ld.param.v4.b32,
; squares each lane with its own scalar mul.rn.f32, and writes the result back
; with a single st.param.v4.b32.
define <4 x float> @baz(<4 x float> %a) {
; CHECK-LABEL: baz(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<9>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [baz_param_0];
; CHECK-NEXT: mul.rn.f32 %r5, %r4, %r4;
; CHECK-NEXT: mul.rn.f32 %r6, %r3, %r3;
; CHECK-NEXT: mul.rn.f32 %r7, %r2, %r2;
; CHECK-NEXT: mul.rn.f32 %r8, %r1, %r1;
; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r8, %r7, %r6, %r5};
; CHECK-NEXT: ret;
  %t1 = fmul <4 x float> %a, %a
  ret <4 x float> %t1
}