llvm-project/llvm/test/CodeGen/NVPTX/vec-param-load.ll
Princeton Ferro 1fdbe69849
[NVPTX] support f32x2 instructions for sm_100+ (#126337)
Lower `fadd`, `fsub`, `fmul`, and `fma` to f32x2 variants introduced in
PTX 8.6 for sm_100+. Adds a new register class for v2f32 as a b64
register in PTX. This causes other vector operations like loads and
stores to lower as .b64 instead of .v2.b32 as appropriate.

Also update test cases to use the autogenerator.
2025-07-11 11:50:50 -07:00

110 lines
4.0 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
define <16 x float> @test_v16f32(<16 x float> %a) {
; CHECK-LABEL: test_v16f32(
; CHECK: {
; CHECK-NEXT: .reg .b64 %rd<9>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_v16f32_param_0];
; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_v16f32_param_0+16];
; CHECK-NEXT: ld.param.v2.b64 {%rd5, %rd6}, [test_v16f32_param_0+32];
; CHECK-NEXT: ld.param.v2.b64 {%rd7, %rd8}, [test_v16f32_param_0+48];
; CHECK-NEXT: st.param.v2.b64 [func_retval0+48], {%rd7, %rd8};
; CHECK-NEXT: st.param.v2.b64 [func_retval0+32], {%rd5, %rd6};
; CHECK-NEXT: st.param.v2.b64 [func_retval0+16], {%rd3, %rd4};
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2};
; CHECK-NEXT: ret;
ret <16 x float> %a
}
define <8 x float> @test_v8f32(<8 x float> %a) {
; CHECK-LABEL: test_v8f32(
; CHECK: {
; CHECK-NEXT: .reg .b64 %rd<5>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_v8f32_param_0];
; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_v8f32_param_0+16];
; CHECK-NEXT: st.param.v2.b64 [func_retval0+16], {%rd3, %rd4};
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2};
; CHECK-NEXT: ret;
ret <8 x float> %a
}
define <4 x float> @test_v4f32(<4 x float> %a) {
; CHECK-LABEL: test_v4f32(
; CHECK: {
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_v4f32_param_0];
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2};
; CHECK-NEXT: ret;
ret <4 x float> %a
}
define <2 x float> @test_v2f32(<2 x float> %a) {
; CHECK-LABEL: test_v2f32(
; CHECK: {
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [test_v2f32_param_0];
; CHECK-NEXT: st.param.b64 [func_retval0], %rd1;
; CHECK-NEXT: ret;
ret <2 x float> %a
}
; Oddly shaped vectors should not load any extra elements.
define <3 x float> @test_v3f32(<3 x float> %a) {
; CHECK-LABEL: test_v3f32(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [test_v3f32_param_0];
; CHECK-NEXT: ld.param.b32 %r1, [test_v3f32_param_0+8];
; CHECK-NEXT: st.param.b32 [func_retval0+8], %r1;
; CHECK-NEXT: st.param.b64 [func_retval0], %rd1;
; CHECK-NEXT: ret;
ret <3 x float> %a
}
define <8 x i64> @test_v8i64(<8 x i64> %a) {
; CHECK-LABEL: test_v8i64(
; CHECK: {
; CHECK-NEXT: .reg .b64 %rd<9>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_v8i64_param_0];
; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_v8i64_param_0+16];
; CHECK-NEXT: ld.param.v2.b64 {%rd5, %rd6}, [test_v8i64_param_0+32];
; CHECK-NEXT: ld.param.v2.b64 {%rd7, %rd8}, [test_v8i64_param_0+48];
; CHECK-NEXT: st.param.v2.b64 [func_retval0+48], {%rd7, %rd8};
; CHECK-NEXT: st.param.v2.b64 [func_retval0+32], {%rd5, %rd6};
; CHECK-NEXT: st.param.v2.b64 [func_retval0+16], {%rd3, %rd4};
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2};
; CHECK-NEXT: ret;
ret <8 x i64> %a
}
define <16 x i16> @test_v16i16(<16 x i16> %a) {
; CHECK-LABEL: test_v16i16(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<9>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_v16i16_param_0];
; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [test_v16i16_param_0+16];
; CHECK-NEXT: st.param.v4.b32 [func_retval0+16], {%r5, %r6, %r7, %r8};
; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r1, %r2, %r3, %r4};
; CHECK-NEXT: ret;
ret <16 x i16> %a
}