
Lower `fadd`, `fsub`, `fmul`, and `fma` to the f32x2 variants introduced in PTX 8.6 for sm_100+. This adds a new register class that represents v2f32 as a single b64 register in PTX. As a result, other vector operations such as loads and stores are lowered as .b64 instead of .v2.b32 where appropriate. The test cases are also updated to use the autogenerator.
280 lines
9.3 KiB
LLVM
280 lines
9.3 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %}

; Checks the PTX emitted for vector loads and stores whose declared alignment
; ranges from 1 byte up to the full vector size. The CHECK lines are
; autogenerated; regenerate them with the script named above instead of
; editing them by hand.

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
|
|
|
|
; Load of <4 x float> with align 1: the expected PTX reads all 16 bytes with
; individual ld.b8 instructions, reassembles them into two 64-bit values with
; shl/or, and returns them through a single st.param.v2.b64.
define <4 x float> @t1(ptr %p1) {
; CHECK-LABEL: t1(
; CHECK: {
; CHECK-NEXT: .reg .b64 %rd<46>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [t1_param_0];
; CHECK-NEXT: ld.b8 %rd2, [%rd1+8];
; CHECK-NEXT: ld.b8 %rd3, [%rd1+9];
; CHECK-NEXT: shl.b64 %rd4, %rd3, 8;
; CHECK-NEXT: or.b64 %rd5, %rd4, %rd2;
; CHECK-NEXT: ld.b8 %rd6, [%rd1+10];
; CHECK-NEXT: shl.b64 %rd7, %rd6, 16;
; CHECK-NEXT: ld.b8 %rd8, [%rd1+11];
; CHECK-NEXT: shl.b64 %rd9, %rd8, 24;
; CHECK-NEXT: or.b64 %rd10, %rd9, %rd7;
; CHECK-NEXT: or.b64 %rd11, %rd10, %rd5;
; CHECK-NEXT: ld.b8 %rd12, [%rd1+12];
; CHECK-NEXT: ld.b8 %rd13, [%rd1+13];
; CHECK-NEXT: shl.b64 %rd14, %rd13, 8;
; CHECK-NEXT: or.b64 %rd15, %rd14, %rd12;
; CHECK-NEXT: ld.b8 %rd16, [%rd1+14];
; CHECK-NEXT: shl.b64 %rd17, %rd16, 16;
; CHECK-NEXT: ld.b8 %rd18, [%rd1+15];
; CHECK-NEXT: shl.b64 %rd19, %rd18, 24;
; CHECK-NEXT: or.b64 %rd20, %rd19, %rd17;
; CHECK-NEXT: or.b64 %rd21, %rd20, %rd15;
; CHECK-NEXT: shl.b64 %rd22, %rd21, 32;
; CHECK-NEXT: or.b64 %rd23, %rd22, %rd11;
; CHECK-NEXT: ld.b8 %rd24, [%rd1];
; CHECK-NEXT: ld.b8 %rd25, [%rd1+1];
; CHECK-NEXT: shl.b64 %rd26, %rd25, 8;
; CHECK-NEXT: or.b64 %rd27, %rd26, %rd24;
; CHECK-NEXT: ld.b8 %rd28, [%rd1+2];
; CHECK-NEXT: shl.b64 %rd29, %rd28, 16;
; CHECK-NEXT: ld.b8 %rd30, [%rd1+3];
; CHECK-NEXT: shl.b64 %rd31, %rd30, 24;
; CHECK-NEXT: or.b64 %rd32, %rd31, %rd29;
; CHECK-NEXT: or.b64 %rd33, %rd32, %rd27;
; CHECK-NEXT: ld.b8 %rd34, [%rd1+4];
; CHECK-NEXT: ld.b8 %rd35, [%rd1+5];
; CHECK-NEXT: shl.b64 %rd36, %rd35, 8;
; CHECK-NEXT: or.b64 %rd37, %rd36, %rd34;
; CHECK-NEXT: ld.b8 %rd38, [%rd1+6];
; CHECK-NEXT: shl.b64 %rd39, %rd38, 16;
; CHECK-NEXT: ld.b8 %rd40, [%rd1+7];
; CHECK-NEXT: shl.b64 %rd41, %rd40, 24;
; CHECK-NEXT: or.b64 %rd42, %rd41, %rd39;
; CHECK-NEXT: or.b64 %rd43, %rd42, %rd37;
; CHECK-NEXT: shl.b64 %rd44, %rd43, 32;
; CHECK-NEXT: or.b64 %rd45, %rd44, %rd33;
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd45, %rd23};
; CHECK-NEXT: ret;
  %r = load <4 x float>, ptr %p1, align 1
  ret <4 x float> %r
}
|
|
|
|
; Load of <4 x float> with align 4: the expected PTX issues four ld.b32
; instructions and merges each pair into a 64-bit value with shl/or before
; returning both halves through st.param.v2.b64.
define <4 x float> @t2(ptr %p1) {
; CHECK-LABEL: t2(
; CHECK: {
; CHECK-NEXT: .reg .b64 %rd<10>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [t2_param_0];
; CHECK-NEXT: ld.b32 %rd2, [%rd1+8];
; CHECK-NEXT: ld.b32 %rd3, [%rd1+12];
; CHECK-NEXT: shl.b64 %rd4, %rd3, 32;
; CHECK-NEXT: or.b64 %rd5, %rd4, %rd2;
; CHECK-NEXT: ld.b32 %rd6, [%rd1];
; CHECK-NEXT: ld.b32 %rd7, [%rd1+4];
; CHECK-NEXT: shl.b64 %rd8, %rd7, 32;
; CHECK-NEXT: or.b64 %rd9, %rd8, %rd6;
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd9, %rd5};
; CHECK-NEXT: ret;
  %r = load <4 x float>, ptr %p1, align 4
  ret <4 x float> %r
}
|
|
|
|
; Load of <4 x float> with align 8: the expected PTX uses two scalar ld.b64
; instructions (offsets 8 and 0) and returns them through st.param.v2.b64.
define <4 x float> @t3(ptr %p1) {
; CHECK-LABEL: t3(
; CHECK: {
; CHECK-NEXT: .reg .b64 %rd<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [t3_param_0];
; CHECK-NEXT: ld.b64 %rd2, [%rd1+8];
; CHECK-NEXT: ld.b64 %rd3, [%rd1];
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd2};
; CHECK-NEXT: ret;
  %r = load <4 x float>, ptr %p1, align 8
  ret <4 x float> %r
}
|
|
|
|
; Load of <4 x float> with align 16: the expected PTX uses a single vectorized
; ld.v2.b64 and returns the pair unchanged through st.param.v2.b64.
define <4 x float> @t4(ptr %p1) {
; CHECK-LABEL: t4(
; CHECK: {
; CHECK-NEXT: .reg .b64 %rd<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [t4_param_0];
; CHECK-NEXT: ld.v2.b64 {%rd2, %rd3}, [%rd1];
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd2, %rd3};
; CHECK-NEXT: ret;
  %r = load <4 x float>, ptr %p1, align 16
  ret <4 x float> %r
}
|
|
|
|
; Copy of <1 x half> with align 1 on both the load and the store: the expected
; PTX moves the value as two separate bytes (ld.b8 / st.b8).
define void @test_v1halfp0a1(ptr noalias readonly %from, ptr %to) {
; CHECK-LABEL: test_v1halfp0a1(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<3>;
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [test_v1halfp0a1_param_0];
; CHECK-NEXT: ld.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.b8 %rs2, [%rd1+1];
; CHECK-NEXT: ld.param.b64 %rd2, [test_v1halfp0a1_param_1];
; CHECK-NEXT: st.b8 [%rd2+1], %rs2;
; CHECK-NEXT: st.b8 [%rd2], %rs1;
; CHECK-NEXT: ret;
  %1 = load <1 x half>, ptr %from , align 1
  store <1 x half> %1, ptr %to , align 1
  ret void
}
|
|
|
|
; Copy of <2 x half> with align 1 on both the load and the store: the expected
; PTX moves the four bytes one at a time (four ld.b8, four st.b8).
define void @test_v2halfp0a1(ptr noalias readonly %from, ptr %to) {
; CHECK-LABEL: test_v2halfp0a1(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [test_v2halfp0a1_param_0];
; CHECK-NEXT: ld.b8 %r1, [%rd1+1];
; CHECK-NEXT: ld.b8 %r2, [%rd1];
; CHECK-NEXT: ld.b8 %r3, [%rd1+3];
; CHECK-NEXT: ld.b8 %r4, [%rd1+2];
; CHECK-NEXT: ld.param.b64 %rd2, [test_v2halfp0a1_param_1];
; CHECK-NEXT: st.b8 [%rd2+2], %r4;
; CHECK-NEXT: st.b8 [%rd2+3], %r3;
; CHECK-NEXT: st.b8 [%rd2], %r2;
; CHECK-NEXT: st.b8 [%rd2+1], %r1;
; CHECK-NEXT: ret;
  %1 = load <2 x half>, ptr %from , align 1
  store <2 x half> %1, ptr %to , align 1
  ret void
}
|
|
|
|
; Copy of <4 x half> with align 1 on both the load and the store: the expected
; PTX moves the eight bytes one at a time (eight ld.b8, eight st.b8).
define void @test_v4halfp0a1(ptr noalias readonly %from, ptr %to) {
; CHECK-LABEL: test_v4halfp0a1(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<9>;
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [test_v4halfp0a1_param_0];
; CHECK-NEXT: ld.b8 %r1, [%rd1+1];
; CHECK-NEXT: ld.b8 %r2, [%rd1];
; CHECK-NEXT: ld.b8 %r3, [%rd1+3];
; CHECK-NEXT: ld.b8 %r4, [%rd1+2];
; CHECK-NEXT: ld.b8 %r5, [%rd1+5];
; CHECK-NEXT: ld.b8 %r6, [%rd1+4];
; CHECK-NEXT: ld.b8 %r7, [%rd1+7];
; CHECK-NEXT: ld.b8 %r8, [%rd1+6];
; CHECK-NEXT: ld.param.b64 %rd2, [test_v4halfp0a1_param_1];
; CHECK-NEXT: st.b8 [%rd2+6], %r8;
; CHECK-NEXT: st.b8 [%rd2+7], %r7;
; CHECK-NEXT: st.b8 [%rd2+4], %r6;
; CHECK-NEXT: st.b8 [%rd2+5], %r5;
; CHECK-NEXT: st.b8 [%rd2+2], %r4;
; CHECK-NEXT: st.b8 [%rd2+3], %r3;
; CHECK-NEXT: st.b8 [%rd2], %r2;
; CHECK-NEXT: st.b8 [%rd2+1], %r1;
; CHECK-NEXT: ret;
  %1 = load <4 x half>, ptr %from , align 1
  store <4 x half> %1, ptr %to , align 1
  ret void
}
|
|
|
|
|
|
; Store of <4 x float> with align 1: the vector parameter arrives as two
; 64-bit values (ld.param.v2.b64), and the expected PTX writes all 16 bytes
; with individual st.b8 instructions, using shr.u64 to extract each
; higher-order byte.
define void @s1(ptr %p1, <4 x float> %v) {
; CHECK-LABEL: s1(
; CHECK: {
; CHECK-NEXT: .reg .b64 %rd<18>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [s1_param_0];
; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [s1_param_1];
; CHECK-NEXT: st.b8 [%rd1+8], %rd3;
; CHECK-NEXT: st.b8 [%rd1], %rd2;
; CHECK-NEXT: shr.u64 %rd4, %rd3, 56;
; CHECK-NEXT: st.b8 [%rd1+15], %rd4;
; CHECK-NEXT: shr.u64 %rd5, %rd3, 48;
; CHECK-NEXT: st.b8 [%rd1+14], %rd5;
; CHECK-NEXT: shr.u64 %rd6, %rd3, 40;
; CHECK-NEXT: st.b8 [%rd1+13], %rd6;
; CHECK-NEXT: shr.u64 %rd7, %rd3, 32;
; CHECK-NEXT: st.b8 [%rd1+12], %rd7;
; CHECK-NEXT: shr.u64 %rd8, %rd3, 24;
; CHECK-NEXT: st.b8 [%rd1+11], %rd8;
; CHECK-NEXT: shr.u64 %rd9, %rd3, 16;
; CHECK-NEXT: st.b8 [%rd1+10], %rd9;
; CHECK-NEXT: shr.u64 %rd10, %rd3, 8;
; CHECK-NEXT: st.b8 [%rd1+9], %rd10;
; CHECK-NEXT: shr.u64 %rd11, %rd2, 56;
; CHECK-NEXT: st.b8 [%rd1+7], %rd11;
; CHECK-NEXT: shr.u64 %rd12, %rd2, 48;
; CHECK-NEXT: st.b8 [%rd1+6], %rd12;
; CHECK-NEXT: shr.u64 %rd13, %rd2, 40;
; CHECK-NEXT: st.b8 [%rd1+5], %rd13;
; CHECK-NEXT: shr.u64 %rd14, %rd2, 32;
; CHECK-NEXT: st.b8 [%rd1+4], %rd14;
; CHECK-NEXT: shr.u64 %rd15, %rd2, 24;
; CHECK-NEXT: st.b8 [%rd1+3], %rd15;
; CHECK-NEXT: shr.u64 %rd16, %rd2, 16;
; CHECK-NEXT: st.b8 [%rd1+2], %rd16;
; CHECK-NEXT: shr.u64 %rd17, %rd2, 8;
; CHECK-NEXT: st.b8 [%rd1+1], %rd17;
; CHECK-NEXT: ret;
  store <4 x float> %v, ptr %p1, align 1
  ret void
}
|
|
|
|
; Store of <4 x float> with align 4: the expected PTX writes four st.b32
; instructions, using shr.u64 by 32 to obtain the upper word of each 64-bit
; parameter half.
define void @s2(ptr %p1, <4 x float> %v) {
; CHECK-LABEL: s2(
; CHECK: {
; CHECK-NEXT: .reg .b64 %rd<6>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [s2_param_0];
; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [s2_param_1];
; CHECK-NEXT: st.b32 [%rd1+8], %rd3;
; CHECK-NEXT: st.b32 [%rd1], %rd2;
; CHECK-NEXT: shr.u64 %rd4, %rd3, 32;
; CHECK-NEXT: st.b32 [%rd1+12], %rd4;
; CHECK-NEXT: shr.u64 %rd5, %rd2, 32;
; CHECK-NEXT: st.b32 [%rd1+4], %rd5;
; CHECK-NEXT: ret;
  store <4 x float> %v, ptr %p1, align 4
  ret void
}
|
|
|
|
; Store of <4 x float> with align 8: the expected PTX writes the two 64-bit
; parameter halves with two scalar st.b64 instructions (offsets 8 and 0).
define void @s3(ptr %p1, <4 x float> %v) {
; CHECK-LABEL: s3(
; CHECK: {
; CHECK-NEXT: .reg .b64 %rd<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [s3_param_0];
; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [s3_param_1];
; CHECK-NEXT: st.b64 [%rd1+8], %rd3;
; CHECK-NEXT: st.b64 [%rd1], %rd2;
; CHECK-NEXT: ret;
  store <4 x float> %v, ptr %p1, align 8
  ret void
}
|
|
|
|
; Store of <4 x float> with align 16: the expected PTX writes the whole vector
; with a single vectorized st.v2.b64.
define void @s4(ptr %p1, <4 x float> %v) {
; CHECK-LABEL: s4(
; CHECK: {
; CHECK-NEXT: .reg .b64 %rd<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [s4_param_0];
; CHECK-NEXT: ld.param.v2.b64 {%rd2, %rd3}, [s4_param_1];
; CHECK-NEXT: st.v2.b64 [%rd1], {%rd2, %rd3};
; CHECK-NEXT: ret;
  store <4 x float> %v, ptr %p1, align 16
  ret void
}
|
|
|