; Source: llvm-project/llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll (file-viewer header: 276 lines, 9.1 KiB, LLVM)

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
; The first command above compiles this file for nvptx64/sm_20 and matches the
; emitted PTX against the per-function expectations; the second feeds the same
; PTX to ptxas-verify, guarded by the "ptxas" condition.
;
; Data layout note: the "v128:128:128" entry gives 128-bit vectors a 128-bit
; alignment, so it is the explicit "align N" on each load/store below that makes
; an access under-aligned.
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
; t1: load a <4 x float> through %p1 with only 1-byte alignment and return it.
; Expected PTX: sixteen ld.b8 byte loads (offsets 0..15). Within each 4-byte
; group the bytes at +1/+2/+3 are shifted left by 8/16/24 and or'ed with the
; byte at +0 to rebuild one 32-bit value; the four rebuilt values are returned
; with a single st.param.v4.b32.
define <4 x float> @t1(ptr %p1) {
; CHECK-LABEL: t1(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<41>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [t1_param_0];
; CHECK-NEXT: ld.b8 %r1, [%rd1+12];
; CHECK-NEXT: ld.b8 %r2, [%rd1+13];
; CHECK-NEXT: shl.b32 %r3, %r2, 8;
; CHECK-NEXT: or.b32 %r4, %r3, %r1;
; CHECK-NEXT: ld.b8 %r5, [%rd1+14];
; CHECK-NEXT: shl.b32 %r6, %r5, 16;
; CHECK-NEXT: ld.b8 %r7, [%rd1+15];
; CHECK-NEXT: shl.b32 %r8, %r7, 24;
; CHECK-NEXT: or.b32 %r9, %r8, %r6;
; CHECK-NEXT: or.b32 %r10, %r9, %r4;
; CHECK-NEXT: ld.b8 %r11, [%rd1+8];
; CHECK-NEXT: ld.b8 %r12, [%rd1+9];
; CHECK-NEXT: shl.b32 %r13, %r12, 8;
; CHECK-NEXT: or.b32 %r14, %r13, %r11;
; CHECK-NEXT: ld.b8 %r15, [%rd1+10];
; CHECK-NEXT: shl.b32 %r16, %r15, 16;
; CHECK-NEXT: ld.b8 %r17, [%rd1+11];
; CHECK-NEXT: shl.b32 %r18, %r17, 24;
; CHECK-NEXT: or.b32 %r19, %r18, %r16;
; CHECK-NEXT: or.b32 %r20, %r19, %r14;
; CHECK-NEXT: ld.b8 %r21, [%rd1+4];
; CHECK-NEXT: ld.b8 %r22, [%rd1+5];
; CHECK-NEXT: shl.b32 %r23, %r22, 8;
; CHECK-NEXT: or.b32 %r24, %r23, %r21;
; CHECK-NEXT: ld.b8 %r25, [%rd1+6];
; CHECK-NEXT: shl.b32 %r26, %r25, 16;
; CHECK-NEXT: ld.b8 %r27, [%rd1+7];
; CHECK-NEXT: shl.b32 %r28, %r27, 24;
; CHECK-NEXT: or.b32 %r29, %r28, %r26;
; CHECK-NEXT: or.b32 %r30, %r29, %r24;
; CHECK-NEXT: ld.b8 %r31, [%rd1];
; CHECK-NEXT: ld.b8 %r32, [%rd1+1];
; CHECK-NEXT: shl.b32 %r33, %r32, 8;
; CHECK-NEXT: or.b32 %r34, %r33, %r31;
; CHECK-NEXT: ld.b8 %r35, [%rd1+2];
; CHECK-NEXT: shl.b32 %r36, %r35, 16;
; CHECK-NEXT: ld.b8 %r37, [%rd1+3];
; CHECK-NEXT: shl.b32 %r38, %r37, 24;
; CHECK-NEXT: or.b32 %r39, %r38, %r36;
; CHECK-NEXT: or.b32 %r40, %r39, %r34;
; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r40, %r30, %r20, %r10};
; CHECK-NEXT: ret;
; "align 1" is the property under test: the expected PTX above contains no
; memory access wider than one byte.
%r = load <4 x float>, ptr %p1, align 1
ret <4 x float> %r
}
; t2: load a <4 x float> through %p1 with 4-byte alignment and return it.
; Expected PTX: four scalar ld.b32 loads at offsets 12, 8, 4 and 0 (one per
; element, no vector load), returned with a single st.param.v4.b32.
define <4 x float> @t2(ptr %p1) {
; CHECK-LABEL: t2(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [t2_param_0];
; CHECK-NEXT: ld.b32 %r1, [%rd1+12];
; CHECK-NEXT: ld.b32 %r2, [%rd1+8];
; CHECK-NEXT: ld.b32 %r3, [%rd1+4];
; CHECK-NEXT: ld.b32 %r4, [%rd1];
; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r4, %r3, %r2, %r1};
; CHECK-NEXT: ret;
; "align 4" matches the element size, so each element is loaded whole but the
; vector is not loaded as a unit.
%r = load <4 x float>, ptr %p1, align 4
ret <4 x float> %r
}
; t3: load a <4 x float> through %p1 with 8-byte alignment and return it.
; Expected PTX: two ld.v2.b32 loads (offsets 8 and 0), each fetching a pair of
; elements, returned with a single st.param.v4.b32.
define <4 x float> @t3(ptr %p1) {
; CHECK-LABEL: t3(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [t3_param_0];
; CHECK-NEXT: ld.v2.b32 {%r1, %r2}, [%rd1+8];
; CHECK-NEXT: ld.v2.b32 {%r3, %r4}, [%rd1];
; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r3, %r4, %r1, %r2};
; CHECK-NEXT: ret;
; "align 8" covers half of the 16-byte vector, hence the two half-width loads.
%r = load <4 x float>, ptr %p1, align 8
ret <4 x float> %r
}
; t4: load a <4 x float> through %p1 with full 16-byte alignment and return it.
; Expected PTX: one ld.v4.b32 for the whole vector, returned with a single
; st.param.v4.b32. This is the fully aligned baseline for t1..t3.
define <4 x float> @t4(ptr %p1) {
; CHECK-LABEL: t4(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [t4_param_0];
; CHECK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r1, %r2, %r3, %r4};
; CHECK-NEXT: ret;
; "align 16" equals the vector size, so a single vector load is expected.
%r = load <4 x float>, ptr %p1, align 16
ret <4 x float> %r
}
; test_v1halfp0a1: copy a <1 x half> from %from to %to, both with 1-byte
; alignment.
; Expected PTX: two ld.b8 loads into 16-bit registers and two st.b8 stores;
; the bytes are moved individually with no shift/or recombination.
define void @test_v1halfp0a1(ptr noalias readonly %from, ptr %to) {
; CHECK-LABEL: test_v1halfp0a1(
; CHECK: {
; CHECK-NEXT: .reg .b16 %rs<3>;
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [test_v1halfp0a1_param_0];
; CHECK-NEXT: ld.b8 %rs1, [%rd1];
; CHECK-NEXT: ld.b8 %rs2, [%rd1+1];
; CHECK-NEXT: ld.param.b64 %rd2, [test_v1halfp0a1_param_1];
; CHECK-NEXT: st.b8 [%rd2+1], %rs2;
; CHECK-NEXT: st.b8 [%rd2], %rs1;
; CHECK-NEXT: ret;
; Both the load and the store are "align 1", i.e. below the 2-byte size of half.
%1 = load <1 x half>, ptr %from , align 1
store <1 x half> %1, ptr %to , align 1
ret void
}
; test_v2halfp0a1: copy a <2 x half> from %from to %to, both with 1-byte
; alignment.
; Expected PTX: four ld.b8 loads (offsets 0..3) and four st.b8 stores to the
; same offsets; each byte travels in its own 32-bit register with no shift/or
; recombination.
define void @test_v2halfp0a1(ptr noalias readonly %from, ptr %to) {
; CHECK-LABEL: test_v2halfp0a1(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [test_v2halfp0a1_param_0];
; CHECK-NEXT: ld.b8 %r1, [%rd1+1];
; CHECK-NEXT: ld.b8 %r2, [%rd1];
; CHECK-NEXT: ld.b8 %r3, [%rd1+3];
; CHECK-NEXT: ld.b8 %r4, [%rd1+2];
; CHECK-NEXT: ld.param.b64 %rd2, [test_v2halfp0a1_param_1];
; CHECK-NEXT: st.b8 [%rd2+2], %r4;
; CHECK-NEXT: st.b8 [%rd2+3], %r3;
; CHECK-NEXT: st.b8 [%rd2], %r2;
; CHECK-NEXT: st.b8 [%rd2+1], %r1;
; CHECK-NEXT: ret;
; Both the load and the store are "align 1" on a 4-byte vector.
%1 = load <2 x half>, ptr %from , align 1
store <2 x half> %1, ptr %to , align 1
ret void
}
; test_v4halfp0a1: copy a <4 x half> from %from to %to, both with 1-byte
; alignment.
; Expected PTX: eight ld.b8 loads (offsets 0..7) and eight st.b8 stores to the
; same offsets; each byte travels in its own 32-bit register with no shift/or
; recombination.
define void @test_v4halfp0a1(ptr noalias readonly %from, ptr %to) {
; CHECK-LABEL: test_v4halfp0a1(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<9>;
; CHECK-NEXT: .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [test_v4halfp0a1_param_0];
; CHECK-NEXT: ld.b8 %r1, [%rd1+1];
; CHECK-NEXT: ld.b8 %r2, [%rd1];
; CHECK-NEXT: ld.b8 %r3, [%rd1+3];
; CHECK-NEXT: ld.b8 %r4, [%rd1+2];
; CHECK-NEXT: ld.b8 %r5, [%rd1+5];
; CHECK-NEXT: ld.b8 %r6, [%rd1+4];
; CHECK-NEXT: ld.b8 %r7, [%rd1+7];
; CHECK-NEXT: ld.b8 %r8, [%rd1+6];
; CHECK-NEXT: ld.param.b64 %rd2, [test_v4halfp0a1_param_1];
; CHECK-NEXT: st.b8 [%rd2+6], %r8;
; CHECK-NEXT: st.b8 [%rd2+7], %r7;
; CHECK-NEXT: st.b8 [%rd2+4], %r6;
; CHECK-NEXT: st.b8 [%rd2+5], %r5;
; CHECK-NEXT: st.b8 [%rd2+2], %r4;
; CHECK-NEXT: st.b8 [%rd2+3], %r3;
; CHECK-NEXT: st.b8 [%rd2], %r2;
; CHECK-NEXT: st.b8 [%rd2+1], %r1;
; CHECK-NEXT: ret;
; Both the load and the store are "align 1" on an 8-byte vector.
%1 = load <4 x half>, ptr %from , align 1
store <4 x half> %1, ptr %to , align 1
ret void
}
; s1: store the <4 x float> argument %v through %p1 with only 1-byte alignment.
; Expected PTX: the vector arrives via one ld.param.v4.b32, then sixteen st.b8
; byte stores are emitted. The low byte of each element is stored straight from
; its register (offsets 0, 4, 8, 12); the other three bytes of each element are
; first extracted with shr.u32 by 8, 16 and 24.
define void @s1(ptr %p1, <4 x float> %v) {
; CHECK-LABEL: s1(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<17>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [s1_param_0];
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [s1_param_1];
; CHECK-NEXT: st.b8 [%rd1+12], %r4;
; CHECK-NEXT: st.b8 [%rd1+8], %r3;
; CHECK-NEXT: st.b8 [%rd1+4], %r2;
; CHECK-NEXT: st.b8 [%rd1], %r1;
; CHECK-NEXT: shr.u32 %r5, %r4, 24;
; CHECK-NEXT: st.b8 [%rd1+15], %r5;
; CHECK-NEXT: shr.u32 %r6, %r4, 16;
; CHECK-NEXT: st.b8 [%rd1+14], %r6;
; CHECK-NEXT: shr.u32 %r7, %r4, 8;
; CHECK-NEXT: st.b8 [%rd1+13], %r7;
; CHECK-NEXT: shr.u32 %r8, %r3, 24;
; CHECK-NEXT: st.b8 [%rd1+11], %r8;
; CHECK-NEXT: shr.u32 %r9, %r3, 16;
; CHECK-NEXT: st.b8 [%rd1+10], %r9;
; CHECK-NEXT: shr.u32 %r10, %r3, 8;
; CHECK-NEXT: st.b8 [%rd1+9], %r10;
; CHECK-NEXT: shr.u32 %r11, %r2, 24;
; CHECK-NEXT: st.b8 [%rd1+7], %r11;
; CHECK-NEXT: shr.u32 %r12, %r2, 16;
; CHECK-NEXT: st.b8 [%rd1+6], %r12;
; CHECK-NEXT: shr.u32 %r13, %r2, 8;
; CHECK-NEXT: st.b8 [%rd1+5], %r13;
; CHECK-NEXT: shr.u32 %r14, %r1, 24;
; CHECK-NEXT: st.b8 [%rd1+3], %r14;
; CHECK-NEXT: shr.u32 %r15, %r1, 16;
; CHECK-NEXT: st.b8 [%rd1+2], %r15;
; CHECK-NEXT: shr.u32 %r16, %r1, 8;
; CHECK-NEXT: st.b8 [%rd1+1], %r16;
; CHECK-NEXT: ret;
; "align 1" is the property under test: the expected PTX above contains no
; store wider than one byte.
store <4 x float> %v, ptr %p1, align 1
ret void
}
; s2: store the <4 x float> argument %v through %p1 with 4-byte alignment.
; Expected PTX: four scalar st.b32 stores at offsets 12, 8, 4 and 0 (one per
; element, no vector store).
define void @s2(ptr %p1, <4 x float> %v) {
; CHECK-LABEL: s2(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [s2_param_0];
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [s2_param_1];
; CHECK-NEXT: st.b32 [%rd1+12], %r4;
; CHECK-NEXT: st.b32 [%rd1+8], %r3;
; CHECK-NEXT: st.b32 [%rd1+4], %r2;
; CHECK-NEXT: st.b32 [%rd1], %r1;
; CHECK-NEXT: ret;
; "align 4" matches the element size, so each element is stored whole but the
; vector is not stored as a unit.
store <4 x float> %v, ptr %p1, align 4
ret void
}
; s3: store the <4 x float> argument %v through %p1 with 8-byte alignment.
; Expected PTX: two st.v2.b32 stores (offsets 8 and 0), each writing a pair of
; elements.
define void @s3(ptr %p1, <4 x float> %v) {
; CHECK-LABEL: s3(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [s3_param_0];
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [s3_param_1];
; CHECK-NEXT: st.v2.b32 [%rd1+8], {%r3, %r4};
; CHECK-NEXT: st.v2.b32 [%rd1], {%r1, %r2};
; CHECK-NEXT: ret;
; "align 8" covers half of the 16-byte vector, hence the two half-width stores.
store <4 x float> %v, ptr %p1, align 8
ret void
}
; s4: store the <4 x float> argument %v through %p1 with full 16-byte alignment.
; Expected PTX: one st.v4.b32 for the whole vector. This is the fully aligned
; baseline for s1..s3.
define void @s4(ptr %p1, <4 x float> %v) {
; CHECK-LABEL: s4(
; CHECK: {
; CHECK-NEXT: .reg .b32 %r<5>;
; CHECK-NEXT: .reg .b64 %rd<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.b64 %rd1, [s4_param_0];
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [s4_param_1];
; CHECK-NEXT: st.v4.b32 [%rd1], {%r1, %r2, %r3, %r4};
; CHECK-NEXT: ret;
; "align 16" equals the vector size, so a single vector store is expected.
store <4 x float> %v, ptr %p1, align 16
ret void
}