llvm-project/llvm/test/CodeGen/NVPTX/variadics-backend.ll
Alex MacLean d494eb0fa3
[NVPTX] Skip numbering unreferenced virtual registers (readability) (#154391)
When assigning numbers to registers, skip any with neither uses nor
defs. This will not have any impact at all on the final SASS but it
makes for slightly more readable PTX. This change should also ensure
that future minor changes are less likely to cause noisy diffs in
register numbering.
2025-08-19 12:27:46 -07:00

381 lines
16 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64-- -mtriple=nvptx64 -mcpu=sm_52 -mattr=+ptx64 < %s | FileCheck %s --check-prefix=CHECK-PTX
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64-- -mtriple=nvptx64 -mcpu=sm_52 -mattr=+ptx64 | %ptxas-verify %}
; Struct layouts and constant initializers used by the callers below.
; S1 has padding after the i8 (offset 5..7), exercised by the SROA'd
; vararg copy in @variadics2; S2 is two i64s passed byval in @variadics4.
%struct.S1 = type { i32, i8, i64 }
%struct.S2 = type { i64, i64 }
@__const.bar.s1 = private unnamed_addr constant %struct.S1 { i32 1, i8 1, i64 1 }, align 8
@__const.qux.s = private unnamed_addr constant %struct.S2 { i64 1, i64 1 }, align 8
; Reads varargs in sequence from the va_list pointer: three i32s (4-byte
; slots), then one i64 and two doubles (8-byte-aligned slots, realigned
; with llvm.ptrmask), accumulating everything into a single i32 result.
define dso_local i32 @variadics1(i32 noundef %first, ...) {
; CHECK-PTX-LABEL: variadics1(
; CHECK-PTX: {
; CHECK-PTX-NEXT: .reg .b32 %r<11>;
; CHECK-PTX-NEXT: .reg .b64 %rd<17>;
; CHECK-PTX-EMPTY:
; CHECK-PTX-NEXT: // %bb.0: // %entry
; CHECK-PTX-NEXT: ld.param.b32 %r1, [variadics1_param_0];
; CHECK-PTX-NEXT: ld.param.b64 %rd1, [variadics1_param_1];
; CHECK-PTX-NEXT: ld.b32 %r2, [%rd1];
; CHECK-PTX-NEXT: add.s32 %r3, %r1, %r2;
; CHECK-PTX-NEXT: ld.b32 %r4, [%rd1+4];
; CHECK-PTX-NEXT: add.s32 %r5, %r3, %r4;
; CHECK-PTX-NEXT: ld.b32 %r6, [%rd1+8];
; CHECK-PTX-NEXT: add.s32 %r7, %r5, %r6;
; CHECK-PTX-NEXT: add.s64 %rd2, %rd1, 19;
; CHECK-PTX-NEXT: and.b64 %rd3, %rd2, -8;
; CHECK-PTX-NEXT: ld.b64 %rd4, [%rd3];
; CHECK-PTX-NEXT: cvt.u64.u32 %rd5, %r7;
; CHECK-PTX-NEXT: add.s64 %rd6, %rd5, %rd4;
; CHECK-PTX-NEXT: cvt.u32.u64 %r8, %rd6;
; CHECK-PTX-NEXT: add.s64 %rd7, %rd3, 15;
; CHECK-PTX-NEXT: and.b64 %rd8, %rd7, -8;
; CHECK-PTX-NEXT: ld.b64 %rd9, [%rd8];
; CHECK-PTX-NEXT: cvt.rn.f64.s32 %rd10, %r8;
; CHECK-PTX-NEXT: add.rn.f64 %rd11, %rd10, %rd9;
; CHECK-PTX-NEXT: cvt.rzi.s32.f64 %r9, %rd11;
; CHECK-PTX-NEXT: add.s64 %rd12, %rd8, 15;
; CHECK-PTX-NEXT: and.b64 %rd13, %rd12, -8;
; CHECK-PTX-NEXT: ld.b64 %rd14, [%rd13];
; CHECK-PTX-NEXT: cvt.rn.f64.s32 %rd15, %r9;
; CHECK-PTX-NEXT: add.rn.f64 %rd16, %rd15, %rd14;
; CHECK-PTX-NEXT: cvt.rzi.s32.f64 %r10, %rd16;
; CHECK-PTX-NEXT: st.param.b32 [func_retval0], %r10;
; CHECK-PTX-NEXT: ret;
entry:
  %vlist = alloca ptr, align 8
  call void @llvm.va_start.p0(ptr %vlist)
  ; Three i32 varargs: each advances the cursor by 4, no realignment.
  %argp.cur = load ptr, ptr %vlist, align 8
  %argp.next = getelementptr inbounds i8, ptr %argp.cur, i64 4
  store ptr %argp.next, ptr %vlist, align 8
  %0 = load i32, ptr %argp.cur, align 4
  %add = add nsw i32 %first, %0
  %argp.cur1 = load ptr, ptr %vlist, align 8
  %argp.next2 = getelementptr inbounds i8, ptr %argp.cur1, i64 4
  store ptr %argp.next2, ptr %vlist, align 8
  %1 = load i32, ptr %argp.cur1, align 4
  %add3 = add nsw i32 %add, %1
  %argp.cur4 = load ptr, ptr %vlist, align 8
  %argp.next5 = getelementptr inbounds i8, ptr %argp.cur4, i64 4
  store ptr %argp.next5, ptr %vlist, align 8
  %2 = load i32, ptr %argp.cur4, align 4
  %add6 = add nsw i32 %add3, %2
  ; i64 vararg: round the cursor up to 8 bytes (add 7, mask with -8).
  %argp.cur7 = load ptr, ptr %vlist, align 8
  %3 = getelementptr inbounds i8, ptr %argp.cur7, i32 7
  %argp.cur7.aligned = call ptr @llvm.ptrmask.p0.i64(ptr %3, i64 -8)
  %argp.next8 = getelementptr inbounds i8, ptr %argp.cur7.aligned, i64 8
  store ptr %argp.next8, ptr %vlist, align 8
  %4 = load i64, ptr %argp.cur7.aligned, align 8
  %conv = sext i32 %add6 to i64
  %add9 = add nsw i64 %conv, %4
  %conv10 = trunc i64 %add9 to i32
  ; First double vararg, same 8-byte realignment pattern.
  %argp.cur11 = load ptr, ptr %vlist, align 8
  %5 = getelementptr inbounds i8, ptr %argp.cur11, i32 7
  %argp.cur11.aligned = call ptr @llvm.ptrmask.p0.i64(ptr %5, i64 -8)
  %argp.next12 = getelementptr inbounds i8, ptr %argp.cur11.aligned, i64 8
  store ptr %argp.next12, ptr %vlist, align 8
  %6 = load double, ptr %argp.cur11.aligned, align 8
  %conv13 = sitofp i32 %conv10 to double
  %add14 = fadd double %conv13, %6
  %conv15 = fptosi double %add14 to i32
  ; Second double vararg.
  %argp.cur16 = load ptr, ptr %vlist, align 8
  %7 = getelementptr inbounds i8, ptr %argp.cur16, i32 7
  %argp.cur16.aligned = call ptr @llvm.ptrmask.p0.i64(ptr %7, i64 -8)
  %argp.next17 = getelementptr inbounds i8, ptr %argp.cur16.aligned, i64 8
  store ptr %argp.next17, ptr %vlist, align 8
  %8 = load double, ptr %argp.cur16.aligned, align 8
  %conv18 = sitofp i32 %conv15 to double
  %add19 = fadd double %conv18, %8
  %conv20 = fptosi double %add19 to i32
  call void @llvm.va_end.p0(ptr %vlist)
  ret i32 %conv20
}
; Intrinsics used by the hand-lowered va_arg sequences in the tests above/below.
declare void @llvm.va_start.p0(ptr)
declare ptr @llvm.ptrmask.p0.i64(ptr, i64)
declare void @llvm.va_end.p0(ptr)
; Caller side of @variadics1: checks that the scalar varargs (i32s, i64,
; doubles) are stored into a local depot buffer and its address is passed
; as the single vararg pointer parameter.
define dso_local i32 @foo() {
; CHECK-PTX-LABEL: foo(
; CHECK-PTX: {
; CHECK-PTX-NEXT: .local .align 8 .b8 __local_depot1[40];
; CHECK-PTX-NEXT: .reg .b64 %SP;
; CHECK-PTX-NEXT: .reg .b64 %SPL;
; CHECK-PTX-NEXT: .reg .b32 %r<2>;
; CHECK-PTX-NEXT: .reg .b64 %rd<2>;
; CHECK-PTX-EMPTY:
; CHECK-PTX-NEXT: // %bb.0: // %entry
; CHECK-PTX-NEXT: mov.b64 %SPL, __local_depot1;
; CHECK-PTX-NEXT: cvta.local.u64 %SP, %SPL;
; CHECK-PTX-NEXT: st.b64 [%SP], 4294967297;
; CHECK-PTX-NEXT: st.b32 [%SP+8], 1;
; CHECK-PTX-NEXT: st.b64 [%SP+16], 1;
; CHECK-PTX-NEXT: st.b64 [%SP+24], 4607182418800017408;
; CHECK-PTX-NEXT: st.b64 [%SP+32], 4607182418800017408;
; CHECK-PTX-NEXT: { // callseq 0, 0
; CHECK-PTX-NEXT: .param .b32 param0;
; CHECK-PTX-NEXT: .param .b64 param1;
; CHECK-PTX-NEXT: .param .b32 retval0;
; CHECK-PTX-NEXT: add.u64 %rd1, %SP, 0;
; CHECK-PTX-NEXT: st.param.b64 [param1], %rd1;
; CHECK-PTX-NEXT: st.param.b32 [param0], 1;
; CHECK-PTX-NEXT: call.uni (retval0), variadics1, (param0, param1);
; CHECK-PTX-NEXT: ld.param.b32 %r1, [retval0];
; CHECK-PTX-NEXT: } // callseq 0
; CHECK-PTX-NEXT: st.param.b32 [func_retval0], %r1;
; CHECK-PTX-NEXT: ret;
entry:
  ; The extensions fold to constants; all varargs are compile-time 1 / 1.0.
  %conv = sext i8 1 to i32
  %conv1 = sext i16 1 to i32
  %conv2 = fpext float 1.000000e+00 to double
  %call = call i32 (i32, ...) @variadics1(i32 noundef 1, i32 noundef %conv, i32 noundef %conv1, i32 noundef 1, i64 noundef 1, double noundef %conv2, double noundef 1.000000e+00)
  ret i32 %call
}
; Reads a single %struct.S1 vararg that SROA has split into pieces:
; the i32 field, the i8 field, a 3-byte padding copy (memcpy into a
; local), and the i64 field at offset 8, all from an 8-byte-aligned slot.
define dso_local i32 @variadics2(i32 noundef %first, ...) {
; CHECK-PTX-LABEL: variadics2(
; CHECK-PTX: {
; CHECK-PTX-NEXT: .local .align 1 .b8 __local_depot2[3];
; CHECK-PTX-NEXT: .reg .b64 %SP;
; CHECK-PTX-NEXT: .reg .b64 %SPL;
; CHECK-PTX-NEXT: .reg .b16 %rs<4>;
; CHECK-PTX-NEXT: .reg .b32 %r<6>;
; CHECK-PTX-NEXT: .reg .b64 %rd<8>;
; CHECK-PTX-EMPTY:
; CHECK-PTX-NEXT: // %bb.0: // %entry
; CHECK-PTX-NEXT: mov.b64 %SPL, __local_depot2;
; CHECK-PTX-NEXT: ld.param.b32 %r1, [variadics2_param_0];
; CHECK-PTX-NEXT: ld.param.b64 %rd1, [variadics2_param_1];
; CHECK-PTX-NEXT: add.u64 %rd2, %SPL, 0;
; CHECK-PTX-NEXT: add.s64 %rd3, %rd1, 7;
; CHECK-PTX-NEXT: and.b64 %rd4, %rd3, -8;
; CHECK-PTX-NEXT: ld.b32 %r2, [%rd4];
; CHECK-PTX-NEXT: ld.s8 %r3, [%rd4+4];
; CHECK-PTX-NEXT: ld.b8 %rs1, [%rd4+7];
; CHECK-PTX-NEXT: st.local.b8 [%rd2+2], %rs1;
; CHECK-PTX-NEXT: ld.b8 %rs2, [%rd4+6];
; CHECK-PTX-NEXT: st.local.b8 [%rd2+1], %rs2;
; CHECK-PTX-NEXT: ld.b8 %rs3, [%rd4+5];
; CHECK-PTX-NEXT: st.local.b8 [%rd2], %rs3;
; CHECK-PTX-NEXT: ld.b64 %rd5, [%rd4+8];
; CHECK-PTX-NEXT: add.s32 %r4, %r1, %r2;
; CHECK-PTX-NEXT: add.s32 %r5, %r4, %r3;
; CHECK-PTX-NEXT: cvt.u64.u32 %rd6, %r5;
; CHECK-PTX-NEXT: add.s64 %rd7, %rd6, %rd5;
; CHECK-PTX-NEXT: st.param.b32 [func_retval0], %rd7;
; CHECK-PTX-NEXT: ret;
entry:
  %vlist = alloca ptr, align 8
  ; Local copy of the struct's 3 padding bytes (offsets 5..7 of S1).
  %s1.sroa.3 = alloca [3 x i8], align 1
  call void @llvm.va_start.p0(ptr %vlist)
  ; Align the cursor to 8 for the 16-byte struct slot, then advance past it.
  %argp.cur = load ptr, ptr %vlist, align 8
  %0 = getelementptr inbounds i8, ptr %argp.cur, i32 7
  %argp.cur.aligned = call ptr @llvm.ptrmask.p0.i64(ptr %0, i64 -8)
  %argp.next = getelementptr inbounds i8, ptr %argp.cur.aligned, i64 16
  store ptr %argp.next, ptr %vlist, align 8
  %s1.sroa.0.0.copyload = load i32, ptr %argp.cur.aligned, align 8
  %s1.sroa.2.0.argp.cur.aligned.sroa_idx = getelementptr inbounds i8, ptr %argp.cur.aligned, i64 4
  %s1.sroa.2.0.copyload = load i8, ptr %s1.sroa.2.0.argp.cur.aligned.sroa_idx, align 4
  %s1.sroa.3.0.argp.cur.aligned.sroa_idx = getelementptr inbounds i8, ptr %argp.cur.aligned, i64 5
  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %s1.sroa.3, ptr align 1 %s1.sroa.3.0.argp.cur.aligned.sroa_idx, i64 3, i1 false)
  %s1.sroa.31.0.argp.cur.aligned.sroa_idx = getelementptr inbounds i8, ptr %argp.cur.aligned, i64 8
  %s1.sroa.31.0.copyload = load i64, ptr %s1.sroa.31.0.argp.cur.aligned.sroa_idx, align 8
  ; first + s1.i32 + sext(s1.i8) + s1.i64, truncated back to i32.
  %add = add nsw i32 %first, %s1.sroa.0.0.copyload
  %conv = sext i8 %s1.sroa.2.0.copyload to i32
  %add1 = add nsw i32 %add, %conv
  %conv2 = sext i32 %add1 to i64
  %add3 = add nsw i64 %conv2, %s1.sroa.31.0.copyload
  %conv4 = trunc i64 %add3 to i32
  call void @llvm.va_end.p0(ptr %vlist)
  ret i32 %conv4
}
; memcpy intrinsic used for the SROA padding copies in @variadics2 and @bar.
declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg)
; Caller side of @variadics2: loads the SROA'd pieces of the constant S1
; (@__const.bar.s1) and passes them as individual varargs; checks the
; pieces are reassembled into the depot buffer for the callee.
define dso_local i32 @bar() {
; CHECK-PTX-LABEL: bar(
; CHECK-PTX: {
; CHECK-PTX-NEXT: .local .align 8 .b8 __local_depot3[24];
; CHECK-PTX-NEXT: .reg .b64 %SP;
; CHECK-PTX-NEXT: .reg .b64 %SPL;
; CHECK-PTX-NEXT: .reg .b16 %rs<4>;
; CHECK-PTX-NEXT: .reg .b32 %r<2>;
; CHECK-PTX-NEXT: .reg .b64 %rd<3>;
; CHECK-PTX-EMPTY:
; CHECK-PTX-NEXT: // %bb.0: // %entry
; CHECK-PTX-NEXT: mov.b64 %SPL, __local_depot3;
; CHECK-PTX-NEXT: cvta.local.u64 %SP, %SPL;
; CHECK-PTX-NEXT: add.u64 %rd1, %SPL, 0;
; CHECK-PTX-NEXT: ld.global.nc.b8 %rs1, [__const_$_bar_$_s1+7];
; CHECK-PTX-NEXT: st.local.b8 [%rd1+2], %rs1;
; CHECK-PTX-NEXT: ld.global.nc.b8 %rs2, [__const_$_bar_$_s1+6];
; CHECK-PTX-NEXT: st.local.b8 [%rd1+1], %rs2;
; CHECK-PTX-NEXT: ld.global.nc.b8 %rs3, [__const_$_bar_$_s1+5];
; CHECK-PTX-NEXT: st.local.b8 [%rd1], %rs3;
; CHECK-PTX-NEXT: st.b32 [%SP+8], 1;
; CHECK-PTX-NEXT: st.b8 [%SP+12], 1;
; CHECK-PTX-NEXT: st.b64 [%SP+16], 1;
; CHECK-PTX-NEXT: { // callseq 1, 0
; CHECK-PTX-NEXT: .param .b32 param0;
; CHECK-PTX-NEXT: .param .b64 param1;
; CHECK-PTX-NEXT: .param .b32 retval0;
; CHECK-PTX-NEXT: add.u64 %rd2, %SP, 8;
; CHECK-PTX-NEXT: st.param.b64 [param1], %rd2;
; CHECK-PTX-NEXT: st.param.b32 [param0], 1;
; CHECK-PTX-NEXT: call.uni (retval0), variadics2, (param0, param1);
; CHECK-PTX-NEXT: ld.param.b32 %r1, [retval0];
; CHECK-PTX-NEXT: } // callseq 1
; CHECK-PTX-NEXT: st.param.b32 [func_retval0], %r1;
; CHECK-PTX-NEXT: ret;
entry:
  ; SROA'd copy of the constant struct: i32 field, i8 field, 3 padding
  ; bytes (memcpy), and the i64 field at offset 8.
  %s1.sroa.3 = alloca [3 x i8], align 1
  %s1.sroa.0.0.copyload = load i32, ptr @__const.bar.s1, align 8
  %s1.sroa.2.0.copyload = load i8, ptr getelementptr inbounds (i8, ptr @__const.bar.s1, i64 4), align 4
  call void @llvm.memcpy.p0.p0.i64(ptr align 1 %s1.sroa.3, ptr align 1 getelementptr inbounds (i8, ptr @__const.bar.s1, i64 5), i64 3, i1 false)
  %s1.sroa.31.0.copyload = load i64, ptr getelementptr inbounds (i8, ptr @__const.bar.s1, i64 8), align 8
  %call = call i32 (i32, ...) @variadics2(i32 noundef 1, i32 %s1.sroa.0.0.copyload, i8 %s1.sroa.2.0.copyload, i64 %s1.sroa.31.0.copyload)
  ret i32 %call
}
; Reads a single <4 x i32> vararg: the cursor is realigned to 16 bytes
; (add 15, mask -16), the vector is loaded, and its four lanes are summed.
; Checks the load becomes a single vectorized ld.v4.b32.
define dso_local i32 @variadics3(i32 noundef %first, ...) {
; CHECK-PTX-LABEL: variadics3(
; CHECK-PTX: {
; CHECK-PTX-NEXT: .reg .b32 %r<8>;
; CHECK-PTX-NEXT: .reg .b64 %rd<4>;
; CHECK-PTX-EMPTY:
; CHECK-PTX-NEXT: // %bb.0: // %entry
; CHECK-PTX-NEXT: ld.param.b64 %rd1, [variadics3_param_1];
; CHECK-PTX-NEXT: add.s64 %rd2, %rd1, 15;
; CHECK-PTX-NEXT: and.b64 %rd3, %rd2, -16;
; CHECK-PTX-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd3];
; CHECK-PTX-NEXT: add.s32 %r5, %r1, %r2;
; CHECK-PTX-NEXT: add.s32 %r6, %r5, %r3;
; CHECK-PTX-NEXT: add.s32 %r7, %r6, %r4;
; CHECK-PTX-NEXT: st.param.b32 [func_retval0], %r7;
; CHECK-PTX-NEXT: ret;
entry:
  %vlist = alloca ptr, align 8
  call void @llvm.va_start.p0(ptr %vlist)
  %argp.cur = load ptr, ptr %vlist, align 8
  ; 16-byte realignment for the vector slot.
  %0 = getelementptr inbounds i8, ptr %argp.cur, i32 15
  %argp.cur.aligned = call ptr @llvm.ptrmask.p0.i64(ptr %0, i64 -16)
  %argp.next = getelementptr inbounds i8, ptr %argp.cur.aligned, i64 16
  store ptr %argp.next, ptr %vlist, align 8
  %1 = load <4 x i32>, ptr %argp.cur.aligned, align 16
  call void @llvm.va_end.p0(ptr %vlist)
  ; Horizontal sum of the four lanes.
  %2 = extractelement <4 x i32> %1, i64 0
  %3 = extractelement <4 x i32> %1, i64 1
  %add = add nsw i32 %2, %3
  %4 = extractelement <4 x i32> %1, i64 2
  %add1 = add nsw i32 %add, %4
  %5 = extractelement <4 x i32> %1, i64 3
  %add2 = add nsw i32 %add1, %5
  ret i32 %add2
}
; Caller side of @variadics3: checks the constant <4 x i32> splat vararg
; is stored with a single vectorized st.v4.b32 into a 16-byte-aligned depot.
define dso_local i32 @baz() {
; CHECK-PTX-LABEL: baz(
; CHECK-PTX: {
; CHECK-PTX-NEXT: .local .align 16 .b8 __local_depot5[16];
; CHECK-PTX-NEXT: .reg .b64 %SP;
; CHECK-PTX-NEXT: .reg .b64 %SPL;
; CHECK-PTX-NEXT: .reg .b32 %r<2>;
; CHECK-PTX-NEXT: .reg .b64 %rd<2>;
; CHECK-PTX-EMPTY:
; CHECK-PTX-NEXT: // %bb.0: // %entry
; CHECK-PTX-NEXT: mov.b64 %SPL, __local_depot5;
; CHECK-PTX-NEXT: cvta.local.u64 %SP, %SPL;
; CHECK-PTX-NEXT: st.v4.b32 [%SP], {1, 1, 1, 1};
; CHECK-PTX-NEXT: { // callseq 2, 0
; CHECK-PTX-NEXT: .param .b32 param0;
; CHECK-PTX-NEXT: .param .b64 param1;
; CHECK-PTX-NEXT: .param .b32 retval0;
; CHECK-PTX-NEXT: add.u64 %rd1, %SP, 0;
; CHECK-PTX-NEXT: st.param.b64 [param1], %rd1;
; CHECK-PTX-NEXT: st.param.b32 [param0], 1;
; CHECK-PTX-NEXT: call.uni (retval0), variadics3, (param0, param1);
; CHECK-PTX-NEXT: ld.param.b32 %r1, [retval0];
; CHECK-PTX-NEXT: } // callseq 2
; CHECK-PTX-NEXT: st.param.b32 [func_retval0], %r1;
; CHECK-PTX-NEXT: ret;
entry:
  %call = call i32 (i32, ...) @variadics3(i32 noundef 1, <4 x i32> noundef <i32 1, i32 1, i32 1, i32 1>)
  ret i32 %call
}
; Mixes a byval struct parameter with a vararg: reads one 8-byte-aligned
; i64 vararg, adds both fields of the byval %struct.S2, and returns the
; truncated sum. The byval fields are loaded directly from the param space.
define dso_local i32 @variadics4(ptr noundef byval(%struct.S2) align 8 %first, ...) {
; CHECK-PTX-LABEL: variadics4(
; CHECK-PTX: {
; CHECK-PTX-NEXT: .reg .b64 %rd<9>;
; CHECK-PTX-EMPTY:
; CHECK-PTX-NEXT: // %bb.0: // %entry
; CHECK-PTX-NEXT: ld.param.b64 %rd1, [variadics4_param_1];
; CHECK-PTX-NEXT: add.s64 %rd2, %rd1, 7;
; CHECK-PTX-NEXT: and.b64 %rd3, %rd2, -8;
; CHECK-PTX-NEXT: ld.b64 %rd4, [%rd3];
; CHECK-PTX-NEXT: ld.param.b64 %rd5, [variadics4_param_0];
; CHECK-PTX-NEXT: ld.param.b64 %rd6, [variadics4_param_0+8];
; CHECK-PTX-NEXT: add.s64 %rd7, %rd5, %rd6;
; CHECK-PTX-NEXT: add.s64 %rd8, %rd7, %rd4;
; CHECK-PTX-NEXT: st.param.b32 [func_retval0], %rd8;
; CHECK-PTX-NEXT: ret;
entry:
  %vlist = alloca ptr, align 8
  call void @llvm.va_start.p0(ptr %vlist)
  ; One i64 vararg from an 8-byte-aligned slot.
  %argp.cur = load ptr, ptr %vlist, align 8
  %0 = getelementptr inbounds i8, ptr %argp.cur, i32 7
  %argp.cur.aligned = call ptr @llvm.ptrmask.p0.i64(ptr %0, i64 -8)
  %argp.next = getelementptr inbounds i8, ptr %argp.cur.aligned, i64 8
  store ptr %argp.next, ptr %vlist, align 8
  %1 = load i64, ptr %argp.cur.aligned, align 8
  ; Both i64 fields of the byval struct.
  %x1 = getelementptr inbounds %struct.S2, ptr %first, i32 0, i32 0
  %2 = load i64, ptr %x1, align 8
  %y = getelementptr inbounds %struct.S2, ptr %first, i32 0, i32 1
  %3 = load i64, ptr %y, align 8
  %add = add nsw i64 %2, %3
  %add2 = add nsw i64 %add, %1
  %conv = trunc i64 %add2 to i32
  call void @llvm.va_end.p0(ptr %vlist)
  ret i32 %conv
}
; Caller side of @variadics4: copies the constant S2 into a local, passes
; it byval (staged through param0[16]) alongside one i64 vararg staged in
; the depot; the i32 result of the call is deliberately ignored.
define dso_local void @qux() {
; CHECK-PTX-LABEL: qux(
; CHECK-PTX: {
; CHECK-PTX-NEXT: .local .align 8 .b8 __local_depot7[24];
; CHECK-PTX-NEXT: .reg .b64 %SP;
; CHECK-PTX-NEXT: .reg .b64 %SPL;
; CHECK-PTX-NEXT: .reg .b64 %rd<7>;
; CHECK-PTX-EMPTY:
; CHECK-PTX-NEXT: // %bb.0: // %entry
; CHECK-PTX-NEXT: mov.b64 %SPL, __local_depot7;
; CHECK-PTX-NEXT: cvta.local.u64 %SP, %SPL;
; CHECK-PTX-NEXT: add.u64 %rd1, %SPL, 0;
; CHECK-PTX-NEXT: ld.global.nc.b64 %rd2, [__const_$_qux_$_s+8];
; CHECK-PTX-NEXT: st.local.b64 [%rd1+8], %rd2;
; CHECK-PTX-NEXT: ld.global.nc.b64 %rd3, [__const_$_qux_$_s];
; CHECK-PTX-NEXT: st.local.b64 [%rd1], %rd3;
; CHECK-PTX-NEXT: st.b64 [%SP+16], 1;
; CHECK-PTX-NEXT: { // callseq 3, 0
; CHECK-PTX-NEXT: .param .align 8 .b8 param0[16];
; CHECK-PTX-NEXT: .param .b64 param1;
; CHECK-PTX-NEXT: .param .b32 retval0;
; CHECK-PTX-NEXT: add.u64 %rd4, %SP, 16;
; CHECK-PTX-NEXT: st.param.b64 [param1], %rd4;
; CHECK-PTX-NEXT: ld.local.b64 %rd5, [%rd1+8];
; CHECK-PTX-NEXT: st.param.b64 [param0+8], %rd5;
; CHECK-PTX-NEXT: ld.local.b64 %rd6, [%rd1];
; CHECK-PTX-NEXT: st.param.b64 [param0], %rd6;
; CHECK-PTX-NEXT: call.uni (retval0), variadics4, (param0, param1);
; CHECK-PTX-NEXT: } // callseq 3
; CHECK-PTX-NEXT: ret;
entry:
  %s = alloca %struct.S2, align 8
  call void @llvm.memcpy.p0.p0.i64(ptr align 8 %s, ptr align 8 @__const.qux.s, i64 16, i1 false)
  %call = call i32 (ptr, ...) @variadics4(ptr noundef byval(%struct.S2) align 8 %s, i64 noundef 1)
  ret void
}