; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: opt < %s -S -nvptx-lower-args --mtriple nvptx64-nvidia-cuda -mcpu=sm_70 -mattr=+ptx77 | FileCheck %s --check-prefixes OPT
; RUN: llc < %s --mtriple nvptx64-nvidia-cuda -mcpu=sm_70 -mattr=+ptx77 | FileCheck %s --check-prefixes PTX
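; Test lowering of byval kernel arguments that are marked "grid_constant" in
; !nvvm.annotations: loads are served directly from the param address space and
; escaping pointers are converted with cvta.param instead of forcing a copy of
; the argument into local memory.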
%struct.uint4 = type { i32, i32, i32, i32 }
@gi = dso_local addrspace(1) externally_initialized global %struct.uint4 { i32 50462976, i32 117835012, i32 185207048, i32 252579084 }, align 16
; Function Attrs: mustprogress nofree noinline norecurse nosync nounwind willreturn memory(read, inaccessiblemem: none)
; Regular functions must still make a copy. `cvta.param` does not always work there.
define dso_local noundef i32 @non_kernel_function(ptr nocapture noundef readonly byval(%struct.uint4) align 16 %a, i1 noundef zeroext %b, i32 noundef %c) local_unnamed_addr #0 {
; OPT-LABEL: define dso_local noundef i32 @non_kernel_function(
; OPT-SAME: ptr noundef readonly byval([[STRUCT_UINT4:%.*]]) align 16 captures(none) [[A:%.*]], i1 noundef zeroext [[B:%.*]], i32 noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
; OPT-NEXT: [[ENTRY:.*:]]
; OPT-NEXT: [[A_:%.*]] = select i1 [[B]], ptr [[A]], ptr addrspacecast (ptr addrspace(1) @gi to ptr)
; OPT-NEXT: [[IDX_EXT:%.*]] = sext i32 [[C]] to i64
; OPT-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[A_]], i64 [[IDX_EXT]]
; OPT-NEXT: [[TMP0:%.*]] = load i32, ptr [[ADD_PTR]], align 1
; OPT-NEXT: ret i32 [[TMP0]]
;
; PTX-LABEL: non_kernel_function(
; PTX: {
; PTX-NEXT: .reg .pred %p<2>;
; PTX-NEXT: .reg .b16 %rs<3>;
; PTX-NEXT: .reg .b32 %r<11>;
; PTX-NEXT: .reg .b64 %rd<8>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0: // %entry
; PTX-NEXT: mov.b64 %rd1, non_kernel_function_param_0;
; PTX-NEXT: cvta.local.u64 %rd2, %rd1;
; PTX-NEXT: ld.param.b8 %rs1, [non_kernel_function_param_1];
; PTX-NEXT: and.b16 %rs2, %rs1, 1;
; PTX-NEXT: setp.ne.b16 %p1, %rs2, 0;
; PTX-NEXT: mov.b64 %rd3, gi;
; PTX-NEXT: cvta.global.u64 %rd4, %rd3;
; PTX-NEXT: selp.b64 %rd5, %rd2, %rd4, %p1;
; PTX-NEXT: ld.param.s32 %rd6, [non_kernel_function_param_2];
; PTX-NEXT: add.s64 %rd7, %rd5, %rd6;
; PTX-NEXT: ld.b8 %r1, [%rd7];
; PTX-NEXT: ld.b8 %r2, [%rd7+1];
; PTX-NEXT: shl.b32 %r3, %r2, 8;
; PTX-NEXT: or.b32 %r4, %r3, %r1;
; PTX-NEXT: ld.b8 %r5, [%rd7+2];
; PTX-NEXT: shl.b32 %r6, %r5, 16;
; PTX-NEXT: ld.b8 %r7, [%rd7+3];
; PTX-NEXT: shl.b32 %r8, %r7, 24;
; PTX-NEXT: or.b32 %r9, %r8, %r6;
; PTX-NEXT: or.b32 %r10, %r9, %r4;
; PTX-NEXT: st.param.b32 [func_retval0], %r10;
; PTX-NEXT: ret;
entry:
%a. = select i1 %b, ptr %a, ptr addrspacecast (ptr addrspace(1) @gi to ptr)
%idx.ext = sext i32 %c to i64
%add.ptr = getelementptr inbounds i8, ptr %a., i64 %idx.ext
%0 = load i32, ptr %add.ptr, align 1
ret i32 %0
}
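; The grid_constant i32 is read directly from the param space; no local copy is made.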
define ptx_kernel void @grid_const_int(ptr byval(i32) align 4 %input1, i32 %input2, ptr %out, i32 %n) {
; PTX-LABEL: grid_const_int(
; PTX: {
; PTX-NEXT: .reg .b32 %r<4>;
; PTX-NEXT: .reg .b64 %rd<3>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.param.b64 %rd1, [grid_const_int_param_2];
; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1;
; PTX-NEXT: ld.param.b32 %r1, [grid_const_int_param_1];
; PTX-NEXT: ld.param.b32 %r2, [grid_const_int_param_0];
; PTX-NEXT: add.s32 %r3, %r2, %r1;
; PTX-NEXT: st.global.b32 [%rd2], %r3;
; PTX-NEXT: ret;
; OPT-LABEL: define ptx_kernel void @grid_const_int(
; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], i32 [[INPUT2:%.*]], ptr [[OUT:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
; OPT-NEXT: [[INPUT11:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
; OPT-NEXT: [[TMP:%.*]] = load i32, ptr addrspace(101) [[INPUT11]], align 4
; OPT-NEXT: [[ADD:%.*]] = add i32 [[TMP]], [[INPUT2]]
; OPT-NEXT: store i32 [[ADD]], ptr [[OUT]], align 4
; OPT-NEXT: ret void
%tmp = load i32, ptr %input1, align 4
%add = add i32 %tmp, %input2
store i32 %add, ptr %out
ret void
}
%struct.s = type { i32, i32 }
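; Both fields of the grid_constant struct are loaded straight from the param space.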
define ptx_kernel void @grid_const_struct(ptr byval(%struct.s) align 4 %input, ptr %out){
; PTX-LABEL: grid_const_struct(
; PTX: {
; PTX-NEXT: .reg .b32 %r<4>;
; PTX-NEXT: .reg .b64 %rd<3>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.param.b64 %rd1, [grid_const_struct_param_1];
; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1;
; PTX-NEXT: ld.param.b32 %r1, [grid_const_struct_param_0];
; PTX-NEXT: ld.param.b32 %r2, [grid_const_struct_param_0+4];
; PTX-NEXT: add.s32 %r3, %r1, %r2;
; PTX-NEXT: st.global.b32 [%rd2], %r3;
; PTX-NEXT: ret;
; OPT-LABEL: define ptx_kernel void @grid_const_struct(
; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[OUT:%.*]]) #[[ATTR0]] {
; OPT-NEXT: [[INPUT1:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
; OPT-NEXT: [[GEP13:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 0
; OPT-NEXT: [[GEP22:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 1
; OPT-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(101) [[GEP13]], align 4
; OPT-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(101) [[GEP22]], align 4
; OPT-NEXT: [[ADD:%.*]] = add i32 [[TMP1]], [[TMP2]]
; OPT-NEXT: store i32 [[ADD]], ptr [[OUT]], align 4
; OPT-NEXT: ret void
%gep1 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 0
%gep2 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 1
%int1 = load i32, ptr %gep1
%int2 = load i32, ptr %gep2
%add = add i32 %int1, %int2
store i32 %add, ptr %out
ret void
}
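; The grid_constant pointer escapes into a call, so it is converted to a generic
; pointer with cvta.param rather than copied to a local buffer.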
define ptx_kernel void @grid_const_escape(ptr byval(%struct.s) align 4 %input) {
; PTX-LABEL: grid_const_escape(
; PTX: {
; PTX-NEXT: .reg .b64 %rd<4>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: mov.b64 %rd1, grid_const_escape_param_0;
; PTX-NEXT: cvta.param.u64 %rd2, %rd1;
; PTX-NEXT: { // callseq 0, 0
; PTX-NEXT: .param .b64 param0;
; PTX-NEXT: .param .b32 retval0;
; PTX-NEXT: st.param.b64 [param0], %rd2;
; PTX-NEXT: prototype_0 : .callprototype (.param .b32 _) _ (.param .b64 _);
; PTX-NEXT: mov.b64 %rd3, escape;
; PTX-NEXT: call (retval0), %rd3, (param0), prototype_0;
; PTX-NEXT: } // callseq 0
; PTX-NEXT: ret;
; OPT-LABEL: define ptx_kernel void @grid_const_escape(
; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]]) #[[ATTR0]] {
; OPT-NEXT: [[TMP1:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
; OPT-NEXT: [[INPUT_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
; OPT-NEXT: [[CALL:%.*]] = call i32 @escape(ptr [[INPUT_PARAM_GEN]])
; OPT-NEXT: ret void
%call = call i32 @escape(ptr %input)
ret void
}
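; %input and %b are grid_constant (arguments 1 and 3, see !7) and escape via
; cvta.param; the plain i32 %a still needs a local alloca so its address can be
; passed to the callee.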
define ptx_kernel void @multiple_grid_const_escape(ptr byval(%struct.s) align 4 %input, i32 %a, ptr byval(i32) align 4 %b) {
; PTX-LABEL: multiple_grid_const_escape(
; PTX: {
; PTX-NEXT: .local .align 4 .b8 __local_depot4[4];
; PTX-NEXT: .reg .b64 %SP;
; PTX-NEXT: .reg .b64 %SPL;
; PTX-NEXT: .reg .b32 %r<2>;
; PTX-NEXT: .reg .b64 %rd<8>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: mov.b64 %SPL, __local_depot4;
; PTX-NEXT: cvta.local.u64 %SP, %SPL;
; PTX-NEXT: mov.b64 %rd1, multiple_grid_const_escape_param_0;
; PTX-NEXT: ld.param.b32 %r1, [multiple_grid_const_escape_param_1];
; PTX-NEXT: mov.b64 %rd2, multiple_grid_const_escape_param_2;
; PTX-NEXT: cvta.param.u64 %rd3, %rd2;
; PTX-NEXT: cvta.param.u64 %rd4, %rd1;
; PTX-NEXT: add.u64 %rd5, %SP, 0;
; PTX-NEXT: add.u64 %rd6, %SPL, 0;
; PTX-NEXT: st.local.b32 [%rd6], %r1;
; PTX-NEXT: { // callseq 1, 0
; PTX-NEXT: .param .b64 param0;
; PTX-NEXT: .param .b64 param1;
; PTX-NEXT: .param .b64 param2;
; PTX-NEXT: .param .b32 retval0;
; PTX-NEXT: st.param.b64 [param2], %rd3;
; PTX-NEXT: st.param.b64 [param1], %rd5;
; PTX-NEXT: st.param.b64 [param0], %rd4;
; PTX-NEXT: prototype_1 : .callprototype (.param .b32 _) _ (.param .b64 _, .param .b64 _, .param .b64 _);
; PTX-NEXT: mov.b64 %rd7, escape3;
; PTX-NEXT: call (retval0), %rd7, (param0, param1, param2), prototype_1;
; PTX-NEXT: } // callseq 1
; PTX-NEXT: ret;
; OPT-LABEL: define ptx_kernel void @multiple_grid_const_escape(
; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], i32 [[A:%.*]], ptr byval(i32) align 4 [[B:%.*]]) #[[ATTR0]] {
; OPT-NEXT: [[TMP1:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[B]])
; OPT-NEXT: [[B_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
; OPT-NEXT: [[TMP2:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
; OPT-NEXT: [[INPUT_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP2]] to ptr
; OPT-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
; OPT-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
; OPT-NEXT: [[CALL:%.*]] = call i32 @escape3(ptr [[INPUT_PARAM_GEN]], ptr [[A_ADDR]], ptr [[B_PARAM_GEN]])
; OPT-NEXT: ret void
%a.addr = alloca i32, align 4
store i32 %a, ptr %a.addr, align 4
%call = call i32 @escape3(ptr %input, ptr %a.addr, ptr %b)
ret void
}
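; Storing the grid_constant pointer to memory also counts as an escape: the
; stored value is the cvta.param result.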
define ptx_kernel void @grid_const_memory_escape(ptr byval(%struct.s) align 4 %input, ptr %addr) {
; PTX-LABEL: grid_const_memory_escape(
; PTX: {
; PTX-NEXT: .reg .b64 %rd<5>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: mov.b64 %rd1, grid_const_memory_escape_param_0;
; PTX-NEXT: ld.param.b64 %rd2, [grid_const_memory_escape_param_1];
; PTX-NEXT: cvta.to.global.u64 %rd3, %rd2;
; PTX-NEXT: cvta.param.u64 %rd4, %rd1;
; PTX-NEXT: st.global.b64 [%rd3], %rd4;
; PTX-NEXT: ret;
; OPT-LABEL: define ptx_kernel void @grid_const_memory_escape(
; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[ADDR:%.*]]) #[[ATTR0]] {
; OPT-NEXT: [[TMP1:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
; OPT-NEXT: [[INPUT1:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
; OPT-NEXT: store ptr [[INPUT1]], ptr [[ADDR]], align 8
; OPT-NEXT: ret void
store ptr %input, ptr %addr, align 8
ret void
}
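; Pointers into the grid_constant struct escape into inline asm; cvta.param is
; used and no .local copy is created.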
define ptx_kernel void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4 %input, ptr %result) {
; PTX-LABEL: grid_const_inlineasm_escape(
; PTX: {
; PTX-NEXT: .reg .b64 %rd<7>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: mov.b64 %rd4, grid_const_inlineasm_escape_param_0;
; PTX-NEXT: ld.param.b64 %rd5, [grid_const_inlineasm_escape_param_1];
; PTX-NEXT: cvta.to.global.u64 %rd6, %rd5;
; PTX-NEXT: cvta.param.u64 %rd2, %rd4;
; PTX-NEXT: add.s64 %rd3, %rd2, 4;
; PTX-NEXT: // begin inline asm
; PTX-NEXT: add.s64 %rd1, %rd2, %rd3;
; PTX-NEXT: // end inline asm
; PTX-NEXT: st.global.b64 [%rd6], %rd1;
; PTX-NEXT: ret;
; PTX-NOT: .local
; OPT-LABEL: define ptx_kernel void @grid_const_inlineasm_escape(
; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[RESULT:%.*]]) #[[ATTR0]] {
; OPT-NEXT: [[TMP1:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
; OPT-NEXT: [[INPUT1:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
; OPT-NEXT: [[TMPPTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 0
; OPT-NEXT: [[TMPPTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 1
; OPT-NEXT: [[TMP2:%.*]] = call i64 asm "add.s64 $0, $1, $2
; OPT-NEXT: store i64 [[TMP2]], ptr [[RESULT]], align 8
; OPT-NEXT: ret void
%tmpptr1 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 0
%tmpptr2 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 1
%1 = call i64 asm "add.s64 $0, $1, $2;", "=l,l,l"(ptr %tmpptr1, ptr %tmpptr2) #1
store i64 %1, ptr %result, align 8
ret void
}
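; Mixed use: the value is loaded directly from the param space while the
; pointer also escapes into the call via cvta.param.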
define ptx_kernel void @grid_const_partial_escape(ptr byval(i32) %input, ptr %output) {
; PTX-LABEL: grid_const_partial_escape(
; PTX: {
; PTX-NEXT: .reg .b32 %r<3>;
; PTX-NEXT: .reg .b64 %rd<6>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: mov.b64 %rd1, grid_const_partial_escape_param_0;
; PTX-NEXT: ld.param.b64 %rd2, [grid_const_partial_escape_param_1];
; PTX-NEXT: cvta.to.global.u64 %rd3, %rd2;
; PTX-NEXT: cvta.param.u64 %rd4, %rd1;
; PTX-NEXT: ld.param.b32 %r1, [grid_const_partial_escape_param_0];
; PTX-NEXT: add.s32 %r2, %r1, %r1;
; PTX-NEXT: st.global.b32 [%rd3], %r2;
; PTX-NEXT: { // callseq 2, 0
; PTX-NEXT: .param .b64 param0;
; PTX-NEXT: .param .b32 retval0;
; PTX-NEXT: st.param.b64 [param0], %rd4;
; PTX-NEXT: prototype_2 : .callprototype (.param .b32 _) _ (.param .b64 _);
; PTX-NEXT: mov.b64 %rd5, escape;
; PTX-NEXT: call (retval0), %rd5, (param0), prototype_2;
; PTX-NEXT: } // callseq 2
; PTX-NEXT: ret;
; OPT-LABEL: define ptx_kernel void @grid_const_partial_escape(
; OPT-SAME: ptr byval(i32) [[INPUT:%.*]], ptr [[OUTPUT:%.*]]) #[[ATTR0]] {
; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
; OPT-NEXT: [[INPUT1_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
; OPT-NEXT: [[VAL1:%.*]] = load i32, ptr [[INPUT1_GEN]], align 4
; OPT-NEXT: [[TWICE:%.*]] = add i32 [[VAL1]], [[VAL1]]
; OPT-NEXT: store i32 [[TWICE]], ptr [[OUTPUT]], align 4
; OPT-NEXT: [[CALL:%.*]] = call i32 @escape(ptr [[INPUT1_GEN]])
; OPT-NEXT: ret void
%val = load i32, ptr %input
%twice = add i32 %val, %val
store i32 %twice, ptr %output
%call = call i32 @escape(ptr %input)
ret void
}
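; Both fields are read from the param space, the pointer is stored to memory,
; and a field pointer escapes into a call.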
define ptx_kernel i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input, ptr %output) {
; PTX-LABEL: grid_const_partial_escapemem(
; PTX: {
; PTX-NEXT: .reg .b32 %r<4>;
; PTX-NEXT: .reg .b64 %rd<6>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: mov.b64 %rd1, grid_const_partial_escapemem_param_0;
; PTX-NEXT: ld.param.b64 %rd2, [grid_const_partial_escapemem_param_1];
; PTX-NEXT: cvta.to.global.u64 %rd3, %rd2;
; PTX-NEXT: cvta.param.u64 %rd4, %rd1;
; PTX-NEXT: ld.param.b32 %r1, [grid_const_partial_escapemem_param_0];
; PTX-NEXT: ld.param.b32 %r2, [grid_const_partial_escapemem_param_0+4];
; PTX-NEXT: st.global.b64 [%rd3], %rd4;
; PTX-NEXT: add.s32 %r3, %r1, %r2;
; PTX-NEXT: { // callseq 3, 0
; PTX-NEXT: .param .b64 param0;
; PTX-NEXT: .param .b32 retval0;
; PTX-NEXT: st.param.b64 [param0], %rd4;
; PTX-NEXT: prototype_3 : .callprototype (.param .b32 _) _ (.param .b64 _);
; PTX-NEXT: mov.b64 %rd5, escape;
; PTX-NEXT: call (retval0), %rd5, (param0), prototype_3;
; PTX-NEXT: } // callseq 3
; PTX-NEXT: st.param.b32 [func_retval0], %r3;
; PTX-NEXT: ret;
; OPT-LABEL: define ptx_kernel i32 @grid_const_partial_escapemem(
; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) [[INPUT:%.*]], ptr [[OUTPUT:%.*]]) #[[ATTR0]] {
; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
; OPT-NEXT: [[INPUT1:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
; OPT-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 0
; OPT-NEXT: [[VAL1:%.*]] = load i32, ptr [[PTR1]], align 4
; OPT-NEXT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 1
; OPT-NEXT: [[VAL2:%.*]] = load i32, ptr [[PTR2]], align 4
; OPT-NEXT: store ptr [[INPUT1]], ptr [[OUTPUT]], align 8
; OPT-NEXT: [[ADD:%.*]] = add i32 [[VAL1]], [[VAL2]]
; OPT-NEXT: [[CALL2:%.*]] = call i32 @escape(ptr [[PTR1]])
; OPT-NEXT: ret i32 [[ADD]]
%ptr1 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 0
%val1 = load i32, ptr %ptr1
%ptr2 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 1
%val2 = load i32, ptr %ptr2
store ptr %input, ptr %output
%add = add i32 %val1, %val2
%call2 = call i32 @escape(ptr %ptr1)
ret i32 %add
}
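; A phi over two offsets into the grid_constant struct; the merged pointer is
; still loaded through the param-space address.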
define ptx_kernel void @grid_const_phi(ptr byval(%struct.s) align 4 %input1, ptr %inout) {
; PTX-LABEL: grid_const_phi(
; PTX: {
; PTX-NEXT: .reg .pred %p<2>;
; PTX-NEXT: .reg .b32 %r<3>;
; PTX-NEXT: .reg .b64 %rd<4>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: mov.b64 %rd3, grid_const_phi_param_0;
; PTX-NEXT: ld.param.b64 %rd2, [grid_const_phi_param_1];
; PTX-NEXT: cvta.to.global.u64 %rd1, %rd2;
; PTX-NEXT: ld.global.b32 %r1, [%rd1];
; PTX-NEXT: setp.lt.s32 %p1, %r1, 0;
; PTX-NEXT: @%p1 bra $L__BB9_2;
; PTX-NEXT: // %bb.1: // %second
; PTX-NEXT: add.s64 %rd3, %rd3, 4;
; PTX-NEXT: $L__BB9_2: // %merge
; PTX-NEXT: ld.param.b32 %r2, [%rd3];
; PTX-NEXT: st.global.b32 [%rd1], %r2;
; PTX-NEXT: ret;
; OPT-LABEL: define ptx_kernel void @grid_const_phi(
; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] {
; OPT-NEXT: [[TMP1:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
; OPT-NEXT: [[INPUT1_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
; OPT-NEXT: [[VAL:%.*]] = load i32, ptr [[INOUT]], align 4
; OPT-NEXT: [[LESS:%.*]] = icmp slt i32 [[VAL]], 0
; OPT-NEXT: br i1 [[LESS]], label %[[FIRST:.*]], label %[[SECOND:.*]]
; OPT: [[FIRST]]:
; OPT-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 0
; OPT-NEXT: br label %[[MERGE:.*]]
; OPT: [[SECOND]]:
; OPT-NEXT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 1
; OPT-NEXT: br label %[[MERGE]]
; OPT: [[MERGE]]:
; OPT-NEXT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ]
; OPT-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
; OPT-NEXT: store i32 [[VALLOADED]], ptr [[INOUT]], align 4
; OPT-NEXT: ret void
%val = load i32, ptr %inout
%less = icmp slt i32 %val, 0
br i1 %less, label %first, label %second
first:
%ptr1 = getelementptr inbounds %struct.s, ptr %input1, i32 0, i32 0
br label %merge
second:
%ptr2 = getelementptr inbounds %struct.s, ptr %input1, i32 0, i32 1
br label %merge
merge:
%ptrnew = phi ptr [%ptr1, %first], [%ptr2, %second]
%valloaded = load i32, ptr %ptrnew
store i32 %valloaded, ptr %inout
ret void
}
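; Like @grid_const_phi, but the phi mixes a grid_constant pointer with a
; regular byval pointer.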
; NOTE: %input2 is *not* grid_constant
define ptx_kernel void @grid_const_phi_ngc(ptr byval(%struct.s) align 4 %input1, ptr byval(%struct.s) %input2, ptr %inout) {
; PTX-LABEL: grid_const_phi_ngc(
; PTX: {
; PTX-NEXT: .reg .pred %p<2>;
; PTX-NEXT: .reg .b32 %r<3>;
; PTX-NEXT: .reg .b64 %rd<5>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: mov.b64 %rd4, grid_const_phi_ngc_param_0;
; PTX-NEXT: ld.param.b64 %rd3, [grid_const_phi_ngc_param_2];
; PTX-NEXT: cvta.to.global.u64 %rd1, %rd3;
; PTX-NEXT: ld.global.b32 %r1, [%rd1];
; PTX-NEXT: setp.lt.s32 %p1, %r1, 0;
; PTX-NEXT: @%p1 bra $L__BB10_2;
; PTX-NEXT: // %bb.1: // %second
; PTX-NEXT: mov.b64 %rd2, grid_const_phi_ngc_param_1;
; PTX-NEXT: add.s64 %rd4, %rd2, 4;
; PTX-NEXT: $L__BB10_2: // %merge
; PTX-NEXT: ld.param.b32 %r2, [%rd4];
; PTX-NEXT: st.global.b32 [%rd1], %r2;
; PTX-NEXT: ret;
; OPT-LABEL: define ptx_kernel void @grid_const_phi_ngc(
; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] {
; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]])
; OPT-NEXT: [[INPUT2_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
; OPT-NEXT: [[TMP2:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
; OPT-NEXT: [[INPUT1_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP2]] to ptr
; OPT-NEXT: [[VAL:%.*]] = load i32, ptr [[INOUT]], align 4
; OPT-NEXT: [[LESS:%.*]] = icmp slt i32 [[VAL]], 0
; OPT-NEXT: br i1 [[LESS]], label %[[FIRST:.*]], label %[[SECOND:.*]]
; OPT: [[FIRST]]:
; OPT-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 0
; OPT-NEXT: br label %[[MERGE:.*]]
; OPT: [[SECOND]]:
; OPT-NEXT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT2_PARAM_GEN]], i32 0, i32 1
; OPT-NEXT: br label %[[MERGE]]
; OPT: [[MERGE]]:
; OPT-NEXT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ]
; OPT-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
; OPT-NEXT: store i32 [[VALLOADED]], ptr [[INOUT]], align 4
; OPT-NEXT: ret void
%val = load i32, ptr %inout
%less = icmp slt i32 %val, 0
br i1 %less, label %first, label %second
first:
%ptr1 = getelementptr inbounds %struct.s, ptr %input1, i32 0, i32 0
br label %merge
second:
%ptr2 = getelementptr inbounds %struct.s, ptr %input2, i32 0, i32 1
br label %merge
merge:
%ptrnew = phi ptr [%ptr1, %first], [%ptr2, %second]
%valloaded = load i32, ptr %ptrnew
store i32 %valloaded, ptr %inout
ret void
}
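; Select between a grid_constant pointer and a regular byval pointer.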
; NOTE: %input2 is *not* grid_constant
define ptx_kernel void @grid_const_select(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2, ptr %inout) {
; PTX-LABEL: grid_const_select(
; PTX: {
; PTX-NEXT: .reg .pred %p<2>;
; PTX-NEXT: .reg .b32 %r<3>;
; PTX-NEXT: .reg .b64 %rd<6>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: mov.b64 %rd1, grid_const_select_param_0;
; PTX-NEXT: ld.param.b64 %rd2, [grid_const_select_param_2];
; PTX-NEXT: cvta.to.global.u64 %rd3, %rd2;
; PTX-NEXT: mov.b64 %rd4, grid_const_select_param_1;
; PTX-NEXT: ld.global.b32 %r1, [%rd3];
; PTX-NEXT: setp.lt.s32 %p1, %r1, 0;
; PTX-NEXT: selp.b64 %rd5, %rd1, %rd4, %p1;
; PTX-NEXT: ld.param.b32 %r2, [%rd5];
; PTX-NEXT: st.global.b32 [%rd3], %r2;
; PTX-NEXT: ret;
; OPT-LABEL: define ptx_kernel void @grid_const_select(
; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] {
; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]])
; OPT-NEXT: [[INPUT2_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
; OPT-NEXT: [[TMP2:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
; OPT-NEXT: [[INPUT1_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP2]] to ptr
; OPT-NEXT: [[VAL:%.*]] = load i32, ptr [[INOUT]], align 4
; OPT-NEXT: [[LESS:%.*]] = icmp slt i32 [[VAL]], 0
; OPT-NEXT: [[PTRNEW:%.*]] = select i1 [[LESS]], ptr [[INPUT1_PARAM_GEN]], ptr [[INPUT2_PARAM_GEN]]
; OPT-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
; OPT-NEXT: store i32 [[VALLOADED]], ptr [[INOUT]], align 4
; OPT-NEXT: ret void
%val = load i32, ptr %inout
%less = icmp slt i32 %val, 0
%ptrnew = select i1 %less, ptr %input1, ptr %input2
%valloaded = load i32, ptr %ptrnew
store i32 %valloaded, ptr %inout
ret void
}
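; ptrtoint of the grid_constant argument uses the generic address produced by
; cvta.param, while the value itself is still loaded directly from the param space.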
define ptx_kernel i32 @grid_const_ptrtoint(ptr byval(i32) %input) {
; PTX-LABEL: grid_const_ptrtoint(
; PTX: {
; PTX-NEXT: .reg .b32 %r<4>;
; PTX-NEXT: .reg .b64 %rd<3>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: mov.b64 %rd1, grid_const_ptrtoint_param_0;
; PTX-NEXT: ld.param.b32 %r1, [grid_const_ptrtoint_param_0];
; PTX-NEXT: cvta.param.u64 %rd2, %rd1;
; PTX-NEXT: cvt.u32.u64 %r2, %rd2;
; PTX-NEXT: add.s32 %r3, %r1, %r2;
; PTX-NEXT: st.param.b32 [func_retval0], %r3;
; PTX-NEXT: ret;
; OPT-LABEL: define ptx_kernel i32 @grid_const_ptrtoint(
; OPT-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) #[[ATTR0]] {
; OPT-NEXT: [[INPUT2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
; OPT-NEXT: [[INPUT3:%.*]] = load i32, ptr addrspace(101) [[INPUT2]], align 4
; OPT-NEXT: [[INPUT1:%.*]] = addrspacecast ptr addrspace(101) [[INPUT2]] to ptr
; OPT-NEXT: [[PTRVAL:%.*]] = ptrtoint ptr [[INPUT1]] to i32
; OPT-NEXT: [[KEEPALIVE:%.*]] = add i32 [[INPUT3]], [[PTRVAL]]
; OPT-NEXT: ret i32 [[KEEPALIVE]]
%val = load i32, ptr %input
%ptrval = ptrtoint ptr %input to i32
%keepalive = add i32 %val, %ptrval
ret i32 %keepalive
}
declare void @device_func(ptr byval(i32) align 4)
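; Forwarding the grid_constant byval argument to a byval callee copies the value
; straight from the incoming param space into the outgoing call param; no local
; depot is needed.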
define ptx_kernel void @test_forward_byval_arg(ptr byval(i32) align 4 %input) {
; OPT-LABEL: define ptx_kernel void @test_forward_byval_arg(
; OPT-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) #[[ATTR0]] {
; OPT-NEXT: [[INPUT_PARAM:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
; OPT-NEXT: [[INPUT_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[INPUT_PARAM]] to ptr
; OPT-NEXT: call void @device_func(ptr byval(i32) align 4 [[INPUT_PARAM_GEN]])
; OPT-NEXT: ret void
;
; PTX-LABEL: test_forward_byval_arg(
; PTX: {
; PTX-NEXT: .reg .b32 %r<2>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: { // callseq 4, 0
; PTX-NEXT: .param .align 4 .b8 param0[4];
; PTX-NEXT: ld.param.b32 %r1, [test_forward_byval_arg_param_0];
; PTX-NEXT: st.param.b32 [param0], %r1;
; PTX-NEXT: call.uni device_func, (param0);
; PTX-NEXT: } // callseq 4
; PTX-NEXT: ret;
call void @device_func(ptr byval(i32) align 4 %input)
ret void
}
declare dso_local void @dummy() local_unnamed_addr
declare dso_local ptr @escape(ptr) local_unnamed_addr
declare dso_local ptr @escape3(ptr, ptr, ptr) local_unnamed_addr
attributes #0 = { mustprogress nofree noinline norecurse nosync nounwind willreturn memory(read, inaccessiblemem: none) }
; Minimal definition for the call-site attribute group used by the inline asm
; call above; assumed to be just nounwind.
attributes #1 = { nounwind }
!nvvm.annotations = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24}
!0 = !{ptr @grid_const_int, !"grid_constant", !1}
!1 = !{i32 1}
!2 = !{ptr @grid_const_struct, !"grid_constant", !3}
!3 = !{i32 1}
!4 = !{ptr @grid_const_escape, !"grid_constant", !5}
!5 = !{i32 1}
!6 = !{ptr @multiple_grid_const_escape, !"grid_constant", !7}
!7 = !{i32 1, i32 3}
!8 = !{ptr @grid_const_memory_escape, !"grid_constant", !9}
!9 = !{i32 1}
!10 = !{ptr @grid_const_inlineasm_escape, !"grid_constant", !11}
!11 = !{i32 1}
!12 = !{ptr @grid_const_partial_escape, !"grid_constant", !13}
!13 = !{i32 1}
!14 = !{ptr @grid_const_partial_escapemem, !"grid_constant", !15}
!15 = !{i32 1}
!16 = !{ptr @grid_const_phi, !"grid_constant", !17}
!17 = !{i32 1}
!18 = !{ptr @grid_const_phi_ngc, !"grid_constant", !19}
!19 = !{i32 1}
!20 = !{ptr @grid_const_select, !"grid_constant", !21}
!21 = !{i32 1}
!22 = !{ptr @grid_const_ptrtoint, !"grid_constant", !23}
!23 = !{i32 1}
!24 = !{ptr @test_forward_byval_arg, !"grid_constant", !25}
!25 = !{i32 1}