[NVPTXLowerArgs] Add align attribute to return value of addrspace.wrap intrinsic (#153889)

If alignment inference runs after NVPTXLowerArgs, the addrspace-wrap
intrinsics it emits can prevent computeKnownBits from deriving the
alignment of loads/stores from kernel parameters. To solve this, we insert
an alignment annotation on the generated intrinsic so that computeKnownBits
does not need to look through it to find the alignment.
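
As a rough illustration (an IR sketch mirroring the new test added below; the value names are illustrative), the pass now attaches the byval argument's alignment to the wrap call, so a later infer-alignment run can read it directly from the call instead of tracing back to the argument:

; before: the argument's align 8 is hidden behind the intrinsic
%params.param = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr %params)
%load = load i32, ptr addrspace(101) %params.param, align 4

; after: the alignment is carried on the return value
%params.param = call align 8 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr %params)
%load = load i32, ptr addrspace(101) %params.param, align 8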
Drew Kersnar, 2025-08-19 11:13:57 -05:00, committed by GitHub
parent 67ca5dad16
commit 069ad2353c
6 changed files with 96 additions and 44 deletions


@ -1027,9 +1027,16 @@ static inline bool isAddLike(const SDValue V) {
(V->getOpcode() == ISD::OR && V->getFlags().hasDisjoint());
}
static SDValue stripAssertAlign(SDValue N) {
if (N.getOpcode() == ISD::AssertAlign)
N = N.getOperand(0);
return N;
}
// selectBaseADDR - Match a dag node which will serve as the base address for an
// ADDR operand pair.
static SDValue selectBaseADDR(SDValue N, SelectionDAG *DAG) {
N = stripAssertAlign(N);
if (const auto *GA = dyn_cast<GlobalAddressSDNode>(N))
return DAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N),
GA->getValueType(0), GA->getOffset(),
@ -1044,6 +1051,7 @@ static SDValue selectBaseADDR(SDValue N, SelectionDAG *DAG) {
}
static SDValue accumulateOffset(SDValue &Addr, SDLoc DL, SelectionDAG *DAG) {
Addr = stripAssertAlign(Addr);
APInt AccumulatedOffset(64u, 0);
while (isAddLike(Addr)) {
const auto *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
@ -1055,7 +1063,7 @@ static SDValue accumulateOffset(SDValue &Addr, SDLoc DL, SelectionDAG *DAG) {
break;
AccumulatedOffset += CI;
Addr = Addr->getOperand(0);
Addr = stripAssertAlign(Addr->getOperand(0));
}
return DAG->getSignedTargetConstant(AccumulatedOffset.getSExtValue(), DL,
MVT::i32);


@ -412,6 +412,22 @@ static void adjustByValArgAlignment(Argument *Arg, Value *ArgInParamAS,
}
}
// Create a call to the nvvm_internal_addrspace_wrap intrinsic and set the
// alignment of the return value based on the alignment of the argument.
static CallInst *createNVVMInternalAddrspaceWrap(IRBuilder<> &IRB,
Argument &Arg) {
CallInst *ArgInParam =
IRB.CreateIntrinsic(Intrinsic::nvvm_internal_addrspace_wrap,
{IRB.getPtrTy(ADDRESS_SPACE_PARAM), Arg.getType()},
&Arg, {}, Arg.getName() + ".param");
if (MaybeAlign ParamAlign = Arg.getParamAlign())
ArgInParam->addRetAttr(
Attribute::getWithAlignment(ArgInParam->getContext(), *ParamAlign));
return ArgInParam;
}
namespace {
struct ArgUseChecker : PtrUseVisitor<ArgUseChecker> {
using Base = PtrUseVisitor<ArgUseChecker>;
@ -515,10 +531,7 @@ void copyByValParam(Function &F, Argument &Arg) {
Arg.getParamAlign().value_or(DL.getPrefTypeAlign(StructType)));
Arg.replaceAllUsesWith(AllocA);
Value *ArgInParam =
IRB.CreateIntrinsic(Intrinsic::nvvm_internal_addrspace_wrap,
{IRB.getPtrTy(ADDRESS_SPACE_PARAM), Arg.getType()},
&Arg, {}, Arg.getName());
CallInst *ArgInParam = createNVVMInternalAddrspaceWrap(IRB, Arg);
// Be sure to propagate alignment to this load; LLVM doesn't know that NVPTX
// addrspacecast preserves alignment. Since params are constant, this load
@ -549,9 +562,7 @@ static void handleByValParam(const NVPTXTargetMachine &TM, Argument *Arg) {
SmallVector<Use *, 16> UsesToUpdate(llvm::make_pointer_range(Arg->uses()));
IRBuilder<> IRB(&*FirstInst);
Value *ArgInParamAS = IRB.CreateIntrinsic(
Intrinsic::nvvm_internal_addrspace_wrap,
{IRB.getPtrTy(ADDRESS_SPACE_PARAM), Arg->getType()}, {Arg});
CallInst *ArgInParamAS = createNVVMInternalAddrspaceWrap(IRB, *Arg);
for (Use *U : UsesToUpdate)
convertToParamAS(U, ArgInParamAS, HasCvtaParam, IsGridConstant);
@ -581,10 +592,7 @@ static void handleByValParam(const NVPTXTargetMachine &TM, Argument *Arg) {
// argument already in the param address space, we need to use the noop
// intrinsic, this had the added benefit of preventing other optimizations
// from folding away this pair of addrspacecasts.
auto *ParamSpaceArg =
IRB.CreateIntrinsic(Intrinsic::nvvm_internal_addrspace_wrap,
{IRB.getPtrTy(ADDRESS_SPACE_PARAM), Arg->getType()},
Arg, {}, Arg->getName() + ".param");
auto *ParamSpaceArg = createNVVMInternalAddrspaceWrap(IRB, *Arg);
// Cast param address to generic address space.
Value *GenericArg = IRB.CreateAddrSpaceCast(


@ -0,0 +1,36 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt < %s -passes=nvptx-lower-args,infer-alignment -S | FileCheck %s
target triple = "nvptx64-nvidia-cuda"
; ------------------------------------------------------------------------------
; Test that alignment can be inferred through llvm.nvvm.internal.addrspace.wrap.p101.p0 intrinsics
; thanks to the alignment attribute on the intrinsic
; ------------------------------------------------------------------------------
%struct.S1 = type { i32, i32, i32, i32 }
define ptx_kernel i32 @test_align8(ptr noundef readonly byval(%struct.S1) align 8 captures(none) %params) {
; CHECK-LABEL: define ptx_kernel i32 @test_align8(
; CHECK-SAME: ptr noundef readonly byval([[STRUCT_S1:%.*]]) align 8 captures(none) [[PARAMS:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = call align 8 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[PARAMS]])
; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(101) [[TMP0]], align 8
; CHECK-NEXT: ret i32 [[LOAD]]
;
entry:
%load = load i32, ptr %params, align 4
ret i32 %load
}
define ptx_kernel i32 @test_align1(ptr noundef readonly byval(%struct.S1) align 1 captures(none) %params) {
; CHECK-LABEL: define ptx_kernel i32 @test_align1(
; CHECK-SAME: ptr noundef readonly byval([[STRUCT_S1:%.*]]) align 4 captures(none) [[PARAMS:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = call align 1 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[PARAMS]])
; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(101) [[TMP0]], align 4
; CHECK-NEXT: ret i32 [[LOAD]]
;
entry:
%load = load i32, ptr %params, align 4
ret i32 %load
}


@ -72,7 +72,7 @@ define ptx_kernel void @grid_const_int(ptr byval(i32) align 4 %input1, i32 %inpu
; PTX-NEXT: ret;
; OPT-LABEL: define ptx_kernel void @grid_const_int(
; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], i32 [[INPUT2:%.*]], ptr [[OUT:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
; OPT-NEXT: [[INPUT11:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
; OPT-NEXT: [[INPUT11:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
; OPT-NEXT: [[TMP:%.*]] = load i32, ptr addrspace(101) [[INPUT11]], align 4
; OPT-NEXT: [[ADD:%.*]] = add i32 [[TMP]], [[INPUT2]]
; OPT-NEXT: store i32 [[ADD]], ptr [[OUT]], align 4
@ -101,7 +101,7 @@ define ptx_kernel void @grid_const_struct(ptr byval(%struct.s) align 4 %input, p
; PTX-NEXT: ret;
; OPT-LABEL: define ptx_kernel void @grid_const_struct(
; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[OUT:%.*]]) #[[ATTR0]] {
; OPT-NEXT: [[INPUT1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
; OPT-NEXT: [[INPUT1:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
; OPT-NEXT: [[GEP13:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 0
; OPT-NEXT: [[GEP22:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 1
; OPT-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(101) [[GEP13]], align 4
@ -137,7 +137,7 @@ define ptx_kernel void @grid_const_escape(ptr byval(%struct.s) align 4 %input) {
; PTX-NEXT: ret;
; OPT-LABEL: define ptx_kernel void @grid_const_escape(
; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]]) #[[ATTR0]] {
; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
; OPT-NEXT: [[TMP1:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
; OPT-NEXT: [[INPUT_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
; OPT-NEXT: [[CALL:%.*]] = call i32 @escape(ptr [[INPUT_PARAM_GEN]])
; OPT-NEXT: ret void
@ -180,9 +180,9 @@ define ptx_kernel void @multiple_grid_const_escape(ptr byval(%struct.s) align 4
; PTX-NEXT: ret;
; OPT-LABEL: define ptx_kernel void @multiple_grid_const_escape(
; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], i32 [[A:%.*]], ptr byval(i32) align 4 [[B:%.*]]) #[[ATTR0]] {
; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[B]])
; OPT-NEXT: [[TMP1:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[B]])
; OPT-NEXT: [[B_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
; OPT-NEXT: [[TMP2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
; OPT-NEXT: [[TMP2:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
; OPT-NEXT: [[INPUT_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP2]] to ptr
; OPT-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
; OPT-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
@ -208,7 +208,7 @@ define ptx_kernel void @grid_const_memory_escape(ptr byval(%struct.s) align 4 %i
; PTX-NEXT: ret;
; OPT-LABEL: define ptx_kernel void @grid_const_memory_escape(
; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[ADDR:%.*]]) #[[ATTR0]] {
; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
; OPT-NEXT: [[TMP1:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
; OPT-NEXT: [[INPUT1:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
; OPT-NEXT: store ptr [[INPUT1]], ptr [[ADDR]], align 8
; OPT-NEXT: ret void
@ -235,7 +235,7 @@ define ptx_kernel void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4
; PTX-NOT .local
; OPT-LABEL: define ptx_kernel void @grid_const_inlineasm_escape(
; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[RESULT:%.*]]) #[[ATTR0]] {
; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
; OPT-NEXT: [[TMP1:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
; OPT-NEXT: [[INPUT1:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
; OPT-NEXT: [[TMPPTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 0
; OPT-NEXT: [[TMPPTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 1
@ -357,7 +357,7 @@ define ptx_kernel void @grid_const_phi(ptr byval(%struct.s) align 4 %input1, ptr
; PTX-NEXT: ret;
; OPT-LABEL: define ptx_kernel void @grid_const_phi(
; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] {
; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
; OPT-NEXT: [[TMP1:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
; OPT-NEXT: [[INPUT1_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
; OPT-NEXT: [[VAL:%.*]] = load i32, ptr [[INOUT]], align 4
; OPT-NEXT: [[LESS:%.*]] = icmp slt i32 [[VAL]], 0
@ -416,7 +416,7 @@ define ptx_kernel void @grid_const_phi_ngc(ptr byval(%struct.s) align 4 %input1,
; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] {
; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]])
; OPT-NEXT: [[INPUT2_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
; OPT-NEXT: [[TMP2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
; OPT-NEXT: [[TMP2:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
; OPT-NEXT: [[INPUT1_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP2]] to ptr
; OPT-NEXT: [[VAL:%.*]] = load i32, ptr [[INOUT]], align 4
; OPT-NEXT: [[LESS:%.*]] = icmp slt i32 [[VAL]], 0
@ -471,7 +471,7 @@ define ptx_kernel void @grid_const_select(ptr byval(i32) align 4 %input1, ptr by
; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] {
; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]])
; OPT-NEXT: [[INPUT2_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
; OPT-NEXT: [[TMP2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
; OPT-NEXT: [[TMP2:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
; OPT-NEXT: [[INPUT1_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP2]] to ptr
; OPT-NEXT: [[VAL:%.*]] = load i32, ptr [[INOUT]], align 4
; OPT-NEXT: [[LESS:%.*]] = icmp slt i32 [[VAL]], 0
@ -520,7 +520,7 @@ declare void @device_func(ptr byval(i32) align 4)
define ptx_kernel void @test_forward_byval_arg(ptr byval(i32) align 4 %input) {
; OPT-LABEL: define ptx_kernel void @test_forward_byval_arg(
; OPT-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) #[[ATTR0]] {
; OPT-NEXT: [[INPUT_PARAM:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
; OPT-NEXT: [[INPUT_PARAM:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
; OPT-NEXT: [[INPUT_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[INPUT_PARAM]] to ptr
; OPT-NEXT: call void @device_func(ptr byval(i32) align 4 [[INPUT_PARAM_GEN]])
; OPT-NEXT: ret void


@ -200,7 +200,7 @@ define ptx_kernel void @ptr_as_int(i64 noundef %i, i32 noundef %v) {
define ptx_kernel void @ptr_as_int_aggr(ptr nocapture noundef readonly byval(%struct.S) align 8 %s, i32 noundef %v) {
; IRC-LABEL: define ptx_kernel void @ptr_as_int_aggr(
; IRC-SAME: ptr noundef readonly byval([[STRUCT_S:%.*]]) align 8 captures(none) [[S:%.*]], i32 noundef [[V:%.*]]) {
; IRC-NEXT: [[S3:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
; IRC-NEXT: [[S3:%.*]] = call align 8 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
; IRC-NEXT: [[I:%.*]] = load i64, ptr addrspace(101) [[S3]], align 8
; IRC-NEXT: [[P:%.*]] = inttoptr i64 [[I]] to ptr
; IRC-NEXT: [[P1:%.*]] = addrspacecast ptr [[P]] to ptr addrspace(1)
@ -210,7 +210,7 @@ define ptx_kernel void @ptr_as_int_aggr(ptr nocapture noundef readonly byval(%st
;
; IRO-LABEL: define ptx_kernel void @ptr_as_int_aggr(
; IRO-SAME: ptr noundef readonly byval([[STRUCT_S:%.*]]) align 8 captures(none) [[S:%.*]], i32 noundef [[V:%.*]]) {
; IRO-NEXT: [[S1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
; IRO-NEXT: [[S1:%.*]] = call align 8 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
; IRO-NEXT: [[I:%.*]] = load i64, ptr addrspace(101) [[S1]], align 8
; IRO-NEXT: [[P:%.*]] = inttoptr i64 [[I]] to ptr
; IRO-NEXT: store i32 [[V]], ptr [[P]], align 4


@ -32,7 +32,7 @@ define dso_local ptx_kernel void @read_only(ptr nocapture noundef writeonly %out
; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @read_only(
; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
; LOWER-ARGS-NEXT: [[ENTRY:.*:]]
; LOWER-ARGS-NEXT: [[S3:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
; LOWER-ARGS-NEXT: [[S3:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
; LOWER-ARGS-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[S3]], align 4
; LOWER-ARGS-NEXT: store i32 [[I]], ptr [[OUT]], align 4
; LOWER-ARGS-NEXT: ret void
@ -66,7 +66,7 @@ define dso_local ptx_kernel void @read_only_gep(ptr nocapture noundef writeonly
; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @read_only_gep(
; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
; LOWER-ARGS-NEXT: [[ENTRY:.*:]]
; LOWER-ARGS-NEXT: [[S3:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
; LOWER-ARGS-NEXT: [[S3:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
; LOWER-ARGS-NEXT: [[B4:%.*]] = getelementptr inbounds i8, ptr addrspace(101) [[S3]], i64 4
; LOWER-ARGS-NEXT: [[I:%.*]] = load i32, ptr addrspace(101) [[B4]], align 4
; LOWER-ARGS-NEXT: store i32 [[I]], ptr [[OUT]], align 4
@ -128,7 +128,7 @@ define dso_local ptx_kernel void @escape_ptr(ptr nocapture noundef readnone %out
; COMMON-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
; COMMON-NEXT: [[ENTRY:.*:]]
; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
; COMMON-NEXT: [[S2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
; COMMON-NEXT: [[S2:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
; COMMON-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[S1]]) #[[ATTR6:[0-9]+]]
; COMMON-NEXT: ret void
@ -167,7 +167,7 @@ define dso_local ptx_kernel void @escape_ptr_gep(ptr nocapture noundef readnone
; COMMON-SAME: ptr noundef readnone captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
; COMMON-NEXT: [[ENTRY:.*:]]
; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
; COMMON-NEXT: [[S2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
; COMMON-NEXT: [[S2:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
; COMMON-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4
; COMMON-NEXT: call void @_Z6escapePv(ptr noundef nonnull [[B]]) #[[ATTR6]]
@ -209,7 +209,7 @@ define dso_local ptx_kernel void @escape_ptr_store(ptr nocapture noundef writeon
; COMMON-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
; COMMON-NEXT: [[ENTRY:.*:]]
; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
; COMMON-NEXT: [[S2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
; COMMON-NEXT: [[S2:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
; COMMON-NEXT: store ptr [[S1]], ptr [[OUT]], align 8
; COMMON-NEXT: ret void
@ -246,7 +246,7 @@ define dso_local ptx_kernel void @escape_ptr_gep_store(ptr nocapture noundef wri
; COMMON-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
; COMMON-NEXT: [[ENTRY:.*:]]
; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
; COMMON-NEXT: [[S2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
; COMMON-NEXT: [[S2:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
; COMMON-NEXT: [[B:%.*]] = getelementptr inbounds nuw i8, ptr [[S1]], i64 4
; COMMON-NEXT: store ptr [[B]], ptr [[OUT]], align 8
@ -286,7 +286,7 @@ define dso_local ptx_kernel void @escape_ptrtoint(ptr nocapture noundef writeonl
; COMMON-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
; COMMON-NEXT: [[ENTRY:.*:]]
; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
; COMMON-NEXT: [[S2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
; COMMON-NEXT: [[S2:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
; COMMON-NEXT: [[I:%.*]] = ptrtoint ptr [[S1]] to i64
; COMMON-NEXT: store i64 [[I]], ptr [[OUT]], align 8
@ -324,7 +324,7 @@ define dso_local ptx_kernel void @memcpy_from_param(ptr nocapture noundef writeo
; LOWER-ARGS-LABEL: define dso_local ptx_kernel void @memcpy_from_param(
; LOWER-ARGS-SAME: ptr noundef writeonly captures(none) [[OUT:%.*]], ptr noundef readonly byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
; LOWER-ARGS-NEXT: [[ENTRY:.*:]]
; LOWER-ARGS-NEXT: [[S3:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
; LOWER-ARGS-NEXT: [[S3:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
; LOWER-ARGS-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr [[OUT]], ptr addrspace(101) [[S3]], i64 16, i1 true)
; LOWER-ARGS-NEXT: ret void
;
@ -445,7 +445,7 @@ define dso_local ptx_kernel void @memcpy_to_param(ptr nocapture noundef readonly
; COMMON-SAME: ptr noundef readonly captures(none) [[IN:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]]) local_unnamed_addr #[[ATTR0]] {
; COMMON-NEXT: [[ENTRY:.*:]]
; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
; COMMON-NEXT: [[S2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
; COMMON-NEXT: [[S2:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
; COMMON-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr [[S1]], ptr [[IN]], i64 16, i1 true)
; COMMON-NEXT: ret void
@ -525,7 +525,7 @@ define dso_local ptx_kernel void @copy_on_store(ptr nocapture noundef readonly %
; COMMON-SAME: ptr noundef readonly captures(none) [[IN:%.*]], ptr noundef byval([[STRUCT_S:%.*]]) align 4 captures(none) [[S:%.*]], i1 noundef zeroext [[B:%.*]]) local_unnamed_addr #[[ATTR0]] {
; COMMON-NEXT: [[BB:.*:]]
; COMMON-NEXT: [[S1:%.*]] = alloca [[STRUCT_S]], align 4
; COMMON-NEXT: [[S2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
; COMMON-NEXT: [[S2:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[S]])
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[S1]], ptr addrspace(101) align 4 [[S2]], i64 8, i1 false)
; COMMON-NEXT: [[I:%.*]] = load i32, ptr [[IN]], align 4
; COMMON-NEXT: store i32 [[I]], ptr [[S1]], align 4
@ -551,7 +551,7 @@ define ptx_kernel void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i3
; SM_60-NEXT: [[INPUT25:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]])
; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT24]], ptr addrspace(101) align 4 [[INPUT25]], i64 4, i1 false)
; SM_60-NEXT: [[INPUT11:%.*]] = alloca i32, align 4
; SM_60-NEXT: [[INPUT12:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
; SM_60-NEXT: [[INPUT12:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 4, i1 false)
; SM_60-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT24]]
; SM_60-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
@ -563,7 +563,7 @@ define ptx_kernel void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i3
; SM_70-NEXT: [[BB:.*:]]
; SM_70-NEXT: [[TMP0:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]])
; SM_70-NEXT: [[INPUT2_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP0]] to ptr
; SM_70-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
; SM_70-NEXT: [[TMP1:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
; SM_70-NEXT: [[INPUT1_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
; SM_70-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT1_PARAM_GEN]], ptr [[INPUT2_PARAM_GEN]]
; SM_70-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
@ -577,7 +577,7 @@ define ptx_kernel void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i3
; COPY-NEXT: [[INPUT24:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]])
; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT23]], ptr addrspace(101) align 4 [[INPUT24]], i64 4, i1 false)
; COPY-NEXT: [[INPUT11:%.*]] = alloca i32, align 4
; COPY-NEXT: [[INPUT12:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
; COPY-NEXT: [[INPUT12:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 4, i1 false)
; COPY-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT23]]
; COPY-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
@ -637,7 +637,7 @@ define ptx_kernel void @test_select_write(ptr byval(i32) align 4 %input1, ptr by
; COMMON-NEXT: [[INPUT24:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]])
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT23]], ptr addrspace(101) align 4 [[INPUT24]], i64 4, i1 false)
; COMMON-NEXT: [[INPUT11:%.*]] = alloca i32, align 4
; COMMON-NEXT: [[INPUT12:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
; COMMON-NEXT: [[INPUT12:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 4, i1 false)
; COMMON-NEXT: [[PTRNEW:%.*]] = select i1 [[COND]], ptr [[INPUT11]], ptr [[INPUT23]]
; COMMON-NEXT: store i32 1, ptr [[PTRNEW]], align 4
@ -682,7 +682,7 @@ define ptx_kernel void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval
; SM_60-NEXT: [[INPUT25:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]])
; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 8 [[INPUT24]], ptr addrspace(101) align 8 [[INPUT25]], i64 8, i1 false)
; SM_60-NEXT: [[INPUT11:%.*]] = alloca [[STRUCT_S]], align 4
; SM_60-NEXT: [[INPUT12:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
; SM_60-NEXT: [[INPUT12:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
; SM_60-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 8, i1 false)
; SM_60-NEXT: br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]]
; SM_60: [[FIRST]]:
@ -702,7 +702,7 @@ define ptx_kernel void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval
; SM_70-NEXT: [[BB:.*:]]
; SM_70-NEXT: [[TMP0:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]])
; SM_70-NEXT: [[INPUT2_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP0]] to ptr
; SM_70-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
; SM_70-NEXT: [[TMP1:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
; SM_70-NEXT: [[INPUT1_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
; SM_70-NEXT: br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]]
; SM_70: [[FIRST]]:
@ -724,7 +724,7 @@ define ptx_kernel void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval
; COPY-NEXT: [[INPUT24:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]])
; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 8 [[INPUT23]], ptr addrspace(101) align 8 [[INPUT24]], i64 8, i1 false)
; COPY-NEXT: [[INPUT11:%.*]] = alloca [[STRUCT_S]], align 4
; COPY-NEXT: [[INPUT12:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
; COPY-NEXT: [[INPUT12:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
; COPY-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 8, i1 false)
; COPY-NEXT: br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]]
; COPY: [[FIRST]]:
@ -808,7 +808,7 @@ define ptx_kernel void @test_phi_write(ptr byval(%struct.S) align 4 %input1, ptr
; COMMON-NEXT: [[INPUT25:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]])
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 8 [[INPUT24]], ptr addrspace(101) align 8 [[INPUT25]], i64 8, i1 false)
; COMMON-NEXT: [[INPUT11:%.*]] = alloca [[STRUCT_S]], align 4
; COMMON-NEXT: [[INPUT12:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
; COMMON-NEXT: [[INPUT12:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT11]], ptr addrspace(101) align 4 [[INPUT12]], i64 8, i1 false)
; COMMON-NEXT: br i1 [[COND]], label %[[FIRST:.*]], label %[[SECOND:.*]]
; COMMON: [[FIRST]]:
@ -871,7 +871,7 @@ define ptx_kernel void @test_forward_byval_arg(ptr byval(i32) align 4 %input) {
; COMMON-LABEL: define ptx_kernel void @test_forward_byval_arg(
; COMMON-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) #[[ATTR3]] {
; COMMON-NEXT: [[INPUT1:%.*]] = alloca i32, align 4
; COMMON-NEXT: [[INPUT2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
; COMMON-NEXT: [[INPUT2:%.*]] = call align 4 ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 4 [[INPUT1]], ptr addrspace(101) align 4 [[INPUT2]], i64 4, i1 false)
; COMMON-NEXT: call void @device_func(ptr byval(i32) align 4 [[INPUT1]])
; COMMON-NEXT: ret void