
This PR adds a amdgns_load_to_lds intrinsic that abstracts over loads to LDS from global (address space 1) pointers and buffer fat pointers (address space 7), since they use the same API and "gather from a pointer to LDS" is something of an abstract operation. This commit adds the intrinsic and its lowerings for addrspaces 1 and 7, and updates the MLIR wrappers to use it (loosening up the restrictions on loads to LDS along the way to match the ground truth from target features). It also plumbs the intrinsic through to clang.
225 lines
14 KiB
LLVM
225 lines
14 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=infer-address-spaces %s | FileCheck %s
; memset through a flat pointer cast from LDS (group, addrspace 3):
; infer-address-spaces should rewrite the memset to operate directly on
; addrspace(3) and preserve the !tbaa/!alias.scope/!noalias metadata.
define amdgpu_kernel void @memset_group_to_flat(ptr addrspace(3) %group.ptr, i32 %y) #0 {
; CHECK-LABEL: define amdgpu_kernel void @memset_group_to_flat(
; CHECK-SAME: ptr addrspace(3) [[GROUP_PTR:%.*]], i32 [[Y:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: call void @llvm.memset.p3.i64(ptr addrspace(3) align 4 [[GROUP_PTR]], i8 4, i64 32, i1 false), !tbaa [[TBAA0:![0-9]+]], !alias.scope [[META3:![0-9]+]], !noalias [[META6:![0-9]+]]
; CHECK-NEXT: ret void
;
  %cast = addrspacecast ptr addrspace(3) %group.ptr to ptr
  call void @llvm.memset.p0.i64(ptr align 4 %cast, i8 4, i64 32, i1 false), !tbaa !0, !alias.scope !3, !noalias !6
  ret void
}
; Same as above but the source pointer is global (addrspace 1): the memset
; should be rewritten to addrspace(1), metadata preserved.
define amdgpu_kernel void @memset_global_to_flat(ptr addrspace(1) %global.ptr, i32 %y) #0 {
; CHECK-LABEL: define amdgpu_kernel void @memset_global_to_flat(
; CHECK-SAME: ptr addrspace(1) [[GLOBAL_PTR:%.*]], i32 [[Y:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.memset.p1.i64(ptr addrspace(1) align 4 [[GLOBAL_PTR]], i8 4, i64 32, i1 false), !tbaa [[TBAA0]], !alias.scope [[META3]], !noalias [[META6]]
; CHECK-NEXT: ret void
;
  %cast = addrspacecast ptr addrspace(1) %global.ptr to ptr
  call void @llvm.memset.p0.i64(ptr align 4 %cast, i8 4, i64 32, i1 false), !tbaa !0, !alias.scope !3, !noalias !6
  ret void
}
; memset with a variable size and no metadata: checks the rewrite also works
; when there is nothing to carry over.
define amdgpu_kernel void @memset_group_to_flat_no_md(ptr addrspace(3) %group.ptr, i64 %size) #0 {
; CHECK-LABEL: define amdgpu_kernel void @memset_group_to_flat_no_md(
; CHECK-SAME: ptr addrspace(3) [[GROUP_PTR:%.*]], i64 [[SIZE:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.memset.p3.i64(ptr addrspace(3) align 4 [[GROUP_PTR]], i8 4, i64 [[SIZE]], i1 false)
; CHECK-NEXT: ret void
;
  %cast = addrspacecast ptr addrspace(3) %group.ptr to ptr
  call void @llvm.memset.p0.i64(ptr align 4 %cast, i8 4, i64 %size, i1 false)
  ret void
}
; Global-pointer variant of the no-metadata memset test.
define amdgpu_kernel void @memset_global_to_flat_no_md(ptr addrspace(1) %global.ptr, i64 %size) #0 {
; CHECK-LABEL: define amdgpu_kernel void @memset_global_to_flat_no_md(
; CHECK-SAME: ptr addrspace(1) [[GLOBAL_PTR:%.*]], i64 [[SIZE:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.memset.p1.i64(ptr addrspace(1) align 4 [[GLOBAL_PTR]], i8 4, i64 [[SIZE]], i1 false)
; CHECK-NEXT: ret void
;
  %cast = addrspacecast ptr addrspace(1) %global.ptr to ptr
  call void @llvm.memset.p0.i64(ptr align 4 %cast, i8 4, i64 %size, i1 false)
  ret void
}
; memcpy where only the source is a cast from LDS: expect a p0.p3 memcpy
; (dest stays flat, src is narrowed to addrspace 3), metadata preserved.
define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group(ptr %dest, ptr addrspace(3) %src.group.ptr, i64 %size) #0 {
; CHECK-LABEL: define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group(
; CHECK-SAME: ptr [[DEST:%.*]], ptr addrspace(3) [[SRC_GROUP_PTR:%.*]], i64 [[SIZE:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.memcpy.p0.p3.i64(ptr align 4 [[DEST]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 [[SIZE]], i1 false), !tbaa [[TBAA0]], !alias.scope [[META3]], !noalias [[META6]]
; CHECK-NEXT: ret void
;
  %cast.src = addrspacecast ptr addrspace(3) %src.group.ptr to ptr
  call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest, ptr align 4 %cast.src, i64 %size, i1 false), !tbaa !0, !alias.scope !3, !noalias !6
  ret void
}
; llvm.memcpy.inline variant: the pass must also rewrite the inline form's
; source pointer to addrspace(3).
define amdgpu_kernel void @memcpy_inline_flat_to_flat_replace_src_with_group(ptr %dest, ptr addrspace(3) %src.group.ptr) #0 {
; CHECK-LABEL: define amdgpu_kernel void @memcpy_inline_flat_to_flat_replace_src_with_group(
; CHECK-SAME: ptr [[DEST:%.*]], ptr addrspace(3) [[SRC_GROUP_PTR:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.memcpy.inline.p0.p3.i64(ptr align 4 [[DEST]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 42, i1 false), !tbaa [[TBAA0]], !alias.scope [[META3]], !noalias [[META6]]
; CHECK-NEXT: ret void
;
  %cast.src = addrspacecast ptr addrspace(3) %src.group.ptr to ptr
  call void @llvm.memcpy.inline.p0.p0.i64(ptr align 4 %dest, ptr align 4 %cast.src, i64 42, i1 false), !tbaa !0, !alias.scope !3, !noalias !6
  ret void
}
; memcpy where only the destination is a cast from LDS: expect a p3.p0 memcpy.
define amdgpu_kernel void @memcpy_flat_to_flat_replace_dest_with_group(ptr addrspace(3) %dest.group.ptr, ptr %src.ptr, i64 %size) #0 {
; CHECK-LABEL: define amdgpu_kernel void @memcpy_flat_to_flat_replace_dest_with_group(
; CHECK-SAME: ptr addrspace(3) [[DEST_GROUP_PTR:%.*]], ptr [[SRC_PTR:%.*]], i64 [[SIZE:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.memcpy.p3.p0.i64(ptr addrspace(3) align 4 [[DEST_GROUP_PTR]], ptr align 4 [[SRC_PTR]], i64 [[SIZE]], i1 false), !tbaa [[TBAA0]], !alias.scope [[META3]], !noalias [[META6]]
; CHECK-NEXT: ret void
;
  %cast.dest = addrspacecast ptr addrspace(3) %dest.group.ptr to ptr
  call void @llvm.memcpy.p0.p0.i64(ptr align 4 %cast.dest, ptr align 4 %src.ptr, i64 %size, i1 false), !tbaa !0, !alias.scope !3, !noalias !6
  ret void
}
; Both operands are casts from LDS: expect a p3.p3 memcpy.
; NOTE(review): %cast.dest is derived from %src.group.ptr rather than
; %dest.group.ptr. This matches the autogenerated CHECK line (both operands
; become SRC_GROUP_PTR), so it is kept as-is — confirm it is intentional.
define amdgpu_kernel void @memcpy_flat_to_flat_replace_dest_src_with_group(ptr addrspace(3) %dest.group.ptr, ptr addrspace(3) %src.group.ptr, i64 %size) #0 {
; CHECK-LABEL: define amdgpu_kernel void @memcpy_flat_to_flat_replace_dest_src_with_group(
; CHECK-SAME: ptr addrspace(3) [[DEST_GROUP_PTR:%.*]], ptr addrspace(3) [[SRC_GROUP_PTR:%.*]], i64 [[SIZE:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 [[SIZE]], i1 false), !tbaa [[TBAA0]], !alias.scope [[META3]], !noalias [[META6]]
; CHECK-NEXT: ret void
;
  %cast.src = addrspacecast ptr addrspace(3) %src.group.ptr to ptr
  %cast.dest = addrspacecast ptr addrspace(3) %src.group.ptr to ptr
  call void @llvm.memcpy.p0.p0.i64(ptr align 4 %cast.dest, ptr align 4 %cast.src, i64 %size, i1 false), !tbaa !0, !alias.scope !3, !noalias !6
  ret void
}
; Mixed address spaces: LDS destination, global source — expect p3.p1 memcpy.
define amdgpu_kernel void @memcpy_flat_to_flat_replace_dest_group_src_global(ptr addrspace(3) %dest.group.ptr, ptr addrspace(1) %src.global.ptr, i64 %size) #0 {
; CHECK-LABEL: define amdgpu_kernel void @memcpy_flat_to_flat_replace_dest_group_src_global(
; CHECK-SAME: ptr addrspace(3) [[DEST_GROUP_PTR:%.*]], ptr addrspace(1) [[SRC_GLOBAL_PTR:%.*]], i64 [[SIZE:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.memcpy.p3.p1.i64(ptr addrspace(3) align 4 [[DEST_GROUP_PTR]], ptr addrspace(1) align 4 [[SRC_GLOBAL_PTR]], i64 [[SIZE]], i1 false), !tbaa [[TBAA0]], !alias.scope [[META3]], !noalias [[META6]]
; CHECK-NEXT: ret void
;
  %cast.src = addrspacecast ptr addrspace(1) %src.global.ptr to ptr
  %cast.dest = addrspacecast ptr addrspace(3) %dest.group.ptr to ptr
  call void @llvm.memcpy.p0.p0.i64(ptr align 4 %cast.dest, ptr align 4 %cast.src, i64 %size, i1 false), !tbaa !0, !alias.scope !3, !noalias !6
  ret void
}
; A memcpy that already has an addrspace(3) source operand (i32 length): only
; the cast destination needs rewriting, giving p1.p3.
define amdgpu_kernel void @memcpy_group_to_flat_replace_dest_global(ptr addrspace(1) %dest.global.ptr, ptr addrspace(3) %src.group.ptr, i32 %size) #0 {
; CHECK-LABEL: define amdgpu_kernel void @memcpy_group_to_flat_replace_dest_global(
; CHECK-SAME: ptr addrspace(1) [[DEST_GLOBAL_PTR:%.*]], ptr addrspace(3) [[SRC_GROUP_PTR:%.*]], i32 [[SIZE:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.memcpy.p1.p3.i32(ptr addrspace(1) align 4 [[DEST_GLOBAL_PTR]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i32 [[SIZE]], i1 false), !tbaa [[TBAA0]], !alias.scope [[META3]], !noalias [[META6]]
; CHECK-NEXT: ret void
;
  %cast.dest = addrspacecast ptr addrspace(1) %dest.global.ptr to ptr
  call void @llvm.memcpy.p0.p3.i32(ptr align 4 %cast.dest, ptr addrspace(3) align 4 %src.group.ptr, i32 %size, i1 false), !tbaa !0, !alias.scope !3, !noalias !6
  ret void
}
; Checks that !tbaa.struct metadata survives the address-space rewrite.
define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group_tbaa_struct(ptr %dest, ptr addrspace(3) %src.group.ptr, i64 %size) #0 {
; CHECK-LABEL: define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group_tbaa_struct(
; CHECK-SAME: ptr [[DEST:%.*]], ptr addrspace(3) [[SRC_GROUP_PTR:%.*]], i64 [[SIZE:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.memcpy.p0.p3.i64(ptr align 4 [[DEST]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 [[SIZE]], i1 false), !tbaa.struct [[TBAA_STRUCT8:![0-9]+]]
; CHECK-NEXT: ret void
;
  %cast.src = addrspacecast ptr addrspace(3) %src.group.ptr to ptr
  call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest, ptr align 4 %cast.src, i64 %size, i1 false), !tbaa.struct !8
  ret void
}
; memcpy rewrite with no metadata attached.
define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group_no_md(ptr %dest, ptr addrspace(3) %src.group.ptr, i64 %size) #0 {
; CHECK-LABEL: define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group_no_md(
; CHECK-SAME: ptr [[DEST:%.*]], ptr addrspace(3) [[SRC_GROUP_PTR:%.*]], i64 [[SIZE:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.memcpy.p0.p3.i64(ptr align 4 [[DEST]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 [[SIZE]], i1 false)
; CHECK-NEXT: ret void
;
  %cast.src = addrspacecast ptr addrspace(3) %src.group.ptr to ptr
  call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest, ptr align 4 %cast.src, i64 %size, i1 false)
  ret void
}
; One cast feeding two memcpys: both users must be rewritten.
define amdgpu_kernel void @multiple_memcpy_flat_to_flat_replace_src_with_group_no_md(ptr %dest0, ptr %dest1, ptr addrspace(3) %src.group.ptr, i64 %size) #0 {
; CHECK-LABEL: define amdgpu_kernel void @multiple_memcpy_flat_to_flat_replace_src_with_group_no_md(
; CHECK-SAME: ptr [[DEST0:%.*]], ptr [[DEST1:%.*]], ptr addrspace(3) [[SRC_GROUP_PTR:%.*]], i64 [[SIZE:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.memcpy.p0.p3.i64(ptr align 4 [[DEST0]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 [[SIZE]], i1 false)
; CHECK-NEXT: call void @llvm.memcpy.p0.p3.i64(ptr align 4 [[DEST1]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 [[SIZE]], i1 false)
; CHECK-NEXT: ret void
;
  %cast.src = addrspacecast ptr addrspace(3) %src.group.ptr to ptr
  call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest0, ptr align 4 %cast.src, i64 %size, i1 false)
  call void @llvm.memcpy.p0.p0.i64(ptr align 4 %dest1, ptr align 4 %cast.src, i64 %size, i1 false)
  ret void
}
; Check for iterator problems if the pointer has 2 uses in the same call
; (self-copy: the same cast is both dest and src).
define amdgpu_kernel void @memcpy_group_flat_to_flat_self(ptr addrspace(3) %group.ptr) #0 {
; CHECK-LABEL: define amdgpu_kernel void @memcpy_group_flat_to_flat_self(
; CHECK-SAME: ptr addrspace(3) [[GROUP_PTR:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.memcpy.p3.p3.i64(ptr addrspace(3) align 4 [[GROUP_PTR]], ptr addrspace(3) align 4 [[GROUP_PTR]], i64 32, i1 false), !tbaa [[TBAA0]], !alias.scope [[META3]], !noalias [[META6]]
; CHECK-NEXT: ret void
;
  %cast = addrspacecast ptr addrspace(3) %group.ptr to ptr
  call void @llvm.memcpy.p0.p0.i64(ptr align 4 %cast, ptr align 4 %cast, i64 32, i1 false), !tbaa !0, !alias.scope !3, !noalias !6
  ret void
}
; memmove variant: the pass handles llvm.memmove the same way as llvm.memcpy.
define amdgpu_kernel void @memmove_flat_to_flat_replace_src_with_group(ptr %dest, ptr addrspace(3) %src.group.ptr, i64 %size) #0 {
; CHECK-LABEL: define amdgpu_kernel void @memmove_flat_to_flat_replace_src_with_group(
; CHECK-SAME: ptr [[DEST:%.*]], ptr addrspace(3) [[SRC_GROUP_PTR:%.*]], i64 [[SIZE:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.memmove.p0.p3.i64(ptr align 4 [[DEST]], ptr addrspace(3) align 4 [[SRC_GROUP_PTR]], i64 [[SIZE]], i1 false), !tbaa [[TBAA0]], !alias.scope [[META3]], !noalias [[META6]]
; CHECK-NEXT: ret void
;
  %cast.src = addrspacecast ptr addrspace(3) %src.group.ptr to ptr
  call void @llvm.memmove.p0.p0.i64(ptr align 4 %dest, ptr align 4 %cast.src, i64 %size, i1 false), !tbaa !0, !alias.scope !3, !noalias !6
  ret void
}
; New in this PR: llvm.amdgcn.load.to.lds with a flat source pointer that is
; a cast from global should be rewritten to the .p1 (addrspace 1) form.
define amdgpu_kernel void @load_to_lds_global_as_flat(ptr addrspace(1) %global.ptr, ptr addrspace(3) %group.ptr) #0 {
; CHECK-LABEL: define amdgpu_kernel void @load_to_lds_global_as_flat(
; CHECK-SAME: ptr addrspace(1) [[GLOBAL_PTR:%.*]], ptr addrspace(3) [[GROUP_PTR:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p1(ptr addrspace(1) [[GLOBAL_PTR]], ptr addrspace(3) [[GROUP_PTR]], i32 4, i32 0, i32 0)
; CHECK-NEXT: ret void
;
  %cast = addrspacecast ptr addrspace(1) %global.ptr to ptr
  call void @llvm.amdgcn.load.to.lds.p0(ptr %cast, ptr addrspace(3) %group.ptr, i32 4, i32 0, i32 0)
  ret void
}
; Buffer-fat-pointer (addrspace 7) source: rewritten to the .p7 form.
define amdgpu_kernel void @load_to_lds_fat_pointer_as_flat(ptr addrspace(7) %buffer.fat.ptr, ptr addrspace(3) %group.ptr) #0 {
; CHECK-LABEL: define amdgpu_kernel void @load_to_lds_fat_pointer_as_flat(
; CHECK-SAME: ptr addrspace(7) [[BUFFER_FAT_PTR:%.*]], ptr addrspace(3) [[GROUP_PTR:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p7(ptr addrspace(7) [[BUFFER_FAT_PTR]], ptr addrspace(3) [[GROUP_PTR]], i32 4, i32 0, i32 0)
; CHECK-NEXT: ret void
;
  %cast = addrspacecast ptr addrspace(7) %buffer.fat.ptr to ptr
  call void @llvm.amdgcn.load.to.lds.p0(ptr %cast, ptr addrspace(3) %group.ptr, i32 4, i32 0, i32 0)
  ret void
}
; Intrinsic declarations used above (the rewritten p1/p3/p7 variants are
; materialized by the pass and need no declarations here).
declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1) #1
declare void @llvm.memcpy.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i1) #1
declare void @llvm.memcpy.inline.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i1) #1
declare void @llvm.memcpy.p0.p3.i32(ptr nocapture writeonly, ptr addrspace(3) nocapture readonly, i32, i1) #1
declare void @llvm.memmove.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i1) #1
declare void @llvm.amdgcn.load.to.lds.p0(ptr nocapture readonly, ptr addrspace(3) nocapture writeonly, i32 immarg, i32 immarg, i32 immarg) #1

attributes #0 = { nounwind }
attributes #1 = { argmemonly nounwind }
; Metadata: a small TBAA hierarchy (!0-!2), two alias scopes in one domain
; (!3-!7), and a tbaa.struct descriptor (!8).
!0 = !{!1, !1, i64 0}
!1 = !{!"A", !2}
!2 = !{!"tbaa root"}
!3 = !{!4}
!4 = distinct !{!4, !5, !"some scope 1"}
!5 = distinct !{!5, !"some domain"}
!6 = !{!7}
!7 = distinct !{!7, !5, !"some scope 2"}
!8 = !{i64 0, i64 8, null}
;.
; CHECK: [[TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0}
; CHECK: [[META1]] = !{!"A", [[META2:![0-9]+]]}
; CHECK: [[META2]] = !{!"tbaa root"}
; CHECK: [[META3]] = !{[[META4:![0-9]+]]}
; CHECK: [[META4]] = distinct !{[[META4]], [[META5:![0-9]+]], !"some scope 1"}
; CHECK: [[META5]] = distinct !{[[META5]], !"some domain"}
; CHECK: [[META6]] = !{[[META7:![0-9]+]]}
; CHECK: [[META7]] = distinct !{[[META7]], [[META5]], !"some scope 2"}
; CHECK: [[TBAA_STRUCT8]] = !{i64 0, i64 8, null}
;.