
This PR adds a amdgns_load_to_lds intrinsic that abstracts over loads to LDS from global (address space 1) pointers and buffer fat pointers (address space 7), since they use the same API and "gather from a pointer to LDS" is something of an abstract operation. This commit adds the intrinsic and its lowerings for addrspaces 1 and 7, and updates the MLIR wrappers to use it (loosening up the restrictions on loads to LDS along the way to match the ground truth from target features). It also plumbs the intrinsic through to clang.
68 lines
3.9 KiB
Plaintext
68 lines
3.9 KiB
Plaintext
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
|
|
// REQUIRES: amdgpu-registered-target
|
|
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx950 -emit-llvm -fcuda-is-device -o - %s | FileCheck %s
|
|
|
|
// COM: Most tests are in the OpenCL semastics, this is just a verification for HIP
|
|
|
|
#define __device__ __attribute__((device))
|
|
#define __shared__ __attribute__((shared))
|
|
|
|
typedef unsigned int u32;
|
|
|
|
// CHECK-LABEL: define dso_local void @_Z20test_load_to_lds_u32PjS_(
|
|
// CHECK-SAME: ptr noundef [[SRC:%.*]], ptr noundef [[DST:%.*]]) #[[ATTR0:[0-9]+]] {
|
|
// CHECK-NEXT: [[ENTRY:.*:]]
|
|
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
|
|
// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
|
|
// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
|
|
// CHECK-NEXT: [[DST_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DST_ADDR]] to ptr
|
|
// CHECK-NEXT: store ptr [[SRC]], ptr [[SRC_ADDR_ASCAST]], align 8
|
|
// CHECK-NEXT: store ptr [[DST]], ptr [[DST_ADDR_ASCAST]], align 8
|
|
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SRC_ADDR_ASCAST]], align 8
|
|
// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DST_ADDR_ASCAST]], align 8
|
|
// CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[TMP1]] to ptr addrspace(3)
|
|
// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p0(ptr [[TMP0]], ptr addrspace(3) [[TMP2]], i32 4, i32 0, i32 0)
|
|
// CHECK-NEXT: ret void
|
|
//
|
|
__device__ void test_load_to_lds_u32(u32* src, __shared__ u32 *dst) {
|
|
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/4, /*offset=*/0, /*aux=*/0);
|
|
}
|
|
|
|
// CHECK-LABEL: define dso_local void @_Z30test_load_to_lds_u32_flat_destPjS_(
|
|
// CHECK-SAME: ptr noundef [[SRC:%.*]], ptr noundef [[DST:%.*]]) #[[ATTR0]] {
|
|
// CHECK-NEXT: [[ENTRY:.*:]]
|
|
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
|
|
// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
|
|
// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
|
|
// CHECK-NEXT: [[DST_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DST_ADDR]] to ptr
|
|
// CHECK-NEXT: store ptr [[SRC]], ptr [[SRC_ADDR_ASCAST]], align 8
|
|
// CHECK-NEXT: store ptr [[DST]], ptr [[DST_ADDR_ASCAST]], align 8
|
|
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SRC_ADDR_ASCAST]], align 8
|
|
// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DST_ADDR_ASCAST]], align 8
|
|
// CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[TMP1]] to ptr addrspace(3)
|
|
// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p0(ptr [[TMP0]], ptr addrspace(3) [[TMP2]], i32 4, i32 0, i32 0)
|
|
// CHECK-NEXT: ret void
|
|
//
|
|
__device__ void test_load_to_lds_u32_flat_dest(u32* src, u32 *dst) {
|
|
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/4, /*offset=*/0, /*aux=*/0);
|
|
}
|
|
|
|
// CHECK-LABEL: define dso_local void @_Z20test_load_to_lds_128PvS_(
|
|
// CHECK-SAME: ptr noundef [[SRC:%.*]], ptr noundef [[DST:%.*]]) #[[ATTR0]] {
|
|
// CHECK-NEXT: [[ENTRY:.*:]]
|
|
// CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
|
|
// CHECK-NEXT: [[DST_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
|
|
// CHECK-NEXT: [[SRC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SRC_ADDR]] to ptr
|
|
// CHECK-NEXT: [[DST_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DST_ADDR]] to ptr
|
|
// CHECK-NEXT: store ptr [[SRC]], ptr [[SRC_ADDR_ASCAST]], align 8
|
|
// CHECK-NEXT: store ptr [[DST]], ptr [[DST_ADDR_ASCAST]], align 8
|
|
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SRC_ADDR_ASCAST]], align 8
|
|
// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DST_ADDR_ASCAST]], align 8
|
|
// CHECK-NEXT: [[TMP2:%.*]] = addrspacecast ptr [[TMP1]] to ptr addrspace(3)
|
|
// CHECK-NEXT: call void @llvm.amdgcn.load.to.lds.p0(ptr [[TMP0]], ptr addrspace(3) [[TMP2]], i32 16, i32 0, i32 0)
|
|
// CHECK-NEXT: ret void
|
|
//
|
|
__device__ void test_load_to_lds_128(void* src, __shared__ void *dst) {
|
|
__builtin_amdgcn_load_to_lds(src, dst, /*size=*/16, /*offset=*/0, /*aux=*/0);
|
|
}
|