
Note: This relands #140615, adding a ".count" suffix to the non-".all" variants.

Our current support for barrier intrinsics is confusing and incomplete, with multiple intrinsics mapping to the same instruction and intrinsic names that do not clearly convey their semantics. Further, we lack support for some variants. This change unifies the IR representation into a single, consistently named set of intrinsics:

- llvm.nvvm.barrier.cta.sync.aligned.all(i32)
- llvm.nvvm.barrier.cta.sync.aligned.count(i32, i32)
- llvm.nvvm.barrier.cta.arrive.aligned.count(i32, i32)
- llvm.nvvm.barrier.cta.sync.all(i32)
- llvm.nvvm.barrier.cta.sync.count(i32, i32)
- llvm.nvvm.barrier.cta.arrive.count(i32, i32)

The following Auto-Upgrade rules are used to maintain compatibility with IR using the legacy intrinsics:

* llvm.nvvm.barrier0 --> llvm.nvvm.barrier.cta.sync.aligned.all(0)
* llvm.nvvm.barrier.n --> llvm.nvvm.barrier.cta.sync.aligned.all(x)
* llvm.nvvm.bar.sync --> llvm.nvvm.barrier.cta.sync.aligned.all(x)
* llvm.nvvm.barrier --> llvm.nvvm.barrier.cta.sync.aligned.count(x, y)
* llvm.nvvm.barrier.sync --> llvm.nvvm.barrier.cta.sync.all(x)
* llvm.nvvm.barrier.sync.cnt --> llvm.nvvm.barrier.cta.sync.count(x, y)
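For reference, a minimal sketch of what the first Auto-Upgrade rule looks like at the IR level; the kernel name and body below are illustrative, only the intrinsic mapping is taken from the list above:

```llvm
; Legacy IR as emitted by older front ends or old bitcode.
declare void @llvm.nvvm.barrier0()

define ptx_kernel void @example() {
entry:
  ; Aligned, CTA-wide barrier spelled with the legacy intrinsic.
  call void @llvm.nvvm.barrier0()
  ret void
}

; After Auto-Upgrade the same call is rewritten to the unified intrinsic,
; with the barrier id passed explicitly:
;   call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
```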
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --version 5
// RUN: %clang_cc1 -internal-isystem %S/Inputs/include \
// RUN:   -internal-isystem %S/../../lib/Headers/ \
// RUN:   -triple amdgcn-amd-amdhsa -emit-llvm %s -o - \
// RUN:   | FileCheck %s --check-prefix=AMDGPU
//
// RUN: %clang_cc1 -internal-isystem %S/Inputs/include \
// RUN:   -internal-isystem %S/../../lib/Headers/ \
// RUN:   -target-feature +ptx62 \
// RUN:   -triple nvptx64-nvidia-cuda -emit-llvm %s -o - \
// RUN:   | FileCheck %s --check-prefix=NVPTX

#include <gpuintrin.h>

__gpu_kernel void foo() {
  __gpu_num_blocks_x();
  __gpu_num_blocks_y();
  __gpu_num_blocks_z();
  __gpu_num_blocks(0);
  __gpu_block_id_x();
  __gpu_block_id_y();
  __gpu_block_id_z();
  __gpu_block_id(0);
  __gpu_num_threads_x();
  __gpu_num_threads_y();
  __gpu_num_threads_z();
  __gpu_num_threads(0);
  __gpu_thread_id_x();
  __gpu_thread_id_y();
  __gpu_thread_id_z();
  __gpu_thread_id(0);
  __gpu_num_lanes();
  __gpu_lane_id();
  __gpu_lane_mask();
  __gpu_read_first_lane_u32(-1, -1);
  __gpu_read_first_lane_u64(-1, -1);
  __gpu_ballot(-1, 1);
  __gpu_sync_threads();
  __gpu_sync_lane(-1);
  __gpu_shuffle_idx_u32(-1, -1, -1, 0);
  __gpu_first_lane_id(-1);
  __gpu_is_first_in_lane(-1);
  __gpu_exit();
}
// AMDGPU-LABEL: define protected amdgpu_kernel void @foo(
|
|
// AMDGPU-SAME: ) #[[ATTR0:[0-9]+]] {
|
|
// AMDGPU-NEXT: [[ENTRY:.*:]]
|
|
// AMDGPU-NEXT: [[CALL:%.*]] = call i32 @__gpu_num_blocks_x() #[[ATTR7:[0-9]+]]
|
|
// AMDGPU-NEXT: [[CALL1:%.*]] = call i32 @__gpu_num_blocks_y() #[[ATTR7]]
|
|
// AMDGPU-NEXT: [[CALL2:%.*]] = call i32 @__gpu_num_blocks_z() #[[ATTR7]]
|
|
// AMDGPU-NEXT: [[CALL3:%.*]] = call i32 @__gpu_num_blocks(i32 noundef 0) #[[ATTR7]]
|
|
// AMDGPU-NEXT: [[CALL4:%.*]] = call i32 @__gpu_block_id_x() #[[ATTR7]]
|
|
// AMDGPU-NEXT: [[CALL5:%.*]] = call i32 @__gpu_block_id_y() #[[ATTR7]]
|
|
// AMDGPU-NEXT: [[CALL6:%.*]] = call i32 @__gpu_block_id_z() #[[ATTR7]]
|
|
// AMDGPU-NEXT: [[CALL7:%.*]] = call i32 @__gpu_block_id(i32 noundef 0) #[[ATTR7]]
|
|
// AMDGPU-NEXT: [[CALL8:%.*]] = call i32 @__gpu_num_threads_x() #[[ATTR7]]
|
|
// AMDGPU-NEXT: [[CALL9:%.*]] = call i32 @__gpu_num_threads_y() #[[ATTR7]]
|
|
// AMDGPU-NEXT: [[CALL10:%.*]] = call i32 @__gpu_num_threads_z() #[[ATTR7]]
|
|
// AMDGPU-NEXT: [[CALL11:%.*]] = call i32 @__gpu_num_threads(i32 noundef 0) #[[ATTR7]]
|
|
// AMDGPU-NEXT: [[CALL12:%.*]] = call i32 @__gpu_thread_id_x() #[[ATTR7]]
|
|
// AMDGPU-NEXT: [[CALL13:%.*]] = call i32 @__gpu_thread_id_y() #[[ATTR7]]
|
|
// AMDGPU-NEXT: [[CALL14:%.*]] = call i32 @__gpu_thread_id_z() #[[ATTR7]]
|
|
// AMDGPU-NEXT: [[CALL15:%.*]] = call i32 @__gpu_thread_id(i32 noundef 0) #[[ATTR7]]
|
|
// AMDGPU-NEXT: [[CALL16:%.*]] = call i32 @__gpu_num_lanes() #[[ATTR7]]
|
|
// AMDGPU-NEXT: [[CALL17:%.*]] = call i32 @__gpu_lane_id() #[[ATTR7]]
|
|
// AMDGPU-NEXT: [[CALL18:%.*]] = call i64 @__gpu_lane_mask() #[[ATTR7]]
|
|
// AMDGPU-NEXT: [[CALL19:%.*]] = call i32 @__gpu_read_first_lane_u32(i64 noundef -1, i32 noundef -1) #[[ATTR7]]
|
|
// AMDGPU-NEXT: [[CALL20:%.*]] = call i64 @__gpu_read_first_lane_u64(i64 noundef -1, i64 noundef -1) #[[ATTR7]]
|
|
// AMDGPU-NEXT: [[CALL21:%.*]] = call i64 @__gpu_ballot(i64 noundef -1, i1 noundef zeroext true) #[[ATTR7]]
|
|
// AMDGPU-NEXT: call void @__gpu_sync_threads() #[[ATTR7]]
|
|
// AMDGPU-NEXT: call void @__gpu_sync_lane(i64 noundef -1) #[[ATTR7]]
|
|
// AMDGPU-NEXT: [[CALL22:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1, i32 noundef 0) #[[ATTR7]]
|
|
// AMDGPU-NEXT: [[CALL23:%.*]] = call i64 @__gpu_first_lane_id(i64 noundef -1) #[[ATTR7]]
|
|
// AMDGPU-NEXT: [[CALL24:%.*]] = call zeroext i1 @__gpu_is_first_in_lane(i64 noundef -1) #[[ATTR7]]
|
|
// AMDGPU-NEXT: call void @__gpu_exit() #[[ATTR8:[0-9]+]]
|
|
// AMDGPU-NEXT: unreachable
|
|
//
|
|
//
|
|
// AMDGPU-LABEL: define internal i32 @__gpu_num_blocks_x(
|
|
// AMDGPU-SAME: ) #[[ATTR0]] {
|
|
// AMDGPU-NEXT: [[ENTRY:.*:]]
|
|
// AMDGPU-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
|
|
// AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
|
|
// AMDGPU-NEXT: [[TMP0:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
|
|
// AMDGPU-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP0]], i32 12
|
|
// AMDGPU-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4, !range [[RNG3:![0-9]+]], !invariant.load [[META4:![0-9]+]]
|
|
// AMDGPU-NEXT: [[TMP3:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
|
|
// AMDGPU-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP3]], i32 12
|
|
// AMDGPU-NEXT: [[TMP5:%.*]] = load i16, ptr addrspace(4) [[TMP4]], align 2, !range [[RNG5:![0-9]+]], !invariant.load [[META4]], !noundef [[META4]]
|
|
// AMDGPU-NEXT: [[CONV:%.*]] = zext i16 [[TMP5]] to i32
|
|
// AMDGPU-NEXT: [[DIV:%.*]] = udiv i32 [[TMP2]], [[CONV]]
|
|
// AMDGPU-NEXT: ret i32 [[DIV]]
|
|
//
|
|
//
|
|
// AMDGPU-LABEL: define internal i32 @__gpu_num_blocks_y(
|
|
// AMDGPU-SAME: ) #[[ATTR0]] {
|
|
// AMDGPU-NEXT: [[ENTRY:.*:]]
|
|
// AMDGPU-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
|
|
// AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
|
|
// AMDGPU-NEXT: [[TMP0:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
|
|
// AMDGPU-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP0]], i32 16
|
|
// AMDGPU-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4, !range [[RNG3]], !invariant.load [[META4]]
|
|
// AMDGPU-NEXT: [[TMP3:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
|
|
// AMDGPU-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP3]], i32 14
|
|
// AMDGPU-NEXT: [[TMP5:%.*]] = load i16, ptr addrspace(4) [[TMP4]], align 2, !range [[RNG5]], !invariant.load [[META4]], !noundef [[META4]]
|
|
// AMDGPU-NEXT: [[CONV:%.*]] = zext i16 [[TMP5]] to i32
|
|
// AMDGPU-NEXT: [[DIV:%.*]] = udiv i32 [[TMP2]], [[CONV]]
|
|
// AMDGPU-NEXT: ret i32 [[DIV]]
|
|
//
|
|
//
|
|
// AMDGPU-LABEL: define internal i32 @__gpu_num_blocks_z(
|
|
// AMDGPU-SAME: ) #[[ATTR0]] {
|
|
// AMDGPU-NEXT: [[ENTRY:.*:]]
|
|
// AMDGPU-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
|
|
// AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
|
|
// AMDGPU-NEXT: [[TMP0:%.*]] = call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
|
|
// AMDGPU-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP0]], i32 20
|
|
// AMDGPU-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4, !range [[RNG3]], !invariant.load [[META4]]
|
|
// AMDGPU-NEXT: [[TMP3:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
|
|
// AMDGPU-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP3]], i32 16
|
|
// AMDGPU-NEXT: [[TMP5:%.*]] = load i16, ptr addrspace(4) [[TMP4]], align 2, !range [[RNG5]], !invariant.load [[META4]], !noundef [[META4]]
|
|
// AMDGPU-NEXT: [[CONV:%.*]] = zext i16 [[TMP5]] to i32
|
|
// AMDGPU-NEXT: [[DIV:%.*]] = udiv i32 [[TMP2]], [[CONV]]
|
|
// AMDGPU-NEXT: ret i32 [[DIV]]
|
|
//
|
|
//
|
|
// AMDGPU-LABEL: define internal i32 @__gpu_num_blocks(
|
|
// AMDGPU-SAME: i32 noundef [[__DIM:%.*]]) #[[ATTR0]] {
|
|
// AMDGPU-NEXT: [[ENTRY:.*:]]
|
|
// AMDGPU-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
|
|
// AMDGPU-NEXT: [[__DIM_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
|
|
// AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
|
|
// AMDGPU-NEXT: [[__DIM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__DIM_ADDR]] to ptr
|
|
// AMDGPU-NEXT: store i32 [[__DIM]], ptr [[__DIM_ADDR_ASCAST]], align 4
|
|
// AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[__DIM_ADDR_ASCAST]], align 4
|
|
// AMDGPU-NEXT: switch i32 [[TMP0]], label %[[SW_DEFAULT:.*]] [
|
|
// AMDGPU-NEXT: i32 0, label %[[SW_BB:.*]]
|
|
// AMDGPU-NEXT: i32 1, label %[[SW_BB1:.*]]
|
|
// AMDGPU-NEXT: i32 2, label %[[SW_BB3:.*]]
|
|
// AMDGPU-NEXT: ]
|
|
// AMDGPU: [[SW_BB]]:
|
|
// AMDGPU-NEXT: [[CALL:%.*]] = call i32 @__gpu_num_blocks_x() #[[ATTR7]]
|
|
// AMDGPU-NEXT: store i32 [[CALL]], ptr [[RETVAL_ASCAST]], align 4
|
|
// AMDGPU-NEXT: br label %[[RETURN:.*]]
|
|
// AMDGPU: [[SW_BB1]]:
|
|
// AMDGPU-NEXT: [[CALL2:%.*]] = call i32 @__gpu_num_blocks_y() #[[ATTR7]]
|
|
// AMDGPU-NEXT: store i32 [[CALL2]], ptr [[RETVAL_ASCAST]], align 4
|
|
// AMDGPU-NEXT: br label %[[RETURN]]
|
|
// AMDGPU: [[SW_BB3]]:
|
|
// AMDGPU-NEXT: [[CALL4:%.*]] = call i32 @__gpu_num_blocks_z() #[[ATTR7]]
|
|
// AMDGPU-NEXT: store i32 [[CALL4]], ptr [[RETVAL_ASCAST]], align 4
|
|
// AMDGPU-NEXT: br label %[[RETURN]]
|
|
// AMDGPU: [[SW_DEFAULT]]:
|
|
// AMDGPU-NEXT: unreachable
|
|
// AMDGPU: [[RETURN]]:
|
|
// AMDGPU-NEXT: [[TMP1:%.*]] = load i32, ptr [[RETVAL_ASCAST]], align 4
|
|
// AMDGPU-NEXT: ret i32 [[TMP1]]
|
|
//
|
|
//
|
|
// AMDGPU-LABEL: define internal i32 @__gpu_block_id_x(
|
|
// AMDGPU-SAME: ) #[[ATTR0]] {
|
|
// AMDGPU-NEXT: [[ENTRY:.*:]]
|
|
// AMDGPU-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
|
|
// AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
|
|
// AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x()
|
|
// AMDGPU-NEXT: ret i32 [[TMP0]]
|
|
//
|
|
//
|
|
// AMDGPU-LABEL: define internal i32 @__gpu_block_id_y(
|
|
// AMDGPU-SAME: ) #[[ATTR0]] {
|
|
// AMDGPU-NEXT: [[ENTRY:.*:]]
|
|
// AMDGPU-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
|
|
// AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
|
|
// AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y()
|
|
// AMDGPU-NEXT: ret i32 [[TMP0]]
|
|
//
|
|
//
|
|
// AMDGPU-LABEL: define internal i32 @__gpu_block_id_z(
|
|
// AMDGPU-SAME: ) #[[ATTR0]] {
|
|
// AMDGPU-NEXT: [[ENTRY:.*:]]
|
|
// AMDGPU-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
|
|
// AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
|
|
// AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z()
|
|
// AMDGPU-NEXT: ret i32 [[TMP0]]
|
|
//
|
|
//
|
|
// AMDGPU-LABEL: define internal i32 @__gpu_block_id(
|
|
// AMDGPU-SAME: i32 noundef [[__DIM:%.*]]) #[[ATTR0]] {
|
|
// AMDGPU-NEXT: [[ENTRY:.*:]]
|
|
// AMDGPU-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
|
|
// AMDGPU-NEXT: [[__DIM_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
|
|
// AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
|
|
// AMDGPU-NEXT: [[__DIM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__DIM_ADDR]] to ptr
|
|
// AMDGPU-NEXT: store i32 [[__DIM]], ptr [[__DIM_ADDR_ASCAST]], align 4
|
|
// AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[__DIM_ADDR_ASCAST]], align 4
|
|
// AMDGPU-NEXT: switch i32 [[TMP0]], label %[[SW_DEFAULT:.*]] [
|
|
// AMDGPU-NEXT: i32 0, label %[[SW_BB:.*]]
|
|
// AMDGPU-NEXT: i32 1, label %[[SW_BB1:.*]]
|
|
// AMDGPU-NEXT: i32 2, label %[[SW_BB3:.*]]
|
|
// AMDGPU-NEXT: ]
|
|
// AMDGPU: [[SW_BB]]:
|
|
// AMDGPU-NEXT: [[CALL:%.*]] = call i32 @__gpu_block_id_x() #[[ATTR7]]
|
|
// AMDGPU-NEXT: store i32 [[CALL]], ptr [[RETVAL_ASCAST]], align 4
|
|
// AMDGPU-NEXT: br label %[[RETURN:.*]]
|
|
// AMDGPU: [[SW_BB1]]:
|
|
// AMDGPU-NEXT: [[CALL2:%.*]] = call i32 @__gpu_block_id_y() #[[ATTR7]]
|
|
// AMDGPU-NEXT: store i32 [[CALL2]], ptr [[RETVAL_ASCAST]], align 4
|
|
// AMDGPU-NEXT: br label %[[RETURN]]
|
|
// AMDGPU: [[SW_BB3]]:
|
|
// AMDGPU-NEXT: [[CALL4:%.*]] = call i32 @__gpu_block_id_z() #[[ATTR7]]
|
|
// AMDGPU-NEXT: store i32 [[CALL4]], ptr [[RETVAL_ASCAST]], align 4
|
|
// AMDGPU-NEXT: br label %[[RETURN]]
|
|
// AMDGPU: [[SW_DEFAULT]]:
|
|
// AMDGPU-NEXT: unreachable
|
|
// AMDGPU: [[RETURN]]:
|
|
// AMDGPU-NEXT: [[TMP1:%.*]] = load i32, ptr [[RETVAL_ASCAST]], align 4
|
|
// AMDGPU-NEXT: ret i32 [[TMP1]]
|
|
//
|
|
//
|
|
// AMDGPU-LABEL: define internal i32 @__gpu_num_threads_x(
|
|
// AMDGPU-SAME: ) #[[ATTR0]] {
|
|
// AMDGPU-NEXT: [[ENTRY:.*:]]
|
|
// AMDGPU-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
|
|
// AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
|
|
// AMDGPU-NEXT: [[TMP0:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
|
|
// AMDGPU-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP0]], i32 12
|
|
// AMDGPU-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(4) [[TMP1]], align 2, !range [[RNG5]], !invariant.load [[META4]], !noundef [[META4]]
|
|
// AMDGPU-NEXT: [[CONV:%.*]] = zext i16 [[TMP2]] to i32
|
|
// AMDGPU-NEXT: ret i32 [[CONV]]
|
|
//
|
|
//
|
|
// AMDGPU-LABEL: define internal i32 @__gpu_num_threads_y(
|
|
// AMDGPU-SAME: ) #[[ATTR0]] {
|
|
// AMDGPU-NEXT: [[ENTRY:.*:]]
|
|
// AMDGPU-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
|
|
// AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
|
|
// AMDGPU-NEXT: [[TMP0:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
|
|
// AMDGPU-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP0]], i32 14
|
|
// AMDGPU-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(4) [[TMP1]], align 2, !range [[RNG5]], !invariant.load [[META4]], !noundef [[META4]]
|
|
// AMDGPU-NEXT: [[CONV:%.*]] = zext i16 [[TMP2]] to i32
|
|
// AMDGPU-NEXT: ret i32 [[CONV]]
|
|
//
|
|
//
|
|
// AMDGPU-LABEL: define internal i32 @__gpu_num_threads_z(
|
|
// AMDGPU-SAME: ) #[[ATTR0]] {
|
|
// AMDGPU-NEXT: [[ENTRY:.*:]]
|
|
// AMDGPU-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
|
|
// AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
|
|
// AMDGPU-NEXT: [[TMP0:%.*]] = call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
|
|
// AMDGPU-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr addrspace(4) [[TMP0]], i32 16
|
|
// AMDGPU-NEXT: [[TMP2:%.*]] = load i16, ptr addrspace(4) [[TMP1]], align 2, !range [[RNG5]], !invariant.load [[META4]], !noundef [[META4]]
|
|
// AMDGPU-NEXT: [[CONV:%.*]] = zext i16 [[TMP2]] to i32
|
|
// AMDGPU-NEXT: ret i32 [[CONV]]
|
|
//
|
|
//
|
|
// AMDGPU-LABEL: define internal i32 @__gpu_num_threads(
|
|
// AMDGPU-SAME: i32 noundef [[__DIM:%.*]]) #[[ATTR0]] {
|
|
// AMDGPU-NEXT: [[ENTRY:.*:]]
|
|
// AMDGPU-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
|
|
// AMDGPU-NEXT: [[__DIM_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
|
|
// AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
|
|
// AMDGPU-NEXT: [[__DIM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__DIM_ADDR]] to ptr
|
|
// AMDGPU-NEXT: store i32 [[__DIM]], ptr [[__DIM_ADDR_ASCAST]], align 4
|
|
// AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[__DIM_ADDR_ASCAST]], align 4
|
|
// AMDGPU-NEXT: switch i32 [[TMP0]], label %[[SW_DEFAULT:.*]] [
|
|
// AMDGPU-NEXT: i32 0, label %[[SW_BB:.*]]
|
|
// AMDGPU-NEXT: i32 1, label %[[SW_BB1:.*]]
|
|
// AMDGPU-NEXT: i32 2, label %[[SW_BB3:.*]]
|
|
// AMDGPU-NEXT: ]
|
|
// AMDGPU: [[SW_BB]]:
|
|
// AMDGPU-NEXT: [[CALL:%.*]] = call i32 @__gpu_num_threads_x() #[[ATTR7]]
|
|
// AMDGPU-NEXT: store i32 [[CALL]], ptr [[RETVAL_ASCAST]], align 4
|
|
// AMDGPU-NEXT: br label %[[RETURN:.*]]
|
|
// AMDGPU: [[SW_BB1]]:
|
|
// AMDGPU-NEXT: [[CALL2:%.*]] = call i32 @__gpu_num_threads_y() #[[ATTR7]]
|
|
// AMDGPU-NEXT: store i32 [[CALL2]], ptr [[RETVAL_ASCAST]], align 4
|
|
// AMDGPU-NEXT: br label %[[RETURN]]
|
|
// AMDGPU: [[SW_BB3]]:
|
|
// AMDGPU-NEXT: [[CALL4:%.*]] = call i32 @__gpu_num_threads_z() #[[ATTR7]]
|
|
// AMDGPU-NEXT: store i32 [[CALL4]], ptr [[RETVAL_ASCAST]], align 4
|
|
// AMDGPU-NEXT: br label %[[RETURN]]
|
|
// AMDGPU: [[SW_DEFAULT]]:
|
|
// AMDGPU-NEXT: unreachable
|
|
// AMDGPU: [[RETURN]]:
|
|
// AMDGPU-NEXT: [[TMP1:%.*]] = load i32, ptr [[RETVAL_ASCAST]], align 4
|
|
// AMDGPU-NEXT: ret i32 [[TMP1]]
|
|
//
|
|
//
|
|
// AMDGPU-LABEL: define internal i32 @__gpu_thread_id_x(
|
|
// AMDGPU-SAME: ) #[[ATTR0]] {
|
|
// AMDGPU-NEXT: [[ENTRY:.*:]]
|
|
// AMDGPU-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
|
|
// AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
|
|
// AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
|
|
// AMDGPU-NEXT: ret i32 [[TMP0]]
|
|
//
|
|
//
|
|
// AMDGPU-LABEL: define internal i32 @__gpu_thread_id_y(
|
|
// AMDGPU-SAME: ) #[[ATTR0]] {
|
|
// AMDGPU-NEXT: [[ENTRY:.*:]]
|
|
// AMDGPU-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
|
|
// AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
|
|
// AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
|
|
// AMDGPU-NEXT: ret i32 [[TMP0]]
|
|
//
|
|
//
|
|
// AMDGPU-LABEL: define internal i32 @__gpu_thread_id_z(
|
|
// AMDGPU-SAME: ) #[[ATTR0]] {
|
|
// AMDGPU-NEXT: [[ENTRY:.*:]]
|
|
// AMDGPU-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
|
|
// AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
|
|
// AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
|
|
// AMDGPU-NEXT: ret i32 [[TMP0]]
|
|
//
|
|
//
|
|
// AMDGPU-LABEL: define internal i32 @__gpu_thread_id(
|
|
// AMDGPU-SAME: i32 noundef [[__DIM:%.*]]) #[[ATTR0]] {
|
|
// AMDGPU-NEXT: [[ENTRY:.*:]]
|
|
// AMDGPU-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
|
|
// AMDGPU-NEXT: [[__DIM_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
|
|
// AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
|
|
// AMDGPU-NEXT: [[__DIM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__DIM_ADDR]] to ptr
|
|
// AMDGPU-NEXT: store i32 [[__DIM]], ptr [[__DIM_ADDR_ASCAST]], align 4
|
|
// AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[__DIM_ADDR_ASCAST]], align 4
|
|
// AMDGPU-NEXT: switch i32 [[TMP0]], label %[[SW_DEFAULT:.*]] [
|
|
// AMDGPU-NEXT: i32 0, label %[[SW_BB:.*]]
|
|
// AMDGPU-NEXT: i32 1, label %[[SW_BB1:.*]]
|
|
// AMDGPU-NEXT: i32 2, label %[[SW_BB3:.*]]
|
|
// AMDGPU-NEXT: ]
|
|
// AMDGPU: [[SW_BB]]:
|
|
// AMDGPU-NEXT: [[CALL:%.*]] = call i32 @__gpu_thread_id_x() #[[ATTR7]]
|
|
// AMDGPU-NEXT: store i32 [[CALL]], ptr [[RETVAL_ASCAST]], align 4
|
|
// AMDGPU-NEXT: br label %[[RETURN:.*]]
|
|
// AMDGPU: [[SW_BB1]]:
|
|
// AMDGPU-NEXT: [[CALL2:%.*]] = call i32 @__gpu_thread_id_y() #[[ATTR7]]
|
|
// AMDGPU-NEXT: store i32 [[CALL2]], ptr [[RETVAL_ASCAST]], align 4
|
|
// AMDGPU-NEXT: br label %[[RETURN]]
|
|
// AMDGPU: [[SW_BB3]]:
|
|
// AMDGPU-NEXT: [[CALL4:%.*]] = call i32 @__gpu_thread_id_z() #[[ATTR7]]
|
|
// AMDGPU-NEXT: store i32 [[CALL4]], ptr [[RETVAL_ASCAST]], align 4
|
|
// AMDGPU-NEXT: br label %[[RETURN]]
|
|
// AMDGPU: [[SW_DEFAULT]]:
|
|
// AMDGPU-NEXT: unreachable
|
|
// AMDGPU: [[RETURN]]:
|
|
// AMDGPU-NEXT: [[TMP1:%.*]] = load i32, ptr [[RETVAL_ASCAST]], align 4
|
|
// AMDGPU-NEXT: ret i32 [[TMP1]]
|
|
//
|
|
//
|
|
// AMDGPU-LABEL: define internal i32 @__gpu_num_lanes(
|
|
// AMDGPU-SAME: ) #[[ATTR0]] {
|
|
// AMDGPU-NEXT: [[ENTRY:.*:]]
|
|
// AMDGPU-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
|
|
// AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
|
|
// AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.wavefrontsize()
|
|
// AMDGPU-NEXT: ret i32 [[TMP0]]
|
|
//
|
|
//
|
|
// AMDGPU-LABEL: define internal i32 @__gpu_lane_id(
|
|
// AMDGPU-SAME: ) #[[ATTR0]] {
|
|
// AMDGPU-NEXT: [[ENTRY:.*:]]
|
|
// AMDGPU-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
|
|
// AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
|
|
// AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
|
|
// AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[TMP0]])
|
|
// AMDGPU-NEXT: ret i32 [[TMP1]]
|
|
//
|
|
//
|
|
// AMDGPU-LABEL: define internal i64 @__gpu_lane_mask(
|
|
// AMDGPU-SAME: ) #[[ATTR0]] {
|
|
// AMDGPU-NEXT: [[ENTRY:.*:]]
|
|
// AMDGPU-NEXT: [[RETVAL:%.*]] = alloca i64, align 8, addrspace(5)
|
|
// AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
|
|
// AMDGPU-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
|
|
// AMDGPU-NEXT: ret i64 [[TMP0]]
|
|
//
|
|
//
|
|
// AMDGPU-LABEL: define internal i32 @__gpu_read_first_lane_u32(
|
|
// AMDGPU-SAME: i64 noundef [[__LANE_MASK:%.*]], i32 noundef [[__X:%.*]]) #[[ATTR0]] {
|
|
// AMDGPU-NEXT: [[ENTRY:.*:]]
|
|
// AMDGPU-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
|
|
// AMDGPU-NEXT: [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
|
|
// AMDGPU-NEXT: [[__X_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
|
|
// AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
|
|
// AMDGPU-NEXT: [[__LANE_MASK_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__LANE_MASK_ADDR]] to ptr
|
|
// AMDGPU-NEXT: [[__X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__X_ADDR]] to ptr
|
|
// AMDGPU-NEXT: store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR_ASCAST]], align 8
|
|
// AMDGPU-NEXT: store i32 [[__X]], ptr [[__X_ADDR_ASCAST]], align 4
|
|
// AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[__X_ADDR_ASCAST]], align 4
|
|
// AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP0]])
|
|
// AMDGPU-NEXT: ret i32 [[TMP1]]
|
|
//
|
|
//
|
|
// AMDGPU-LABEL: define internal i64 @__gpu_read_first_lane_u64(
|
|
// AMDGPU-SAME: i64 noundef [[__LANE_MASK:%.*]], i64 noundef [[__X:%.*]]) #[[ATTR0]] {
|
|
// AMDGPU-NEXT: [[ENTRY:.*:]]
|
|
// AMDGPU-NEXT: [[RETVAL:%.*]] = alloca i64, align 8, addrspace(5)
|
|
// AMDGPU-NEXT: [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
|
|
// AMDGPU-NEXT: [[__X_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
|
|
// AMDGPU-NEXT: [[__HI:%.*]] = alloca i32, align 4, addrspace(5)
|
|
// AMDGPU-NEXT: [[__LO:%.*]] = alloca i32, align 4, addrspace(5)
|
|
// AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
|
|
// AMDGPU-NEXT: [[__LANE_MASK_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__LANE_MASK_ADDR]] to ptr
|
|
// AMDGPU-NEXT: [[__X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__X_ADDR]] to ptr
|
|
// AMDGPU-NEXT: [[__HI_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__HI]] to ptr
|
|
// AMDGPU-NEXT: [[__LO_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__LO]] to ptr
|
|
// AMDGPU-NEXT: store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR_ASCAST]], align 8
|
|
// AMDGPU-NEXT: store i64 [[__X]], ptr [[__X_ADDR_ASCAST]], align 8
|
|
// AMDGPU-NEXT: [[TMP0:%.*]] = load i64, ptr [[__X_ADDR_ASCAST]], align 8
|
|
// AMDGPU-NEXT: [[SHR:%.*]] = lshr i64 [[TMP0]], 32
|
|
// AMDGPU-NEXT: [[CONV:%.*]] = trunc i64 [[SHR]] to i32
|
|
// AMDGPU-NEXT: store i32 [[CONV]], ptr [[__HI_ASCAST]], align 4
|
|
// AMDGPU-NEXT: [[TMP1:%.*]] = load i64, ptr [[__X_ADDR_ASCAST]], align 8
|
|
// AMDGPU-NEXT: [[AND:%.*]] = and i64 [[TMP1]], 4294967295
|
|
// AMDGPU-NEXT: [[CONV1:%.*]] = trunc i64 [[AND]] to i32
|
|
// AMDGPU-NEXT: store i32 [[CONV1]], ptr [[__LO_ASCAST]], align 4
|
|
// AMDGPU-NEXT: [[TMP2:%.*]] = load i64, ptr [[__LANE_MASK_ADDR_ASCAST]], align 8
|
|
// AMDGPU-NEXT: [[TMP3:%.*]] = load i32, ptr [[__HI_ASCAST]], align 4
|
|
// AMDGPU-NEXT: [[CALL:%.*]] = call i32 @__gpu_read_first_lane_u32(i64 noundef [[TMP2]], i32 noundef [[TMP3]]) #[[ATTR7]]
|
|
// AMDGPU-NEXT: [[CONV2:%.*]] = zext i32 [[CALL]] to i64
|
|
// AMDGPU-NEXT: [[SHL:%.*]] = shl i64 [[CONV2]], 32
|
|
// AMDGPU-NEXT: [[TMP4:%.*]] = load i64, ptr [[__LANE_MASK_ADDR_ASCAST]], align 8
|
|
// AMDGPU-NEXT: [[TMP5:%.*]] = load i32, ptr [[__LO_ASCAST]], align 4
|
|
// AMDGPU-NEXT: [[CALL3:%.*]] = call i32 @__gpu_read_first_lane_u32(i64 noundef [[TMP4]], i32 noundef [[TMP5]]) #[[ATTR7]]
|
|
// AMDGPU-NEXT: [[CONV4:%.*]] = zext i32 [[CALL3]] to i64
|
|
// AMDGPU-NEXT: [[AND5:%.*]] = and i64 [[CONV4]], 4294967295
|
|
// AMDGPU-NEXT: [[OR:%.*]] = or i64 [[SHL]], [[AND5]]
|
|
// AMDGPU-NEXT: ret i64 [[OR]]
|
|
//
|
|
//
|
|
// AMDGPU-LABEL: define internal i64 @__gpu_ballot(
|
|
// AMDGPU-SAME: i64 noundef [[__LANE_MASK:%.*]], i1 noundef zeroext [[__X:%.*]]) #[[ATTR0]] {
|
|
// AMDGPU-NEXT: [[ENTRY:.*:]]
|
|
// AMDGPU-NEXT: [[RETVAL:%.*]] = alloca i64, align 8, addrspace(5)
|
|
// AMDGPU-NEXT: [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
|
|
// AMDGPU-NEXT: [[__X_ADDR:%.*]] = alloca i8, align 1, addrspace(5)
|
|
// AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
|
|
// AMDGPU-NEXT: [[__LANE_MASK_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__LANE_MASK_ADDR]] to ptr
|
|
// AMDGPU-NEXT: [[__X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__X_ADDR]] to ptr
|
|
// AMDGPU-NEXT: store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR_ASCAST]], align 8
|
|
// AMDGPU-NEXT: [[STOREDV:%.*]] = zext i1 [[__X]] to i8
|
|
// AMDGPU-NEXT: store i8 [[STOREDV]], ptr [[__X_ADDR_ASCAST]], align 1
|
|
// AMDGPU-NEXT: [[TMP0:%.*]] = load i64, ptr [[__LANE_MASK_ADDR_ASCAST]], align 8
|
|
// AMDGPU-NEXT: [[TMP1:%.*]] = load i8, ptr [[__X_ADDR_ASCAST]], align 1
|
|
// AMDGPU-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP1]] to i1
|
|
// AMDGPU-NEXT: [[TMP2:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[LOADEDV]])
|
|
// AMDGPU-NEXT: [[AND:%.*]] = and i64 [[TMP0]], [[TMP2]]
|
|
// AMDGPU-NEXT: ret i64 [[AND]]
|
|
//
|
|
//
|
|
// AMDGPU-LABEL: define internal void @__gpu_sync_threads(
|
|
// AMDGPU-SAME: ) #[[ATTR0]] {
|
|
// AMDGPU-NEXT: [[ENTRY:.*:]]
|
|
// AMDGPU-NEXT: call void @llvm.amdgcn.s.barrier()
|
|
// AMDGPU-NEXT: fence syncscope("workgroup") seq_cst
|
|
// AMDGPU-NEXT: ret void
|
|
//
|
|
//
|
|
// AMDGPU-LABEL: define internal void @__gpu_sync_lane(
|
|
// AMDGPU-SAME: i64 noundef [[__LANE_MASK:%.*]]) #[[ATTR0]] {
|
|
// AMDGPU-NEXT: [[ENTRY:.*:]]
|
|
// AMDGPU-NEXT: [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
|
|
// AMDGPU-NEXT: [[__LANE_MASK_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__LANE_MASK_ADDR]] to ptr
|
|
// AMDGPU-NEXT: store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR_ASCAST]], align 8
|
|
// AMDGPU-NEXT: call void @llvm.amdgcn.wave.barrier()
|
|
// AMDGPU-NEXT: ret void
|
|
//
|
|
//
|
|
// AMDGPU-LABEL: define internal i32 @__gpu_shuffle_idx_u32(
|
|
// AMDGPU-SAME: i64 noundef [[__LANE_MASK:%.*]], i32 noundef [[__IDX:%.*]], i32 noundef [[__X:%.*]], i32 noundef [[__WIDTH:%.*]]) #[[ATTR0]] {
|
|
// AMDGPU-NEXT: [[ENTRY:.*:]]
|
|
// AMDGPU-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5)
|
|
// AMDGPU-NEXT: [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
|
|
// AMDGPU-NEXT: [[__IDX_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
|
|
// AMDGPU-NEXT: [[__X_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
|
|
// AMDGPU-NEXT: [[__WIDTH_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
|
|
// AMDGPU-NEXT: [[__LANE:%.*]] = alloca i32, align 4, addrspace(5)
|
|
// AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
|
|
// AMDGPU-NEXT: [[__LANE_MASK_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__LANE_MASK_ADDR]] to ptr
|
|
// AMDGPU-NEXT: [[__IDX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__IDX_ADDR]] to ptr
|
|
// AMDGPU-NEXT: [[__X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__X_ADDR]] to ptr
|
|
// AMDGPU-NEXT: [[__WIDTH_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__WIDTH_ADDR]] to ptr
|
|
// AMDGPU-NEXT: [[__LANE_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__LANE]] to ptr
|
|
// AMDGPU-NEXT: store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR_ASCAST]], align 8
|
|
// AMDGPU-NEXT: store i32 [[__IDX]], ptr [[__IDX_ADDR_ASCAST]], align 4
|
|
// AMDGPU-NEXT: store i32 [[__X]], ptr [[__X_ADDR_ASCAST]], align 4
|
|
// AMDGPU-NEXT: store i32 [[__WIDTH]], ptr [[__WIDTH_ADDR_ASCAST]], align 4
|
|
// AMDGPU-NEXT: [[TMP0:%.*]] = load i32, ptr [[__IDX_ADDR_ASCAST]], align 4
|
|
// AMDGPU-NEXT: [[CALL:%.*]] = call i32 @__gpu_lane_id() #[[ATTR7]]
|
|
// AMDGPU-NEXT: [[TMP1:%.*]] = load i32, ptr [[__WIDTH_ADDR_ASCAST]], align 4
|
|
// AMDGPU-NEXT: [[SUB:%.*]] = sub i32 [[TMP1]], 1
|
|
// AMDGPU-NEXT: [[NOT:%.*]] = xor i32 [[SUB]], -1
|
|
// AMDGPU-NEXT: [[AND:%.*]] = and i32 [[CALL]], [[NOT]]
|
|
// AMDGPU-NEXT: [[ADD:%.*]] = add i32 [[TMP0]], [[AND]]
|
|
// AMDGPU-NEXT: store i32 [[ADD]], ptr [[__LANE_ASCAST]], align 4
|
|
// AMDGPU-NEXT: [[TMP2:%.*]] = load i32, ptr [[__LANE_ASCAST]], align 4
|
|
// AMDGPU-NEXT: [[SHL:%.*]] = shl i32 [[TMP2]], 2
|
|
// AMDGPU-NEXT: [[TMP3:%.*]] = load i32, ptr [[__X_ADDR_ASCAST]], align 4
|
|
// AMDGPU-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.ds.bpermute(i32 [[SHL]], i32 [[TMP3]])
|
|
// AMDGPU-NEXT: ret i32 [[TMP4]]
|
|
//
|
|
//
|
|
// AMDGPU-LABEL: define internal i64 @__gpu_first_lane_id(
|
|
// AMDGPU-SAME: i64 noundef [[__LANE_MASK:%.*]]) #[[ATTR0]] {
|
|
// AMDGPU-NEXT: [[ENTRY:.*:]]
|
|
// AMDGPU-NEXT: [[RETVAL:%.*]] = alloca i64, align 8, addrspace(5)
|
|
// AMDGPU-NEXT: [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
|
|
// AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
|
|
// AMDGPU-NEXT: [[__LANE_MASK_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__LANE_MASK_ADDR]] to ptr
|
|
// AMDGPU-NEXT: store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR_ASCAST]], align 8
|
|
// AMDGPU-NEXT: [[TMP0:%.*]] = load i64, ptr [[__LANE_MASK_ADDR_ASCAST]], align 8
|
|
// AMDGPU-NEXT: [[TMP1:%.*]] = call i64 @llvm.cttz.i64(i64 [[TMP0]], i1 true)
|
|
// AMDGPU-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], 1
|
|
// AMDGPU-NEXT: [[ISZERO:%.*]] = icmp eq i64 [[TMP0]], 0
|
|
// AMDGPU-NEXT: [[FFS:%.*]] = select i1 [[ISZERO]], i64 0, i64 [[TMP2]]
|
|
// AMDGPU-NEXT: [[CAST:%.*]] = trunc i64 [[FFS]] to i32
|
|
// AMDGPU-NEXT: [[SUB:%.*]] = sub nsw i32 [[CAST]], 1
|
|
// AMDGPU-NEXT: [[CONV:%.*]] = sext i32 [[SUB]] to i64
|
|
// AMDGPU-NEXT: ret i64 [[CONV]]
|
|
//
|
|
//
|
|
// AMDGPU-LABEL: define internal zeroext i1 @__gpu_is_first_in_lane(
|
|
// AMDGPU-SAME: i64 noundef [[__LANE_MASK:%.*]]) #[[ATTR0]] {
|
|
// AMDGPU-NEXT: [[ENTRY:.*:]]
|
|
// AMDGPU-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
|
|
// AMDGPU-NEXT: [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8, addrspace(5)
|
|
// AMDGPU-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
|
|
// AMDGPU-NEXT: [[__LANE_MASK_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[__LANE_MASK_ADDR]] to ptr
|
|
// AMDGPU-NEXT: store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR_ASCAST]], align 8
|
|
// AMDGPU-NEXT: [[CALL:%.*]] = call i32 @__gpu_lane_id() #[[ATTR7]]
|
|
// AMDGPU-NEXT: [[CONV:%.*]] = zext i32 [[CALL]] to i64
|
|
// AMDGPU-NEXT: [[TMP0:%.*]] = load i64, ptr [[__LANE_MASK_ADDR_ASCAST]], align 8
|
|
// AMDGPU-NEXT: [[CALL1:%.*]] = call i64 @__gpu_first_lane_id(i64 noundef [[TMP0]]) #[[ATTR7]]
|
|
// AMDGPU-NEXT: [[CMP:%.*]] = icmp eq i64 [[CONV]], [[CALL1]]
|
|
// AMDGPU-NEXT: ret i1 [[CMP]]
|
|
//
|
|
//
|
|
// AMDGPU-LABEL: define internal void @__gpu_exit(
|
|
// AMDGPU-SAME: ) #[[ATTR1:[0-9]+]] {
|
|
// AMDGPU-NEXT: [[ENTRY:.*:]]
|
|
// AMDGPU-NEXT: call void @llvm.amdgcn.endpgm()
|
|
// AMDGPU-NEXT: ret void
|
|
//
|
|
//
|
|
// NVPTX-LABEL: define protected ptx_kernel void @foo(
|
|
// NVPTX-SAME: ) #[[ATTR0:[0-9]+]] {
|
|
// NVPTX-NEXT: [[ENTRY:.*:]]
|
|
// NVPTX-NEXT: [[CALL:%.*]] = call i32 @__gpu_num_blocks_x() #[[ATTR6:[0-9]+]]
|
|
// NVPTX-NEXT: [[CALL1:%.*]] = call i32 @__gpu_num_blocks_y() #[[ATTR6]]
|
|
// NVPTX-NEXT: [[CALL2:%.*]] = call i32 @__gpu_num_blocks_z() #[[ATTR6]]
|
|
// NVPTX-NEXT: [[CALL3:%.*]] = call i32 @__gpu_num_blocks(i32 noundef 0) #[[ATTR6]]
|
|
// NVPTX-NEXT: [[CALL4:%.*]] = call i32 @__gpu_block_id_x() #[[ATTR6]]
|
|
// NVPTX-NEXT: [[CALL5:%.*]] = call i32 @__gpu_block_id_y() #[[ATTR6]]
|
|
// NVPTX-NEXT: [[CALL6:%.*]] = call i32 @__gpu_block_id_z() #[[ATTR6]]
|
|
// NVPTX-NEXT: [[CALL7:%.*]] = call i32 @__gpu_block_id(i32 noundef 0) #[[ATTR6]]
|
|
// NVPTX-NEXT: [[CALL8:%.*]] = call i32 @__gpu_num_threads_x() #[[ATTR6]]
|
|
// NVPTX-NEXT: [[CALL9:%.*]] = call i32 @__gpu_num_threads_y() #[[ATTR6]]
|
|
// NVPTX-NEXT: [[CALL10:%.*]] = call i32 @__gpu_num_threads_z() #[[ATTR6]]
|
|
// NVPTX-NEXT: [[CALL11:%.*]] = call i32 @__gpu_num_threads(i32 noundef 0) #[[ATTR6]]
|
|
// NVPTX-NEXT: [[CALL12:%.*]] = call i32 @__gpu_thread_id_x() #[[ATTR6]]
|
|
// NVPTX-NEXT: [[CALL13:%.*]] = call i32 @__gpu_thread_id_y() #[[ATTR6]]
|
|
// NVPTX-NEXT: [[CALL14:%.*]] = call i32 @__gpu_thread_id_z() #[[ATTR6]]
|
|
// NVPTX-NEXT: [[CALL15:%.*]] = call i32 @__gpu_thread_id(i32 noundef 0) #[[ATTR6]]
|
|
// NVPTX-NEXT: [[CALL16:%.*]] = call i32 @__gpu_num_lanes() #[[ATTR6]]
|
|
// NVPTX-NEXT: [[CALL17:%.*]] = call i32 @__gpu_lane_id() #[[ATTR6]]
|
|
// NVPTX-NEXT: [[CALL18:%.*]] = call i64 @__gpu_lane_mask() #[[ATTR6]]
|
|
// NVPTX-NEXT: [[CALL19:%.*]] = call i32 @__gpu_read_first_lane_u32(i64 noundef -1, i32 noundef -1) #[[ATTR6]]
|
|
// NVPTX-NEXT: [[CALL20:%.*]] = call i64 @__gpu_read_first_lane_u64(i64 noundef -1, i64 noundef -1) #[[ATTR6]]
|
|
// NVPTX-NEXT: [[CALL21:%.*]] = call i64 @__gpu_ballot(i64 noundef -1, i1 noundef zeroext true) #[[ATTR6]]
|
|
// NVPTX-NEXT: call void @__gpu_sync_threads() #[[ATTR6]]
|
|
// NVPTX-NEXT: call void @__gpu_sync_lane(i64 noundef -1) #[[ATTR6]]
|
|
// NVPTX-NEXT: [[CALL22:%.*]] = call i32 @__gpu_shuffle_idx_u32(i64 noundef -1, i32 noundef -1, i32 noundef -1, i32 noundef 0) #[[ATTR6]]
|
|
// NVPTX-NEXT: [[CALL23:%.*]] = call i64 @__gpu_first_lane_id(i64 noundef -1) #[[ATTR6]]
|
|
// NVPTX-NEXT: [[CALL24:%.*]] = call zeroext i1 @__gpu_is_first_in_lane(i64 noundef -1) #[[ATTR6]]
|
|
// NVPTX-NEXT: call void @__gpu_exit() #[[ATTR7:[0-9]+]]
|
|
// NVPTX-NEXT: unreachable
|
|
//
|
|
//
|
|
// NVPTX-LABEL: define internal i32 @__gpu_num_blocks_x(
|
|
// NVPTX-SAME: ) #[[ATTR0]] {
|
|
// NVPTX-NEXT: [[ENTRY:.*:]]
|
|
// NVPTX-NEXT: [[TMP0:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
|
|
// NVPTX-NEXT: ret i32 [[TMP0]]
|
|
//
|
|
//
|
|
// NVPTX-LABEL: define internal i32 @__gpu_num_blocks_y(
|
|
// NVPTX-SAME: ) #[[ATTR0]] {
|
|
// NVPTX-NEXT: [[ENTRY:.*:]]
|
|
// NVPTX-NEXT: [[TMP0:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
|
|
// NVPTX-NEXT: ret i32 [[TMP0]]
|
|
//
|
|
//
|
|
// NVPTX-LABEL: define internal i32 @__gpu_num_blocks_z(
|
|
// NVPTX-SAME: ) #[[ATTR0]] {
|
|
// NVPTX-NEXT: [[ENTRY:.*:]]
|
|
// NVPTX-NEXT: [[TMP0:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
|
|
// NVPTX-NEXT: ret i32 [[TMP0]]
|
|
//
|
|
//
|
|
// NVPTX-LABEL: define internal i32 @__gpu_num_blocks(
|
|
// NVPTX-SAME: i32 noundef [[__DIM:%.*]]) #[[ATTR0]] {
|
|
// NVPTX-NEXT: [[ENTRY:.*:]]
|
|
// NVPTX-NEXT: [[RETVAL:%.*]] = alloca i32, align 4
|
|
// NVPTX-NEXT: [[__DIM_ADDR:%.*]] = alloca i32, align 4
|
|
// NVPTX-NEXT: store i32 [[__DIM]], ptr [[__DIM_ADDR]], align 4
|
|
// NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[__DIM_ADDR]], align 4
|
|
// NVPTX-NEXT: switch i32 [[TMP0]], label %[[SW_DEFAULT:.*]] [
|
|
// NVPTX-NEXT: i32 0, label %[[SW_BB:.*]]
|
|
// NVPTX-NEXT: i32 1, label %[[SW_BB1:.*]]
|
|
// NVPTX-NEXT: i32 2, label %[[SW_BB3:.*]]
|
|
// NVPTX-NEXT: ]
|
|
// NVPTX: [[SW_BB]]:
|
|
// NVPTX-NEXT: [[CALL:%.*]] = call i32 @__gpu_num_blocks_x() #[[ATTR6]]
|
|
// NVPTX-NEXT: store i32 [[CALL]], ptr [[RETVAL]], align 4
|
|
// NVPTX-NEXT: br label %[[RETURN:.*]]
|
|
// NVPTX: [[SW_BB1]]:
|
|
// NVPTX-NEXT: [[CALL2:%.*]] = call i32 @__gpu_num_blocks_y() #[[ATTR6]]
|
|
// NVPTX-NEXT: store i32 [[CALL2]], ptr [[RETVAL]], align 4
|
|
// NVPTX-NEXT: br label %[[RETURN]]
|
|
// NVPTX: [[SW_BB3]]:
|
|
// NVPTX-NEXT: [[CALL4:%.*]] = call i32 @__gpu_num_blocks_z() #[[ATTR6]]
|
|
// NVPTX-NEXT: store i32 [[CALL4]], ptr [[RETVAL]], align 4
|
|
// NVPTX-NEXT: br label %[[RETURN]]
|
|
// NVPTX: [[SW_DEFAULT]]:
|
|
// NVPTX-NEXT: unreachable
|
|
// NVPTX: [[RETURN]]:
|
|
// NVPTX-NEXT: [[TMP1:%.*]] = load i32, ptr [[RETVAL]], align 4
|
|
// NVPTX-NEXT: ret i32 [[TMP1]]
|
|
//
|
|
//
|
|
// NVPTX-LABEL: define internal i32 @__gpu_block_id_x(
|
|
// NVPTX-SAME: ) #[[ATTR0]] {
|
|
// NVPTX-NEXT: [[ENTRY:.*:]]
|
|
// NVPTX-NEXT: [[TMP0:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
|
|
// NVPTX-NEXT: ret i32 [[TMP0]]
|
|
//
|
|
//
|
|
// NVPTX-LABEL: define internal i32 @__gpu_block_id_y(
|
|
// NVPTX-SAME: ) #[[ATTR0]] {
|
|
// NVPTX-NEXT: [[ENTRY:.*:]]
|
|
// NVPTX-NEXT: [[TMP0:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
|
|
// NVPTX-NEXT: ret i32 [[TMP0]]
|
|
//
|
|
//
|
|
// NVPTX-LABEL: define internal i32 @__gpu_block_id_z(
|
|
// NVPTX-SAME: ) #[[ATTR0]] {
|
|
// NVPTX-NEXT: [[ENTRY:.*:]]
|
|
// NVPTX-NEXT: [[TMP0:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
|
|
// NVPTX-NEXT: ret i32 [[TMP0]]
|
|
//
|
|
//
|
|
// NVPTX-LABEL: define internal i32 @__gpu_block_id(
|
|
// NVPTX-SAME: i32 noundef [[__DIM:%.*]]) #[[ATTR0]] {
|
|
// NVPTX-NEXT: [[ENTRY:.*:]]
|
|
// NVPTX-NEXT: [[RETVAL:%.*]] = alloca i32, align 4
|
|
// NVPTX-NEXT: [[__DIM_ADDR:%.*]] = alloca i32, align 4
|
|
// NVPTX-NEXT: store i32 [[__DIM]], ptr [[__DIM_ADDR]], align 4
|
|
// NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[__DIM_ADDR]], align 4
|
|
// NVPTX-NEXT: switch i32 [[TMP0]], label %[[SW_DEFAULT:.*]] [
|
|
// NVPTX-NEXT: i32 0, label %[[SW_BB:.*]]
|
|
// NVPTX-NEXT: i32 1, label %[[SW_BB1:.*]]
|
|
// NVPTX-NEXT: i32 2, label %[[SW_BB3:.*]]
|
|
// NVPTX-NEXT: ]
|
|
// NVPTX: [[SW_BB]]:
|
|
// NVPTX-NEXT: [[CALL:%.*]] = call i32 @__gpu_block_id_x() #[[ATTR6]]
|
|
// NVPTX-NEXT: store i32 [[CALL]], ptr [[RETVAL]], align 4
|
|
// NVPTX-NEXT: br label %[[RETURN:.*]]
|
|
// NVPTX: [[SW_BB1]]:
|
|
// NVPTX-NEXT: [[CALL2:%.*]] = call i32 @__gpu_block_id_y() #[[ATTR6]]
|
|
// NVPTX-NEXT: store i32 [[CALL2]], ptr [[RETVAL]], align 4
|
|
// NVPTX-NEXT: br label %[[RETURN]]
|
|
// NVPTX: [[SW_BB3]]:
|
|
// NVPTX-NEXT: [[CALL4:%.*]] = call i32 @__gpu_block_id_z() #[[ATTR6]]
|
|
// NVPTX-NEXT: store i32 [[CALL4]], ptr [[RETVAL]], align 4
|
|
// NVPTX-NEXT: br label %[[RETURN]]
|
|
// NVPTX: [[SW_DEFAULT]]:
|
|
// NVPTX-NEXT: unreachable
|
|
// NVPTX: [[RETURN]]:
|
|
// NVPTX-NEXT: [[TMP1:%.*]] = load i32, ptr [[RETVAL]], align 4
|
|
// NVPTX-NEXT: ret i32 [[TMP1]]
|
|
//
|
|
//
|
|
// NVPTX-LABEL: define internal i32 @__gpu_num_threads_x(
|
|
// NVPTX-SAME: ) #[[ATTR0]] {
|
|
// NVPTX-NEXT: [[ENTRY:.*:]]
|
|
// NVPTX-NEXT: [[TMP0:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
|
|
// NVPTX-NEXT: ret i32 [[TMP0]]
|
|
//
|
|
//
|
|
// NVPTX-LABEL: define internal i32 @__gpu_num_threads_y(
|
|
// NVPTX-SAME: ) #[[ATTR0]] {
|
|
// NVPTX-NEXT: [[ENTRY:.*:]]
|
|
// NVPTX-NEXT: [[TMP0:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
|
|
// NVPTX-NEXT: ret i32 [[TMP0]]
|
|
//
|
|
//
|
|
// NVPTX-LABEL: define internal i32 @__gpu_num_threads_z(
|
|
// NVPTX-SAME: ) #[[ATTR0]] {
|
|
// NVPTX-NEXT: [[ENTRY:.*:]]
|
|
// NVPTX-NEXT: [[TMP0:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
|
|
// NVPTX-NEXT: ret i32 [[TMP0]]
|
|
//
|
|
//
|
|
// NVPTX-LABEL: define internal i32 @__gpu_num_threads(
|
|
// NVPTX-SAME: i32 noundef [[__DIM:%.*]]) #[[ATTR0]] {
|
|
// NVPTX-NEXT: [[ENTRY:.*:]]
|
|
// NVPTX-NEXT: [[RETVAL:%.*]] = alloca i32, align 4
|
|
// NVPTX-NEXT: [[__DIM_ADDR:%.*]] = alloca i32, align 4
|
|
// NVPTX-NEXT: store i32 [[__DIM]], ptr [[__DIM_ADDR]], align 4
|
|
// NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[__DIM_ADDR]], align 4
|
|
// NVPTX-NEXT: switch i32 [[TMP0]], label %[[SW_DEFAULT:.*]] [
|
|
// NVPTX-NEXT: i32 0, label %[[SW_BB:.*]]
|
|
// NVPTX-NEXT: i32 1, label %[[SW_BB1:.*]]
|
|
// NVPTX-NEXT: i32 2, label %[[SW_BB3:.*]]
|
|
// NVPTX-NEXT: ]
|
|
// NVPTX: [[SW_BB]]:
|
|
// NVPTX-NEXT: [[CALL:%.*]] = call i32 @__gpu_num_threads_x() #[[ATTR6]]
|
|
// NVPTX-NEXT: store i32 [[CALL]], ptr [[RETVAL]], align 4
|
|
// NVPTX-NEXT: br label %[[RETURN:.*]]
|
|
// NVPTX: [[SW_BB1]]:
|
|
// NVPTX-NEXT: [[CALL2:%.*]] = call i32 @__gpu_num_threads_y() #[[ATTR6]]
|
|
// NVPTX-NEXT: store i32 [[CALL2]], ptr [[RETVAL]], align 4
|
|
// NVPTX-NEXT: br label %[[RETURN]]
|
|
// NVPTX: [[SW_BB3]]:
|
|
// NVPTX-NEXT: [[CALL4:%.*]] = call i32 @__gpu_num_threads_z() #[[ATTR6]]
|
|
// NVPTX-NEXT: store i32 [[CALL4]], ptr [[RETVAL]], align 4
|
|
// NVPTX-NEXT: br label %[[RETURN]]
|
|
// NVPTX: [[SW_DEFAULT]]:
|
|
// NVPTX-NEXT: unreachable
|
|
// NVPTX: [[RETURN]]:
|
|
// NVPTX-NEXT: [[TMP1:%.*]] = load i32, ptr [[RETVAL]], align 4
|
|
// NVPTX-NEXT: ret i32 [[TMP1]]
|
|
//
|
|
//
|
|
// NVPTX-LABEL: define internal i32 @__gpu_thread_id_x(
|
|
// NVPTX-SAME: ) #[[ATTR0]] {
|
|
// NVPTX-NEXT: [[ENTRY:.*:]]
|
|
// NVPTX-NEXT: [[TMP0:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
|
// NVPTX-NEXT: ret i32 [[TMP0]]
|
|
//
|
|
//
|
|
// NVPTX-LABEL: define internal i32 @__gpu_thread_id_y(
|
|
// NVPTX-SAME: ) #[[ATTR0]] {
|
|
// NVPTX-NEXT: [[ENTRY:.*:]]
|
|
// NVPTX-NEXT: [[TMP0:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
|
|
// NVPTX-NEXT: ret i32 [[TMP0]]
|
|
//
|
|
//
|
|
// NVPTX-LABEL: define internal i32 @__gpu_thread_id_z(
|
|
// NVPTX-SAME: ) #[[ATTR0]] {
|
|
// NVPTX-NEXT: [[ENTRY:.*:]]
|
|
// NVPTX-NEXT: [[TMP0:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.z()
|
|
// NVPTX-NEXT: ret i32 [[TMP0]]
|
|
//
|
|
//
|
|
// NVPTX-LABEL: define internal i32 @__gpu_thread_id(
|
|
// NVPTX-SAME: i32 noundef [[__DIM:%.*]]) #[[ATTR0]] {
|
|
// NVPTX-NEXT: [[ENTRY:.*:]]
|
|
// NVPTX-NEXT: [[RETVAL:%.*]] = alloca i32, align 4
|
|
// NVPTX-NEXT: [[__DIM_ADDR:%.*]] = alloca i32, align 4
|
|
// NVPTX-NEXT: store i32 [[__DIM]], ptr [[__DIM_ADDR]], align 4
|
|
// NVPTX-NEXT: [[TMP0:%.*]] = load i32, ptr [[__DIM_ADDR]], align 4
|
|
// NVPTX-NEXT: switch i32 [[TMP0]], label %[[SW_DEFAULT:.*]] [
|
|
// NVPTX-NEXT: i32 0, label %[[SW_BB:.*]]
|
|
// NVPTX-NEXT: i32 1, label %[[SW_BB1:.*]]
|
|
// NVPTX-NEXT: i32 2, label %[[SW_BB3:.*]]
|
|
// NVPTX-NEXT: ]
|
|
// NVPTX: [[SW_BB]]:
|
|
// NVPTX-NEXT: [[CALL:%.*]] = call i32 @__gpu_thread_id_x() #[[ATTR6]]
|
|
// NVPTX-NEXT: store i32 [[CALL]], ptr [[RETVAL]], align 4
|
|
// NVPTX-NEXT: br label %[[RETURN:.*]]
|
|
// NVPTX: [[SW_BB1]]:
|
|
// NVPTX-NEXT: [[CALL2:%.*]] = call i32 @__gpu_thread_id_y() #[[ATTR6]]
|
|
// NVPTX-NEXT: store i32 [[CALL2]], ptr [[RETVAL]], align 4
|
|
// NVPTX-NEXT: br label %[[RETURN]]
|
|
// NVPTX: [[SW_BB3]]:
|
|
// NVPTX-NEXT: [[CALL4:%.*]] = call i32 @__gpu_thread_id_z() #[[ATTR6]]
|
|
// NVPTX-NEXT: store i32 [[CALL4]], ptr [[RETVAL]], align 4
|
|
// NVPTX-NEXT: br label %[[RETURN]]
|
|
// NVPTX: [[SW_DEFAULT]]:
|
|
// NVPTX-NEXT: unreachable
|
|
// NVPTX: [[RETURN]]:
|
|
// NVPTX-NEXT: [[TMP1:%.*]] = load i32, ptr [[RETVAL]], align 4
|
|
// NVPTX-NEXT: ret i32 [[TMP1]]
|
|
//
|
|
//
|
|
// NVPTX-LABEL: define internal i32 @__gpu_num_lanes(
|
|
// NVPTX-SAME: ) #[[ATTR0]] {
|
|
// NVPTX-NEXT: [[ENTRY:.*:]]
|
|
// NVPTX-NEXT: [[TMP0:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
|
|
// NVPTX-NEXT: ret i32 [[TMP0]]
|
|
//
|
|
//
|
|
// NVPTX-LABEL: define internal i32 @__gpu_lane_id(
|
|
// NVPTX-SAME: ) #[[ATTR0]] {
|
|
// NVPTX-NEXT: [[ENTRY:.*:]]
|
|
// NVPTX-NEXT: [[TMP0:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.laneid()
|
|
// NVPTX-NEXT: ret i32 [[TMP0]]
|
|
//
|
|
//
|
|
// NVPTX-LABEL: define internal i64 @__gpu_lane_mask(
|
|
// NVPTX-SAME: ) #[[ATTR0]] {
|
|
// NVPTX-NEXT: [[ENTRY:.*:]]
|
|
// NVPTX-NEXT: [[TMP0:%.*]] = call i32 @llvm.nvvm.activemask()
|
|
// NVPTX-NEXT: [[CONV:%.*]] = zext i32 [[TMP0]] to i64
|
|
// NVPTX-NEXT: ret i64 [[CONV]]
|
|
//
|
|
//
|
|
// NVPTX-LABEL: define internal i32 @__gpu_read_first_lane_u32(
|
|
// NVPTX-SAME: i64 noundef [[__LANE_MASK:%.*]], i32 noundef [[__X:%.*]]) #[[ATTR0]] {
|
|
// NVPTX-NEXT: [[ENTRY:.*:]]
|
|
// NVPTX-NEXT: [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8
|
|
// NVPTX-NEXT: [[__X_ADDR:%.*]] = alloca i32, align 4
|
|
// NVPTX-NEXT: [[__MASK:%.*]] = alloca i32, align 4
|
|
// NVPTX-NEXT: [[__ID:%.*]] = alloca i32, align 4
|
|
// NVPTX-NEXT: store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR]], align 8
|
|
// NVPTX-NEXT: store i32 [[__X]], ptr [[__X_ADDR]], align 4
|
|
// NVPTX-NEXT: [[TMP0:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8
|
|
// NVPTX-NEXT: [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
|
|
// NVPTX-NEXT: store i32 [[CONV]], ptr [[__MASK]], align 4
|
|
// NVPTX-NEXT: [[TMP1:%.*]] = load i32, ptr [[__MASK]], align 4
|
|
// NVPTX-NEXT: [[TMP2:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP1]], i1 true)
|
|
// NVPTX-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], 1
|
|
// NVPTX-NEXT: [[ISZERO:%.*]] = icmp eq i32 [[TMP1]], 0
|
|
// NVPTX-NEXT: [[FFS:%.*]] = select i1 [[ISZERO]], i32 0, i32 [[TMP3]]
|
|
// NVPTX-NEXT: [[SUB:%.*]] = sub nsw i32 [[FFS]], 1
|
|
// NVPTX-NEXT: store i32 [[SUB]], ptr [[__ID]], align 4
|
|
// NVPTX-NEXT: [[TMP4:%.*]] = load i32, ptr [[__MASK]], align 4
|
|
// NVPTX-NEXT: [[TMP5:%.*]] = load i32, ptr [[__X_ADDR]], align 4
|
|
// NVPTX-NEXT: [[TMP6:%.*]] = load i32, ptr [[__ID]], align 4
|
|
// NVPTX-NEXT: [[CALL:%.*]] = call i32 @__gpu_num_lanes() #[[ATTR6]]
|
|
// NVPTX-NEXT: [[SUB1:%.*]] = sub i32 [[CALL]], 1
|
|
// NVPTX-NEXT: [[TMP7:%.*]] = call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 [[TMP4]], i32 [[TMP5]], i32 [[TMP6]], i32 [[SUB1]])
|
|
// NVPTX-NEXT: ret i32 [[TMP7]]
|
|
//
|
|
//
|
|
// NVPTX-LABEL: define internal i64 @__gpu_read_first_lane_u64(
|
|
// NVPTX-SAME: i64 noundef [[__LANE_MASK:%.*]], i64 noundef [[__X:%.*]]) #[[ATTR0]] {
|
|
// NVPTX-NEXT: [[ENTRY:.*:]]
|
|
// NVPTX-NEXT: [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8
|
|
// NVPTX-NEXT: [[__X_ADDR:%.*]] = alloca i64, align 8
|
|
// NVPTX-NEXT: [[__HI:%.*]] = alloca i32, align 4
|
|
// NVPTX-NEXT: [[__LO:%.*]] = alloca i32, align 4
|
|
// NVPTX-NEXT: store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR]], align 8
|
|
// NVPTX-NEXT: store i64 [[__X]], ptr [[__X_ADDR]], align 8
|
|
// NVPTX-NEXT: [[TMP0:%.*]] = load i64, ptr [[__X_ADDR]], align 8
|
|
// NVPTX-NEXT: [[SHR:%.*]] = lshr i64 [[TMP0]], 32
|
|
// NVPTX-NEXT: [[CONV:%.*]] = trunc i64 [[SHR]] to i32
|
|
// NVPTX-NEXT: store i32 [[CONV]], ptr [[__HI]], align 4
|
|
// NVPTX-NEXT: [[TMP1:%.*]] = load i64, ptr [[__X_ADDR]], align 8
|
|
// NVPTX-NEXT: [[AND:%.*]] = and i64 [[TMP1]], 4294967295
|
|
// NVPTX-NEXT: [[CONV1:%.*]] = trunc i64 [[AND]] to i32
|
|
// NVPTX-NEXT: store i32 [[CONV1]], ptr [[__LO]], align 4
|
|
// NVPTX-NEXT: [[TMP2:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8
|
|
// NVPTX-NEXT: [[TMP3:%.*]] = load i32, ptr [[__HI]], align 4
|
|
// NVPTX-NEXT: [[CALL:%.*]] = call i32 @__gpu_read_first_lane_u32(i64 noundef [[TMP2]], i32 noundef [[TMP3]]) #[[ATTR6]]
|
|
// NVPTX-NEXT: [[CONV2:%.*]] = zext i32 [[CALL]] to i64
|
|
// NVPTX-NEXT: [[SHL:%.*]] = shl i64 [[CONV2]], 32
|
|
// NVPTX-NEXT: [[TMP4:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8
|
|
// NVPTX-NEXT: [[TMP5:%.*]] = load i32, ptr [[__LO]], align 4
|
|
// NVPTX-NEXT: [[CALL3:%.*]] = call i32 @__gpu_read_first_lane_u32(i64 noundef [[TMP4]], i32 noundef [[TMP5]]) #[[ATTR6]]
|
|
// NVPTX-NEXT: [[CONV4:%.*]] = zext i32 [[CALL3]] to i64
|
|
// NVPTX-NEXT: [[AND5:%.*]] = and i64 [[CONV4]], 4294967295
|
|
// NVPTX-NEXT: [[OR:%.*]] = or i64 [[SHL]], [[AND5]]
|
|
// NVPTX-NEXT: ret i64 [[OR]]
|
|
//
|
|
//
|
|
// NVPTX-LABEL: define internal i64 @__gpu_ballot(
|
|
// NVPTX-SAME: i64 noundef [[__LANE_MASK:%.*]], i1 noundef zeroext [[__X:%.*]]) #[[ATTR0]] {
|
|
// NVPTX-NEXT: [[ENTRY:.*:]]
|
|
// NVPTX-NEXT: [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8
|
|
// NVPTX-NEXT: [[__X_ADDR:%.*]] = alloca i8, align 1
|
|
// NVPTX-NEXT: [[__MASK:%.*]] = alloca i32, align 4
|
|
// NVPTX-NEXT: store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR]], align 8
|
|
// NVPTX-NEXT: [[STOREDV:%.*]] = zext i1 [[__X]] to i8
|
|
// NVPTX-NEXT: store i8 [[STOREDV]], ptr [[__X_ADDR]], align 1
|
|
// NVPTX-NEXT: [[TMP0:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8
|
|
// NVPTX-NEXT: [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
|
|
// NVPTX-NEXT: store i32 [[CONV]], ptr [[__MASK]], align 4
|
|
// NVPTX-NEXT: [[TMP1:%.*]] = load i32, ptr [[__MASK]], align 4
|
|
// NVPTX-NEXT: [[TMP2:%.*]] = load i8, ptr [[__X_ADDR]], align 1
|
|
// NVPTX-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP2]] to i1
|
|
// NVPTX-NEXT: [[TMP3:%.*]] = call i32 @llvm.nvvm.vote.ballot.sync(i32 [[TMP1]], i1 [[LOADEDV]])
|
|
// NVPTX-NEXT: [[CONV1:%.*]] = zext i32 [[TMP3]] to i64
|
|
// NVPTX-NEXT: ret i64 [[CONV1]]
|
|
//
|
|
//
|
|
// NVPTX-LABEL: define internal void @__gpu_sync_threads(
// NVPTX-SAME: ) #[[ATTR0]] {
// NVPTX-NEXT:  [[ENTRY:.*:]]
// NVPTX-NEXT:    call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
// NVPTX-NEXT:    ret void
//
//
// NVPTX-LABEL: define internal void @__gpu_sync_lane(
|
|
// NVPTX-SAME: i64 noundef [[__LANE_MASK:%.*]]) #[[ATTR0]] {
|
|
// NVPTX-NEXT: [[ENTRY:.*:]]
|
|
// NVPTX-NEXT: [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8
|
|
// NVPTX-NEXT: store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR]], align 8
|
|
// NVPTX-NEXT: [[TMP0:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8
|
|
// NVPTX-NEXT: [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
|
|
// NVPTX-NEXT: call void @llvm.nvvm.bar.warp.sync(i32 [[CONV]])
|
|
// NVPTX-NEXT: ret void
|
|
//
|
|
//
|
|
// NVPTX-LABEL: define internal i32 @__gpu_shuffle_idx_u32(
|
|
// NVPTX-SAME: i64 noundef [[__LANE_MASK:%.*]], i32 noundef [[__IDX:%.*]], i32 noundef [[__X:%.*]], i32 noundef [[__WIDTH:%.*]]) #[[ATTR0]] {
|
|
// NVPTX-NEXT: [[ENTRY:.*:]]
|
|
// NVPTX-NEXT: [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8
|
|
// NVPTX-NEXT: [[__IDX_ADDR:%.*]] = alloca i32, align 4
|
|
// NVPTX-NEXT: [[__X_ADDR:%.*]] = alloca i32, align 4
|
|
// NVPTX-NEXT: [[__WIDTH_ADDR:%.*]] = alloca i32, align 4
|
|
// NVPTX-NEXT: [[__MASK:%.*]] = alloca i32, align 4
|
|
// NVPTX-NEXT: [[__BITMASK:%.*]] = alloca i8, align 1
|
|
// NVPTX-NEXT: store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR]], align 8
|
|
// NVPTX-NEXT: store i32 [[__IDX]], ptr [[__IDX_ADDR]], align 4
|
|
// NVPTX-NEXT: store i32 [[__X]], ptr [[__X_ADDR]], align 4
|
|
// NVPTX-NEXT: store i32 [[__WIDTH]], ptr [[__WIDTH_ADDR]], align 4
|
|
// NVPTX-NEXT: [[TMP0:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8
|
|
// NVPTX-NEXT: [[CONV:%.*]] = trunc i64 [[TMP0]] to i32
|
|
// NVPTX-NEXT: store i32 [[CONV]], ptr [[__MASK]], align 4
|
|
// NVPTX-NEXT: [[TMP1:%.*]] = load i32, ptr [[__IDX_ADDR]], align 4
|
|
// NVPTX-NEXT: [[SH_PROM:%.*]] = zext i32 [[TMP1]] to i64
|
|
// NVPTX-NEXT: [[SHL:%.*]] = shl i64 1, [[SH_PROM]]
|
|
// NVPTX-NEXT: [[TMP2:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8
|
|
// NVPTX-NEXT: [[AND:%.*]] = and i64 [[SHL]], [[TMP2]]
|
|
// NVPTX-NEXT: [[TOBOOL:%.*]] = icmp ne i64 [[AND]], 0
|
|
// NVPTX-NEXT: [[STOREDV:%.*]] = zext i1 [[TOBOOL]] to i8
|
|
// NVPTX-NEXT: store i8 [[STOREDV]], ptr [[__BITMASK]], align 1
|
|
// NVPTX-NEXT: [[TMP3:%.*]] = load i8, ptr [[__BITMASK]], align 1
|
|
// NVPTX-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1
|
|
// NVPTX-NEXT: [[CONV1:%.*]] = zext i1 [[LOADEDV]] to i32
|
|
// NVPTX-NEXT: [[SUB:%.*]] = sub nsw i32 0, [[CONV1]]
|
|
// NVPTX-NEXT: [[TMP4:%.*]] = load i32, ptr [[__MASK]], align 4
|
|
// NVPTX-NEXT: [[TMP5:%.*]] = load i32, ptr [[__X_ADDR]], align 4
|
|
// NVPTX-NEXT: [[TMP6:%.*]] = load i32, ptr [[__IDX_ADDR]], align 4
|
|
// NVPTX-NEXT: [[CALL:%.*]] = call i32 @__gpu_num_lanes() #[[ATTR6]]
|
|
// NVPTX-NEXT: [[TMP7:%.*]] = load i32, ptr [[__WIDTH_ADDR]], align 4
|
|
// NVPTX-NEXT: [[SUB2:%.*]] = sub i32 [[CALL]], [[TMP7]]
|
|
// NVPTX-NEXT: [[SHL3:%.*]] = shl i32 [[SUB2]], 8
|
|
// NVPTX-NEXT: [[OR:%.*]] = or i32 [[SHL3]], 31
|
|
// NVPTX-NEXT: [[TMP8:%.*]] = call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 [[TMP4]], i32 [[TMP5]], i32 [[TMP6]], i32 [[OR]])
|
|
// NVPTX-NEXT: [[AND4:%.*]] = and i32 [[SUB]], [[TMP8]]
|
|
// NVPTX-NEXT: ret i32 [[AND4]]
|
|
//
|
|
//
|
|
// NVPTX-LABEL: define internal i64 @__gpu_first_lane_id(
|
|
// NVPTX-SAME: i64 noundef [[__LANE_MASK:%.*]]) #[[ATTR0]] {
|
|
// NVPTX-NEXT: [[ENTRY:.*:]]
|
|
// NVPTX-NEXT: [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8
|
|
// NVPTX-NEXT: store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR]], align 8
|
|
// NVPTX-NEXT: [[TMP0:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8
|
|
// NVPTX-NEXT: [[TMP1:%.*]] = call i64 @llvm.cttz.i64(i64 [[TMP0]], i1 true)
|
|
// NVPTX-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], 1
|
|
// NVPTX-NEXT: [[ISZERO:%.*]] = icmp eq i64 [[TMP0]], 0
|
|
// NVPTX-NEXT: [[FFS:%.*]] = select i1 [[ISZERO]], i64 0, i64 [[TMP2]]
|
|
// NVPTX-NEXT: [[CAST:%.*]] = trunc i64 [[FFS]] to i32
|
|
// NVPTX-NEXT: [[SUB:%.*]] = sub nsw i32 [[CAST]], 1
|
|
// NVPTX-NEXT: [[CONV:%.*]] = sext i32 [[SUB]] to i64
|
|
// NVPTX-NEXT: ret i64 [[CONV]]
|
|
//
|
|
//
|
|
// NVPTX-LABEL: define internal zeroext i1 @__gpu_is_first_in_lane(
|
|
// NVPTX-SAME: i64 noundef [[__LANE_MASK:%.*]]) #[[ATTR0]] {
|
|
// NVPTX-NEXT: [[ENTRY:.*:]]
|
|
// NVPTX-NEXT: [[__LANE_MASK_ADDR:%.*]] = alloca i64, align 8
|
|
// NVPTX-NEXT: store i64 [[__LANE_MASK]], ptr [[__LANE_MASK_ADDR]], align 8
|
|
// NVPTX-NEXT: [[CALL:%.*]] = call i32 @__gpu_lane_id() #[[ATTR6]]
|
|
// NVPTX-NEXT: [[CONV:%.*]] = zext i32 [[CALL]] to i64
|
|
// NVPTX-NEXT: [[TMP0:%.*]] = load i64, ptr [[__LANE_MASK_ADDR]], align 8
|
|
// NVPTX-NEXT: [[CALL1:%.*]] = call i64 @__gpu_first_lane_id(i64 noundef [[TMP0]]) #[[ATTR6]]
|
|
// NVPTX-NEXT: [[CMP:%.*]] = icmp eq i64 [[CONV]], [[CALL1]]
|
|
// NVPTX-NEXT: ret i1 [[CMP]]
|
|
//
|
|
//
|
|
// NVPTX-LABEL: define internal void @__gpu_exit(
|
|
// NVPTX-SAME: ) #[[ATTR1:[0-9]+]] {
|
|
// NVPTX-NEXT: [[ENTRY:.*:]]
|
|
// NVPTX-NEXT: call void @llvm.nvvm.exit()
|
|
// NVPTX-NEXT: ret void
|
|
//
|
|
//.
// AMDGPU: [[RNG3]] = !{i32 1, i32 0}
// AMDGPU: [[META4]] = !{}
// AMDGPU: [[RNG5]] = !{i16 1, i16 1025}
//.