
This patch is intended to be the first in a series whose end goal is to adapt the atomic optimizer pass to support i64 and f64 operations (along with removing all unnecessary bitcasts). It legalizes 64-bit readlane, writelane and readfirstlane ops pre-ISel. --------- Co-authored-by: vikramRH <vikhegde@amd.com>
1441 lines
71 KiB
LLVM
1441 lines
71 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -mtriple=amdgcn-- -passes='amdgpu-atomic-optimizer,verify<domtree>' %s | FileCheck -check-prefix=IR %s

; Input: a volatile seq_cst `atomicrmw add` of %in whose result is unused, on a
; ptr addrspace(1) address formed from %out plus a constant index of 4 (i32
; elements).
; Expected output: a 64-bit ballot split into two i32 halves and fed through
; mbcnt.lo/mbcnt.hi, %in multiplied by the truncated ctpop of the ballot, and a
; single atomicrmw of that product in a block entered only when the mbcnt
; result equals 0.
define amdgpu_kernel void @atomic_add_i32_offset(ptr addrspace(1) %out, i32 %in) {
; IR-LABEL: @atomic_add_i32_offset(
; IR-NEXT: entry:
; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
; IR: 10:
; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4
; IR-NEXT: br label [[TMP12]]
; IR: 12:
; IR-NEXT: ret void
;
entry:
  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
  %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst
  ret void
}
|
|
|
|
; Input: a volatile seq_cst `atomicrmw add` of %in whose result is unused, on a
; ptr addrspace(1) address formed from %out plus a constant index of -1024
; (i32 elements).
; Expected output: the getelementptr is kept as written, followed by the
; ballot / mbcnt.lo / mbcnt.hi sequence, %in multiplied by the truncated ctpop
; of the ballot, and a single atomicrmw of that product in a block entered
; only when the mbcnt result equals 0.
define amdgpu_kernel void @atomic_add_i32_max_neg_offset(ptr addrspace(1) %out, i32 %in) {
; IR-LABEL: @atomic_add_i32_max_neg_offset(
; IR-NEXT: entry:
; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 -1024
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
; IR: 10:
; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4
; IR-NEXT: br label [[TMP12]]
; IR: 12:
; IR-NEXT: ret void
;
entry:
  %gep = getelementptr i32, ptr addrspace(1) %out, i64 -1024
  %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst
  ret void
}
|
|
|
|
; Input: a volatile seq_cst `atomicrmw add` of %in whose result is unused, on a
; ptr addrspace(1) address formed from %out plus a constant index of 9000
; (i32 elements).
; Expected output: the getelementptr is kept as written, followed by the
; ballot / mbcnt.lo / mbcnt.hi sequence, %in multiplied by the truncated ctpop
; of the ballot, and a single atomicrmw of that product in a block entered
; only when the mbcnt result equals 0.
define amdgpu_kernel void @atomic_add_i32_soffset(ptr addrspace(1) %out, i32 %in) {
; IR-LABEL: @atomic_add_i32_soffset(
; IR-NEXT: entry:
; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 9000
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
; IR: 10:
; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4
; IR-NEXT: br label [[TMP12]]
; IR: 12:
; IR-NEXT: ret void
;
entry:
  %gep = getelementptr i32, ptr addrspace(1) %out, i64 9000
  %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst
  ret void
}
|
|
|
|
; Input: a volatile seq_cst `atomicrmw add` of %in whose result is unused, on a
; ptr addrspace(1) address formed from %out plus a constant index of
; 47224239175595 (i32 elements), a value that does not fit in 32 bits.
; Expected output: the getelementptr is kept as written, followed by the
; ballot / mbcnt.lo / mbcnt.hi sequence, %in multiplied by the truncated ctpop
; of the ballot, and a single atomicrmw of that product in a block entered
; only when the mbcnt result equals 0.
define amdgpu_kernel void @atomic_add_i32_huge_offset(ptr addrspace(1) %out, i32 %in) {
; IR-LABEL: @atomic_add_i32_huge_offset(
; IR-NEXT: entry:
; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 47224239175595
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
; IR: 10:
; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4
; IR-NEXT: br label [[TMP12]]
; IR: 12:
; IR-NEXT: ret void
;
entry:
  %gep = getelementptr i32, ptr addrspace(1) %out, i64 47224239175595

  %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst
  ret void
}
|
|
|
|
; Input: a volatile seq_cst `atomicrmw add` of %in at %out plus a constant
; index of 4 (i32 elements); the returned value is stored to %out2.
; Expected output: the ballot / mbcnt.lo / mbcnt.hi sequence, %in multiplied by
; the truncated ctpop of the ballot, and a single atomicrmw of that product in
; a block entered only when the mbcnt result equals 0. After the join, the
; stored value is rebuilt as: phi (poison from entry, atomic result from the
; guarded block) -> readfirstlane -> add of (%in * mbcnt result).
define amdgpu_kernel void @atomic_add_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
; IR-LABEL: @atomic_add_i32_ret_offset(
; IR-NEXT: entry:
; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
; IR: 10:
; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4
; IR-NEXT: br label [[TMP12]]
; IR: 12:
; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]])
; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
; IR-NEXT: [[TMP16:%.*]] = add i32 [[TMP14]], [[TMP15]]
; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
; IR-NEXT: ret void
;
entry:
  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
  %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst
  store i32 %val, ptr addrspace(1) %out2
  ret void
}
|
|
|
|
; Input: a volatile seq_cst `atomicrmw add` of %in whose result is unused. The
; address is %out indexed by the runtime i64 %index and then by a constant 4
; (both in i32 elements).
; Expected output: both getelementptrs are kept as written, followed by the
; ballot / mbcnt.lo / mbcnt.hi sequence, %in multiplied by the truncated ctpop
; of the ballot, and a single atomicrmw of that product in a block entered
; only when the mbcnt result equals 0.
define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
; IR-LABEL: @atomic_add_i32_addr64_offset(
; IR-NEXT: entry:
; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
; IR: 10:
; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4
; IR-NEXT: br label [[TMP12]]
; IR: 12:
; IR-NEXT: ret void
;
entry:
  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
  %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
  %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst
  ret void
}
|
|
|
|
; Input: a volatile seq_cst `atomicrmw add` of %in at %out indexed by the
; runtime i64 %index and then by a constant 4 (both in i32 elements); the
; returned value is stored to %out2.
; Expected output: both getelementptrs kept, the ballot / mbcnt.lo / mbcnt.hi
; sequence, %in multiplied by the truncated ctpop of the ballot, and a single
; atomicrmw of that product in a block entered only when the mbcnt result
; equals 0. After the join, the stored value is rebuilt as: phi (poison from
; entry, atomic result from the guarded block) -> readfirstlane -> add of
; (%in * mbcnt result).
define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
; IR-LABEL: @atomic_add_i32_ret_addr64_offset(
; IR-NEXT: entry:
; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
; IR: 10:
; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4
; IR-NEXT: br label [[TMP12]]
; IR: 12:
; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]])
; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
; IR-NEXT: [[TMP16:%.*]] = add i32 [[TMP14]], [[TMP15]]
; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
; IR-NEXT: ret void
;
entry:
  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
  %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
  %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst
  store i32 %val, ptr addrspace(1) %out2
  ret void
}
|
|
|
|
; Input: a volatile seq_cst `atomicrmw add` of %in directly on %out (no
; address arithmetic); the result is unused.
; Expected output: the ballot / mbcnt.lo / mbcnt.hi sequence, %in multiplied by
; the truncated ctpop of the ballot, and a single atomicrmw of that product on
; %out in a block entered only when the mbcnt result equals 0.
define amdgpu_kernel void @atomic_add_i32(ptr addrspace(1) %out, i32 %in) {
; IR-LABEL: @atomic_add_i32(
; IR-NEXT: entry:
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
; IR: 10:
; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[OUT:%.*]], i32 [[TMP8]] seq_cst, align 4
; IR-NEXT: br label [[TMP12]]
; IR: 12:
; IR-NEXT: ret void
;
entry:
  %val = atomicrmw volatile add ptr addrspace(1) %out, i32 %in seq_cst
  ret void
}
|
|
|
|
; Input: a volatile seq_cst `atomicrmw add` of %in directly on %out; the
; returned value is stored to %out2.
; Expected output: the ballot / mbcnt.lo / mbcnt.hi sequence, %in multiplied by
; the truncated ctpop of the ballot, and a single atomicrmw of that product in
; a block entered only when the mbcnt result equals 0. After the join, the
; stored value is rebuilt as: phi (poison from entry, atomic result from the
; guarded block) -> readfirstlane -> add of (%in * mbcnt result).
define amdgpu_kernel void @atomic_add_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
; IR-LABEL: @atomic_add_i32_ret(
; IR-NEXT: entry:
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
; IR: 10:
; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[OUT:%.*]], i32 [[TMP8]] seq_cst, align 4
; IR-NEXT: br label [[TMP12]]
; IR: 12:
; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]])
; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
; IR-NEXT: [[TMP16:%.*]] = add i32 [[TMP14]], [[TMP15]]
; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
; IR-NEXT: ret void
;
entry:
  %val = atomicrmw volatile add ptr addrspace(1) %out, i32 %in seq_cst
  store i32 %val, ptr addrspace(1) %out2
  ret void
}
|
|
|
|
; Input: a volatile seq_cst `atomicrmw add` of %in whose result is unused, on
; %out indexed by the runtime i64 %index (i32 elements).
; Expected output: the getelementptr is kept as written, followed by the
; ballot / mbcnt.lo / mbcnt.hi sequence, %in multiplied by the truncated ctpop
; of the ballot, and a single atomicrmw of that product in a block entered
; only when the mbcnt result equals 0.
define amdgpu_kernel void @atomic_add_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
; IR-LABEL: @atomic_add_i32_addr64(
; IR-NEXT: entry:
; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
; IR: 10:
; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[PTR]], i32 [[TMP8]] seq_cst, align 4
; IR-NEXT: br label [[TMP12]]
; IR: 12:
; IR-NEXT: ret void
;
entry:
  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
  %val = atomicrmw volatile add ptr addrspace(1) %ptr, i32 %in seq_cst
  ret void
}
|
|
|
|
; Input: a volatile seq_cst `atomicrmw add` of %in on %out indexed by the
; runtime i64 %index (i32 elements); the returned value is stored to %out2.
; Expected output: the getelementptr kept, the ballot / mbcnt.lo / mbcnt.hi
; sequence, %in multiplied by the truncated ctpop of the ballot, and a single
; atomicrmw of that product in a block entered only when the mbcnt result
; equals 0. After the join, the stored value is rebuilt as: phi (poison from
; entry, atomic result from the guarded block) -> readfirstlane -> add of
; (%in * mbcnt result).
define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
; IR-LABEL: @atomic_add_i32_ret_addr64(
; IR-NEXT: entry:
; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
; IR: 10:
; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[PTR]], i32 [[TMP8]] seq_cst, align 4
; IR-NEXT: br label [[TMP12]]
; IR: 12:
; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]])
; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
; IR-NEXT: [[TMP16:%.*]] = add i32 [[TMP14]], [[TMP15]]
; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
; IR-NEXT: ret void
;
entry:
  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
  %val = atomicrmw volatile add ptr addrspace(1) %ptr, i32 %in seq_cst
  store i32 %val, ptr addrspace(1) %out2
  ret void
}
|
|
|
|
; Input: a volatile seq_cst `atomicrmw and` of %in whose result is unused, on a
; ptr addrspace(1) address formed from %out plus a constant index of 4 (i32
; elements).
; Expected output: the ballot / mbcnt.lo / mbcnt.hi sequence and a single
; atomicrmw in a block entered only when the mbcnt result equals 0. Unlike the
; add tests in this file, no ctpop or mul is expected and the atomicrmw
; operand is %in itself.
define amdgpu_kernel void @atomic_and_i32_offset(ptr addrspace(1) %out, i32 %in) {
; IR-LABEL: @atomic_and_i32_offset(
; IR-NEXT: entry:
; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR: 7:
; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] seq_cst, align 4
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: ret void
;
entry:
  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
  %val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in seq_cst
  ret void
}
|
|
|
|
; Input: a volatile seq_cst `atomicrmw and` of %in at %out plus a constant
; index of 4 (i32 elements); the returned value is stored to %out2.
; Expected output: the ballot / mbcnt.lo / mbcnt.hi sequence and a single
; atomicrmw of %in in a block entered only when the mbcnt result equals 0.
; After the join, the stored value is rebuilt as: phi (poison from entry,
; atomic result from the guarded block) -> readfirstlane -> and with a select
; that yields -1 when the mbcnt result equals 0 and %in otherwise.
define amdgpu_kernel void @atomic_and_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
; IR-LABEL: @atomic_and_i32_ret_offset(
; IR-NEXT: entry:
; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR: 7:
; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] seq_cst, align 4
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -1, i32 [[IN]]
; IR-NEXT: [[TMP13:%.*]] = and i32 [[TMP11]], [[TMP12]]
; IR-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[OUT2:%.*]], align 4
; IR-NEXT: ret void
;
entry:
  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
  %val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in seq_cst
  store i32 %val, ptr addrspace(1) %out2
  ret void
}
|
|
|
|
; Input: a volatile seq_cst `atomicrmw and` of %in whose result is unused. The
; address is %out indexed by the runtime i64 %index and then by a constant 4
; (both in i32 elements).
; Expected output: both getelementptrs kept, the ballot / mbcnt.lo / mbcnt.hi
; sequence, and a single atomicrmw of %in (no ctpop or mul) in a block entered
; only when the mbcnt result equals 0.
define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
; IR-LABEL: @atomic_and_i32_addr64_offset(
; IR-NEXT: entry:
; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR: 7:
; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] seq_cst, align 4
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: ret void
;
entry:
  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
  %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
  %val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in seq_cst
  ret void
}
|
|
|
|
; Input: a volatile seq_cst `atomicrmw and` of %in at %out indexed by the
; runtime i64 %index and then by a constant 4 (both in i32 elements); the
; returned value is stored to %out2.
; Expected output: both getelementptrs kept, the ballot / mbcnt.lo / mbcnt.hi
; sequence, and a single atomicrmw of %in in a block entered only when the
; mbcnt result equals 0. After the join, the stored value is rebuilt as: phi
; (poison from entry, atomic result from the guarded block) -> readfirstlane
; -> and with a select that yields -1 when the mbcnt result equals 0 and %in
; otherwise.
define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
; IR-LABEL: @atomic_and_i32_ret_addr64_offset(
; IR-NEXT: entry:
; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR: 7:
; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] seq_cst, align 4
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -1, i32 [[IN]]
; IR-NEXT: [[TMP13:%.*]] = and i32 [[TMP11]], [[TMP12]]
; IR-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[OUT2:%.*]], align 4
; IR-NEXT: ret void
;
entry:
  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
  %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
  %val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in seq_cst
  store i32 %val, ptr addrspace(1) %out2
  ret void
}
|
|
|
|
; Input: a volatile seq_cst `atomicrmw and` of %in directly on %out (no
; address arithmetic); the result is unused.
; Expected output: the ballot / mbcnt.lo / mbcnt.hi sequence and a single
; atomicrmw of %in on %out (no ctpop or mul) in a block entered only when the
; mbcnt result equals 0.
define amdgpu_kernel void @atomic_and_i32(ptr addrspace(1) %out, i32 %in) {
; IR-LABEL: @atomic_and_i32(
; IR-NEXT: entry:
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR: 7:
; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] seq_cst, align 4
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: ret void
;
entry:
  %val = atomicrmw volatile and ptr addrspace(1) %out, i32 %in seq_cst
  ret void
}
|
|
|
|
; Input: a volatile seq_cst `atomicrmw and` of %in directly on %out; the
; returned value is stored to %out2.
; Expected output: the ballot / mbcnt.lo / mbcnt.hi sequence and a single
; atomicrmw of %in in a block entered only when the mbcnt result equals 0.
; After the join, the stored value is rebuilt as: phi (poison from entry,
; atomic result from the guarded block) -> readfirstlane -> and with a select
; that yields -1 when the mbcnt result equals 0 and %in otherwise.
define amdgpu_kernel void @atomic_and_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
; IR-LABEL: @atomic_and_i32_ret(
; IR-NEXT: entry:
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR: 7:
; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] seq_cst, align 4
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -1, i32 [[IN]]
; IR-NEXT: [[TMP13:%.*]] = and i32 [[TMP11]], [[TMP12]]
; IR-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[OUT2:%.*]], align 4
; IR-NEXT: ret void
;
entry:
  %val = atomicrmw volatile and ptr addrspace(1) %out, i32 %in seq_cst
  store i32 %val, ptr addrspace(1) %out2
  ret void
}
|
|
|
|
; Input: a volatile seq_cst `atomicrmw and` of %in whose result is unused, on
; %out indexed by the runtime i64 %index (i32 elements).
; Expected output: the getelementptr kept, the ballot / mbcnt.lo / mbcnt.hi
; sequence, and a single atomicrmw of %in (no ctpop or mul) in a block entered
; only when the mbcnt result equals 0.
define amdgpu_kernel void @atomic_and_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
; IR-LABEL: @atomic_and_i32_addr64(
; IR-NEXT: entry:
; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR: 7:
; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] seq_cst, align 4
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: ret void
;
entry:
  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
  %val = atomicrmw volatile and ptr addrspace(1) %ptr, i32 %in seq_cst
  ret void
}
|
|
|
|
; Input: a volatile seq_cst `atomicrmw and` of %in on %out indexed by the
; runtime i64 %index (i32 elements); the returned value is stored to %out2.
; Expected output: the getelementptr kept, the ballot / mbcnt.lo / mbcnt.hi
; sequence, and a single atomicrmw of %in in a block entered only when the
; mbcnt result equals 0. After the join, the stored value is rebuilt as: phi
; (poison from entry, atomic result from the guarded block) -> readfirstlane
; -> and with a select that yields -1 when the mbcnt result equals 0 and %in
; otherwise.
define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
; IR-LABEL: @atomic_and_i32_ret_addr64(
; IR-NEXT: entry:
; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR: 7:
; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] seq_cst, align 4
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -1, i32 [[IN]]
; IR-NEXT: [[TMP13:%.*]] = and i32 [[TMP11]], [[TMP12]]
; IR-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[OUT2:%.*]], align 4
; IR-NEXT: ret void
;
entry:
  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
  %val = atomicrmw volatile and ptr addrspace(1) %ptr, i32 %in seq_cst
  store i32 %val, ptr addrspace(1) %out2
  ret void
}
|
|
|
|
; Input: a volatile seq_cst `atomicrmw sub` of %in whose result is unused, on a
; ptr addrspace(1) address formed from %out plus a constant index of 4 (i32
; elements).
; Expected output: the ballot / mbcnt.lo / mbcnt.hi sequence, %in multiplied by
; the truncated ctpop of the ballot, and a single `atomicrmw sub` of that
; product in a block entered only when the mbcnt result equals 0.
define amdgpu_kernel void @atomic_sub_i32_offset(ptr addrspace(1) %out, i32 %in) {
; IR-LABEL: @atomic_sub_i32_offset(
; IR-NEXT: entry:
; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
; IR: 10:
; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4
; IR-NEXT: br label [[TMP12]]
; IR: 12:
; IR-NEXT: ret void
;
entry:
  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
  %val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in seq_cst
  ret void
}
|
|
|
|
; Input: a volatile seq_cst `atomicrmw sub` of %in at %out plus a constant
; index of 4 (i32 elements); the returned value is stored to %out2.
; Expected output: the ballot / mbcnt.lo / mbcnt.hi sequence, %in multiplied by
; the truncated ctpop of the ballot, and a single `atomicrmw sub` of that
; product in a block entered only when the mbcnt result equals 0. After the
; join, the stored value is rebuilt as: phi (poison from entry, atomic result
; from the guarded block) -> readfirstlane -> sub of (%in * mbcnt result).
define amdgpu_kernel void @atomic_sub_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
; IR-LABEL: @atomic_sub_i32_ret_offset(
; IR-NEXT: entry:
; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
; IR: 10:
; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4
; IR-NEXT: br label [[TMP12]]
; IR: 12:
; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]])
; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
; IR-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]]
; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
; IR-NEXT: ret void
;
entry:
  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
  %val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in seq_cst
  store i32 %val, ptr addrspace(1) %out2
  ret void
}
|
|
|
|
; Input: a volatile seq_cst `atomicrmw sub` of %in whose result is unused. The
; address is %out indexed by the runtime i64 %index and then by a constant 4
; (both in i32 elements).
; Expected output: both getelementptrs kept, the ballot / mbcnt.lo / mbcnt.hi
; sequence, %in multiplied by the truncated ctpop of the ballot, and a single
; `atomicrmw sub` of that product in a block entered only when the mbcnt
; result equals 0.
define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
; IR-LABEL: @atomic_sub_i32_addr64_offset(
; IR-NEXT: entry:
; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
; IR: 10:
; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4
; IR-NEXT: br label [[TMP12]]
; IR: 12:
; IR-NEXT: ret void
;
entry:
  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
  %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
  %val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in seq_cst
  ret void
}
|
|
|
|
; `atomicrmw sub` on %out[%index] advanced by four i32 elements, with the old
; value stored to %out2. The checks expect the combined atomic (operand scaled
; by the ballot popcount) on the mbcnt == 0 path, then a readfirstlane of the
; result from which %in * mbcnt is subtracted before the store.
define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
; IR-LABEL: @atomic_sub_i32_ret_addr64_offset(
; IR-NEXT:  entry:
; IR-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4
; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT:    [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
; IR-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
; IR-NEXT:    [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
; IR-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT:    br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
; IR:       10:
; IR-NEXT:    [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4
; IR-NEXT:    br label [[TMP12]]
; IR:       12:
; IR-NEXT:    [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
; IR-NEXT:    [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]])
; IR-NEXT:    [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
; IR-NEXT:    [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]]
; IR-NEXT:    store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
; IR-NEXT:    ret void
;
entry:
  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
  %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
  %val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in seq_cst
  store i32 %val, ptr addrspace(1) %out2
  ret void
}
|
|
|
|
; Result-unused `atomicrmw sub` directly on %out. The checks expect %in to be
; multiplied by the popcount of the ballot mask and the atomic to be issued
; only on the path where the mbcnt result equals 0.
define amdgpu_kernel void @atomic_sub_i32(ptr addrspace(1) %out, i32 %in) {
; IR-LABEL: @atomic_sub_i32(
; IR-NEXT:  entry:
; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT:    [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
; IR-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
; IR-NEXT:    [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
; IR-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT:    br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
; IR:       10:
; IR-NEXT:    [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[OUT:%.*]], i32 [[TMP8]] seq_cst, align 4
; IR-NEXT:    br label [[TMP12]]
; IR:       12:
; IR-NEXT:    ret void
;
entry:
  %val = atomicrmw volatile sub ptr addrspace(1) %out, i32 %in seq_cst
  ret void
}
|
|
|
|
; `atomicrmw sub` directly on %out, with the old value stored to %out2. The
; checks expect the combined atomic (operand scaled by the ballot popcount) on
; the mbcnt == 0 path, then a readfirstlane of the result from which
; %in * mbcnt is subtracted before the store.
define amdgpu_kernel void @atomic_sub_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
; IR-LABEL: @atomic_sub_i32_ret(
; IR-NEXT:  entry:
; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT:    [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
; IR-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
; IR-NEXT:    [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
; IR-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT:    br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
; IR:       10:
; IR-NEXT:    [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[OUT:%.*]], i32 [[TMP8]] seq_cst, align 4
; IR-NEXT:    br label [[TMP12]]
; IR:       12:
; IR-NEXT:    [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
; IR-NEXT:    [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]])
; IR-NEXT:    [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
; IR-NEXT:    [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]]
; IR-NEXT:    store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
; IR-NEXT:    ret void
;
entry:
  %val = atomicrmw volatile sub ptr addrspace(1) %out, i32 %in seq_cst
  store i32 %val, ptr addrspace(1) %out2
  ret void
}
|
|
|
|
; Result-unused `atomicrmw sub` on %out[%index]. The checks expect %in to be
; multiplied by the popcount of the ballot mask and the atomic to be issued
; only on the path where the mbcnt result equals 0.
define amdgpu_kernel void @atomic_sub_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
; IR-LABEL: @atomic_sub_i32_addr64(
; IR-NEXT:  entry:
; IR-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT:    [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
; IR-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
; IR-NEXT:    [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
; IR-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT:    br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
; IR:       10:
; IR-NEXT:    [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[PTR]], i32 [[TMP8]] seq_cst, align 4
; IR-NEXT:    br label [[TMP12]]
; IR:       12:
; IR-NEXT:    ret void
;
entry:
  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
  %val = atomicrmw volatile sub ptr addrspace(1) %ptr, i32 %in seq_cst
  ret void
}
|
|
|
|
; `atomicrmw sub` on %out[%index], with the old value stored to %out2. The
; checks expect the combined atomic (operand scaled by the ballot popcount) on
; the mbcnt == 0 path, then a readfirstlane of the result from which
; %in * mbcnt is subtracted before the store.
define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
; IR-LABEL: @atomic_sub_i32_ret_addr64(
; IR-NEXT:  entry:
; IR-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT:    [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
; IR-NEXT:    [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
; IR-NEXT:    [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
; IR-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT:    br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
; IR:       10:
; IR-NEXT:    [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[PTR]], i32 [[TMP8]] seq_cst, align 4
; IR-NEXT:    br label [[TMP12]]
; IR:       12:
; IR-NEXT:    [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
; IR-NEXT:    [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]])
; IR-NEXT:    [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
; IR-NEXT:    [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]]
; IR-NEXT:    store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
; IR-NEXT:    ret void
;
entry:
  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
  %val = atomicrmw volatile sub ptr addrspace(1) %ptr, i32 %in seq_cst
  store i32 %val, ptr addrspace(1) %out2
  ret void
}
|
|
|
|
; Result-unused signed `atomicrmw max` on %out advanced by four i32 elements,
; with no explicit syncscope. The checks expect the operand to be passed
; through unchanged and the atomic to be issued only on the path where the
; mbcnt result equals 0.
define amdgpu_kernel void @atomic_max_i32_offset(ptr addrspace(1) %out, i32 %in) {
; IR-LABEL: @atomic_max_i32_offset(
; IR-NEXT:  entry:
; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4
; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR:       7:
; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] seq_cst, align 4
; IR-NEXT:    br label [[TMP9]]
; IR:       9:
; IR-NEXT:    ret void
;
entry:
  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
  %val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in seq_cst
  ret void
}
|
|
|
|
; Workgroup-scope signed `atomicrmw max` on %out advanced by four i32
; elements, with the old value stored to %out2. The checks expect the atomic on
; the mbcnt == 0 path, a readfirstlane of its result, and an icmp sgt/select
; against %in (or -2147483648 where mbcnt == 0) before the store.
define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
; IR-LABEL: @atomic_max_i32_ret_offset(
; IR-NEXT:  entry:
; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4
; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR:       7:
; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
; IR-NEXT:    br label [[TMP9]]
; IR:       9:
; IR-NEXT:    [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT:    [[TMP12:%.*]] = select i1 [[TMP6]], i32 -2147483648, i32 [[IN]]
; IR-NEXT:    [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
; IR-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
; IR-NEXT:    store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4
; IR-NEXT:    ret void
;
entry:
  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
  %val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
  store i32 %val, ptr addrspace(1) %out2
  ret void
}
|
|
|
|
; Result-unused workgroup-scope signed `atomicrmw max` on %out[%index]
; advanced by four i32 elements. The checks expect the operand to be passed
; through unchanged and the atomic to be issued only on the path where the
; mbcnt result equals 0.
define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
; IR-LABEL: @atomic_max_i32_addr64_offset(
; IR-NEXT:  entry:
; IR-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4
; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR:       7:
; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
; IR-NEXT:    br label [[TMP9]]
; IR:       9:
; IR-NEXT:    ret void
;
entry:
  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
  %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
  %val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
  ret void
}
|
|
|
|
; Workgroup-scope signed `atomicrmw max` on %out[%index] advanced by four i32
; elements, with the old value stored to %out2. The checks expect the atomic on
; the mbcnt == 0 path, a readfirstlane of its result, and an icmp sgt/select
; against %in (or -2147483648 where mbcnt == 0) before the store.
define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
; IR-LABEL: @atomic_max_i32_ret_addr64_offset(
; IR-NEXT:  entry:
; IR-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4
; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR:       7:
; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
; IR-NEXT:    br label [[TMP9]]
; IR:       9:
; IR-NEXT:    [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT:    [[TMP12:%.*]] = select i1 [[TMP6]], i32 -2147483648, i32 [[IN]]
; IR-NEXT:    [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
; IR-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
; IR-NEXT:    store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4
; IR-NEXT:    ret void
;
entry:
  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
  %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
  %val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
  store i32 %val, ptr addrspace(1) %out2
  ret void
}
|
|
|
|
; Result-unused workgroup-scope signed `atomicrmw max` directly on %out. The
; checks expect the operand to be passed through unchanged and the atomic to be
; issued only on the path where the mbcnt result equals 0.
define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, i32 %in) {
; IR-LABEL: @atomic_max_i32(
; IR-NEXT:  entry:
; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR:       7:
; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
; IR-NEXT:    br label [[TMP9]]
; IR:       9:
; IR-NEXT:    ret void
;
entry:
  %val = atomicrmw volatile max ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
  ret void
}
|
|
|
|
; Workgroup-scope signed `atomicrmw max` directly on %out, with the old value
; stored to %out2. The checks expect the atomic on the mbcnt == 0 path, a
; readfirstlane of its result, and an icmp sgt/select against %in (or
; -2147483648 where mbcnt == 0) before the store.
define amdgpu_kernel void @atomic_max_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
; IR-LABEL: @atomic_max_i32_ret(
; IR-NEXT:  entry:
; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR:       7:
; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
; IR-NEXT:    br label [[TMP9]]
; IR:       9:
; IR-NEXT:    [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT:    [[TMP12:%.*]] = select i1 [[TMP6]], i32 -2147483648, i32 [[IN]]
; IR-NEXT:    [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
; IR-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
; IR-NEXT:    store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4
; IR-NEXT:    ret void
;
entry:
  %val = atomicrmw volatile max ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
  store i32 %val, ptr addrspace(1) %out2
  ret void
}
|
|
|
|
; Result-unused workgroup-scope signed `atomicrmw max` on %out[%index]. The
; checks expect the operand to be passed through unchanged and the atomic to be
; issued only on the path where the mbcnt result equals 0.
define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
; IR-LABEL: @atomic_max_i32_addr64(
; IR-NEXT:  entry:
; IR-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR:       7:
; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
; IR-NEXT:    br label [[TMP9]]
; IR:       9:
; IR-NEXT:    ret void
;
entry:
  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
  %val = atomicrmw volatile max ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst
  ret void
}
|
|
|
|
; Workgroup-scope signed `atomicrmw max` on %out[%index], with the old value
; stored to %out2. The checks expect the atomic on the mbcnt == 0 path, a
; readfirstlane of its result, and an icmp sgt/select against %in (or
; -2147483648 where mbcnt == 0) before the store.
define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
; IR-LABEL: @atomic_max_i32_ret_addr64(
; IR-NEXT:  entry:
; IR-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR:       7:
; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
; IR-NEXT:    br label [[TMP9]]
; IR:       9:
; IR-NEXT:    [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT:    [[TMP12:%.*]] = select i1 [[TMP6]], i32 -2147483648, i32 [[IN]]
; IR-NEXT:    [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
; IR-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
; IR-NEXT:    store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4
; IR-NEXT:    ret void
;
entry:
  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
  %val = atomicrmw volatile max ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst
  store i32 %val, ptr addrspace(1) %out2
  ret void
}
|
|
|
|
; Result-unused workgroup-scope `atomicrmw umax` on %out advanced by four i32
; elements. The checks expect the operand to be passed through unchanged and
; the atomic to be issued only on the path where the mbcnt result equals 0.
define amdgpu_kernel void @atomic_umax_i32_offset(ptr addrspace(1) %out, i32 %in) {
; IR-LABEL: @atomic_umax_i32_offset(
; IR-NEXT:  entry:
; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4
; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR:       7:
; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
; IR-NEXT:    br label [[TMP9]]
; IR:       9:
; IR-NEXT:    ret void
;
entry:
  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
  %val = atomicrmw volatile umax ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
  ret void
}
|
|
|
|
; Workgroup-scope `atomicrmw umax` on %out advanced by four i32 elements, with
; the old value stored to %out2. The checks expect the atomic on the
; mbcnt == 0 path, a readfirstlane of its result, and an icmp ugt/select
; against %in (or 0 where mbcnt == 0) before the store.
define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
; IR-LABEL: @atomic_umax_i32_ret_offset(
; IR-NEXT:  entry:
; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4
; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR:       7:
; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
; IR-NEXT:    br label [[TMP9]]
; IR:       9:
; IR-NEXT:    [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT:    [[TMP12:%.*]] = select i1 [[TMP6]], i32 0, i32 [[IN]]
; IR-NEXT:    [[TMP13:%.*]] = icmp ugt i32 [[TMP11]], [[TMP12]]
; IR-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
; IR-NEXT:    store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4
; IR-NEXT:    ret void
;
entry:
  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
  %val = atomicrmw volatile umax ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
  store i32 %val, ptr addrspace(1) %out2
  ret void
}
|
|
|
|
; Result-unused workgroup-scope `atomicrmw umax` on %out[%index] advanced by
; four i32 elements. The checks expect the operand to be passed through
; unchanged and the atomic to be issued only on the path where the mbcnt
; result equals 0.
define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
; IR-LABEL: @atomic_umax_i32_addr64_offset(
; IR-NEXT:  entry:
; IR-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4
; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR:       7:
; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
; IR-NEXT:    br label [[TMP9]]
; IR:       9:
; IR-NEXT:    ret void
;
entry:
  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
  %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
  %val = atomicrmw volatile umax ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
  ret void
}
|
|
|
|
; Workgroup-scope `atomicrmw umax` on %out[%index] advanced by four i32
; elements, with the old value stored to %out2. The checks expect the atomic on
; the mbcnt == 0 path, a readfirstlane of its result, and an icmp ugt/select
; against %in (or 0 where mbcnt == 0) before the store.
define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
; IR-LABEL: @atomic_umax_i32_ret_addr64_offset(
; IR-NEXT:  entry:
; IR-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4
; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR:       7:
; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
; IR-NEXT:    br label [[TMP9]]
; IR:       9:
; IR-NEXT:    [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT:    [[TMP12:%.*]] = select i1 [[TMP6]], i32 0, i32 [[IN]]
; IR-NEXT:    [[TMP13:%.*]] = icmp ugt i32 [[TMP11]], [[TMP12]]
; IR-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
; IR-NEXT:    store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4
; IR-NEXT:    ret void
;
entry:
  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
  %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
  %val = atomicrmw volatile umax ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
  store i32 %val, ptr addrspace(1) %out2
  ret void
}
|
|
|
|
; Result-unused workgroup-scope `atomicrmw umax` directly on %out. The checks
; expect the operand to be passed through unchanged and the atomic to be issued
; only on the path where the mbcnt result equals 0.
define amdgpu_kernel void @atomic_umax_i32(ptr addrspace(1) %out, i32 %in) {
; IR-LABEL: @atomic_umax_i32(
; IR-NEXT:  entry:
; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR:       7:
; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
; IR-NEXT:    br label [[TMP9]]
; IR:       9:
; IR-NEXT:    ret void
;
entry:
  %val = atomicrmw volatile umax ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
  ret void
}
|
|
|
|
; Workgroup-scope `atomicrmw umax` directly on %out, with the old value stored
; to %out2. The checks expect the atomic on the mbcnt == 0 path, a
; readfirstlane of its result, and an icmp ugt/select against %in (or 0 where
; mbcnt == 0) before the store.
define amdgpu_kernel void @atomic_umax_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
; IR-LABEL: @atomic_umax_i32_ret(
; IR-NEXT:  entry:
; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR:       7:
; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
; IR-NEXT:    br label [[TMP9]]
; IR:       9:
; IR-NEXT:    [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT:    [[TMP12:%.*]] = select i1 [[TMP6]], i32 0, i32 [[IN]]
; IR-NEXT:    [[TMP13:%.*]] = icmp ugt i32 [[TMP11]], [[TMP12]]
; IR-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
; IR-NEXT:    store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4
; IR-NEXT:    ret void
;
entry:
  %val = atomicrmw volatile umax ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
  store i32 %val, ptr addrspace(1) %out2
  ret void
}
|
|
|
|
; Result-unused workgroup-scope `atomicrmw umax` on %out[%index]. The checks
; expect the operand to be passed through unchanged and the atomic to be issued
; only on the path where the mbcnt result equals 0.
define amdgpu_kernel void @atomic_umax_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
; IR-LABEL: @atomic_umax_i32_addr64(
; IR-NEXT:  entry:
; IR-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR:       7:
; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
; IR-NEXT:    br label [[TMP9]]
; IR:       9:
; IR-NEXT:    ret void
;
entry:
  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
  %val = atomicrmw volatile umax ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst
  ret void
}
|
|
|
|
; Workgroup-scope `atomicrmw umax` on %out[%index], with the old value stored
; to %out2. The checks expect the atomic on the mbcnt == 0 path, a
; readfirstlane of its result, and an icmp ugt/select against %in (or 0 where
; mbcnt == 0) before the store.
define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
; IR-LABEL: @atomic_umax_i32_ret_addr64(
; IR-NEXT:  entry:
; IR-NEXT:    [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR:       7:
; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
; IR-NEXT:    br label [[TMP9]]
; IR:       9:
; IR-NEXT:    [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
; IR-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT:    [[TMP12:%.*]] = select i1 [[TMP6]], i32 0, i32 [[IN]]
; IR-NEXT:    [[TMP13:%.*]] = icmp ugt i32 [[TMP11]], [[TMP12]]
; IR-NEXT:    [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
; IR-NEXT:    store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4
; IR-NEXT:    ret void
;
entry:
  %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
  %val = atomicrmw volatile umax ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst
  store i32 %val, ptr addrspace(1) %out2
  ret void
}
|
|
|
|
; Result-unused workgroup-scope signed `atomicrmw min` on %out advanced by
; four i32 elements. The checks expect the operand to be passed through
; unchanged and the atomic to be issued only on the path where the mbcnt
; result equals 0.
define amdgpu_kernel void @atomic_min_i32_offset(ptr addrspace(1) %out, i32 %in) {
; IR-LABEL: @atomic_min_i32_offset(
; IR-NEXT:  entry:
; IR-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4
; IR-NEXT:    [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT:    [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT:    [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT:    br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR:       7:
; IR-NEXT:    [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
; IR-NEXT:    br label [[TMP9]]
; IR:       9:
; IR-NEXT:    ret void
;
entry:
  %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
  %val = atomicrmw volatile min ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
  ret void
}
|
|
|
|
; Returning signed min at a constant offset: a volatile, workgroup-scope,
; seq_cst `atomicrmw min` of the kernel argument %in at %out + 4 i32 elements,
; with the old value stored to %out2.
; Expected output: the atomic is branched around unless the mbcnt.lo/mbcnt.hi
; count over the ballot mask is 0. After the join block the atomic's result is
; passed through readfirstlane and combined (icmp slt + select, i.e. signed
; min) with 2147483647 (the largest i32, so the min leaves the value unchanged)
; in the lane that took the atomic branch and with %in elsewhere.
define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
|
|
; IR-LABEL: @atomic_min_i32_ret_offset(
|
|
; IR-NEXT: entry:
|
|
; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4
|
|
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
|
|
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
|
|
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
|
|
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
|
|
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
|
|
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
|
|
; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
|
|
; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
|
|
; IR: 7:
|
|
; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
|
|
; IR-NEXT: br label [[TMP9]]
|
|
; IR: 9:
|
|
; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
|
|
; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
|
|
; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 2147483647, i32 [[IN]]
|
|
; IR-NEXT: [[TMP13:%.*]] = icmp slt i32 [[TMP11]], [[TMP12]]
|
|
; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
|
|
; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4
|
|
; IR-NEXT: ret void
|
|
;
|
|
entry:
|
|
%gep = getelementptr i32, ptr addrspace(1) %out, i64 4
|
|
%val = atomicrmw volatile min ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
|
|
store i32 %val, ptr addrspace(1) %out2
|
|
ret void
|
|
}
|
|
|
|
; Non-returning signed min on a dynamically indexed address plus a constant
; offset: a volatile, workgroup-scope, seq_cst `atomicrmw min` of the kernel
; argument %in at %out[%index] + 4 i32 elements. %val has no users.
; Expected output: both address computations stay in the entry block ahead of
; the ballot/mbcnt sequence; the atomic is kept unchanged but moved into a
; block that is only entered when the mbcnt.lo/mbcnt.hi count over the ballot
; mask is 0. The join block contains nothing but the return.
define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
|
|
; IR-LABEL: @atomic_min_i32_addr64_offset(
|
|
; IR-NEXT: entry:
|
|
; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
|
|
; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4
|
|
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
|
|
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
|
|
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
|
|
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
|
|
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
|
|
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
|
|
; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
|
|
; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
|
|
; IR: 7:
|
|
; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
|
|
; IR-NEXT: br label [[TMP9]]
|
|
; IR: 9:
|
|
; IR-NEXT: ret void
|
|
;
|
|
entry:
|
|
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
|
|
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
|
|
%val = atomicrmw volatile min ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
|
|
ret void
|
|
}
|
|
|
|
; Returning signed min on a dynamically indexed address plus a constant
; offset: a volatile, workgroup-scope, seq_cst `atomicrmw min` of the kernel
; argument %in at %out[%index] + 4 i32 elements, with the old value stored to
; %out2.
; Expected output: the atomic is branched around unless the mbcnt.lo/mbcnt.hi
; count over the ballot mask is 0. After the join block the atomic's result is
; passed through readfirstlane and combined (icmp slt + select, i.e. signed
; min) with 2147483647 (the largest i32, so the min leaves the value unchanged)
; in the lane that took the atomic branch and with %in elsewhere.
define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
|
|
; IR-LABEL: @atomic_min_i32_ret_addr64_offset(
|
|
; IR-NEXT: entry:
|
|
; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
|
|
; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4
|
|
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
|
|
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
|
|
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
|
|
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
|
|
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
|
|
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
|
|
; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
|
|
; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
|
|
; IR: 7:
|
|
; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
|
|
; IR-NEXT: br label [[TMP9]]
|
|
; IR: 9:
|
|
; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
|
|
; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
|
|
; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 2147483647, i32 [[IN]]
|
|
; IR-NEXT: [[TMP13:%.*]] = icmp slt i32 [[TMP11]], [[TMP12]]
|
|
; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
|
|
; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4
|
|
; IR-NEXT: ret void
|
|
;
|
|
entry:
|
|
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
|
|
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
|
|
%val = atomicrmw volatile min ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
|
|
store i32 %val, ptr addrspace(1) %out2
|
|
ret void
|
|
}
|
|
|
|
; Non-returning signed min directly on the pointer argument: a volatile,
; workgroup-scope, seq_cst `atomicrmw min` of the kernel argument %in at %out,
; with no address arithmetic. %val has no users.
; Expected output: the atomic is kept unchanged but moved into a block that is
; only entered when the mbcnt.lo/mbcnt.hi count over the ballot mask is 0; the
; join block contains nothing but the return.
define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) {
|
|
; IR-LABEL: @atomic_min_i32(
|
|
; IR-NEXT: entry:
|
|
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
|
|
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
|
|
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
|
|
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
|
|
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
|
|
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
|
|
; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
|
|
; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
|
|
; IR: 7:
|
|
; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
|
|
; IR-NEXT: br label [[TMP9]]
|
|
; IR: 9:
|
|
; IR-NEXT: ret void
|
|
;
|
|
entry:
|
|
%val = atomicrmw volatile min ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
|
|
ret void
|
|
}
|
|
|
|
; Returning signed min directly on the pointer argument: a volatile,
; workgroup-scope, seq_cst `atomicrmw min` of the kernel argument %in at %out,
; with the old value stored to %out2.
; Expected output: the atomic is branched around unless the mbcnt.lo/mbcnt.hi
; count over the ballot mask is 0. After the join block the atomic's result is
; passed through readfirstlane and combined (icmp slt + select, i.e. signed
; min) with 2147483647 (the largest i32, so the min leaves the value unchanged)
; in the lane that took the atomic branch and with %in elsewhere.
define amdgpu_kernel void @atomic_min_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
|
|
; IR-LABEL: @atomic_min_i32_ret(
|
|
; IR-NEXT: entry:
|
|
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
|
|
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
|
|
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
|
|
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
|
|
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
|
|
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
|
|
; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
|
|
; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
|
|
; IR: 7:
|
|
; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
|
|
; IR-NEXT: br label [[TMP9]]
|
|
; IR: 9:
|
|
; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
|
|
; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
|
|
; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 2147483647, i32 [[IN]]
|
|
; IR-NEXT: [[TMP13:%.*]] = icmp slt i32 [[TMP11]], [[TMP12]]
|
|
; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
|
|
; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4
|
|
; IR-NEXT: ret void
|
|
;
|
|
entry:
|
|
%val = atomicrmw volatile min ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
|
|
store i32 %val, ptr addrspace(1) %out2
|
|
ret void
|
|
}
|
|
|
|
; Non-returning signed min on a dynamically indexed address: a volatile,
; workgroup-scope, seq_cst `atomicrmw min` of the kernel argument %in at
; %out[%index]. %val has no users.
; Expected output: the address computation stays in the entry block ahead of
; the ballot/mbcnt sequence; the atomic is kept unchanged but moved into a
; block that is only entered when the mbcnt.lo/mbcnt.hi count over the ballot
; mask is 0. The join block contains nothing but the return.
define amdgpu_kernel void @atomic_min_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
|
|
; IR-LABEL: @atomic_min_i32_addr64(
|
|
; IR-NEXT: entry:
|
|
; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
|
|
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
|
|
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
|
|
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
|
|
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
|
|
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
|
|
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
|
|
; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
|
|
; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
|
|
; IR: 7:
|
|
; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
|
|
; IR-NEXT: br label [[TMP9]]
|
|
; IR: 9:
|
|
; IR-NEXT: ret void
|
|
;
|
|
entry:
|
|
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
|
|
%val = atomicrmw volatile min ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst
|
|
ret void
|
|
}
|
|
|
|
; Returning signed min on a dynamically indexed address: a volatile,
; workgroup-scope, seq_cst `atomicrmw min` of the kernel argument %in at
; %out[%index], with the old value stored to %out2.
; Expected output: the atomic is branched around unless the mbcnt.lo/mbcnt.hi
; count over the ballot mask is 0. After the join block the atomic's result is
; passed through readfirstlane and combined (icmp slt + select, i.e. signed
; min) with 2147483647 (the largest i32, so the min leaves the value unchanged)
; in the lane that took the atomic branch and with %in elsewhere.
define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
|
|
; IR-LABEL: @atomic_min_i32_ret_addr64(
|
|
; IR-NEXT: entry:
|
|
; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
|
|
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
|
|
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
|
|
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
|
|
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
|
|
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
|
|
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
|
|
; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
|
|
; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
|
|
; IR: 7:
|
|
; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
|
|
; IR-NEXT: br label [[TMP9]]
|
|
; IR: 9:
|
|
; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
|
|
; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
|
|
; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 2147483647, i32 [[IN]]
|
|
; IR-NEXT: [[TMP13:%.*]] = icmp slt i32 [[TMP11]], [[TMP12]]
|
|
; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
|
|
; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4
|
|
; IR-NEXT: ret void
|
|
;
|
|
entry:
|
|
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
|
|
%val = atomicrmw volatile min ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst
|
|
store i32 %val, ptr addrspace(1) %out2
|
|
ret void
|
|
}
|