llvm-project/llvm/test/CodeGen/AMDGPU/global-atomic-scan.ll
Vikram Hegde 5feb32ba92
[AMDGPU] Extend readlane, writelane and readfirstlane intrinsic lowering for generic types (#89217)
This patch is intended to be the first of a series whose end goal is to
adapt the atomic optimizer pass to support i64 and f64 operations (along
with removing all unnecessary bitcasts). It legalizes 64-bit readlane,
writelane and readfirstlane ops pre-ISel.

---------

Co-authored-by: vikramRH <vikhegde@amd.com>
2024-06-25 14:35:19 +05:30

1441 lines
71 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -mtriple=amdgcn-- -passes='amdgpu-atomic-optimizer,verify<domtree>' %s | FileCheck -check-prefix=IR %s
; Test input: a volatile seq_cst `atomicrmw add` of %in to the i32 at element
; index 4 from the global (addrspace(1)) pointer %out. The returned %val is
; never used.
; Expected output, per the checks below: ballot.i64(true) is split into two
; i32 halves and fed through mbcnt.lo/mbcnt.hi, %in is multiplied by the
; (truncated) ctpop of the ballot, and one atomicrmw add of that product sits
; in a block that is entered only when the mbcnt result equals 0.
; NOTE(review): presumably this makes a single lane issue the add on behalf of
; all active lanes -- confirm against the atomic optimizer pass.
define amdgpu_kernel void @atomic_add_i32_offset(ptr addrspace(1) %out, i32 %in) {
; IR-LABEL: @atomic_add_i32_offset(
; IR-NEXT: entry:
; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
; IR: 10:
; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4
; IR-NEXT: br label [[TMP12]]
; IR: 12:
; IR-NEXT: ret void
;
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i64 4
%val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst
ret void
}
; Test input: a volatile seq_cst `atomicrmw add` of %in to the i32 at element
; index -1024 (a negative offset) from the global pointer %out; %val is unused.
; Expected output, per the checks below: the getelementptr is kept as written,
; followed by the ballot.i64(true) / mbcnt.lo / mbcnt.hi sequence, a multiply
; of %in by the truncated ctpop of the ballot, and a single atomicrmw add of
; that product in a block guarded by "mbcnt result == 0".
define amdgpu_kernel void @atomic_add_i32_max_neg_offset(ptr addrspace(1) %out, i32 %in) {
; IR-LABEL: @atomic_add_i32_max_neg_offset(
; IR-NEXT: entry:
; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 -1024
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
; IR: 10:
; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4
; IR-NEXT: br label [[TMP12]]
; IR: 12:
; IR-NEXT: ret void
;
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i64 -1024
%val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst
ret void
}
; Test input: a volatile seq_cst `atomicrmw add` of %in to the i32 at element
; index 9000 from the global pointer %out; %val is unused.
; NOTE(review): the "soffset" name suggests the offset is meant to be too
; large for an immediate field at instruction selection -- not visible in this
; opt-only test, confirm if it matters.
; Expected output, per the checks below: unchanged getelementptr, then the
; ballot.i64(true) / mbcnt.lo / mbcnt.hi sequence, %in multiplied by the
; truncated ctpop of the ballot, and one atomicrmw add of the product in a
; block entered only when the mbcnt result is 0.
define amdgpu_kernel void @atomic_add_i32_soffset(ptr addrspace(1) %out, i32 %in) {
; IR-LABEL: @atomic_add_i32_soffset(
; IR-NEXT: entry:
; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 9000
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
; IR: 10:
; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4
; IR-NEXT: br label [[TMP12]]
; IR: 12:
; IR-NEXT: ret void
;
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i64 9000
%val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst
ret void
}
; Test input: a volatile seq_cst `atomicrmw add` of %in to the i32 at element
; index 47224239175595 (a constant that does not fit in 32 bits) from the
; global pointer %out; %val is unused.
; Expected output, per the checks below: the getelementptr keeps the same
; constant, followed by the ballot.i64(true) / mbcnt.lo / mbcnt.hi sequence,
; %in multiplied by the truncated ctpop of the ballot, and a single atomicrmw
; add of the product in a block guarded by "mbcnt result == 0".
define amdgpu_kernel void @atomic_add_i32_huge_offset(ptr addrspace(1) %out, i32 %in) {
; IR-LABEL: @atomic_add_i32_huge_offset(
; IR-NEXT: entry:
; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 47224239175595
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
; IR: 10:
; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4
; IR-NEXT: br label [[TMP12]]
; IR: 12:
; IR-NEXT: ret void
;
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i64 47224239175595
%val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst
ret void
}
; Test input: a volatile seq_cst `atomicrmw add` of %in to the i32 at element
; index 4 from the global pointer %out, whose result %val is then stored to
; %out2 (the "ret" variant: the atomic's return value is used).
; Expected output, per the checks below: the same ballot / mbcnt / ctpop /
; multiply / guarded-atomic shape as the non-returning add tests, plus a
; reconstruction of the per-lane result in the join block:
;   - a phi merges poison (from entry) with the atomic's result,
;   - readfirstlane is applied to the phi,
;   - %in multiplied by the mbcnt result is added to that value,
;   - the sum is what gets stored to %out2.
define amdgpu_kernel void @atomic_add_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
; IR-LABEL: @atomic_add_i32_ret_offset(
; IR-NEXT: entry:
; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
; IR: 10:
; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4
; IR-NEXT: br label [[TMP12]]
; IR: 12:
; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]])
; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
; IR-NEXT: [[TMP16:%.*]] = add i32 [[TMP14]], [[TMP15]]
; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
; IR-NEXT: ret void
;
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i64 4
%val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst
store i32 %val, ptr addrspace(1) %out2
ret void
}
; Test input: the address is formed by two getelementptrs -- %out indexed by
; the runtime i64 %index, then a further constant element index of 4 -- and a
; volatile seq_cst `atomicrmw add` of %in is applied there; %val is unused.
; Expected output, per the checks below: both getelementptrs are kept, then
; the ballot.i64(true) / mbcnt.lo / mbcnt.hi sequence, %in multiplied by the
; truncated ctpop of the ballot, and a single atomicrmw add of the product in
; a block entered only when the mbcnt result is 0.
define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
; IR-LABEL: @atomic_add_i32_addr64_offset(
; IR-NEXT: entry:
; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
; IR: 10:
; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4
; IR-NEXT: br label [[TMP12]]
; IR: 12:
; IR-NEXT: ret void
;
entry:
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
%val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst
ret void
}
; Test input: the address is %out indexed by the runtime i64 %index plus a
; constant element index of 4; a volatile seq_cst `atomicrmw add` of %in is
; applied there and its result %val is stored to %out2.
; Expected output, per the checks below: both getelementptrs are kept, then
; the ballot / mbcnt / ctpop / multiply sequence and a single guarded
; atomicrmw add. In the join block a phi merges poison with the atomic's
; result, readfirstlane is applied to it, and %in times the mbcnt result is
; added before the store to %out2.
define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
; IR-LABEL: @atomic_add_i32_ret_addr64_offset(
; IR-NEXT: entry:
; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
; IR: 10:
; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4
; IR-NEXT: br label [[TMP12]]
; IR: 12:
; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]])
; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
; IR-NEXT: [[TMP16:%.*]] = add i32 [[TMP14]], [[TMP15]]
; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
; IR-NEXT: ret void
;
entry:
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
%val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst
store i32 %val, ptr addrspace(1) %out2
ret void
}
; Test input: a volatile seq_cst `atomicrmw add` of %in applied directly to
; the global pointer %out (no getelementptr); %val is unused.
; Expected output, per the checks below: the ballot.i64(true) / mbcnt.lo /
; mbcnt.hi sequence, %in multiplied by the truncated ctpop of the ballot, and
; a single atomicrmw add of the product on %out in a block entered only when
; the mbcnt result is 0.
define amdgpu_kernel void @atomic_add_i32(ptr addrspace(1) %out, i32 %in) {
; IR-LABEL: @atomic_add_i32(
; IR-NEXT: entry:
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
; IR: 10:
; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[OUT:%.*]], i32 [[TMP8]] seq_cst, align 4
; IR-NEXT: br label [[TMP12]]
; IR: 12:
; IR-NEXT: ret void
;
entry:
%val = atomicrmw volatile add ptr addrspace(1) %out, i32 %in seq_cst
ret void
}
; Test input: a volatile seq_cst `atomicrmw add` of %in applied directly to
; the global pointer %out, with the result %val stored to %out2.
; Expected output, per the checks below: the ballot / mbcnt / ctpop / multiply
; sequence and a single atomicrmw add guarded by "mbcnt result == 0". The
; join block then rebuilds the stored value: phi of poison and the atomic's
; result, readfirstlane of that phi, plus %in multiplied by the mbcnt result.
define amdgpu_kernel void @atomic_add_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
; IR-LABEL: @atomic_add_i32_ret(
; IR-NEXT: entry:
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
; IR: 10:
; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[OUT:%.*]], i32 [[TMP8]] seq_cst, align 4
; IR-NEXT: br label [[TMP12]]
; IR: 12:
; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]])
; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
; IR-NEXT: [[TMP16:%.*]] = add i32 [[TMP14]], [[TMP15]]
; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
; IR-NEXT: ret void
;
entry:
%val = atomicrmw volatile add ptr addrspace(1) %out, i32 %in seq_cst
store i32 %val, ptr addrspace(1) %out2
ret void
}
; Test input: a volatile seq_cst `atomicrmw add` of %in to the i32 at %out
; indexed by the runtime i64 %index (no extra constant offset); %val is unused.
; Expected output, per the checks below: the getelementptr is kept, then the
; ballot.i64(true) / mbcnt.lo / mbcnt.hi sequence, %in multiplied by the
; truncated ctpop of the ballot, and a single atomicrmw add of the product in
; a block entered only when the mbcnt result is 0.
define amdgpu_kernel void @atomic_add_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
; IR-LABEL: @atomic_add_i32_addr64(
; IR-NEXT: entry:
; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
; IR: 10:
; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[PTR]], i32 [[TMP8]] seq_cst, align 4
; IR-NEXT: br label [[TMP12]]
; IR: 12:
; IR-NEXT: ret void
;
entry:
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
%val = atomicrmw volatile add ptr addrspace(1) %ptr, i32 %in seq_cst
ret void
}
; Test input: a volatile seq_cst `atomicrmw add` of %in to the i32 at %out
; indexed by the runtime i64 %index, with the result %val stored to %out2.
; Expected output, per the checks below: the getelementptr is kept, followed
; by the ballot / mbcnt / ctpop / multiply sequence and a single guarded
; atomicrmw add. The join block applies readfirstlane to a phi of poison and
; the atomic's result, adds %in times the mbcnt result, and stores the sum to
; %out2.
define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
; IR-LABEL: @atomic_add_i32_ret_addr64(
; IR-NEXT: entry:
; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
; IR: 10:
; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[PTR]], i32 [[TMP8]] seq_cst, align 4
; IR-NEXT: br label [[TMP12]]
; IR: 12:
; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]])
; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
; IR-NEXT: [[TMP16:%.*]] = add i32 [[TMP14]], [[TMP15]]
; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
; IR-NEXT: ret void
;
entry:
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
%val = atomicrmw volatile add ptr addrspace(1) %ptr, i32 %in seq_cst
store i32 %val, ptr addrspace(1) %out2
ret void
}
; Test input: a volatile seq_cst `atomicrmw and` of %in on the i32 at element
; index 4 from the global pointer %out; %val is unused.
; Expected output, per the checks below: the ballot.i64(true) / mbcnt.lo /
; mbcnt.hi sequence and a single atomicrmw and in a block entered only when
; the mbcnt result is 0. Unlike the add tests in this file, there is no
; ctpop/multiply step: the atomic operand is %in itself.
define amdgpu_kernel void @atomic_and_i32_offset(ptr addrspace(1) %out, i32 %in) {
; IR-LABEL: @atomic_and_i32_offset(
; IR-NEXT: entry:
; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR: 7:
; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] seq_cst, align 4
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: ret void
;
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i64 4
%val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in seq_cst
ret void
}
; Test input: a volatile seq_cst `atomicrmw and` of %in on the i32 at element
; index 4 from the global pointer %out, with the result %val stored to %out2.
; Expected output, per the checks below: the ballot / mbcnt sequence and a
; single atomicrmw and (operand %in, no ctpop/multiply) guarded by
; "mbcnt result == 0". In the join block:
;   - a phi merges poison (from entry) with the atomic's result,
;   - readfirstlane is applied to the phi,
;   - a select picks -1 when the guard condition is true and %in otherwise,
;   - the two are and-ed together and stored to %out2.
define amdgpu_kernel void @atomic_and_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
; IR-LABEL: @atomic_and_i32_ret_offset(
; IR-NEXT: entry:
; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR: 7:
; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] seq_cst, align 4
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -1, i32 [[IN]]
; IR-NEXT: [[TMP13:%.*]] = and i32 [[TMP11]], [[TMP12]]
; IR-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[OUT2:%.*]], align 4
; IR-NEXT: ret void
;
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i64 4
%val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in seq_cst
store i32 %val, ptr addrspace(1) %out2
ret void
}
; Test input: the address is %out indexed by the runtime i64 %index plus a
; constant element index of 4; a volatile seq_cst `atomicrmw and` of %in is
; applied there and %val is unused.
; Expected output, per the checks below: both getelementptrs are kept, then
; the ballot.i64(true) / mbcnt.lo / mbcnt.hi sequence and a single atomicrmw
; and (operand %in, no ctpop/multiply) in a block entered only when the mbcnt
; result is 0.
define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
; IR-LABEL: @atomic_and_i32_addr64_offset(
; IR-NEXT: entry:
; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR: 7:
; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] seq_cst, align 4
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: ret void
;
entry:
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
%val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in seq_cst
ret void
}
; Test input: the address is %out indexed by the runtime i64 %index plus a
; constant element index of 4; a volatile seq_cst `atomicrmw and` of %in is
; applied there and its result %val is stored to %out2.
; Expected output, per the checks below: both getelementptrs are kept, then
; the ballot / mbcnt sequence and a single guarded atomicrmw and (operand
; %in). The join block applies readfirstlane to a phi of poison and the
; atomic's result, and-s it with select(guard, -1, %in), and stores the
; result to %out2.
define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
; IR-LABEL: @atomic_and_i32_ret_addr64_offset(
; IR-NEXT: entry:
; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR: 7:
; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] seq_cst, align 4
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -1, i32 [[IN]]
; IR-NEXT: [[TMP13:%.*]] = and i32 [[TMP11]], [[TMP12]]
; IR-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[OUT2:%.*]], align 4
; IR-NEXT: ret void
;
entry:
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
%val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in seq_cst
store i32 %val, ptr addrspace(1) %out2
ret void
}
; Test input: a volatile seq_cst `atomicrmw and` of %in applied directly to
; the global pointer %out (no getelementptr); %val is unused.
; Expected output, per the checks below: the ballot.i64(true) / mbcnt.lo /
; mbcnt.hi sequence and a single atomicrmw and on %out (operand %in, no
; ctpop/multiply) in a block entered only when the mbcnt result is 0.
define amdgpu_kernel void @atomic_and_i32(ptr addrspace(1) %out, i32 %in) {
; IR-LABEL: @atomic_and_i32(
; IR-NEXT: entry:
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR: 7:
; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] seq_cst, align 4
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: ret void
;
entry:
%val = atomicrmw volatile and ptr addrspace(1) %out, i32 %in seq_cst
ret void
}
; Test input: a volatile seq_cst `atomicrmw and` of %in applied directly to
; the global pointer %out, with the result %val stored to %out2.
; Expected output, per the checks below: the ballot / mbcnt sequence and a
; single atomicrmw and (operand %in) guarded by "mbcnt result == 0". The join
; block applies readfirstlane to a phi of poison and the atomic's result,
; and-s it with select(guard, -1, %in), and stores the result to %out2.
define amdgpu_kernel void @atomic_and_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
; IR-LABEL: @atomic_and_i32_ret(
; IR-NEXT: entry:
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR: 7:
; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] seq_cst, align 4
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -1, i32 [[IN]]
; IR-NEXT: [[TMP13:%.*]] = and i32 [[TMP11]], [[TMP12]]
; IR-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[OUT2:%.*]], align 4
; IR-NEXT: ret void
;
entry:
%val = atomicrmw volatile and ptr addrspace(1) %out, i32 %in seq_cst
store i32 %val, ptr addrspace(1) %out2
ret void
}
; Test input: a volatile seq_cst `atomicrmw and` of %in on the i32 at %out
; indexed by the runtime i64 %index (no extra constant offset); %val is unused.
; Expected output, per the checks below: the getelementptr is kept, then the
; ballot.i64(true) / mbcnt.lo / mbcnt.hi sequence and a single atomicrmw and
; (operand %in, no ctpop/multiply) in a block entered only when the mbcnt
; result is 0.
define amdgpu_kernel void @atomic_and_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
; IR-LABEL: @atomic_and_i32_addr64(
; IR-NEXT: entry:
; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR: 7:
; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] seq_cst, align 4
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: ret void
;
entry:
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
%val = atomicrmw volatile and ptr addrspace(1) %ptr, i32 %in seq_cst
ret void
}
; Test input: a volatile seq_cst `atomicrmw and` of %in on the i32 at %out
; indexed by the runtime i64 %index, with the result %val stored to %out2.
; Expected output, per the checks below: the getelementptr is kept, then the
; ballot / mbcnt sequence and a single guarded atomicrmw and (operand %in).
; The join block applies readfirstlane to a phi of poison and the atomic's
; result, and-s it with select(guard, -1, %in), and stores the result to
; %out2.
define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
; IR-LABEL: @atomic_and_i32_ret_addr64(
; IR-NEXT: entry:
; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR: 7:
; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] seq_cst, align 4
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -1, i32 [[IN]]
; IR-NEXT: [[TMP13:%.*]] = and i32 [[TMP11]], [[TMP12]]
; IR-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[OUT2:%.*]], align 4
; IR-NEXT: ret void
;
entry:
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
%val = atomicrmw volatile and ptr addrspace(1) %ptr, i32 %in seq_cst
store i32 %val, ptr addrspace(1) %out2
ret void
}
; Test input: a volatile seq_cst `atomicrmw sub` of %in from the i32 at
; element index 4 from the global pointer %out; %val is unused.
; Expected output, per the checks below: the ballot.i64(true) / mbcnt.lo /
; mbcnt.hi sequence, %in multiplied by the truncated ctpop of the ballot, and
; a single atomicrmw sub of that product in a block entered only when the
; mbcnt result is 0.
define amdgpu_kernel void @atomic_sub_i32_offset(ptr addrspace(1) %out, i32 %in) {
; IR-LABEL: @atomic_sub_i32_offset(
; IR-NEXT: entry:
; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
; IR: 10:
; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4
; IR-NEXT: br label [[TMP12]]
; IR: 12:
; IR-NEXT: ret void
;
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i64 4
%val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in seq_cst
ret void
}
; Test input: a volatile seq_cst `atomicrmw sub` of %in from the i32 at
; element index 4 from the global pointer %out, with the result %val stored
; to %out2.
; Expected output, per the checks below: the ballot / mbcnt / ctpop / multiply
; sequence and a single atomicrmw sub guarded by "mbcnt result == 0". In the
; join block:
;   - a phi merges poison (from entry) with the atomic's result,
;   - readfirstlane is applied to the phi,
;   - %in multiplied by the mbcnt result is subtracted from that value,
;   - the difference is what gets stored to %out2.
define amdgpu_kernel void @atomic_sub_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
; IR-LABEL: @atomic_sub_i32_ret_offset(
; IR-NEXT: entry:
; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
; IR: 10:
; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4
; IR-NEXT: br label [[TMP12]]
; IR: 12:
; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]])
; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
; IR-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]]
; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
; IR-NEXT: ret void
;
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i64 4
%val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in seq_cst
store i32 %val, ptr addrspace(1) %out2
ret void
}
; Test input: the address is %out indexed by the runtime i64 %index plus a
; constant element index of 4; a volatile seq_cst `atomicrmw sub` of %in is
; applied there and %val is unused.
; Expected output, per the checks below: both getelementptrs are kept, then
; the ballot.i64(true) / mbcnt.lo / mbcnt.hi sequence, %in multiplied by the
; truncated ctpop of the ballot, and a single atomicrmw sub of the product in
; a block entered only when the mbcnt result is 0.
define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
; IR-LABEL: @atomic_sub_i32_addr64_offset(
; IR-NEXT: entry:
; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
; IR: 10:
; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4
; IR-NEXT: br label [[TMP12]]
; IR: 12:
; IR-NEXT: ret void
;
entry:
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
%val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in seq_cst
ret void
}
; Test input: the address is %out indexed by the runtime i64 %index plus a
; constant element index of 4; a volatile seq_cst `atomicrmw sub` of %in is
; applied there and its result %val is stored to %out2.
; Expected output, per the checks below: both getelementptrs are kept, then
; the ballot / mbcnt / ctpop / multiply sequence and a single guarded
; atomicrmw sub. The join block applies readfirstlane to a phi of poison and
; the atomic's result, subtracts %in times the mbcnt result, and stores the
; difference to %out2.
define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
; IR-LABEL: @atomic_sub_i32_ret_addr64_offset(
; IR-NEXT: entry:
; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
; IR: 10:
; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4
; IR-NEXT: br label [[TMP12]]
; IR: 12:
; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]])
; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
; IR-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]]
; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
; IR-NEXT: ret void
;
entry:
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
%val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in seq_cst
store i32 %val, ptr addrspace(1) %out2
ret void
}
; Non-returning `atomicrmw volatile sub` (seq_cst, no syncscope) directly on the
; global i32 at %out; %val is never used.
; The autogenerated assertions expect one atomicrmw of %in * ctpop(ballot),
; placed in a block entered only when the mbcnt result is 0, and a join block
; that contains nothing but `ret void`.
define amdgpu_kernel void @atomic_sub_i32(ptr addrspace(1) %out, i32 %in) {
; IR-LABEL: @atomic_sub_i32(
; IR-NEXT: entry:
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
; IR: 10:
; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[OUT:%.*]], i32 [[TMP8]] seq_cst, align 4
; IR-NEXT: br label [[TMP12]]
; IR: 12:
; IR-NEXT: ret void
;
entry:
%val = atomicrmw volatile sub ptr addrspace(1) %out, i32 %in seq_cst
ret void
}
; Returning `atomicrmw volatile sub` (seq_cst, no syncscope) directly on the
; global i32 at %out; the old value is stored to %out2.
; The autogenerated assertions expect one atomicrmw of %in * ctpop(ballot),
; placed in a block entered only when the mbcnt result is 0. In the join block
; the atomic result goes through a phi (poison on the path that skips the
; atomic) and readfirstlane, and %in * mbcnt is subtracted before the store.
define amdgpu_kernel void @atomic_sub_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
; IR-LABEL: @atomic_sub_i32_ret(
; IR-NEXT: entry:
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
; IR: 10:
; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[OUT:%.*]], i32 [[TMP8]] seq_cst, align 4
; IR-NEXT: br label [[TMP12]]
; IR: 12:
; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]])
; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
; IR-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]]
; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
; IR-NEXT: ret void
;
entry:
%val = atomicrmw volatile sub ptr addrspace(1) %out, i32 %in seq_cst
store i32 %val, ptr addrspace(1) %out2
ret void
}
; Non-returning `atomicrmw volatile sub` (seq_cst, no syncscope) on the global
; i32 at %out[%index] (i64 index); %val is never used.
; The autogenerated assertions expect one atomicrmw of %in * ctpop(ballot),
; placed in a block entered only when the mbcnt result is 0, and a join block
; that contains nothing but `ret void`.
define amdgpu_kernel void @atomic_sub_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
; IR-LABEL: @atomic_sub_i32_addr64(
; IR-NEXT: entry:
; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
; IR: 10:
; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[PTR]], i32 [[TMP8]] seq_cst, align 4
; IR-NEXT: br label [[TMP12]]
; IR: 12:
; IR-NEXT: ret void
;
entry:
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
%val = atomicrmw volatile sub ptr addrspace(1) %ptr, i32 %in seq_cst
ret void
}
; Returning `atomicrmw volatile sub` (seq_cst, no syncscope) on the global i32
; at %out[%index] (i64 index); the old value is stored to %out2.
; The autogenerated assertions expect one atomicrmw of %in * ctpop(ballot),
; placed in a block entered only when the mbcnt result is 0. In the join block
; the atomic result goes through a phi (poison on the path that skips the
; atomic) and readfirstlane, and %in * mbcnt is subtracted before the store.
define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
; IR-LABEL: @atomic_sub_i32_ret_addr64(
; IR-NEXT: entry:
; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
; IR: 10:
; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[PTR]], i32 [[TMP8]] seq_cst, align 4
; IR-NEXT: br label [[TMP12]]
; IR: 12:
; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP13]])
; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
; IR-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]]
; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
; IR-NEXT: ret void
;
entry:
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
%val = atomicrmw volatile sub ptr addrspace(1) %ptr, i32 %in seq_cst
store i32 %val, ptr addrspace(1) %out2
ret void
}
; Non-returning signed `atomicrmw volatile max` (seq_cst) on the global i32
; located 4 elements past %out; %val is never used.
; NOTE(review): this atomicrmw names no syncscope, whereas the other max tests
; in this file use syncscope("workgroup"); the expected atomicrmw below
; likewise has none. Confirm the difference is intentional.
; The autogenerated assertions expect the atomicrmw to keep %in as its operand
; (no ctpop/mul is emitted) and to sit in a block entered only when the mbcnt
; result is 0.
define amdgpu_kernel void @atomic_max_i32_offset(ptr addrspace(1) %out, i32 %in) {
; IR-LABEL: @atomic_max_i32_offset(
; IR-NEXT: entry:
; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR: 7:
; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] seq_cst, align 4
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: ret void
;
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i64 4
%val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in seq_cst
ret void
}
; Returning signed `atomicrmw volatile max` (syncscope("workgroup"), seq_cst)
; on the global i32 located 4 elements past %out; the old value is stored to
; %out2.
; The autogenerated assertions expect the atomicrmw to keep %in as its operand
; and to sit in a block entered only when the mbcnt result is 0. In the join
; block the atomic result goes through a phi (poison on the path that skips
; the atomic) and readfirstlane, and is then combined, via icmp sgt + select,
; with `select(mbcnt == 0, -2147483648, %in)` before the store.
define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
; IR-LABEL: @atomic_max_i32_ret_offset(
; IR-NEXT: entry:
; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR: 7:
; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -2147483648, i32 [[IN]]
; IR-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4
; IR-NEXT: ret void
;
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i64 4
%val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
store i32 %val, ptr addrspace(1) %out2
ret void
}
; Non-returning signed `atomicrmw volatile max` (syncscope("workgroup"),
; seq_cst) on the global i32 located 4 elements past %out[%index]; %val is
; never used.
; The autogenerated assertions expect the atomicrmw to keep %in as its operand
; and to sit in a block entered only when the mbcnt result is 0; the join
; block contains nothing but `ret void`.
define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
; IR-LABEL: @atomic_max_i32_addr64_offset(
; IR-NEXT: entry:
; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR: 7:
; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: ret void
;
entry:
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
%val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
ret void
}
; Returning signed `atomicrmw volatile max` (syncscope("workgroup"), seq_cst)
; on the global i32 located 4 elements past %out[%index]; the old value is
; stored to %out2.
; The autogenerated assertions expect the atomicrmw to keep %in as its operand
; and to sit in a block entered only when the mbcnt result is 0. In the join
; block the atomic result goes through a phi (poison on the path that skips
; the atomic) and readfirstlane, and is then combined, via icmp sgt + select,
; with `select(mbcnt == 0, -2147483648, %in)` before the store.
define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
; IR-LABEL: @atomic_max_i32_ret_addr64_offset(
; IR-NEXT: entry:
; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR: 7:
; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -2147483648, i32 [[IN]]
; IR-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4
; IR-NEXT: ret void
;
entry:
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
%val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
store i32 %val, ptr addrspace(1) %out2
ret void
}
; Non-returning signed `atomicrmw volatile max` (syncscope("workgroup"),
; seq_cst) directly on the global i32 at %out; %val is never used.
; The autogenerated assertions expect the atomicrmw to keep %in as its operand
; and to sit in a block entered only when the mbcnt result is 0; the join
; block contains nothing but `ret void`.
define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, i32 %in) {
; IR-LABEL: @atomic_max_i32(
; IR-NEXT: entry:
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR: 7:
; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: ret void
;
entry:
%val = atomicrmw volatile max ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
ret void
}
; Returning signed `atomicrmw volatile max` (syncscope("workgroup"), seq_cst)
; directly on the global i32 at %out; the old value is stored to %out2.
; The autogenerated assertions expect the atomicrmw to keep %in as its operand
; and to sit in a block entered only when the mbcnt result is 0. In the join
; block the atomic result goes through a phi (poison on the path that skips
; the atomic) and readfirstlane, and is then combined, via icmp sgt + select,
; with `select(mbcnt == 0, -2147483648, %in)` before the store.
define amdgpu_kernel void @atomic_max_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
; IR-LABEL: @atomic_max_i32_ret(
; IR-NEXT: entry:
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR: 7:
; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -2147483648, i32 [[IN]]
; IR-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4
; IR-NEXT: ret void
;
entry:
%val = atomicrmw volatile max ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
store i32 %val, ptr addrspace(1) %out2
ret void
}
; Non-returning signed `atomicrmw volatile max` (syncscope("workgroup"),
; seq_cst) on the global i32 at %out[%index] (i64 index); %val is never used.
; The autogenerated assertions expect the atomicrmw to keep %in as its operand
; and to sit in a block entered only when the mbcnt result is 0; the join
; block contains nothing but `ret void`.
define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
; IR-LABEL: @atomic_max_i32_addr64(
; IR-NEXT: entry:
; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR: 7:
; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: ret void
;
entry:
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
%val = atomicrmw volatile max ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst
ret void
}
; Returning signed `atomicrmw volatile max` (syncscope("workgroup"), seq_cst)
; on the global i32 at %out[%index] (i64 index); the old value is stored to
; %out2.
; The autogenerated assertions expect the atomicrmw to keep %in as its operand
; and to sit in a block entered only when the mbcnt result is 0. In the join
; block the atomic result goes through a phi (poison on the path that skips
; the atomic) and readfirstlane, and is then combined, via icmp sgt + select,
; with `select(mbcnt == 0, -2147483648, %in)` before the store.
define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
; IR-LABEL: @atomic_max_i32_ret_addr64(
; IR-NEXT: entry:
; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR: 7:
; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -2147483648, i32 [[IN]]
; IR-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4
; IR-NEXT: ret void
;
entry:
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
%val = atomicrmw volatile max ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst
store i32 %val, ptr addrspace(1) %out2
ret void
}
; Non-returning unsigned `atomicrmw volatile umax` (syncscope("workgroup"),
; seq_cst) on the global i32 located 4 elements past %out; %val is never used.
; The autogenerated assertions expect the atomicrmw to keep %in as its operand
; and to sit in a block entered only when the mbcnt result is 0; the join
; block contains nothing but `ret void`.
define amdgpu_kernel void @atomic_umax_i32_offset(ptr addrspace(1) %out, i32 %in) {
; IR-LABEL: @atomic_umax_i32_offset(
; IR-NEXT: entry:
; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR: 7:
; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: ret void
;
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i64 4
%val = atomicrmw volatile umax ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
ret void
}
; Returning unsigned `atomicrmw volatile umax` (syncscope("workgroup"),
; seq_cst) on the global i32 located 4 elements past %out; the old value is
; stored to %out2.
; The autogenerated assertions expect the atomicrmw to keep %in as its operand
; and to sit in a block entered only when the mbcnt result is 0. In the join
; block the atomic result goes through a phi (poison on the path that skips
; the atomic) and readfirstlane, and is then combined, via icmp ugt + select,
; with `select(mbcnt == 0, 0, %in)` before the store.
define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
; IR-LABEL: @atomic_umax_i32_ret_offset(
; IR-NEXT: entry:
; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR: 7:
; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 0, i32 [[IN]]
; IR-NEXT: [[TMP13:%.*]] = icmp ugt i32 [[TMP11]], [[TMP12]]
; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4
; IR-NEXT: ret void
;
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i64 4
%val = atomicrmw volatile umax ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
store i32 %val, ptr addrspace(1) %out2
ret void
}
; Non-returning unsigned `atomicrmw volatile umax` (syncscope("workgroup"),
; seq_cst) on the global i32 located 4 elements past %out[%index]; %val is
; never used.
; The autogenerated assertions expect the atomicrmw to keep %in as its operand
; and to sit in a block entered only when the mbcnt result is 0; the join
; block contains nothing but `ret void`.
define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
; IR-LABEL: @atomic_umax_i32_addr64_offset(
; IR-NEXT: entry:
; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR: 7:
; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: ret void
;
entry:
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
%val = atomicrmw volatile umax ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
ret void
}
; Returning unsigned `atomicrmw volatile umax` (syncscope("workgroup"),
; seq_cst) on the global i32 located 4 elements past %out[%index]; the old
; value is stored to %out2.
; The autogenerated assertions expect the atomicrmw to keep %in as its operand
; and to sit in a block entered only when the mbcnt result is 0. In the join
; block the atomic result goes through a phi (poison on the path that skips
; the atomic) and readfirstlane, and is then combined, via icmp ugt + select,
; with `select(mbcnt == 0, 0, %in)` before the store.
define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
; IR-LABEL: @atomic_umax_i32_ret_addr64_offset(
; IR-NEXT: entry:
; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR: 7:
; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 0, i32 [[IN]]
; IR-NEXT: [[TMP13:%.*]] = icmp ugt i32 [[TMP11]], [[TMP12]]
; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4
; IR-NEXT: ret void
;
entry:
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
%val = atomicrmw volatile umax ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
store i32 %val, ptr addrspace(1) %out2
ret void
}
; Non-returning unsigned `atomicrmw volatile umax` (syncscope("workgroup"),
; seq_cst) directly on the global i32 at %out; %val is never used.
; The autogenerated assertions expect the atomicrmw to keep %in as its operand
; and to sit in a block entered only when the mbcnt result is 0; the join
; block contains nothing but `ret void`.
define amdgpu_kernel void @atomic_umax_i32(ptr addrspace(1) %out, i32 %in) {
; IR-LABEL: @atomic_umax_i32(
; IR-NEXT: entry:
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR: 7:
; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: ret void
;
entry:
%val = atomicrmw volatile umax ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
ret void
}
; Returning unsigned `atomicrmw volatile umax` (syncscope("workgroup"),
; seq_cst) directly on the global i32 at %out; the old value is stored to
; %out2.
; The autogenerated assertions expect the atomicrmw to keep %in as its operand
; and to sit in a block entered only when the mbcnt result is 0. In the join
; block the atomic result goes through a phi (poison on the path that skips
; the atomic) and readfirstlane, and is then combined, via icmp ugt + select,
; with `select(mbcnt == 0, 0, %in)` before the store.
define amdgpu_kernel void @atomic_umax_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
; IR-LABEL: @atomic_umax_i32_ret(
; IR-NEXT: entry:
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR: 7:
; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 0, i32 [[IN]]
; IR-NEXT: [[TMP13:%.*]] = icmp ugt i32 [[TMP11]], [[TMP12]]
; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4
; IR-NEXT: ret void
;
entry:
%val = atomicrmw volatile umax ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
store i32 %val, ptr addrspace(1) %out2
ret void
}
; Non-returning unsigned `atomicrmw volatile umax` (syncscope("workgroup"),
; seq_cst) on the global i32 at %out[%index] (i64 index); %val is never used.
; The autogenerated assertions expect the atomicrmw to keep %in as its operand
; and to sit in a block entered only when the mbcnt result is 0; the join
; block contains nothing but `ret void`.
define amdgpu_kernel void @atomic_umax_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
; IR-LABEL: @atomic_umax_i32_addr64(
; IR-NEXT: entry:
; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR: 7:
; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: ret void
;
entry:
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
%val = atomicrmw volatile umax ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst
ret void
}
; Returning unsigned `atomicrmw volatile umax` (syncscope("workgroup"),
; seq_cst) on the global i32 at %out[%index] (i64 index); the old value is
; stored to %out2.
; The autogenerated assertions expect the atomicrmw to keep %in as its operand
; and to sit in a block entered only when the mbcnt result is 0. In the join
; block the atomic result goes through a phi (poison on the path that skips
; the atomic) and readfirstlane, and is then combined, via icmp ugt + select,
; with `select(mbcnt == 0, 0, %in)` before the store.
define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
; IR-LABEL: @atomic_umax_i32_ret_addr64(
; IR-NEXT: entry:
; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR: 7:
; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 0, i32 [[IN]]
; IR-NEXT: [[TMP13:%.*]] = icmp ugt i32 [[TMP11]], [[TMP12]]
; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4
; IR-NEXT: ret void
;
entry:
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
%val = atomicrmw volatile umax ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst
store i32 %val, ptr addrspace(1) %out2
ret void
}
; Non-returning signed `atomicrmw volatile min` (syncscope("workgroup"),
; seq_cst) on the global i32 located 4 elements past %out; %val is never used.
; The autogenerated assertions expect the atomicrmw to keep %in as its operand
; and to sit in a block entered only when the mbcnt result is 0; the join
; block contains nothing but `ret void`.
define amdgpu_kernel void @atomic_min_i32_offset(ptr addrspace(1) %out, i32 %in) {
; IR-LABEL: @atomic_min_i32_offset(
; IR-NEXT: entry:
; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR: 7:
; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: ret void
;
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i64 4
%val = atomicrmw volatile min ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
ret void
}
; Returning signed `atomicrmw volatile min` (syncscope("workgroup"), seq_cst)
; on the global i32 located 4 elements past %out; the old value is stored to
; %out2.
; The autogenerated assertions expect the atomicrmw to keep %in as its operand
; and to sit in a block entered only when the mbcnt result is 0. In the join
; block the atomic result goes through a phi (poison on the path that skips
; the atomic) and readfirstlane, and is then combined, via icmp slt + select,
; with `select(mbcnt == 0, 2147483647, %in)` before the store.
define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
; IR-LABEL: @atomic_min_i32_ret_offset(
; IR-NEXT: entry:
; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR: 7:
; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 2147483647, i32 [[IN]]
; IR-NEXT: [[TMP13:%.*]] = icmp slt i32 [[TMP11]], [[TMP12]]
; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4
; IR-NEXT: ret void
;
entry:
%gep = getelementptr i32, ptr addrspace(1) %out, i64 4
%val = atomicrmw volatile min ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
store i32 %val, ptr addrspace(1) %out2
ret void
}
; Non-returning signed `atomicrmw volatile min` (syncscope("workgroup"),
; seq_cst) on the global i32 located 4 elements past %out[%index]; %val is
; never used.
; The autogenerated assertions expect the atomicrmw to keep %in as its operand
; and to sit in a block entered only when the mbcnt result is 0; the join
; block contains nothing but `ret void`.
define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
; IR-LABEL: @atomic_min_i32_addr64_offset(
; IR-NEXT: entry:
; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR: 7:
; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: ret void
;
entry:
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
%val = atomicrmw volatile min ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
ret void
}
; Returning signed `atomicrmw volatile min` (syncscope("workgroup"), seq_cst)
; on the global i32 located 4 elements past %out[%index]; the old value is
; stored to %out2.
; The autogenerated assertions expect the atomicrmw to keep %in as its operand
; and to sit in a block entered only when the mbcnt result is 0. In the join
; block the atomic result goes through a phi (poison on the path that skips
; the atomic) and readfirstlane, and is then combined, via icmp slt + select,
; with `select(mbcnt == 0, 2147483647, %in)` before the store.
define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
; IR-LABEL: @atomic_min_i32_ret_addr64_offset(
; IR-NEXT: entry:
; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR: 7:
; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 2147483647, i32 [[IN]]
; IR-NEXT: [[TMP13:%.*]] = icmp slt i32 [[TMP11]], [[TMP12]]
; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4
; IR-NEXT: ret void
;
entry:
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
%val = atomicrmw volatile min ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
store i32 %val, ptr addrspace(1) %out2
ret void
}
; Signed-min atomicrmw (volatile, seq_cst, syncscope("workgroup")) applied
; directly to the %out pointer (ptr addrspace(1)) with no address arithmetic.
; The atomic's result (%val) is never used.
;
; The assertions below (autogenerated, see the NOTE at the top of the file)
; expect the optimizer output to compute mbcnt.lo/mbcnt.hi over a 64-bit ballot
; mask, branch on "mbcnt result == 0", and place the atomicrmw (still taking
; the unmodified %in) in the block entered when that comparison is true.
; Because the result is unused, the join block is expected to contain only
; `ret void`: no phi, readfirstlane or select follows.
define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) {
; IR-LABEL: @atomic_min_i32(
; IR-NEXT: entry:
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR: 7:
; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: ret void
;
entry:
%val = atomicrmw volatile min ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
ret void
}
; Signed-min atomicrmw (volatile, seq_cst, syncscope("workgroup")) applied
; directly to the %out pointer (ptr addrspace(1)) with no address arithmetic.
; The atomic's result is used: it is stored to %out2.
;
; The assertions below (autogenerated, see the NOTE at the top of the file)
; expect the optimizer output to:
;   - compute mbcnt.lo/mbcnt.hi over a 64-bit ballot mask and branch on
;     "mbcnt result == 0", with the atomicrmw (taking the unmodified %in) in the
;     block entered when that comparison is true;
;   - after the join, pass the phi of the atomic's result (poison on the entry
;     edge) through readfirstlane;
;   - take the signed minimum (icmp slt + select) of that value and a select
;     yielding 2147483647 (largest signed i32) when the comparison is true and
;     %in otherwise, then store it to %out2.
define amdgpu_kernel void @atomic_min_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
; IR-LABEL: @atomic_min_i32_ret(
; IR-NEXT: entry:
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR: 7:
; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 2147483647, i32 [[IN]]
; IR-NEXT: [[TMP13:%.*]] = icmp slt i32 [[TMP11]], [[TMP12]]
; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4
; IR-NEXT: ret void
;
entry:
%val = atomicrmw volatile min ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
store i32 %val, ptr addrspace(1) %out2
ret void
}
; Signed-min atomicrmw (volatile, seq_cst, syncscope("workgroup")) on an i32 in
; ptr addrspace(1) whose address is %out advanced by the runtime i64 %index
; (in i32 elements); there is no additional constant offset. The atomic's
; result (%val) is never used.
;
; The assertions below (autogenerated, see the NOTE at the top of the file)
; expect the address computation to stay in the entry block ahead of the
; ballot/mbcnt sequence, the atomicrmw (taking the unmodified %in) to sit in
; the block entered when "mbcnt result == 0", and the join block to contain
; only `ret void`.
define amdgpu_kernel void @atomic_min_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
; IR-LABEL: @atomic_min_i32_addr64(
; IR-NEXT: entry:
; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR: 7:
; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: ret void
;
entry:
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
%val = atomicrmw volatile min ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst
ret void
}
; Signed-min atomicrmw (volatile, seq_cst, syncscope("workgroup")) on an i32 in
; ptr addrspace(1) whose address is %out advanced by the runtime i64 %index
; (in i32 elements); there is no additional constant offset. The atomic's
; result is used: it is stored to %out2.
;
; The assertions below (autogenerated, see the NOTE at the top of the file)
; expect the optimizer output to:
;   - keep the address computation in the entry block ahead of the ballot/mbcnt
;     sequence;
;   - branch on "mbcnt result == 0", with the atomicrmw (taking the unmodified
;     %in) in the block entered when that comparison is true;
;   - after the join, pass the phi of the atomic's result (poison on the entry
;     edge) through readfirstlane;
;   - take the signed minimum (icmp slt + select) of that value and a select
;     yielding 2147483647 (largest signed i32) when the comparison is true and
;     %in otherwise, then store it to %out2.
define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
; IR-LABEL: @atomic_min_i32_ret_addr64(
; IR-NEXT: entry:
; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
; IR: 7:
; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
; IR-NEXT: br label [[TMP9]]
; IR: 9:
; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[TMP10]])
; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 2147483647, i32 [[IN]]
; IR-NEXT: [[TMP13:%.*]] = icmp slt i32 [[TMP11]], [[TMP12]]
; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4
; IR-NEXT: ret void
;
entry:
%ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
%val = atomicrmw volatile min ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst
store i32 %val, ptr addrspace(1) %out2
ret void
}