From d8a74676d2d1e778ea960a904db2eb29613c4c08 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Wed, 11 Mar 2026 07:53:57 -0500 Subject: [PATCH] [NVPTX] Support __scoped_atomic_* operations in NVPTX (#184737) Summary: All the infrastructure for this is here, it's just no one's turned it on. --- clang/lib/CodeGen/Targets/NVPTX.cpp | 33 + clang/test/CodeGen/scoped-atomic-ops.c | 1549 ++++++++++++++++++++++-- 2 files changed, 1452 insertions(+), 130 deletions(-) diff --git a/clang/lib/CodeGen/Targets/NVPTX.cpp b/clang/lib/CodeGen/Targets/NVPTX.cpp index ba2acd821c70..50e116765b89 100644 --- a/clang/lib/CodeGen/Targets/NVPTX.cpp +++ b/clang/lib/CodeGen/Targets/NVPTX.cpp @@ -8,6 +8,7 @@ #include "ABIInfoImpl.h" #include "TargetInfo.h" +#include "clang/Basic/SyncScope.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringExtras.h" #include "llvm/IR/CallingConv.h" @@ -50,6 +51,9 @@ public: CodeGen::CodeGenModule &M) const override; bool shouldEmitStaticExternCAliases() const override; + StringRef getLLVMSyncScopeStr(const LangOptions &LangOpts, SyncScope Scope, + llvm::AtomicOrdering Ordering) const override; + llvm::Constant *getNullPointer(const CodeGen::CodeGenModule &CGM, llvm::PointerType *T, QualType QT) const override; @@ -299,6 +303,35 @@ bool NVPTXTargetCodeGenInfo::shouldEmitStaticExternCAliases() const { return false; } +StringRef NVPTXTargetCodeGenInfo::getLLVMSyncScopeStr( + const LangOptions &LangOpts, SyncScope Scope, + llvm::AtomicOrdering Ordering) const { + switch (Scope) { + case SyncScope::HIPSingleThread: + case SyncScope::SingleScope: + return "singlethread"; + case SyncScope::HIPWavefront: + case SyncScope::OpenCLSubGroup: + case SyncScope::WavefrontScope: + case SyncScope::HIPWorkgroup: + case SyncScope::OpenCLWorkGroup: + case SyncScope::WorkgroupScope: + return "block"; + case SyncScope::HIPCluster: + case SyncScope::ClusterScope: + return "cluster"; + case SyncScope::HIPAgent: + case SyncScope::OpenCLDevice: + case SyncScope::DeviceScope: + return "device"; + case SyncScope::SystemScope: + case SyncScope::HIPSystem: + case SyncScope::OpenCLAllSVMDevices: + return ""; + } + llvm_unreachable("Unknown SyncScope enum"); +} + llvm::Constant * NVPTXTargetCodeGenInfo::getNullPointer(const CodeGen::CodeGenModule &CGM, llvm::PointerType *PT, diff --git a/clang/test/CodeGen/scoped-atomic-ops.c b/clang/test/CodeGen/scoped-atomic-ops.c index 3fbaf75cf98e..6df0d439d6b8 100644 --- a/clang/test/CodeGen/scoped-atomic-ops.c +++ b/clang/test/CodeGen/scoped-atomic-ops.c @@ -3,6 +3,8 @@ // RUN: -fvisibility=hidden | FileCheck --check-prefixes=AMDGCN,AMDGCN_CL_DEF %s // RUN: %clang_cc1 %s -emit-llvm -o - -triple=amdgcn-amd-amdhsa -ffreestanding \ // RUN: -cl-std=CL2.0 -fvisibility=hidden | FileCheck --check-prefixes=AMDGCN,AMDGCN_CL_20 %s +// RUN: %clang_cc1 %s -emit-llvm -o - -triple=nvptx64-nvidia-cuda -ffreestanding \ +// RUN: -cl-std=CL2.0 -fvisibility=hidden | FileCheck --check-prefixes=NVPTX %s // RUN: %clang_cc1 %s -emit-llvm -o - -triple=spirv64-unknown-unknown -ffreestanding \ // RUN: -fvisibility=hidden | FileCheck --check-prefix=SPIRV %s @@ -63,6 +65,33 @@ // AMDGCN_CL_20-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(5) [[V]], align 4 // AMDGCN_CL_20-NEXT: ret i32 [[TMP12]] // +// NVPTX-LABEL: define hidden i32 @fi1a( +// NVPTX-SAME: ptr noundef [[I:%.*]]) #[[ATTR0:[0-9]+]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[V:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8 +// NVPTX-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// NVPTX-NEXT: [[TMP1:%.*]] = load atomic i32, ptr [[TMP0]] monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP1]], ptr [[V]], align 4 +// NVPTX-NEXT: [[TMP2:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// NVPTX-NEXT: [[TMP3:%.*]] = load atomic i32, ptr [[TMP2]] syncscope("device") monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP3]], ptr [[V]], align 4 +// NVPTX-NEXT: [[TMP4:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// NVPTX-NEXT: [[TMP5:%.*]] = load atomic i32, ptr [[TMP4]] syncscope("block") monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP5]], ptr [[V]], align 4 +// NVPTX-NEXT: [[TMP6:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// NVPTX-NEXT: [[TMP7:%.*]] = load atomic i32, ptr [[TMP6]] syncscope("cluster") monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP7]], ptr [[V]], align 4 +// NVPTX-NEXT: [[TMP8:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// NVPTX-NEXT: [[TMP9:%.*]] = load atomic i32, ptr [[TMP8]] syncscope("block") monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP9]], ptr [[V]], align 4 +// NVPTX-NEXT: [[TMP10:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// NVPTX-NEXT: [[TMP11:%.*]] = load atomic i32, ptr [[TMP10]] syncscope("singlethread") monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP11]], ptr [[V]], align 4 +// NVPTX-NEXT: [[TMP12:%.*]] = load i32, ptr [[V]], align 4 +// NVPTX-NEXT: ret i32 [[TMP12]] +// // SPIRV-LABEL: define hidden spir_func i32 @fi1a( // SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0:[0-9]+]] { // SPIRV-NEXT: [[ENTRY:.*:]] @@ -159,6 +188,57 @@ int fi1a(int *i) { // AMDGCN-NEXT: [[TMP25:%.*]] = load i32, ptr [[TMP24]], align 4 // AMDGCN-NEXT: ret i32 [[TMP25]] // +// NVPTX-LABEL: define hidden i32 @fi1b( +// NVPTX-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[ATOMIC_TEMP1:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[ATOMIC_TEMP3:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[ATOMIC_TEMP5:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8 +// NVPTX-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// NVPTX-NEXT: [[TMP1:%.*]] = load atomic i32, ptr [[TMP0]] monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP1]], ptr [[ATOMIC_TEMP]], align 4 +// NVPTX-NEXT: [[TMP2:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4 +// NVPTX-NEXT: [[TMP3:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// NVPTX-NEXT: store i32 [[TMP2]], ptr [[TMP3]], align 4 +// NVPTX-NEXT: [[TMP4:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// NVPTX-NEXT: [[TMP5:%.*]] = load atomic i32, ptr [[TMP4]] syncscope("device") monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP5]], ptr [[ATOMIC_TEMP1]], align 4 +// NVPTX-NEXT: [[TMP6:%.*]] = load i32, ptr [[ATOMIC_TEMP1]], align 4 +// NVPTX-NEXT: [[TMP7:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// NVPTX-NEXT: store i32 [[TMP6]], ptr [[TMP7]], align 4 +// NVPTX-NEXT: [[TMP8:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// NVPTX-NEXT: [[TMP9:%.*]] = load atomic i32, ptr [[TMP8]] syncscope("block") monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP9]], ptr [[ATOMIC_TEMP2]], align 4 +// NVPTX-NEXT: [[TMP10:%.*]] = load i32, ptr [[ATOMIC_TEMP2]], align 4 +// NVPTX-NEXT: [[TMP11:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// NVPTX-NEXT: store i32 [[TMP10]], ptr [[TMP11]], align 4 +// NVPTX-NEXT: [[TMP12:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// NVPTX-NEXT: [[TMP13:%.*]] = load atomic i32, ptr [[TMP12]] syncscope("cluster") monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP13]], ptr [[ATOMIC_TEMP3]], align 4 +// NVPTX-NEXT: [[TMP14:%.*]] = load i32, ptr [[ATOMIC_TEMP3]], align 4 +// NVPTX-NEXT: [[TMP15:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// NVPTX-NEXT: store i32 [[TMP14]], ptr [[TMP15]], align 4 +// NVPTX-NEXT: [[TMP16:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// NVPTX-NEXT: [[TMP17:%.*]] = load atomic i32, ptr [[TMP16]] syncscope("block") monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP4]], align 4 +// NVPTX-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP4]], align 4 +// NVPTX-NEXT: [[TMP19:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// NVPTX-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4 +// NVPTX-NEXT: [[TMP20:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// NVPTX-NEXT: [[TMP21:%.*]] = load atomic i32, ptr [[TMP20]] syncscope("singlethread") monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP21]], ptr [[ATOMIC_TEMP5]], align 4 +// NVPTX-NEXT: [[TMP22:%.*]] = load i32, ptr [[ATOMIC_TEMP5]], align 4 +// NVPTX-NEXT: [[TMP23:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// NVPTX-NEXT: store i32 [[TMP22]], ptr [[TMP23]], align 4 +// NVPTX-NEXT: [[TMP24:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// NVPTX-NEXT: [[TMP25:%.*]] = load i32, ptr [[TMP24]], align 4 +// NVPTX-NEXT: ret i32 [[TMP25]] +// // SPIRV-LABEL: define hidden spir_func i32 @fi1b( // SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { // SPIRV-NEXT: [[ENTRY:.*:]] @@ -283,6 +363,33 @@ int fi1b(int *i) { // AMDGCN_CL_20-NEXT: store atomic i32 [[TMP17]], ptr [[TMP15]] syncscope("singlethread") monotonic, align 4 // AMDGCN_CL_20-NEXT: ret void // +// NVPTX-LABEL: define hidden void @fi2a( +// NVPTX-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[V:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8 +// NVPTX-NEXT: store i32 1, ptr [[V]], align 4 +// NVPTX-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// NVPTX-NEXT: [[TMP1:%.*]] = load i32, ptr [[V]], align 4 +// NVPTX-NEXT: store atomic i32 [[TMP1]], ptr [[TMP0]] monotonic, align 4 +// NVPTX-NEXT: [[TMP2:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// NVPTX-NEXT: [[TMP3:%.*]] = load i32, ptr [[V]], align 4 +// NVPTX-NEXT: store atomic i32 [[TMP3]], ptr [[TMP2]] syncscope("device") monotonic, align 4 +// NVPTX-NEXT: [[TMP4:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// NVPTX-NEXT: [[TMP5:%.*]] = load i32, ptr [[V]], align 4 +// NVPTX-NEXT: store atomic i32 [[TMP5]], ptr [[TMP4]] syncscope("block") monotonic, align 4 +// NVPTX-NEXT: [[TMP6:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// NVPTX-NEXT: [[TMP7:%.*]] = load i32, ptr [[V]], align 4 +// NVPTX-NEXT: store atomic i32 [[TMP7]], ptr [[TMP6]] syncscope("cluster") monotonic, align 4 +// NVPTX-NEXT: [[TMP8:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// NVPTX-NEXT: [[TMP9:%.*]] = load i32, ptr [[V]], align 4 +// NVPTX-NEXT: store atomic i32 [[TMP9]], ptr [[TMP8]] syncscope("block") monotonic, align 4 +// NVPTX-NEXT: [[TMP10:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// NVPTX-NEXT: [[TMP11:%.*]] = load i32, ptr [[V]], align 4 +// NVPTX-NEXT: store atomic i32 [[TMP11]], ptr [[TMP10]] syncscope("singlethread") monotonic, align 4 +// NVPTX-NEXT: ret void +// // SPIRV-LABEL: define hidden spir_func void @fi2a( // SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { // SPIRV-NEXT: [[ENTRY:.*:]] @@ -364,6 +471,43 @@ void fi2a(int *i) { // AMDGCN-NEXT: store atomic i32 [[TMP11]], ptr [[TMP10]] syncscope("singlethread") monotonic, align 4 // AMDGCN-NEXT: ret void // +// NVPTX-LABEL: define hidden void @fi2b( +// NVPTX-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DOTATOMICTMP2:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DOTATOMICTMP4:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8 +// NVPTX-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// NVPTX-NEXT: store i32 1, ptr [[DOTATOMICTMP]], align 4 +// NVPTX-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4 +// NVPTX-NEXT: store atomic i32 [[TMP1]], ptr [[TMP0]] monotonic, align 4 +// NVPTX-NEXT: [[TMP2:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// NVPTX-NEXT: store i32 1, ptr [[DOTATOMICTMP1]], align 4 +// NVPTX-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTATOMICTMP1]], align 4 +// NVPTX-NEXT: store atomic i32 [[TMP3]], ptr [[TMP2]] syncscope("device") monotonic, align 4 +// NVPTX-NEXT: [[TMP4:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// NVPTX-NEXT: store i32 1, ptr [[DOTATOMICTMP2]], align 4 +// NVPTX-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTATOMICTMP2]], align 4 +// NVPTX-NEXT: store atomic i32 [[TMP5]], ptr [[TMP4]] syncscope("block") monotonic, align 4 +// NVPTX-NEXT: [[TMP6:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// NVPTX-NEXT: store i32 1, ptr [[DOTATOMICTMP3]], align 4 +// NVPTX-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTATOMICTMP3]], align 4 +// NVPTX-NEXT: store atomic i32 [[TMP7]], ptr [[TMP6]] syncscope("cluster") monotonic, align 4 +// NVPTX-NEXT: [[TMP8:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// NVPTX-NEXT: store i32 1, ptr [[DOTATOMICTMP4]], align 4 +// NVPTX-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTATOMICTMP4]], align 4 +// NVPTX-NEXT: store atomic i32 [[TMP9]], ptr [[TMP8]] syncscope("block") monotonic, align 4 +// NVPTX-NEXT: [[TMP10:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// NVPTX-NEXT: store i32 1, ptr [[DOTATOMICTMP5]], align 4 +// NVPTX-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP5]], align 4 +// NVPTX-NEXT: store atomic i32 [[TMP11]], ptr [[TMP10]] syncscope("singlethread") monotonic, align 4 +// NVPTX-NEXT: ret void +// // SPIRV-LABEL: define hidden spir_func void @fi2b( // SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { // SPIRV-NEXT: [[ENTRY:.*:]] @@ -472,7 +616,7 @@ void fi2b(int *i) { // AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3:![0-9]+]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2:![0-9]+]], !amdgpu.no.remote.memory [[META2]] // AMDGCN_CL_DEF-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 @@ -480,7 +624,7 @@ void fi2b(int *i) { // AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] // AMDGCN_CL_DEF-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 @@ -488,7 +632,7 @@ void fi2b(int *i) { // AMDGCN_CL_DEF-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] // AMDGCN_CL_DEF-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 @@ -496,7 +640,7 @@ void fi2b(int *i) { // AMDGCN_CL_DEF-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] // AMDGCN_CL_DEF-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 @@ -504,7 +648,7 @@ void fi2b(int *i) { // AMDGCN_CL_DEF-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] // AMDGCN_CL_DEF-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 @@ -512,7 +656,7 @@ void fi2b(int *i) { // AMDGCN_CL_DEF-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] // AMDGCN_CL_DEF-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 @@ -520,7 +664,7 @@ void fi2b(int *i) { // AMDGCN_CL_DEF-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] // AMDGCN_CL_DEF-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 @@ -528,7 +672,7 @@ void fi2b(int *i) { // AMDGCN_CL_DEF-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] // AMDGCN_CL_DEF-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 @@ -597,7 +741,7 @@ void fi2b(int *i) { // AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4:![0-9]+]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3:![0-9]+]], !amdgpu.no.remote.memory [[META3]] // AMDGCN_CL_20-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 @@ -605,7 +749,7 @@ void fi2b(int *i) { // AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] // AMDGCN_CL_20-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 @@ -613,7 +757,7 @@ void fi2b(int *i) { // AMDGCN_CL_20-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] // AMDGCN_CL_20-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 @@ -621,7 +765,7 @@ void fi2b(int *i) { // AMDGCN_CL_20-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] // AMDGCN_CL_20-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 @@ -629,7 +773,7 @@ void fi2b(int *i) { // AMDGCN_CL_20-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] // AMDGCN_CL_20-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 @@ -637,7 +781,7 @@ void fi2b(int *i) { // AMDGCN_CL_20-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] // AMDGCN_CL_20-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 @@ -645,7 +789,7 @@ void fi2b(int *i) { // AMDGCN_CL_20-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] // AMDGCN_CL_20-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 @@ -653,13 +797,114 @@ void fi2b(int *i) { // AMDGCN_CL_20-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] // AMDGCN_CL_20-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4 // AMDGCN_CL_20-NEXT: ret void // +// NVPTX-LABEL: define hidden void @fi3a( +// NVPTX-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[F_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[H_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DOTATOMICTMP7:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[ATOMIC_TEMP8:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DOTATOMICTMP9:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[ATOMIC_TEMP10:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DOTATOMICTMP11:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8 +// NVPTX-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 +// NVPTX-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 +// NVPTX-NEXT: store ptr [[D]], ptr [[D_ADDR]], align 8 +// NVPTX-NEXT: store ptr [[E]], ptr [[E_ADDR]], align 8 +// NVPTX-NEXT: store ptr [[F]], ptr [[F_ADDR]], align 8 +// NVPTX-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 8 +// NVPTX-NEXT: store ptr [[H]], ptr [[H_ADDR]], align 8 +// NVPTX-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8 +// NVPTX-NEXT: store i32 1, ptr [[DOTATOMICTMP]], align 4 +// NVPTX-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4 +// NVPTX-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP]], align 4 +// NVPTX-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4 +// NVPTX-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR]], align 8 +// NVPTX-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4 +// NVPTX-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR]], align 8 +// NVPTX-NEXT: store i32 1, ptr [[DOTATOMICTMP1]], align 4 +// NVPTX-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1]], align 4 +// NVPTX-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2]], align 4 +// NVPTX-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2]], align 4 +// NVPTX-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR]], align 8 +// NVPTX-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4 +// NVPTX-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR]], align 8 +// NVPTX-NEXT: store i32 1, ptr [[DOTATOMICTMP3]], align 4 +// NVPTX-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3]], align 4 +// NVPTX-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4]], align 4 +// NVPTX-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4]], align 4 +// NVPTX-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR]], align 8 +// NVPTX-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4 +// NVPTX-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR]], align 8 +// NVPTX-NEXT: store i32 1, ptr [[DOTATOMICTMP5]], align 4 +// NVPTX-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5]], align 4 +// NVPTX-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6]], align 4 +// NVPTX-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6]], align 4 +// NVPTX-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR]], align 8 +// NVPTX-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4 +// NVPTX-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR]], align 8 +// NVPTX-NEXT: store i32 1, ptr [[DOTATOMICTMP7]], align 4 +// NVPTX-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7]], align 4 +// NVPTX-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8]], align 4 +// NVPTX-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8]], align 4 +// NVPTX-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR]], align 8 +// NVPTX-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4 +// NVPTX-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR]], align 8 +// NVPTX-NEXT: store i32 1, ptr [[DOTATOMICTMP9]], align 4 +// NVPTX-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9]], align 4 +// NVPTX-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10]], align 4 +// NVPTX-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10]], align 4 +// NVPTX-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR]], align 8 +// NVPTX-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4 +// NVPTX-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR]], align 8 +// NVPTX-NEXT: store i32 1, ptr [[DOTATOMICTMP11]], align 4 +// NVPTX-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11]], align 4 +// NVPTX-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12]], align 4 +// NVPTX-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12]], align 4 +// NVPTX-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR]], align 8 +// NVPTX-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4 +// NVPTX-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR]], align 8 +// NVPTX-NEXT: store i32 1, ptr [[DOTATOMICTMP13]], align 4 +// NVPTX-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13]], align 4 +// NVPTX-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14]], align 4 +// NVPTX-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14]], align 4 +// NVPTX-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR]], align 8 +// NVPTX-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4 +// NVPTX-NEXT: ret void +// // SPIRV-LABEL: define hidden spir_func void @fi3a( // SPIRV-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] { // SPIRV-NEXT: [[ENTRY:.*:]] @@ -834,7 +1079,7 @@ void fi3a(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] // AMDGCN_CL_DEF-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 @@ -842,7 +1087,7 @@ void fi3a(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] // AMDGCN_CL_DEF-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 @@ -850,7 +1095,7 @@ void fi3a(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_DEF-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] // AMDGCN_CL_DEF-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 @@ -858,7 +1103,7 @@ void fi3a(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_DEF-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] // AMDGCN_CL_DEF-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 @@ -866,7 +1111,7 @@ void fi3a(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_DEF-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] // AMDGCN_CL_DEF-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 @@ -874,7 +1119,7 @@ void fi3a(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_DEF-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] // AMDGCN_CL_DEF-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 @@ -882,7 +1127,7 @@ void fi3a(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_DEF-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] // AMDGCN_CL_DEF-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 @@ -890,7 +1135,7 @@ void fi3a(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_DEF-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] // AMDGCN_CL_DEF-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 @@ -959,7 +1204,7 @@ void fi3a(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] // AMDGCN_CL_20-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 @@ -967,7 +1212,7 @@ void fi3a(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] // AMDGCN_CL_20-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 @@ -975,7 +1220,7 @@ void fi3a(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_20-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] // AMDGCN_CL_20-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 @@ -983,7 +1228,7 @@ void fi3a(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_20-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] // AMDGCN_CL_20-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 @@ -991,7 +1236,7 @@ void fi3a(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_20-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] // AMDGCN_CL_20-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 @@ -999,7 +1244,7 @@ void fi3a(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_20-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] // AMDGCN_CL_20-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 @@ -1007,7 +1252,7 @@ void fi3a(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_20-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] // AMDGCN_CL_20-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 @@ -1015,13 +1260,114 @@ void fi3a(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_20-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] // AMDGCN_CL_20-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4 // AMDGCN_CL_20-NEXT: ret void // +// NVPTX-LABEL: define hidden void @fi3b( +// NVPTX-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[F_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[H_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DOTATOMICTMP7:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[ATOMIC_TEMP8:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DOTATOMICTMP9:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[ATOMIC_TEMP10:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DOTATOMICTMP11:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8 +// NVPTX-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 +// NVPTX-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 +// NVPTX-NEXT: store ptr [[D]], ptr [[D_ADDR]], align 8 +// NVPTX-NEXT: store ptr [[E]], ptr [[E_ADDR]], align 8 +// NVPTX-NEXT: store ptr [[F]], ptr [[F_ADDR]], align 8 +// NVPTX-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 8 +// NVPTX-NEXT: store ptr [[H]], ptr [[H_ADDR]], align 8 +// NVPTX-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8 +// NVPTX-NEXT: store i32 1, ptr [[DOTATOMICTMP]], align 4 +// NVPTX-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4 +// NVPTX-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("device") monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP]], align 4 +// NVPTX-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4 +// NVPTX-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR]], align 8 +// NVPTX-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4 +// NVPTX-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR]], align 8 +// NVPTX-NEXT: store i32 1, ptr [[DOTATOMICTMP1]], align 4 +// NVPTX-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1]], align 4 +// NVPTX-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("device") monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2]], align 4 +// NVPTX-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2]], align 4 +// NVPTX-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR]], align 8 +// NVPTX-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4 +// NVPTX-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR]], align 8 +// NVPTX-NEXT: store i32 1, ptr [[DOTATOMICTMP3]], align 4 +// NVPTX-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3]], align 4 +// NVPTX-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("device") monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4]], align 4 +// NVPTX-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4]], align 4 +// NVPTX-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR]], align 8 +// NVPTX-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4 +// NVPTX-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR]], align 8 +// NVPTX-NEXT: store i32 1, ptr [[DOTATOMICTMP5]], align 4 +// NVPTX-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5]], align 4 +// NVPTX-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("device") monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6]], align 4 +// NVPTX-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6]], align 4 +// NVPTX-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR]], align 8 +// NVPTX-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4 +// NVPTX-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR]], align 8 +// NVPTX-NEXT: store i32 1, ptr [[DOTATOMICTMP7]], align 4 +// NVPTX-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7]], align 4 +// NVPTX-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("device") monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8]], align 4 +// NVPTX-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8]], align 4 +// NVPTX-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR]], align 8 +// NVPTX-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4 +// NVPTX-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR]], align 8 +// NVPTX-NEXT: store i32 1, ptr [[DOTATOMICTMP9]], align 4 +// NVPTX-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9]], align 4 +// NVPTX-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("device") monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10]], align 4 +// NVPTX-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10]], align 4 +// NVPTX-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR]], align 8 +// NVPTX-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4 +// NVPTX-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR]], align 8 +// NVPTX-NEXT: store i32 1, ptr [[DOTATOMICTMP11]], align 4 +// NVPTX-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11]], align 4 +// NVPTX-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("device") monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12]], align 4 +// NVPTX-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12]], align 4 +// NVPTX-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR]], align 8 +// NVPTX-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4 +// NVPTX-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR]], align 8 +// NVPTX-NEXT: store i32 1, ptr [[DOTATOMICTMP13]], align 4 +// NVPTX-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13]], align 4 +// NVPTX-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("device") monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14]], align 4 +// NVPTX-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14]], align 4 +// NVPTX-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR]], align 8 +// NVPTX-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4 +// NVPTX-NEXT: ret void +// // SPIRV-LABEL: define hidden spir_func void @fi3b( // SPIRV-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] { // SPIRV-NEXT: [[ENTRY:.*:]] @@ -1196,7 +1542,7 @@ void fi3b(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] // AMDGCN_CL_DEF-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 @@ -1204,7 +1550,7 @@ void fi3b(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] // AMDGCN_CL_DEF-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 @@ -1212,7 +1558,7 @@ void fi3b(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_DEF-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] // AMDGCN_CL_DEF-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 @@ -1220,7 +1566,7 @@ void fi3b(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_DEF-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] // AMDGCN_CL_DEF-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 @@ -1228,7 +1574,7 @@ void fi3b(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_DEF-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] // AMDGCN_CL_DEF-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 @@ -1236,7 +1582,7 @@ void fi3b(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_DEF-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] // AMDGCN_CL_DEF-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 @@ -1244,7 +1590,7 @@ void fi3b(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_DEF-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] // AMDGCN_CL_DEF-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 @@ -1252,7 +1598,7 @@ void fi3b(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_DEF-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] // AMDGCN_CL_DEF-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 @@ -1321,7 +1667,7 @@ void fi3b(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] // AMDGCN_CL_20-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 @@ -1329,7 +1675,7 @@ void fi3b(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] // AMDGCN_CL_20-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 @@ -1337,7 +1683,7 @@ void fi3b(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_20-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] // AMDGCN_CL_20-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 @@ -1345,7 +1691,7 @@ void fi3b(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_20-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] // AMDGCN_CL_20-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 @@ -1353,7 +1699,7 @@ void fi3b(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_20-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] // AMDGCN_CL_20-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 @@ -1361,7 +1707,7 @@ void fi3b(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_20-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] // AMDGCN_CL_20-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 @@ -1369,7 +1715,7 @@ void fi3b(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_20-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] // AMDGCN_CL_20-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 @@ -1377,13 +1723,114 @@ void fi3b(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_20-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] // AMDGCN_CL_20-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4 // AMDGCN_CL_20-NEXT: ret void // +// NVPTX-LABEL: define hidden void @fi3c( +// NVPTX-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[F_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[H_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DOTATOMICTMP7:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[ATOMIC_TEMP8:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DOTATOMICTMP9:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[ATOMIC_TEMP10:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DOTATOMICTMP11:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8 +// NVPTX-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 +// NVPTX-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 +// NVPTX-NEXT: store ptr [[D]], ptr [[D_ADDR]], align 8 +// NVPTX-NEXT: store ptr [[E]], ptr [[E_ADDR]], align 8 +// NVPTX-NEXT: store ptr [[F]], ptr [[F_ADDR]], align 8 +// NVPTX-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 8 +// NVPTX-NEXT: store ptr [[H]], ptr [[H_ADDR]], align 8 +// NVPTX-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8 +// NVPTX-NEXT: store i32 1, ptr [[DOTATOMICTMP]], align 4 +// NVPTX-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4 +// NVPTX-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("block") monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP]], align 4 +// NVPTX-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4 +// NVPTX-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR]], align 8 +// NVPTX-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4 +// NVPTX-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR]], align 8 +// NVPTX-NEXT: store i32 1, ptr [[DOTATOMICTMP1]], align 4 +// NVPTX-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1]], align 4 +// NVPTX-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("block") monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2]], align 4 +// NVPTX-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2]], align 4 +// NVPTX-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR]], align 8 +// NVPTX-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4 +// NVPTX-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR]], align 8 +// NVPTX-NEXT: store i32 1, ptr [[DOTATOMICTMP3]], align 4 +// NVPTX-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3]], align 4 +// NVPTX-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("block") monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4]], align 4 +// NVPTX-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4]], align 4 +// NVPTX-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR]], align 8 +// NVPTX-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4 +// NVPTX-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR]], align 8 +// NVPTX-NEXT: store i32 1, ptr [[DOTATOMICTMP5]], align 4 +// NVPTX-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5]], align 4 +// NVPTX-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("block") monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6]], align 4 +// NVPTX-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6]], align 4 +// NVPTX-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR]], align 8 +// NVPTX-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4 +// NVPTX-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR]], align 8 +// NVPTX-NEXT: store i32 1, ptr [[DOTATOMICTMP7]], align 4 +// NVPTX-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7]], align 4 +// NVPTX-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("block") monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8]], align 4 +// NVPTX-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8]], align 4 +// NVPTX-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR]], align 8 +// NVPTX-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4 +// NVPTX-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR]], align 8 +// NVPTX-NEXT: store i32 1, ptr [[DOTATOMICTMP9]], align 4 +// NVPTX-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9]], align 4 +// NVPTX-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("block") monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10]], align 4 +// NVPTX-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10]], align 4 +// NVPTX-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR]], align 8 +// NVPTX-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4 +// NVPTX-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR]], align 8 +// NVPTX-NEXT: store i32 1, ptr [[DOTATOMICTMP11]], align 4 +// NVPTX-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11]], align 4 +// NVPTX-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("block") monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12]], align 4 +// NVPTX-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12]], align 4 +// NVPTX-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR]], align 8 +// NVPTX-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4 +// NVPTX-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR]], align 8 +// NVPTX-NEXT: store i32 1, ptr [[DOTATOMICTMP13]], align 4 +// NVPTX-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13]], align 4 +// NVPTX-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("block") monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14]], align 4 +// NVPTX-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14]], align 4 +// NVPTX-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR]], align 8 +// NVPTX-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4 +// NVPTX-NEXT: ret void +// // SPIRV-LABEL: define hidden spir_func void @fi3c( // SPIRV-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] { // SPIRV-NEXT: [[ENTRY:.*:]] @@ -1558,7 +2005,7 @@ void fi3c(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] // AMDGCN_CL_DEF-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 @@ -1566,7 +2013,7 @@ void fi3c(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] // AMDGCN_CL_DEF-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 @@ -1574,7 +2021,7 @@ void fi3c(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_DEF-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] // AMDGCN_CL_DEF-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 @@ -1582,7 +2029,7 @@ void fi3c(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_DEF-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] // AMDGCN_CL_DEF-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 @@ -1590,7 +2037,7 @@ void fi3c(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_DEF-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] // AMDGCN_CL_DEF-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 @@ -1598,7 +2045,7 @@ void fi3c(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_DEF-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] // AMDGCN_CL_DEF-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 @@ -1606,7 +2053,7 @@ void fi3c(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_DEF-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] // AMDGCN_CL_DEF-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 @@ -1614,7 +2061,7 @@ void fi3c(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_DEF-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] // AMDGCN_CL_DEF-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 @@ -1683,7 +2130,7 @@ void fi3c(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] // AMDGCN_CL_20-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 @@ -1691,7 +2138,7 @@ void fi3c(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] // AMDGCN_CL_20-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 @@ -1699,7 +2146,7 @@ void fi3c(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_20-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] // AMDGCN_CL_20-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 @@ -1707,7 +2154,7 @@ void fi3c(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_20-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] // AMDGCN_CL_20-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 @@ -1715,7 +2162,7 @@ void fi3c(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_20-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] // AMDGCN_CL_20-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 @@ -1723,7 +2170,7 @@ void fi3c(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_20-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] // AMDGCN_CL_20-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 @@ -1731,7 +2178,7 @@ void fi3c(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_20-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] // AMDGCN_CL_20-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 @@ -1739,13 +2186,114 @@ void fi3c(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_20-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] // AMDGCN_CL_20-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4 // AMDGCN_CL_20-NEXT: ret void // +// NVPTX-LABEL: define hidden void @fi3_clustr( +// NVPTX-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[F_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[H_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DOTATOMICTMP7:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[ATOMIC_TEMP8:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DOTATOMICTMP9:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[ATOMIC_TEMP10:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DOTATOMICTMP11:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8 +// NVPTX-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 +// NVPTX-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 +// NVPTX-NEXT: store ptr [[D]], ptr [[D_ADDR]], align 8 +// NVPTX-NEXT: store ptr [[E]], ptr [[E_ADDR]], align 8 +// NVPTX-NEXT: store ptr [[F]], ptr [[F_ADDR]], align 8 +// NVPTX-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 8 +// NVPTX-NEXT: store ptr [[H]], ptr [[H_ADDR]], align 8 +// NVPTX-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8 +// NVPTX-NEXT: store i32 1, ptr [[DOTATOMICTMP]], align 4 +// NVPTX-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4 +// NVPTX-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("cluster") monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP]], align 4 +// NVPTX-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4 +// NVPTX-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR]], align 8 +// NVPTX-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4 +// NVPTX-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR]], align 8 +// NVPTX-NEXT: store i32 1, ptr [[DOTATOMICTMP1]], align 4 +// NVPTX-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1]], align 4 +// NVPTX-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("cluster") monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2]], align 4 +// NVPTX-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2]], align 4 +// NVPTX-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR]], align 8 +// NVPTX-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4 +// NVPTX-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR]], align 8 +// NVPTX-NEXT: store i32 1, ptr [[DOTATOMICTMP3]], align 4 +// NVPTX-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3]], align 4 +// NVPTX-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("cluster") monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4]], align 4 +// NVPTX-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4]], align 4 +// NVPTX-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR]], align 8 +// NVPTX-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4 +// NVPTX-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR]], align 8 +// NVPTX-NEXT: store i32 1, ptr [[DOTATOMICTMP5]], align 4 +// NVPTX-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5]], align 4 +// NVPTX-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("cluster") monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6]], align 4 +// NVPTX-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6]], align 4 +// NVPTX-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR]], align 8 +// NVPTX-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4 +// NVPTX-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR]], align 8 +// NVPTX-NEXT: store i32 1, ptr [[DOTATOMICTMP7]], align 4 +// NVPTX-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7]], align 4 +// NVPTX-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("cluster") monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8]], align 4 +// NVPTX-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8]], align 4 +// NVPTX-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR]], align 8 +// NVPTX-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4 +// NVPTX-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR]], align 8 +// NVPTX-NEXT: store i32 1, ptr [[DOTATOMICTMP9]], align 4 +// NVPTX-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9]], align 4 +// NVPTX-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("cluster") monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10]], align 4 +// NVPTX-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10]], align 4 +// NVPTX-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR]], align 8 +// NVPTX-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4 +// NVPTX-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR]], align 8 +// NVPTX-NEXT: store i32 1, ptr [[DOTATOMICTMP11]], align 4 +// NVPTX-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11]], align 4 +// NVPTX-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("cluster") monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12]], align 4 +// NVPTX-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12]], align 4 +// NVPTX-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR]], align 8 +// NVPTX-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4 +// NVPTX-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR]], align 8 +// NVPTX-NEXT: store i32 1, ptr [[DOTATOMICTMP13]], align 4 +// NVPTX-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13]], align 4 +// NVPTX-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("cluster") monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14]], align 4 +// NVPTX-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14]], align 4 +// NVPTX-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR]], align 8 +// NVPTX-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4 +// NVPTX-NEXT: ret void +// // SPIRV-LABEL: define hidden spir_func void @fi3_clustr( // SPIRV-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] { // SPIRV-NEXT: [[ENTRY:.*:]] @@ -1920,7 +2468,7 @@ void fi3_clustr(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) // AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] // AMDGCN_CL_DEF-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 @@ -1928,7 +2476,7 @@ void fi3_clustr(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) // AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] // AMDGCN_CL_DEF-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 @@ -1936,7 +2484,7 @@ void fi3_clustr(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) // AMDGCN_CL_DEF-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] // AMDGCN_CL_DEF-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 @@ -1944,7 +2492,7 @@ void fi3_clustr(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) // AMDGCN_CL_DEF-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] // AMDGCN_CL_DEF-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 @@ -1952,7 +2500,7 @@ void fi3_clustr(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) // AMDGCN_CL_DEF-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] // AMDGCN_CL_DEF-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 @@ -1960,7 +2508,7 @@ void fi3_clustr(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) // AMDGCN_CL_DEF-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] // AMDGCN_CL_DEF-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 @@ -1968,7 +2516,7 @@ void fi3_clustr(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) // AMDGCN_CL_DEF-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] // AMDGCN_CL_DEF-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 @@ -1976,7 +2524,7 @@ void fi3_clustr(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) // AMDGCN_CL_DEF-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] // AMDGCN_CL_DEF-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 @@ -2045,7 +2593,7 @@ void fi3_clustr(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) // AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] // AMDGCN_CL_20-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 @@ -2053,7 +2601,7 @@ void fi3_clustr(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) // AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] // AMDGCN_CL_20-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 @@ -2061,7 +2609,7 @@ void fi3_clustr(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) // AMDGCN_CL_20-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] // AMDGCN_CL_20-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 @@ -2069,7 +2617,7 @@ void fi3_clustr(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) // AMDGCN_CL_20-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] // AMDGCN_CL_20-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 @@ -2077,7 +2625,7 @@ void fi3_clustr(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) // AMDGCN_CL_20-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] // AMDGCN_CL_20-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 @@ -2085,7 +2633,7 @@ void fi3_clustr(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) // AMDGCN_CL_20-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] // AMDGCN_CL_20-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 @@ -2093,7 +2641,7 @@ void fi3_clustr(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) // AMDGCN_CL_20-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] // AMDGCN_CL_20-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 @@ -2101,13 +2649,114 @@ void fi3_clustr(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) // AMDGCN_CL_20-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] // AMDGCN_CL_20-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4 // AMDGCN_CL_20-NEXT: ret void // +// NVPTX-LABEL: define hidden void @fi3d( +// NVPTX-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[F_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[H_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DOTATOMICTMP7:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[ATOMIC_TEMP8:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DOTATOMICTMP9:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[ATOMIC_TEMP10:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DOTATOMICTMP11:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8 +// NVPTX-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 +// NVPTX-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 +// NVPTX-NEXT: store ptr [[D]], ptr [[D_ADDR]], align 8 +// NVPTX-NEXT: store ptr [[E]], ptr [[E_ADDR]], align 8 +// NVPTX-NEXT: store ptr [[F]], ptr [[F_ADDR]], align 8 +// NVPTX-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 8 +// NVPTX-NEXT: store ptr [[H]], ptr [[H_ADDR]], align 8 +// NVPTX-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8 +// NVPTX-NEXT: store i32 1, ptr [[DOTATOMICTMP]], align 4 +// NVPTX-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4 +// NVPTX-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("block") monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP]], align 4 +// NVPTX-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4 +// NVPTX-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR]], align 8 +// NVPTX-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4 +// NVPTX-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR]], align 8 +// NVPTX-NEXT: store i32 1, ptr [[DOTATOMICTMP1]], align 4 +// NVPTX-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1]], align 4 +// NVPTX-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("block") monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2]], align 4 +// NVPTX-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2]], align 4 +// NVPTX-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR]], align 8 +// NVPTX-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4 +// NVPTX-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR]], align 8 +// NVPTX-NEXT: store i32 1, ptr [[DOTATOMICTMP3]], align 4 +// NVPTX-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3]], align 4 +// NVPTX-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("block") monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4]], align 4 +// NVPTX-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4]], align 4 +// NVPTX-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR]], align 8 +// NVPTX-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4 +// NVPTX-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR]], align 8 +// NVPTX-NEXT: store i32 1, ptr [[DOTATOMICTMP5]], align 4 +// NVPTX-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5]], align 4 +// NVPTX-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("block") monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6]], align 4 +// NVPTX-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6]], align 4 +// NVPTX-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR]], align 8 +// NVPTX-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4 +// NVPTX-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR]], align 8 +// NVPTX-NEXT: store i32 1, ptr [[DOTATOMICTMP7]], align 4 +// NVPTX-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7]], align 4 +// NVPTX-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("block") monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8]], align 4 +// NVPTX-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8]], align 4 +// NVPTX-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR]], align 8 +// NVPTX-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4 +// NVPTX-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR]], align 8 +// NVPTX-NEXT: store i32 1, ptr [[DOTATOMICTMP9]], align 4 +// NVPTX-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9]], align 4 +// NVPTX-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("block") monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10]], align 4 +// NVPTX-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10]], align 4 +// NVPTX-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR]], align 8 +// NVPTX-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4 +// NVPTX-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR]], align 8 +// NVPTX-NEXT: store i32 1, ptr [[DOTATOMICTMP11]], align 4 +// NVPTX-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11]], align 4 +// NVPTX-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("block") monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12]], align 4 +// NVPTX-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12]], align 4 +// NVPTX-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR]], align 8 +// NVPTX-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4 +// NVPTX-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR]], align 8 +// NVPTX-NEXT: store i32 1, ptr [[DOTATOMICTMP13]], align 4 +// NVPTX-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13]], align 4 +// NVPTX-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("block") monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14]], align 4 +// NVPTX-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14]], align 4 +// NVPTX-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR]], align 8 +// NVPTX-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4 +// NVPTX-NEXT: ret void +// // SPIRV-LABEL: define hidden spir_func void @fi3d( // SPIRV-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] { // SPIRV-NEXT: [[ENTRY:.*:]] @@ -2282,7 +2931,7 @@ void fi3d(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] // AMDGCN_CL_DEF-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 @@ -2290,7 +2939,7 @@ void fi3d(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] // AMDGCN_CL_DEF-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 @@ -2298,7 +2947,7 @@ void fi3d(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_DEF-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] // AMDGCN_CL_DEF-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 @@ -2306,7 +2955,7 @@ void fi3d(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_DEF-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] // AMDGCN_CL_DEF-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 @@ -2314,7 +2963,7 @@ void fi3d(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_DEF-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] // AMDGCN_CL_DEF-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 @@ -2322,7 +2971,7 @@ void fi3d(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_DEF-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] // AMDGCN_CL_DEF-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 @@ -2330,7 +2979,7 @@ void fi3d(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_DEF-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] // AMDGCN_CL_DEF-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 @@ -2338,7 +2987,7 @@ void fi3d(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_DEF-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] // AMDGCN_CL_DEF-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 @@ -2407,7 +3056,7 @@ void fi3d(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] // AMDGCN_CL_20-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 @@ -2415,7 +3064,7 @@ void fi3d(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] // AMDGCN_CL_20-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 @@ -2423,7 +3072,7 @@ void fi3d(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_20-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] // AMDGCN_CL_20-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 @@ -2431,7 +3080,7 @@ void fi3d(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_20-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] // AMDGCN_CL_20-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 @@ -2439,7 +3088,7 @@ void fi3d(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_20-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] // AMDGCN_CL_20-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 @@ -2447,7 +3096,7 @@ void fi3d(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_20-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] // AMDGCN_CL_20-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 @@ -2455,7 +3104,7 @@ void fi3d(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_20-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] // AMDGCN_CL_20-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 @@ -2463,13 +3112,114 @@ void fi3d(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_20-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] // AMDGCN_CL_20-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4 // AMDGCN_CL_20-NEXT: ret void // +// NVPTX-LABEL: define hidden void @fi3e( +// NVPTX-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[F_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[H_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DOTATOMICTMP7:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[ATOMIC_TEMP8:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DOTATOMICTMP9:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[ATOMIC_TEMP10:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DOTATOMICTMP11:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8 +// NVPTX-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 +// NVPTX-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 +// NVPTX-NEXT: store ptr [[D]], ptr [[D_ADDR]], align 8 +// NVPTX-NEXT: store ptr [[E]], ptr [[E_ADDR]], align 8 +// NVPTX-NEXT: store ptr [[F]], ptr [[F_ADDR]], align 8 +// NVPTX-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 8 +// NVPTX-NEXT: store ptr [[H]], ptr [[H_ADDR]], align 8 +// NVPTX-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8 +// NVPTX-NEXT: store i32 1, ptr [[DOTATOMICTMP]], align 4 +// NVPTX-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4 +// NVPTX-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("singlethread") monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP]], align 4 +// NVPTX-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4 +// NVPTX-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR]], align 8 +// NVPTX-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4 +// NVPTX-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR]], align 8 +// NVPTX-NEXT: store i32 1, ptr [[DOTATOMICTMP1]], align 4 +// NVPTX-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1]], align 4 +// NVPTX-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("singlethread") monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2]], align 4 +// NVPTX-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2]], align 4 +// NVPTX-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR]], align 8 +// NVPTX-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4 +// NVPTX-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR]], align 8 +// NVPTX-NEXT: store i32 1, ptr [[DOTATOMICTMP3]], align 4 +// NVPTX-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3]], align 4 +// NVPTX-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("singlethread") monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4]], align 4 +// NVPTX-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4]], align 4 +// NVPTX-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR]], align 8 +// NVPTX-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4 +// NVPTX-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR]], align 8 +// NVPTX-NEXT: store i32 1, ptr [[DOTATOMICTMP5]], align 4 +// NVPTX-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5]], align 4 +// NVPTX-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("singlethread") monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6]], align 4 +// NVPTX-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6]], align 4 +// NVPTX-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR]], align 8 +// NVPTX-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4 +// NVPTX-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR]], align 8 +// NVPTX-NEXT: store i32 1, ptr [[DOTATOMICTMP7]], align 4 +// NVPTX-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7]], align 4 +// NVPTX-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("singlethread") monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8]], align 4 +// NVPTX-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8]], align 4 +// NVPTX-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR]], align 8 +// NVPTX-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4 +// NVPTX-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR]], align 8 +// NVPTX-NEXT: store i32 1, ptr [[DOTATOMICTMP9]], align 4 +// NVPTX-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9]], align 4 +// NVPTX-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("singlethread") monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10]], align 4 +// NVPTX-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10]], align 4 +// NVPTX-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR]], align 8 +// NVPTX-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4 +// NVPTX-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR]], align 8 +// NVPTX-NEXT: store i32 1, ptr [[DOTATOMICTMP11]], align 4 +// NVPTX-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11]], align 4 +// NVPTX-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("singlethread") monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12]], align 4 +// NVPTX-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12]], align 4 +// NVPTX-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR]], align 8 +// NVPTX-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4 +// NVPTX-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR]], align 8 +// NVPTX-NEXT: store i32 1, ptr [[DOTATOMICTMP13]], align 4 +// NVPTX-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13]], align 4 +// NVPTX-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("singlethread") monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14]], align 4 +// NVPTX-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14]], align 4 +// NVPTX-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR]], align 8 +// NVPTX-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4 +// NVPTX-NEXT: ret void +// // SPIRV-LABEL: define hidden spir_func void @fi3e( // SPIRV-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] { // SPIRV-NEXT: [[ENTRY:.*:]] @@ -2643,6 +3393,33 @@ void fi3e(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { // AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP7]] to i1 // AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]] // +// NVPTX-LABEL: define hidden zeroext i1 @fi4a( +// NVPTX-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[CMP:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DESIRED:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1 +// NVPTX-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8 +// NVPTX-NEXT: store i32 0, ptr [[CMP]], align 4 +// NVPTX-NEXT: store i32 1, ptr [[DESIRED]], align 4 +// NVPTX-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// NVPTX-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP]], align 4 +// NVPTX-NEXT: [[TMP2:%.*]] = load i32, ptr [[DESIRED]], align 4 +// NVPTX-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] acquire acquire, align 4 +// NVPTX-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0 +// NVPTX-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +// NVPTX-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]] +// NVPTX: [[CMPXCHG_STORE_EXPECTED]]: +// NVPTX-NEXT: store i32 [[TMP4]], ptr [[CMP]], align 4 +// NVPTX-NEXT: br label %[[CMPXCHG_CONTINUE]] +// NVPTX: [[CMPXCHG_CONTINUE]]: +// NVPTX-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8 +// NVPTX-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL]], align 1 +// NVPTX-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL]], align 1 +// NVPTX-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1 +// NVPTX-NEXT: ret i1 [[LOADEDV]] +// // SPIRV-LABEL: define hidden spir_func zeroext i1 @fi4a( // SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { // SPIRV-NEXT: [[ENTRY:.*:]] @@ -2739,6 +3516,33 @@ _Bool fi4a(int *i) { // AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP7]] to i1 // AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]] // +// NVPTX-LABEL: define hidden zeroext i1 @fi4b( +// NVPTX-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[CMP:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DESIRED:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1 +// NVPTX-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8 +// NVPTX-NEXT: store i32 0, ptr [[CMP]], align 4 +// NVPTX-NEXT: store i32 1, ptr [[DESIRED]], align 4 +// NVPTX-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// NVPTX-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP]], align 4 +// NVPTX-NEXT: [[TMP2:%.*]] = load i32, ptr [[DESIRED]], align 4 +// NVPTX-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("device") acquire acquire, align 4 +// NVPTX-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0 +// NVPTX-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +// NVPTX-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]] +// NVPTX: [[CMPXCHG_STORE_EXPECTED]]: +// NVPTX-NEXT: store i32 [[TMP4]], ptr [[CMP]], align 4 +// NVPTX-NEXT: br label %[[CMPXCHG_CONTINUE]] +// NVPTX: [[CMPXCHG_CONTINUE]]: +// NVPTX-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8 +// NVPTX-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL]], align 1 +// NVPTX-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL]], align 1 +// NVPTX-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1 +// NVPTX-NEXT: ret i1 [[LOADEDV]] +// // SPIRV-LABEL: define hidden spir_func zeroext i1 @fi4b( // SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { // SPIRV-NEXT: [[ENTRY:.*:]] @@ -2835,6 +3639,33 @@ _Bool fi4b(int *i) { // AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP7]] to i1 // AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]] // +// NVPTX-LABEL: define hidden zeroext i1 @fi4c( +// NVPTX-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[CMP:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DESIRED:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1 +// NVPTX-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8 +// NVPTX-NEXT: store i32 0, ptr [[CMP]], align 4 +// NVPTX-NEXT: store i32 1, ptr [[DESIRED]], align 4 +// NVPTX-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// NVPTX-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP]], align 4 +// NVPTX-NEXT: [[TMP2:%.*]] = load i32, ptr [[DESIRED]], align 4 +// NVPTX-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("block") acquire acquire, align 4 +// NVPTX-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0 +// NVPTX-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +// NVPTX-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]] +// NVPTX: [[CMPXCHG_STORE_EXPECTED]]: +// NVPTX-NEXT: store i32 [[TMP4]], ptr [[CMP]], align 4 +// NVPTX-NEXT: br label %[[CMPXCHG_CONTINUE]] +// NVPTX: [[CMPXCHG_CONTINUE]]: +// NVPTX-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8 +// NVPTX-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL]], align 1 +// NVPTX-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL]], align 1 +// NVPTX-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1 +// NVPTX-NEXT: ret i1 [[LOADEDV]] +// // SPIRV-LABEL: define hidden spir_func zeroext i1 @fi4c( // SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { // SPIRV-NEXT: [[ENTRY:.*:]] @@ -2931,6 +3762,33 @@ _Bool fi4c(int *i) { // AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP7]] to i1 // AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]] // +// NVPTX-LABEL: define hidden zeroext i1 @fi4_clustr( +// NVPTX-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[CMP:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DESIRED:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1 +// NVPTX-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8 +// NVPTX-NEXT: store i32 0, ptr [[CMP]], align 4 +// NVPTX-NEXT: store i32 1, ptr [[DESIRED]], align 4 +// NVPTX-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// NVPTX-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP]], align 4 +// NVPTX-NEXT: [[TMP2:%.*]] = load i32, ptr [[DESIRED]], align 4 +// NVPTX-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("cluster") acquire acquire, align 4 +// NVPTX-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0 +// NVPTX-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +// NVPTX-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]] +// NVPTX: [[CMPXCHG_STORE_EXPECTED]]: +// NVPTX-NEXT: store i32 [[TMP4]], ptr [[CMP]], align 4 +// NVPTX-NEXT: br label %[[CMPXCHG_CONTINUE]] +// NVPTX: [[CMPXCHG_CONTINUE]]: +// NVPTX-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8 +// NVPTX-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL]], align 1 +// NVPTX-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL]], align 1 +// NVPTX-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1 +// NVPTX-NEXT: ret i1 [[LOADEDV]] +// // SPIRV-LABEL: define hidden spir_func zeroext i1 @fi4_clustr( // SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { // SPIRV-NEXT: [[ENTRY:.*:]] @@ -3027,6 +3885,33 @@ _Bool fi4_clustr(int *i) { // AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP7]] to i1 // AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]] // +// NVPTX-LABEL: define hidden zeroext i1 @fi4d( +// NVPTX-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[CMP:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DESIRED:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1 +// NVPTX-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8 +// NVPTX-NEXT: store i32 0, ptr [[CMP]], align 4 +// NVPTX-NEXT: store i32 1, ptr [[DESIRED]], align 4 +// NVPTX-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// NVPTX-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP]], align 4 +// NVPTX-NEXT: [[TMP2:%.*]] = load i32, ptr [[DESIRED]], align 4 +// NVPTX-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("block") acquire acquire, align 4 +// NVPTX-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0 +// NVPTX-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +// NVPTX-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]] +// NVPTX: [[CMPXCHG_STORE_EXPECTED]]: +// NVPTX-NEXT: store i32 [[TMP4]], ptr [[CMP]], align 4 +// NVPTX-NEXT: br label %[[CMPXCHG_CONTINUE]] +// NVPTX: [[CMPXCHG_CONTINUE]]: +// NVPTX-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8 +// NVPTX-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL]], align 1 +// NVPTX-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL]], align 1 +// NVPTX-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1 +// NVPTX-NEXT: ret i1 [[LOADEDV]] +// // SPIRV-LABEL: define hidden spir_func zeroext i1 @fi4d( // SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { // SPIRV-NEXT: [[ENTRY:.*:]] @@ -3123,6 +4008,33 @@ _Bool fi4d(int *i) { // AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP7]] to i1 // AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]] // +// NVPTX-LABEL: define hidden zeroext i1 @fi4e( +// NVPTX-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[CMP:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DESIRED:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1 +// NVPTX-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8 +// NVPTX-NEXT: store i32 0, ptr [[CMP]], align 4 +// NVPTX-NEXT: store i32 1, ptr [[DESIRED]], align 4 +// NVPTX-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// NVPTX-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP]], align 4 +// NVPTX-NEXT: [[TMP2:%.*]] = load i32, ptr [[DESIRED]], align 4 +// NVPTX-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("singlethread") acquire acquire, align 4 +// NVPTX-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0 +// NVPTX-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +// NVPTX-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]] +// NVPTX: [[CMPXCHG_STORE_EXPECTED]]: +// NVPTX-NEXT: store i32 [[TMP4]], ptr [[CMP]], align 4 +// NVPTX-NEXT: br label %[[CMPXCHG_CONTINUE]] +// NVPTX: [[CMPXCHG_CONTINUE]]: +// NVPTX-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8 +// NVPTX-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL]], align 1 +// NVPTX-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL]], align 1 +// NVPTX-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1 +// NVPTX-NEXT: ret i1 [[LOADEDV]] +// // SPIRV-LABEL: define hidden spir_func zeroext i1 @fi4e( // SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { // SPIRV-NEXT: [[ENTRY:.*:]] @@ -3219,6 +4131,33 @@ _Bool fi4e(int *i) { // AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1 // AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]] // +// NVPTX-LABEL: define hidden zeroext i1 @fi5a( +// NVPTX-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[CMP:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1 +// NVPTX-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8 +// NVPTX-NEXT: store i32 0, ptr [[CMP]], align 4 +// NVPTX-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// NVPTX-NEXT: store i32 1, ptr [[DOTATOMICTMP]], align 4 +// NVPTX-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP]], align 4 +// NVPTX-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4 +// NVPTX-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] acquire acquire, align 4 +// NVPTX-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0 +// NVPTX-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +// NVPTX-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]] +// NVPTX: [[CMPXCHG_STORE_EXPECTED]]: +// NVPTX-NEXT: store i32 [[TMP4]], ptr [[CMP]], align 4 +// NVPTX-NEXT: br label %[[CMPXCHG_CONTINUE]] +// NVPTX: [[CMPXCHG_CONTINUE]]: +// NVPTX-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8 +// NVPTX-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL]], align 1 +// NVPTX-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL]], align 1 +// NVPTX-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1 +// NVPTX-NEXT: ret i1 [[LOADEDV]] +// // SPIRV-LABEL: define hidden spir_func zeroext i1 @fi5a( // SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { // SPIRV-NEXT: [[ENTRY:.*:]] @@ -3314,6 +4253,33 @@ _Bool fi5a(int *i) { // AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1 // AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]] // +// NVPTX-LABEL: define hidden zeroext i1 @fi5b( +// NVPTX-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[CMP:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1 +// NVPTX-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8 +// NVPTX-NEXT: store i32 0, ptr [[CMP]], align 4 +// NVPTX-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// NVPTX-NEXT: store i32 1, ptr [[DOTATOMICTMP]], align 4 +// NVPTX-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP]], align 4 +// NVPTX-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4 +// NVPTX-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("device") acquire acquire, align 4 +// NVPTX-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0 +// NVPTX-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +// NVPTX-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]] +// NVPTX: [[CMPXCHG_STORE_EXPECTED]]: +// NVPTX-NEXT: store i32 [[TMP4]], ptr [[CMP]], align 4 +// NVPTX-NEXT: br label %[[CMPXCHG_CONTINUE]] +// NVPTX: [[CMPXCHG_CONTINUE]]: +// NVPTX-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8 +// NVPTX-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL]], align 1 +// NVPTX-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL]], align 1 +// NVPTX-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1 +// NVPTX-NEXT: ret i1 [[LOADEDV]] +// // SPIRV-LABEL: define hidden spir_func zeroext i1 @fi5b( // SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { // SPIRV-NEXT: [[ENTRY:.*:]] @@ -3409,6 +4375,33 @@ _Bool fi5b(int *i) { // AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1 // AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]] // +// NVPTX-LABEL: define hidden zeroext i1 @fi5c( +// NVPTX-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[CMP:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1 +// NVPTX-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8 +// NVPTX-NEXT: store i32 0, ptr [[CMP]], align 4 +// NVPTX-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// NVPTX-NEXT: store i32 1, ptr [[DOTATOMICTMP]], align 4 +// NVPTX-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP]], align 4 +// NVPTX-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4 +// NVPTX-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("block") acquire acquire, align 4 +// NVPTX-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0 +// NVPTX-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +// NVPTX-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]] +// NVPTX: [[CMPXCHG_STORE_EXPECTED]]: +// NVPTX-NEXT: store i32 [[TMP4]], ptr [[CMP]], align 4 +// NVPTX-NEXT: br label %[[CMPXCHG_CONTINUE]] +// NVPTX: [[CMPXCHG_CONTINUE]]: +// NVPTX-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8 +// NVPTX-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL]], align 1 +// NVPTX-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL]], align 1 +// NVPTX-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1 +// NVPTX-NEXT: ret i1 [[LOADEDV]] +// // SPIRV-LABEL: define hidden spir_func zeroext i1 @fi5c( // SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { // SPIRV-NEXT: [[ENTRY:.*:]] @@ -3503,6 +4496,33 @@ _Bool fi5c(int *i) { // AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1 // AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]] // +// NVPTX-LABEL: define hidden zeroext i1 @fi5_clustr( +// NVPTX-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[CMP:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1 +// NVPTX-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8 +// NVPTX-NEXT: store i32 0, ptr [[CMP]], align 4 +// NVPTX-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// NVPTX-NEXT: store i32 1, ptr [[DOTATOMICTMP]], align 4 +// NVPTX-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP]], align 4 +// NVPTX-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4 +// NVPTX-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("cluster") acquire acquire, align 4 +// NVPTX-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0 +// NVPTX-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +// NVPTX-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]] +// NVPTX: [[CMPXCHG_STORE_EXPECTED]]: +// NVPTX-NEXT: store i32 [[TMP4]], ptr [[CMP]], align 4 +// NVPTX-NEXT: br label %[[CMPXCHG_CONTINUE]] +// NVPTX: [[CMPXCHG_CONTINUE]]: +// NVPTX-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8 +// NVPTX-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL]], align 1 +// NVPTX-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL]], align 1 +// NVPTX-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1 +// NVPTX-NEXT: ret i1 [[LOADEDV]] +// // SPIRV-LABEL: define hidden spir_func zeroext i1 @fi5_clustr( // SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { // SPIRV-NEXT: [[ENTRY:.*:]] @@ -3597,6 +4617,33 @@ _Bool fi5_clustr(int *i) { // AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1 // AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]] // +// NVPTX-LABEL: define hidden zeroext i1 @fi5d( +// NVPTX-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[CMP:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1 +// NVPTX-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8 +// NVPTX-NEXT: store i32 0, ptr [[CMP]], align 4 +// NVPTX-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// NVPTX-NEXT: store i32 1, ptr [[DOTATOMICTMP]], align 4 +// NVPTX-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP]], align 4 +// NVPTX-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4 +// NVPTX-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("block") acquire acquire, align 4 +// NVPTX-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0 +// NVPTX-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +// NVPTX-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]] +// NVPTX: [[CMPXCHG_STORE_EXPECTED]]: +// NVPTX-NEXT: store i32 [[TMP4]], ptr [[CMP]], align 4 +// NVPTX-NEXT: br label %[[CMPXCHG_CONTINUE]] +// NVPTX: [[CMPXCHG_CONTINUE]]: +// NVPTX-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8 +// NVPTX-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL]], align 1 +// NVPTX-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL]], align 1 +// NVPTX-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1 +// NVPTX-NEXT: ret i1 [[LOADEDV]] +// // SPIRV-LABEL: define hidden spir_func zeroext i1 @fi5d( // SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { // SPIRV-NEXT: [[ENTRY:.*:]] @@ -3691,6 +4738,33 @@ _Bool fi5d(int *i) { // AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1 // AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]] // +// NVPTX-LABEL: define hidden zeroext i1 @fi5e( +// NVPTX-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[CMP:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1 +// NVPTX-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8 +// NVPTX-NEXT: store i32 0, ptr [[CMP]], align 4 +// NVPTX-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// NVPTX-NEXT: store i32 1, ptr [[DOTATOMICTMP]], align 4 +// NVPTX-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP]], align 4 +// NVPTX-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4 +// NVPTX-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("singlethread") acquire acquire, align 4 +// NVPTX-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0 +// NVPTX-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +// NVPTX-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]] +// NVPTX: [[CMPXCHG_STORE_EXPECTED]]: +// NVPTX-NEXT: store i32 [[TMP4]], ptr [[CMP]], align 4 +// NVPTX-NEXT: br label %[[CMPXCHG_CONTINUE]] +// NVPTX: [[CMPXCHG_CONTINUE]]: +// NVPTX-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8 +// NVPTX-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL]], align 1 +// NVPTX-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL]], align 1 +// NVPTX-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1 +// NVPTX-NEXT: ret i1 [[LOADEDV]] +// // SPIRV-LABEL: define hidden spir_func zeroext i1 @fi5e( // SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { // SPIRV-NEXT: [[ENTRY:.*:]] @@ -3738,7 +4812,7 @@ _Bool fi5e(int *i) { // AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP2]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP2]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] // AMDGCN_CL_DEF-NEXT: store i32 [[TMP3]], ptr [[RET_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load i32, ptr [[RET_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: ret i32 [[TMP4]] @@ -3757,11 +4831,27 @@ _Bool fi5e(int *i) { // AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[RET]] to ptr // AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP3]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP3]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] // AMDGCN_CL_20-NEXT: store i32 [[TMP4]], ptr [[TMP2]], align 4 // AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(5) [[RET]], align 4 // AMDGCN_CL_20-NEXT: ret i32 [[TMP5]] // +// NVPTX-LABEL: define hidden i32 @fi6a( +// NVPTX-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[RET:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 +// NVPTX-NEXT: store ptr [[D]], ptr [[D_ADDR]], align 8 +// NVPTX-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8 +// NVPTX-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR]], align 8 +// NVPTX-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4 +// NVPTX-NEXT: [[TMP3:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP2]] monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP3]], ptr [[RET]], align 4 +// NVPTX-NEXT: [[TMP4:%.*]] = load i32, ptr [[RET]], align 4 +// NVPTX-NEXT: ret i32 [[TMP4]] +// // SPIRV-LABEL: define hidden spir_func i32 @fi6a( // SPIRV-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] { // SPIRV-NEXT: [[ENTRY:.*:]] @@ -3798,7 +4888,7 @@ int fi6a(int *c, int *d) { // AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP2]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP2]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] // AMDGCN_CL_DEF-NEXT: store i32 [[TMP3]], ptr [[RET_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load i32, ptr [[RET_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: ret i32 [[TMP4]] @@ -3817,11 +4907,27 @@ int fi6a(int *c, int *d) { // AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[RET]] to ptr // AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP3]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP3]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] // AMDGCN_CL_20-NEXT: store i32 [[TMP4]], ptr [[TMP2]], align 4 // AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(5) [[RET]], align 4 // AMDGCN_CL_20-NEXT: ret i32 [[TMP5]] // +// NVPTX-LABEL: define hidden i32 @fi6b( +// NVPTX-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[RET:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 +// NVPTX-NEXT: store ptr [[D]], ptr [[D_ADDR]], align 8 +// NVPTX-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8 +// NVPTX-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR]], align 8 +// NVPTX-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4 +// NVPTX-NEXT: [[TMP3:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP2]] syncscope("device") monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP3]], ptr [[RET]], align 4 +// NVPTX-NEXT: [[TMP4:%.*]] = load i32, ptr [[RET]], align 4 +// NVPTX-NEXT: ret i32 [[TMP4]] +// // SPIRV-LABEL: define hidden spir_func i32 @fi6b( // SPIRV-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] { // SPIRV-NEXT: [[ENTRY:.*:]] @@ -3858,7 +4964,7 @@ int fi6b(int *c, int *d) { // AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP2]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP2]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] // AMDGCN_CL_DEF-NEXT: store i32 [[TMP3]], ptr [[RET_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load i32, ptr [[RET_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: ret i32 [[TMP4]] @@ -3877,11 +4983,27 @@ int fi6b(int *c, int *d) { // AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[RET]] to ptr // AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP3]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP3]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] // AMDGCN_CL_20-NEXT: store i32 [[TMP4]], ptr [[TMP2]], align 4 // AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(5) [[RET]], align 4 // AMDGCN_CL_20-NEXT: ret i32 [[TMP5]] // +// NVPTX-LABEL: define hidden i32 @fi6c( +// NVPTX-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[RET:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 +// NVPTX-NEXT: store ptr [[D]], ptr [[D_ADDR]], align 8 +// NVPTX-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8 +// NVPTX-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR]], align 8 +// NVPTX-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4 +// NVPTX-NEXT: [[TMP3:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP2]] syncscope("block") monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP3]], ptr [[RET]], align 4 +// NVPTX-NEXT: [[TMP4:%.*]] = load i32, ptr [[RET]], align 4 +// NVPTX-NEXT: ret i32 [[TMP4]] +// // SPIRV-LABEL: define hidden spir_func i32 @fi6c( // SPIRV-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] { // SPIRV-NEXT: [[ENTRY:.*:]] @@ -3918,7 +5040,7 @@ int fi6c(int *c, int *d) { // AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP2]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP2]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] // AMDGCN_CL_DEF-NEXT: store i32 [[TMP3]], ptr [[RET_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load i32, ptr [[RET_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: ret i32 [[TMP4]] @@ -3937,11 +5059,27 @@ int fi6c(int *c, int *d) { // AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[RET]] to ptr // AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP3]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP3]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] // AMDGCN_CL_20-NEXT: store i32 [[TMP4]], ptr [[TMP2]], align 4 // AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(5) [[RET]], align 4 // AMDGCN_CL_20-NEXT: ret i32 [[TMP5]] // +// NVPTX-LABEL: define hidden i32 @fi6_clustr( +// NVPTX-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[RET:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 +// NVPTX-NEXT: store ptr [[D]], ptr [[D_ADDR]], align 8 +// NVPTX-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8 +// NVPTX-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR]], align 8 +// NVPTX-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4 +// NVPTX-NEXT: [[TMP3:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP2]] syncscope("cluster") monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP3]], ptr [[RET]], align 4 +// NVPTX-NEXT: [[TMP4:%.*]] = load i32, ptr [[RET]], align 4 +// NVPTX-NEXT: ret i32 [[TMP4]] +// // SPIRV-LABEL: define hidden spir_func i32 @fi6_clustr( // SPIRV-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] { // SPIRV-NEXT: [[ENTRY:.*:]] @@ -3978,7 +5116,7 @@ int fi6_clustr(int *c, int *d) { // AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP2]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP2]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] // AMDGCN_CL_DEF-NEXT: store i32 [[TMP3]], ptr [[RET_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load i32, ptr [[RET_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: ret i32 [[TMP4]] @@ -3997,11 +5135,27 @@ int fi6_clustr(int *c, int *d) { // AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[RET]] to ptr // AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP3]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP3]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] // AMDGCN_CL_20-NEXT: store i32 [[TMP4]], ptr [[TMP2]], align 4 // AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(5) [[RET]], align 4 // AMDGCN_CL_20-NEXT: ret i32 [[TMP5]] // +// NVPTX-LABEL: define hidden i32 @fi6d( +// NVPTX-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[RET:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 +// NVPTX-NEXT: store ptr [[D]], ptr [[D_ADDR]], align 8 +// NVPTX-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8 +// NVPTX-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR]], align 8 +// NVPTX-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4 +// NVPTX-NEXT: [[TMP3:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP2]] syncscope("block") monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP3]], ptr [[RET]], align 4 +// NVPTX-NEXT: [[TMP4:%.*]] = load i32, ptr [[RET]], align 4 +// NVPTX-NEXT: ret i32 [[TMP4]] +// // SPIRV-LABEL: define hidden spir_func i32 @fi6d( // SPIRV-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] { // SPIRV-NEXT: [[ENTRY:.*:]] @@ -4038,7 +5192,7 @@ int fi6d(int *c, int *d) { // AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP2]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP2]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] // AMDGCN_CL_DEF-NEXT: store i32 [[TMP3]], ptr [[RET_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load i32, ptr [[RET_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: ret i32 [[TMP4]] @@ -4057,11 +5211,27 @@ int fi6d(int *c, int *d) { // AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[RET]] to ptr // AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP3]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP3]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] // AMDGCN_CL_20-NEXT: store i32 [[TMP4]], ptr [[TMP2]], align 4 // AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(5) [[RET]], align 4 // AMDGCN_CL_20-NEXT: ret i32 [[TMP5]] // +// NVPTX-LABEL: define hidden i32 @fi6e( +// NVPTX-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[RET:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 +// NVPTX-NEXT: store ptr [[D]], ptr [[D_ADDR]], align 8 +// NVPTX-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8 +// NVPTX-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR]], align 8 +// NVPTX-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4 +// NVPTX-NEXT: [[TMP3:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP2]] syncscope("singlethread") monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP3]], ptr [[RET]], align 4 +// NVPTX-NEXT: [[TMP4:%.*]] = load i32, ptr [[RET]], align 4 +// NVPTX-NEXT: ret i32 [[TMP4]] +// // SPIRV-LABEL: define hidden spir_func i32 @fi6e( // SPIRV-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] { // SPIRV-NEXT: [[ENTRY:.*:]] @@ -4097,7 +5267,7 @@ int fi6e(int *c, int *d) { // AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1 // AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1 -// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] monotonic, align 1, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] monotonic, align 1, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] // AMDGCN_CL_DEF-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1 // AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1 // AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1 @@ -4116,12 +5286,28 @@ int fi6e(int *c, int *d) { // AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1 // AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1 -// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] monotonic, align 1, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] monotonic, align 1, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] // AMDGCN_CL_20-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1 // AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1 // AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1 // AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]] // +// NVPTX-LABEL: define hidden zeroext i1 @fi7a( +// NVPTX-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1 +// NVPTX-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1 +// NVPTX-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 +// NVPTX-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8 +// NVPTX-NEXT: store i8 1, ptr [[DOTATOMICTMP]], align 1 +// NVPTX-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP]], align 1 +// NVPTX-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] monotonic, align 1 +// NVPTX-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP]], align 1 +// NVPTX-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP]], align 1 +// NVPTX-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1 +// NVPTX-NEXT: ret i1 [[LOADEDV]] +// // SPIRV-LABEL: define hidden spir_func zeroext i1 @fi7a( // SPIRV-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] { // SPIRV-NEXT: [[ENTRY:.*:]] @@ -4156,7 +5342,7 @@ _Bool fi7a(_Bool *c) { // AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1 // AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1 -// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("agent") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("agent") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] // AMDGCN_CL_DEF-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1 // AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1 // AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1 @@ -4175,12 +5361,28 @@ _Bool fi7a(_Bool *c) { // AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1 // AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1 -// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("agent") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("agent") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] // AMDGCN_CL_20-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1 // AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1 // AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1 // AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]] // +// NVPTX-LABEL: define hidden zeroext i1 @fi7b( +// NVPTX-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1 +// NVPTX-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1 +// NVPTX-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 +// NVPTX-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8 +// NVPTX-NEXT: store i8 1, ptr [[DOTATOMICTMP]], align 1 +// NVPTX-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP]], align 1 +// NVPTX-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("device") monotonic, align 1 +// NVPTX-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP]], align 1 +// NVPTX-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP]], align 1 +// NVPTX-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1 +// NVPTX-NEXT: ret i1 [[LOADEDV]] +// // SPIRV-LABEL: define hidden spir_func zeroext i1 @fi7b( // SPIRV-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] { // SPIRV-NEXT: [[ENTRY:.*:]] @@ -4215,7 +5417,7 @@ _Bool fi7b(_Bool *c) { // AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1 // AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1 -// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("workgroup") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("workgroup") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] // AMDGCN_CL_DEF-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1 // AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1 // AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1 @@ -4234,12 +5436,28 @@ _Bool fi7b(_Bool *c) { // AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1 // AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1 -// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("workgroup") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("workgroup") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] // AMDGCN_CL_20-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1 // AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1 // AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1 // AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]] // +// NVPTX-LABEL: define hidden zeroext i1 @fi7c( +// NVPTX-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1 +// NVPTX-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1 +// NVPTX-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 +// NVPTX-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8 +// NVPTX-NEXT: store i8 1, ptr [[DOTATOMICTMP]], align 1 +// NVPTX-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP]], align 1 +// NVPTX-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("block") monotonic, align 1 +// NVPTX-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP]], align 1 +// NVPTX-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP]], align 1 +// NVPTX-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1 +// NVPTX-NEXT: ret i1 [[LOADEDV]] +// // SPIRV-LABEL: define hidden spir_func zeroext i1 @fi7c( // SPIRV-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] { // SPIRV-NEXT: [[ENTRY:.*:]] @@ -4274,7 +5492,7 @@ _Bool fi7c(_Bool *c) { // AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1 // AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1 -// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("cluster") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("cluster") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] // AMDGCN_CL_DEF-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1 // AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1 // AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1 @@ -4293,12 +5511,28 @@ _Bool fi7c(_Bool *c) { // AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1 // AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1 -// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("cluster") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("cluster") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] // AMDGCN_CL_20-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1 // AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1 // AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1 // AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]] // +// NVPTX-LABEL: define hidden zeroext i1 @fi7_clustr( +// NVPTX-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1 +// NVPTX-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1 +// NVPTX-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 +// NVPTX-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8 +// NVPTX-NEXT: store i8 1, ptr [[DOTATOMICTMP]], align 1 +// NVPTX-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP]], align 1 +// NVPTX-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("cluster") monotonic, align 1 +// NVPTX-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP]], align 1 +// NVPTX-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP]], align 1 +// NVPTX-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1 +// NVPTX-NEXT: ret i1 [[LOADEDV]] +// // SPIRV-LABEL: define hidden spir_func zeroext i1 @fi7_clustr( // SPIRV-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] { // SPIRV-NEXT: [[ENTRY:.*:]] @@ -4333,7 +5567,7 @@ _Bool fi7_clustr(_Bool *c) { // AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1 // AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1 -// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("wavefront") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("wavefront") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] // AMDGCN_CL_DEF-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1 // AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1 // AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1 @@ -4352,12 +5586,28 @@ _Bool fi7_clustr(_Bool *c) { // AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1 // AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1 -// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("wavefront") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("wavefront") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] // AMDGCN_CL_20-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1 // AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1 // AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1 // AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]] // +// NVPTX-LABEL: define hidden zeroext i1 @fi7d( +// NVPTX-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1 +// NVPTX-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1 +// NVPTX-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 +// NVPTX-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8 +// NVPTX-NEXT: store i8 1, ptr [[DOTATOMICTMP]], align 1 +// NVPTX-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP]], align 1 +// NVPTX-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("block") monotonic, align 1 +// NVPTX-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP]], align 1 +// NVPTX-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP]], align 1 +// NVPTX-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1 +// NVPTX-NEXT: ret i1 [[LOADEDV]] +// // SPIRV-LABEL: define hidden spir_func zeroext i1 @fi7d( // SPIRV-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] { // SPIRV-NEXT: [[ENTRY:.*:]] @@ -4392,7 +5642,7 @@ _Bool fi7d(_Bool *c) { // AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1 // AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1 -// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("singlethread") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("singlethread") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] // AMDGCN_CL_DEF-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1 // AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1 // AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1 @@ -4411,12 +5661,28 @@ _Bool fi7d(_Bool *c) { // AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1 // AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1 -// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("singlethread") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("singlethread") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] // AMDGCN_CL_20-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1 // AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1 // AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1 // AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]] // +// NVPTX-LABEL: define hidden zeroext i1 @fi7e( +// NVPTX-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1 +// NVPTX-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1 +// NVPTX-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 +// NVPTX-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8 +// NVPTX-NEXT: store i8 1, ptr [[DOTATOMICTMP]], align 1 +// NVPTX-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP]], align 1 +// NVPTX-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("singlethread") monotonic, align 1 +// NVPTX-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP]], align 1 +// NVPTX-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP]], align 1 +// NVPTX-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1 +// NVPTX-NEXT: ret i1 [[LOADEDV]] +// // SPIRV-LABEL: define hidden spir_func zeroext i1 @fi7e( // SPIRV-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] { // SPIRV-NEXT: [[ENTRY:.*:]] @@ -4458,7 +5724,7 @@ _Bool fi7e(_Bool *c) { // AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 -1, ptr [[DOTATOMICTMP_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw uinc_wrap ptr [[TMP0]], i32 [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw uinc_wrap ptr [[TMP0]], i32 [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] // AMDGCN_CL_DEF-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 @@ -4466,7 +5732,7 @@ _Bool fi7e(_Bool *c) { // AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 // AMDGCN_CL_DEF-NEXT: store i32 -1, ptr [[DOTATOMICTMP1_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4 -// AMDGCN_CL_DEF-NEXT: [[TMP7:%.*]] = atomicrmw udec_wrap ptr [[TMP5]], i32 [[TMP6]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: [[TMP7:%.*]] = atomicrmw udec_wrap ptr [[TMP5]], i32 [[TMP6]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META2]], !amdgpu.no.remote.memory [[META2]] // AMDGCN_CL_DEF-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4 // AMDGCN_CL_DEF-NEXT: [[TMP9:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 @@ -4493,7 +5759,7 @@ _Bool fi7e(_Bool *c) { // AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i32 -1, ptr [[DOTATOMICTMP_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw uinc_wrap ptr [[TMP0]], i32 [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw uinc_wrap ptr [[TMP0]], i32 [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] // AMDGCN_CL_20-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 @@ -4501,13 +5767,42 @@ _Bool fi7e(_Bool *c) { // AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i32 -1, ptr [[DOTATOMICTMP1_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4 -// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = atomicrmw udec_wrap ptr [[TMP5]], i32 [[TMP6]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = atomicrmw udec_wrap ptr [[TMP5]], i32 [[TMP6]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] // AMDGCN_CL_20-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4 // AMDGCN_CL_20-NEXT: [[TMP9:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 // AMDGCN_CL_20-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4 // AMDGCN_CL_20-NEXT: ret void // +// NVPTX-LABEL: define hidden void @fi8a( +// NVPTX-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]]) #[[ATTR0]] { +// NVPTX-NEXT: [[ENTRY:.*:]] +// NVPTX-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 +// NVPTX-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4 +// NVPTX-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8 +// NVPTX-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 +// NVPTX-NEXT: [[TMP0:%.*]] = load ptr, ptr [[B_ADDR]], align 8 +// NVPTX-NEXT: store i32 -1, ptr [[DOTATOMICTMP]], align 4 +// NVPTX-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4 +// NVPTX-NEXT: [[TMP2:%.*]] = atomicrmw uinc_wrap ptr [[TMP0]], i32 [[TMP1]] syncscope("device") monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP]], align 4 +// NVPTX-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4 +// NVPTX-NEXT: [[TMP4:%.*]] = load ptr, ptr [[B_ADDR]], align 8 +// NVPTX-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4 +// NVPTX-NEXT: [[TMP5:%.*]] = load ptr, ptr [[A_ADDR]], align 8 +// NVPTX-NEXT: store i32 -1, ptr [[DOTATOMICTMP1]], align 4 +// NVPTX-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1]], align 4 +// NVPTX-NEXT: [[TMP7:%.*]] = atomicrmw udec_wrap ptr [[TMP5]], i32 [[TMP6]] syncscope("device") monotonic, align 4 +// NVPTX-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2]], align 4 +// NVPTX-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2]], align 4 +// NVPTX-NEXT: [[TMP9:%.*]] = load ptr, ptr [[A_ADDR]], align 8 +// NVPTX-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4 +// NVPTX-NEXT: ret void +// // SPIRV-LABEL: define hidden spir_func void @fi8a( // SPIRV-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]]) #[[ATTR0]] { // SPIRV-NEXT: [[ENTRY:.*:]] @@ -4541,9 +5836,3 @@ void fi8a(unsigned int *a, unsigned int *b) { *b = __scoped_atomic_fetch_uinc(b, ~0U, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE); *a = __scoped_atomic_fetch_udec(a, ~0U, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE); } - -//. -// AMDGCN_CL_DEF: [[META3]] = !{} -//. -// AMDGCN_CL_20: [[META4]] = !{} -//.