clang/AMDGPU: Emit atomicrmw from ds_fadd builtins (#95395)
We should have done this for the f32/f64 case a long time ago. Now that codegen handles atomicrmw selection for the v2f16/v2bf16 case, start emitting it instead. This also does upgrade the behavior to respect a volatile qualified pointer, which was previously ignored (for the cases that don't have an explicit volatile argument).
This commit is contained in:
parent
f80bd9b8a8
commit
76894c5e6e
@ -18140,9 +18140,35 @@ void CodeGenFunction::ProcessOrderScopeAMDGCN(Value *Order, Value *Scope,
|
||||
break;
|
||||
}
|
||||
|
||||
// Some of the atomic builtins take the scope as a string name.
|
||||
StringRef scp;
|
||||
llvm::getConstantStringInfo(Scope, scp);
|
||||
SSID = getLLVMContext().getOrInsertSyncScopeID(scp);
|
||||
if (llvm::getConstantStringInfo(Scope, scp)) {
|
||||
SSID = getLLVMContext().getOrInsertSyncScopeID(scp);
|
||||
return;
|
||||
}
|
||||
|
||||
// Older builtins had an enum argument for the memory scope.
|
||||
int scope = cast<llvm::ConstantInt>(Scope)->getZExtValue();
|
||||
switch (scope) {
|
||||
case 0: // __MEMORY_SCOPE_SYSTEM
|
||||
SSID = llvm::SyncScope::System;
|
||||
break;
|
||||
case 1: // __MEMORY_SCOPE_DEVICE
|
||||
SSID = getLLVMContext().getOrInsertSyncScopeID("agent");
|
||||
break;
|
||||
case 2: // __MEMORY_SCOPE_WRKGRP
|
||||
SSID = getLLVMContext().getOrInsertSyncScopeID("workgroup");
|
||||
break;
|
||||
case 3: // __MEMORY_SCOPE_WVFRNT
|
||||
SSID = getLLVMContext().getOrInsertSyncScopeID("wavefront");
|
||||
break;
|
||||
case 4: // __MEMORY_SCOPE_SINGLE
|
||||
SSID = llvm::SyncScope::SingleThread;
|
||||
break;
|
||||
default:
|
||||
SSID = llvm::SyncScope::System;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
llvm::Value *CodeGenFunction::EmitScalarOrConstFoldImmArg(unsigned ICEArguments,
|
||||
@ -18558,14 +18584,10 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
|
||||
Function *F = CGM.getIntrinsic(Intrin, { Src0->getType() });
|
||||
return Builder.CreateCall(F, { Src0, Builder.getFalse() });
|
||||
}
|
||||
case AMDGPU::BI__builtin_amdgcn_ds_faddf:
|
||||
case AMDGPU::BI__builtin_amdgcn_ds_fminf:
|
||||
case AMDGPU::BI__builtin_amdgcn_ds_fmaxf: {
|
||||
Intrinsic::ID Intrin;
|
||||
switch (BuiltinID) {
|
||||
case AMDGPU::BI__builtin_amdgcn_ds_faddf:
|
||||
Intrin = Intrinsic::amdgcn_ds_fadd;
|
||||
break;
|
||||
case AMDGPU::BI__builtin_amdgcn_ds_fminf:
|
||||
Intrin = Intrinsic::amdgcn_ds_fmin;
|
||||
break;
|
||||
@ -18656,35 +18678,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
|
||||
llvm::Function *F = CGM.getIntrinsic(IID, {Addr->getType()});
|
||||
return Builder.CreateCall(F, {Addr, Val});
|
||||
}
|
||||
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
|
||||
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
|
||||
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16: {
|
||||
Intrinsic::ID IID;
|
||||
llvm::Type *ArgTy;
|
||||
switch (BuiltinID) {
|
||||
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
|
||||
ArgTy = llvm::Type::getFloatTy(getLLVMContext());
|
||||
IID = Intrinsic::amdgcn_ds_fadd;
|
||||
break;
|
||||
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
|
||||
ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
|
||||
IID = Intrinsic::amdgcn_ds_fadd;
|
||||
break;
|
||||
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
|
||||
ArgTy = llvm::FixedVectorType::get(
|
||||
llvm::Type::getHalfTy(getLLVMContext()), 2);
|
||||
IID = Intrinsic::amdgcn_ds_fadd;
|
||||
break;
|
||||
}
|
||||
llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
|
||||
llvm::Value *Val = EmitScalarExpr(E->getArg(1));
|
||||
llvm::Constant *ZeroI32 = llvm::ConstantInt::getIntegerValue(
|
||||
llvm::Type::getInt32Ty(getLLVMContext()), APInt(32, 0, true));
|
||||
llvm::Constant *ZeroI1 = llvm::ConstantInt::getIntegerValue(
|
||||
llvm::Type::getInt1Ty(getLLVMContext()), APInt(1, 0));
|
||||
llvm::Function *F = CGM.getIntrinsic(IID, {ArgTy});
|
||||
return Builder.CreateCall(F, {Addr, Val, ZeroI32, ZeroI32, ZeroI1});
|
||||
}
|
||||
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
|
||||
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
|
||||
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
|
||||
@ -19044,7 +19037,12 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
|
||||
case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
|
||||
case AMDGPU::BI__builtin_amdgcn_atomic_inc64:
|
||||
case AMDGPU::BI__builtin_amdgcn_atomic_dec32:
|
||||
case AMDGPU::BI__builtin_amdgcn_atomic_dec64: {
|
||||
case AMDGPU::BI__builtin_amdgcn_atomic_dec64:
|
||||
case AMDGPU::BI__builtin_amdgcn_ds_faddf:
|
||||
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
|
||||
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
|
||||
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
|
||||
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16: {
|
||||
llvm::AtomicRMWInst::BinOp BinOp;
|
||||
switch (BuiltinID) {
|
||||
case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
|
||||
@ -19055,23 +19053,54 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
|
||||
case AMDGPU::BI__builtin_amdgcn_atomic_dec64:
|
||||
BinOp = llvm::AtomicRMWInst::UDecWrap;
|
||||
break;
|
||||
case AMDGPU::BI__builtin_amdgcn_ds_faddf:
|
||||
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
|
||||
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
|
||||
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
|
||||
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
|
||||
BinOp = llvm::AtomicRMWInst::FAdd;
|
||||
break;
|
||||
}
|
||||
|
||||
Address Ptr = CheckAtomicAlignment(*this, E);
|
||||
Value *Val = EmitScalarExpr(E->getArg(1));
|
||||
|
||||
ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(2)),
|
||||
EmitScalarExpr(E->getArg(3)), AO, SSID);
|
||||
|
||||
llvm::Type *OrigTy = Val->getType();
|
||||
QualType PtrTy = E->getArg(0)->IgnoreImpCasts()->getType();
|
||||
bool Volatile =
|
||||
PtrTy->castAs<PointerType>()->getPointeeType().isVolatileQualified();
|
||||
|
||||
bool Volatile;
|
||||
|
||||
if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_faddf) {
|
||||
// __builtin_amdgcn_ds_faddf has an explicit volatile argument
|
||||
Volatile =
|
||||
cast<ConstantInt>(EmitScalarExpr(E->getArg(4)))->getZExtValue();
|
||||
} else {
|
||||
// Infer volatile from the passed type.
|
||||
Volatile =
|
||||
PtrTy->castAs<PointerType>()->getPointeeType().isVolatileQualified();
|
||||
}
|
||||
|
||||
if (E->getNumArgs() >= 4) {
|
||||
// Some of the builtins have explicit ordering and scope arguments.
|
||||
ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(2)),
|
||||
EmitScalarExpr(E->getArg(3)), AO, SSID);
|
||||
} else {
|
||||
// The ds_fadd_* builtins do not have syncscope/order arguments.
|
||||
SSID = llvm::SyncScope::System;
|
||||
AO = AtomicOrdering::SequentiallyConsistent;
|
||||
|
||||
// The v2bf16 builtin uses i16 instead of a natural bfloat type.
|
||||
if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16) {
|
||||
llvm::Type *V2BF16Ty = FixedVectorType::get(
|
||||
llvm::Type::getBFloatTy(Builder.getContext()), 2);
|
||||
Val = Builder.CreateBitCast(Val, V2BF16Ty);
|
||||
}
|
||||
}
|
||||
|
||||
llvm::AtomicRMWInst *RMW =
|
||||
Builder.CreateAtomicRMW(BinOp, Ptr, Val, AO, SSID);
|
||||
if (Volatile)
|
||||
RMW->setVolatile(true);
|
||||
return RMW;
|
||||
return Builder.CreateBitCast(RMW, OrigTy);
|
||||
}
|
||||
case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtn:
|
||||
case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtnl: {
|
||||
|
||||
@ -115,7 +115,7 @@ __global__
|
||||
// CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X]] to ptr
|
||||
// CHECK-NEXT: store float [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4
|
||||
// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[SRC_ADDR_ASCAST]], align 4
|
||||
// CHECK-NEXT: [[TMP1:%.*]] = call contract float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3) @_ZZ12test_ds_faddfE6shared, float [[TMP0]], i32 0, i32 0, i1 false)
|
||||
// CHECK-NEXT: [[TMP1:%.*]] = atomicrmw fadd ptr addrspace(3) @_ZZ12test_ds_faddfE6shared, float [[TMP0]] monotonic, align 4
|
||||
// CHECK-NEXT: store volatile float [[TMP1]], ptr [[X_ASCAST]], align 4
|
||||
// CHECK-NEXT: ret void
|
||||
//
|
||||
|
||||
@ -112,7 +112,7 @@ __global__
|
||||
// CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4)
|
||||
// CHECK-NEXT: store float [[SRC:%.*]], ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
|
||||
// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
|
||||
// CHECK-NEXT: [[TMP1:%.*]] = call contract addrspace(4) float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3) @_ZZ12test_ds_faddfE6shared, float [[TMP0]], i32 0, i32 0, i1 false)
|
||||
// CHECK-NEXT: [[TMP1:%.*]] = atomicrmw fadd ptr addrspace(3) @_ZZ12test_ds_faddfE6shared, float [[TMP0]] monotonic, align 4
|
||||
// CHECK-NEXT: store volatile float [[TMP1]], ptr addrspace(4) [[X_ASCAST]], align 4
|
||||
// CHECK-NEXT: ret void
|
||||
//
|
||||
|
||||
@ -11,9 +11,10 @@ typedef __attribute__((address_space(3))) float *LP;
|
||||
// CHECK: store ptr %addr, ptr %[[ADDR_ADDR_ASCAST_PTR]], align 8
|
||||
// CHECK: %[[ADDR_ADDR_ASCAST:.*]] = load ptr, ptr %[[ADDR_ADDR_ASCAST_PTR]], align 8
|
||||
// CHECK: %[[AS_CAST:.*]] = addrspacecast ptr %[[ADDR_ADDR_ASCAST]] to ptr addrspace(3)
|
||||
// CHECK: %3 = call contract float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3) %[[AS_CAST]]
|
||||
// CHECK: [[TMP2:%.+]] = load float, ptr %val.addr.ascast, align 4
|
||||
// CHECK: [[TMP3:%.+]] = atomicrmw fadd ptr addrspace(3) %[[AS_CAST]], float [[TMP2]] monotonic, align 4
|
||||
// CHECK: %4 = load ptr, ptr %rtn.ascast, align 8
|
||||
// CHECK: store float %3, ptr %4, align 4
|
||||
// CHECK: store float [[TMP3]], ptr %4, align 4
|
||||
__device__ void test_ds_atomic_add_f32(float *addr, float val) {
|
||||
float *rtn;
|
||||
*rtn = __builtin_amdgcn_ds_faddf((LP)addr, val, 0, 0, 0);
|
||||
|
||||
@ -20,7 +20,7 @@ typedef __attribute__((address_space(3))) float *LP;
|
||||
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[ADDR_ADDR_ASCAST]], align 8
|
||||
// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr addrspace(3)
|
||||
// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(4) [[VAL_ADDR_ASCAST]], align 4
|
||||
// CHECK-NEXT: [[TMP3:%.*]] = call contract addrspace(4) float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3) [[TMP1]], float [[TMP2]], i32 0, i32 0, i1 false)
|
||||
// CHECK-NEXT: [[TMP3:%.*]] = atomicrmw fadd ptr addrspace(3) [[TMP1]], float [[TMP2]] monotonic, align 4
|
||||
// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[RTN_ASCAST]], align 8
|
||||
// CHECK-NEXT: store float [[TMP3]], ptr addrspace(4) [[TMP4]], align 4
|
||||
// CHECK-NEXT: ret void
|
||||
|
||||
@ -117,13 +117,44 @@ void test_update_dpp(global int* out, int arg1, int arg2)
|
||||
}
|
||||
|
||||
// CHECK-LABEL: @test_ds_fadd
|
||||
// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3) %out, float %src, i32 0, i32 0, i1 false)
|
||||
// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src monotonic, align 4{{$}}
|
||||
// CHECK: atomicrmw volatile fadd ptr addrspace(3) %out, float %src monotonic, align 4{{$}}
|
||||
|
||||
// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src acquire, align 4{{$}}
|
||||
// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src acquire, align 4{{$}}
|
||||
// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src release, align 4{{$}}
|
||||
// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src acq_rel, align 4{{$}}
|
||||
// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src seq_cst, align 4{{$}}
|
||||
// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src seq_cst, align 4{{$}}
|
||||
|
||||
// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("agent") monotonic, align 4{{$}}
|
||||
// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("workgroup") monotonic, align 4{{$}}
|
||||
// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("wavefront") monotonic, align 4{{$}}
|
||||
// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("singlethread") monotonic, align 4{{$}}
|
||||
// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src monotonic, align 4{{$}}
|
||||
#if !defined(__SPIRV__)
|
||||
void test_ds_faddf(local float *out, float src) {
|
||||
#else
|
||||
void test_ds_faddf(__attribute__((address_space(3))) float *out, float src) {
|
||||
void test_ds_faddf(__attribute__((address_space(3))) float *out, float src) {
|
||||
#endif
|
||||
*out = __builtin_amdgcn_ds_faddf(out, src, 0, 0, false);
|
||||
|
||||
*out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM, false);
|
||||
*out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM, true);
|
||||
|
||||
// Test all orders.
|
||||
*out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_CONSUME, __MEMORY_SCOPE_SYSTEM, false);
|
||||
*out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_ACQUIRE, __MEMORY_SCOPE_SYSTEM, false);
|
||||
*out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_RELEASE, __MEMORY_SCOPE_SYSTEM, false);
|
||||
*out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_ACQ_REL, __MEMORY_SCOPE_SYSTEM, false);
|
||||
*out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_SEQ_CST, __MEMORY_SCOPE_SYSTEM, false);
|
||||
*out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_SEQ_CST, __MEMORY_SCOPE_SYSTEM, false); // invalid
|
||||
|
||||
// Test all syncscopes.
|
||||
*out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE, false);
|
||||
*out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_WRKGRP, false);
|
||||
*out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_WVFRNT, false);
|
||||
*out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_SINGLE, false);
|
||||
*out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_RELAXED, 5, false); // invalid
|
||||
}
|
||||
|
||||
// CHECK-LABEL: @test_ds_fmin
|
||||
|
||||
@ -10,7 +10,10 @@ typedef half __attribute__((ext_vector_type(2))) half2;
|
||||
typedef short __attribute__((ext_vector_type(2))) short2;
|
||||
|
||||
// CHECK-LABEL: test_local_add_2bf16
|
||||
// CHECK: call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %{{.*}}, <2 x i16> %
|
||||
// CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
|
||||
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] seq_cst, align 4
|
||||
// CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>
|
||||
|
||||
// GFX12-LABEL: test_local_add_2bf16
|
||||
// GFX12: ds_pk_add_rtn_bf16
|
||||
short2 test_local_add_2bf16(__local short2 *addr, short2 x) {
|
||||
@ -18,7 +21,10 @@ short2 test_local_add_2bf16(__local short2 *addr, short2 x) {
|
||||
}
|
||||
|
||||
// CHECK-LABEL: test_local_add_2bf16_noret
|
||||
// CHECK: call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %{{.*}}, <2 x i16> %
|
||||
// CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
|
||||
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] seq_cst, align 4
|
||||
// CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>
|
||||
|
||||
// GFX12-LABEL: test_local_add_2bf16_noret
|
||||
// GFX12: ds_pk_add_bf16
|
||||
void test_local_add_2bf16_noret(__local short2 *addr, short2 x) {
|
||||
@ -26,7 +32,7 @@ void test_local_add_2bf16_noret(__local short2 *addr, short2 x) {
|
||||
}
|
||||
|
||||
// CHECK-LABEL: test_local_add_2f16
|
||||
// CHECK: call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %{{.*}}, <2 x half> %
|
||||
// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x half> %{{.+}} seq_cst, align 4
|
||||
// GFX12-LABEL: test_local_add_2f16
|
||||
// GFX12: ds_pk_add_rtn_f16
|
||||
half2 test_local_add_2f16(__local half2 *addr, half2 x) {
|
||||
@ -34,7 +40,7 @@ half2 test_local_add_2f16(__local half2 *addr, half2 x) {
|
||||
}
|
||||
|
||||
// CHECK-LABEL: test_local_add_2f16_noret
|
||||
// CHECK: call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %{{.*}}, <2 x half> %
|
||||
// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x half> %{{.+}} seq_cst, align 4
|
||||
// GFX12-LABEL: test_local_add_2f16_noret
|
||||
// GFX12: ds_pk_add_f16
|
||||
void test_local_add_2f16_noret(__local half2 *addr, half2 x) {
|
||||
|
||||
@ -6,7 +6,7 @@
|
||||
// REQUIRES: amdgpu-registered-target
|
||||
|
||||
// CHECK-LABEL: test_fadd_local
|
||||
// CHECK: call float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3) %{{.*}}, float %{{.*}}, i32 0, i32 0, i1 false)
|
||||
// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, float %{{.+}} seq_cst, align 4
|
||||
// GFX8-LABEL: test_fadd_local$local:
|
||||
// GFX8: ds_add_rtn_f32 v2, v0, v1
|
||||
// GFX8: s_endpgm
|
||||
@ -14,3 +14,10 @@ kernel void test_fadd_local(__local float *ptr, float val){
|
||||
float *res;
|
||||
*res = __builtin_amdgcn_ds_atomic_fadd_f32(ptr, val);
|
||||
}
|
||||
|
||||
// CHECK-LABEL: test_fadd_local_volatile
|
||||
// CHECK: = atomicrmw volatile fadd ptr addrspace(3) %{{.+}}, float %{{.+}} seq_cst, align 4
|
||||
kernel void test_fadd_local_volatile(volatile __local float *ptr, float val){
|
||||
volatile float *res;
|
||||
*res = __builtin_amdgcn_ds_atomic_fadd_f32(ptr, val);
|
||||
}
|
||||
|
||||
@ -99,7 +99,7 @@ void test_flat_global_max_f64(__global double *addr, double x){
|
||||
}
|
||||
|
||||
// CHECK-LABEL: test_ds_add_local_f64
|
||||
// CHECK: call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %{{.*}}, double %{{.*}},
|
||||
// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, double %{{.+}} seq_cst, align 8
|
||||
// GFX90A: test_ds_add_local_f64$local
|
||||
// GFX90A: ds_add_rtn_f64
|
||||
void test_ds_add_local_f64(__local double *addr, double x){
|
||||
@ -108,7 +108,7 @@ void test_ds_add_local_f64(__local double *addr, double x){
|
||||
}
|
||||
|
||||
// CHECK-LABEL: test_ds_addf_local_f32
|
||||
// CHECK: call float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3) %{{.*}}, float %{{.*}},
|
||||
// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, float %{{.+}} seq_cst, align 4
|
||||
// GFX90A-LABEL: test_ds_addf_local_f32$local
|
||||
// GFX90A: ds_add_rtn_f32
|
||||
void test_ds_addf_local_f32(__local float *addr, float x){
|
||||
|
||||
@ -42,7 +42,11 @@ short2 test_global_add_2bf16(__global short2 *addr, short2 x) {
|
||||
}
|
||||
|
||||
// CHECK-LABEL: test_local_add_2bf16
|
||||
// CHECK: call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %{{.*}}, <2 x i16> %
|
||||
|
||||
// CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
|
||||
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] seq_cst, align 4
|
||||
// CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>
|
||||
|
||||
// GFX940-LABEL: test_local_add_2bf16
|
||||
// GFX940: ds_pk_add_rtn_bf16
|
||||
short2 test_local_add_2bf16(__local short2 *addr, short2 x) {
|
||||
@ -50,7 +54,7 @@ short2 test_local_add_2bf16(__local short2 *addr, short2 x) {
|
||||
}
|
||||
|
||||
// CHECK-LABEL: test_local_add_2f16
|
||||
// CHECK: call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %{{.*}}, <2 x half> %
|
||||
// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x half> %{{.+}} seq_cst, align 4
|
||||
// GFX940-LABEL: test_local_add_2f16
|
||||
// GFX940: ds_pk_add_rtn_f16
|
||||
half2 test_local_add_2f16(__local half2 *addr, half2 x) {
|
||||
@ -58,7 +62,7 @@ half2 test_local_add_2f16(__local half2 *addr, half2 x) {
|
||||
}
|
||||
|
||||
// CHECK-LABEL: test_local_add_2f16_noret
|
||||
// CHECK: call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %{{.*}}, <2 x half> %
|
||||
// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x half> %{{.+}} seq_cst, align 4
|
||||
// GFX940-LABEL: test_local_add_2f16_noret
|
||||
// GFX940: ds_pk_add_f16
|
||||
void test_local_add_2f16_noret(__local half2 *addr, half2 x) {
|
||||
|
||||
@ -2933,8 +2933,7 @@ def int_amdgcn_flat_atomic_fadd_v2bf16 : AMDGPUAtomicRtn<llvm_v2i16_ty>;
|
||||
def int_amdgcn_ds_fadd_v2bf16 : DefaultAttrsIntrinsic<
|
||||
[llvm_v2i16_ty],
|
||||
[LLVMQualPointerType<3>, llvm_v2i16_ty],
|
||||
[IntrArgMemOnly, NoCapture<ArgIndex<0>>]>,
|
||||
ClangBuiltin<"__builtin_amdgcn_ds_atomic_fadd_v2bf16">;
|
||||
[IntrArgMemOnly, NoCapture<ArgIndex<0>>]>;
|
||||
|
||||
defset list<Intrinsic> AMDGPUMFMAIntrinsics940 = {
|
||||
def int_amdgcn_mfma_i32_16x16x32_i8 : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_i64_ty>;
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user