; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -o - -mcpu=sm_90 -mattr=+ptx78 | FileCheck %s
; RUN: %if ptxas-12.8 %{ llc < %s -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %}

target triple = "nvptx64-nvidia-cuda"

declare ptr addrspace(7) @llvm.nvvm.mapa.shared.cluster(ptr addrspace(3), i32)
declare i1 @llvm.nvvm.isspacep.shared.cluster(ptr)
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
declare ptr @llvm.nvvm.mapa(ptr, i32)

; Common setup for distributed shared memory cluster addressing
define i32 @test_distributed_shared_cluster_common(ptr %ptr, ptr addrspace(3) %smem_ptr) local_unnamed_addr {
; CHECK-LABEL: test_distributed_shared_cluster_common(
; CHECK:       {
; CHECK-NEXT:    .reg .pred %p<3>;
; CHECK-NEXT:    .reg .b32 %r<8>;
; CHECK-NEXT:    .reg .b64 %rd<5>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0: // %entry
; CHECK-NEXT:    ld.param.b64 %rd1, [test_distributed_shared_cluster_common_param_0];
; CHECK-NEXT:    ld.param.b64 %rd2, [test_distributed_shared_cluster_common_param_1];
; CHECK-NEXT:    mov.u32 %r1, %ctaid.x;
; CHECK-NEXT:    xor.b32 %r2, %r1, 1;
; CHECK-NEXT:    isspacep.shared::cluster %p1, %rd1;
; CHECK-NEXT:    mapa.u64 %rd3, %rd1, %r2;
; CHECK-NEXT:    isspacep.shared::cluster %p2, %rd3;
; CHECK-NEXT:    mapa.shared::cluster.u64 %rd4, %rd2, %r2;
; CHECK-NEXT:    ld.shared::cluster.b32 %r3, [%rd4];
; CHECK-NEXT:    add.s32 %r4, %r3, 42;
; CHECK-NEXT:    st.shared::cluster.b32 [%rd4], %r4;
; CHECK-NEXT:    selp.b32 %r5, 1, 0, %p1;
; CHECK-NEXT:    selp.b32 %r6, 1, 0, %p2;
; CHECK-NEXT:    add.s32 %r7, %r5, %r6;
; CHECK-NEXT:    st.param.b32 [func_retval0], %r7;
; CHECK-NEXT:    ret;
entry:
  %0 = tail call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
  %1 = xor i32 %0, 1
  %2 = tail call i1 @llvm.nvvm.isspacep.shared.cluster(ptr %ptr)
  %3 = tail call ptr @llvm.nvvm.mapa(ptr %ptr, i32 %1)
  %4 = tail call i1 @llvm.nvvm.isspacep.shared.cluster(ptr %3)
  %dsmem_ptr = call ptr addrspace(7) @llvm.nvvm.mapa.shared.cluster(ptr addrspace(3) %smem_ptr, i32 %1)

  ; Add load and store to the distributed shared memory cluster
  %loaded_val = load i32, ptr addrspace(7) %dsmem_ptr
  %updated_val = add i32 %loaded_val, 42
  store i32 %updated_val, ptr addrspace(7) %dsmem_ptr

  ; Return value preserves the isspacep test results plus the value operation
  %5 = zext i1 %2 to i32
  %6 = zext i1 %4 to i32
  %ret = add i32 %5, %6
  ret i32 %ret
}

; Floating point atomic operations tests
define void @test_distributed_shared_cluster_float_atomic(ptr addrspace(7) %dsmem_ptr) local_unnamed_addr {
; CHECK-LABEL: test_distributed_shared_cluster_float_atomic(
; CHECK:       {
; CHECK-NEXT:    .reg .b16 %rs<5>;
; CHECK-NEXT:    .reg .b32 %r<2>;
; CHECK-NEXT:    .reg .b64 %rd<3>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0: // %entry
; CHECK-NEXT:    ld.param.b64 %rd1, [test_distributed_shared_cluster_float_atomic_param_0];
; CHECK-NEXT:    mov.b16 %rs1, 0x3C00;
; CHECK-NEXT:    atom.shared::cluster.add.noftz.f16 %rs2, [%rd1], %rs1;
; CHECK-NEXT:    mov.b16 %rs3, 0x3F80;
; CHECK-NEXT:    atom.shared::cluster.add.noftz.bf16 %rs4, [%rd1], %rs3;
; CHECK-NEXT:    atom.shared::cluster.add.f32 %r1, [%rd1], 0f3F800000;
; CHECK-NEXT:    atom.shared::cluster.add.f64 %rd2, [%rd1], 0d3FF0000000000000;
; CHECK-NEXT:    ret;
entry:
  ; Floating point atomic operations
  %0 = atomicrmw fadd ptr addrspace(7) %dsmem_ptr, half 1.000000e+00 seq_cst
  %1 = atomicrmw fadd ptr addrspace(7) %dsmem_ptr, bfloat 1.000000e+00 seq_cst
  %2 = atomicrmw fadd ptr addrspace(7) %dsmem_ptr, float 1.000000e+00 seq_cst
  %3 = atomicrmw fadd ptr addrspace(7) %dsmem_ptr, double 1.000000e+00 seq_cst
  ret void
}
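; Note on the float atomic checks above: the immediates are simply the bit
; patterns of 1.0 in each type: 0x3C00 (f16), 0x3F80 (bf16), 0f3F800000 (f32)
; and 0d3FF0000000000000 (f64).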
; Integer atomic operations tests
define void @test_distributed_shared_cluster_int_atomic(ptr addrspace(7) %dsmem_ptr) local_unnamed_addr {
; CHECK-LABEL: test_distributed_shared_cluster_int_atomic(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<9>;
; CHECK-NEXT:    .reg .b64 %rd<8>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0: // %entry
; CHECK-NEXT:    ld.param.b64 %rd1, [test_distributed_shared_cluster_int_atomic_param_0];
; CHECK-NEXT:    atom.shared::cluster.add.u32 %r1, [%rd1], 1;
; CHECK-NEXT:    atom.shared::cluster.add.u64 %rd2, [%rd1], 1;
; CHECK-NEXT:    atom.shared::cluster.exch.b32 %r2, [%rd1], 1;
; CHECK-NEXT:    atom.shared::cluster.exch.b64 %rd3, [%rd1], 1;
; CHECK-NEXT:    atom.shared::cluster.min.s32 %r3, [%rd1], 1;
; CHECK-NEXT:    atom.shared::cluster.min.s64 %rd4, [%rd1], 1;
; CHECK-NEXT:    atom.shared::cluster.min.u32 %r4, [%rd1], 1;
; CHECK-NEXT:    atom.shared::cluster.min.u64 %rd5, [%rd1], 1;
; CHECK-NEXT:    atom.shared::cluster.max.s32 %r5, [%rd1], 1;
; CHECK-NEXT:    atom.shared::cluster.max.s64 %rd6, [%rd1], 1;
; CHECK-NEXT:    atom.shared::cluster.max.u32 %r6, [%rd1], 1;
; CHECK-NEXT:    atom.shared::cluster.max.u64 %rd7, [%rd1], 1;
; CHECK-NEXT:    atom.shared::cluster.inc.u32 %r7, [%rd1], 1;
; CHECK-NEXT:    atom.shared::cluster.dec.u32 %r8, [%rd1], 1;
; CHECK-NEXT:    ret;
entry:
  ; Integer add operations
  %0 = atomicrmw add ptr addrspace(7) %dsmem_ptr, i32 1 monotonic
  %1 = atomicrmw add ptr addrspace(7) %dsmem_ptr, i64 1 monotonic

  ; Exchange operations
  %2 = atomicrmw xchg ptr addrspace(7) %dsmem_ptr, i32 1 monotonic
  %3 = atomicrmw xchg ptr addrspace(7) %dsmem_ptr, i64 1 monotonic

  ; Min operations (signed and unsigned)
  %4 = atomicrmw min ptr addrspace(7) %dsmem_ptr, i32 1 monotonic
  %5 = atomicrmw min ptr addrspace(7) %dsmem_ptr, i64 1 monotonic
  %6 = atomicrmw umin ptr addrspace(7) %dsmem_ptr, i32 1 monotonic
  %7 = atomicrmw umin ptr addrspace(7) %dsmem_ptr, i64 1 monotonic

  ; Max operations (signed and unsigned)
  %8 = atomicrmw max ptr addrspace(7) %dsmem_ptr, i32 1 monotonic
  %9 = atomicrmw max ptr addrspace(7) %dsmem_ptr, i64 1 monotonic
  %10 = atomicrmw umax ptr addrspace(7) %dsmem_ptr, i32 1 monotonic
  %11 = atomicrmw umax ptr addrspace(7) %dsmem_ptr, i64 1 monotonic

  ; Inc/Dec operations (32-bit only)
  %12 = atomicrmw uinc_wrap ptr addrspace(7) %dsmem_ptr, i32 1 monotonic
  %13 = atomicrmw udec_wrap ptr addrspace(7) %dsmem_ptr, i32 1 monotonic
  ret void
}

; Bitwise atomic operations tests
define void @test_distributed_shared_cluster_bitwise_atomic(ptr addrspace(7) %dsmem_ptr) local_unnamed_addr {
; CHECK-LABEL: test_distributed_shared_cluster_bitwise_atomic(
; CHECK:       {
; CHECK-NEXT:    .reg .b32 %r<4>;
; CHECK-NEXT:    .reg .b64 %rd<5>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0: // %entry
; CHECK-NEXT:    ld.param.b64 %rd1, [test_distributed_shared_cluster_bitwise_atomic_param_0];
; CHECK-NEXT:    atom.shared::cluster.and.b32 %r1, [%rd1], 1;
; CHECK-NEXT:    atom.shared::cluster.and.b64 %rd2, [%rd1], 1;
; CHECK-NEXT:    atom.shared::cluster.or.b32 %r2, [%rd1], 1;
; CHECK-NEXT:    atom.shared::cluster.or.b64 %rd3, [%rd1], 1;
; CHECK-NEXT:    atom.shared::cluster.xor.b32 %r3, [%rd1], 1;
; CHECK-NEXT:    atom.shared::cluster.xor.b64 %rd4, [%rd1], 1;
; CHECK-NEXT:    ret;
entry:
  ; Bitwise operations
  %0 = atomicrmw and ptr addrspace(7) %dsmem_ptr, i32 1 monotonic
  %1 = atomicrmw and ptr addrspace(7) %dsmem_ptr, i64 1 monotonic
  %2 = atomicrmw or ptr addrspace(7) %dsmem_ptr, i32 1 monotonic
  %3 = atomicrmw or ptr addrspace(7) %dsmem_ptr, i64 1 monotonic
  %4 = atomicrmw xor ptr addrspace(7) %dsmem_ptr, i32 1 monotonic
  %5 = atomicrmw xor ptr addrspace(7) %dsmem_ptr, i64 1 monotonic
  ret void
}
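; Note on the cmpxchg test below: there is no native 16-bit shared::cluster CAS,
; so the i16 cases are expanded into the partword.cmpxchg loops visible in the
; checks; the containing 32-bit word is loaded, the 16-bit lane is masked in, and
; atom.cas.b32 is retried until it succeeds.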
; Compare-exchange operations tests
define void @test_distributed_shared_cluster_cmpxchg(ptr addrspace(7) %dsmem_ptr) local_unnamed_addr {
; CHECK-LABEL: test_distributed_shared_cluster_cmpxchg(
; CHECK:       {
; CHECK-NEXT:    .reg .pred %p<11>;
; CHECK-NEXT:    .reg .b32 %r<43>;
; CHECK-NEXT:    .reg .b64 %rd<12>;
; CHECK-EMPTY:
; CHECK-NEXT:  // %bb.0: // %entry
; CHECK-NEXT:    ld.param.b64 %rd2, [test_distributed_shared_cluster_cmpxchg_param_0];
; CHECK-NEXT:    atom.relaxed.sys.shared::cluster.cas.b32 %r14, [%rd2], 1, 0;
; CHECK-NEXT:    atom.acquire.sys.shared::cluster.cas.b32 %r15, [%rd2], 1, 0;
; CHECK-NEXT:    atom.acquire.sys.shared::cluster.cas.b32 %r16, [%rd2], 1, 0;
; CHECK-NEXT:    atom.release.sys.shared::cluster.cas.b32 %r17, [%rd2], 1, 0;
; CHECK-NEXT:    atom.acq_rel.sys.shared::cluster.cas.b32 %r18, [%rd2], 1, 0;
; CHECK-NEXT:    atom.acq_rel.sys.shared::cluster.cas.b32 %r19, [%rd2], 1, 0;
; CHECK-NEXT:    fence.sc.sys;
; CHECK-NEXT:    atom.acquire.sys.shared::cluster.cas.b32 %r20, [%rd2], 1, 0;
; CHECK-NEXT:    fence.sc.sys;
; CHECK-NEXT:    atom.acquire.sys.shared::cluster.cas.b32 %r21, [%rd2], 1, 0;
; CHECK-NEXT:    fence.sc.sys;
; CHECK-NEXT:    atom.acquire.sys.shared::cluster.cas.b32 %r22, [%rd2], 1, 0;
; CHECK-NEXT:    atom.relaxed.sys.shared::cluster.cas.b64 %rd3, [%rd2], 1, 0;
; CHECK-NEXT:    atom.acquire.sys.shared::cluster.cas.b64 %rd4, [%rd2], 1, 0;
; CHECK-NEXT:    atom.acquire.sys.shared::cluster.cas.b64 %rd5, [%rd2], 1, 0;
; CHECK-NEXT:    atom.release.sys.shared::cluster.cas.b64 %rd6, [%rd2], 1, 0;
; CHECK-NEXT:    atom.acq_rel.sys.shared::cluster.cas.b64 %rd7, [%rd2], 1, 0;
; CHECK-NEXT:    atom.acq_rel.sys.shared::cluster.cas.b64 %rd8, [%rd2], 1, 0;
; CHECK-NEXT:    fence.sc.sys;
; CHECK-NEXT:    atom.acquire.sys.shared::cluster.cas.b64 %rd9, [%rd2], 1, 0;
; CHECK-NEXT:    fence.sc.sys;
; CHECK-NEXT:    atom.acquire.sys.shared::cluster.cas.b64 %rd10, [%rd2], 1, 0;
; CHECK-NEXT:    fence.sc.sys;
; CHECK-NEXT:    atom.acquire.sys.shared::cluster.cas.b64 %rd11, [%rd2], 1, 0;
; CHECK-NEXT:    and.b64 %rd1, %rd2, -4;
; CHECK-NEXT:    cvt.u32.u64 %r23, %rd2;
; CHECK-NEXT:    and.b32 %r24, %r23, 3;
; CHECK-NEXT:    shl.b32 %r1, %r24, 3;
; CHECK-NEXT:    mov.b32 %r25, 65535;
; CHECK-NEXT:    shl.b32 %r26, %r25, %r1;
; CHECK-NEXT:    not.b32 %r2, %r26;
; CHECK-NEXT:    mov.b32 %r27, 1;
; CHECK-NEXT:    shl.b32 %r3, %r27, %r1;
; CHECK-NEXT:    ld.shared::cluster.b32 %r28, [%rd1];
; CHECK-NEXT:    and.b32 %r38, %r28, %r2;
; CHECK-NEXT:  $L__BB4_1: // %partword.cmpxchg.loop33
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    or.b32 %r29, %r38, %r3;
; CHECK-NEXT:    atom.relaxed.sys.shared::cluster.cas.b32 %r4, [%rd1], %r29, %r38;
; CHECK-NEXT:    setp.eq.b32 %p1, %r4, %r29;
; CHECK-NEXT:    @%p1 bra $L__BB4_3;
; CHECK-NEXT:  // %bb.2: // %partword.cmpxchg.failure32
; CHECK-NEXT:    // in Loop: Header=BB4_1 Depth=1
; CHECK-NEXT:    and.b32 %r5, %r4, %r2;
; CHECK-NEXT:    setp.ne.b32 %p2, %r38, %r5;
; CHECK-NEXT:    mov.b32 %r38, %r5;
; CHECK-NEXT:    @%p2 bra $L__BB4_1;
; CHECK-NEXT:  $L__BB4_3: // %partword.cmpxchg.end31
; CHECK-NEXT:    ld.shared::cluster.b32 %r30, [%rd1];
; CHECK-NEXT:    and.b32 %r39, %r30, %r2;
; CHECK-NEXT:  $L__BB4_4: // %partword.cmpxchg.loop23
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    or.b32 %r31, %r39, %r3;
; CHECK-NEXT:    atom.relaxed.sys.shared::cluster.cas.b32 %r6, [%rd1], %r31, %r39;
; CHECK-NEXT:    setp.eq.b32 %p3, %r6, %r31;
; CHECK-NEXT:    @%p3 bra $L__BB4_6;
; CHECK-NEXT:  // %bb.5: // %partword.cmpxchg.failure22
; CHECK-NEXT:    // in Loop: Header=BB4_4 Depth=1
; CHECK-NEXT:    and.b32 %r7, %r6, %r2;
; CHECK-NEXT:    setp.ne.b32 %p4, %r39, %r7;
; CHECK-NEXT:    mov.b32 %r39, %r7;
; CHECK-NEXT:    @%p4 bra $L__BB4_4;
; CHECK-NEXT:  $L__BB4_6: // %partword.cmpxchg.end21
; CHECK-NEXT:    fence.acq_rel.sys;
; CHECK-NEXT:    fence.acq_rel.sys;
; CHECK-NEXT:    ld.shared::cluster.b32 %r32, [%rd1];
; CHECK-NEXT:    and.b32 %r40, %r32, %r2;
; CHECK-NEXT:  $L__BB4_7: // %partword.cmpxchg.loop13
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    or.b32 %r33, %r40, %r3;
; CHECK-NEXT:    atom.relaxed.sys.shared::cluster.cas.b32 %r8, [%rd1], %r33, %r40;
; CHECK-NEXT:    setp.eq.b32 %p5, %r8, %r33;
; CHECK-NEXT:    @%p5 bra $L__BB4_9;
; CHECK-NEXT:  // %bb.8: // %partword.cmpxchg.failure12
; CHECK-NEXT:    // in Loop: Header=BB4_7 Depth=1
; CHECK-NEXT:    and.b32 %r9, %r8, %r2;
; CHECK-NEXT:    setp.ne.b32 %p6, %r40, %r9;
; CHECK-NEXT:    mov.b32 %r40, %r9;
; CHECK-NEXT:    @%p6 bra $L__BB4_7;
; CHECK-NEXT:  $L__BB4_9: // %partword.cmpxchg.end11
; CHECK-NEXT:    fence.acq_rel.sys;
; CHECK-NEXT:    ld.shared::cluster.b32 %r34, [%rd1];
; CHECK-NEXT:    and.b32 %r41, %r34, %r2;
; CHECK-NEXT:  $L__BB4_10: // %partword.cmpxchg.loop3
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    or.b32 %r35, %r41, %r3;
; CHECK-NEXT:    atom.relaxed.sys.shared::cluster.cas.b32 %r10, [%rd1], %r35, %r41;
; CHECK-NEXT:    setp.eq.b32 %p7, %r10, %r35;
; CHECK-NEXT:    @%p7 bra $L__BB4_12;
; CHECK-NEXT:  // %bb.11: // %partword.cmpxchg.failure2
; CHECK-NEXT:    // in Loop: Header=BB4_10 Depth=1
; CHECK-NEXT:    and.b32 %r11, %r10, %r2;
; CHECK-NEXT:    setp.ne.b32 %p8, %r41, %r11;
; CHECK-NEXT:    mov.b32 %r41, %r11;
; CHECK-NEXT:    @%p8 bra $L__BB4_10;
; CHECK-NEXT:  $L__BB4_12: // %partword.cmpxchg.end1
; CHECK-NEXT:    fence.acq_rel.sys;
; CHECK-NEXT:    fence.sc.sys;
; CHECK-NEXT:    ld.shared::cluster.b32 %r36, [%rd1];
; CHECK-NEXT:    and.b32 %r42, %r36, %r2;
; CHECK-NEXT:  $L__BB4_13: // %partword.cmpxchg.loop
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    or.b32 %r37, %r42, %r3;
; CHECK-NEXT:    atom.relaxed.sys.shared::cluster.cas.b32 %r12, [%rd1], %r37, %r42;
; CHECK-NEXT:    setp.eq.b32 %p9, %r12, %r37;
; CHECK-NEXT:    @%p9 bra $L__BB4_15;
; CHECK-NEXT:  // %bb.14: // %partword.cmpxchg.failure
; CHECK-NEXT:    // in Loop: Header=BB4_13 Depth=1
; CHECK-NEXT:    and.b32 %r13, %r12, %r2;
; CHECK-NEXT:    setp.ne.b32 %p10, %r42, %r13;
; CHECK-NEXT:    mov.b32 %r42, %r13;
; CHECK-NEXT:    @%p10 bra $L__BB4_13;
; CHECK-NEXT:  $L__BB4_15: // %partword.cmpxchg.end
; CHECK-NEXT:    fence.acq_rel.sys;
; CHECK-NEXT:    ret;
entry:
  ; Compare-exchange operation - all memory ordering combinations for 32-bit
  %0 = cmpxchg ptr addrspace(7) %dsmem_ptr, i32 1, i32 0 monotonic monotonic
  %1 = cmpxchg ptr addrspace(7) %dsmem_ptr, i32 1, i32 0 acquire monotonic
  %2 = cmpxchg ptr addrspace(7) %dsmem_ptr, i32 1, i32 0 acquire acquire
  %3 = cmpxchg ptr addrspace(7) %dsmem_ptr, i32 1, i32 0 release monotonic
  %4 = cmpxchg ptr addrspace(7) %dsmem_ptr, i32 1, i32 0 acq_rel monotonic
  %5 = cmpxchg ptr addrspace(7) %dsmem_ptr, i32 1, i32 0 acq_rel acquire
  %6 = cmpxchg ptr addrspace(7) %dsmem_ptr, i32 1, i32 0 seq_cst monotonic
  %7 = cmpxchg ptr addrspace(7) %dsmem_ptr, i32 1, i32 0 seq_cst acquire
  %8 = cmpxchg ptr addrspace(7) %dsmem_ptr, i32 1, i32 0 seq_cst seq_cst

  ; Compare-exchange operation - all memory ordering combinations for 64-bit
  %9 = cmpxchg ptr addrspace(7) %dsmem_ptr, i64 1, i64 0 monotonic monotonic
  %10 = cmpxchg ptr addrspace(7) %dsmem_ptr, i64 1, i64 0 acquire monotonic
  %11 = cmpxchg ptr addrspace(7) %dsmem_ptr, i64 1, i64 0 acquire acquire
  %12 = cmpxchg ptr addrspace(7) %dsmem_ptr, i64 1, i64 0 release monotonic
  %13 = cmpxchg ptr addrspace(7) %dsmem_ptr, i64 1, i64 0 acq_rel monotonic
  %14 = cmpxchg ptr addrspace(7) %dsmem_ptr, i64 1, i64 0 acq_rel acquire
  %15 = cmpxchg ptr addrspace(7) %dsmem_ptr, i64 1, i64 0 seq_cst monotonic
  %16 = cmpxchg ptr addrspace(7) %dsmem_ptr, i64 1, i64 0 seq_cst acquire
  %17 = cmpxchg ptr addrspace(7) %dsmem_ptr, i64 1, i64 0 seq_cst seq_cst

  ; Compare-exchange operation - 16-bit
  %18 = cmpxchg ptr addrspace(7) %dsmem_ptr, i16 1, i16 0 monotonic monotonic
  %19 = cmpxchg ptr addrspace(7) %dsmem_ptr, i16 1, i16 0 acquire acquire
  %20 = cmpxchg ptr addrspace(7) %dsmem_ptr, i16 1, i16 0 release monotonic
  %21 = cmpxchg ptr addrspace(7) %dsmem_ptr, i16 1, i16 0 acq_rel acquire
  %22 = cmpxchg ptr addrspace(7) %dsmem_ptr, i16 1, i16 0 seq_cst seq_cst
  ret void
}
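; Illustrative sketch (not part of the autogenerated checks; the function name is
; hypothetical): the addrspace(7) pointer returned by llvm.nvvm.mapa.shared.cluster
; can also feed ordinary loads/stores of other widths, analogous to the i32
; load/store exercised in test_distributed_shared_cluster_common:
;
;   define i64 @example_dsmem_load_i64(ptr addrspace(3) %p, i32 %rank) {
;     %remote = call ptr addrspace(7) @llvm.nvvm.mapa.shared.cluster(ptr addrspace(3) %p, i32 %rank)
;     %v = load i64, ptr addrspace(7) %remote
;     ret i64 %v
;   }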