
Note: This relands #140615, adding a ".count" suffix to the non-".all" variants.

Our current support for barrier intrinsics is confusing and incomplete: multiple intrinsics map to the same instruction, and the intrinsic names do not clearly convey their semantics. Further, we lack support for some variants. This change unifies the IR representation into a single, consistently named set of intrinsics:

- llvm.nvvm.barrier.cta.sync.aligned.all(i32)
- llvm.nvvm.barrier.cta.sync.aligned.count(i32, i32)
- llvm.nvvm.barrier.cta.arrive.aligned.count(i32, i32)
- llvm.nvvm.barrier.cta.sync.all(i32)
- llvm.nvvm.barrier.cta.sync.count(i32, i32)
- llvm.nvvm.barrier.cta.arrive.count(i32, i32)

The following Auto-Upgrade rules are used to maintain compatibility with IR that uses the legacy intrinsics:

* llvm.nvvm.barrier0 --> llvm.nvvm.barrier.cta.sync.aligned.all(0)
* llvm.nvvm.barrier.n --> llvm.nvvm.barrier.cta.sync.aligned.all(x)
* llvm.nvvm.bar.sync --> llvm.nvvm.barrier.cta.sync.aligned.all(x)
* llvm.nvvm.barrier --> llvm.nvvm.barrier.cta.sync.aligned.count(x, y)
* llvm.nvvm.barrier.sync --> llvm.nvvm.barrier.cta.sync.all(x)
* llvm.nvvm.barrier.sync.cnt --> llvm.nvvm.barrier.cta.sync.count(x, y)
63 lines
1.3 KiB
LLVM
63 lines
1.3 KiB
LLVM
; Runs the jump-threading pass followed by the IR verifier on this file and
; pipes the printed module to FileCheck, which matches it against the
; directives embedded in the comments of this file.
; RUN: opt -S -passes=jump-threading,verify < %s | FileCheck %s
|
|
|
|
; Module-level target description: data layout string and target triple.
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
|
|
target triple = "nvptx64-nvidia-cuda"
|
|
|
|
; COMDAT groups (selection kind "any") for the two functions defined below.
$wrapped_tid = comdat any
|
|
|
|
$foo = comdat any
|
|
|
|
; Thin wrapper: calls @tid and returns its result unchanged.
; NOTE(review): attribute group #0 is not among the groups defined in the
; visible part of this file (#1-#4) - confirm that it is meant to be empty.
define i32 @wrapped_tid() #0 comdat align 32 {
entry:
  %tid.val = call i32 @tid()
  ret i32 %tid.val
}
|
|
|
|
; Barrier intrinsic called from @foo. Attribute group #1 marks it convergent.
declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #1
|
|
|
|
; We had a bug where we duplicated basic blocks containing convergent
|
|
; functions like @llvm.nvvm.barrier.cta.sync.aligned.all below. Verify that we don't do
|
|
; that.
|
|
; Regression input for jump threading: a loop whose shared block contains a
; call to a convergent barrier intrinsic. The FileCheck directives inside the
; barrier block require that exactly one such call appears in the output,
; i.e. the block holding the barrier was not duplicated.
define void @foo() local_unnamed_addr #2 comdat align 32 {
; CHECK-LABEL: @foo
entry:
  %tid.val = call i32 @tid()
  %tid.mod7 = urem i32 %tid.val, 7
  br label %loop.header

loop.header:
  ; Only the path where @tid returned 0 calls @bar before the barrier.
  %tid.is.zero = icmp eq i32 %tid.val, 0
  br i1 %tid.is.zero, label %call.bar, label %barrier

call.bar:
  call void @bar()
  br label %barrier

barrier:
  ; Reached from both loop.header and call.bar; must stay a single block.
; CHECK: call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
; CHECK-NOT: call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
  call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
  %mod.is.zero = icmp eq i32 %tid.mod7, 0
  br i1 %mod.is.zero, label %latch, label %range.check

range.check:
  %tid.below.49 = icmp ult i32 %tid.val, 49
  br i1 %tid.below.49, label %trap, label %latch

trap:
  call void @llvm.trap()
  unreachable

latch:
  ; Back edge to the loop header.
  br label %loop.header
}
|
|
|
|
; External function called by @wrapped_tid and @foo; attribute group #2 marks
; it readnone.
declare i32 @tid() #2
|
|
|
|
; External function called by @foo on one path ahead of the barrier; it
; carries no attribute group.
declare void @bar()
|
|
|
|
; Trap intrinsic called by @foo just before an unreachable; attribute group #3
; marks it noreturn.
declare void @llvm.trap() #3
|
|
|
|
; #1: applied to the barrier intrinsic declaration.
attributes #1 = { convergent }
|
|
; #2: applied to @foo and to the declaration of @tid.
attributes #2 = { readnone }
|
|
; #3: applied to the declaration of @llvm.trap.
attributes #3 = { noreturn }
|
|
; NOTE(review): no line visible in this file references #4, while @wrapped_tid
; references an undefined #0 - confirm whether this group was meant to be #0.
attributes #4 = { convergent }
|