llvm-project/llvm/test/CodeGen/AMDGPU/global_smrd_cfg.ll
Alexander Timofeev 18009560c5 [AMDGPU] Scalarization of global uniform loads.
Summary:
LC can currently select scalar load for uniform memory access
basing on readonly memory address space only. This restriction
originated from the fact that in HW prior to VI vector and scalar caches
are not coherent. With MemoryDependenceAnalysis we can check that the
memory location corresponding to the memory operand of the LOAD is not
clobbered along the all paths from the function entry.

Reviewers: rampitec, tstellarAMD, arsenm

Subscribers: wdng, arsenm, nhaehnle

Differential Revision: https://reviews.llvm.org/D26917

llvm-svn: 289076
2016-12-08 17:28:47 +00:00

81 lines
2.8 KiB
LLVM

; RUN: llc -mtriple amdgcn--amdhsa -mcpu=fiji -amdgpu-scalarize-global-loads=true -verify-machineinstrs < %s | FileCheck %s
; CHECK-LABEL: %bb11
; Load from %arg in a Loop body has alias store
; CHECK: flat_load_dword
; CHECK-LABEL: %bb20
; CHECK: flat_store_dword
; #####################################################################
; CHECK-LABEL: %bb22
; Load from %arg has alias store in Loop
; CHECK: flat_load_dword
; #####################################################################
; Load from %arg1 has no-alias store in Loop - arg1[i+1] never alias arg1[i]
; CHECK: s_load_dword
define amdgpu_kernel void @cfg(i32 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture %arg1, i32 %arg2) #0 {
bb:
%tmp = sext i32 %arg2 to i64
%tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp
%tmp4 = load i32, i32 addrspace(1)* %tmp3, align 4, !tbaa !0
%tmp5 = icmp sgt i32 %tmp4, 0
br i1 %tmp5, label %bb6, label %bb8
bb6: ; preds = %bb
br label %bb11
bb7: ; preds = %bb22
br label %bb8
bb8: ; preds = %bb7, %bb
%tmp9 = phi i32 [ 0, %bb ], [ %tmp30, %bb7 ]
%tmp10 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp
store i32 %tmp9, i32 addrspace(1)* %tmp10, align 4, !tbaa !0
ret void
bb11: ; preds = %bb22, %bb6
%tmp12 = phi i32 [ %tmp30, %bb22 ], [ 0, %bb6 ]
%tmp13 = phi i32 [ %tmp25, %bb22 ], [ 0, %bb6 ]
%tmp14 = srem i32 %tmp13, %arg2
%tmp15 = sext i32 %tmp14 to i64
%tmp16 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp15
%tmp17 = load i32, i32 addrspace(1)* %tmp16, align 4, !tbaa !0
%tmp18 = icmp sgt i32 %tmp17, 100
%tmp19 = sext i32 %tmp13 to i64
br i1 %tmp18, label %bb20, label %bb22
bb20: ; preds = %bb11
%tmp21 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp19
store i32 0, i32 addrspace(1)* %tmp21, align 4, !tbaa !0
br label %bb22
bb22: ; preds = %bb20, %bb11
%tmp23 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp19
%tmp24 = load i32, i32 addrspace(1)* %tmp23, align 4, !tbaa !0
%tmp25 = add nuw nsw i32 %tmp13, 1
%tmp26 = sext i32 %tmp25 to i64
%tmp27 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp26
%tmp28 = load i32, i32 addrspace(1)* %tmp27, align 4, !tbaa !0
%tmp29 = add i32 %tmp24, %tmp12
%tmp30 = add i32 %tmp29, %tmp28
%tmp31 = icmp eq i32 %tmp25, %tmp4
br i1 %tmp31, label %bb7, label %bb11
}
attributes #0 = { "target-cpu"="fiji" }
!0 = !{!1, !1, i64 0}
!1 = !{!"int", !2, i64 0}
!2 = !{!"omnipotent char", !3, i64 0}
!3 = !{!"Simple C/C++ TBAA"}