llvm-project/llvm/test/CodeGen/AMDGPU/sdwa-commute.ll
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck %s
%ret_struct = type { half, half }
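
; Inferred intent (from the file name and the generated checks; not stated in the
; test itself): both operand orders of the fsub on the loaded vectors' high halves
; should be matched into v_sub_f16_sdwa with WORD_1 source selects, i.e. SDWA
; conversion has to handle the commuted form as well.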
define void @extracted_values(ptr %ret_struct, ptr addrspace(3) %arg0, ptr addrspace(3) %arg1, ptr addrspace(3) %arg2, ptr addrspace(3) %arg3) {
; CHECK-LABEL: extracted_values:
; CHECK:       ; %bb.0: ; %entry
; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT:    ds_read_b32 v3, v3
; CHECK-NEXT:    ds_read_b32 v4, v4
; CHECK-NEXT:    ds_read_b32 v2, v2
; CHECK-NEXT:    ds_read_b32 v5, v5
; CHECK-NEXT:    s_waitcnt lgkmcnt(2)
; CHECK-NEXT:    v_sub_f16_sdwa v6, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; CHECK-NEXT:    v_sub_f16_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    v_sub_f16_sdwa v7, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; CHECK-NEXT:    v_sub_f16_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; CHECK-NEXT:    v_add_f16_e32 v4, v6, v7
; CHECK-NEXT:    v_add_f16_e32 v2, v3, v2
; CHECK-NEXT:    v_pack_b32_f16 v2, v4, v2
; CHECK-NEXT:    flat_store_dword v[0:1], v2
; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
; CHECK-NEXT:    s_setpc_b64 s[30:31]
entry:
  %tmp0 = load <2 x half>, ptr addrspace(3) %arg1, align 4
  %tmp1 = extractelement <2 x half> %tmp0, i64 1
  %tmp2 = load <2 x half>, ptr addrspace(3) %arg2, align 4
  %tmp3 = extractelement <2 x half> %tmp2, i64 1
  %tmp4 = fsub contract half %tmp1, %tmp3
  %tmp5 = load <2 x half>, ptr addrspace(3) %arg0, align 4
  %tmp6 = extractelement <2 x half> %tmp5, i64 1
  %tmp7 = load <2 x half>, ptr addrspace(3) %arg3, align 4
  %tmp8 = extractelement <2 x half> %tmp7, i64 1
  %tmp9 = fsub contract half %tmp6, %tmp8
  %tmp10 = fadd contract half %tmp4, %tmp9
  %tmp11 = fsub contract half %tmp3, %tmp1
  %tmp12 = fsub contract half %tmp8, %tmp6
  %tmp13 = fadd contract half %tmp11, %tmp12
  %field_ptr = getelementptr %ret_struct, ptr %ret_struct, i32 0, i32 0
  store half %tmp10, ptr %field_ptr, align 2
  %field_ptr1 = getelementptr %ret_struct, ptr %ret_struct, i32 0, i32 1
  store half %tmp13, ptr %field_ptr1, align 2
  ret void
}