
This allows us to emit wide generic and scratch memory accesses when we do not have alignment information. In cases where accesses happen to be properly aligned or where generic accesses do not go to scratch memory, this improves performance of the generated code by a factor of up to 16x and reduces code size, especially when lowering memcpy and memmove intrinsics. Also: Make the use of the FeatureUnalignedScratchAccess feature more consistent: FeatureUnalignedScratchAccess and EnableFlatScratch are now orthogonal, whereas, before, code assumed that the latter implies the former in some places. Part of SWDEV-455845.
47 lines
2.3 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck %s

; Return buffer layout: two consecutive 2-byte half fields (4 bytes total),
; written through a generic (flat) pointer below.
%ret_struct = type { half, half }
; Regression test: the two adjacent `align 2` half stores into %ret_struct
; (a generic/flat pointer with no stronger alignment info) should be merged
; into a single wide access — the CHECK lines expect one flat_store_dword
; rather than two 16-bit stores.
;
; Inputs: %arg0..%arg3 are LDS (addrspace(3)) pointers to <2 x half>
; vectors; only the high half (element 1) of each vector is used.
define void @extracted_values(ptr %ret_struct, ptr addrspace(3) %arg0, ptr addrspace(3) %arg1, ptr addrspace(3) %arg2, ptr addrspace(3) %arg3) {
; CHECK-LABEL: extracted_values:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: ds_read_b32 v3, v3
; CHECK-NEXT: ds_read_b32 v4, v4
; CHECK-NEXT: ds_read_b32 v2, v2
; CHECK-NEXT: ds_read_b32 v5, v5
; CHECK-NEXT: s_waitcnt lgkmcnt(2)
; CHECK-NEXT: v_sub_f16_sdwa v6, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; CHECK-NEXT: v_sub_f16_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_sub_f16_sdwa v7, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; CHECK-NEXT: v_sub_f16_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; CHECK-NEXT: v_add_f16_e32 v4, v6, v7
; CHECK-NEXT: v_add_f16_e32 v2, v3, v2
; CHECK-NEXT: v_pack_b32_f16 v2, v4, v2
; CHECK-NEXT: flat_store_dword v[0:1], v2
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
  ; Load the four <2 x half> vectors from LDS and keep element 1 of each.
  %tmp0 = load <2 x half>, ptr addrspace(3) %arg1, align 4
  %tmp1 = extractelement <2 x half> %tmp0, i64 1
  %tmp2 = load <2 x half>, ptr addrspace(3) %arg2, align 4
  %tmp3 = extractelement <2 x half> %tmp2, i64 1
  ; First result: (arg1.hi - arg2.hi) + (arg0.hi - arg3.hi), contract-fusable.
  %tmp4 = fsub contract half %tmp1, %tmp3
  %tmp5 = load <2 x half>, ptr addrspace(3) %arg0, align 4
  %tmp6 = extractelement <2 x half> %tmp5, i64 1
  %tmp7 = load <2 x half>, ptr addrspace(3) %arg3, align 4
  %tmp8 = extractelement <2 x half> %tmp7, i64 1
  %tmp9 = fsub contract half %tmp6, %tmp8
  %tmp10 = fadd contract half %tmp4, %tmp9
  ; Second result: the same differences negated, i.e. (arg2.hi - arg1.hi) + (arg3.hi - arg0.hi).
  %tmp11 = fsub contract half %tmp3, %tmp1
  %tmp12 = fsub contract half %tmp8, %tmp6
  %tmp13 = fadd contract half %tmp11, %tmp12
  ; Two adjacent align-2 half stores through a generic pointer; the point of
  ; the test is that these are combined into one dword-wide flat store.
  %field_ptr = getelementptr %ret_struct, ptr %ret_struct, i32 0, i32 0
  store half %tmp10, ptr %field_ptr, align 2
  %field_ptr1 = getelementptr %ret_struct, ptr %ret_struct, i32 0, i32 1
  store half %tmp13, ptr %field_ptr1, align 2
  ret void
}