From e0c2cc7ed0d4b0c1a567aa7b69d84f8792f8a703 Mon Sep 17 00:00:00 2001 From: vangthao95 Date: Thu, 5 Feb 2026 07:18:39 -0800 Subject: [PATCH] [AMDGPU][GlobalISel] Add buffer store byte/short RegBankLegalize rules (#179367) --- .../AMDGPU/AMDGPURegBankLegalizeRules.cpp | 6 +- .../llvm.amdgcn.raw.buffer.store.ll | 6 +- .../llvm.amdgcn.raw.ptr.buffer.store.ll | 2 +- .../llvm.amdgcn.struct.buffer.store.ll | 6 +- .../llvm.amdgcn.struct.ptr.buffer.store.ll | 2 +- ...ffer-fat-pointers-contents-legalization.ll | 470 +++++++++++++----- 6 files changed, 368 insertions(+), 124 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index 7844d19ada72..26e409b17691 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -269,7 +269,8 @@ UniformityLLTOpPredicateID LLTToBId(LLT Ty) { return B64; if (Ty == LLT::fixed_vector(3, 32)) return B96; - if (Ty == LLT::fixed_vector(4, 32) || isAnyPtr(Ty, 128)) + if (Ty == LLT::fixed_vector(4, 32) || Ty == LLT::fixed_vector(2, 64) || + Ty == LLT::fixed_vector(8, 16) || isAnyPtr(Ty, 128)) return B128; return _; } @@ -1022,7 +1023,8 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Any( {{DivB128}, {{VgprB128}, {SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}}); - addRulesForGOpcs({G_AMDGPU_BUFFER_STORE, G_AMDGPU_BUFFER_STORE_FORMAT, + addRulesForGOpcs({G_AMDGPU_BUFFER_STORE, G_AMDGPU_BUFFER_STORE_BYTE, + G_AMDGPU_BUFFER_STORE_SHORT, G_AMDGPU_BUFFER_STORE_FORMAT, G_AMDGPU_BUFFER_STORE_FORMAT_D16, G_AMDGPU_TBUFFER_STORE_FORMAT, G_AMDGPU_TBUFFER_STORE_FORMAT_D16}) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll index c365d5711f6c..373b120c566a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX8 %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GFX12,GFX1200 %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GFX12,GFX1250 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX8 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GFX12,GFX1200 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 -stop-after=instruction-select -o - %s | FileCheck -check-prefixes=GFX12,GFX1250 %s ; FIXME: Test with SI when argument lowering not broken for f16 ; Natural mapping diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.ll index a15b34dbb8c2..1c667e287f63 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.store.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -o - %s | FileCheck %s ; FIXME: Test with SI when argument lowering not broken for f16 ; Natural mapping diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll index c9771b5aca0d..6fb35ad5ce1a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.store.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX8 %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX1200 %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX1250 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX8 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX1200 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 -stop-after=instruction-select -o - %s | FileCheck -check-prefix=GFX1250 %s ; Natural mapping define amdgpu_ps void @struct_buffer_store_f32_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset(float %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.ll index f331e2917674..3a8e2e6e5f6c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.store.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=instruction-select -o - %s | FileCheck %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx810 -stop-after=instruction-select -o - %s | FileCheck %s ; Natural mapping define amdgpu_ps void @struct_ptr_buffer_store_f32_sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll index 867ec0488d19..5967d17c351e 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=SDAG %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 -global-isel-abort=2 < %s | FileCheck -check-prefix=GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 -new-reg-bank-select -global-isel-abort=2 < %s | FileCheck -check-prefix=GISEL %s ; Note: if you're adding tests here, also add them to ; lower-buffer-fat-pointers-contents-legalization.ll to verify the IR produced by @@ -173,6 +173,14 @@ define i128 @load_i128(ptr addrspace(8) inreg %buf) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0 ; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GISEL-NEXT: v_readfirstlane_b32 s5, v1 +; GISEL-NEXT: v_readfirstlane_b32 s6, v2 +; GISEL-NEXT: v_readfirstlane_b32 s7, v3 +; GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GISEL-NEXT: v_mov_b32_e32 v3, s7 ; GISEL-NEXT: s_setpc_b64 s[30:31] %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) %ret = load i128, ptr addrspace(7) %p @@ -439,6 +447,14 @@ define <8 x i16> @load_v8i16(ptr addrspace(8) inreg %buf) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0 ; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GISEL-NEXT: v_readfirstlane_b32 s5, v1 +; GISEL-NEXT: v_readfirstlane_b32 s6, v2 +; GISEL-NEXT: v_readfirstlane_b32 s7, v3 +; GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GISEL-NEXT: v_mov_b32_e32 v3, s7 ; GISEL-NEXT: s_setpc_b64 s[30:31] %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) %ret = load <8 x i16>, ptr addrspace(7) %p @@ -477,6 +493,14 @@ define <2 x i64> @load_v2i64(ptr addrspace(8) inreg %buf) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0 ; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GISEL-NEXT: v_readfirstlane_b32 s5, v1 +; GISEL-NEXT: v_readfirstlane_b32 s6, v2 +; GISEL-NEXT: v_readfirstlane_b32 s7, v3 +; GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GISEL-NEXT: v_mov_b32_e32 v3, s7 ; GISEL-NEXT: s_setpc_b64 s[30:31] %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) %ret = load <2 x i64>, ptr addrspace(7) %p @@ -667,6 +691,14 @@ define <8 x half> @load_v8f16(ptr addrspace(8) inreg %buf) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0 ; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GISEL-NEXT: v_readfirstlane_b32 s5, v1 +; GISEL-NEXT: v_readfirstlane_b32 s6, v2 +; GISEL-NEXT: v_readfirstlane_b32 s7, v3 +; GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GISEL-NEXT: v_mov_b32_e32 v3, s7 ; GISEL-NEXT: s_setpc_b64 s[30:31] %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) %ret = load <8 x half>, ptr addrspace(7) %p @@ -1161,6 +1193,14 @@ define <2 x ptr addrspace(1)> @load_v2p1(ptr addrspace(8) inreg %buf) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0 ; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GISEL-NEXT: v_readfirstlane_b32 s5, v1 +; GISEL-NEXT: v_readfirstlane_b32 s6, v2 +; GISEL-NEXT: v_readfirstlane_b32 s7, v3 +; GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GISEL-NEXT: v_mov_b32_e32 v3, s7 ; GISEL-NEXT: s_setpc_b64 s[30:31] %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) %ret = load <2 x ptr addrspace(1)>, ptr addrspace(7) %p @@ -1199,6 +1239,10 @@ define <2 x ptr addrspace(5)> @load_v2p5(ptr addrspace(8) inreg %buf) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: buffer_load_dwordx2 v[0:1], off, s[16:19], 0 ; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GISEL-NEXT: v_readfirstlane_b32 s5, v1 +; GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GISEL-NEXT: v_mov_b32_e32 v1, s5 ; GISEL-NEXT: s_setpc_b64 s[30:31] %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) %ret = load <2 x ptr addrspace(5)>, ptr addrspace(7) %p @@ -1237,6 +1281,12 @@ define <3 x ptr addrspace(5)> @load_v3p5(ptr addrspace(8) inreg %buf) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: buffer_load_dwordx3 v[0:2], off, s[16:19], 0 ; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GISEL-NEXT: v_readfirstlane_b32 s5, v1 +; GISEL-NEXT: v_readfirstlane_b32 s6, v2 +; GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GISEL-NEXT: v_mov_b32_e32 v2, s6 ; GISEL-NEXT: s_setpc_b64 s[30:31] %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) %ret = load <3 x ptr addrspace(5)>, ptr addrspace(7) %p @@ -1275,6 +1325,14 @@ define <4 x ptr addrspace(5)> @load_v4p5(ptr addrspace(8) inreg %buf) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0 ; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GISEL-NEXT: v_readfirstlane_b32 s5, v1 +; GISEL-NEXT: v_readfirstlane_b32 s6, v2 +; GISEL-NEXT: v_readfirstlane_b32 s7, v3 +; GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GISEL-NEXT: v_mov_b32_e32 v3, s7 ; GISEL-NEXT: s_setpc_b64 s[30:31] %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) %ret = load <4 x ptr addrspace(5)>, ptr addrspace(7) %p @@ -1315,6 +1373,12 @@ define <6 x half> @load_v6f16(ptr addrspace(8) inreg %buf) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: buffer_load_dwordx3 v[0:2], off, s[16:19], 0 ; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GISEL-NEXT: v_readfirstlane_b32 s5, v1 +; GISEL-NEXT: v_readfirstlane_b32 s6, v2 +; GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GISEL-NEXT: v_mov_b32_e32 v2, s6 ; GISEL-NEXT: s_setpc_b64 s[30:31] %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) %ret = load <6 x half>, ptr addrspace(7) %p @@ -1612,7 +1676,24 @@ define <4 x ptr addrspace(1)> @load_v4p1(ptr addrspace(8) inreg %buf) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0 ; GISEL-NEXT: buffer_load_dwordx4 v[4:7], off, s[16:19], 0 offset:16 +; GISEL-NEXT: s_waitcnt vmcnt(1) +; GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GISEL-NEXT: v_readfirstlane_b32 s5, v1 +; GISEL-NEXT: v_readfirstlane_b32 s6, v2 +; GISEL-NEXT: v_readfirstlane_b32 s7, v3 ; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: v_readfirstlane_b32 s8, v4 +; GISEL-NEXT: v_readfirstlane_b32 s9, v5 +; GISEL-NEXT: v_readfirstlane_b32 s10, v6 +; GISEL-NEXT: v_readfirstlane_b32 s11, v7 +; GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GISEL-NEXT: v_mov_b32_e32 v3, s7 +; GISEL-NEXT: v_mov_b32_e32 v4, s8 +; GISEL-NEXT: v_mov_b32_e32 v5, s9 +; GISEL-NEXT: v_mov_b32_e32 v6, s10 +; GISEL-NEXT: v_mov_b32_e32 v7, s11 ; GISEL-NEXT: s_setpc_b64 s[30:31] %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) %ret = load <4 x ptr addrspace(1)>, ptr addrspace(7) %p @@ -1655,6 +1736,8 @@ define <1 x i16> @load_v1i16(ptr addrspace(8) inreg %buf) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: buffer_load_ushort v0, off, s[16:19], 0 ; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GISEL-NEXT: s_setpc_b64 s[30:31] %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) %ret = load <1 x i16>, ptr addrspace(7) %p @@ -1692,8 +1775,11 @@ define <3 x i16> @load_v3i16(ptr addrspace(8) inreg %buf) { ; GISEL-LABEL: load_v3i16: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: buffer_load_dword v0, off, s[16:19], 0 ; GISEL-NEXT: buffer_load_ushort v1, off, s[16:19], 0 offset:4 +; GISEL-NEXT: buffer_load_dword v0, off, s[16:19], 0 +; GISEL-NEXT: s_waitcnt vmcnt(1) +; GISEL-NEXT: v_readfirstlane_b32 s4, v1 +; GISEL-NEXT: v_mov_b32_e32 v1, s4 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) @@ -1734,8 +1820,11 @@ define <5 x i16> @load_v5i16(ptr addrspace(8) inreg %buf) { ; GISEL-LABEL: load_v5i16: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: buffer_load_dwordx2 v[0:1], off, s[16:19], 0 ; GISEL-NEXT: buffer_load_ushort v2, off, s[16:19], 0 offset:8 +; GISEL-NEXT: buffer_load_dwordx2 v[0:1], off, s[16:19], 0 +; GISEL-NEXT: s_waitcnt vmcnt(1) +; GISEL-NEXT: v_readfirstlane_b32 s4, v2 +; GISEL-NEXT: v_mov_b32_e32 v2, s4 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) @@ -1777,6 +1866,12 @@ define <6 x i16> @load_v6i16(ptr addrspace(8) inreg %buf) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: buffer_load_dwordx3 v[0:2], off, s[16:19], 0 ; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GISEL-NEXT: v_readfirstlane_b32 s5, v1 +; GISEL-NEXT: v_readfirstlane_b32 s6, v2 +; GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GISEL-NEXT: v_mov_b32_e32 v2, s6 ; GISEL-NEXT: s_setpc_b64 s[30:31] %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) %ret = load <6 x i16>, ptr addrspace(7) %p @@ -1816,7 +1911,16 @@ define <7 x i16> @load_v7i16(ptr addrspace(8) inreg %buf) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: buffer_load_dwordx3 v[0:2], off, s[16:19], 0 ; GISEL-NEXT: buffer_load_ushort v3, off, s[16:19], 0 offset:12 +; GISEL-NEXT: s_waitcnt vmcnt(1) +; GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GISEL-NEXT: v_readfirstlane_b32 s5, v1 +; GISEL-NEXT: v_readfirstlane_b32 s6, v2 ; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: v_readfirstlane_b32 s7, v3 +; GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GISEL-NEXT: v_mov_b32_e32 v3, s7 ; GISEL-NEXT: s_setpc_b64 s[30:31] %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) %ret = load <7 x i16>, ptr addrspace(7) %p @@ -1858,7 +1962,18 @@ define <9 x i16> @load_v9i16(ptr addrspace(8) inreg %buf) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0 ; GISEL-NEXT: buffer_load_ushort v4, off, s[16:19], 0 offset:16 +; GISEL-NEXT: s_waitcnt vmcnt(1) +; GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GISEL-NEXT: v_readfirstlane_b32 s5, v1 +; GISEL-NEXT: v_readfirstlane_b32 s6, v2 +; GISEL-NEXT: v_readfirstlane_b32 s7, v3 ; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: v_readfirstlane_b32 s8, v4 +; GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GISEL-NEXT: v_mov_b32_e32 v3, s7 +; GISEL-NEXT: v_mov_b32_e32 v4, s8 ; GISEL-NEXT: s_setpc_b64 s[30:31] %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) %ret = load <9 x i16>, ptr addrspace(7) %p @@ -1942,7 +2057,9 @@ define <2 x i8> @load_v2i8(ptr addrspace(8) inreg %buf) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: buffer_load_ushort v0, off, s[16:19], 0 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GISEL-NEXT: s_lshr_b32 s4, s4, 8 +; GISEL-NEXT: v_mov_b32_e32 v1, s4 ; GISEL-NEXT: s_setpc_b64 s[30:31] %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) %ret = load <2 x i8>, ptr addrspace(7) %p @@ -1990,7 +2107,9 @@ define <3 x i8> @load_v3i8(ptr addrspace(8) inreg %buf) { ; GISEL-NEXT: buffer_load_ushort v0, off, s[16:19], 0 ; GISEL-NEXT: buffer_load_ubyte v2, off, s[16:19], 0 offset:2 ; GISEL-NEXT: s_waitcnt vmcnt(1) -; GISEL-NEXT: v_lshrrev_b32_e32 v1, 8, v0 +; GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GISEL-NEXT: s_lshr_b32 s4, s4, 8 +; GISEL-NEXT: v_mov_b32_e32 v1, s4 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) @@ -2040,9 +2159,13 @@ define <4 x i8> @load_v4i8(ptr addrspace(8) inreg %buf) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: buffer_load_dword v0, off, s[16:19], 0 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GISEL-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GISEL-NEXT: s_lshr_b32 s5, s4, 8 +; GISEL-NEXT: s_lshr_b32 s6, s4, 16 +; GISEL-NEXT: s_lshr_b32 s4, s4, 24 +; GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GISEL-NEXT: v_mov_b32_e32 v3, s4 ; GISEL-NEXT: s_setpc_b64 s[30:31] %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) %ret = load <4 x i8>, ptr addrspace(7) %p @@ -2100,9 +2223,13 @@ define <5 x i8> @load_v5i8(ptr addrspace(8) inreg %buf) { ; GISEL-NEXT: buffer_load_dword v0, off, s[16:19], 0 ; GISEL-NEXT: buffer_load_ubyte v4, off, s[16:19], 0 offset:4 ; GISEL-NEXT: s_waitcnt vmcnt(1) -; GISEL-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GISEL-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GISEL-NEXT: s_lshr_b32 s5, s4, 8 +; GISEL-NEXT: s_lshr_b32 s6, s4, 16 +; GISEL-NEXT: s_lshr_b32 s4, s4, 24 +; GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GISEL-NEXT: v_mov_b32_e32 v3, s4 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) @@ -2167,11 +2294,17 @@ define <6 x i8> @load_v6i8(ptr addrspace(8) inreg %buf) { ; GISEL-NEXT: buffer_load_dword v0, off, s[16:19], 0 ; GISEL-NEXT: buffer_load_ushort v4, off, s[16:19], 0 offset:4 ; GISEL-NEXT: s_waitcnt vmcnt(1) -; GISEL-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GISEL-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GISEL-NEXT: v_readfirstlane_b32 s5, v4 +; GISEL-NEXT: s_lshr_b32 s6, s4, 8 +; GISEL-NEXT: s_lshr_b32 s7, s4, 16 +; GISEL-NEXT: s_lshr_b32 s4, s4, 24 +; GISEL-NEXT: s_lshr_b32 s5, s5, 8 +; GISEL-NEXT: v_mov_b32_e32 v1, s6 +; GISEL-NEXT: v_mov_b32_e32 v2, s7 +; GISEL-NEXT: v_mov_b32_e32 v3, s4 +; GISEL-NEXT: v_mov_b32_e32 v5, s5 ; GISEL-NEXT: s_setpc_b64 s[30:31] %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) %ret = load <6 x i8>, ptr addrspace(7) %p @@ -2238,11 +2371,17 @@ define <7 x i8> @load_v7i8(ptr addrspace(8) inreg %buf) { ; GISEL-NEXT: buffer_load_ushort v4, off, s[16:19], 0 offset:4 ; GISEL-NEXT: buffer_load_ubyte v6, off, s[16:19], 0 offset:6 ; GISEL-NEXT: s_waitcnt vmcnt(2) -; GISEL-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GISEL-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GISEL-NEXT: v_readfirstlane_b32 s4, v0 ; GISEL-NEXT: s_waitcnt vmcnt(1) -; GISEL-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GISEL-NEXT: v_readfirstlane_b32 s5, v4 +; GISEL-NEXT: s_lshr_b32 s6, s4, 8 +; GISEL-NEXT: s_lshr_b32 s7, s4, 16 +; GISEL-NEXT: s_lshr_b32 s4, s4, 24 +; GISEL-NEXT: s_lshr_b32 s5, s5, 8 +; GISEL-NEXT: v_mov_b32_e32 v1, s6 +; GISEL-NEXT: v_mov_b32_e32 v2, s7 +; GISEL-NEXT: v_mov_b32_e32 v3, s4 +; GISEL-NEXT: v_mov_b32_e32 v5, s5 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) @@ -2311,14 +2450,21 @@ define <8 x i8> @load_v8i8(ptr addrspace(8) inreg %buf) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: buffer_load_dwordx2 v[0:1], off, s[16:19], 0 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: v_lshrrev_b32_e32 v8, 8, v0 -; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GISEL-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GISEL-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GISEL-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; GISEL-NEXT: v_lshrrev_b32_e32 v7, 24, v1 +; GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GISEL-NEXT: v_readfirstlane_b32 s5, v1 +; GISEL-NEXT: s_lshr_b32 s6, s4, 8 +; GISEL-NEXT: s_lshr_b32 s7, s4, 16 +; GISEL-NEXT: s_lshr_b32 s4, s4, 24 +; GISEL-NEXT: s_lshr_b32 s8, s5, 8 +; GISEL-NEXT: s_lshr_b32 s9, s5, 16 +; GISEL-NEXT: s_lshr_b32 s5, s5, 24 ; GISEL-NEXT: v_mov_b32_e32 v4, v1 -; GISEL-NEXT: v_mov_b32_e32 v1, v8 +; GISEL-NEXT: v_mov_b32_e32 v1, s6 +; GISEL-NEXT: v_mov_b32_e32 v2, s7 +; GISEL-NEXT: v_mov_b32_e32 v3, s4 +; GISEL-NEXT: v_mov_b32_e32 v5, s8 +; GISEL-NEXT: v_mov_b32_e32 v6, s9 +; GISEL-NEXT: v_mov_b32_e32 v7, s5 ; GISEL-NEXT: s_setpc_b64 s[30:31] %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) %ret = load <8 x i8>, ptr addrspace(7) %p @@ -2393,19 +2539,29 @@ define <12 x i8> @load_v12i8(ptr addrspace(8) inreg %buf) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: buffer_load_dwordx3 v[0:2], off, s[16:19], 0 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: v_lshrrev_b32_e32 v13, 8, v0 -; GISEL-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; GISEL-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GISEL-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GISEL-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; GISEL-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GISEL-NEXT: v_lshrrev_b32_e32 v9, 8, v2 -; GISEL-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; GISEL-NEXT: v_lshrrev_b32_e32 v11, 24, v2 +; GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GISEL-NEXT: v_readfirstlane_b32 s5, v1 +; GISEL-NEXT: v_readfirstlane_b32 s6, v2 +; GISEL-NEXT: s_lshr_b32 s7, s4, 8 +; GISEL-NEXT: s_lshr_b32 s8, s4, 16 +; GISEL-NEXT: s_lshr_b32 s4, s4, 24 +; GISEL-NEXT: s_lshr_b32 s9, s5, 8 +; GISEL-NEXT: s_lshr_b32 s10, s5, 16 +; GISEL-NEXT: s_lshr_b32 s5, s5, 24 +; GISEL-NEXT: s_lshr_b32 s11, s6, 8 +; GISEL-NEXT: s_lshr_b32 s12, s6, 16 +; GISEL-NEXT: s_lshr_b32 s6, s6, 24 ; GISEL-NEXT: v_mov_b32_e32 v4, v1 ; GISEL-NEXT: v_mov_b32_e32 v8, v2 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v12 +; GISEL-NEXT: v_mov_b32_e32 v1, s7 +; GISEL-NEXT: v_mov_b32_e32 v2, s8 +; GISEL-NEXT: v_mov_b32_e32 v3, s4 +; GISEL-NEXT: v_mov_b32_e32 v5, s9 +; GISEL-NEXT: v_mov_b32_e32 v6, s10 +; GISEL-NEXT: v_mov_b32_e32 v7, s5 +; GISEL-NEXT: v_mov_b32_e32 v9, s11 +; GISEL-NEXT: v_mov_b32_e32 v10, s12 +; GISEL-NEXT: v_mov_b32_e32 v11, s6 ; GISEL-NEXT: s_setpc_b64 s[30:31] %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) %ret = load <12 x i8>, ptr addrspace(7) %p @@ -2495,24 +2651,37 @@ define <16 x i8> @load_v16i8(ptr addrspace(8) inreg %buf) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: v_lshrrev_b32_e32 v16, 8, v0 -; GISEL-NEXT: v_lshrrev_b32_e32 v17, 16, v0 -; GISEL-NEXT: v_lshrrev_b32_e32 v18, 24, v0 -; GISEL-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GISEL-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; GISEL-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GISEL-NEXT: v_lshrrev_b32_e32 v9, 8, v2 -; GISEL-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; GISEL-NEXT: v_lshrrev_b32_e32 v11, 24, v2 -; GISEL-NEXT: v_lshrrev_b32_e32 v13, 8, v3 -; GISEL-NEXT: v_lshrrev_b32_e32 v14, 16, v3 -; GISEL-NEXT: v_lshrrev_b32_e32 v15, 24, v3 +; GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GISEL-NEXT: v_readfirstlane_b32 s5, v1 +; GISEL-NEXT: v_readfirstlane_b32 s6, v2 +; GISEL-NEXT: v_readfirstlane_b32 s7, v3 +; GISEL-NEXT: s_lshr_b32 s8, s4, 8 +; GISEL-NEXT: s_lshr_b32 s9, s4, 16 +; GISEL-NEXT: s_lshr_b32 s4, s4, 24 +; GISEL-NEXT: s_lshr_b32 s10, s5, 8 +; GISEL-NEXT: s_lshr_b32 s11, s5, 16 +; GISEL-NEXT: s_lshr_b32 s5, s5, 24 +; GISEL-NEXT: s_lshr_b32 s12, s6, 8 +; GISEL-NEXT: s_lshr_b32 s13, s6, 16 +; GISEL-NEXT: s_lshr_b32 s6, s6, 24 +; GISEL-NEXT: s_lshr_b32 s14, s7, 8 +; GISEL-NEXT: s_lshr_b32 s15, s7, 16 +; GISEL-NEXT: s_lshr_b32 s7, s7, 24 ; GISEL-NEXT: v_mov_b32_e32 v4, v1 ; GISEL-NEXT: v_mov_b32_e32 v8, v2 ; GISEL-NEXT: v_mov_b32_e32 v12, v3 -; GISEL-NEXT: v_mov_b32_e32 v1, v16 -; GISEL-NEXT: v_mov_b32_e32 v2, v17 -; GISEL-NEXT: v_mov_b32_e32 v3, v18 +; GISEL-NEXT: v_mov_b32_e32 v1, s8 +; GISEL-NEXT: v_mov_b32_e32 v2, s9 +; GISEL-NEXT: v_mov_b32_e32 v3, s4 +; GISEL-NEXT: v_mov_b32_e32 v5, s10 +; GISEL-NEXT: v_mov_b32_e32 v6, s11 +; GISEL-NEXT: v_mov_b32_e32 v7, s5 +; GISEL-NEXT: v_mov_b32_e32 v9, s12 +; GISEL-NEXT: v_mov_b32_e32 v10, s13 +; GISEL-NEXT: v_mov_b32_e32 v11, s6 +; GISEL-NEXT: v_mov_b32_e32 v13, s14 +; GISEL-NEXT: v_mov_b32_e32 v14, s15 +; GISEL-NEXT: v_mov_b32_e32 v15, s7 ; GISEL-NEXT: s_setpc_b64 s[30:31] %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) %ret = load <16 x i8>, ptr addrspace(7) %p @@ -2629,43 +2798,69 @@ define <32 x i8> @load_v32i8(ptr addrspace(8) inreg %buf) { ; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0 ; GISEL-NEXT: buffer_load_dwordx4 v[16:19], off, s[16:19], 0 offset:16 ; GISEL-NEXT: s_waitcnt vmcnt(1) -; GISEL-NEXT: v_lshrrev_b32_e32 v35, 8, v0 -; GISEL-NEXT: v_lshrrev_b32_e32 v36, 16, v0 -; GISEL-NEXT: v_lshrrev_b32_e32 v37, 24, v0 +; GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GISEL-NEXT: v_readfirstlane_b32 s5, v1 +; GISEL-NEXT: v_readfirstlane_b32 s6, v2 +; GISEL-NEXT: v_readfirstlane_b32 s7, v3 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: v_lshrrev_b32_e32 v32, 8, v16 -; GISEL-NEXT: v_lshrrev_b32_e32 v33, 16, v16 -; GISEL-NEXT: v_lshrrev_b32_e32 v34, 24, v16 -; GISEL-NEXT: v_lshrrev_b32_e32 v5, 8, v1 -; GISEL-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; GISEL-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GISEL-NEXT: v_lshrrev_b32_e32 v9, 8, v2 -; GISEL-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; GISEL-NEXT: v_lshrrev_b32_e32 v11, 24, v2 -; GISEL-NEXT: v_lshrrev_b32_e32 v13, 8, v3 -; GISEL-NEXT: v_lshrrev_b32_e32 v14, 16, v3 -; GISEL-NEXT: v_lshrrev_b32_e32 v15, 24, v3 -; GISEL-NEXT: v_lshrrev_b32_e32 v21, 8, v17 -; GISEL-NEXT: v_lshrrev_b32_e32 v22, 16, v17 -; GISEL-NEXT: v_lshrrev_b32_e32 v23, 24, v17 -; GISEL-NEXT: v_lshrrev_b32_e32 v25, 8, v18 -; GISEL-NEXT: v_lshrrev_b32_e32 v26, 16, v18 -; GISEL-NEXT: v_lshrrev_b32_e32 v27, 24, v18 -; GISEL-NEXT: v_lshrrev_b32_e32 v29, 8, v19 -; GISEL-NEXT: v_lshrrev_b32_e32 v30, 16, v19 -; GISEL-NEXT: v_lshrrev_b32_e32 v31, 24, v19 +; GISEL-NEXT: v_readfirstlane_b32 s8, v16 +; GISEL-NEXT: v_readfirstlane_b32 s9, v17 +; GISEL-NEXT: v_readfirstlane_b32 s10, v18 +; GISEL-NEXT: v_readfirstlane_b32 s11, v19 +; GISEL-NEXT: s_lshr_b32 s12, s4, 8 +; GISEL-NEXT: s_lshr_b32 s13, s4, 16 +; GISEL-NEXT: s_lshr_b32 s4, s4, 24 +; GISEL-NEXT: s_lshr_b32 s14, s5, 8 +; GISEL-NEXT: s_lshr_b32 s15, s5, 16 +; GISEL-NEXT: s_lshr_b32 s5, s5, 24 +; GISEL-NEXT: s_lshr_b32 s16, s6, 8 +; GISEL-NEXT: s_lshr_b32 s17, s6, 16 +; GISEL-NEXT: s_lshr_b32 s6, s6, 24 +; GISEL-NEXT: s_lshr_b32 s18, s7, 8 +; GISEL-NEXT: s_lshr_b32 s19, s7, 16 +; GISEL-NEXT: s_lshr_b32 s7, s7, 24 +; GISEL-NEXT: s_lshr_b32 s20, s8, 8 +; GISEL-NEXT: s_lshr_b32 s21, s8, 16 +; GISEL-NEXT: s_lshr_b32 s8, s8, 24 +; GISEL-NEXT: s_lshr_b32 s22, s9, 8 +; GISEL-NEXT: s_lshr_b32 s23, s9, 16 +; GISEL-NEXT: s_lshr_b32 s9, s9, 24 +; GISEL-NEXT: s_lshr_b32 s24, s10, 8 +; GISEL-NEXT: s_lshr_b32 s25, s10, 16 +; GISEL-NEXT: s_lshr_b32 s10, s10, 24 +; GISEL-NEXT: s_lshr_b32 s26, s11, 8 +; GISEL-NEXT: s_lshr_b32 s27, s11, 16 +; GISEL-NEXT: s_lshr_b32 s11, s11, 24 ; GISEL-NEXT: v_mov_b32_e32 v4, v1 ; GISEL-NEXT: v_mov_b32_e32 v8, v2 ; GISEL-NEXT: v_mov_b32_e32 v12, v3 ; GISEL-NEXT: v_mov_b32_e32 v20, v17 ; GISEL-NEXT: v_mov_b32_e32 v24, v18 ; GISEL-NEXT: v_mov_b32_e32 v28, v19 -; GISEL-NEXT: v_mov_b32_e32 v1, v35 -; GISEL-NEXT: v_mov_b32_e32 v2, v36 -; GISEL-NEXT: v_mov_b32_e32 v3, v37 -; GISEL-NEXT: v_mov_b32_e32 v17, v32 -; GISEL-NEXT: v_mov_b32_e32 v18, v33 -; GISEL-NEXT: v_mov_b32_e32 v19, v34 +; GISEL-NEXT: v_mov_b32_e32 v1, s12 +; GISEL-NEXT: v_mov_b32_e32 v2, s13 +; GISEL-NEXT: v_mov_b32_e32 v3, s4 +; GISEL-NEXT: v_mov_b32_e32 v5, s14 +; GISEL-NEXT: v_mov_b32_e32 v6, s15 +; GISEL-NEXT: v_mov_b32_e32 v7, s5 +; GISEL-NEXT: v_mov_b32_e32 v9, s16 +; GISEL-NEXT: v_mov_b32_e32 v10, s17 +; GISEL-NEXT: v_mov_b32_e32 v11, s6 +; GISEL-NEXT: v_mov_b32_e32 v13, s18 +; GISEL-NEXT: v_mov_b32_e32 v14, s19 +; GISEL-NEXT: v_mov_b32_e32 v15, s7 +; GISEL-NEXT: v_mov_b32_e32 v17, s20 +; GISEL-NEXT: v_mov_b32_e32 v18, s21 +; GISEL-NEXT: v_mov_b32_e32 v19, s8 +; GISEL-NEXT: v_mov_b32_e32 v21, s22 +; GISEL-NEXT: v_mov_b32_e32 v22, s23 +; GISEL-NEXT: v_mov_b32_e32 v23, s9 +; GISEL-NEXT: v_mov_b32_e32 v25, s24 +; GISEL-NEXT: v_mov_b32_e32 v26, s25 +; GISEL-NEXT: v_mov_b32_e32 v27, s10 +; GISEL-NEXT: v_mov_b32_e32 v29, s26 +; GISEL-NEXT: v_mov_b32_e32 v30, s27 +; GISEL-NEXT: v_mov_b32_e32 v31, s11 ; GISEL-NEXT: s_setpc_b64 s[30:31] %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) %ret = load <32 x i8>, ptr addrspace(7) %p @@ -2871,7 +3066,9 @@ define [2 x half] @load_a2f16(ptr addrspace(8) inreg %buf) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: buffer_load_dword v0, off, s[16:19], 0 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GISEL-NEXT: s_lshr_b32 s4, s4, 16 +; GISEL-NEXT: v_mov_b32_e32 v1, s4 ; GISEL-NEXT: s_setpc_b64 s[30:31] %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) %ret = load [2 x half], ptr addrspace(7) %p @@ -2914,6 +3111,14 @@ define [2 x ptr addrspace(1)] @load_a2p1(ptr addrspace(8) inreg %buf) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: buffer_load_dwordx4 v[0:3], off, s[16:19], 0 ; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GISEL-NEXT: v_readfirstlane_b32 s5, v1 +; GISEL-NEXT: v_readfirstlane_b32 s6, v2 +; GISEL-NEXT: v_readfirstlane_b32 s7, v3 +; GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GISEL-NEXT: v_mov_b32_e32 v3, s7 ; GISEL-NEXT: s_setpc_b64 s[30:31] %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) %ret = load [2 x ptr addrspace(1)], ptr addrspace(7) %p @@ -2955,19 +3160,23 @@ define i40 @load_i40(ptr addrspace(8) inreg %buf) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: buffer_load_dword v0, off, s[16:19], 0 ; GISEL-NEXT: buffer_load_ubyte v1, off, s[16:19], 0 offset:4 -; GISEL-NEXT: v_mov_b32_e32 v2, 0xff ; GISEL-NEXT: s_waitcnt vmcnt(1) -; GISEL-NEXT: v_lshrrev_b32_e32 v3, 8, v0 -; GISEL-NEXT: v_lshrrev_b32_e32 v4, 24, v0 -; GISEL-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GISEL-NEXT: v_and_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GISEL-NEXT: v_lshlrev_b16_e32 v4, 8, v4 -; GISEL-NEXT: v_lshlrev_b16_e32 v3, 8, v3 -; GISEL-NEXT: v_or_b32_e32 v2, v2, v4 -; GISEL-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GISEL-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GISEL-NEXT: s_lshr_b32 s5, s4, 8 +; GISEL-NEXT: s_lshr_b32 s6, s4, 16 +; GISEL-NEXT: s_lshr_b32 s7, s4, 24 +; GISEL-NEXT: s_and_b32 s5, s5, 0xff +; GISEL-NEXT: s_and_b32 s6, s6, 0xff +; GISEL-NEXT: s_lshl_b32 s7, s7, 8 +; GISEL-NEXT: s_and_b32 s4, s4, 0xff +; GISEL-NEXT: s_lshl_b32 s5, s5, 8 +; GISEL-NEXT: s_or_b32 s6, s6, s7 +; GISEL-NEXT: s_or_b32 s4, s4, s5 +; GISEL-NEXT: s_and_b32 s5, 0xffff, s6 +; GISEL-NEXT: s_and_b32 s4, 0xffff, s4 +; GISEL-NEXT: s_lshl_b32 s5, s5, 16 +; GISEL-NEXT: s_or_b32 s4, s4, s5 +; GISEL-NEXT: v_mov_b32_e32 v0, s4 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) @@ -3009,6 +3218,12 @@ define i96 @load_i96(ptr addrspace(8) inreg %buf) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: buffer_load_dwordx3 v[0:2], off, s[16:19], 0 ; GISEL-NEXT: s_waitcnt vmcnt(0) +; GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GISEL-NEXT: v_readfirstlane_b32 s5, v1 +; GISEL-NEXT: v_readfirstlane_b32 s6, v2 +; GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GISEL-NEXT: v_mov_b32_e32 v2, s6 ; GISEL-NEXT: s_setpc_b64 s[30:31] %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) %ret = load i96, ptr addrspace(7) %p @@ -3221,7 +3436,9 @@ define <2 x i4> @load_v2i4(ptr addrspace(8) inreg %buf) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: buffer_load_ubyte v0, off, s[16:19], 0 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: v_lshrrev_b32_e32 v1, 4, v0 +; GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GISEL-NEXT: s_lshr_b32 s4, s4, 4 +; GISEL-NEXT: v_mov_b32_e32 v1, s4 ; GISEL-NEXT: s_setpc_b64 s[30:31] %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) %ret = load <2 x i4>, ptr addrspace(7) %p @@ -3279,9 +3496,13 @@ define <4 x i4> @load_v4i4(ptr addrspace(8) inreg %buf) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: buffer_load_ushort v0, off, s[16:19], 0 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: v_lshrrev_b32_e32 v1, 4, v0 -; GISEL-NEXT: v_lshrrev_b32_e32 v2, 8, v0 -; GISEL-NEXT: v_lshrrev_b32_e32 v3, 12, v0 +; GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GISEL-NEXT: s_lshr_b32 s5, s4, 4 +; GISEL-NEXT: s_lshr_b32 s6, s4, 8 +; GISEL-NEXT: s_lshr_b32 s4, s4, 12 +; GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GISEL-NEXT: v_mov_b32_e32 v3, s4 ; GISEL-NEXT: s_setpc_b64 s[30:31] %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) %ret = load <4 x i4>, ptr addrspace(7) %p @@ -3347,13 +3568,21 @@ define <8 x i4> @load_v8i4(ptr addrspace(8) inreg %buf) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: buffer_load_dword v0, off, s[16:19], 0 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: v_lshrrev_b32_e32 v1, 4, v0 -; GISEL-NEXT: v_lshrrev_b32_e32 v2, 8, v0 -; GISEL-NEXT: v_lshrrev_b32_e32 v3, 12, v0 -; GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GISEL-NEXT: v_lshrrev_b32_e32 v5, 20, v0 -; GISEL-NEXT: v_lshrrev_b32_e32 v6, 24, v0 -; GISEL-NEXT: v_lshrrev_b32_e32 v7, 28, v0 +; GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GISEL-NEXT: s_lshr_b32 s5, s4, 4 +; GISEL-NEXT: s_lshr_b32 s6, s4, 8 +; GISEL-NEXT: s_lshr_b32 s7, s4, 12 +; GISEL-NEXT: s_lshr_b32 s8, s4, 16 +; GISEL-NEXT: s_lshr_b32 s9, s4, 20 +; GISEL-NEXT: s_lshr_b32 s10, s4, 24 +; GISEL-NEXT: s_lshr_b32 s4, s4, 28 +; GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GISEL-NEXT: v_mov_b32_e32 v3, s7 +; GISEL-NEXT: v_mov_b32_e32 v4, s8 +; GISEL-NEXT: v_mov_b32_e32 v5, s9 +; GISEL-NEXT: v_mov_b32_e32 v6, s10 +; GISEL-NEXT: v_mov_b32_e32 v7, s4 ; GISEL-NEXT: s_setpc_b64 s[30:31] %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) %ret = load <8 x i4>, ptr addrspace(7) %p @@ -3429,7 +3658,10 @@ define <2 x i6> @load_v2i6(ptr addrspace(8) inreg %buf) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: buffer_load_ushort v0, off, s[16:19], 0 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: v_lshrrev_b16_e32 v1, 6, v0 +; GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GISEL-NEXT: s_and_b32 s4, 0xffff, s4 +; GISEL-NEXT: s_lshr_b32 s4, s4, 6 +; GISEL-NEXT: v_mov_b32_e32 v1, s4 ; GISEL-NEXT: s_setpc_b64 s[30:31] %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) %ret = load <2 x i6>, ptr addrspace(7) %p @@ -3528,9 +3760,13 @@ define <4 x i8> @volatile_load_v4i8(ptr addrspace(8) inreg %buf) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: buffer_load_dword v0, off, s[16:19], 0 glc ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GISEL-NEXT: v_lshrrev_b32_e32 v3, 24, v0 +; GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GISEL-NEXT: s_lshr_b32 s5, s4, 8 +; GISEL-NEXT: s_lshr_b32 s6, s4, 16 +; GISEL-NEXT: s_lshr_b32 s4, s4, 24 +; GISEL-NEXT: v_mov_b32_e32 v1, s5 +; GISEL-NEXT: v_mov_b32_e32 v2, s6 +; GISEL-NEXT: v_mov_b32_e32 v3, s4 ; GISEL-NEXT: s_setpc_b64 s[30:31] %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) %ret = load volatile <4 x i8>, ptr addrspace(7) %p @@ -3593,10 +3829,16 @@ define <6 x i8> @volatile_load_v6i8(ptr addrspace(8) inreg %buf) { ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: buffer_load_ushort v4, off, s[16:19], 0 offset:4 glc ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: v_lshrrev_b32_e32 v1, 8, v0 -; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GISEL-NEXT: v_lshrrev_b32_e32 v3, 24, v0 -; GISEL-NEXT: v_lshrrev_b32_e32 v5, 8, v4 +; GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GISEL-NEXT: v_readfirstlane_b32 s5, v4 +; GISEL-NEXT: s_lshr_b32 s6, s4, 8 +; GISEL-NEXT: s_lshr_b32 s7, s4, 16 +; GISEL-NEXT: s_lshr_b32 s4, s4, 24 +; GISEL-NEXT: s_lshr_b32 s5, s5, 8 +; GISEL-NEXT: v_mov_b32_e32 v1, s6 +; GISEL-NEXT: v_mov_b32_e32 v2, s7 +; GISEL-NEXT: v_mov_b32_e32 v3, s4 +; GISEL-NEXT: v_mov_b32_e32 v5, s5 ; GISEL-NEXT: s_setpc_b64 s[30:31] %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) %ret = load volatile <6 x i8>, ptr addrspace(7) %p