AMDGPU/GlobalISel: Regbanklegalize for G_CONCAT_VECTORS (#171471)

RegBankLegalize handles G_CONCAT_VECTORS using the trivial mapping helper,
which assigns the same register bank (vgpr or sgpr) to all operands.
This uncovers multiple codegen and regbank-combiner regressions related to
looking through sgpr-to-vgpr copies.
Skip regbankselect-concat-vector.mir, since agprs are not yet supported.
This commit is contained in:
Petar Avramovic 2025-12-15 10:37:40 +01:00 committed by GitHub
parent f3e508ceec
commit f024026a21
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 525 additions and 382 deletions

View File

@ -443,7 +443,8 @@ bool AMDGPURegBankLegalize::runOnMachineFunction(MachineFunction &MF) {
// Opcodes that support pretty much all combinations of reg banks and LLTs
// (except S1). There is no point in writing rules for them.
if (Opc == AMDGPU::G_BUILD_VECTOR || Opc == AMDGPU::G_UNMERGE_VALUES ||
Opc == AMDGPU::G_MERGE_VALUES || Opc == AMDGPU::G_BITCAST) {
Opc == AMDGPU::G_MERGE_VALUES || Opc == AMDGPU::G_CONCAT_VECTORS ||
Opc == AMDGPU::G_BITCAST) {
RBLHelper.applyMappingTrivial(*MI);
continue;
}

View File

@ -1,6 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
; FIXME: codegen regression, related to:
; - looking through s16 sgpr to vgpr copy
; on G_BUILD_VECTOR with G_IMPLICIT_DEF input
define void @add_v3i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrspace(1) %ptr2) {
; GFX8-LABEL: add_v3i16:
@ -40,25 +44,30 @@ define void @add_v3i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrs
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_ushort v6, v[0:1], off
; GFX9-NEXT: global_load_ushort v7, v[2:3], off
; GFX9-NEXT: global_load_ushort v8, v[2:3], off offset:4
; GFX9-NEXT: global_load_ushort v9, v[0:1], off offset:4
; GFX9-NEXT: global_load_ushort v7, v[0:1], off offset:4
; GFX9-NEXT: global_load_ushort v8, v[2:3], off
; GFX9-NEXT: global_load_ushort v9, v[2:3], off offset:4
; GFX9-NEXT: global_load_ushort v10, v[0:1], off offset:2
; GFX9-NEXT: global_load_ushort v11, v[2:3], off offset:2
; GFX9-NEXT: s_waitcnt vmcnt(5)
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v6
; GFX9-NEXT: s_waitcnt vmcnt(4)
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v7
; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v8
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_pk_add_u16 v2, v9, v8
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v9
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshl_or_b32 v0, v10, 16, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshl_or_b32 v1, v11, 16, v1
; GFX9-NEXT: v_pk_add_u16 v0, v0, v1
; GFX9-NEXT: v_lshl_or_b32 v2, v11, 16, v2
; GFX9-NEXT: v_lshl_or_b32 v1, s4, 16, v1
; GFX9-NEXT: v_lshl_or_b32 v3, s4, 16, v3
; GFX9-NEXT: v_pk_add_u16 v0, v0, v2
; GFX9-NEXT: v_pk_add_u16 v1, v1, v3
; GFX9-NEXT: global_store_short v[4:5], v0, off
; GFX9-NEXT: global_store_short_d16_hi v[4:5], v0, off offset:2
; GFX9-NEXT: global_store_short v[4:5], v2, off offset:4
; GFX9-NEXT: global_store_short v[4:5], v1, off offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
%a = load <3 x i16>, ptr addrspace(1) %ptra, align 4
@ -206,10 +215,10 @@ define void @add_v5i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrs
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_ushort v6, v[0:1], off
; GFX9-NEXT: global_load_ushort v7, v[0:1], off offset:4
; GFX9-NEXT: global_load_ushort v8, v[2:3], off
; GFX9-NEXT: global_load_ushort v9, v[2:3], off offset:4
; GFX9-NEXT: global_load_ushort v10, v[2:3], off offset:8
; GFX9-NEXT: global_load_ushort v11, v[0:1], off offset:8
; GFX9-NEXT: global_load_ushort v8, v[0:1], off offset:8
; GFX9-NEXT: global_load_ushort v9, v[2:3], off
; GFX9-NEXT: global_load_ushort v10, v[2:3], off offset:4
; GFX9-NEXT: global_load_ushort v11, v[2:3], off offset:8
; GFX9-NEXT: global_load_ushort v12, v[0:1], off offset:2
; GFX9-NEXT: global_load_ushort v13, v[0:1], off offset:6
; GFX9-NEXT: global_load_ushort v14, v[2:3], off offset:2
@ -222,23 +231,28 @@ define void @add_v5i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrs
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v8
; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v9
; GFX9-NEXT: s_waitcnt vmcnt(5)
; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v10
; GFX9-NEXT: s_waitcnt vmcnt(4)
; GFX9-NEXT: v_pk_add_u16 v6, v11, v10
; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v11
; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_lshl_or_b32 v0, v12, 16, v0
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshl_or_b32 v1, v13, 16, v1
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshl_or_b32 v2, v14, 16, v2
; GFX9-NEXT: v_lshl_or_b32 v3, v14, 16, v3
; GFX9-NEXT: v_lshl_or_b32 v2, s4, 16, v2
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshl_or_b32 v3, v15, 16, v3
; GFX9-NEXT: v_pk_add_u16 v0, v0, v2
; GFX9-NEXT: v_pk_add_u16 v1, v1, v3
; GFX9-NEXT: v_lshl_or_b32 v6, v15, 16, v6
; GFX9-NEXT: v_lshl_or_b32 v7, s4, 16, v7
; GFX9-NEXT: v_pk_add_u16 v0, v0, v3
; GFX9-NEXT: v_pk_add_u16 v1, v1, v6
; GFX9-NEXT: v_pk_add_u16 v2, v2, v7
; GFX9-NEXT: global_store_short v[4:5], v0, off
; GFX9-NEXT: global_store_short_d16_hi v[4:5], v0, off offset:2
; GFX9-NEXT: global_store_short v[4:5], v1, off offset:4
; GFX9-NEXT: global_store_short_d16_hi v[4:5], v1, off offset:6
; GFX9-NEXT: global_store_short v[4:5], v6, off offset:8
; GFX9-NEXT: global_store_short v[4:5], v2, off offset:8
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
%a = load <5 x i16>, ptr addrspace(1) %ptra, align 4
@ -421,11 +435,11 @@ define void @addv_7i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrs
; GFX9-NEXT: global_load_ushort v6, v[0:1], off
; GFX9-NEXT: global_load_ushort v7, v[0:1], off offset:4
; GFX9-NEXT: global_load_ushort v8, v[0:1], off offset:8
; GFX9-NEXT: global_load_ushort v9, v[2:3], off
; GFX9-NEXT: global_load_ushort v10, v[2:3], off offset:4
; GFX9-NEXT: global_load_ushort v11, v[2:3], off offset:8
; GFX9-NEXT: global_load_ushort v12, v[2:3], off offset:12
; GFX9-NEXT: global_load_ushort v13, v[0:1], off offset:12
; GFX9-NEXT: global_load_ushort v9, v[0:1], off offset:12
; GFX9-NEXT: global_load_ushort v10, v[2:3], off
; GFX9-NEXT: global_load_ushort v11, v[2:3], off offset:4
; GFX9-NEXT: global_load_ushort v12, v[2:3], off offset:8
; GFX9-NEXT: global_load_ushort v13, v[2:3], off offset:12
; GFX9-NEXT: global_load_ushort v14, v[0:1], off offset:2
; GFX9-NEXT: global_load_ushort v15, v[0:1], off offset:6
; GFX9-NEXT: global_load_ushort v16, v[0:1], off offset:10
@ -444,8 +458,10 @@ define void @addv_7i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrs
; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v10
; GFX9-NEXT: s_waitcnt vmcnt(8)
; GFX9-NEXT: v_and_b32_e32 v7, 0xffff, v11
; GFX9-NEXT: s_waitcnt vmcnt(7)
; GFX9-NEXT: v_and_b32_e32 v8, 0xffff, v12
; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_pk_add_u16 v8, v13, v12
; GFX9-NEXT: v_and_b32_e32 v9, 0xffff, v13
; GFX9-NEXT: s_waitcnt vmcnt(5)
; GFX9-NEXT: v_lshl_or_b32 v0, v14, 16, v0
; GFX9-NEXT: s_waitcnt vmcnt(4)
@ -453,21 +469,24 @@ define void @addv_7i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrs
; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_lshl_or_b32 v2, v16, 16, v2
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshl_or_b32 v3, v17, 16, v3
; GFX9-NEXT: v_lshl_or_b32 v6, v17, 16, v6
; GFX9-NEXT: v_lshl_or_b32 v3, s4, 16, v3
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshl_or_b32 v6, v18, 16, v6
; GFX9-NEXT: v_lshl_or_b32 v7, v18, 16, v7
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshl_or_b32 v7, v19, 16, v7
; GFX9-NEXT: v_pk_add_u16 v0, v0, v3
; GFX9-NEXT: v_pk_add_u16 v1, v1, v6
; GFX9-NEXT: v_pk_add_u16 v2, v2, v7
; GFX9-NEXT: v_lshl_or_b32 v8, v19, 16, v8
; GFX9-NEXT: v_lshl_or_b32 v9, s4, 16, v9
; GFX9-NEXT: v_pk_add_u16 v0, v0, v6
; GFX9-NEXT: v_pk_add_u16 v1, v1, v7
; GFX9-NEXT: v_pk_add_u16 v2, v2, v8
; GFX9-NEXT: v_pk_add_u16 v3, v3, v9
; GFX9-NEXT: global_store_short v[4:5], v0, off
; GFX9-NEXT: global_store_short_d16_hi v[4:5], v0, off offset:2
; GFX9-NEXT: global_store_short v[4:5], v1, off offset:4
; GFX9-NEXT: global_store_short_d16_hi v[4:5], v1, off offset:6
; GFX9-NEXT: global_store_short v[4:5], v2, off offset:8
; GFX9-NEXT: global_store_short_d16_hi v[4:5], v2, off offset:10
; GFX9-NEXT: global_store_short v[4:5], v8, off offset:12
; GFX9-NEXT: global_store_short v[4:5], v3, off offset:12
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
%a = load <7 x i16>, ptr addrspace(1) %ptra, align 4
@ -542,17 +561,22 @@ define void @add_v9i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrs
; GFX9-LABEL: add_v9i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx4 v[6:9], v[2:3], off
; GFX9-NEXT: global_load_dwordx4 v[10:13], v[0:1], off
; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off
; GFX9-NEXT: global_load_ushort v14, v[0:1], off offset:16
; GFX9-NEXT: global_load_ushort v15, v[2:3], off offset:16
; GFX9-NEXT: global_load_dwordx4 v[10:13], v[2:3], off
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_pk_add_u16 v0, v10, v6
; GFX9-NEXT: v_pk_add_u16 v1, v11, v7
; GFX9-NEXT: v_pk_add_u16 v2, v12, v8
; GFX9-NEXT: v_pk_add_u16 v3, v13, v9
; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_add_u16 v6, v14, v15
; GFX9-NEXT: v_pk_add_u16 v0, v6, v10
; GFX9-NEXT: v_pk_add_u16 v1, v7, v11
; GFX9-NEXT: v_pk_add_u16 v2, v8, v12
; GFX9-NEXT: v_pk_add_u16 v3, v9, v13
; GFX9-NEXT: v_lshl_or_b32 v6, s4, 16, v14
; GFX9-NEXT: v_lshl_or_b32 v7, s4, 16, v15
; GFX9-NEXT: v_pk_add_u16 v6, v6, v7
; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
; GFX9-NEXT: global_store_short v[4:5], v6, off offset:16
; GFX9-NEXT: s_waitcnt vmcnt(0)
@ -716,33 +740,40 @@ define void @add_v11i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addr
; GFX9-LABEL: add_v11i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off
; GFX9-NEXT: global_load_ushort v14, v[0:1], off offset:16
; GFX9-NEXT: global_load_ushort v15, v[2:3], off offset:16
; GFX9-NEXT: global_load_dwordx4 v[10:13], v[2:3], off
; GFX9-NEXT: global_load_ushort v16, v[2:3], off offset:20
; GFX9-NEXT: global_load_ushort v17, v[0:1], off offset:20
; GFX9-NEXT: global_load_ushort v15, v[0:1], off offset:20
; GFX9-NEXT: global_load_dwordx4 v[6:9], v[2:3], off
; GFX9-NEXT: global_load_ushort v16, v[2:3], off offset:16
; GFX9-NEXT: global_load_ushort v17, v[2:3], off offset:20
; GFX9-NEXT: global_load_dwordx4 v[10:13], v[0:1], off
; GFX9-NEXT: global_load_ushort v18, v[0:1], off offset:18
; GFX9-NEXT: global_load_ushort v19, v[2:3], off offset:18
; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: s_waitcnt vmcnt(7)
; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14
; GFX9-NEXT: s_waitcnt vmcnt(5)
; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15
; GFX9-NEXT: s_waitcnt vmcnt(4)
; GFX9-NEXT: v_pk_add_u16 v0, v6, v10
; GFX9-NEXT: v_pk_add_u16 v1, v7, v11
; GFX9-NEXT: v_pk_add_u16 v2, v8, v12
; GFX9-NEXT: v_pk_add_u16 v3, v9, v13
; GFX9-NEXT: v_and_b32_e32 v16, 0xffff, v16
; GFX9-NEXT: s_waitcnt vmcnt(3)
; GFX9-NEXT: v_and_b32_e32 v17, 0xffff, v17
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_pk_add_u16 v0, v10, v6
; GFX9-NEXT: v_pk_add_u16 v1, v11, v7
; GFX9-NEXT: v_pk_add_u16 v2, v12, v8
; GFX9-NEXT: v_pk_add_u16 v3, v13, v9
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshl_or_b32 v7, v18, 16, v14
; GFX9-NEXT: v_lshl_or_b32 v6, v18, 16, v14
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshl_or_b32 v8, v19, 16, v15
; GFX9-NEXT: v_lshl_or_b32 v8, v19, 16, v16
; GFX9-NEXT: v_lshl_or_b32 v7, s4, 16, v15
; GFX9-NEXT: v_lshl_or_b32 v9, s4, 16, v17
; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
; GFX9-NEXT: v_pk_add_u16 v6, v17, v16
; GFX9-NEXT: v_pk_add_u16 v0, v7, v8
; GFX9-NEXT: s_nop 0
; GFX9-NEXT: v_pk_add_u16 v0, v6, v8
; GFX9-NEXT: v_pk_add_u16 v1, v7, v9
; GFX9-NEXT: global_store_short v[4:5], v0, off offset:16
; GFX9-NEXT: global_store_short_d16_hi v[4:5], v0, off offset:18
; GFX9-NEXT: global_store_short v[4:5], v6, off offset:20
; GFX9-NEXT: global_store_short v[4:5], v1, off offset:20
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
%a = load <11 x i16>, ptr addrspace(1) %ptra, align 4

View File

@ -1,8 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
; FIXME: regbankcombiner regression, related to:
; - looking through copy and splitting G_CONSTANT i64 to two i32 constants
; - s_xor_b32 instead of s_not_b32, missing s16 pattern
define amdgpu_ps i32 @s_andn2_i32(i32 inreg %src0, i32 inreg %src1) {
; GCN-LABEL: s_andn2_i32:
@ -241,15 +245,19 @@ define i64 @v_andn2_i64(i64 %src0, i64 %src1) {
; GCN-LABEL: v_andn2_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_bfi_b32 v0, v2, 0, v0
; GCN-NEXT: v_bfi_b32 v1, v3, 0, v1
; GCN-NEXT: v_xor_b32_e32 v2, -1, v2
; GCN-NEXT: v_xor_b32_e32 v3, -1, v3
; GCN-NEXT: v_and_b32_e32 v0, v0, v2
; GCN-NEXT: v_and_b32_e32 v1, v1, v3
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_andn2_i64:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT: v_bfi_b32 v0, v2, 0, v0
; GFX10PLUS-NEXT: v_bfi_b32 v1, v3, 0, v1
; GFX10PLUS-NEXT: v_xor_b32_e32 v2, -1, v2
; GFX10PLUS-NEXT: v_xor_b32_e32 v3, -1, v3
; GFX10PLUS-NEXT: v_and_b32_e32 v0, v0, v2
; GFX10PLUS-NEXT: v_and_b32_e32 v1, v1, v3
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
%not.src1 = xor i64 %src1, -1
%and = and i64 %src0, %not.src1
@ -259,14 +267,18 @@ define i64 @v_andn2_i64(i64 %src0, i64 %src1) {
define amdgpu_ps <2 x float> @v_andn2_i64_sv(i64 inreg %src0, i64 %src1) {
; GCN-LABEL: v_andn2_i64_sv:
; GCN: ; %bb.0:
; GCN-NEXT: v_bfi_b32 v0, v0, 0, s2
; GCN-NEXT: v_bfi_b32 v1, v1, 0, s3
; GCN-NEXT: v_xor_b32_e32 v0, -1, v0
; GCN-NEXT: v_xor_b32_e32 v1, -1, v1
; GCN-NEXT: v_and_b32_e32 v0, s2, v0
; GCN-NEXT: v_and_b32_e32 v1, s3, v1
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: v_andn2_i64_sv:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: v_bfi_b32 v0, v0, 0, s2
; GFX10PLUS-NEXT: v_bfi_b32 v1, v1, 0, s3
; GFX10PLUS-NEXT: v_xor_b32_e32 v0, -1, v0
; GFX10PLUS-NEXT: v_xor_b32_e32 v1, -1, v1
; GFX10PLUS-NEXT: v_and_b32_e32 v0, s2, v0
; GFX10PLUS-NEXT: v_and_b32_e32 v1, s3, v1
; GFX10PLUS-NEXT: ; return to shader part epilog
%not.src1 = xor i64 %src1, -1
%and = and i64 %src0, %not.src1
@ -278,16 +290,28 @@ define amdgpu_ps <2 x float> @v_andn2_i64_vs(i64 %src0, i64 inreg %src1) {
; GCN-LABEL: v_andn2_i64_vs:
; GCN: ; %bb.0:
; GCN-NEXT: s_not_b64 s[0:1], s[2:3]
; GCN-NEXT: v_and_b32_e32 v0, s0, v0
; GCN-NEXT: v_and_b32_e32 v1, s1, v1
; GCN-NEXT: v_mov_b32_e32 v3, s1
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: v_and_b32_e32 v0, v0, v2
; GCN-NEXT: v_and_b32_e32 v1, v1, v3
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: v_andn2_i64_vs:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_not_b64 s[0:1], s[2:3]
; GFX10PLUS-NEXT: v_and_b32_e32 v0, s0, v0
; GFX10PLUS-NEXT: v_and_b32_e32 v1, s1, v1
; GFX10PLUS-NEXT: ; return to shader part epilog
; GFX10-LABEL: v_andn2_i64_vs:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_not_b64 s[0:1], s[2:3]
; GFX10-NEXT: v_mov_b32_e32 v3, s1
; GFX10-NEXT: v_mov_b32_e32 v2, s0
; GFX10-NEXT: v_and_b32_e32 v1, v1, v3
; GFX10-NEXT: v_and_b32_e32 v0, v0, v2
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: v_andn2_i64_vs:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_not_b64 s[0:1], s[2:3]
; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: v_and_b32_e32 v1, v1, v3
; GFX11-NEXT: v_and_b32_e32 v0, v0, v2
; GFX11-NEXT: ; return to shader part epilog
%not.src1 = xor i64 %src1, -1
%and = and i64 %src0, %not.src1
%cast = bitcast i64 %and to <2 x float>
@ -377,20 +401,20 @@ define amdgpu_ps i16 @s_andn2_i16_commute(i16 inreg %src0, i16 inreg %src1) {
define amdgpu_ps { i16, i16 } @s_andn2_i16_multi_use(i16 inreg %src0, i16 inreg %src1) {
; GCN-LABEL: s_andn2_i16_multi_use:
; GCN: ; %bb.0:
; GCN-NEXT: s_not_b32 s1, s3
; GCN-NEXT: s_xor_b32 s1, s3, -1
; GCN-NEXT: s_andn2_b32 s0, s2, s3
; GCN-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_andn2_i16_multi_use:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_andn2_b32 s0, s2, s3
; GFX10-NEXT: s_not_b32 s1, s3
; GFX10-NEXT: s_xor_b32 s1, s3, -1
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_andn2_i16_multi_use:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_and_not1_b32 s0, s2, s3
; GFX11-NEXT: s_not_b32 s1, s3
; GFX11-NEXT: s_xor_b32 s1, s3, -1
; GFX11-NEXT: ; return to shader part epilog
%not.src1 = xor i16 %src1, -1
%and = and i16 %src0, %not.src1
@ -468,14 +492,14 @@ define amdgpu_ps float @v_andn2_i16_sv(i16 inreg %src0, i16 %src1) {
define amdgpu_ps float @v_andn2_i16_vs(i16 %src0, i16 inreg %src1) {
; GCN-LABEL: v_andn2_i16_vs:
; GCN: ; %bb.0:
; GCN-NEXT: s_not_b32 s0, s2
; GCN-NEXT: s_xor_b32 s0, s2, -1
; GCN-NEXT: v_and_b32_e32 v0, s0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: v_andn2_i16_vs:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_not_b32 s0, s2
; GFX10PLUS-NEXT: s_xor_b32 s0, s2, -1
; GFX10PLUS-NEXT: v_and_b32_e32 v0, s0, v0
; GFX10PLUS-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX10PLUS-NEXT: ; return to shader part epilog

View File

@ -1,8 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
; FIXME: regbankcombiner regression, related to:
; - looking through copy and splitting G_CONSTANT i64 to two i32 constants
; - s_xor_b32 instead of s_not_b32, missing s16 pattern
define amdgpu_ps i32 @s_orn2_i32(i32 inreg %src0, i32 inreg %src1) {
; GCN-LABEL: s_orn2_i32:
@ -241,15 +245,19 @@ define i64 @v_orn2_i64(i64 %src0, i64 %src1) {
; GCN-LABEL: v_orn2_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_bfi_b32 v0, v2, v0, -1
; GCN-NEXT: v_bfi_b32 v1, v3, v1, -1
; GCN-NEXT: v_xor_b32_e32 v2, -1, v2
; GCN-NEXT: v_xor_b32_e32 v3, -1, v3
; GCN-NEXT: v_or_b32_e32 v0, v0, v2
; GCN-NEXT: v_or_b32_e32 v1, v1, v3
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_orn2_i64:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT: v_bfi_b32 v0, v2, v0, -1
; GFX10PLUS-NEXT: v_bfi_b32 v1, v3, v1, -1
; GFX10PLUS-NEXT: v_xor_b32_e32 v2, -1, v2
; GFX10PLUS-NEXT: v_xor_b32_e32 v3, -1, v3
; GFX10PLUS-NEXT: v_or_b32_e32 v0, v0, v2
; GFX10PLUS-NEXT: v_or_b32_e32 v1, v1, v3
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
%not.src1 = xor i64 %src1, -1
%or = or i64 %src0, %not.src1
@ -259,14 +267,18 @@ define i64 @v_orn2_i64(i64 %src0, i64 %src1) {
define amdgpu_ps <2 x float> @v_orn2_i64_sv(i64 inreg %src0, i64 %src1) {
; GCN-LABEL: v_orn2_i64_sv:
; GCN: ; %bb.0:
; GCN-NEXT: v_bfi_b32 v0, v0, s2, -1
; GCN-NEXT: v_bfi_b32 v1, v1, s3, -1
; GCN-NEXT: v_xor_b32_e32 v0, -1, v0
; GCN-NEXT: v_xor_b32_e32 v1, -1, v1
; GCN-NEXT: v_or_b32_e32 v0, s2, v0
; GCN-NEXT: v_or_b32_e32 v1, s3, v1
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: v_orn2_i64_sv:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: v_bfi_b32 v0, v0, s2, -1
; GFX10PLUS-NEXT: v_bfi_b32 v1, v1, s3, -1
; GFX10PLUS-NEXT: v_xor_b32_e32 v0, -1, v0
; GFX10PLUS-NEXT: v_xor_b32_e32 v1, -1, v1
; GFX10PLUS-NEXT: v_or_b32_e32 v0, s2, v0
; GFX10PLUS-NEXT: v_or_b32_e32 v1, s3, v1
; GFX10PLUS-NEXT: ; return to shader part epilog
%not.src1 = xor i64 %src1, -1
%or = or i64 %src0, %not.src1
@ -278,16 +290,28 @@ define amdgpu_ps <2 x float> @v_orn2_i64_vs(i64 %src0, i64 inreg %src1) {
; GCN-LABEL: v_orn2_i64_vs:
; GCN: ; %bb.0:
; GCN-NEXT: s_not_b64 s[0:1], s[2:3]
; GCN-NEXT: v_or_b32_e32 v0, s0, v0
; GCN-NEXT: v_or_b32_e32 v1, s1, v1
; GCN-NEXT: v_mov_b32_e32 v3, s1
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: v_or_b32_e32 v0, v0, v2
; GCN-NEXT: v_or_b32_e32 v1, v1, v3
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: v_orn2_i64_vs:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_not_b64 s[0:1], s[2:3]
; GFX10PLUS-NEXT: v_or_b32_e32 v0, s0, v0
; GFX10PLUS-NEXT: v_or_b32_e32 v1, s1, v1
; GFX10PLUS-NEXT: ; return to shader part epilog
; GFX10-LABEL: v_orn2_i64_vs:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_not_b64 s[0:1], s[2:3]
; GFX10-NEXT: v_mov_b32_e32 v3, s1
; GFX10-NEXT: v_mov_b32_e32 v2, s0
; GFX10-NEXT: v_or_b32_e32 v1, v1, v3
; GFX10-NEXT: v_or_b32_e32 v0, v0, v2
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: v_orn2_i64_vs:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_not_b64 s[0:1], s[2:3]
; GFX11-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: v_or_b32_e32 v1, v1, v3
; GFX11-NEXT: v_or_b32_e32 v0, v0, v2
; GFX11-NEXT: ; return to shader part epilog
%not.src1 = xor i64 %src1, -1
%or = or i64 %src0, %not.src1
%cast = bitcast i64 %or to <2 x float>
@ -377,20 +401,20 @@ define amdgpu_ps i16 @s_orn2_i16_commute(i16 inreg %src0, i16 inreg %src1) {
define amdgpu_ps { i16, i16 } @s_orn2_i16_multi_use(i16 inreg %src0, i16 inreg %src1) {
; GCN-LABEL: s_orn2_i16_multi_use:
; GCN: ; %bb.0:
; GCN-NEXT: s_not_b32 s1, s3
; GCN-NEXT: s_xor_b32 s1, s3, -1
; GCN-NEXT: s_orn2_b32 s0, s2, s3
; GCN-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_orn2_i16_multi_use:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_orn2_b32 s0, s2, s3
; GFX10-NEXT: s_not_b32 s1, s3
; GFX10-NEXT: s_xor_b32 s1, s3, -1
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_orn2_i16_multi_use:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_or_not1_b32 s0, s2, s3
; GFX11-NEXT: s_not_b32 s1, s3
; GFX11-NEXT: s_xor_b32 s1, s3, -1
; GFX11-NEXT: ; return to shader part epilog
%not.src1 = xor i16 %src1, -1
%or = or i16 %src0, %not.src1
@ -468,14 +492,14 @@ define amdgpu_ps float @v_orn2_i16_sv(i16 inreg %src0, i16 %src1) {
define amdgpu_ps float @v_orn2_i16_vs(i16 %src0, i16 inreg %src1) {
; GCN-LABEL: v_orn2_i16_vs:
; GCN: ; %bb.0:
; GCN-NEXT: s_not_b32 s0, s2
; GCN-NEXT: s_xor_b32 s0, s2, -1
; GCN-NEXT: v_or_b32_e32 v0, s0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: v_orn2_i16_vs:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_not_b32 s0, s2
; GFX10PLUS-NEXT: s_xor_b32 s0, s2, -1
; GFX10PLUS-NEXT: v_or_b32_e32 v0, s0, v0
; GFX10PLUS-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX10PLUS-NEXT: ; return to shader part epilog

View File

@ -1,10 +1,13 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GCN,GFX7 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck -check-prefixes=GCN,GFX8 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX900 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck -check-prefixes=GCN,GFX906 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10 %s
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GCN,GFX7 %s
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck -check-prefixes=GCN,GFX8 %s
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX900 %s
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck -check-prefixes=GCN,GFX906 %s
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10 %s
; FIXME: regbankcombiner regression, related to:
; - looking through copy and splitting G_CONSTANT i64 to two i32 constants
define amdgpu_ps i32 @scalar_xnor_i32_one_use(i32 inreg %a, i32 inreg %b) {
; GCN-LABEL: scalar_xnor_i32_one_use:
@ -221,45 +224,20 @@ entry:
}
define i64 @vector_xnor_i64_one_use(i64 %a, i64 %b) {
; GFX7-LABEL: vector_xnor_i64_one_use:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_xor_b32_e32 v0, v0, v2
; GFX7-NEXT: v_xor_b32_e32 v1, v1, v3
; GFX7-NEXT: v_not_b32_e32 v0, v0
; GFX7-NEXT: v_not_b32_e32 v1, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: vector_xnor_i64_one_use:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_xor_b32_e32 v0, v0, v2
; GFX8-NEXT: v_xor_b32_e32 v1, v1, v3
; GFX8-NEXT: v_not_b32_e32 v0, v0
; GFX8-NEXT: v_not_b32_e32 v1, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX900-LABEL: vector_xnor_i64_one_use:
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX900-NEXT: v_xor_b32_e32 v0, v0, v2
; GFX900-NEXT: v_xor_b32_e32 v1, v1, v3
; GFX900-NEXT: v_not_b32_e32 v0, v0
; GFX900-NEXT: v_not_b32_e32 v1, v1
; GFX900-NEXT: s_setpc_b64 s[30:31]
;
; GFX906-LABEL: vector_xnor_i64_one_use:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX906-NEXT: v_xnor_b32_e32 v0, v0, v2
; GFX906-NEXT: v_xnor_b32_e32 v1, v1, v3
; GFX906-NEXT: s_setpc_b64 s[30:31]
; GCN-LABEL: vector_xnor_i64_one_use:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_xor_b32_e32 v0, v0, v2
; GCN-NEXT: v_xor_b32_e32 v1, v1, v3
; GCN-NEXT: v_xor_b32_e32 v0, -1, v0
; GCN-NEXT: v_xor_b32_e32 v1, -1, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: vector_xnor_i64_one_use:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_xnor_b32_e32 v0, v0, v2
; GFX10-NEXT: v_xnor_b32_e32 v1, v1, v3
; GFX10-NEXT: v_xor3_b32 v0, v0, v2, -1
; GFX10-NEXT: v_xor3_b32 v1, v1, v3, -1
; GFX10-NEXT: s_setpc_b64 s[30:31]
entry:
%xor = xor i64 %a, %b
@ -341,8 +319,8 @@ define amdgpu_ps <2 x float> @xnor_i64_s_v_one_use(i64 inreg %a, i64 %b64) {
; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 29
; GFX7-NEXT: v_xor_b32_e32 v0, s0, v0
; GFX7-NEXT: v_xor_b32_e32 v1, s1, v1
; GFX7-NEXT: v_not_b32_e32 v0, v0
; GFX7-NEXT: v_not_b32_e32 v1, v1
; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0
; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: xnor_i64_s_v_one_use:
@ -350,8 +328,8 @@ define amdgpu_ps <2 x float> @xnor_i64_s_v_one_use(i64 inreg %a, i64 %b64) {
; GFX8-NEXT: v_lshlrev_b64 v[0:1], 29, v[0:1]
; GFX8-NEXT: v_xor_b32_e32 v0, s0, v0
; GFX8-NEXT: v_xor_b32_e32 v1, s1, v1
; GFX8-NEXT: v_not_b32_e32 v0, v0
; GFX8-NEXT: v_not_b32_e32 v1, v1
; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0
; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1
; GFX8-NEXT: ; return to shader part epilog
;
; GFX900-LABEL: xnor_i64_s_v_one_use:
@ -359,22 +337,24 @@ define amdgpu_ps <2 x float> @xnor_i64_s_v_one_use(i64 inreg %a, i64 %b64) {
; GFX900-NEXT: v_lshlrev_b64 v[0:1], 29, v[0:1]
; GFX900-NEXT: v_xor_b32_e32 v0, s0, v0
; GFX900-NEXT: v_xor_b32_e32 v1, s1, v1
; GFX900-NEXT: v_not_b32_e32 v0, v0
; GFX900-NEXT: v_not_b32_e32 v1, v1
; GFX900-NEXT: v_xor_b32_e32 v0, -1, v0
; GFX900-NEXT: v_xor_b32_e32 v1, -1, v1
; GFX900-NEXT: ; return to shader part epilog
;
; GFX906-LABEL: xnor_i64_s_v_one_use:
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: v_lshlrev_b64 v[0:1], 29, v[0:1]
; GFX906-NEXT: v_xnor_b32_e32 v0, s0, v0
; GFX906-NEXT: v_xnor_b32_e32 v1, s1, v1
; GFX906-NEXT: v_xor_b32_e32 v0, s0, v0
; GFX906-NEXT: v_xor_b32_e32 v1, s1, v1
; GFX906-NEXT: v_xor_b32_e32 v0, -1, v0
; GFX906-NEXT: v_xor_b32_e32 v1, -1, v1
; GFX906-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: xnor_i64_s_v_one_use:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: v_lshlrev_b64 v[0:1], 29, v[0:1]
; GFX10-NEXT: v_xnor_b32_e32 v0, s0, v0
; GFX10-NEXT: v_xnor_b32_e32 v1, s1, v1
; GFX10-NEXT: v_xor3_b32 v0, s0, v0, -1
; GFX10-NEXT: v_xor3_b32 v1, s1, v1, -1
; GFX10-NEXT: ; return to shader part epilog
entry:
%b = shl i64 %b64, 29
@ -390,8 +370,8 @@ define amdgpu_ps <2 x float> @xnor_i64_v_s_one_use(i64 inreg %a, i64 %b64) {
; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 29
; GFX7-NEXT: v_xor_b32_e32 v0, s0, v0
; GFX7-NEXT: v_xor_b32_e32 v1, s1, v1
; GFX7-NEXT: v_not_b32_e32 v0, v0
; GFX7-NEXT: v_not_b32_e32 v1, v1
; GFX7-NEXT: v_xor_b32_e32 v0, -1, v0
; GFX7-NEXT: v_xor_b32_e32 v1, -1, v1
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: xnor_i64_v_s_one_use:
@ -399,8 +379,8 @@ define amdgpu_ps <2 x float> @xnor_i64_v_s_one_use(i64 inreg %a, i64 %b64) {
; GFX8-NEXT: v_lshlrev_b64 v[0:1], 29, v[0:1]
; GFX8-NEXT: v_xor_b32_e32 v0, s0, v0
; GFX8-NEXT: v_xor_b32_e32 v1, s1, v1
; GFX8-NEXT: v_not_b32_e32 v0, v0
; GFX8-NEXT: v_not_b32_e32 v1, v1
; GFX8-NEXT: v_xor_b32_e32 v0, -1, v0
; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1
; GFX8-NEXT: ; return to shader part epilog
;
; GFX900-LABEL: xnor_i64_v_s_one_use:
@ -408,22 +388,24 @@ define amdgpu_ps <2 x float> @xnor_i64_v_s_one_use(i64 inreg %a, i64 %b64) {
; GFX900-NEXT: v_lshlrev_b64 v[0:1], 29, v[0:1]
; GFX900-NEXT: v_xor_b32_e32 v0, s0, v0
; GFX900-NEXT: v_xor_b32_e32 v1, s1, v1
; GFX900-NEXT: v_not_b32_e32 v0, v0
; GFX900-NEXT: v_not_b32_e32 v1, v1
; GFX900-NEXT: v_xor_b32_e32 v0, -1, v0
; GFX900-NEXT: v_xor_b32_e32 v1, -1, v1
; GFX900-NEXT: ; return to shader part epilog
;
; GFX906-LABEL: xnor_i64_v_s_one_use:
; GFX906: ; %bb.0:
; GFX906-NEXT: v_lshlrev_b64 v[0:1], 29, v[0:1]
; GFX906-NEXT: v_xnor_b32_e64 v0, v0, s0
; GFX906-NEXT: v_xnor_b32_e64 v1, v1, s1
; GFX906-NEXT: v_xor_b32_e32 v0, s0, v0
; GFX906-NEXT: v_xor_b32_e32 v1, s1, v1
; GFX906-NEXT: v_xor_b32_e32 v0, -1, v0
; GFX906-NEXT: v_xor_b32_e32 v1, -1, v1
; GFX906-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: xnor_i64_v_s_one_use:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_lshlrev_b64 v[0:1], 29, v[0:1]
; GFX10-NEXT: v_xnor_b32_e64 v0, v0, s0
; GFX10-NEXT: v_xnor_b32_e64 v1, v1, s1
; GFX10-NEXT: v_xor3_b32 v0, v0, s0, -1
; GFX10-NEXT: v_xor3_b32 v1, v1, s1, -1
; GFX10-NEXT: ; return to shader part epilog
%b = shl i64 %b64, 29
%xor = xor i64 %b, %a

View File

@ -1,24 +1,33 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx600 < %s | FileCheck -check-prefixes=GFX6,GFX6-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx600 < %s | FileCheck -check-prefixes=GFX6,GFX6-GISEL %s
; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx600 < %s | FileCheck -check-prefixes=GFX6,GFX6-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-GISEL %s
; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx803 < %s | FileCheck -check-prefixes=GFX8,GFX8-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx803 < %s | FileCheck -check-prefixes=GFX8,GFX8-GISEL %s
; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx803 < %s | FileCheck -check-prefixes=GFX8,GFX8-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX8-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s
; FIXME-TRUE16 enable gisel
; XUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s
; XUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s
; FIXME: add a regbankcombine that folds
;   %val = ...
;   %val1:vgpr(s32) = G_AND %val, 1
;   %cmp:vcc(s1) = G_ICMP intpred(ne), %val1(s32), 0
;   %res:vgpr(s32) = G_SELECT %cmp(s1), 1, 0
; ->
;   %res:vgpr(s32) = G_AND %val, 1
define void @freeze_v2i32(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
; GFX6-SDAG-LABEL: freeze_v2i32:
@ -13228,6 +13237,8 @@ define void @freeze_i1(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
; GFX6-GISEL-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX6-GISEL-NEXT: v_and_b32_e32 v0, 1, v0
; GFX6-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX6-GISEL-NEXT: buffer_store_byte v0, v[2:3], s[4:7], 0 addr64
; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31]
@ -13255,47 +13266,75 @@ define void @freeze_i1(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) {
; GFX7-GISEL-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64
; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX7-GISEL-NEXT: v_and_b32_e32 v0, 1, v0
; GFX7-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX7-GISEL-NEXT: buffer_store_byte v0, v[2:3], s[4:7], 0 addr64
; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: freeze_i1:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
; GFX8-NEXT: flat_store_byte v[2:3], v0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
; GFX8-GISEL-LABEL: freeze_i1:
; GFX8-GISEL: ; %bb.0:
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-GISEL-NEXT: flat_load_ubyte v0, v[0:1]
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX8-GISEL-NEXT: v_and_b32_e32 v0, 1, v0
; GFX8-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX8-GISEL-NEXT: flat_store_byte v[2:3], v0
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: freeze_i1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_ubyte v0, v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: global_store_byte v[2:3], v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX9-GISEL-LABEL: freeze_i1:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-GISEL-NEXT: global_store_byte v[2:3], v0, off
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: freeze_i1:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: global_store_byte v[2:3], v0, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
; GFX10-SDAG-LABEL: freeze_i1:
; GFX10-SDAG: ; %bb.0:
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SDAG-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX10-SDAG-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-SDAG-NEXT: global_store_byte v[2:3], v0, off
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: freeze_i1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11-NEXT: global_store_b8 v[2:3], v0, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
; GFX10-GISEL-LABEL: freeze_i1:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-GISEL-NEXT: global_load_ubyte v0, v[0:1], off
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX10-GISEL-NEXT: global_store_byte v[2:3], v0, off
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-LABEL: freeze_i1:
; GFX11-SDAG: ; %bb.0:
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-SDAG-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11-SDAG-NEXT: global_store_b8 v[2:3], v0, off
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-LABEL: freeze_i1:
; GFX11-GISEL: ; %bb.0:
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: global_load_u8 v0, v[0:1], off
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11-GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX11-GISEL-NEXT: global_store_b8 v[2:3], v0, off
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
%a = load i1, ptr addrspace(1) %ptra
%freeze = freeze i1 %a
store i1 %freeze, ptr addrspace(1) %ptrb

View File

@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=+wavefrontsize32 -stop-after=finalize-isel < %s | FileCheck -check-prefixes=GISEL-GFX11,GISEL-GFX11-TRUE16 %s
; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=+wavefrontsize32 -stop-after=finalize-isel < %s | FileCheck -check-prefixes=GISEL-GFX11,GISEL-GFX11-FAKE16 %s
; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=+wavefrontsize64 -stop-after=finalize-isel < %s | FileCheck -check-prefixes=GISEL-GFX11,GISEL-GFX11-TRUE16 %s
; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=+wavefrontsize64 -stop-after=finalize-isel < %s | FileCheck -check-prefixes=GISEL-GFX11,GISEL-GFX11-FAKE16 %s
; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -stop-after=finalize-isel < %s | FileCheck -check-prefix=GISEL-GFX10 %s
; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 -stop-after=finalize-isel < %s | FileCheck -check-prefix=GISEL-GFX10 %s
; RUN: llc -global-isel=1 -new-reg-bank-select -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=+wavefrontsize32 -stop-after=finalize-isel < %s | FileCheck -check-prefixes=GISEL-GFX11,GISEL-GFX11-TRUE16 %s
; RUN: llc -global-isel=1 -new-reg-bank-select -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=+wavefrontsize32 -stop-after=finalize-isel < %s | FileCheck -check-prefixes=GISEL-GFX11,GISEL-GFX11-FAKE16 %s
; RUN: llc -global-isel=1 -new-reg-bank-select -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=+wavefrontsize64 -stop-after=finalize-isel < %s | FileCheck -check-prefixes=GISEL-GFX11,GISEL-GFX11-TRUE16 %s
; RUN: llc -global-isel=1 -new-reg-bank-select -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=+wavefrontsize64 -stop-after=finalize-isel < %s | FileCheck -check-prefixes=GISEL-GFX11,GISEL-GFX11-FAKE16 %s
; RUN: llc -global-isel=1 -new-reg-bank-select -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -stop-after=finalize-isel < %s | FileCheck -check-prefix=GISEL-GFX10 %s
; RUN: llc -global-isel=1 -new-reg-bank-select -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 -stop-after=finalize-isel < %s | FileCheck -check-prefix=GISEL-GFX10 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=+wavefrontsize32 -stop-after=finalize-isel < %s | FileCheck -check-prefixes=DAGISEL-GFX11-WF32,DAGISEL-GFX11-WF32-TRUE16 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=+wavefrontsize32 -stop-after=finalize-isel < %s | FileCheck -check-prefixes=DAGISEL-GFX11-WF32,DAGISEL-GFX11-WF32-FAKE16 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=+wavefrontsize64 -stop-after=finalize-isel < %s | FileCheck -check-prefixes=DAGISEL-GFX11-WF64,DAGISEL-GFX11-WF64-TRUE16 %s

View File

@ -1,21 +1,27 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-GISEL %s
; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck -check-prefixes=GFX8,GFX8-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck -check-prefixes=GFX8,GFX8-GISEL %s
; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck -check-prefixes=GFX8,GFX8-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s
; FIXME-TRUE16. enable gisel
; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
; XUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
; XUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
; FIXME: codegen regression, related to:
; - looking through sgpr to vgpr copy
; - there are some differences for true16 and fake16
; Visible in the GlobalISel output as:
; - v_alignbit_b32 is selected instead of v_lshrrev_b32_e32
; - v_alignbit_b32 + v_and_b32_e32 is selected instead of v_and_b32_sdwa
define i8 @test_vector_reduce_and_v2i8(<2 x i8> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_and_v2i8:
@ -994,31 +1000,19 @@ define i16 @test_vector_reduce_and_v2i16(<2 x i16> %v) {
; GFX8-NEXT: v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: test_vector_reduce_and_v2i16:
; GFX9-SDAG: ; %bb.0: ; %entry
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-SDAG-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX9-SDAG-NEXT: v_and_b32_e32 v0, v0, v1
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
; GFX9-LABEL: test_vector_reduce_and_v2i16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX9-NEXT: v_and_b32_e32 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: test_vector_reduce_and_v2i16:
; GFX9-GISEL: ; %bb.0: ; %entry
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-GISEL-NEXT: v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: test_vector_reduce_and_v2i16:
; GFX10-SDAG: ; %bb.0: ; %entry
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SDAG-NEXT: v_alignbit_b32 v1, s4, v0, 16
; GFX10-SDAG-NEXT: v_and_b32_e32 v0, v0, v1
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-GISEL-LABEL: test_vector_reduce_and_v2i16:
; GFX10-GISEL: ; %bb.0: ; %entry
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-GISEL-NEXT: v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
; GFX10-LABEL: test_vector_reduce_and_v2i16:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_alignbit_b32 v1, s4, v0, 16
; GFX10-NEXT: v_and_b32_e32 v0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_and_v2i16:
; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry
@ -1041,7 +1035,7 @@ define i16 @test_vector_reduce_and_v2i16(<2 x i16> %v) {
; GFX11-GISEL-LABEL: test_vector_reduce_and_v2i16:
; GFX11-GISEL: ; %bb.0: ; %entry
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_and_b32_e32 v0, v0, v1
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@ -1079,7 +1073,7 @@ define i16 @test_vector_reduce_and_v2i16(<2 x i16> %v) {
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX12-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_and_b32_e32 v0, v0, v1
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
@ -1239,19 +1233,35 @@ define i16 @test_vector_reduce_and_v4i16(<4 x i16> %v) {
; GFX8-GISEL-NEXT: v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: test_vector_reduce_and_v4i16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v0, v0, v1
; GFX9-NEXT: v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX9-SDAG-LABEL: test_vector_reduce_and_v4i16:
; GFX9-SDAG: ; %bb.0: ; %entry
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-SDAG-NEXT: v_and_b32_e32 v0, v0, v1
; GFX9-SDAG-NEXT: v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_vector_reduce_and_v4i16:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_and_b32_e32 v0, v0, v1
; GFX10-NEXT: v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-NEXT: s_setpc_b64 s[30:31]
; GFX9-GISEL-LABEL: test_vector_reduce_and_v4i16:
; GFX9-GISEL: ; %bb.0: ; %entry
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-GISEL-NEXT: v_and_b32_e32 v0, v0, v1
; GFX9-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX9-GISEL-NEXT: v_and_b32_e32 v0, v0, v1
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: test_vector_reduce_and_v4i16:
; GFX10-SDAG: ; %bb.0: ; %entry
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SDAG-NEXT: v_and_b32_e32 v0, v0, v1
; GFX10-SDAG-NEXT: v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-GISEL-LABEL: test_vector_reduce_and_v4i16:
; GFX10-GISEL: ; %bb.0: ; %entry
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-GISEL-NEXT: v_and_b32_e32 v0, v0, v1
; GFX10-GISEL-NEXT: v_alignbit_b32 v1, s4, v0, 16
; GFX10-GISEL-NEXT: v_and_b32_e32 v0, v0, v1
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_and_v4i16:
; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry
@ -1277,7 +1287,7 @@ define i16 @test_vector_reduce_and_v4i16(<4 x i16> %v) {
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: v_and_b32_e32 v0, v0, v1
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX11-GISEL-NEXT: v_and_b32_e32 v0, v0, v1
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@ -1317,7 +1327,7 @@ define i16 @test_vector_reduce_and_v4i16(<4 x i16> %v) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_and_b32_e32 v0, v0, v1
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX12-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX12-GISEL-NEXT: v_and_b32_e32 v0, v0, v1
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
@ -1394,7 +1404,8 @@ define i16 @test_vector_reduce_and_v8i16(<8 x i16> %v) {
; GFX9-GISEL-NEXT: v_and_b32_e32 v0, v0, v2
; GFX9-GISEL-NEXT: v_and_b32_e32 v1, v1, v3
; GFX9-GISEL-NEXT: v_and_b32_e32 v0, v0, v1
; GFX9-GISEL-NEXT: v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX9-GISEL-NEXT: v_and_b32_e32 v0, v0, v1
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: test_vector_reduce_and_v8i16:
@ -1412,7 +1423,8 @@ define i16 @test_vector_reduce_and_v8i16(<8 x i16> %v) {
; GFX10-GISEL-NEXT: v_and_b32_e32 v0, v0, v2
; GFX10-GISEL-NEXT: v_and_b32_e32 v1, v1, v3
; GFX10-GISEL-NEXT: v_and_b32_e32 v0, v0, v1
; GFX10-GISEL-NEXT: v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-GISEL-NEXT: v_alignbit_b32 v1, s4, v0, 16
; GFX10-GISEL-NEXT: v_and_b32_e32 v0, v0, v1
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_and_v8i16:
@ -1447,7 +1459,7 @@ define i16 @test_vector_reduce_and_v8i16(<8 x i16> %v) {
; GFX11-GISEL-NEXT: v_and_b32_e32 v1, v1, v3
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_and_b32_e32 v0, v0, v1
; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_and_b32_e32 v0, v0, v1
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@ -1496,7 +1508,7 @@ define i16 @test_vector_reduce_and_v8i16(<8 x i16> %v) {
; GFX12-GISEL-NEXT: v_and_b32_e32 v1, v1, v3
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_and_b32_e32 v0, v0, v1
; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX12-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_and_b32_e32 v0, v0, v1
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
@ -1614,7 +1626,8 @@ define i16 @test_vector_reduce_and_v16i16(<16 x i16> %v) {
; GFX9-GISEL-NEXT: v_and_b32_e32 v0, v0, v2
; GFX9-GISEL-NEXT: v_and_b32_e32 v1, v1, v3
; GFX9-GISEL-NEXT: v_and_b32_e32 v0, v0, v1
; GFX9-GISEL-NEXT: v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX9-GISEL-NEXT: v_and_b32_e32 v0, v0, v1
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: test_vector_reduce_and_v16i16:
@ -1640,7 +1653,8 @@ define i16 @test_vector_reduce_and_v16i16(<16 x i16> %v) {
; GFX10-GISEL-NEXT: v_and_b32_e32 v0, v0, v2
; GFX10-GISEL-NEXT: v_and_b32_e32 v1, v1, v3
; GFX10-GISEL-NEXT: v_and_b32_e32 v0, v0, v1
; GFX10-GISEL-NEXT: v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-GISEL-NEXT: v_alignbit_b32 v1, s4, v0, 16
; GFX10-GISEL-NEXT: v_and_b32_e32 v0, v0, v1
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_and_v16i16:
@ -1690,7 +1704,7 @@ define i16 @test_vector_reduce_and_v16i16(<16 x i16> %v) {
; GFX11-GISEL-NEXT: v_and_b32_e32 v1, v1, v3
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_and_b32_e32 v0, v0, v1
; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_and_b32_e32 v0, v0, v1
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@ -1754,7 +1768,7 @@ define i16 @test_vector_reduce_and_v16i16(<16 x i16> %v) {
; GFX12-GISEL-NEXT: v_and_b32_e32 v1, v1, v3
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_and_b32_e32 v0, v0, v1
; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX12-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_and_b32_e32 v0, v0, v1
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]

View File

@ -1,21 +1,27 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-GISEL %s
; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck -check-prefixes=GFX8,GFX8-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck -check-prefixes=GFX8,GFX8-GISEL %s
; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck -check-prefixes=GFX8,GFX8-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s
; FIXME-TRUE16. enable gisel
; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
; XUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
; XUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
; FIXME: codegen regression, related to:
; - looking through sgpr to vgpr copy
; - there are some differences for true16 and fake16
; Visible in the GlobalISel output as:
; - v_alignbit_b32 is selected instead of v_lshrrev_b32_e32
; - v_alignbit_b32 + v_or_b32_e32 is selected instead of v_or_b32_sdwa
define i8 @test_vector_reduce_or_v2i8(<2 x i8> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_or_v2i8:
@ -1017,31 +1023,19 @@ define i16 @test_vector_reduce_or_v2i16(<2 x i16> %v) {
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: test_vector_reduce_or_v2i16:
; GFX9-SDAG: ; %bb.0: ; %entry
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-SDAG-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX9-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
; GFX9-LABEL: test_vector_reduce_or_v2i16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: test_vector_reduce_or_v2i16:
; GFX9-GISEL: ; %bb.0: ; %entry
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-GISEL-NEXT: v_or_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: test_vector_reduce_or_v2i16:
; GFX10-SDAG: ; %bb.0: ; %entry
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SDAG-NEXT: v_alignbit_b32 v1, s4, v0, 16
; GFX10-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-GISEL-LABEL: test_vector_reduce_or_v2i16:
; GFX10-GISEL: ; %bb.0: ; %entry
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-GISEL-NEXT: v_or_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
; GFX10-LABEL: test_vector_reduce_or_v2i16:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_alignbit_b32 v1, s4, v0, 16
; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_or_v2i16:
; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry
@ -1064,7 +1058,7 @@ define i16 @test_vector_reduce_or_v2i16(<2 x i16> %v) {
; GFX11-GISEL-LABEL: test_vector_reduce_or_v2i16:
; GFX11-GISEL: ; %bb.0: ; %entry
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@ -1102,7 +1096,7 @@ define i16 @test_vector_reduce_or_v2i16(<2 x i16> %v) {
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX12-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
@ -1260,19 +1254,35 @@ define i16 @test_vector_reduce_or_v4i16(<4 x i16> %v) {
; GFX8-GISEL-NEXT: v_or_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: test_vector_reduce_or_v4i16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
; GFX9-NEXT: v_or_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX9-SDAG-LABEL: test_vector_reduce_or_v4i16:
; GFX9-SDAG: ; %bb.0: ; %entry
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
; GFX9-SDAG-NEXT: v_or_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_vector_reduce_or_v4i16:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_or_b32_e32 v0, v0, v1
; GFX10-NEXT: v_or_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-NEXT: s_setpc_b64 s[30:31]
; GFX9-GISEL-LABEL: test_vector_reduce_or_v4i16:
; GFX9-GISEL: ; %bb.0: ; %entry
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
; GFX9-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX9-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: test_vector_reduce_or_v4i16:
; GFX10-SDAG: ; %bb.0: ; %entry
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SDAG-NEXT: v_or_b32_e32 v0, v0, v1
; GFX10-SDAG-NEXT: v_or_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-GISEL-LABEL: test_vector_reduce_or_v4i16:
; GFX10-GISEL: ; %bb.0: ; %entry
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
; GFX10-GISEL-NEXT: v_alignbit_b32 v1, s4, v0, 16
; GFX10-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_or_v4i16:
; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry
@ -1298,7 +1308,7 @@ define i16 @test_vector_reduce_or_v4i16(<4 x i16> %v) {
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX11-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@ -1338,7 +1348,7 @@ define i16 @test_vector_reduce_or_v4i16(<4 x i16> %v) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX12-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX12-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
@ -1414,7 +1424,8 @@ define i16 @test_vector_reduce_or_v8i16(<8 x i16> %v) {
; GFX9-GISEL-NEXT: v_or_b32_e32 v0, v0, v2
; GFX9-GISEL-NEXT: v_or_b32_e32 v1, v1, v3
; GFX9-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
; GFX9-GISEL-NEXT: v_or_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX9-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: test_vector_reduce_or_v8i16:
@ -1431,7 +1442,8 @@ define i16 @test_vector_reduce_or_v8i16(<8 x i16> %v) {
; GFX10-GISEL-NEXT: v_or_b32_e32 v0, v0, v2
; GFX10-GISEL-NEXT: v_or_b32_e32 v1, v1, v3
; GFX10-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
; GFX10-GISEL-NEXT: v_or_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-GISEL-NEXT: v_alignbit_b32 v1, s4, v0, 16
; GFX10-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_or_v8i16:
@ -1464,7 +1476,7 @@ define i16 @test_vector_reduce_or_v8i16(<8 x i16> %v) {
; GFX11-GISEL-NEXT: v_or_b32_e32 v1, v1, v3
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@ -1511,7 +1523,7 @@ define i16 @test_vector_reduce_or_v8i16(<8 x i16> %v) {
; GFX12-GISEL-NEXT: v_or_b32_e32 v1, v1, v3
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX12-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
@ -1627,7 +1639,8 @@ define i16 @test_vector_reduce_or_v16i16(<16 x i16> %v) {
; GFX9-GISEL-NEXT: v_or_b32_e32 v0, v0, v2
; GFX9-GISEL-NEXT: v_or_b32_e32 v1, v1, v3
; GFX9-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
; GFX9-GISEL-NEXT: v_or_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX9-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: test_vector_reduce_or_v16i16:
@ -1651,7 +1664,8 @@ define i16 @test_vector_reduce_or_v16i16(<16 x i16> %v) {
; GFX10-GISEL-NEXT: v_or_b32_e32 v0, v0, v2
; GFX10-GISEL-NEXT: v_or_b32_e32 v1, v1, v3
; GFX10-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
; GFX10-GISEL-NEXT: v_or_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-GISEL-NEXT: v_alignbit_b32 v1, s4, v0, 16
; GFX10-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_or_v16i16:
@ -1695,7 +1709,7 @@ define i16 @test_vector_reduce_or_v16i16(<16 x i16> %v) {
; GFX11-GISEL-NEXT: v_or_b32_e32 v1, v1, v3
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@ -1753,7 +1767,7 @@ define i16 @test_vector_reduce_or_v16i16(<16 x i16> %v) {
; GFX12-GISEL-NEXT: v_or_b32_e32 v1, v1, v3
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX12-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]

View File

@ -1,21 +1,27 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-GISEL %s
; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck -check-prefixes=GFX8,GFX8-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck -check-prefixes=GFX8,GFX8-GISEL %s
; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck -check-prefixes=GFX8,GFX8-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s
; FIXME-TRUE16. enable gisel
; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
; XUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s
; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s
; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
; XUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s
; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s
; FIXME: codegen regression, related to:
; - looking through sgpr to vgpr copy
; - there are some differences for true16 and fake16
; Observed effect: GlobalISel now selects v_alignbit_b32 instead of v_lshrrev_b32_e32,
; and v_alignbit_b32 + v_xor_b32_e32 instead of a single v_xor_b32_sdwa
define i8 @test_vector_reduce_xor_v2i8(<2 x i8> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_xor_v2i8:
@ -963,31 +969,19 @@ define i16 @test_vector_reduce_xor_v2i16(<2 x i16> %v) {
; GFX8-NEXT: v_xor_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: test_vector_reduce_xor_v2i16:
; GFX9-SDAG: ; %bb.0: ; %entry
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-SDAG-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX9-SDAG-NEXT: v_xor_b32_e32 v0, v0, v1
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
; GFX9-LABEL: test_vector_reduce_xor_v2i16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX9-NEXT: v_xor_b32_e32 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: test_vector_reduce_xor_v2i16:
; GFX9-GISEL: ; %bb.0: ; %entry
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-GISEL-NEXT: v_xor_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: test_vector_reduce_xor_v2i16:
; GFX10-SDAG: ; %bb.0: ; %entry
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SDAG-NEXT: v_alignbit_b32 v1, s4, v0, 16
; GFX10-SDAG-NEXT: v_xor_b32_e32 v0, v0, v1
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-GISEL-LABEL: test_vector_reduce_xor_v2i16:
; GFX10-GISEL: ; %bb.0: ; %entry
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-GISEL-NEXT: v_xor_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
; GFX10-LABEL: test_vector_reduce_xor_v2i16:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_alignbit_b32 v1, s4, v0, 16
; GFX10-NEXT: v_xor_b32_e32 v0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_xor_v2i16:
; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry
@ -1010,7 +1004,7 @@ define i16 @test_vector_reduce_xor_v2i16(<2 x i16> %v) {
; GFX11-GISEL-LABEL: test_vector_reduce_xor_v2i16:
; GFX11-GISEL: ; %bb.0: ; %entry
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_xor_b32_e32 v0, v0, v1
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@ -1048,7 +1042,7 @@ define i16 @test_vector_reduce_xor_v2i16(<2 x i16> %v) {
; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX12-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_xor_b32_e32 v0, v0, v1
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
@ -1206,19 +1200,35 @@ define i16 @test_vector_reduce_xor_v4i16(<4 x i16> %v) {
; GFX8-GISEL-NEXT: v_xor_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: test_vector_reduce_xor_v4i16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_xor_b32_e32 v0, v0, v1
; GFX9-NEXT: v_xor_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: s_setpc_b64 s[30:31]
; GFX9-SDAG-LABEL: test_vector_reduce_xor_v4i16:
; GFX9-SDAG: ; %bb.0: ; %entry
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-SDAG-NEXT: v_xor_b32_e32 v0, v0, v1
; GFX9-SDAG-NEXT: v_xor_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: test_vector_reduce_xor_v4i16:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_xor_b32_e32 v0, v0, v1
; GFX10-NEXT: v_xor_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-NEXT: s_setpc_b64 s[30:31]
; GFX9-GISEL-LABEL: test_vector_reduce_xor_v4i16:
; GFX9-GISEL: ; %bb.0: ; %entry
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-GISEL-NEXT: v_xor_b32_e32 v0, v0, v1
; GFX9-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX9-GISEL-NEXT: v_xor_b32_e32 v0, v0, v1
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: test_vector_reduce_xor_v4i16:
; GFX10-SDAG: ; %bb.0: ; %entry
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-SDAG-NEXT: v_xor_b32_e32 v0, v0, v1
; GFX10-SDAG-NEXT: v_xor_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-GISEL-LABEL: test_vector_reduce_xor_v4i16:
; GFX10-GISEL: ; %bb.0: ; %entry
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-GISEL-NEXT: v_xor_b32_e32 v0, v0, v1
; GFX10-GISEL-NEXT: v_alignbit_b32 v1, s4, v0, 16
; GFX10-GISEL-NEXT: v_xor_b32_e32 v0, v0, v1
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_xor_v4i16:
; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry
@ -1244,7 +1254,7 @@ define i16 @test_vector_reduce_xor_v4i16(<4 x i16> %v) {
; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-GISEL-NEXT: v_xor_b32_e32 v0, v0, v1
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX11-GISEL-NEXT: v_xor_b32_e32 v0, v0, v1
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@ -1284,7 +1294,7 @@ define i16 @test_vector_reduce_xor_v4i16(<4 x i16> %v) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_xor_b32_e32 v0, v0, v1
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX12-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX12-GISEL-NEXT: v_xor_b32_e32 v0, v0, v1
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
@ -1361,7 +1371,8 @@ define i16 @test_vector_reduce_xor_v8i16(<8 x i16> %v) {
; GFX9-GISEL-NEXT: v_xor_b32_e32 v0, v0, v2
; GFX9-GISEL-NEXT: v_xor_b32_e32 v1, v1, v3
; GFX9-GISEL-NEXT: v_xor_b32_e32 v0, v0, v1
; GFX9-GISEL-NEXT: v_xor_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX9-GISEL-NEXT: v_xor_b32_e32 v0, v0, v1
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: test_vector_reduce_xor_v8i16:
@ -1378,7 +1389,8 @@ define i16 @test_vector_reduce_xor_v8i16(<8 x i16> %v) {
; GFX10-GISEL-NEXT: v_xor_b32_e32 v0, v0, v2
; GFX10-GISEL-NEXT: v_xor_b32_e32 v1, v1, v3
; GFX10-GISEL-NEXT: v_xor_b32_e32 v0, v0, v1
; GFX10-GISEL-NEXT: v_xor_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-GISEL-NEXT: v_alignbit_b32 v1, s4, v0, 16
; GFX10-GISEL-NEXT: v_xor_b32_e32 v0, v0, v1
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_xor_v8i16:
@ -1411,7 +1423,7 @@ define i16 @test_vector_reduce_xor_v8i16(<8 x i16> %v) {
; GFX11-GISEL-NEXT: v_xor_b32_e32 v1, v1, v3
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_xor_b32_e32 v0, v0, v1
; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_xor_b32_e32 v0, v0, v1
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@ -1458,7 +1470,7 @@ define i16 @test_vector_reduce_xor_v8i16(<8 x i16> %v) {
; GFX12-GISEL-NEXT: v_xor_b32_e32 v1, v1, v3
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_xor_b32_e32 v0, v0, v1
; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX12-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_xor_b32_e32 v0, v0, v1
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
@ -1576,7 +1588,8 @@ define i16 @test_vector_reduce_xor_v16i16(<16 x i16> %v) {
; GFX9-GISEL-NEXT: v_xor_b32_e32 v0, v0, v2
; GFX9-GISEL-NEXT: v_xor_b32_e32 v1, v1, v3
; GFX9-GISEL-NEXT: v_xor_b32_e32 v0, v0, v1
; GFX9-GISEL-NEXT: v_xor_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX9-GISEL-NEXT: v_xor_b32_e32 v0, v0, v1
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: test_vector_reduce_xor_v16i16:
@ -1600,7 +1613,8 @@ define i16 @test_vector_reduce_xor_v16i16(<16 x i16> %v) {
; GFX10-GISEL-NEXT: v_xor_b32_e32 v0, v0, v2
; GFX10-GISEL-NEXT: v_xor_b32_e32 v1, v1, v3
; GFX10-GISEL-NEXT: v_xor_b32_e32 v0, v0, v1
; GFX10-GISEL-NEXT: v_xor_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-GISEL-NEXT: v_alignbit_b32 v1, s4, v0, 16
; GFX10-GISEL-NEXT: v_xor_b32_e32 v0, v0, v1
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_xor_v16i16:
@ -1644,7 +1658,7 @@ define i16 @test_vector_reduce_xor_v16i16(<16 x i16> %v) {
; GFX11-GISEL-NEXT: v_xor_b32_e32 v1, v1, v3
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_xor_b32_e32 v0, v0, v1
; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-NEXT: v_xor_b32_e32 v0, v0, v1
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
@ -1702,7 +1716,7 @@ define i16 @test_vector_reduce_xor_v16i16(<16 x i16> %v) {
; GFX12-GISEL-NEXT: v_xor_b32_e32 v1, v1, v3
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_xor_b32_e32 v0, v0, v1
; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX12-GISEL-NEXT: v_alignbit_b32 v1, s0, v0, 16
; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-GISEL-NEXT: v_xor_b32_e32 v0, v0, v1
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]