[AMDGPU] Convert tests to opaque pointers (NFC)

This commit is contained in:
Nikita Popov 2024-02-05 12:41:37 +01:00
parent 1d3d8936ba
commit 00a4e248dc
26 changed files with 634 additions and 634 deletions

View File

@ -2,7 +2,7 @@
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX11 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX12 %s
define amdgpu_kernel void @s_add_u64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
define amdgpu_kernel void @s_add_u64(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX11-LABEL: s_add_u64:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
@ -35,11 +35,11 @@ define amdgpu_kernel void @s_add_u64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
; GFX12-NEXT: s_endpgm
entry:
%add = add i64 %a, %b
store i64 %add, i64 addrspace(1)* %out
store i64 %add, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @v_add_u64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
define amdgpu_ps void @v_add_u64(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GCN-LABEL: v_add_u64:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
@ -50,11 +50,11 @@ define amdgpu_ps void @v_add_u64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
; GCN-NEXT: s_endpgm
entry:
%add = add i64 %a, %b
store i64 %add, i64 addrspace(1)* %out
store i64 %add, ptr addrspace(1) %out
ret void
}
define amdgpu_kernel void @s_sub_u64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
define amdgpu_kernel void @s_sub_u64(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX11-LABEL: s_sub_u64:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
@ -87,11 +87,11 @@ define amdgpu_kernel void @s_sub_u64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
; GFX12-NEXT: s_endpgm
entry:
%sub = sub i64 %a, %b
store i64 %sub, i64 addrspace(1)* %out
store i64 %sub, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @v_sub_u64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
define amdgpu_ps void @v_sub_u64(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GCN-LABEL: v_sub_u64:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: v_sub_co_u32 v2, vcc_lo, v2, v4
@ -102,6 +102,6 @@ define amdgpu_ps void @v_sub_u64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
; GCN-NEXT: s_endpgm
entry:
%sub = sub i64 %a, %b
store i64 %sub, i64 addrspace(1)* %out
store i64 %sub, ptr addrspace(1) %out
ret void
}

View File

@ -327,8 +327,8 @@ body: |
%ptr2:_(p1) = G_IMPLICIT_DEF
%ptr3:_(p1) = COPY $vgpr2_vgpr3
%ptr4:_(p1) = COPY $vgpr4_vgpr5
G_STORE %src1:_(s32), %ptr1:_(p1) :: (volatile store (s32) into `i32 addrspace(1)* undef`, addrspace 1)
G_STORE %src2:_(s32), %ptr2:_(p1) :: (volatile store (s32) into `i32 addrspace(1)* undef`, addrspace 1)
G_STORE %src1:_(s32), %ptr1:_(p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1)
G_STORE %src2:_(s32), %ptr2:_(p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1)
%div:_(s32) = G_SDIV %src1:_(s32), %src2:_(s32)
G_STORE %div:_(s32), %ptr3:_(p1) :: (store (s32), addrspace 1, align 4)
%rem:_(s32) = G_SREM %src1:_(s32), %src2:_(s32)

View File

@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] neg_hi:[1,0,0]
@ -14,11 +14,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<8 x half> %A, <8 x half>
bb:
%fneg.A = fneg <8 x half> %A
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %fneg.A, <8 x half> %B, <8 x float> %C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] neg_hi:[0,1,0]
@ -31,11 +31,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<8 x half> %A, <8 x half>
bb:
%fneg.B = fneg <8 x half> %B
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %fneg.B, <8 x float> %C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
@ -48,11 +48,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<8 x half> %A, <8 x half>
bb:
%fneg.C = fneg <8 x float> %C
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1]
@ -65,11 +65,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<8 x half> %A, <8 x half>
bb:
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fabs.C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
@ -82,11 +82,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<8 x i16> %A, <8 x i16>
bb:
%fneg.C = fneg <8 x float> %C
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %fneg.C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1]
@ -99,11 +99,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<8 x i16> %A, <8 x i16>
bb:
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %fabs.C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] neg_hi:[1,0,0]
@ -114,11 +114,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<8 x half> %A, <8 x half>
bb:
%fneg.A = fneg <8 x half> %A
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %fneg.A, <8 x half> %B, <8 x half> %C, i1 0)
store <8 x half> %res, <8 x half> addrspace(1)* %out
store <8 x half> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] neg_hi:[0,1,0]
@ -129,11 +129,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<8 x half> %A, <8 x half>
bb:
%fneg.B = fneg <8 x half> %B
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> %C, i1 0)
store <8 x half> %res, <8 x half> addrspace(1)* %out
store <8 x half> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1]
@ -144,11 +144,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<8 x half> %A, <8 x half>
bb:
%fneg.C = fneg <8 x half> %C
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.C, i1 0)
store <8 x half> %res, <8 x half> addrspace(1)* %out
store <8 x half> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1]
@ -159,11 +159,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<8 x half> %A, <8 x half>
bb:
%fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C)
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fabs.C, i1 0)
store <8 x half> %res, <8 x half> addrspace(1)* %out
store <8 x half> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
@ -176,11 +176,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(<2 x i32> %A, <2 x i3
bb:
%fneg.C = fneg <8 x float> %C
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
@ -193,11 +193,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(<2 x i32> %A, <2 x i3
bb:
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
@ -210,11 +210,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(<2 x i32> %A, <2 x i3
bb:
%fneg.C = fneg <8 x float> %C
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
@ -227,11 +227,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(<2 x i32> %A, <2 x i3
bb:
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
@ -244,11 +244,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(<2 x i32> %A, <2 x i3
bb:
%fneg.C = fneg <8 x float> %C
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
@ -261,11 +261,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(<2 x i32> %A, <2 x i3
bb:
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
@ -278,11 +278,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(<2 x i32> %A, <2 x i3
bb:
%fneg.C = fneg <8 x float> %C
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
@ -295,11 +295,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(<2 x i32> %A, <2 x i3
bb:
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, <8 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] neg_hi:[1,0,0]
@ -312,11 +312,11 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<8 x half> %A, <16 x ha
bb:
%fneg.A = fneg <8 x half> %A
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x float> %C, i16 %Index)
store <8 x float> %res, <8 x float> addrspace(1)* %out
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, <8 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] neg_hi:[0,1,0]
@ -329,11 +329,11 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<8 x half> %A, <16 x ha
bb:
%fneg.B = fneg <16 x half> %B
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x float> %C, i16 %Index)
store <8 x float> %res, <8 x float> addrspace(1)* %out
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, <8 x half> addrspace(1)* %out) {
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] neg_hi:[1,0,0]
@ -344,11 +344,11 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<8 x half> %A, <16 x ha
bb:
%fneg.A = fneg <8 x half> %A
%res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x half> %C, i16 %Index)
store <8 x half> %res, <8 x half> addrspace(1)* %out
store <8 x half> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, <8 x half> addrspace(1)* %out) {
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] neg_hi:[0,1,0]
@ -359,13 +359,13 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<8 x half> %A, <16 x ha
bb:
%fneg.B = fneg <16 x half> %B
%res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x half> %C, i16 %Index)
store <8 x half> %res, <8 x half> addrspace(1)* %out
store <8 x half> %res, ptr addrspace(1) %out
ret void
}
; Both neg and abs patterns (wmma matrix C is f32 or f16)
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] neg_hi:[0,0,1]
@ -379,11 +379,11 @@ bb:
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
%fneg.fabs.C = fneg <8 x float> %fabs.C
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.fabs.C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] neg_hi:[0,0,1]
@ -395,11 +395,11 @@ bb:
%fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C)
%fneg.fabs.C = fneg <8 x half> %fabs.C
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.fabs.C, i1 0)
store <8 x half> %res, <8 x half> addrspace(1)* %out
store <8 x half> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_and_b32_e32 v11, 0x7fffffff, v11
@ -417,13 +417,13 @@ bb:
%partial.fabs.C = insertelement <8 x float> %C, float %el3.fabs, i32 3
%fneg.partial.fabs.C = fneg <8 x float> %partial.fabs.C
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.partial.fabs.C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
; A or B matrix modifier combined with a constant C matrix
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0]
@ -436,11 +436,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<8 x half> %A,
bb:
%fneg.A = fneg <8 x half> %A
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %fneg.A, <8 x half> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
store <8 x float> %res, <8 x float> addrspace(1)* %out
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0]
@ -451,13 +451,13 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<8 x half> %A,
bb:
%fneg.B = fneg <8 x half> %B
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> <half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
store <8 x half> %res, <8 x half> addrspace(1)* %out
store <8 x half> %res, ptr addrspace(1) %out
ret void
}
; Pack f16 elements with v_perm_b32 since they don't come from the same b32
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<8 x half> %A, <8 x half> %B, ptr %Caddr, <8 x half> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<8 x half> %A, <8 x half> %B, ptr %Caddr, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_clause 0x1
@ -485,7 +485,7 @@ bb:
%C_shuffle = shufflevector <16 x half> %C, <16 x half> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
%fneg.C_shuffle = fneg <8 x half> %C_shuffle
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.C_shuffle , i1 0)
store <8 x half> %res, <8 x half> addrspace(1)* %out
store <8 x half> %res, ptr addrspace(1) %out
ret void
}

View File

@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] neg_hi:[1,0,0]
@ -16,7 +16,7 @@ bb:
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] neg_hi:[0,1,0]
@ -27,11 +27,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<4 x half> %A, <4 x half>
bb:
%fneg.B = fneg <4 x half> %B
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %fneg.B, <4 x float> %C)
store <4 x float> %res, <4 x float> addrspace(1)* %out
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
@ -42,11 +42,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<4 x half> %A, <4 x half>
bb:
%fneg.C = fneg <4 x float> %C
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.C)
store <4 x float> %res, <4 x float> addrspace(1)* %out
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
@ -57,11 +57,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<4 x half> %A, <4 x half>
bb:
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fabs.C)
store <4 x float> %res, <4 x float> addrspace(1)* %out
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
@ -72,11 +72,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<4 x i16> %A, <4 x i16>
bb:
%fneg.C = fneg <4 x float> %C
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %fneg.C)
store <4 x float> %res, <4 x float> addrspace(1)* %out
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
@ -87,11 +87,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<4 x i16> %A, <4 x i16>
bb:
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %fabs.C)
store <4 x float> %res, <4 x float> addrspace(1)* %out
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] neg_hi:[1,0,0]
@ -102,11 +102,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<4 x half> %A, <4 x half>
bb:
%fneg.A = fneg <4 x half> %A
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %fneg.A, <4 x half> %B, <4 x half> %C, i1 0)
store <4 x half> %res, <4 x half> addrspace(1)* %out
store <4 x half> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] neg_hi:[0,1,0]
@ -117,11 +117,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<4 x half> %A, <4 x half>
bb:
%fneg.B = fneg <4 x half> %B
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> %C, i1 0)
store <4 x half> %res, <4 x half> addrspace(1)* %out
store <4 x half> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1]
@ -132,11 +132,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<4 x half> %A, <4 x half>
bb:
%fneg.C = fneg <4 x half> %C
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.C, i1 0)
store <4 x half> %res, <4 x half> addrspace(1)* %out
store <4 x half> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1]
@ -147,11 +147,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<4 x half> %A, <4 x half>
bb:
%fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C)
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fabs.C, i1 0)
store <4 x half> %res, <4 x half> addrspace(1)* %out
store <4 x half> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
@ -162,11 +162,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(i32 %A, i32 %B, <4 x
bb:
%fneg.C = fneg <4 x float> %C
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C)
store <4 x float> %res, <4 x float> addrspace(1)* %out
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
@ -177,11 +177,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(i32 %A, i32 %B, <4 x
bb:
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C)
store <4 x float> %res, <4 x float> addrspace(1)* %out
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
@ -192,11 +192,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(i32 %A, i32 %B, <4 x
bb:
%fneg.C = fneg <4 x float> %C
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C)
store <4 x float> %res, <4 x float> addrspace(1)* %out
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
@ -207,11 +207,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(i32 %A, i32 %B, <4 x
bb:
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C)
store <4 x float> %res, <4 x float> addrspace(1)* %out
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
@ -222,11 +222,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(i32 %A, i32 %B, <4 x
bb:
%fneg.C = fneg <4 x float> %C
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C)
store <4 x float> %res, <4 x float> addrspace(1)* %out
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
@ -237,11 +237,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(i32 %A, i32 %B, <4 x
bb:
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C)
store <4 x float> %res, <4 x float> addrspace(1)* %out
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
@ -252,11 +252,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(i32 %A, i32 %B, <4 x
bb:
%fneg.C = fneg <4 x float> %C
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C)
store <4 x float> %res, <4 x float> addrspace(1)* %out
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
@ -267,11 +267,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(i32 %A, i32 %B, <4 x
bb:
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C)
store <4 x float> %res, <4 x float> addrspace(1)* %out
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, <4 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] neg_hi:[1,0,0]
@ -282,11 +282,11 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<4 x half> %A, <8 x hal
bb:
%fneg.A = fneg <4 x half> %A
%res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f16.v8f16.v4f32.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x float> %C, i16 %Index)
store <4 x float> %res, <4 x float> addrspace(1)* %out
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, <4 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] neg_hi:[0,1,0]
@ -297,11 +297,11 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<4 x half> %A, <8 x hal
bb:
%fneg.B = fneg <8 x half> %B
%res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f16.v8f16.v4f32.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x float> %C, i16 %Index)
store <4 x float> %res, <4 x float> addrspace(1)* %out
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, <4 x half> addrspace(1)* %out) {
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] neg_hi:[1,0,0]
@ -312,11 +312,11 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<4 x half> %A, <8 x hal
bb:
%fneg.A = fneg <4 x half> %A
%res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v8f16.v4f16.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x half> %C, i16 %Index)
store <4 x half> %res, <4 x half> addrspace(1)* %out
store <4 x half> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, <4 x half> addrspace(1)* %out) {
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] neg_hi:[0,1,0]
@ -327,13 +327,13 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<4 x half> %A, <8 x hal
bb:
%fneg.B = fneg <8 x half> %B
%res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v8f16.v4f16.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x half> %C, i16 %Index)
store <4 x half> %res, <4 x half> addrspace(1)* %out
store <4 x half> %res, ptr addrspace(1) %out
ret void
}
; both neg and abs patterns (wmma matrix C, f32 or f16)
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] neg_hi:[0,0,1]
@ -345,11 +345,11 @@ bb:
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
%fneg.fabs.C = fneg <4 x float> %fabs.C
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.fabs.C)
store <4 x float> %res, <4 x float> addrspace(1)* %out
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] neg_hi:[0,0,1]
@ -361,11 +361,11 @@ bb:
%fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C)
%fneg.fabs.C = fneg <4 x half> %fabs.C
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.fabs.C, i1 0)
store <4 x half> %res, <4 x half> addrspace(1)* %out
store <4 x half> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_and_b32_e32 v7, 0x7fffffff, v7
@ -381,13 +381,13 @@ bb:
%partial.fabs.C = insertelement <4 x float> %C, float %el3.fabs, i32 3
%fneg.partial.fabs.C = fneg <4 x float> %partial.fabs.C
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.partial.fabs.C)
store <4 x float> %res, <4 x float> addrspace(1)* %out
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
; A or B matrix modifier and constant in C
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0]
@ -398,11 +398,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<4 x half> %A,
bb:
%fneg.A = fneg <4 x half> %A
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %fneg.A, <4 x half> %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
store <4 x float> %res, <4 x float> addrspace(1)* %out
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0]
@ -413,13 +413,13 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<4 x half> %A,
bb:
%fneg.B = fneg <4 x half> %B
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
store <4 x half> %res, <4 x half> addrspace(1)* %out
store <4 x half> %res, ptr addrspace(1) %out
ret void
}
; pack f16 elements with v_perm_b32 since they don't come from the same b32
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<4 x half> %A, <4 x half> %B, ptr %Caddr, <4 x half> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<4 x half> %A, <4 x half> %B, ptr %Caddr, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: flat_load_b128 v[8:11], v[4:5]
@ -440,7 +440,7 @@ bb:
%C_shuffle = shufflevector <8 x half> %C, <8 x half> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
%fneg.C_shuffle = fneg <4 x half> %C_shuffle
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.C_shuffle , i1 0)
store <4 x half> %res, <4 x half> addrspace(1)* %out
store <4 x half> %res, ptr addrspace(1) %out
ret void
}

View File

@ -2574,7 +2574,7 @@ define void @test_arg_store_v2bf16(<2 x bfloat> %in, ptr addrspace(1) %out) {
ret void
}
define void @test_arg_store_v3bf16(<3 x bfloat> %in, <3 x bfloat> addrspace(1)* %out) {
define void @test_arg_store_v3bf16(<3 x bfloat> %in, ptr addrspace(1) %out) {
; GCN-LABEL: test_arg_store_v3bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@ -2637,7 +2637,7 @@ define void @test_arg_store_v3bf16(<3 x bfloat> %in, <3 x bfloat> addrspace(1)*
; GFX11-NEXT: global_store_b16 v[2:3], v1, off offset:4
; GFX11-NEXT: global_store_b32 v[2:3], v0, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
store <3 x bfloat> %in, <3 x bfloat> addrspace(1) * %out
store <3 x bfloat> %in, ptr addrspace(1) %out
ret void
}

View File

@ -15,25 +15,25 @@
--- |
define amdgpu_kernel void @long_branch_dbg_value(float addrspace(1)* nocapture %arg, float %arg1) #1 !dbg !5 {
define amdgpu_kernel void @long_branch_dbg_value(ptr addrspace(1) nocapture %arg, float %arg1) #1 !dbg !5 {
bb:
%long_branch_dbg_value.kernarg.segment = call nonnull align 16 dereferenceable(12) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
%arg.kernarg.offset = getelementptr inbounds i8, i8 addrspace(4)* %long_branch_dbg_value.kernarg.segment, i64 0
%arg.kernarg.offset.cast = bitcast i8 addrspace(4)* %arg.kernarg.offset to float addrspace(1)* addrspace(4)*, !amdgpu.uniform !2, !amdgpu.noclobber !2
%arg.load = load float addrspace(1)*, float addrspace(1)* addrspace(4)* %arg.kernarg.offset.cast, align 16, !invariant.load !2
%arg1.kernarg.offset = getelementptr inbounds i8, i8 addrspace(4)* %long_branch_dbg_value.kernarg.segment, i64 8
%arg1.kernarg.offset.cast = bitcast i8 addrspace(4)* %arg1.kernarg.offset to float addrspace(4)*, !amdgpu.uniform !2, !amdgpu.noclobber !2
%arg1.load = load float, float addrspace(4)* %arg1.kernarg.offset.cast, align 8, !invariant.load !2
%long_branch_dbg_value.kernarg.segment = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
%arg.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %long_branch_dbg_value.kernarg.segment, i64 0
%arg.kernarg.offset.cast = bitcast ptr addrspace(4) %arg.kernarg.offset to ptr addrspace(4), !amdgpu.uniform !2, !amdgpu.noclobber !2
%arg.load = load ptr addrspace(1), ptr addrspace(4) %arg.kernarg.offset.cast, align 16, !invariant.load !2
%arg1.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %long_branch_dbg_value.kernarg.segment, i64 8
%arg1.kernarg.offset.cast = bitcast ptr addrspace(4) %arg1.kernarg.offset to ptr addrspace(4), !amdgpu.uniform !2, !amdgpu.noclobber !2
%arg1.load = load float, ptr addrspace(4) %arg1.kernarg.offset.cast, align 8, !invariant.load !2
%tmp = fmul float %arg1.load, %arg1.load
%tmp2 = getelementptr inbounds float, float addrspace(1)* %arg.load, i64 3
call void @llvm.dbg.value(metadata float addrspace(1)* %tmp2, metadata !11, metadata !DIExpression()) #5, !dbg !12
store float %tmp, float addrspace(1)* %tmp2, align 4, !dbg !12
%tmp2 = getelementptr inbounds float, ptr addrspace(1) %arg.load, i64 3
call void @llvm.dbg.value(metadata ptr addrspace(1) %tmp2, metadata !11, metadata !DIExpression()) #5, !dbg !12
store float %tmp, ptr addrspace(1) %tmp2, align 4, !dbg !12
%tmp3 = fcmp olt float %tmp, 0x3810000000000000
%tmp3.inv = xor i1 %tmp3, true
br i1 %tmp3.inv, label %bb4, label %bb8, !amdgpu.uniform !2
bb4: ; preds = %bb
%tmp5 = load volatile float, float addrspace(1)* undef, align 4
%tmp5 = load volatile float, ptr addrspace(1) undef, align 4
%tmp6 = fcmp oeq float %tmp5, 0x7FF0000000000000
br i1 %tmp6, label %bb7, label %Flow, !amdgpu.uniform !2
@ -47,7 +47,7 @@
ret void
}
declare align 4 i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #2
declare align 4 ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() #2
declare void @llvm.dbg.value(metadata, metadata, metadata) #0
attributes #0 = { nounwind readnone speculatable willreturn }
@ -103,7 +103,7 @@ body: |
renamable $sgpr4_sgpr5 = IMPLICIT_DEF
$vgpr0 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr4_sgpr5
$vgpr1 = V_MOV_B32_e32 $sgpr5, implicit $exec, implicit killed $sgpr4_sgpr5, implicit $exec
renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (volatile load (s32) from `float addrspace(1)* undef`, addrspace 1)
renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (volatile load (s32) from `ptr addrspace(1) undef`, addrspace 1)
renamable $sgpr4 = S_MOV_B32 2139095040
S_WAITCNT 3952
renamable $sgpr4_sgpr5 = nofpexcept V_CMP_NEQ_F32_e64 0, killed $sgpr4, 0, killed $vgpr0, 0, implicit $mode, implicit $exec

View File

@ -1385,7 +1385,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias
; The other use of shuffle0_0 makes it profitable to lower into v_perm
define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out1, <4 x i8> addrspace(1)* noalias %in, <4 x i8> addrspace(1)* noalias %in1) nounwind {
define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %out1, ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %in1) nounwind {
; SI-LABEL: load_v4i8_to_v4f32_unaligned_multiuse:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
@ -1547,14 +1547,14 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(<4 x float> add
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid
%gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in1, i32 %tid
%load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 1
%load1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1, align 1
%gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid
%gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
%load = load <4 x i8>, ptr addrspace(1) %gep, align 1
%load1 = load <4 x i8>, ptr addrspace(1) %gep1, align 1
%shuffle0_0 = shufflevector <4 x i8> %load, <4 x i8> %load1, <4 x i32> <i32 3, i32 2, i32 6, i32 2>
%cvt = uitofp <4 x i8> %shuffle0_0 to <4 x float>
store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
store <4 x i8> %shuffle0_0, <4 x i8> addrspace(1)* %out1, align 4
store <4 x float> %cvt, ptr addrspace(1) %out, align 16
store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1, align 4
ret void
}

View File

@ -2,7 +2,7 @@
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -run-pass=machine-scheduler -verify-misched -o - %s | FileCheck -check-prefix=GCN %s
--- |
; Stub kernel whose body is a bare `ret void`. It takes 12 noalias
; addrspace(3) pointers (%in0-%in11) followed by 18 noalias addrspace(7)
; pointers (%in12-%in29).
define amdgpu_kernel void @single-wave-phase-2b(i32 addrspace(3)* noalias %in0, i32 addrspace(3)* noalias %in1, i32 addrspace(3)* noalias %in2, i32 addrspace(3)* noalias %in3, i32 addrspace(3)* noalias %in4, i32 addrspace(3)* noalias %in5, i32 addrspace(3)* noalias %in6, i32 addrspace(3)* noalias %in7, i32 addrspace(3)* noalias %in8, i32 addrspace(3)* noalias %in9, i32 addrspace(3)* noalias %in10, i32 addrspace(3)* noalias %in11, i32 addrspace(7)* noalias %in12, i32 addrspace(7)* noalias %in13, i32 addrspace(7)* noalias %in14, i32 addrspace(7)* noalias %in15, i32 addrspace(7)* noalias %in16, i32 addrspace(7)* noalias %in17, i32 addrspace(7)* noalias %in18, i32 addrspace(7)* noalias %in19, i32 addrspace(7)* noalias %in20, i32 addrspace(7)* noalias %in21, i32 addrspace(7)* noalias %in22, i32 addrspace(7)* noalias %in23, i32 addrspace(7)* noalias %in24, i32 addrspace(7)* noalias %in25, i32 addrspace(7)* noalias %in26, i32 addrspace(7)* noalias %in27, i32 addrspace(7)* noalias %in28, i32 addrspace(7)* noalias %in29) #0 { ret void }
define amdgpu_kernel void @single-wave-phase-2b(ptr addrspace(3) noalias %in0, ptr addrspace(3) noalias %in1, ptr addrspace(3) noalias %in2, ptr addrspace(3) noalias %in3, ptr addrspace(3) noalias %in4, ptr addrspace(3) noalias %in5, ptr addrspace(3) noalias %in6, ptr addrspace(3) noalias %in7, ptr addrspace(3) noalias %in8, ptr addrspace(3) noalias %in9, ptr addrspace(3) noalias %in10, ptr addrspace(3) noalias %in11, ptr addrspace(7) noalias %in12, ptr addrspace(7) noalias %in13, ptr addrspace(7) noalias %in14, ptr addrspace(7) noalias %in15, ptr addrspace(7) noalias %in16, ptr addrspace(7) noalias %in17, ptr addrspace(7) noalias %in18, ptr addrspace(7) noalias %in19, ptr addrspace(7) noalias %in20, ptr addrspace(7) noalias %in21, ptr addrspace(7) noalias %in22, ptr addrspace(7) noalias %in23, ptr addrspace(7) noalias %in24, ptr addrspace(7) noalias %in25, ptr addrspace(7) noalias %in26, ptr addrspace(7) noalias %in27, ptr addrspace(7) noalias %in28, ptr addrspace(7) noalias %in29) #0 { ret void }
!0 = distinct !{!0}
!1 = !{!1, !0}

View File

@ -2,7 +2,7 @@
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -run-pass=machine-scheduler -verify-misched -o - %s | FileCheck -check-prefix=GCN %s
--- |
; Stub kernel whose body is a bare `ret void`. It takes 12 noalias
; addrspace(3) pointers (%in0-%in11) followed by 6 noalias addrspace(7)
; pointers (%in12-%in17).
define amdgpu_kernel void @single-wave-phase-2c(i32 addrspace(3)* noalias %in0, i32 addrspace(3)* noalias %in1, i32 addrspace(3)* noalias %in2, i32 addrspace(3)* noalias %in3, i32 addrspace(3)* noalias %in4, i32 addrspace(3)* noalias %in5, i32 addrspace(3)* noalias %in6, i32 addrspace(3)* noalias %in7, i32 addrspace(3)* noalias %in8, i32 addrspace(3)* noalias %in9, i32 addrspace(3)* noalias %in10, i32 addrspace(3)* noalias %in11, i32 addrspace(7)* noalias %in12, i32 addrspace(7)* noalias %in13, i32 addrspace(7)* noalias %in14, i32 addrspace(7)* noalias %in15, i32 addrspace(7)* noalias %in16, i32 addrspace(7)* noalias %in17) #0 { ret void }
define amdgpu_kernel void @single-wave-phase-2c(ptr addrspace(3) noalias %in0, ptr addrspace(3) noalias %in1, ptr addrspace(3) noalias %in2, ptr addrspace(3) noalias %in3, ptr addrspace(3) noalias %in4, ptr addrspace(3) noalias %in5, ptr addrspace(3) noalias %in6, ptr addrspace(3) noalias %in7, ptr addrspace(3) noalias %in8, ptr addrspace(3) noalias %in9, ptr addrspace(3) noalias %in10, ptr addrspace(3) noalias %in11, ptr addrspace(7) noalias %in12, ptr addrspace(7) noalias %in13, ptr addrspace(7) noalias %in14, ptr addrspace(7) noalias %in15, ptr addrspace(7) noalias %in16, ptr addrspace(7) noalias %in17) #0 { ret void }
!0 = distinct !{!0}

View File

@ -176,10 +176,10 @@ define amdgpu_kernel void @safe_rsq_rcp_pat_f64(ptr addrspace(1) %out, double %s
; SI-NOT: v_rsq_f64_e32
; SI: v_sqrt_f64
; SI: v_rcp_f64
; Computes rcp(sqrt(%src)) on f64 using the target-specific
; llvm.amdgcn.sqrt.f64 and llvm.amdgcn.rcp.f64 intrinsics, then stores the
; result to %out with 8-byte alignment. Attribute group #1 is defined outside
; this excerpt.
define amdgpu_kernel void @safe_amdgcn_sqrt_rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) #1 {
define amdgpu_kernel void @safe_amdgcn_sqrt_rsq_rcp_pat_f64(ptr addrspace(1) %out, double %src) #1 {
%sqrt = call double @llvm.amdgcn.sqrt.f64(double %src)
%rcp = call double @llvm.amdgcn.rcp.f64(double %sqrt)
store double %rcp, double addrspace(1)* %out, align 8
store double %rcp, ptr addrspace(1) %out, align 8
ret void
}
@ -195,10 +195,10 @@ define amdgpu_kernel void @safe_amdgcn_sqrt_rsq_rcp_pat_f64(double addrspace(1)*
; SI: v_fma_f64
; SI: v_rcp_f64
; SI: buffer_store_dwordx2
; Computes rcp(sqrt(%src)) on f64 using the generic llvm.sqrt.f64 intrinsic
; feeding llvm.amdgcn.rcp.f64, then stores the result to %out with 8-byte
; alignment. Attribute group #2 is defined outside this excerpt.
define amdgpu_kernel void @unsafe_rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) #2 {
define amdgpu_kernel void @unsafe_rsq_rcp_pat_f64(ptr addrspace(1) %out, double %src) #2 {
%sqrt = call double @llvm.sqrt.f64(double %src)
%rcp = call double @llvm.amdgcn.rcp.f64(double %sqrt)
store double %rcp, double addrspace(1)* %out, align 8
store double %rcp, ptr addrspace(1) %out, align 8
ret void
}

View File

@ -82,8 +82,8 @@ entry:
; R600: VTX_READ_128
; Copies one <9 x float> value: a single vector load from the addrspace(1)
; pointer %in followed by a single vector store to the addrspace(1) pointer
; %out. No explicit alignment is given on either access.
define amdgpu_kernel void @global_load_v9f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
%tmp0 = load <9 x float>, <9 x float> addrspace(1)* %in
store <9 x float> %tmp0, <9 x float> addrspace(1)* %out
%tmp0 = load <9 x float>, ptr addrspace(1) %in
store <9 x float> %tmp0, ptr addrspace(1) %out
ret void
}
@ -101,8 +101,8 @@ entry:
; R600: VTX_READ_128
; Copies one <10 x float> value: a single vector load from the addrspace(1)
; pointer %in followed by a single vector store to the addrspace(1) pointer
; %out. No explicit alignment is given on either access.
define amdgpu_kernel void @global_load_v10f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
%tmp0 = load <10 x float>, <10 x float> addrspace(1)* %in
store <10 x float> %tmp0, <10 x float> addrspace(1)* %out
%tmp0 = load <10 x float>, ptr addrspace(1) %in
store <10 x float> %tmp0, ptr addrspace(1) %out
ret void
}
@ -122,8 +122,8 @@ entry:
; R600: VTX_READ_128
; Copies one <11 x float> value: a single vector load from the addrspace(1)
; pointer %in followed by a single vector store to the addrspace(1) pointer
; %out. No explicit alignment is given on either access.
define amdgpu_kernel void @global_load_v11f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
%tmp0 = load <11 x float>, <11 x float> addrspace(1)* %in
store <11 x float> %tmp0, <11 x float> addrspace(1)* %out
%tmp0 = load <11 x float>, ptr addrspace(1) %in
store <11 x float> %tmp0, ptr addrspace(1) %out
ret void
}
@ -140,8 +140,8 @@ entry:
; R600: VTX_READ_128
; Copies one <12 x float> value: a single vector load from the addrspace(1)
; pointer %in followed by a single vector store to the addrspace(1) pointer
; %out. No explicit alignment is given on either access.
define amdgpu_kernel void @global_load_v12f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
entry:
%tmp0 = load <12 x float>, <12 x float> addrspace(1)* %in
store <12 x float> %tmp0, <12 x float> addrspace(1)* %out
%tmp0 = load <12 x float>, ptr addrspace(1) %in
store <12 x float> %tmp0, ptr addrspace(1) %out
ret void
}

View File

@ -6,13 +6,13 @@
; Check a constructor that's an alias, and an integer literal.
; Each element is { i32 priority, ptr function, ptr associated data } (field
; meanings per the LLVM LangRef). Both entries use priority 1 and null data;
; the function fields are the alias @foo.alias and the literal address 4096
; converted with inttoptr.
@llvm.global_ctors = appending addrspace(1) global [2 x { i32, ptr, ptr }] [
{ i32, ptr, ptr } { i32 1, ptr @foo.alias, i8* null },
{ i32, ptr, ptr } { i32 1, ptr inttoptr (i64 4096 to ptr), i8* null }
{ i32, ptr, ptr } { i32 1, ptr @foo.alias, ptr null },
{ i32, ptr, ptr } { i32 1, ptr inttoptr (i64 4096 to ptr), ptr null }
]
; Check a constantexpr addrspacecast
; Single destructor entry with priority 1 and null associated data (field
; meanings per the LLVM LangRef). The function field is @bar, an addrspace(1)
; pointer, cast to the default address space with a constant-expression
; addrspacecast.
@llvm.global_dtors = appending addrspace(1) global [1 x { i32, ptr, ptr }] [
{ i32, ptr, ptr } { i32 1, ptr addrspacecast (ptr addrspace(1) @bar to ptr), i8* null }
{ i32, ptr, ptr } { i32 1, ptr addrspacecast (ptr addrspace(1) @bar to ptr), ptr null }
]
@foo.alias = hidden alias void (), ptr @foo

View File

@ -1,30 +1,30 @@
# RUN: llc -mtriple=amdgcn -mcpu=gfx803 -run-pass si-memory-legalizer %s -o - | FileCheck %s
--- |
define amdgpu_kernel void @multiple_mem_operands(i32 addrspace(1)* %out, i32 %cond, i32 %if_offset, i32 %else_offset) #0 {
define amdgpu_kernel void @multiple_mem_operands(ptr addrspace(1) %out, i32 %cond, i32 %if_offset, i32 %else_offset) #0 {
entry:
%scratch0 = alloca [8192 x i32], addrspace(5)
%scratch1 = alloca [8192 x i32], addrspace(5)
%scratchptr01 = bitcast [8192 x i32] addrspace(5)* %scratch0 to i32 addrspace(5)*
store i32 1, i32 addrspace(5)* %scratchptr01
%scratchptr12 = bitcast [8192 x i32] addrspace(5)* %scratch1 to i32 addrspace(5)*
store i32 2, i32 addrspace(5)* %scratchptr12
%scratchptr01 = bitcast ptr addrspace(5) %scratch0 to ptr addrspace(5)
store i32 1, ptr addrspace(5) %scratchptr01
%scratchptr12 = bitcast ptr addrspace(5) %scratch1 to ptr addrspace(5)
store i32 2, ptr addrspace(5) %scratchptr12
%cmp = icmp eq i32 %cond, 0
br i1 %cmp, label %if, label %else, !structurizecfg.uniform !0, !amdgpu.uniform !0
if: ; preds = %entry
%if_ptr = getelementptr [8192 x i32], [8192 x i32] addrspace(5)* %scratch0, i32 0, i32 %if_offset, !amdgpu.uniform !0
%if_value = load i32, i32 addrspace(5)* %if_ptr, align 4, !nontemporal !1
%if_ptr = getelementptr [8192 x i32], ptr addrspace(5) %scratch0, i32 0, i32 %if_offset, !amdgpu.uniform !0
%if_value = load i32, ptr addrspace(5) %if_ptr, align 4, !nontemporal !1
br label %done, !structurizecfg.uniform !0
else: ; preds = %entry
%else_ptr = getelementptr [8192 x i32], [8192 x i32] addrspace(5)* %scratch1, i32 0, i32 %else_offset, !amdgpu.uniform !0
%else_value = load i32, i32 addrspace(5)* %else_ptr, align 4, !nontemporal !1
%else_ptr = getelementptr [8192 x i32], ptr addrspace(5) %scratch1, i32 0, i32 %else_offset, !amdgpu.uniform !0
%else_value = load i32, ptr addrspace(5) %else_ptr, align 4, !nontemporal !1
br label %done, !structurizecfg.uniform !0
done: ; preds = %else, %if
%value = phi i32 [ %if_value, %if ], [ %else_value, %else ]
store i32 %value, i32 addrspace(1)* %out
store i32 %value, ptr addrspace(1) %out
ret void
}
@ -110,9 +110,9 @@ body: |
successors: %bb.1.if(0x30000000), %bb.2.else(0x50000000)
liveins: $sgpr0_sgpr1, $sgpr3
$sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 44, 0 :: (non-temporal dereferenceable invariant load (s32) from `i32 addrspace(4)* undef`)
$sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 44, 0 :: (non-temporal dereferenceable invariant load (s32) from `ptr addrspace(4) undef`)
$sgpr8 = S_MOV_B32 &SCRATCH_RSRC_DWORD0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
$sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 36, 0 :: (non-temporal dereferenceable invariant load (s64) from `i64 addrspace(4)* undef`)
$sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 36, 0 :: (non-temporal dereferenceable invariant load (s64) from `ptr addrspace(4) undef`)
$sgpr9 = S_MOV_B32 &SCRATCH_RSRC_DWORD1, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
$sgpr10 = S_MOV_B32 4294967295, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
$sgpr11 = S_MOV_B32 15204352, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
@ -130,7 +130,7 @@ body: |
successors: %bb.3.done(0x80000000)
liveins: $sgpr0_sgpr1, $sgpr4_sgpr5, $sgpr3, $sgpr8_sgpr9_sgpr10_sgpr11
$sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 52, 0 :: (non-temporal dereferenceable invariant load (s32) from `i32 addrspace(4)* undef`)
$sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 52, 0 :: (non-temporal dereferenceable invariant load (s32) from `ptr addrspace(4) undef`)
S_WAITCNT 3855
$vgpr0 = V_MOV_B32_e32 32772, implicit $exec
S_BRANCH %bb.3.done
@ -139,7 +139,7 @@ body: |
successors: %bb.3.done(0x80000000)
liveins: $sgpr0_sgpr1, $sgpr4_sgpr5, $sgpr3, $sgpr8_sgpr9_sgpr10_sgpr11
$sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 48, 0 :: (non-temporal dereferenceable invariant load (s32) from `i32 addrspace(4)* undef`)
$sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 48, 0 :: (non-temporal dereferenceable invariant load (s32) from `ptr addrspace(4) undef`)
S_WAITCNT 3855
$vgpr0 = V_MOV_B32_e32 4, implicit $exec

View File

@ -2,30 +2,30 @@
--- |
define amdgpu_kernel void @multiple_mem_operands(i32 addrspace(1)* %out, i32 %cond, i32 %if_offset, i32 %else_offset) #0 {
define amdgpu_kernel void @multiple_mem_operands(ptr addrspace(1) %out, i32 %cond, i32 %if_offset, i32 %else_offset) #0 {
entry:
%scratch0 = alloca [8192 x i32], addrspace(5)
%scratch1 = alloca [8192 x i32], addrspace(5)
%scratchptr01 = bitcast [8192 x i32] addrspace(5)* %scratch0 to i32 addrspace(5)*
store i32 1, i32 addrspace(5)* %scratchptr01
%scratchptr12 = bitcast [8192 x i32] addrspace(5)* %scratch1 to i32 addrspace(5)*
store i32 2, i32 addrspace(5)* %scratchptr12
%scratchptr01 = bitcast ptr addrspace(5) %scratch0 to ptr addrspace(5)
store i32 1, ptr addrspace(5) %scratchptr01
%scratchptr12 = bitcast ptr addrspace(5) %scratch1 to ptr addrspace(5)
store i32 2, ptr addrspace(5) %scratchptr12
%cmp = icmp eq i32 %cond, 0
br i1 %cmp, label %if, label %else, !structurizecfg.uniform !0, !amdgpu.uniform !0
if: ; preds = %entry
%if_ptr = getelementptr [8192 x i32], [8192 x i32] addrspace(5)* %scratch0, i32 0, i32 %if_offset, !amdgpu.uniform !0
%if_value = load i32, i32 addrspace(5)* %if_ptr, align 4, !nontemporal !1
%if_ptr = getelementptr [8192 x i32], ptr addrspace(5) %scratch0, i32 0, i32 %if_offset, !amdgpu.uniform !0
%if_value = load i32, ptr addrspace(5) %if_ptr, align 4, !nontemporal !1
br label %done, !structurizecfg.uniform !0
else: ; preds = %entry
%else_ptr = getelementptr [8192 x i32], [8192 x i32] addrspace(5)* %scratch1, i32 0, i32 %else_offset, !amdgpu.uniform !0
%else_value = load i32, i32 addrspace(5)* %else_ptr, align 4
%else_ptr = getelementptr [8192 x i32], ptr addrspace(5) %scratch1, i32 0, i32 %else_offset, !amdgpu.uniform !0
%else_value = load i32, ptr addrspace(5) %else_ptr, align 4
br label %done, !structurizecfg.uniform !0
done: ; preds = %else, %if
%value = phi i32 [ %if_value, %if ], [ %else_value, %else ]
store i32 %value, i32 addrspace(1)* %out
store i32 %value, ptr addrspace(1) %out
ret void
}
@ -90,9 +90,9 @@ body: |
successors: %bb.1.if(0x30000000), %bb.2.else(0x50000000)
liveins: $sgpr0_sgpr1, $sgpr3
$sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 44, 0 :: (non-temporal dereferenceable invariant load (s32) from `i32 addrspace(4)* undef`)
$sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 44, 0 :: (non-temporal dereferenceable invariant load (s32) from `ptr addrspace(4) undef`)
$sgpr8 = S_MOV_B32 &SCRATCH_RSRC_DWORD0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
$sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 36, 0 :: (non-temporal dereferenceable invariant load (s64) from `i64 addrspace(4)* undef`)
$sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 36, 0 :: (non-temporal dereferenceable invariant load (s64) from `ptr addrspace(4) undef`)
$sgpr9 = S_MOV_B32 &SCRATCH_RSRC_DWORD1, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
$sgpr10 = S_MOV_B32 4294967295, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
$sgpr11 = S_MOV_B32 15204352, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
@ -110,7 +110,7 @@ body: |
successors: %bb.3.done(0x80000000)
liveins: $sgpr0_sgpr1, $sgpr4_sgpr5, $sgpr3, $sgpr8_sgpr9_sgpr10_sgpr11
$sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 52, 0 :: (non-temporal dereferenceable invariant load (s32) from `i32 addrspace(4)* undef`)
$sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 52, 0 :: (non-temporal dereferenceable invariant load (s32) from `ptr addrspace(4) undef`)
S_WAITCNT 3855
$vgpr0 = V_MOV_B32_e32 32772, implicit $exec
S_BRANCH %bb.3.done
@ -119,7 +119,7 @@ body: |
successors: %bb.3.done(0x80000000)
liveins: $sgpr0_sgpr1, $sgpr4_sgpr5, $sgpr3, $sgpr8_sgpr9_sgpr10_sgpr11
$sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 48, 0 :: (non-temporal dereferenceable invariant load (s32) from `i32 addrspace(4)* undef`)
$sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 48, 0 :: (non-temporal dereferenceable invariant load (s32) from `ptr addrspace(4) undef`)
S_WAITCNT 3855
$vgpr0 = V_MOV_B32_e32 4, implicit $exec

View File

@ -13,8 +13,8 @@ body: |
; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[FLAT_LOAD_DWORDX2_]].sub1
; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]]
%0:vreg_64_align2 = IMPLICIT_DEF
%1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
%2:vgpr_32 = FLAT_LOAD_DWORD %0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
%1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4)
%2:vgpr_32 = FLAT_LOAD_DWORD %0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4)
S_NOP 0, implicit %1, implicit %2
...
@ -32,9 +32,9 @@ body: |
; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[COPY]].sub1
; GCN-NEXT: S_NOP 0, implicit [[COPY2]], implicit [[COPY3]], implicit [[COPY1]]
%0:vreg_64_align2 = IMPLICIT_DEF
%1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 1, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
%2:vgpr_32 = FLAT_LOAD_DWORD %0, 4, 1, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
%3:vgpr_32 = FLAT_LOAD_DWORD %0, 8, 1, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
%1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 1, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4)
%2:vgpr_32 = FLAT_LOAD_DWORD %0, 4, 1, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4)
%3:vgpr_32 = FLAT_LOAD_DWORD %0, 8, 1, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4)
S_NOP 0, implicit %1, implicit %2, implicit %3
...
@ -54,10 +54,10 @@ body: |
; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[COPY2]].sub1
; GCN-NEXT: S_NOP 0, implicit [[COPY4]], implicit [[COPY5]], implicit [[COPY3]], implicit [[COPY1]]
%0:vreg_64_align2 = IMPLICIT_DEF
%1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 2, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
%2:vgpr_32 = FLAT_LOAD_DWORD %0, 4, 2, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
%3:vgpr_32 = FLAT_LOAD_DWORD %0, 8, 2, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
%4:vgpr_32 = FLAT_LOAD_DWORD %0, 12, 2, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
%1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 2, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4)
%2:vgpr_32 = FLAT_LOAD_DWORD %0, 4, 2, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4)
%3:vgpr_32 = FLAT_LOAD_DWORD %0, 8, 2, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4)
%4:vgpr_32 = FLAT_LOAD_DWORD %0, 12, 2, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4)
S_NOP 0, implicit %1, implicit %2, implicit %3, implicit %4
...
@ -78,11 +78,11 @@ body: |
; GCN-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[DEF]], 16, 3, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`)
; GCN-NEXT: S_NOP 0, implicit [[COPY4]], implicit [[COPY5]], implicit [[COPY3]], implicit [[COPY1]], implicit [[FLAT_LOAD_DWORD]]
%0:vreg_64_align2 = IMPLICIT_DEF
%1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 3, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
%2:vgpr_32 = FLAT_LOAD_DWORD %0, 4, 3, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
%3:vgpr_32 = FLAT_LOAD_DWORD %0, 8, 3, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
%4:vgpr_32 = FLAT_LOAD_DWORD %0, 12, 3, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
%5:vgpr_32 = FLAT_LOAD_DWORD %0, 16, 3, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
%1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 3, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4)
%2:vgpr_32 = FLAT_LOAD_DWORD %0, 4, 3, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4)
%3:vgpr_32 = FLAT_LOAD_DWORD %0, 8, 3, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4)
%4:vgpr_32 = FLAT_LOAD_DWORD %0, 12, 3, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4)
%5:vgpr_32 = FLAT_LOAD_DWORD %0, 16, 3, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4)
S_NOP 0, implicit %1, implicit %2, implicit %3, implicit %4, implicit %5
...
@ -105,12 +105,12 @@ body: |
; GCN-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[FLAT_LOAD_DWORDX2_]].sub1
; GCN-NEXT: S_NOP 0, implicit [[COPY4]], implicit [[COPY5]], implicit [[COPY3]], implicit [[COPY1]], implicit [[COPY6]], implicit [[COPY7]]
%0:vreg_64_align2 = IMPLICIT_DEF
%1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
%2:vgpr_32 = FLAT_LOAD_DWORD %0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
%3:vgpr_32 = FLAT_LOAD_DWORD %0, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
%4:vgpr_32 = FLAT_LOAD_DWORD %0, 12, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
%5:vgpr_32 = FLAT_LOAD_DWORD %0, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
%6:vgpr_32 = FLAT_LOAD_DWORD %0, 20, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
%1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4)
%2:vgpr_32 = FLAT_LOAD_DWORD %0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4)
%3:vgpr_32 = FLAT_LOAD_DWORD %0, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4)
%4:vgpr_32 = FLAT_LOAD_DWORD %0, 12, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4)
%5:vgpr_32 = FLAT_LOAD_DWORD %0, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4)
%6:vgpr_32 = FLAT_LOAD_DWORD %0, 20, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4)
S_NOP 0, implicit %1, implicit %2, implicit %3, implicit %4, implicit %5, implicit %6
...
@ -126,8 +126,8 @@ body: |
; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY killed [[FLAT_LOAD_DWORDX4_]].sub2_sub3
; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]]
%0:vreg_64_align2 = IMPLICIT_DEF
%1:vreg_64_align2 = FLAT_LOAD_DWORDX2 %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s64) from `i64* undef`, align 4)
%2:vreg_64_align2 = FLAT_LOAD_DWORDX2 %0, 8, 0, implicit $exec, implicit $flat_scr :: (load (s64) from `i64* undef`, align 4)
%1:vreg_64_align2 = FLAT_LOAD_DWORDX2 %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s64) from `ptr undef`, align 4)
%2:vreg_64_align2 = FLAT_LOAD_DWORDX2 %0, 8, 0, implicit $exec, implicit $flat_scr :: (load (s64) from `ptr undef`, align 4)
S_NOP 0, implicit %1, implicit %2
...
@ -143,8 +143,8 @@ body: |
; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[FLAT_LOAD_DWORDX4_]].sub3
; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]]
%0:vreg_64_align2 = IMPLICIT_DEF
%1:vreg_96_align2 = FLAT_LOAD_DWORDX3 %0, 12, 0, implicit $exec, implicit $flat_scr :: (load (s96) from `i128* undef`, align 8)
%2:vgpr_32 = FLAT_LOAD_DWORD %0, 24, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
%1:vreg_96_align2 = FLAT_LOAD_DWORDX3 %0, 12, 0, implicit $exec, implicit $flat_scr :: (load (s96) from `ptr undef`, align 8)
%2:vgpr_32 = FLAT_LOAD_DWORD %0, 24, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4)
S_NOP 0, implicit %1, implicit %2
...
@ -160,8 +160,8 @@ body: |
; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY killed [[FLAT_LOAD_DWORDX3_]].sub1_sub2
; GCN-NEXT: S_NOP 0, implicit [[COPY1]], implicit [[COPY]]
%0:vreg_64_align2 = IMPLICIT_DEF
%2:vgpr_32 = FLAT_LOAD_DWORD %0, 12, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
%1:vreg_64_align2 = FLAT_LOAD_DWORDX2 %0, 16, 0, implicit $exec, implicit $flat_scr :: (load (s64) from `i64* undef`, align 8)
%2:vgpr_32 = FLAT_LOAD_DWORD %0, 12, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4)
%1:vreg_64_align2 = FLAT_LOAD_DWORDX2 %0, 16, 0, implicit $exec, implicit $flat_scr :: (load (s64) from `ptr undef`, align 8)
S_NOP 0, implicit %1, implicit %2
...
@ -176,8 +176,8 @@ body: |
; GCN-NEXT: [[FLAT_LOAD_DWORD1:%[0-9]+]]:agpr_32 = FLAT_LOAD_DWORD [[DEF]], 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`)
; GCN-NEXT: S_NOP 0, implicit [[FLAT_LOAD_DWORD]], implicit [[FLAT_LOAD_DWORD1]]
%0:vreg_64_align2 = IMPLICIT_DEF
%1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
%2:agpr_32 = FLAT_LOAD_DWORD %0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
%1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4)
%2:agpr_32 = FLAT_LOAD_DWORD %0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4)
S_NOP 0, implicit %1, implicit %2
...
@ -192,8 +192,8 @@ body: |
; GCN-NEXT: [[FLAT_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[DEF]], 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`)
; GCN-NEXT: S_NOP 0, implicit [[FLAT_LOAD_DWORD]], implicit [[FLAT_LOAD_DWORD1]]
%0:vreg_64_align2 = IMPLICIT_DEF
%1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
%2:vgpr_32 = FLAT_LOAD_DWORD %0, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
%1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4)
%2:vgpr_32 = FLAT_LOAD_DWORD %0, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4)
S_NOP 0, implicit %1, implicit %2
...
@ -208,8 +208,8 @@ body: |
; GCN-NEXT: [[FLAT_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[DEF]], 3, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`)
; GCN-NEXT: S_NOP 0, implicit [[FLAT_LOAD_DWORD]], implicit [[FLAT_LOAD_DWORD1]]
%0:vreg_64_align2 = IMPLICIT_DEF
%1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
%2:vgpr_32 = FLAT_LOAD_DWORD %0, 3, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`)
%1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4)
%2:vgpr_32 = FLAT_LOAD_DWORD %0, 3, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`)
S_NOP 0, implicit %1, implicit %2
...
@ -224,8 +224,8 @@ body: |
; GCN-NEXT: [[FLAT_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[DEF]], 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`)
; GCN-NEXT: S_NOP 0, implicit [[FLAT_LOAD_DWORD]], implicit [[FLAT_LOAD_DWORD1]]
%0:vreg_64_align2 = IMPLICIT_DEF
%1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 1, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
%2:vgpr_32 = FLAT_LOAD_DWORD %0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
%1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 1, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4)
%2:vgpr_32 = FLAT_LOAD_DWORD %0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4)
S_NOP 0, implicit %1, implicit %2
...
@ -243,8 +243,8 @@ body: |
%0:vreg_64_align2 = IMPLICIT_DEF
%1:vgpr_32 = IMPLICIT_DEF
%2:vgpr_32 = IMPLICIT_DEF
FLAT_STORE_DWORD %0, killed %1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
FLAT_STORE_DWORD killed %0, killed %2, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
FLAT_STORE_DWORD %0, killed %1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4)
FLAT_STORE_DWORD killed %0, killed %2, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4)
...
---
@ -264,9 +264,9 @@ body: |
%1:vgpr_32 = IMPLICIT_DEF
%2:vgpr_32 = IMPLICIT_DEF
%3:vgpr_32 = IMPLICIT_DEF
FLAT_STORE_DWORD %0, killed %1, 4, 1, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
FLAT_STORE_DWORD %0, killed %2, 8, 1, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
FLAT_STORE_DWORD killed %0, killed %3, 12, 1, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
FLAT_STORE_DWORD %0, killed %1, 4, 1, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4)
FLAT_STORE_DWORD %0, killed %2, 8, 1, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4)
FLAT_STORE_DWORD killed %0, killed %3, 12, 1, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4)
...
---
@ -283,10 +283,10 @@ body: |
; GCN-NEXT: FLAT_STORE_DWORDX4 [[DEF]], killed [[REG_SEQUENCE2]], 4, 2, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr undef`, align 4)
%0:vreg_64_align2 = IMPLICIT_DEF
%1:vreg_128 = IMPLICIT_DEF
FLAT_STORE_DWORD %0, %1.sub1, 8, 2, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
FLAT_STORE_DWORD %0, %1.sub2, 12, 2, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
FLAT_STORE_DWORD %0, %1.sub3, 16, 2, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
FLAT_STORE_DWORD killed %0, %1.sub0, 4, 2, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
FLAT_STORE_DWORD %0, %1.sub1, 8, 2, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4)
FLAT_STORE_DWORD %0, %1.sub2, 12, 2, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4)
FLAT_STORE_DWORD %0, %1.sub3, 16, 2, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4)
FLAT_STORE_DWORD killed %0, %1.sub0, 4, 2, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4)
...
---
@ -312,11 +312,11 @@ body: |
%3:agpr_32 = IMPLICIT_DEF
%4:agpr_32 = IMPLICIT_DEF
%5:agpr_32 = IMPLICIT_DEF
FLAT_STORE_DWORD %0, %1, 4, 3, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
FLAT_STORE_DWORD %0, %2, 8, 3, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 8)
FLAT_STORE_DWORD %0, %3, 12, 3, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
FLAT_STORE_DWORD %0, %4, 16, 3, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
FLAT_STORE_DWORD %0, %5, 20, 3, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
FLAT_STORE_DWORD %0, %1, 4, 3, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4)
FLAT_STORE_DWORD %0, %2, 8, 3, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 8)
FLAT_STORE_DWORD %0, %3, 12, 3, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4)
FLAT_STORE_DWORD %0, %4, 16, 3, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4)
FLAT_STORE_DWORD %0, %5, 20, 3, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4)
...
---
@ -345,12 +345,12 @@ body: |
%4:vgpr_32 = IMPLICIT_DEF
%5:vgpr_32 = IMPLICIT_DEF
%6:vgpr_32 = IMPLICIT_DEF
FLAT_STORE_DWORD %0, %1, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 8)
FLAT_STORE_DWORD %0, %2, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
FLAT_STORE_DWORD %0, %3, 12, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
FLAT_STORE_DWORD %0, %4, 16, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
FLAT_STORE_DWORD %0, %5, 20, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
FLAT_STORE_DWORD %0, %6, 24, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
FLAT_STORE_DWORD %0, %1, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 8)
FLAT_STORE_DWORD %0, %2, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4)
FLAT_STORE_DWORD %0, %3, 12, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4)
FLAT_STORE_DWORD %0, %4, 16, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4)
FLAT_STORE_DWORD %0, %5, 20, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4)
FLAT_STORE_DWORD %0, %6, 24, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4)
...
---
@ -367,8 +367,8 @@ body: |
%0:vreg_64_align2 = IMPLICIT_DEF
%1:vreg_64_align2 = IMPLICIT_DEF
%2:vreg_64_align2 = IMPLICIT_DEF
FLAT_STORE_DWORDX2 %0, killed %1, 4, 0, implicit $exec, implicit $flat_scr :: (store (s64) into `i64* undef`, align 4)
FLAT_STORE_DWORDX2 %0, killed %2, 12, 0, implicit $exec, implicit $flat_scr :: (store (s64) into `i64* undef`, align 4)
FLAT_STORE_DWORDX2 %0, killed %1, 4, 0, implicit $exec, implicit $flat_scr :: (store (s64) into `ptr undef`, align 4)
FLAT_STORE_DWORDX2 %0, killed %2, 12, 0, implicit $exec, implicit $flat_scr :: (store (s64) into `ptr undef`, align 4)
...
---
@ -385,8 +385,8 @@ body: |
%0:vreg_64_align2 = IMPLICIT_DEF
%1:vreg_96_align2 = IMPLICIT_DEF
%2:vgpr_32 = IMPLICIT_DEF
FLAT_STORE_DWORDX3 %0, killed %1, 4, 0, implicit $exec, implicit $flat_scr :: (store (s96) into `i64* undef`, align 16)
FLAT_STORE_DWORD %0, killed %2, 16, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
FLAT_STORE_DWORDX3 %0, killed %1, 4, 0, implicit $exec, implicit $flat_scr :: (store (s96) into `ptr undef`, align 16)
FLAT_STORE_DWORD %0, killed %2, 16, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4)
...
---
@ -403,8 +403,8 @@ body: |
%0:vreg_64_align2 = IMPLICIT_DEF
%1:agpr_32 = IMPLICIT_DEF
%2:vgpr_32 = IMPLICIT_DEF
FLAT_STORE_DWORD %0, killed %1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
FLAT_STORE_DWORD killed %0, killed %2, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
FLAT_STORE_DWORD %0, killed %1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4)
FLAT_STORE_DWORD killed %0, killed %2, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4)
...
---
@ -421,8 +421,8 @@ body: |
%0:vreg_64_align2 = IMPLICIT_DEF
%1:vgpr_32 = IMPLICIT_DEF
%2:vgpr_32 = IMPLICIT_DEF
FLAT_STORE_DWORD %0, killed %1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
FLAT_STORE_DWORD killed %0, killed %2, 6, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
FLAT_STORE_DWORD %0, killed %1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4)
FLAT_STORE_DWORD killed %0, killed %2, 6, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4)
...
---
@ -439,8 +439,8 @@ body: |
%0:vreg_64_align2 = IMPLICIT_DEF
%1:vgpr_32 = IMPLICIT_DEF
%2:vgpr_32 = IMPLICIT_DEF
FLAT_STORE_DWORD %0, killed %1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
FLAT_STORE_DWORD killed %0, killed %2, 2, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 2)
FLAT_STORE_DWORD %0, killed %1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4)
FLAT_STORE_DWORD killed %0, killed %2, 2, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 2)
...
---
@ -457,8 +457,8 @@ body: |
%0:vreg_64_align2 = IMPLICIT_DEF
%1:vgpr_32 = IMPLICIT_DEF
%2:vgpr_32 = IMPLICIT_DEF
FLAT_STORE_DWORD %0, killed %1, 0, 1, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
FLAT_STORE_DWORD killed %0, killed %2, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
FLAT_STORE_DWORD %0, killed %1, 0, 1, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4)
FLAT_STORE_DWORD killed %0, killed %2, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4)
...
---
@ -475,6 +475,6 @@ body: |
%0:vreg_128_align2 = IMPLICIT_DEF
%1:vgpr_32 = IMPLICIT_DEF
%2:vgpr_32 = IMPLICIT_DEF
FLAT_STORE_DWORD %0.sub0_sub1, killed %1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
FLAT_STORE_DWORD %0.sub2_sub3, killed %2, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
FLAT_STORE_DWORD %0.sub0_sub1, killed %1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4)
FLAT_STORE_DWORD %0.sub2_sub3, killed %2, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4)
...

View File

@ -13,8 +13,8 @@ body: |
; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[FLAT_LOAD_DWORDX2_]].sub1
; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]]
%0:vreg_64_align2 = IMPLICIT_DEF
%1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `float* undef` + 4, basealign 4)
%2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 8, basealign 4, addrspace 1)
%1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef` + 4, basealign 4)
%2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef` + 8, basealign 4, addrspace 1)
S_NOP 0, implicit %1, implicit %2
...
@ -30,8 +30,8 @@ body: |
; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[FLAT_LOAD_DWORDX2_]].sub1
; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]]
%0:vreg_64_align2 = IMPLICIT_DEF
%1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `float addrspace(1)* undef`, basealign 8, addrspace 1)
%2:vgpr_32 = FLAT_LOAD_DWORD %0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef` + 4, basealign 8)
%1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, basealign 8, addrspace 1)
%2:vgpr_32 = FLAT_LOAD_DWORD %0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef` + 4, basealign 8)
S_NOP 0, implicit %1, implicit %2
...
@ -49,9 +49,9 @@ body: |
; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[COPY]].sub0
; GCN-NEXT: S_NOP 0, implicit [[COPY2]], implicit [[COPY3]]
%0:vreg_64_align2 = IMPLICIT_DEF
%1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 4, basealign 8, addrspace 1)
%2:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `float* undef`, basealign 16)
%3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 8, basealign 8, addrspace 1)
%1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef` + 4, basealign 8, addrspace 1)
%2:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, basealign 16)
%3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef` + 8, basealign 8, addrspace 1)
S_NOP 0, implicit %1, implicit %2
...
@ -71,10 +71,10 @@ body: |
; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[COPY2]].sub1
; GCN-NEXT: S_NOP 0, implicit [[COPY4]], implicit [[COPY5]], implicit [[COPY3]], implicit [[COPY1]]
%0:vreg_64_align2 = IMPLICIT_DEF
%1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 4, basealign 8, addrspace 1)
%2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 8, basealign 8, addrspace 1)
%3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 12, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 12, basealign 8, addrspace 1)
%4:vgpr_32 = FLAT_LOAD_DWORD %0, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `float* undef` + 16)
%1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef` + 4, basealign 8, addrspace 1)
%2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef` + 8, basealign 8, addrspace 1)
%3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 12, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef` + 12, basealign 8, addrspace 1)
%4:vgpr_32 = FLAT_LOAD_DWORD %0, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef` + 16)
S_NOP 0, implicit %1, implicit %2, implicit %3, implicit %4
...
@ -90,8 +90,8 @@ body: |
; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY killed [[FLAT_LOAD_DWORDX4_]].sub2_sub3
; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]]
%0:vreg_64_align2 = IMPLICIT_DEF
%1:vreg_64_align2 = FLAT_LOAD_DWORDX2 %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s64) from `double* undef`)
%2:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 %0, 8, 0, implicit $exec :: (load (s64) from `i64 addrspace(1)* undef`, addrspace 1)
%1:vreg_64_align2 = FLAT_LOAD_DWORDX2 %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s64) from `ptr undef`)
%2:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 %0, 8, 0, implicit $exec :: (load (s64) from `ptr addrspace(1) undef`, addrspace 1)
S_NOP 0, implicit %1, implicit %2
...
@ -107,8 +107,8 @@ body: |
; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_96_align2 = COPY killed [[FLAT_LOAD_DWORDX4_]].sub1_sub2_sub3
; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]]
%0:vreg_64_align2 = IMPLICIT_DEF
%1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `float* undef`)
%2:vreg_96_align2 = GLOBAL_LOAD_DWORDX3 %0, 4, 0, implicit $exec :: (load (s96) from `<3 x i32> addrspace(1)* undef`, addrspace 1)
%1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`)
%2:vreg_96_align2 = GLOBAL_LOAD_DWORDX3 %0, 4, 0, implicit $exec :: (load (s96) from `ptr addrspace(1) undef`, addrspace 1)
S_NOP 0, implicit %1, implicit %2
...
@ -124,8 +124,8 @@ body: |
; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_96_align2 = COPY killed [[FLAT_LOAD_DWORDX4_]].sub1_sub2_sub3
; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]]
%0:vreg_64_align2 = IMPLICIT_DEF
%1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, addrspace 1)
%2:vreg_96_align2 = FLAT_LOAD_DWORDX3 %0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s96) from `<3 x i32>* undef`)
%1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, addrspace 1)
%2:vreg_96_align2 = FLAT_LOAD_DWORDX3 %0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s96) from `ptr undef`)
S_NOP 0, implicit %1, implicit %2
...
@ -144,9 +144,9 @@ body: |
; GCN-NEXT: S_NOP 0, implicit [[FLAT_LOAD_DWORD]], implicit [[COPY]], implicit [[COPY1]]
%0:vreg_64_align2 = IMPLICIT_DEF
%1:sreg_64_xexec = IMPLICIT_DEF
%2:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `float* undef`, basealign 4)
%3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %1, %0.sub0, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 4, basealign 4, addrspace 1)
%4:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %1, %0.sub0, 8, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 8, basealign 4, addrspace 1)
%2:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, basealign 4)
%3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %1, %0.sub0, 4, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef` + 4, basealign 4, addrspace 1)
%4:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %1, %0.sub0, 8, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef` + 8, basealign 4, addrspace 1)
S_NOP 0, implicit %2, implicit %3, implicit %4
...
@ -165,9 +165,9 @@ body: |
; GCN-NEXT: S_NOP 0, implicit [[GLOBAL_LOAD_DWORD_SADDR]], implicit [[COPY]], implicit [[COPY1]]
%0:vreg_64_align2 = IMPLICIT_DEF
%1:sreg_64_xexec = IMPLICIT_DEF
%2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %1, %0.sub0, 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, addrspace 1)
%3:vgpr_32 = FLAT_LOAD_DWORD %0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef` + 4)
%4:vgpr_32 = FLAT_LOAD_DWORD %0, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef` + 8)
%2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %1, %0.sub0, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, addrspace 1)
%3:vgpr_32 = FLAT_LOAD_DWORD %0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef` + 4)
%4:vgpr_32 = FLAT_LOAD_DWORD %0, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef` + 8)
S_NOP 0, implicit %2, implicit %3, implicit %4
...
@ -184,8 +184,8 @@ body: |
%0:vreg_64_align2 = IMPLICIT_DEF
%1:vgpr_32 = IMPLICIT_DEF
%2:vgpr_32 = IMPLICIT_DEF
FLAT_STORE_DWORD %0, killed %1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`)
GLOBAL_STORE_DWORD killed %0, killed %2, 4, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1)
FLAT_STORE_DWORD %0, killed %1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`)
GLOBAL_STORE_DWORD killed %0, killed %2, 4, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
...
---
@ -201,8 +201,8 @@ body: |
%0:vreg_64_align2 = IMPLICIT_DEF
%1:vgpr_32 = IMPLICIT_DEF
%2:vgpr_32 = IMPLICIT_DEF
GLOBAL_STORE_DWORD %0, killed %1, 0, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1)
FLAT_STORE_DWORD %0, killed %2, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`)
GLOBAL_STORE_DWORD %0, killed %1, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
FLAT_STORE_DWORD %0, killed %2, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`)
...
---
@ -218,8 +218,8 @@ body: |
%0:vreg_64_align2 = IMPLICIT_DEF
%1:vgpr_32 = IMPLICIT_DEF
%2:vreg_64_align2 = IMPLICIT_DEF
FLAT_STORE_DWORD %0, %1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`)
GLOBAL_STORE_DWORDX2 %0, %2, 4, 0, implicit $exec :: (store (s64) into `i64 addrspace(1)* undef`, addrspace 1)
FLAT_STORE_DWORD %0, %1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`)
GLOBAL_STORE_DWORDX2 %0, %2, 4, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) undef`, addrspace 1)
...
---
@ -235,8 +235,8 @@ body: |
%0:vreg_64_align2 = IMPLICIT_DEF
%1:vgpr_32 = IMPLICIT_DEF
%2:vreg_96_align2 = IMPLICIT_DEF
FLAT_STORE_DWORD %0, %1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`)
GLOBAL_STORE_DWORDX3 %0, %2, 4, 0, implicit $exec :: (store (s96) into `<3 x i32> addrspace(1)* undef`, addrspace 1)
FLAT_STORE_DWORD %0, %1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`)
GLOBAL_STORE_DWORDX3 %0, %2, 4, 0, implicit $exec :: (store (s96) into `ptr addrspace(1) undef`, addrspace 1)
...
---
@ -252,8 +252,8 @@ body: |
%0:vreg_64_align2 = IMPLICIT_DEF
%1:vgpr_32 = IMPLICIT_DEF
%2:vreg_64_align2 = IMPLICIT_DEF
GLOBAL_STORE_DWORD %0, %1, 8, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1)
FLAT_STORE_DWORDX2 %0, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into `i64* undef`)
GLOBAL_STORE_DWORD %0, %1, 8, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
FLAT_STORE_DWORDX2 %0, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into `ptr undef`)
...
---
@ -269,8 +269,8 @@ body: |
%0:vreg_64_align2 = IMPLICIT_DEF
%1:vgpr_32 = IMPLICIT_DEF
%2:vreg_96_align2 = IMPLICIT_DEF
GLOBAL_STORE_DWORD %0, %1, 12, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1)
FLAT_STORE_DWORDX3 %0, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s96) into `<3 x i32>* undef`)
GLOBAL_STORE_DWORD %0, %1, 12, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
FLAT_STORE_DWORDX3 %0, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s96) into `ptr undef`)
...
---
@ -288,8 +288,8 @@ body: |
%1:sreg_64_xexec = IMPLICIT_DEF
%2:vgpr_32 = IMPLICIT_DEF
%3:vgpr_32 = IMPLICIT_DEF
FLAT_STORE_DWORD %0, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`)
GLOBAL_STORE_DWORD_SADDR %0.sub0, %3, %1, 4, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1)
FLAT_STORE_DWORD %0, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`)
GLOBAL_STORE_DWORD_SADDR %0.sub0, %3, %1, 4, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
...
---
@ -307,6 +307,6 @@ body: |
%1:sreg_64_xexec = IMPLICIT_DEF
%2:vgpr_32 = IMPLICIT_DEF
%3:vgpr_32 = IMPLICIT_DEF
GLOBAL_STORE_DWORD_SADDR %0.sub0, %2, %1, 0, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1)
FLAT_STORE_DWORD %0, %3, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`)
GLOBAL_STORE_DWORD_SADDR %0.sub0, %2, %1, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
FLAT_STORE_DWORD %0, %3, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`)
...

View File

@ -13,8 +13,8 @@ body: |
; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[GLOBAL_LOAD_DWORDX2_]].sub1
; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]]
%0:vreg_64_align2 = IMPLICIT_DEF
%1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `float addrspace(1)* undef` + 4, basealign 4, addrspace 1)
%2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 8, basealign 4, addrspace 1)
%1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef` + 4, basealign 4, addrspace 1)
%2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef` + 8, basealign 4, addrspace 1)
S_NOP 0, implicit %1, implicit %2
...
@ -32,9 +32,9 @@ body: |
; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[COPY]].sub1
; GCN-NEXT: S_NOP 0, implicit [[COPY2]], implicit [[COPY3]], implicit [[COPY1]]
%0:vreg_64_align2 = IMPLICIT_DEF
%1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 1, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
%2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 1, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
%3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 1, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
%1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 1, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
%2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 1, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
%3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 1, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
S_NOP 0, implicit %1, implicit %2, implicit %3
...
@ -54,10 +54,10 @@ body: |
; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[COPY2]].sub1
; GCN-NEXT: S_NOP 0, implicit [[COPY4]], implicit [[COPY5]], implicit [[COPY3]], implicit [[COPY1]]
%0:vreg_64_align2 = IMPLICIT_DEF
%1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 2, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
%2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 2, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
%3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 2, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
%4:vgpr_32 = GLOBAL_LOAD_DWORD %0, 12, 2, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
%1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 2, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
%2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 2, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
%3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 2, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
%4:vgpr_32 = GLOBAL_LOAD_DWORD %0, 12, 2, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
S_NOP 0, implicit %1, implicit %2, implicit %3, implicit %4
...
@ -78,11 +78,11 @@ body: |
; GCN-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 16, 3, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, addrspace 1)
; GCN-NEXT: S_NOP 0, implicit [[COPY4]], implicit [[COPY5]], implicit [[COPY3]], implicit [[COPY1]], implicit [[GLOBAL_LOAD_DWORD]]
%0:vreg_64_align2 = IMPLICIT_DEF
%1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 3, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
%2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 3, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
%3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 3, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
%4:vgpr_32 = GLOBAL_LOAD_DWORD %0, 12, 3, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
%5:vgpr_32 = GLOBAL_LOAD_DWORD %0, 16, 3, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
%1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 3, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
%2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 3, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
%3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 3, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
%4:vgpr_32 = GLOBAL_LOAD_DWORD %0, 12, 3, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
%5:vgpr_32 = GLOBAL_LOAD_DWORD %0, 16, 3, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
S_NOP 0, implicit %1, implicit %2, implicit %3, implicit %4, implicit %5
...
@ -105,12 +105,12 @@ body: |
; GCN-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[GLOBAL_LOAD_DWORDX2_]].sub1
; GCN-NEXT: S_NOP 0, implicit [[COPY4]], implicit [[COPY5]], implicit [[COPY3]], implicit [[COPY1]], implicit [[COPY6]], implicit [[COPY7]]
%0:vreg_64_align2 = IMPLICIT_DEF
%1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
%2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
%3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
%4:vgpr_32 = GLOBAL_LOAD_DWORD %0, 12, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
%5:vgpr_32 = GLOBAL_LOAD_DWORD %0, 16, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
%6:vgpr_32 = GLOBAL_LOAD_DWORD %0, 20, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
%1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
%2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
%3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
%4:vgpr_32 = GLOBAL_LOAD_DWORD %0, 12, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
%5:vgpr_32 = GLOBAL_LOAD_DWORD %0, 16, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
%6:vgpr_32 = GLOBAL_LOAD_DWORD %0, 20, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
S_NOP 0, implicit %1, implicit %2, implicit %3, implicit %4, implicit %5, implicit %6
...
@ -126,8 +126,8 @@ body: |
; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY killed [[GLOBAL_LOAD_DWORDX4_]].sub2_sub3
; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]]
%0:vreg_64_align2 = IMPLICIT_DEF
%1:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 %0, 0, 0, implicit $exec :: (load (s64) from `i64 addrspace(1)* undef`, align 4, addrspace 1)
%2:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 %0, 8, 0, implicit $exec :: (load (s64) from `i64 addrspace(1)* undef`, align 4, addrspace 1)
%1:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 %0, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(1) undef`, align 4, addrspace 1)
%2:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 %0, 8, 0, implicit $exec :: (load (s64) from `ptr addrspace(1) undef`, align 4, addrspace 1)
S_NOP 0, implicit %1, implicit %2
...
@ -143,8 +143,8 @@ body: |
; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[GLOBAL_LOAD_DWORDX4_]].sub3
; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]]
%0:vreg_64_align2 = IMPLICIT_DEF
%1:vreg_96_align2 = GLOBAL_LOAD_DWORDX3 %0, 12, 0, implicit $exec :: (load (s96) from `i128 addrspace(1)* undef`, align 8, addrspace 1)
%2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 24, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
%1:vreg_96_align2 = GLOBAL_LOAD_DWORDX3 %0, 12, 0, implicit $exec :: (load (s96) from `ptr addrspace(1) undef`, align 8, addrspace 1)
%2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 24, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
S_NOP 0, implicit %1, implicit %2
...
@ -160,8 +160,8 @@ body: |
; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY killed [[GLOBAL_LOAD_DWORDX3_]].sub1_sub2
; GCN-NEXT: S_NOP 0, implicit [[COPY1]], implicit [[COPY]]
%0:vreg_64_align2 = IMPLICIT_DEF
%2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 12, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
%1:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 %0, 16, 0, implicit $exec :: (load (s64) from `i64 addrspace(1)* undef`, align 8, addrspace 1)
%2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 12, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
%1:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 %0, 16, 0, implicit $exec :: (load (s64) from `ptr addrspace(1) undef`, align 8, addrspace 1)
S_NOP 0, implicit %1, implicit %2
...
@ -176,8 +176,8 @@ body: |
; GCN-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:agpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 4, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, addrspace 1)
; GCN-NEXT: S_NOP 0, implicit [[GLOBAL_LOAD_DWORD]], implicit [[GLOBAL_LOAD_DWORD1]]
%0:vreg_64_align2 = IMPLICIT_DEF
%1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
%2:agpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
%1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
%2:agpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
S_NOP 0, implicit %1, implicit %2
...
@ -192,8 +192,8 @@ body: |
; GCN-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 8, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, addrspace 1)
; GCN-NEXT: S_NOP 0, implicit [[GLOBAL_LOAD_DWORD]], implicit [[GLOBAL_LOAD_DWORD1]]
%0:vreg_64_align2 = IMPLICIT_DEF
%1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
%2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
%1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
%2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
S_NOP 0, implicit %1, implicit %2
...
@ -208,8 +208,8 @@ body: |
; GCN-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 3, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, addrspace 1)
; GCN-NEXT: S_NOP 0, implicit [[GLOBAL_LOAD_DWORD]], implicit [[GLOBAL_LOAD_DWORD1]]
%0:vreg_64_align2 = IMPLICIT_DEF
%1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
%2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 3, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, addrspace 1)
%1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
%2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 3, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, addrspace 1)
S_NOP 0, implicit %1, implicit %2
...
@ -224,8 +224,8 @@ body: |
; GCN-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 4, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, addrspace 1)
; GCN-NEXT: S_NOP 0, implicit [[GLOBAL_LOAD_DWORD]], implicit [[GLOBAL_LOAD_DWORD1]]
%0:vreg_64_align2 = IMPLICIT_DEF
%1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 1, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
%2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
%1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 1, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
%2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
S_NOP 0, implicit %1, implicit %2
...
@ -243,8 +243,8 @@ body: |
; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]]
%0:sreg_64_xexec = IMPLICIT_DEF
%1:vgpr_32 = IMPLICIT_DEF
%2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
%3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
%2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
%3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 4, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
S_NOP 0, implicit %2, implicit %3
...
@ -264,9 +264,9 @@ body: |
; GCN-NEXT: S_NOP 0, implicit [[COPY2]], implicit [[COPY3]], implicit [[COPY1]]
%0:sreg_64_xexec = IMPLICIT_DEF
%1:vgpr_32 = IMPLICIT_DEF
%2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 1, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
%3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 4, 1, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
%4:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 8, 1, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
%2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 1, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
%3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 4, 1, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
%4:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 8, 1, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
S_NOP 0, implicit %2, implicit %3, implicit %4
...
@ -288,10 +288,10 @@ body: |
; GCN-NEXT: S_NOP 0, implicit [[COPY4]], implicit [[COPY5]], implicit [[COPY3]], implicit [[COPY1]]
%0:sreg_64_xexec = IMPLICIT_DEF
%1:vgpr_32 = IMPLICIT_DEF
%2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 2, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
%3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 4, 2, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
%4:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 8, 2, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
%5:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 12, 2, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
%2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 2, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
%3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 4, 2, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
%4:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 8, 2, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
%5:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 12, 2, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
S_NOP 0, implicit %2, implicit %3, implicit %4, implicit %5
...
@ -316,12 +316,12 @@ body: |
; GCN-NEXT: S_NOP 0, implicit [[COPY4]], implicit [[COPY5]], implicit [[COPY3]], implicit [[COPY1]], implicit [[COPY6]], implicit [[COPY7]]
%0:sreg_64_xexec = IMPLICIT_DEF
%1:vgpr_32 = IMPLICIT_DEF
%2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 4, 3, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
%3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 8, 3, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
%4:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 12, 3, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
%5:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 16, 3, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
%6:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 20, 3, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
%7:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 24, 3, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
%2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 4, 3, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
%3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 8, 3, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
%4:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 12, 3, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
%5:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 16, 3, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
%6:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 20, 3, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
%7:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 24, 3, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
S_NOP 0, implicit %2, implicit %3, implicit %4, implicit %5, implicit %6, implicit %7
...
@ -339,8 +339,8 @@ body: |
; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]]
%0:sreg_64_xexec = IMPLICIT_DEF
%1:vgpr_32 = IMPLICIT_DEF
%2:vreg_64_align2 = GLOBAL_LOAD_DWORDX2_SADDR %0, %1, 0, 0, implicit $exec :: (load (s64) from `i64 addrspace(1)* undef`, align 4, addrspace 1)
%3:vreg_64_align2 = GLOBAL_LOAD_DWORDX2_SADDR %0, %1, 8, 0, implicit $exec :: (load (s64) from `i64 addrspace(1)* undef`, align 4, addrspace 1)
%2:vreg_64_align2 = GLOBAL_LOAD_DWORDX2_SADDR %0, %1, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(1) undef`, align 4, addrspace 1)
%3:vreg_64_align2 = GLOBAL_LOAD_DWORDX2_SADDR %0, %1, 8, 0, implicit $exec :: (load (s64) from `ptr addrspace(1) undef`, align 4, addrspace 1)
S_NOP 0, implicit %2, implicit %3
...
@ -357,8 +357,8 @@ body: |
; GCN-NEXT: S_NOP 0, implicit [[GLOBAL_LOAD_DWORD]], implicit [[GLOBAL_LOAD_DWORD_SADDR]]
%0:sreg_64_xexec = IMPLICIT_DEF
%1:vreg_64_align2 = IMPLICIT_DEF
%2:vgpr_32 = GLOBAL_LOAD_DWORD %1, 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
%3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1.sub0, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
%2:vgpr_32 = GLOBAL_LOAD_DWORD %1, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
%3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1.sub0, 4, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
S_NOP 0, implicit %2, implicit %3
...
@ -375,8 +375,8 @@ body: |
; GCN-NEXT: S_NOP 0, implicit [[GLOBAL_LOAD_DWORD_SADDR]], implicit [[GLOBAL_LOAD_DWORD_SADDR1]]
%0:sgpr_128 = IMPLICIT_DEF
%1:vgpr_32 = IMPLICIT_DEF
%2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0.sub0_sub1, %1, 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
%3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0.sub2_sub3, %1, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
%2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0.sub0_sub1, %1, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
%3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0.sub2_sub3, %1, 4, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
S_NOP 0, implicit %2, implicit %3
...
@ -393,8 +393,8 @@ body: |
; GCN-NEXT: S_NOP 0, implicit [[GLOBAL_LOAD_DWORD_SADDR]], implicit [[GLOBAL_LOAD_DWORD_SADDR1]]
%0:sreg_64_xexec = IMPLICIT_DEF
%1:vreg_64_align2 = IMPLICIT_DEF
%2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1.sub0, 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
%3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1.sub1, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
%2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1.sub0, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
%3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1.sub1, 4, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
S_NOP 0, implicit %2, implicit %3
...
@ -409,8 +409,8 @@ body: |
; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[GLOBAL_LOAD_DWORDX2_]].sub0
; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]]
%0:vreg_64_align2 = IMPLICIT_DEF
%1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 4, basealign 8, addrspace 1)
%2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `float addrspace(1)* undef`, align 4, addrspace 1)
%1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef` + 4, basealign 8, addrspace 1)
%2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
S_NOP 0, implicit %1, implicit %2
...
@ -428,9 +428,9 @@ body: |
; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[COPY]].sub0
; GCN-NEXT: S_NOP 0, implicit [[COPY2]], implicit [[COPY3]]
%0:vreg_64_align2 = IMPLICIT_DEF
%1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 4, align 4, addrspace 1)
%2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `float addrspace(1)* undef`, align 16, addrspace 1)
%3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 8, align 8, addrspace 1)
%1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef` + 4, align 4, addrspace 1)
%2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 16, addrspace 1)
%3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef` + 8, align 8, addrspace 1)
S_NOP 0, implicit %1, implicit %2
...
@ -449,8 +449,8 @@ body: |
%0:vreg_64_align2 = IMPLICIT_DEF
%1:vgpr_32 = IMPLICIT_DEF
%2:vgpr_32 = IMPLICIT_DEF
GLOBAL_STORE_DWORD %0, killed %1, 0, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD killed %0, killed %2, 4, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD %0, killed %1, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD killed %0, killed %2, 4, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
...
---
@ -470,9 +470,9 @@ body: |
%1:vgpr_32 = IMPLICIT_DEF
%2:vgpr_32 = IMPLICIT_DEF
%3:vgpr_32 = IMPLICIT_DEF
GLOBAL_STORE_DWORD %0, killed %1, 4, 1, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD %0, killed %2, 8, 1, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD killed %0, killed %3, 12, 1, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD %0, killed %1, 4, 1, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD %0, killed %2, 8, 1, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD killed %0, killed %3, 12, 1, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
...
---
@ -489,10 +489,10 @@ body: |
; GCN-NEXT: GLOBAL_STORE_DWORDX4 [[DEF]], killed [[REG_SEQUENCE2]], 4, 2, implicit $exec :: (store (s128) into `ptr addrspace(1) undef`, align 4, addrspace 1)
%0:vreg_64_align2 = IMPLICIT_DEF
%1:vreg_128 = IMPLICIT_DEF
GLOBAL_STORE_DWORD %0, %1.sub1, 8, 2, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD %0, %1.sub2, 12, 2, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD %0, %1.sub3, 16, 2, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD killed %0, %1.sub0, 4, 2, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD %0, %1.sub1, 8, 2, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD %0, %1.sub2, 12, 2, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD %0, %1.sub3, 16, 2, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD killed %0, %1.sub0, 4, 2, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
...
---
@ -518,11 +518,11 @@ body: |
%3:agpr_32 = IMPLICIT_DEF
%4:agpr_32 = IMPLICIT_DEF
%5:agpr_32 = IMPLICIT_DEF
GLOBAL_STORE_DWORD %0, %1, 4, 3, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD %0, %2, 8, 3, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 8, addrspace 1)
GLOBAL_STORE_DWORD %0, %3, 12, 3, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD %0, %4, 16, 3, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD %0, %5, 20, 3, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD %0, %1, 4, 3, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD %0, %2, 8, 3, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 8, addrspace 1)
GLOBAL_STORE_DWORD %0, %3, 12, 3, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD %0, %4, 16, 3, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD %0, %5, 20, 3, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
...
---
@ -551,12 +551,12 @@ body: |
%4:vgpr_32 = IMPLICIT_DEF
%5:vgpr_32 = IMPLICIT_DEF
%6:vgpr_32 = IMPLICIT_DEF
GLOBAL_STORE_DWORD %0, %1, 4, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 8, addrspace 1)
GLOBAL_STORE_DWORD %0, %2, 8, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD %0, %3, 12, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD %0, %4, 16, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD %0, %5, 20, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD %0, %6, 24, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD %0, %1, 4, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 8, addrspace 1)
GLOBAL_STORE_DWORD %0, %2, 8, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD %0, %3, 12, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD %0, %4, 16, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD %0, %5, 20, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD %0, %6, 24, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
...
---
@ -573,8 +573,8 @@ body: |
%0:vreg_64_align2 = IMPLICIT_DEF
%1:vreg_64_align2 = IMPLICIT_DEF
%2:vreg_64_align2 = IMPLICIT_DEF
GLOBAL_STORE_DWORDX2 %0, killed %1, 4, 0, implicit $exec :: (store (s64) into `i64 addrspace(1)* undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORDX2 %0, killed %2, 12, 0, implicit $exec :: (store (s64) into `i64 addrspace(1)* undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORDX2 %0, killed %1, 4, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORDX2 %0, killed %2, 12, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) undef`, align 4, addrspace 1)
...
---
@ -591,8 +591,8 @@ body: |
%0:vreg_64_align2 = IMPLICIT_DEF
%1:vreg_96_align2 = IMPLICIT_DEF
%2:vgpr_32 = IMPLICIT_DEF
GLOBAL_STORE_DWORDX3 %0, killed %1, 4, 0, implicit $exec :: (store (s96) into `i64 addrspace(1)* undef`, align 16, addrspace 1)
GLOBAL_STORE_DWORD %0, killed %2, 16, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORDX3 %0, killed %1, 4, 0, implicit $exec :: (store (s96) into `ptr addrspace(1) undef`, align 16, addrspace 1)
GLOBAL_STORE_DWORD %0, killed %2, 16, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
...
---
@ -609,8 +609,8 @@ body: |
%0:vreg_64_align2 = IMPLICIT_DEF
%1:agpr_32 = IMPLICIT_DEF
%2:vgpr_32 = IMPLICIT_DEF
GLOBAL_STORE_DWORD %0, killed %1, 0, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD killed %0, killed %2, 4, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD %0, killed %1, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD killed %0, killed %2, 4, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
...
---
@ -627,8 +627,8 @@ body: |
%0:vreg_64_align2 = IMPLICIT_DEF
%1:vgpr_32 = IMPLICIT_DEF
%2:vgpr_32 = IMPLICIT_DEF
GLOBAL_STORE_DWORD %0, killed %1, 0, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD killed %0, killed %2, 6, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD %0, killed %1, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD killed %0, killed %2, 6, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
...
---
@ -645,8 +645,8 @@ body: |
%0:vreg_64_align2 = IMPLICIT_DEF
%1:vgpr_32 = IMPLICIT_DEF
%2:vgpr_32 = IMPLICIT_DEF
GLOBAL_STORE_DWORD %0, killed %1, 0, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD killed %0, killed %2, 2, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 2, addrspace 1)
GLOBAL_STORE_DWORD %0, killed %1, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD killed %0, killed %2, 2, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 2, addrspace 1)
...
---
@ -663,8 +663,8 @@ body: |
%0:vreg_64_align2 = IMPLICIT_DEF
%1:vgpr_32 = IMPLICIT_DEF
%2:vgpr_32 = IMPLICIT_DEF
GLOBAL_STORE_DWORD %0, killed %1, 0, 1, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD killed %0, killed %2, 4, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD %0, killed %1, 0, 1, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD killed %0, killed %2, 4, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
...
---
@ -681,8 +681,8 @@ body: |
%0:vreg_128_align2 = IMPLICIT_DEF
%1:vgpr_32 = IMPLICIT_DEF
%2:vgpr_32 = IMPLICIT_DEF
GLOBAL_STORE_DWORD %0.sub0_sub1, killed %1, 0, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD %0.sub2_sub3, killed %2, 4, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD %0.sub0_sub1, killed %1, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD %0.sub2_sub3, killed %2, 4, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
...
---
@ -701,8 +701,8 @@ body: |
%1:vgpr_32 = IMPLICIT_DEF
%2:vgpr_32 = IMPLICIT_DEF
%3:vgpr_32 = IMPLICIT_DEF
GLOBAL_STORE_DWORD_SADDR %1, %2, %0, 0, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD_SADDR %1, %3, %0, 4, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD_SADDR %1, %2, %0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD_SADDR %1, %3, %0, 4, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
...
---
@ -724,9 +724,9 @@ body: |
%2:vgpr_32 = IMPLICIT_DEF
%3:vgpr_32 = IMPLICIT_DEF
%4:vgpr_32 = IMPLICIT_DEF
GLOBAL_STORE_DWORD_SADDR %1, %2, %0, 4, 1, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD_SADDR %1, %3, %0, 8, 1, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 12, 1, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD_SADDR %1, %2, %0, 4, 1, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD_SADDR %1, %3, %0, 8, 1, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 12, 1, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
...
---
@ -751,10 +751,10 @@ body: |
%3:vgpr_32 = IMPLICIT_DEF
%4:vgpr_32 = IMPLICIT_DEF
%5:vgpr_32 = IMPLICIT_DEF
GLOBAL_STORE_DWORD_SADDR %1, %2, %0, 4, 2, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD_SADDR %1, %3, %0, 8, 2, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 12, 2, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD_SADDR %1, %5, %0, 16, 2, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD_SADDR %1, %2, %0, 4, 2, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD_SADDR %1, %3, %0, 8, 2, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 12, 2, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD_SADDR %1, %5, %0, 16, 2, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
...
---
@ -785,12 +785,12 @@ body: |
%5:vgpr_32 = IMPLICIT_DEF
%6:vgpr_32 = IMPLICIT_DEF
%7:vgpr_32 = IMPLICIT_DEF
GLOBAL_STORE_DWORD_SADDR %1, %2, %0, 4, 3, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD_SADDR %1, %3, %0, 8, 3, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 12, 3, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD_SADDR %1, %5, %0, 16, 3, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD_SADDR %1, %6, %0, 20, 3, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD_SADDR %1, %7, %0, 24, 3, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD_SADDR %1, %2, %0, 4, 3, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD_SADDR %1, %3, %0, 8, 3, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 12, 3, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD_SADDR %1, %5, %0, 16, 3, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD_SADDR %1, %6, %0, 20, 3, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD_SADDR %1, %7, %0, 24, 3, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
...
---
@ -809,8 +809,8 @@ body: |
%1:vreg_64_align2 = IMPLICIT_DEF
%2:vgpr_32 = IMPLICIT_DEF
%3:vgpr_32 = IMPLICIT_DEF
GLOBAL_STORE_DWORD_SADDR %1.sub0, %2, %0, 0, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD %1, %3, 4, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD_SADDR %1.sub0, %2, %0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD %1, %3, 4, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
...
---
@ -829,8 +829,8 @@ body: |
%1:vreg_64_align2 = IMPLICIT_DEF
%2:vgpr_32 = IMPLICIT_DEF
%3:vgpr_32 = IMPLICIT_DEF
GLOBAL_STORE_DWORD_SADDR %1.sub0, %2, %0, 0, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD_SADDR %1.sub1, %3, %0, 4, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD_SADDR %1.sub0, %2, %0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD_SADDR %1.sub1, %3, %0, 4, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
...
---
@ -849,6 +849,6 @@ body: |
%1:vgpr_32 = IMPLICIT_DEF
%2:vgpr_32 = IMPLICIT_DEF
%3:vgpr_32 = IMPLICIT_DEF
GLOBAL_STORE_DWORD_SADDR %1, %2, %0.sub0_sub1, 0, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD_SADDR %1, %3, %0.sub2_sub3, 4, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD_SADDR %1, %2, %0.sub0_sub1, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
GLOBAL_STORE_DWORD_SADDR %1, %3, %0.sub2_sub3, 4, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
...

View File

@ -29,10 +29,10 @@
ret void
bb2:
%tmp = getelementptr inbounds [256 x float], [256 x float] addrspace(3)* @0, i32 0, i32 0
%tmp1 = getelementptr inbounds float, float addrspace(3)* %tmp, i32 8
%tmp2 = getelementptr inbounds float, float addrspace(3)* %tmp, i32 16
%tmp3 = getelementptr inbounds float, float addrspace(3)* %tmp, i32 24
%tmp = getelementptr inbounds [256 x float], ptr addrspace(3) @0, i32 0, i32 0
%tmp1 = getelementptr inbounds float, ptr addrspace(3) %tmp, i32 8
%tmp2 = getelementptr inbounds float, ptr addrspace(3) %tmp, i32 16
%tmp3 = getelementptr inbounds float, ptr addrspace(3) %tmp, i32 24
br label %bb1
}
@ -44,10 +44,10 @@
ret void
bb2:
%tmp = getelementptr inbounds [256 x float], [256 x float] addrspace(3)* @0, i32 0, i32 0
%tmp1 = getelementptr inbounds float, float addrspace(3)* %tmp, i32 8
%tmp2 = getelementptr inbounds float, float addrspace(3)* %tmp, i32 16
%tmp3 = getelementptr inbounds float, float addrspace(3)* %tmp, i32 24
%tmp = getelementptr inbounds [256 x float], ptr addrspace(3) @0, i32 0, i32 0
%tmp1 = getelementptr inbounds float, ptr addrspace(3) %tmp, i32 8
%tmp2 = getelementptr inbounds float, ptr addrspace(3) %tmp, i32 16
%tmp3 = getelementptr inbounds float, ptr addrspace(3) %tmp, i32 24
br label %bb1
}
@ -59,10 +59,10 @@
ret void
bb2:
%tmp = getelementptr inbounds [256 x float], [256 x float] addrspace(3)* @0, i32 0, i32 0
%tmp1 = getelementptr inbounds float, float addrspace(3)* %tmp, i32 8
%tmp2 = getelementptr inbounds float, float addrspace(3)* %tmp, i32 16
%tmp3 = getelementptr inbounds float, float addrspace(3)* %tmp, i32 24
%tmp = getelementptr inbounds [256 x float], ptr addrspace(3) @0, i32 0, i32 0
%tmp1 = getelementptr inbounds float, ptr addrspace(3) %tmp, i32 8
%tmp2 = getelementptr inbounds float, ptr addrspace(3) %tmp, i32 16
%tmp3 = getelementptr inbounds float, ptr addrspace(3) %tmp, i32 24
br label %bb1
}
---

View File

@ -508,7 +508,7 @@ define amdgpu_ps void @v_omod_mul2_med3(float %x, float %y, float %z) #0 {
; GFX12-NEXT: s_endpgm
%fmed3 = call float @llvm.amdgcn.fmed3.f32(float %x, float %y, float %z)
%div2 = fmul float %fmed3, 2.0
store float %div2, float addrspace(1)* undef
store float %div2, ptr addrspace(1) undef
ret void
}

View File

@ -54,8 +54,8 @@ define amdgpu_kernel void @poison_interposable_initializer_gv(i32 %n) {
define amdgpu_kernel void @not_constant_gv(i32 %n) {
entry:
%str = alloca [9 x i8], align 1, addrspace(5)
%arraydecay = getelementptr inbounds [9 x i8], [9 x i8] addrspace(5)* %str, i32 0, i32 0
%call1 = call i32 (i8 addrspace(4)*, ...) @printf(i8 addrspace(4)* getelementptr inbounds ([6 x i8], [6 x i8] addrspace(4)* @not.constant, i32 0, i32 0), i8 addrspace(5)* %arraydecay, i32 %n)
%arraydecay = getelementptr inbounds [9 x i8], ptr addrspace(5) %str, i32 0, i32 0
%call1 = call i32 (ptr addrspace(4), ...) @printf(ptr addrspace(4) @not.constant, ptr addrspace(5) %arraydecay, i32 %n)
ret void
}

View File

@ -752,8 +752,8 @@ define amdgpu_kernel void @test_kernel_addrspacecasted_format_str(i32 %n) {
;
entry:
%str = alloca [9 x i8], align 1, addrspace(5)
%arraydecay = getelementptr inbounds [9 x i8], [9 x i8] addrspace(5)* %str, i32 0, i32 0
%call1 = call i32 (ptr addrspace(4), ...) @printf(ptr addrspace(4) addrspacecast (i8 addrspace(1)* getelementptr inbounds ([6 x i8], ptr addrspace(1) @str.as1, i32 0, i32 0) to ptr addrspace(4)), ptr addrspace(5) %arraydecay, i32 %n)
%arraydecay = getelementptr inbounds [9 x i8], ptr addrspace(5) %str, i32 0, i32 0
%call1 = call i32 (ptr addrspace(4), ...) @printf(ptr addrspace(4) addrspacecast (ptr addrspace(1) @str.as1 to ptr addrspace(4)), ptr addrspace(5) %arraydecay, i32 %n)
ret void
}

View File

@ -133,7 +133,7 @@ define amdgpu_vs void @promote_memmove_aggr() #0 {
store float 1.0, ptr addrspace(5) %foo1
%foo2 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 3
store float 2.0, ptr addrspace(5) %foo2
call void @llvm.memmove.p5i8.p5i8.i32(ptr addrspace(5) align 4 %f1, ptr addrspace(5) align 4 %foo1, i32 16, i1 false)
call void @llvm.memmove.p5.p5.i32(ptr addrspace(5) align 4 %f1, ptr addrspace(5) align 4 %foo1, i32 16, i1 false)
%foo3 = load float, ptr addrspace(5) %f1
store float %foo3, ptr addrspace(1) @pv
ret void
@ -160,7 +160,7 @@ define amdgpu_vs void @promote_memcpy_aggr() #0 {
%foo5 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 %foo4
store float 3.0, ptr addrspace(5) %foo5
call void @llvm.memcpy.p5i8.p5i8.i32(ptr addrspace(5) align 4 %f1, ptr addrspace(5) align 4 %foo2, i32 8, i1 false)
call void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) align 4 %f1, ptr addrspace(5) align 4 %foo2, i32 8, i1 false)
%foo6 = load float, ptr addrspace(5) %f1
store float %foo6, ptr addrspace(1) @pv
ret void
@ -177,7 +177,7 @@ define amdgpu_vs void @promote_memcpy_identity_aggr() #0 {
store float 1.0, ptr addrspace(5) %foo1
%foo2 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 3
store float 2.0, ptr addrspace(5) %foo2
call void @llvm.memcpy.p5i8.p5i8.i32(ptr addrspace(5) align 4 %f1, ptr addrspace(5) align 4 %f1, i32 20, i1 false)
call void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) align 4 %f1, ptr addrspace(5) align 4 %f1, i32 20, i1 false)
%foo3 = load float, ptr addrspace(5) %f1
store float %foo3, ptr addrspace(1) @pv
ret void
@ -229,7 +229,7 @@ define amdgpu_vs void @promote_memcpy_two_aggrs() #0 {
%foo5 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 %foo4
store float 3.0, ptr addrspace(5) %foo5
call void @llvm.memcpy.p5i8.p5i8.i32(ptr addrspace(5) align 4 %f2, ptr addrspace(5) align 4 %f1, i32 8, i1 false)
call void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) align 4 %f2, ptr addrspace(5) align 4 %f1, i32 8, i1 false)
%foo6 = getelementptr [5 x float], ptr addrspace(5) %f2, i32 0, i32 %foo4
%foo7 = load float, ptr addrspace(5) %foo6
@ -266,7 +266,7 @@ define amdgpu_vs void @promote_memcpy_p1p5_aggr(ptr addrspace(1) inreg %src) #0
%foo5 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 %foo4
store float 3.0, ptr addrspace(5) %foo5
call void @llvm.memcpy.p1i8.p5i8.i32(ptr addrspace(1) align 4 @pv, ptr addrspace(5) align 4 %f1, i32 8, i1 false)
call void @llvm.memcpy.p1.p5.i32(ptr addrspace(1) align 4 @pv, ptr addrspace(5) align 4 %f1, i32 8, i1 false)
ret void
}
@ -289,16 +289,16 @@ define amdgpu_vs void @promote_memcpy_inline_aggr() #0 {
%foo5 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 %foo4
store float 3.0, ptr addrspace(5) %foo5
call void @llvm.memcpy.inline.p5i8.p5i8.i32(ptr addrspace(5) align 4 %f1, ptr addrspace(5) align 4 %foo2, i32 8, i1 false)
call void @llvm.memcpy.inline.p5.p5.i32(ptr addrspace(5) align 4 %f1, ptr addrspace(5) align 4 %foo2, i32 8, i1 false)
%foo6 = load float, ptr addrspace(5) %f1
store float %foo6, ptr addrspace(1) @pv
ret void
}
declare void @llvm.memcpy.p5i8.p5i8.i32(ptr addrspace(5) nocapture writeonly, ptr addrspace(5) nocapture readonly, i32, i1 immarg)
declare void @llvm.memcpy.p1i8.p5i8.i32(ptr addrspace(1) nocapture writeonly, ptr addrspace(5) nocapture readonly, i32, i1 immarg)
declare void @llvm.memcpy.inline.p5i8.p5i8.i32(ptr addrspace(5) nocapture writeonly, ptr addrspace(5) nocapture readonly, i32, i1 immarg)
declare void @llvm.memmove.p5i8.p5i8.i32(ptr addrspace(5) nocapture writeonly, ptr addrspace(5) nocapture readonly, i32, i1 immarg)
declare void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) nocapture writeonly, ptr addrspace(5) nocapture readonly, i32, i1 immarg)
declare void @llvm.memcpy.p1.p5.i32(ptr addrspace(1) nocapture writeonly, ptr addrspace(5) nocapture readonly, i32, i1 immarg)
declare void @llvm.memcpy.inline.p5.p5.i32(ptr addrspace(5) nocapture writeonly, ptr addrspace(5) nocapture readonly, i32, i1 immarg)
declare void @llvm.memmove.p5.p5.i32(ptr addrspace(5) nocapture writeonly, ptr addrspace(5) nocapture readonly, i32, i1 immarg)
@tmp_g = external addrspace(1) global { [4 x double], <2 x double>, <3 x double>, <4 x double> }
@frag_color = external addrspace(1) global <4 x float>

View File

@ -11,15 +11,15 @@
@sched_dbg_value_crash.tmp6 = internal unnamed_addr addrspace(3) global [256 x [16 x i8]] undef, align 16
define amdgpu_kernel void @sched_dbg_value_crash(i8 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture readonly %arg1, %struct.widget.0 addrspace(1)* nocapture readonly %arg2, %struct.baz addrspace(1)* nocapture readonly %arg3, %struct.snork addrspace(1)* nocapture %arg4) local_unnamed_addr #2 {
define amdgpu_kernel void @sched_dbg_value_crash(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture readonly %arg1, ptr addrspace(1) nocapture readonly %arg2, ptr addrspace(1) nocapture readonly %arg3, ptr addrspace(1) nocapture %arg4) local_unnamed_addr #2 {
bb:
%0 = getelementptr i32, i32 addrspace(1)* %arg1, i64 0, !amdgpu.uniform !3, !amdgpu.noclobber !3
%0 = getelementptr i32, ptr addrspace(1) %arg1, i64 0, !amdgpu.uniform !3, !amdgpu.noclobber !3
%tmp5 = alloca %struct.wombat, align 16, addrspace(5)
%1 = call noalias nonnull dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
%2 = bitcast i8 addrspace(4)* %1 to i32 addrspace(4)*
%3 = getelementptr inbounds i32, i32 addrspace(4)* %2, i64 1
%4 = bitcast i32 addrspace(4)* %3 to <2 x i32> addrspace(4)*, !amdgpu.uniform !3, !amdgpu.noclobber !3
%5 = load <2 x i32>, <2 x i32> addrspace(4)* %4, align 4, !invariant.load !3
%1 = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
%2 = bitcast ptr addrspace(4) %1 to ptr addrspace(4)
%3 = getelementptr inbounds i32, ptr addrspace(4) %2, i64 1
%4 = bitcast ptr addrspace(4) %3 to ptr addrspace(4), !amdgpu.uniform !3, !amdgpu.noclobber !3
%5 = load <2 x i32>, ptr addrspace(4) %4, align 4, !invariant.load !3
%6 = extractelement <2 x i32> %5, i32 0
%7 = extractelement <2 x i32> %5, i32 1
%8 = lshr i32 %6, 16
@ -31,69 +31,69 @@
%14 = mul nuw nsw i32 %10, %7
%15 = add i32 %13, %14
%16 = add i32 %15, %11
%17 = getelementptr inbounds [256 x [16 x i8]], [256 x [16 x i8]] addrspace(3)* @sched_dbg_value_crash.tmp6, i32 0, i32 %16
%tmp7 = load i64, i64 addrspace(4)* null, align 536870912
%17 = getelementptr inbounds [256 x [16 x i8]], ptr addrspace(3) @sched_dbg_value_crash.tmp6, i32 0, i32 %16
%tmp7 = load i64, ptr addrspace(4) null, align 536870912
%tmp8 = tail call i32 @llvm.amdgcn.workitem.id.x() #3, !range !4
%tmp9 = zext i32 %tmp8 to i64
%tmp10 = add i64 %tmp7, %tmp9
%tmp11 = shl i64 %tmp10, 32
%tmp12 = ashr exact i64 %tmp11, 32
%tmp13 = getelementptr inbounds %struct.widget.0, %struct.widget.0 addrspace(1)* %arg2, i64 %tmp12, i32 1
%tmp14 = load i32, i32 addrspace(1)* %tmp13, align 4
%tmp15 = getelementptr inbounds %struct.baz, %struct.baz addrspace(1)* %arg3, i64 %tmp12, i32 1
%tmp16 = load <4 x float>, <4 x float> addrspace(1)* %tmp15, align 16
%tmp13 = getelementptr inbounds %struct.widget.0, ptr addrspace(1) %arg2, i64 %tmp12, i32 1
%tmp14 = load i32, ptr addrspace(1) %tmp13, align 4
%tmp15 = getelementptr inbounds %struct.baz, ptr addrspace(1) %arg3, i64 %tmp12, i32 1
%tmp16 = load <4 x float>, ptr addrspace(1) %tmp15, align 16
%tmp17 = sext i32 %tmp14 to i64
%tmp18 = load i32, i32 addrspace(1)* %0, align 4
%tmp18 = load i32, ptr addrspace(1) %0, align 4
%tmp19 = zext i32 %tmp18 to i64
%tmp20 = shl nuw nsw i64 %tmp19, 2
%tmp21 = getelementptr inbounds i8, i8 addrspace(1)* %arg, i64 %tmp20
%tmp22 = bitcast i8 addrspace(1)* %tmp21 to %struct.wombat.1 addrspace(1)*
%tmp23 = bitcast %struct.wombat addrspace(5)* %tmp5 to i8 addrspace(5)*
call void @llvm.lifetime.start.p5i8(i64 144, i8 addrspace(5)* nonnull %tmp23) #3
%tmp24 = getelementptr inbounds %struct.wombat, %struct.wombat addrspace(5)* %tmp5, i32 0, i32 6
%tmp25 = getelementptr i32, i32 addrspace(1)* %arg1, i64 3, !amdgpu.uniform !3, !amdgpu.noclobber !3
%tmp26 = load i32, i32 addrspace(1)* %tmp25, align 4
%tmp21 = getelementptr inbounds i8, ptr addrspace(1) %arg, i64 %tmp20
%tmp22 = bitcast ptr addrspace(1) %tmp21 to ptr addrspace(1)
%tmp23 = bitcast ptr addrspace(5) %tmp5 to ptr addrspace(5)
call void @llvm.lifetime.start.p5(i64 144, ptr addrspace(5) nonnull %tmp23) #3
%tmp24 = getelementptr inbounds %struct.wombat, ptr addrspace(5) %tmp5, i32 0, i32 6
%tmp25 = getelementptr i32, ptr addrspace(1) %arg1, i64 3, !amdgpu.uniform !3, !amdgpu.noclobber !3
%tmp26 = load i32, ptr addrspace(1) %tmp25, align 4
%tmp27 = zext i32 %tmp26 to i64
%tmp28 = shl nuw nsw i64 %tmp27, 2
%tmp29 = getelementptr inbounds i8, i8 addrspace(1)* %arg, i64 %tmp28
%tmp30 = bitcast i8 addrspace(1)* %tmp29 to <2 x float> addrspace(1)*
%tmp31 = getelementptr inbounds %struct.wombat.1, %struct.wombat.1 addrspace(1)* %tmp22, i64 %tmp17, i32 2, i64 0
%18 = bitcast i32 addrspace(1)* %tmp31 to <3 x i32> addrspace(1)*
%19 = load <3 x i32>, <3 x i32> addrspace(1)* %18, align 4
%tmp29 = getelementptr inbounds i8, ptr addrspace(1) %arg, i64 %tmp28
%tmp30 = bitcast ptr addrspace(1) %tmp29 to ptr addrspace(1)
%tmp31 = getelementptr inbounds %struct.wombat.1, ptr addrspace(1) %tmp22, i64 %tmp17, i32 2, i64 0
%18 = bitcast ptr addrspace(1) %tmp31 to ptr addrspace(1)
%19 = load <3 x i32>, ptr addrspace(1) %18, align 4
%tmp325 = extractelement <3 x i32> %19, i32 0
%tmp386 = extractelement <3 x i32> %19, i32 1
%tmp447 = extractelement <3 x i32> %19, i32 2
%tmp33 = sext i32 %tmp325 to i64
%tmp34 = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %tmp30, i64 %tmp33
%tmp35 = load <2 x float>, <2 x float> addrspace(1)* %tmp34, align 8
%tmp34 = getelementptr inbounds <2 x float>, ptr addrspace(1) %tmp30, i64 %tmp33
%tmp35 = load <2 x float>, ptr addrspace(1) %tmp34, align 8
%tmp36 = extractelement <2 x float> %tmp35, i32 1
%tmp39 = sext i32 %tmp386 to i64
%tmp40 = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %tmp30, i64 %tmp39
%tmp41 = load <2 x float>, <2 x float> addrspace(1)* %tmp40, align 8
%tmp40 = getelementptr inbounds <2 x float>, ptr addrspace(1) %tmp30, i64 %tmp39
%tmp41 = load <2 x float>, ptr addrspace(1) %tmp40, align 8
%tmp42 = extractelement <2 x float> %tmp41, i32 1
%tmp45 = sext i32 %tmp447 to i64
%tmp46 = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %tmp30, i64 %tmp45
%tmp47 = load <2 x float>, <2 x float> addrspace(1)* %tmp46, align 8
%tmp46 = getelementptr inbounds <2 x float>, ptr addrspace(1) %tmp30, i64 %tmp45
%tmp47 = load <2 x float>, ptr addrspace(1) %tmp46, align 8
%tmp48 = extractelement <2 x float> %tmp47, i32 1
%tmp49 = getelementptr i32, i32 addrspace(1)* %arg1, i64 1, !amdgpu.uniform !3, !amdgpu.noclobber !3
%tmp50 = load i32, i32 addrspace(1)* %tmp49, align 4
%tmp49 = getelementptr i32, ptr addrspace(1) %arg1, i64 1, !amdgpu.uniform !3, !amdgpu.noclobber !3
%tmp50 = load i32, ptr addrspace(1) %tmp49, align 4
%tmp51 = zext i32 %tmp50 to i64
%tmp52 = shl nuw nsw i64 %tmp51, 2
%tmp53 = getelementptr inbounds i8, i8 addrspace(1)* %arg, i64 %tmp52
%tmp54 = bitcast i8 addrspace(1)* %tmp53 to <4 x float> addrspace(1)*
%tmp55 = getelementptr inbounds %struct.wombat.1, %struct.wombat.1 addrspace(1)* %tmp22, i64 %tmp17, i32 0, i64 0
%20 = bitcast i32 addrspace(1)* %tmp55 to <2 x i32> addrspace(1)*
%21 = load <2 x i32>, <2 x i32> addrspace(1)* %20, align 4
%tmp53 = getelementptr inbounds i8, ptr addrspace(1) %arg, i64 %tmp52
%tmp54 = bitcast ptr addrspace(1) %tmp53 to ptr addrspace(1)
%tmp55 = getelementptr inbounds %struct.wombat.1, ptr addrspace(1) %tmp22, i64 %tmp17, i32 0, i64 0
%20 = bitcast ptr addrspace(1) %tmp55 to ptr addrspace(1)
%21 = load <2 x i32>, ptr addrspace(1) %20, align 4
%tmp568 = extractelement <2 x i32> %21, i32 0
%tmp639 = extractelement <2 x i32> %21, i32 1
%tmp57 = sext i32 %tmp568 to i64
%tmp58 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %tmp54, i64 %tmp57
%tmp59 = load <4 x float>, <4 x float> addrspace(1)* %tmp58, align 16
%tmp58 = getelementptr inbounds <4 x float>, ptr addrspace(1) %tmp54, i64 %tmp57
%tmp59 = load <4 x float>, ptr addrspace(1) %tmp58, align 16
%tmp60 = extractelement <4 x float> %tmp59, i32 0
%tmp61 = extractelement <4 x float> %tmp59, i32 1
%tmp64 = sext i32 %tmp639 to i64
%tmp65 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %tmp54, i64 %tmp64
%tmp66 = load <4 x float>, <4 x float> addrspace(1)* %tmp65, align 16
%tmp65 = getelementptr inbounds <4 x float>, ptr addrspace(1) %tmp54, i64 %tmp64
%tmp66 = load <4 x float>, ptr addrspace(1) %tmp65, align 16
%tmp67 = extractelement <4 x float> %tmp16, i64 0
%tmp69 = fsub fast float -0.000000e+00, %tmp67
%tmp70 = fmul float %tmp67, 0.000000e+00
@ -103,7 +103,7 @@
%tmp74 = insertelement <4 x float> <float undef, float undef, float undef, float 0.000000e+00>, float %tmp69, i32 0
%tmp75 = insertelement <4 x float> %tmp74, float %tmp71, i32 1
%tmp76 = insertelement <4 x float> %tmp75, float %tmp73, i32 2
store <4 x float> %tmp76, <4 x float> addrspace(5)* %tmp24, align 16
store <4 x float> %tmp76, ptr addrspace(5) %tmp24, align 16
%tmp77 = fsub float undef, %tmp60
%tmp78 = fsub float undef, %tmp61
%tmp79 = extractelement <4 x float> %tmp66, i32 2
@ -128,27 +128,27 @@
%fadd = fadd <2 x float> %fmul, undef
%extractelement = extractelement <2 x float> %fadd, i64 1
%tmp96 = fsub float %extractelement, %tmp95
%tmp97 = getelementptr inbounds %struct.wombat, %struct.wombat addrspace(5)* %tmp5, i32 0, i32 8, i32 1
call void @func(float %tmp96, i64 0, i16 addrspace(5)* nonnull %tmp97) #3
%tmp984 = bitcast [16 x i8] addrspace(3)* %17 to i8 addrspace(3)*
%tmp99 = getelementptr inbounds %struct.snork, %struct.snork addrspace(1)* %arg4, i64 %tmp12, i32 8, i32 1, i64 0
call void @llvm.memcpy.p1i8.p3i8.i64(i8 addrspace(1)* %tmp99, i8 addrspace(3)* %tmp984, i64 16, i32 16, i1 false)
call void @llvm.lifetime.end.p5i8(i64 144, i8 addrspace(5)* nonnull %tmp23) #3
%tmp97 = getelementptr inbounds %struct.wombat, ptr addrspace(5) %tmp5, i32 0, i32 8, i32 1
call void @func(float %tmp96, i64 0, ptr addrspace(5) nonnull %tmp97) #3
%tmp984 = bitcast ptr addrspace(3) %17 to ptr addrspace(3)
%tmp99 = getelementptr inbounds %struct.snork, ptr addrspace(1) %arg4, i64 %tmp12, i32 8, i32 1, i64 0
call void @llvm.memcpy.p1.p3.i64(ptr addrspace(1) %tmp99, ptr addrspace(3) %tmp984, i64 16, i32 16, i1 false)
call void @llvm.lifetime.end.p5(i64 144, ptr addrspace(5) nonnull %tmp23) #3
ret void
}
declare void @func(float, i64, i16 addrspace(5)*)
declare void @llvm.lifetime.start.p5i8(i64, i8 addrspace(5)* nocapture) #0
declare void @func(float, i64, ptr addrspace(5))
declare void @llvm.lifetime.start.p5(i64, ptr addrspace(5) nocapture) #0
declare float @llvm.fmuladd.f32(float, float, float) #1
declare void @llvm.lifetime.end.p5i8(i64, i8 addrspace(5)* nocapture) #0
declare void @llvm.lifetime.end.p5(i64, ptr addrspace(5) nocapture) #0
declare <2 x float> @llvm.fmuladd.v2f32(<2 x float>, <2 x float>, <2 x float>) #1
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare void @llvm.dbg.value(metadata, metadata, metadata) #1
declare i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #1
declare ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #1
declare i32 @llvm.amdgcn.workitem.id.y() #1
declare i32 @llvm.amdgcn.workitem.id.z() #1
declare void @llvm.memcpy.p1i8.p5i8.i64(i8 addrspace(1)* nocapture writeonly, i8 addrspace(5)* nocapture readonly, i64, i32, i1) #0
declare void @llvm.memcpy.p1i8.p3i8.i64(i8 addrspace(1)* nocapture writeonly, i8 addrspace(3)* nocapture readonly, i64, i32, i1) #0
declare void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) nocapture writeonly, ptr addrspace(5) nocapture readonly, i64, i32, i1) #0
declare void @llvm.memcpy.p1.p3.i64(ptr addrspace(1) nocapture writeonly, ptr addrspace(3) nocapture readonly, i64, i32, i1) #0
attributes #0 = { argmemonly nounwind }
attributes #1 = { nounwind readnone speculatable }
@ -203,9 +203,9 @@ body: |
%2:vgpr_32 = COPY $vgpr2
%1:vgpr_32 = COPY $vgpr1
%0:vgpr_32 = COPY $vgpr0
%5:sreg_64_xexec = S_LOAD_DWORDX2_IMM %4, 0, 0 :: (non-temporal dereferenceable invariant load (s64) from `i64 addrspace(4)* undef`)
%6:sreg_64_xexec = S_LOAD_DWORDX2_IMM %4, 8, 0 :: (non-temporal dereferenceable invariant load (s64) from `i64 addrspace(4)* undef`)
%7:sreg_64_xexec = S_LOAD_DWORDX2_IMM %4, 16, 0 :: (non-temporal dereferenceable invariant load (s64) from `i64 addrspace(4)* undef`)
%5:sreg_64_xexec = S_LOAD_DWORDX2_IMM %4, 0, 0 :: (non-temporal dereferenceable invariant load (s64) from `ptr addrspace(4) undef`)
%6:sreg_64_xexec = S_LOAD_DWORDX2_IMM %4, 8, 0 :: (non-temporal dereferenceable invariant load (s64) from `ptr addrspace(4) undef`)
%7:sreg_64_xexec = S_LOAD_DWORDX2_IMM %4, 16, 0 :: (non-temporal dereferenceable invariant load (s64) from `ptr addrspace(4) undef`)
%8:sreg_64_xexec = S_LOAD_DWORDX2_IMM %4, 24, 0
%9:sreg_64_xexec = S_LOAD_DWORDX2_IMM %4, 32, 0
%10:sreg_64_xexec = S_LOAD_DWORDX2_IMM %3, 4, 0

View File

@ -26,7 +26,7 @@
source_filename = "sdwa-scalar-ops.opt.ll"
target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
define amdgpu_kernel void @sdwa_imm_operand(i32 addrspace(1)* nocapture %arg) {
define amdgpu_kernel void @sdwa_imm_operand(ptr addrspace(1) nocapture %arg) {
bb:
br label %bb2
@ -35,29 +35,29 @@
bb2: ; preds = %bb2, %bb
%lsr.iv = phi i64 [ %lsr.iv.next, %bb2 ], [ 0, %bb ]
%bc = bitcast i32 addrspace(1)* %arg to i8 addrspace(1)*
%uglygep4 = getelementptr i8, i8 addrspace(1)* %bc, i64 %lsr.iv
%uglygep45 = bitcast i8 addrspace(1)* %uglygep4 to i32 addrspace(1)*
%tmp5 = load i32, i32 addrspace(1)* %uglygep45, align 4
%bc = bitcast ptr addrspace(1) %arg to ptr addrspace(1)
%uglygep4 = getelementptr i8, ptr addrspace(1) %bc, i64 %lsr.iv
%uglygep45 = bitcast ptr addrspace(1) %uglygep4 to ptr addrspace(1)
%tmp5 = load i32, ptr addrspace(1) %uglygep45, align 4
%tmp6 = lshr i32 %tmp5, 8
%tmp7 = and i32 %tmp6, 255
%tmp8 = zext i32 %tmp7 to i64
%tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp8
store i32 1, i32 addrspace(1)* %tmp9, align 4
%scevgep = getelementptr i32, i32 addrspace(1)* %uglygep45, i64 1
%tmp13 = load i32, i32 addrspace(1)* %scevgep, align 4
%tmp9 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp8
store i32 1, ptr addrspace(1) %tmp9, align 4
%scevgep = getelementptr i32, ptr addrspace(1) %uglygep45, i64 1
%tmp13 = load i32, ptr addrspace(1) %scevgep, align 4
%tmp14 = lshr i32 %tmp13, 8
%tmp15 = and i32 %tmp14, 255
%tmp16 = zext i32 %tmp15 to i64
%tmp17 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp16
store i32 1, i32 addrspace(1)* %tmp17, align 4
%tmp17 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp16
store i32 1, ptr addrspace(1) %tmp17, align 4
%lsr.iv.next = add nuw nsw i64 %lsr.iv, 8
%tmp1 = trunc i64 %lsr.iv.next to i32
%tmp19 = icmp eq i32 %tmp1, 4096
br i1 %tmp19, label %bb1, label %bb2
}
define amdgpu_kernel void @sdwa_sgpr_operand(i32 addrspace(1)* nocapture %arg) {
define amdgpu_kernel void @sdwa_sgpr_operand(ptr addrspace(1) nocapture %arg) {
bb:
br label %bb2
@ -66,22 +66,22 @@
bb2: ; preds = %bb2, %bb
%lsr.iv = phi i64 [ %lsr.iv.next, %bb2 ], [ 0, %bb ]
%bc = bitcast i32 addrspace(1)* %arg to i8 addrspace(1)*
%uglygep4 = getelementptr i8, i8 addrspace(1)* %bc, i64 %lsr.iv
%uglygep45 = bitcast i8 addrspace(1)* %uglygep4 to i32 addrspace(1)*
%tmp5 = load i32, i32 addrspace(1)* %uglygep45, align 4
%bc = bitcast ptr addrspace(1) %arg to ptr addrspace(1)
%uglygep4 = getelementptr i8, ptr addrspace(1) %bc, i64 %lsr.iv
%uglygep45 = bitcast ptr addrspace(1) %uglygep4 to ptr addrspace(1)
%tmp5 = load i32, ptr addrspace(1) %uglygep45, align 4
%tmp6 = lshr i32 %tmp5, 8
%tmp7 = and i32 %tmp6, 255
%tmp8 = zext i32 %tmp7 to i64
%tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp8
store i32 1, i32 addrspace(1)* %tmp9, align 4
%scevgep = getelementptr i32, i32 addrspace(1)* %uglygep45, i64 1
%tmp13 = load i32, i32 addrspace(1)* %scevgep, align 4
%tmp9 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp8
store i32 1, ptr addrspace(1) %tmp9, align 4
%scevgep = getelementptr i32, ptr addrspace(1) %uglygep45, i64 1
%tmp13 = load i32, ptr addrspace(1) %scevgep, align 4
%tmp14 = lshr i32 %tmp13, 8
%tmp15 = and i32 %tmp14, 255
%tmp16 = zext i32 %tmp15 to i64
%tmp17 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp16
store i32 1, i32 addrspace(1)* %tmp17, align 4
%tmp17 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp16
store i32 1, ptr addrspace(1) %tmp17, align 4
%lsr.iv.next = add nuw nsw i64 %lsr.iv, 8
%tmp1 = trunc i64 %lsr.iv.next to i32
%tmp19 = icmp eq i32 %tmp1, 4096
@ -203,7 +203,7 @@ body: |
liveins: $sgpr4_sgpr5
%4 = COPY $sgpr4_sgpr5
%9 = S_LOAD_DWORDX2_IMM %4, 0, 0 :: (non-temporal dereferenceable invariant load (s64) from `i64 addrspace(4)* undef`)
%9 = S_LOAD_DWORDX2_IMM %4, 0, 0 :: (non-temporal dereferenceable invariant load (s64) from `ptr addrspace(4) undef`)
%8 = S_MOV_B64 0
%7 = COPY %9
%30 = V_MOV_B32_e32 1, implicit $exec
@ -365,7 +365,7 @@ body: |
liveins: $sgpr4_sgpr5
%4 = COPY $sgpr4_sgpr5
%9 = S_LOAD_DWORDX2_IMM %4, 0, 0 :: (non-temporal dereferenceable invariant load (s64) from `i64 addrspace(4)* undef`)
%9 = S_LOAD_DWORDX2_IMM %4, 0, 0 :: (non-temporal dereferenceable invariant load (s64) from `ptr addrspace(4) undef`)
%8 = S_MOV_B64 0
%7 = COPY %9
%30 = V_MOV_B32_e32 1, implicit $exec

View File

@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] neg_hi:[1,0,0]
@ -14,11 +14,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<8 x half> %A, <8 x half>
bb:
%fneg.A = fneg <8 x half> %A
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %fneg.A, <8 x half> %B, <8 x float> %C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] neg_hi:[0,1,0]
@ -31,11 +31,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<8 x half> %A, <8 x half>
bb:
%fneg.B = fneg <8 x half> %B
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x float> %C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
@ -48,11 +48,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<8 x half> %A, <8 x half>
bb:
%fneg.C = fneg <8 x float> %C
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1]
@ -65,11 +65,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<8 x half> %A, <8 x half>
bb:
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %A, <8 x half> %B, <8 x float> %fabs.C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
@ -82,11 +82,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<8 x i16> %A, <8 x i16>
bb:
%fneg.C = fneg <8 x float> %C
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x float> %fneg.C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1]
@ -99,11 +99,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<8 x i16> %A, <8 x i16>
bb:
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x float> %fabs.C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] neg_hi:[1,0,0]
@ -114,11 +114,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<8 x half> %A, <8 x half>
bb:
%fneg.A = fneg <8 x half> %A
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %fneg.A, <8 x half> %B, <8 x half> %C, i1 0)
store <8 x half> %res, <8 x half> addrspace(1)* %out
store <8 x half> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] neg_hi:[0,1,0]
@ -129,11 +129,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<8 x half> %A, <8 x half>
bb:
%fneg.B = fneg <8 x half> %B
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> %C, i1 0)
store <8 x half> %res, <8 x half> addrspace(1)* %out
store <8 x half> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1]
@ -144,11 +144,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<8 x half> %A, <8 x half>
bb:
%fneg.C = fneg <8 x half> %C
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.C, i1 0)
store <8 x half> %res, <8 x half> addrspace(1)* %out
store <8 x half> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1]
@ -159,11 +159,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<8 x half> %A, <8 x half>
bb:
%fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C)
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fabs.C, i1 0)
store <8 x half> %res, <8 x half> addrspace(1)* %out
store <8 x half> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
@ -176,11 +176,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(<2 x i32> %A, <2 x i3
bb:
%fneg.C = fneg <8 x float> %C
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
@ -193,11 +193,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(<2 x i32> %A, <2 x i3
bb:
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
@ -210,11 +210,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(<2 x i32> %A, <2 x i3
bb:
%fneg.C = fneg <8 x float> %C
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
@ -227,11 +227,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(<2 x i32> %A, <2 x i3
bb:
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
@ -244,11 +244,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(<2 x i32> %A, <2 x i3
bb:
%fneg.C = fneg <8 x float> %C
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
@ -261,11 +261,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(<2 x i32> %A, <2 x i3
bb:
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
@ -278,11 +278,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(<2 x i32> %A, <2 x i3
bb:
%fneg.C = fneg <8 x float> %C
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
@ -295,11 +295,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(<2 x i32> %A, <2 x i3
bb:
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, <8 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] neg_hi:[1,0,0]
@ -312,11 +312,11 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<8 x half> %A, <16 x ha
bb:
%fneg.A = fneg <8 x half> %A
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x float> %C, i16 %Index)
store <8 x float> %res, <8 x float> addrspace(1)* %out
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, <8 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] neg_hi:[0,1,0]
@ -329,11 +329,11 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<8 x half> %A, <16 x ha
bb:
%fneg.B = fneg <16 x half> %B
%res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x float> %C, i16 %Index)
store <8 x float> %res, <8 x float> addrspace(1)* %out
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, <8 x half> addrspace(1)* %out) {
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] neg_hi:[1,0,0]
@ -344,11 +344,11 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<8 x half> %A, <16 x ha
bb:
%fneg.A = fneg <8 x half> %A
%res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x half> %C, i16 %Index)
store <8 x half> %res, <8 x half> addrspace(1)* %out
store <8 x half> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, <8 x half> addrspace(1)* %out) {
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] neg_hi:[0,1,0]
@ -359,13 +359,13 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<8 x half> %A, <16 x ha
bb:
%fneg.B = fneg <16 x half> %B
%res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x half> %C, i16 %Index)
store <8 x half> %res, <8 x half> addrspace(1)* %out
store <8 x half> %res, ptr addrspace(1) %out
ret void
}
; both neg and abs patterns (wmma matrix C f32 or f16 )
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] neg_hi:[0,0,1]
@ -379,11 +379,11 @@ bb:
%fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
%fneg.fabs.C = fneg <8 x float> %fabs.C
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.fabs.C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] neg_hi:[0,0,1]
@ -395,11 +395,11 @@ bb:
%fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C)
%fneg.fabs.C = fneg <8 x half> %fabs.C
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.fabs.C, i1 0)
store <8 x half> %res, <8 x half> addrspace(1)* %out
store <8 x half> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_and_b32_e32 v11, 0x7fffffff, v11
@ -417,13 +417,13 @@ bb:
%partial.fabs.C = insertelement <8 x float> %C, float %el3.fabs, i32 3
%fneg.partial.fabs.C = fneg <8 x float> %partial.fabs.C
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.partial.fabs.C)
store <8 x float> %res, <8 x float> addrspace(1)* %out
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
; A or B matrix modifier and constant in C
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0]
@ -436,11 +436,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<8 x half> %A,
bb:
%fneg.A = fneg <8 x half> %A
%res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %fneg.A, <8 x half> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
store <8 x float> %res, <8 x float> addrspace(1)* %out
store <8 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0]
@ -451,13 +451,13 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<8 x half> %A,
bb:
%fneg.B = fneg <8 x half> %B
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> <half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
store <8 x half> %res, <8 x half> addrspace(1)* %out
store <8 x half> %res, ptr addrspace(1) %out
ret void
}
; pack f16 elements with v_perm_b32 since they don't come from same b32
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<8 x half> %A, <8 x half> %B, ptr %Caddr, <8 x half> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<8 x half> %A, <8 x half> %B, ptr %Caddr, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: s_clause 0x1
@ -480,7 +480,7 @@ bb:
%C_shuffle = shufflevector <16 x half> %C, <16 x half> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
%fneg.C_shuffle = fneg <8 x half> %C_shuffle
%res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.C_shuffle , i1 0)
store <8 x half> %res, <8 x half> addrspace(1)* %out
store <8 x half> %res, ptr addrspace(1) %out
ret void
}

View File

@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] neg_hi:[1,0,0]
@ -16,7 +16,7 @@ bb:
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] neg_hi:[0,1,0]
@ -27,11 +27,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<4 x half> %A, <4 x half>
bb:
%fneg.B = fneg <4 x half> %B
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %A, <4 x half> %fneg.B, <4 x float> %C)
store <4 x float> %res, <4 x float> addrspace(1)* %out
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
@ -42,11 +42,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<4 x half> %A, <4 x half>
bb:
%fneg.C = fneg <4 x float> %C
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.C)
store <4 x float> %res, <4 x float> addrspace(1)* %out
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
@ -57,11 +57,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<4 x half> %A, <4 x half>
bb:
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %A, <4 x half> %B, <4 x float> %fabs.C)
store <4 x float> %res, <4 x float> addrspace(1)* %out
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
@ -72,11 +72,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<4 x i16> %A, <4 x i16>
bb:
%fneg.C = fneg <4 x float> %C
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x float> %fneg.C)
store <4 x float> %res, <4 x float> addrspace(1)* %out
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
@ -87,11 +87,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<4 x i16> %A, <4 x i16>
bb:
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x float> %fabs.C)
store <4 x float> %res, <4 x float> addrspace(1)* %out
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] neg_hi:[1,0,0]
@ -102,11 +102,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<4 x half> %A, <4 x half>
bb:
%fneg.A = fneg <4 x half> %A
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %fneg.A, <4 x half> %B, <4 x half> %C, i1 0)
store <4 x half> %res, <4 x half> addrspace(1)* %out
store <4 x half> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] neg_hi:[0,1,0]
@ -117,11 +117,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<4 x half> %A, <4 x half>
bb:
%fneg.B = fneg <4 x half> %B
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> %C, i1 0)
store <4 x half> %res, <4 x half> addrspace(1)* %out
store <4 x half> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1]
@ -132,11 +132,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<4 x half> %A, <4 x half>
bb:
%fneg.C = fneg <4 x half> %C
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.C, i1 0)
store <4 x half> %res, <4 x half> addrspace(1)* %out
store <4 x half> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1]
@ -147,11 +147,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<4 x half> %A, <4 x half>
bb:
%fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C)
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fabs.C, i1 0)
store <4 x half> %res, <4 x half> addrspace(1)* %out
store <4 x half> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
@ -162,11 +162,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(i32 %A, i32 %B, <4 x
bb:
%fneg.C = fneg <4 x float> %C
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fneg.C)
store <4 x float> %res, <4 x float> addrspace(1)* %out
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
@ -177,11 +177,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(i32 %A, i32 %B, <4 x
bb:
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fabs.C)
store <4 x float> %res, <4 x float> addrspace(1)* %out
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
@ -192,11 +192,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(i32 %A, i32 %B, <4 x
bb:
%fneg.C = fneg <4 x float> %C
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fneg.C)
store <4 x float> %res, <4 x float> addrspace(1)* %out
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
@ -207,11 +207,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(i32 %A, i32 %B, <4 x
bb:
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fabs.C)
store <4 x float> %res, <4 x float> addrspace(1)* %out
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
@ -222,11 +222,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(i32 %A, i32 %B, <4 x
bb:
%fneg.C = fneg <4 x float> %C
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fneg.C)
store <4 x float> %res, <4 x float> addrspace(1)* %out
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
@ -237,11 +237,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(i32 %A, i32 %B, <4 x
bb:
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fabs.C)
store <4 x float> %res, <4 x float> addrspace(1)* %out
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
@ -252,11 +252,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(i32 %A, i32 %B, <4 x
bb:
%fneg.C = fneg <4 x float> %C
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fneg.C)
store <4 x float> %res, <4 x float> addrspace(1)* %out
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
@ -267,11 +267,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(i32 %A, i32 %B, <4 x
bb:
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fabs.C)
store <4 x float> %res, <4 x float> addrspace(1)* %out
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, <4 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] neg_hi:[1,0,0]
@ -282,11 +282,11 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<4 x half> %A, <8 x hal
bb:
%fneg.A = fneg <4 x half> %A
%res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x float> %C, i16 %Index)
store <4 x float> %res, <4 x float> addrspace(1)* %out
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, <4 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] neg_hi:[0,1,0]
@ -297,11 +297,11 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<4 x half> %A, <8 x hal
bb:
%fneg.B = fneg <8 x half> %B
%res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x float> %C, i16 %Index)
store <4 x float> %res, <4 x float> addrspace(1)* %out
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, <4 x half> addrspace(1)* %out) {
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] neg_hi:[1,0,0]
@ -312,11 +312,11 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<4 x half> %A, <8 x hal
bb:
%fneg.A = fneg <4 x half> %A
%res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x half> %C, i16 %Index)
store <4 x half> %res, <4 x half> addrspace(1)* %out
store <4 x half> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, <4 x half> addrspace(1)* %out) {
define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, ptr addrspace(1) %out) {
; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] neg_hi:[0,1,0]
@ -327,13 +327,13 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<4 x half> %A, <8 x hal
bb:
%fneg.B = fneg <8 x half> %B
%res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x half> %C, i16 %Index)
store <4 x half> %res, <4 x half> addrspace(1)* %out
store <4 x half> %res, ptr addrspace(1) %out
ret void
}
; both neg and abs patterns (wmma matrix C f32 or f16 )
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] neg_hi:[0,0,1]
@ -345,11 +345,11 @@ bb:
%fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
%fneg.fabs.C = fneg <4 x float> %fabs.C
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.fabs.C)
store <4 x float> %res, <4 x float> addrspace(1)* %out
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] neg_hi:[0,0,1]
@ -361,11 +361,11 @@ bb:
%fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C)
%fneg.fabs.C = fneg <4 x half> %fabs.C
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.fabs.C, i1 0)
store <4 x half> %res, <4 x half> addrspace(1)* %out
store <4 x half> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_and_b32_e32 v7, 0x7fffffff, v7
@ -381,13 +381,13 @@ bb:
%partial.fabs.C = insertelement <4 x float> %C, float %el3.fabs, i32 3
%fneg.partial.fabs.C = fneg <4 x float> %partial.fabs.C
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.partial.fabs.C)
store <4 x float> %res, <4 x float> addrspace(1)* %out
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
; A or B matrix modifier and constant in C
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0]
@ -398,11 +398,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<4 x half> %A,
bb:
%fneg.A = fneg <4 x half> %A
%res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %fneg.A, <4 x half> %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
store <4 x float> %res, <4 x float> addrspace(1)* %out
store <4 x float> %res, ptr addrspace(1) %out
ret void
}
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0]
@ -413,13 +413,13 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<4 x half> %A,
bb:
%fneg.B = fneg <4 x half> %B
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
store <4 x half> %res, <4 x half> addrspace(1)* %out
store <4 x half> %res, ptr addrspace(1) %out
ret void
}
; pack f16 elements with v_perm_b32 since they don't come from same b32
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<4 x half> %A, <4 x half> %B, ptr %Caddr, <4 x half> addrspace(1)* %out) {
define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<4 x half> %A, <4 x half> %B, ptr %Caddr, ptr addrspace(1) %out) {
; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: flat_load_b128 v[8:11], v[4:5]
@ -437,7 +437,7 @@ bb:
%C_shuffle = shufflevector <8 x half> %C, <8 x half> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
%fneg.C_shuffle = fneg <4 x half> %C_shuffle
%res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.C_shuffle , i1 0)
store <4 x half> %res, <4 x half> addrspace(1)* %out
store <4 x half> %res, ptr addrspace(1) %out
ret void
}