[AMDGPU] Convert tests to opaque pointers (NFC)

2024-02-05 12:41:37 +01:00 · 2024-02-05 12:41:37 +01:00 · 00a4e248dc
commit 00a4e248dc
parent 1d3d8936ba
26 changed files with 634 additions and 634 deletions
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll
@ -2,7 +2,7 @@
 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX11 %s
 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX12 %s

-define amdgpu_kernel void @s_add_u64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @s_add_u64(ptr addrspace(1) %out, i64 %a, i64 %b) {
 ; GFX11-LABEL: s_add_u64:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x1
@ -35,11 +35,11 @@ define amdgpu_kernel void @s_add_u64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
 ; GFX12-NEXT:    s_endpgm
 entry:
  %add = add i64 %a, %b
-  store i64 %add, i64 addrspace(1)* %out
+  store i64 %add, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @v_add_u64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_ps void @v_add_u64(ptr addrspace(1) %out, i64 %a, i64 %b) {
 ; GCN-LABEL: v_add_u64:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
@ -50,11 +50,11 @@ define amdgpu_ps void @v_add_u64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
 ; GCN-NEXT:    s_endpgm
 entry:
  %add = add i64 %a, %b
-  store i64 %add, i64 addrspace(1)* %out
+  store i64 %add, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_kernel void @s_sub_u64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @s_sub_u64(ptr addrspace(1) %out, i64 %a, i64 %b) {
 ; GFX11-LABEL: s_sub_u64:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_clause 0x1
@ -87,11 +87,11 @@ define amdgpu_kernel void @s_sub_u64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
 ; GFX12-NEXT:    s_endpgm
 entry:
  %sub = sub i64 %a, %b
-  store i64 %sub, i64 addrspace(1)* %out
+  store i64 %sub, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @v_sub_u64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_ps void @v_sub_u64(ptr addrspace(1) %out, i64 %a, i64 %b) {
 ; GCN-LABEL: v_sub_u64:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    v_sub_co_u32 v2, vcc_lo, v2, v4
@ -102,6 +102,6 @@ define amdgpu_ps void @v_sub_u64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
 ; GCN-NEXT:    s_endpgm
 entry:
  %sub = sub i64 %a, %b
-  store i64 %sub, i64 addrspace(1)* %out
+  store i64 %sub, ptr addrspace(1) %out
  ret void
 }
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/prelegalizer-combiner-divrem.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/prelegalizer-combiner-divrem.mir
@ -327,8 +327,8 @@ body: |
    %ptr2:_(p1) = G_IMPLICIT_DEF
    %ptr3:_(p1) = COPY $vgpr2_vgpr3
    %ptr4:_(p1) = COPY $vgpr4_vgpr5
-    G_STORE %src1:_(s32), %ptr1:_(p1) :: (volatile store (s32) into `i32 addrspace(1)* undef`, addrspace 1)
-    G_STORE %src2:_(s32), %ptr2:_(p1) :: (volatile store (s32) into `i32 addrspace(1)* undef`, addrspace 1)
+    G_STORE %src1:_(s32), %ptr1:_(p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1)
+    G_STORE %src2:_(s32), %ptr2:_(p1) :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1)
    %div:_(s32) = G_SDIV %src1:_(s32), %src2:_(s32)
    G_STORE %div:_(s32), %ptr3:_(p1) :: (store (s32), addrspace 1, align 4)
    %rem:_(s32) = G_SREM %src1:_(s32), %src2:_(s32)
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll
@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12

-define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] neg_hi:[1,0,0]
@ -14,11 +14,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<8 x half> %A, <8 x half>
 bb:
  %fneg.A = fneg <8 x half> %A
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %fneg.A, <8 x half> %B, <8 x float> %C)
-  store <8 x float> %res, <8 x float> addrspace(1)* %out
+  store <8 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] neg_hi:[0,1,0]
@ -31,11 +31,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<8 x half> %A, <8 x half>
 bb:
  %fneg.B = fneg <8 x half> %B
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %fneg.B, <8 x float> %C)
-  store <8 x float> %res, <8 x float> addrspace(1)* %out
+  store <8 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
@ -48,11 +48,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<8 x half> %A, <8 x half>
 bb:
  %fneg.C = fneg <8 x float> %C
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.C)
-  store <8 x float> %res, <8 x float> addrspace(1)* %out
+  store <8 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1]
@ -65,11 +65,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<8 x half> %A, <8 x half>
 bb:
  %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fabs.C)
-  store <8 x float> %res, <8 x float> addrspace(1)* %out
+  store <8 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
@ -82,11 +82,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<8 x i16> %A, <8 x i16>
 bb:
  %fneg.C = fneg <8 x float> %C
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %fneg.C)
-  store <8 x float> %res, <8 x float> addrspace(1)* %out
+  store <8 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1]
@ -99,11 +99,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<8 x i16> %A, <8 x i16>
 bb:
  %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %fabs.C)
-  store <8 x float> %res, <8 x float> addrspace(1)* %out
+  store <8 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] neg_hi:[1,0,0]
@ -114,11 +114,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<8 x half> %A, <8 x half>
 bb:
  %fneg.A = fneg <8 x half> %A
  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %fneg.A, <8 x half> %B, <8 x half> %C, i1 0)
-  store <8 x half> %res, <8 x half> addrspace(1)* %out
+  store <8 x half> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] neg_hi:[0,1,0]
@ -129,11 +129,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<8 x half> %A, <8 x half>
 bb:
  %fneg.B = fneg <8 x half> %B
  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> %C, i1 0)
-  store <8 x half> %res, <8 x half> addrspace(1)* %out
+  store <8 x half> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1]
@ -144,11 +144,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<8 x half> %A, <8 x half>
 bb:
  %fneg.C = fneg <8 x half> %C
  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.C, i1 0)
-  store <8 x half> %res, <8 x half> addrspace(1)* %out
+  store <8 x half> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1]
@ -159,11 +159,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<8 x half> %A, <8 x half>
 bb:
  %fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C)
  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fabs.C, i1 0)
-  store <8 x half> %res, <8 x half> addrspace(1)* %out
+  store <8 x half> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
@ -176,11 +176,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(<2 x i32> %A, <2 x i3
 bb:
  %fneg.C = fneg <8 x float> %C
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
-  store <8 x float> %res, <8 x float> addrspace(1)* %out
+  store <8 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
@ -193,11 +193,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(<2 x i32> %A, <2 x i3
 bb:
  %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
-  store <8 x float> %res, <8 x float> addrspace(1)* %out
+  store <8 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
@ -210,11 +210,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(<2 x i32> %A, <2 x i3
 bb:
  %fneg.C = fneg <8 x float> %C
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
-  store <8 x float> %res, <8 x float> addrspace(1)* %out
+  store <8 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
@ -227,11 +227,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(<2 x i32> %A, <2 x i3
 bb:
  %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
-  store <8 x float> %res, <8 x float> addrspace(1)* %out
+  store <8 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
@ -244,11 +244,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(<2 x i32> %A, <2 x i3
 bb:
  %fneg.C = fneg <8 x float> %C
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
-  store <8 x float> %res, <8 x float> addrspace(1)* %out
+  store <8 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
@ -261,11 +261,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(<2 x i32> %A, <2 x i3
 bb:
  %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
-  store <8 x float> %res, <8 x float> addrspace(1)* %out
+  store <8 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
@ -278,11 +278,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(<2 x i32> %A, <2 x i3
 bb:
  %fneg.C = fneg <8 x float> %C
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
-  store <8 x float> %res, <8 x float> addrspace(1)* %out
+  store <8 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
@ -295,11 +295,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(<2 x i32> %A, <2 x i3
 bb:
  %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
-  store <8 x float> %res, <8 x float> addrspace(1)* %out
+  store <8 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, <8 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] neg_hi:[1,0,0]
@ -312,11 +312,11 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<8 x half> %A, <16 x ha
 bb:
  %fneg.A = fneg <8 x half> %A
  %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x float> %C, i16 %Index)
-  store <8 x float> %res, <8 x float> addrspace(1)* %out
+  store <8 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, <8 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] neg_hi:[0,1,0]
@ -329,11 +329,11 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<8 x half> %A, <16 x ha
 bb:
  %fneg.B = fneg <16 x half> %B
  %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x float> %C, i16 %Index)
-  store <8 x float> %res, <8 x float> addrspace(1)* %out
+  store <8 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, <8 x half> addrspace(1)* %out) {
+define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] neg_hi:[1,0,0]
@ -344,11 +344,11 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<8 x half> %A, <16 x ha
 bb:
  %fneg.A = fneg <8 x half> %A
  %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x half> %C, i16 %Index)
-  store <8 x half> %res, <8 x half> addrspace(1)* %out
+  store <8 x half> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, <8 x half> addrspace(1)* %out) {
+define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] neg_hi:[0,1,0]
@ -359,13 +359,13 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<8 x half> %A, <16 x ha
 bb:
  %fneg.B = fneg <16 x half> %B
  %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x half> %C, i16 %Index)
-  store <8 x half> %res, <8 x half> addrspace(1)* %out
+  store <8 x half> %res, ptr addrspace(1) %out
  ret void
 }

 ; both neg and abs patterns (wmma matrix C f32 or f16 )

-define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] neg_hi:[0,0,1]
@ -379,11 +379,11 @@ bb:
  %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
  %fneg.fabs.C = fneg <8 x float> %fabs.C
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.fabs.C)
-  store <8 x float> %res, <8 x float> addrspace(1)* %out
+  store <8 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] neg_hi:[0,0,1]
@ -395,11 +395,11 @@ bb:
  %fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C)
  %fneg.fabs.C = fneg <8 x half> %fabs.C
  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.fabs.C, i1 0)
-  store <8 x half> %res, <8 x half> addrspace(1)* %out
+  store <8 x half> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_and_b32_e32 v11, 0x7fffffff, v11
@ -417,13 +417,13 @@ bb:
  %partial.fabs.C = insertelement <8 x float> %C, float %el3.fabs, i32 3
  %fneg.partial.fabs.C = fneg <8 x float> %partial.fabs.C
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.partial.fabs.C)
-  store <8 x float> %res, <8 x float> addrspace(1)* %out
+  store <8 x float> %res, ptr addrspace(1) %out
  ret void
 }

 ; A or B matrix modifier and constant in C

-define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0]
@ -436,11 +436,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<8 x half> %A,
 bb:
  %fneg.A = fneg <8 x half> %A
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %fneg.A, <8 x half> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
-  store <8 x float> %res, <8 x float> addrspace(1)* %out
+  store <8 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0]
@ -451,13 +451,13 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<8 x half> %A,
 bb:
  %fneg.B = fneg <8 x half> %B
  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> <half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
-  store <8 x half> %res, <8 x half> addrspace(1)* %out
+  store <8 x half> %res, ptr addrspace(1) %out
  ret void
 }

 ; pack f16 elements with v_perm_b32 since they don't come from same b32

-define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<8 x half> %A, <8 x half> %B, ptr %Caddr, <8 x half> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<8 x half> %A, <8 x half> %B, ptr %Caddr, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    s_clause 0x1
@ -485,7 +485,7 @@ bb:
  %C_shuffle = shufflevector <16 x half> %C, <16 x half> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %fneg.C_shuffle = fneg <8 x half> %C_shuffle
  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.C_shuffle , i1 0)
-  store <8 x half> %res, <8 x half> addrspace(1)* %out
+  store <8 x half> %res, ptr addrspace(1) %out
  ret void
 }

--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12

-define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] neg_hi:[1,0,0]
@ -16,7 +16,7 @@ bb:
  ret void
 }

-define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] neg_hi:[0,1,0]
@ -27,11 +27,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<4 x half> %A, <4 x half>
 bb:
  %fneg.B = fneg <4 x half> %B
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %fneg.B, <4 x float> %C)
-  store <4 x float> %res, <4 x float> addrspace(1)* %out
+  store <4 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
@ -42,11 +42,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<4 x half> %A, <4 x half>
 bb:
  %fneg.C = fneg <4 x float> %C
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.C)
-  store <4 x float> %res, <4 x float> addrspace(1)* %out
+  store <4 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
@ -57,11 +57,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<4 x half> %A, <4 x half>
 bb:
  %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fabs.C)
-  store <4 x float> %res, <4 x float> addrspace(1)* %out
+  store <4 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
@ -72,11 +72,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<4 x i16> %A, <4 x i16>
 bb:
  %fneg.C = fneg <4 x float> %C
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %fneg.C)
-  store <4 x float> %res, <4 x float> addrspace(1)* %out
+  store <4 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
@ -87,11 +87,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<4 x i16> %A, <4 x i16>
 bb:
  %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %fabs.C)
-  store <4 x float> %res, <4 x float> addrspace(1)* %out
+  store <4 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] neg_hi:[1,0,0]
@ -102,11 +102,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<4 x half> %A, <4 x half>
 bb:
  %fneg.A = fneg <4 x half> %A
  %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %fneg.A, <4 x half> %B, <4 x half> %C, i1 0)
-  store <4 x half> %res, <4 x half> addrspace(1)* %out
+  store <4 x half> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] neg_hi:[0,1,0]
@ -117,11 +117,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<4 x half> %A, <4 x half>
 bb:
  %fneg.B = fneg <4 x half> %B
  %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> %C, i1 0)
-  store <4 x half> %res, <4 x half> addrspace(1)* %out
+  store <4 x half> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1]
@ -132,11 +132,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<4 x half> %A, <4 x half>
 bb:
  %fneg.C = fneg <4 x half> %C
  %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.C, i1 0)
-  store <4 x half> %res, <4 x half> addrspace(1)* %out
+  store <4 x half> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1]
@ -147,11 +147,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<4 x half> %A, <4 x half>
 bb:
  %fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C)
  %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fabs.C, i1 0)
-  store <4 x half> %res, <4 x half> addrspace(1)* %out
+  store <4 x half> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
@ -162,11 +162,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(i32 %A, i32 %B, <4 x
 bb:
  %fneg.C = fneg <4 x float> %C
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C)
-  store <4 x float> %res, <4 x float> addrspace(1)* %out
+  store <4 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
@ -177,11 +177,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(i32 %A, i32 %B, <4 x
 bb:
  %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C)
-  store <4 x float> %res, <4 x float> addrspace(1)* %out
+  store <4 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
@ -192,11 +192,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(i32 %A, i32 %B, <4 x
 bb:
  %fneg.C = fneg <4 x float> %C
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C)
-  store <4 x float> %res, <4 x float> addrspace(1)* %out
+  store <4 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
@ -207,11 +207,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(i32 %A, i32 %B, <4 x
 bb:
  %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C)
-  store <4 x float> %res, <4 x float> addrspace(1)* %out
+  store <4 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
@ -222,11 +222,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(i32 %A, i32 %B, <4 x
 bb:
  %fneg.C = fneg <4 x float> %C
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C)
-  store <4 x float> %res, <4 x float> addrspace(1)* %out
+  store <4 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
@ -237,11 +237,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(i32 %A, i32 %B, <4 x
 bb:
  %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C)
-  store <4 x float> %res, <4 x float> addrspace(1)* %out
+  store <4 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
@ -252,11 +252,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(i32 %A, i32 %B, <4 x
 bb:
  %fneg.C = fneg <4 x float> %C
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C)
-  store <4 x float> %res, <4 x float> addrspace(1)* %out
+  store <4 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
@ -267,11 +267,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(i32 %A, i32 %B, <4 x
 bb:
  %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C)
-  store <4 x float> %res, <4 x float> addrspace(1)* %out
+  store <4 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, <4 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] neg_hi:[1,0,0]
@ -282,11 +282,11 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<4 x half> %A, <8 x hal
 bb:
  %fneg.A = fneg <4 x half> %A
  %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f16.v8f16.v4f32.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x float> %C, i16 %Index)
-  store <4 x float> %res, <4 x float> addrspace(1)* %out
+  store <4 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, <4 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] neg_hi:[0,1,0]
@ -297,11 +297,11 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<4 x half> %A, <8 x hal
 bb:
  %fneg.B = fneg <8 x half> %B
  %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f16.v8f16.v4f32.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x float> %C, i16 %Index)
-  store <4 x float> %res, <4 x float> addrspace(1)* %out
+  store <4 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, <4 x half> addrspace(1)* %out) {
+define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] neg_hi:[1,0,0]
@ -312,11 +312,11 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<4 x half> %A, <8 x hal
 bb:
  %fneg.A = fneg <4 x half> %A
  %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v8f16.v4f16.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x half> %C, i16 %Index)
-  store <4 x half> %res, <4 x half> addrspace(1)* %out
+  store <4 x half> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, <4 x half> addrspace(1)* %out) {
+define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] neg_hi:[0,1,0]
@ -327,13 +327,13 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<4 x half> %A, <8 x hal
 bb:
  %fneg.B = fneg <8 x half> %B
  %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v8f16.v4f16.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x half> %C, i16 %Index)
-  store <4 x half> %res, <4 x half> addrspace(1)* %out
+  store <4 x half> %res, ptr addrspace(1) %out
  ret void
 }

 ; both neg and abs patterns (wmma matrix C f32 or f16 )

-define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] neg_hi:[0,0,1]
@ -345,11 +345,11 @@ bb:
  %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
  %fneg.fabs.C = fneg <4 x float> %fabs.C
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.fabs.C)
-  store <4 x float> %res, <4 x float> addrspace(1)* %out
+  store <4 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] neg_hi:[0,0,1]
@ -361,11 +361,11 @@ bb:
  %fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C)
  %fneg.fabs.C = fneg <4 x half> %fabs.C
  %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.fabs.C, i1 0)
-  store <4 x half> %res, <4 x half> addrspace(1)* %out
+  store <4 x half> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_and_b32_e32 v7, 0x7fffffff, v7
@ -381,13 +381,13 @@ bb:
  %partial.fabs.C = insertelement <4 x float> %C, float %el3.fabs, i32 3
  %fneg.partial.fabs.C = fneg <4 x float> %partial.fabs.C
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.partial.fabs.C)
-  store <4 x float> %res, <4 x float> addrspace(1)* %out
+  store <4 x float> %res, ptr addrspace(1) %out
  ret void
 }

 ; A or B matrix modifier and constant in C

-define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0]
@ -398,11 +398,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<4 x half> %A,
 bb:
  %fneg.A = fneg <4 x half> %A
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %fneg.A, <4 x half> %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
-  store <4 x float> %res, <4 x float> addrspace(1)* %out
+  store <4 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0]
@ -413,13 +413,13 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<4 x half> %A,
 bb:
  %fneg.B = fneg <4 x half> %B
  %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
-  store <4 x half> %res, <4 x half> addrspace(1)* %out
+  store <4 x half> %res, ptr addrspace(1) %out
  ret void
 }

 ; pack f16 elements with v_perm_b32 since they don't come from same b32

-define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<4 x half> %A, <4 x half> %B, ptr %Caddr, <4 x half> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<4 x half> %A, <4 x half> %B, ptr %Caddr, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    flat_load_b128 v[8:11], v[4:5]
@ -440,7 +440,7 @@ bb:
  %C_shuffle = shufflevector <8 x half> %C, <8 x half> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %fneg.C_shuffle = fneg <4 x half> %C_shuffle
  %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.C_shuffle , i1 0)
-  store <4 x half> %res, <4 x half> addrspace(1)* %out
+  store <4 x half> %res, ptr addrspace(1) %out
  ret void
 }

--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@ -2574,7 +2574,7 @@ define void @test_arg_store_v2bf16(<2 x bfloat> %in, ptr addrspace(1) %out) {
  ret void
 }

-define void @test_arg_store_v3bf16(<3 x bfloat> %in, <3 x bfloat> addrspace(1)* %out) {
+define void @test_arg_store_v3bf16(<3 x bfloat> %in, ptr addrspace(1) %out) {
 ; GCN-LABEL: test_arg_store_v3bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@ -2637,7 +2637,7 @@ define void @test_arg_store_v3bf16(<3 x bfloat> %in, <3 x bfloat> addrspace(1)*
 ; GFX11-NEXT:    global_store_b16 v[2:3], v1, off offset:4
 ; GFX11-NEXT:    global_store_b32 v[2:3], v0, off
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  store <3 x bfloat> %in, <3 x bfloat> addrspace(1) * %out
+  store <3 x bfloat> %in, ptr addrspace(1) %out
  ret void
 }

--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation-debug-info.mir
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation-debug-info.mir
@ -15,25 +15,25 @@

 --- |

-  define amdgpu_kernel void @long_branch_dbg_value(float addrspace(1)* nocapture %arg, float %arg1) #1 !dbg !5 {
+  define amdgpu_kernel void @long_branch_dbg_value(ptr addrspace(1) nocapture %arg, float %arg1) #1 !dbg !5 {
  bb:
-    %long_branch_dbg_value.kernarg.segment = call nonnull align 16 dereferenceable(12) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
-    %arg.kernarg.offset = getelementptr inbounds i8, i8 addrspace(4)* %long_branch_dbg_value.kernarg.segment, i64 0
-    %arg.kernarg.offset.cast = bitcast i8 addrspace(4)* %arg.kernarg.offset to float addrspace(1)* addrspace(4)*, !amdgpu.uniform !2, !amdgpu.noclobber !2
-    %arg.load = load float addrspace(1)*, float addrspace(1)* addrspace(4)* %arg.kernarg.offset.cast, align 16, !invariant.load !2
-    %arg1.kernarg.offset = getelementptr inbounds i8, i8 addrspace(4)* %long_branch_dbg_value.kernarg.segment, i64 8
-    %arg1.kernarg.offset.cast = bitcast i8 addrspace(4)* %arg1.kernarg.offset to float addrspace(4)*, !amdgpu.uniform !2, !amdgpu.noclobber !2
-    %arg1.load = load float, float addrspace(4)* %arg1.kernarg.offset.cast, align 8, !invariant.load !2
+    %long_branch_dbg_value.kernarg.segment = call nonnull align 16 dereferenceable(12) ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
+    %arg.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %long_branch_dbg_value.kernarg.segment, i64 0
+    %arg.kernarg.offset.cast = bitcast ptr addrspace(4) %arg.kernarg.offset to ptr addrspace(4), !amdgpu.uniform !2, !amdgpu.noclobber !2
+    %arg.load = load ptr addrspace(1), ptr addrspace(4) %arg.kernarg.offset.cast, align 16, !invariant.load !2
+    %arg1.kernarg.offset = getelementptr inbounds i8, ptr addrspace(4) %long_branch_dbg_value.kernarg.segment, i64 8
+    %arg1.kernarg.offset.cast = bitcast ptr addrspace(4) %arg1.kernarg.offset to ptr addrspace(4), !amdgpu.uniform !2, !amdgpu.noclobber !2
+    %arg1.load = load float, ptr addrspace(4) %arg1.kernarg.offset.cast, align 8, !invariant.load !2
    %tmp = fmul float %arg1.load, %arg1.load
-    %tmp2 = getelementptr inbounds float, float addrspace(1)* %arg.load, i64 3
-    call void @llvm.dbg.value(metadata float addrspace(1)* %tmp2, metadata !11, metadata !DIExpression()) #5, !dbg !12
-    store float %tmp, float addrspace(1)* %tmp2, align 4, !dbg !12
+    %tmp2 = getelementptr inbounds float, ptr addrspace(1) %arg.load, i64 3
+    call void @llvm.dbg.value(metadata ptr addrspace(1) %tmp2, metadata !11, metadata !DIExpression()) #5, !dbg !12
+    store float %tmp, ptr addrspace(1) %tmp2, align 4, !dbg !12
    %tmp3 = fcmp olt float %tmp, 0x3810000000000000
    %tmp3.inv = xor i1 %tmp3, true
    br i1 %tmp3.inv, label %bb4, label %bb8, !amdgpu.uniform !2

  bb4:                                              ; preds = %bb
-    %tmp5 = load volatile float, float addrspace(1)* undef, align 4
+    %tmp5 = load volatile float, ptr addrspace(1) undef, align 4
    %tmp6 = fcmp oeq float %tmp5, 0x7FF0000000000000
    br i1 %tmp6, label %bb7, label %Flow, !amdgpu.uniform !2

@ -47,7 +47,7 @@
    ret void
  }

-  declare align 4 i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #2
+  declare align 4 ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() #2
  declare void @llvm.dbg.value(metadata, metadata, metadata) #0

  attributes #0 = { nounwind readnone speculatable willreturn }
@ -103,7 +103,7 @@ body:             |
    renamable $sgpr4_sgpr5 = IMPLICIT_DEF
    $vgpr0 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr4_sgpr5
    $vgpr1 = V_MOV_B32_e32 $sgpr5, implicit $exec, implicit killed $sgpr4_sgpr5, implicit $exec
-    renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (volatile load (s32) from `float addrspace(1)* undef`, addrspace 1)
+    renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (volatile load (s32) from `ptr addrspace(1) undef`, addrspace 1)
    renamable $sgpr4 = S_MOV_B32 2139095040
    S_WAITCNT 3952
    renamable $sgpr4_sgpr5 = nofpexcept V_CMP_NEQ_F32_e64 0, killed $sgpr4, 0, killed $vgpr0, 0, implicit $mode, implicit $exec
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@ -1385,7 +1385,7 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(ptr addrspace(1) noalias

 ; The other use of shuffle0_0 make it profitable to lower into v_perm

-define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out1, <4 x i8> addrspace(1)* noalias %in, <4 x i8> addrspace(1)* noalias %in1) nounwind {
+define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %out1, ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %in1) nounwind {
 ; SI-LABEL: load_v4i8_to_v4f32_unaligned_multiuse:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx8 s[0:7], s[0:1], 0x9
@ -1547,14 +1547,14 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned_multiuse(<4 x float> add
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid
-  %gep1 = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in1, i32 %tid
-  %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 1
-  %load1 = load <4 x i8>, <4 x i8> addrspace(1)* %gep1, align 1
+  %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid
+  %gep1 = getelementptr <4 x i8>, ptr addrspace(1) %in1, i32 %tid
+  %load = load <4 x i8>, ptr addrspace(1) %gep, align 1
+  %load1 = load <4 x i8>, ptr addrspace(1) %gep1, align 1
  %shuffle0_0 = shufflevector <4 x i8> %load, <4 x i8> %load1, <4 x i32> <i32 3, i32 2, i32 6, i32 2>
  %cvt = uitofp <4 x i8> %shuffle0_0 to <4 x float>
-  store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
-  store <4 x i8> %shuffle0_0, <4 x i8> addrspace(1)* %out1, align 4
+  store <4 x float> %cvt, ptr addrspace(1) %out, align 16
+  store <4 x i8> %shuffle0_0, ptr addrspace(1) %out1, align 4
  ret void
 }

--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2b.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2b.mir
@ -2,7 +2,7 @@
 # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -run-pass=machine-scheduler -verify-misched -o - %s | FileCheck -check-prefix=GCN %s

 --- |
-  define amdgpu_kernel void @single-wave-phase-2b(i32 addrspace(3)* noalias %in0, i32 addrspace(3)* noalias %in1, i32 addrspace(3)* noalias %in2, i32 addrspace(3)* noalias %in3, i32 addrspace(3)* noalias %in4, i32 addrspace(3)* noalias %in5, i32 addrspace(3)* noalias %in6, i32 addrspace(3)* noalias %in7, i32 addrspace(3)* noalias %in8, i32 addrspace(3)* noalias %in9, i32 addrspace(3)* noalias %in10, i32 addrspace(3)* noalias %in11, i32 addrspace(7)* noalias %in12, i32 addrspace(7)* noalias %in13, i32 addrspace(7)* noalias %in14, i32 addrspace(7)* noalias %in15, i32 addrspace(7)* noalias %in16, i32 addrspace(7)* noalias %in17, i32 addrspace(7)* noalias %in18, i32 addrspace(7)* noalias %in19, i32 addrspace(7)* noalias %in20, i32 addrspace(7)* noalias %in21, i32 addrspace(7)* noalias %in22, i32 addrspace(7)* noalias %in23, i32 addrspace(7)* noalias %in24, i32 addrspace(7)* noalias %in25, i32 addrspace(7)* noalias %in26, i32 addrspace(7)* noalias %in27, i32 addrspace(7)* noalias %in28, i32 addrspace(7)* noalias %in29) #0 { ret void }
+  define amdgpu_kernel void @single-wave-phase-2b(ptr addrspace(3) noalias %in0, ptr addrspace(3) noalias %in1, ptr addrspace(3) noalias %in2, ptr addrspace(3) noalias %in3, ptr addrspace(3) noalias %in4, ptr addrspace(3) noalias %in5, ptr addrspace(3) noalias %in6, ptr addrspace(3) noalias %in7, ptr addrspace(3) noalias %in8, ptr addrspace(3) noalias %in9, ptr addrspace(3) noalias %in10, ptr addrspace(3) noalias %in11, ptr addrspace(7) noalias %in12, ptr addrspace(7) noalias %in13, ptr addrspace(7) noalias %in14, ptr addrspace(7) noalias %in15, ptr addrspace(7) noalias %in16, ptr addrspace(7) noalias %in17, ptr addrspace(7) noalias %in18, ptr addrspace(7) noalias %in19, ptr addrspace(7) noalias %in20, ptr addrspace(7) noalias %in21, ptr addrspace(7) noalias %in22, ptr addrspace(7) noalias %in23, ptr addrspace(7) noalias %in24, ptr addrspace(7) noalias %in25, ptr addrspace(7) noalias %in26, ptr addrspace(7) noalias %in27, ptr addrspace(7) noalias %in28, ptr addrspace(7) noalias %in29) #0 { ret void }

  !0 = distinct !{!0}
  !1 = !{!1, !0}
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2c.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2c.mir
@ -2,7 +2,7 @@
 # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -run-pass=machine-scheduler -verify-misched -o - %s | FileCheck -check-prefix=GCN %s

 --- |
-  define amdgpu_kernel void @single-wave-phase-2c(i32 addrspace(3)* noalias %in0, i32 addrspace(3)* noalias %in1, i32 addrspace(3)* noalias %in2, i32 addrspace(3)* noalias %in3, i32 addrspace(3)* noalias %in4, i32 addrspace(3)* noalias %in5, i32 addrspace(3)* noalias %in6, i32 addrspace(3)* noalias %in7, i32 addrspace(3)* noalias %in8, i32 addrspace(3)* noalias %in9, i32 addrspace(3)* noalias %in10, i32 addrspace(3)* noalias %in11, i32 addrspace(7)* noalias %in12, i32 addrspace(7)* noalias %in13, i32 addrspace(7)* noalias %in14, i32 addrspace(7)* noalias %in15, i32 addrspace(7)* noalias %in16, i32 addrspace(7)* noalias %in17) #0 { ret void }
+  define amdgpu_kernel void @single-wave-phase-2c(ptr addrspace(3) noalias %in0, ptr addrspace(3) noalias %in1, ptr addrspace(3) noalias %in2, ptr addrspace(3) noalias %in3, ptr addrspace(3) noalias %in4, ptr addrspace(3) noalias %in5, ptr addrspace(3) noalias %in6, ptr addrspace(3) noalias %in7, ptr addrspace(3) noalias %in8, ptr addrspace(3) noalias %in9, ptr addrspace(3) noalias %in10, ptr addrspace(3) noalias %in11, ptr addrspace(7) noalias %in12, ptr addrspace(7) noalias %in13, ptr addrspace(7) noalias %in14, ptr addrspace(7) noalias %in15, ptr addrspace(7) noalias %in16, ptr addrspace(7) noalias %in17) #0 { ret void }


  !0 = distinct !{!0}
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll
@ -176,10 +176,10 @@ define amdgpu_kernel void @safe_rsq_rcp_pat_f64(ptr addrspace(1) %out, double %s
 ; SI-NOT: v_rsq_f64_e32
 ; SI: v_sqrt_f64
 ; SI: v_rcp_f64
-define amdgpu_kernel void @safe_amdgcn_sqrt_rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) #1 {
+define amdgpu_kernel void @safe_amdgcn_sqrt_rsq_rcp_pat_f64(ptr addrspace(1) %out, double %src) #1 {
  %sqrt = call double @llvm.amdgcn.sqrt.f64(double %src)
  %rcp = call double @llvm.amdgcn.rcp.f64(double %sqrt)
-  store double %rcp, double addrspace(1)* %out, align 8
+  store double %rcp, ptr addrspace(1) %out, align 8
  ret void
 }

@ -195,10 +195,10 @@ define amdgpu_kernel void @safe_amdgcn_sqrt_rsq_rcp_pat_f64(double addrspace(1)*
 ; SI: v_fma_f64
 ; SI: v_rcp_f64
 ; SI: buffer_store_dwordx2
-define amdgpu_kernel void @unsafe_rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) #2 {
+define amdgpu_kernel void @unsafe_rsq_rcp_pat_f64(ptr addrspace(1) %out, double %src) #2 {
  %sqrt = call double @llvm.sqrt.f64(double %src)
  %rcp = call double @llvm.amdgcn.rcp.f64(double %sqrt)
-  store double %rcp, double addrspace(1)* %out, align 8
+  store double %rcp, ptr addrspace(1) %out, align 8
  ret void
 }

--- a/llvm/test/CodeGen/AMDGPU/load-global-f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-f32.ll
@ -82,8 +82,8 @@ entry:
 ; R600: VTX_READ_128
 define amdgpu_kernel void @global_load_v9f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 entry:
-  %tmp0 = load <9 x float>, <9 x float> addrspace(1)* %in
-  store <9 x float> %tmp0, <9 x float> addrspace(1)* %out
+  %tmp0 = load <9 x float>, ptr addrspace(1) %in
+  store <9 x float> %tmp0, ptr addrspace(1) %out
  ret void
 }

@ -101,8 +101,8 @@ entry:
 ; R600: VTX_READ_128
 define amdgpu_kernel void @global_load_v10f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 entry:
-  %tmp0 = load <10 x float>, <10 x float> addrspace(1)* %in
-  store <10 x float> %tmp0, <10 x float> addrspace(1)* %out
+  %tmp0 = load <10 x float>, ptr addrspace(1) %in
+  store <10 x float> %tmp0, ptr addrspace(1) %out
  ret void
 }

@ -122,8 +122,8 @@ entry:
 ; R600: VTX_READ_128
 define amdgpu_kernel void @global_load_v11f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 entry:
-  %tmp0 = load <11 x float>, <11 x float> addrspace(1)* %in
-  store <11 x float> %tmp0, <11 x float> addrspace(1)* %out
+  %tmp0 = load <11 x float>, ptr addrspace(1) %in
+  store <11 x float> %tmp0, ptr addrspace(1) %out
  ret void
 }

@ -140,8 +140,8 @@ entry:
 ; R600: VTX_READ_128
 define amdgpu_kernel void @global_load_v12f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 entry:
-  %tmp0 = load <12 x float>, <12 x float> addrspace(1)* %in
-  store <12 x float> %tmp0, <12 x float> addrspace(1)* %out
+  %tmp0 = load <12 x float>, ptr addrspace(1) %in
+  store <12 x float> %tmp0, ptr addrspace(1) %out
  ret void
 }

--- a/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor-constexpr-alias.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor-constexpr-alias.ll
@ -6,13 +6,13 @@

 ; Check a constructor that's an alias, and an integer literal.
@llvm.global_ctors = appending addrspace(1) global [2 x { i32, ptr, ptr }] [
-  { i32, ptr, ptr } { i32 1, ptr @foo.alias, i8* null },
-  { i32, ptr, ptr } { i32 1, ptr inttoptr (i64 4096 to ptr), i8* null }
+  { i32, ptr, ptr } { i32 1, ptr @foo.alias, ptr null },
+  { i32, ptr, ptr } { i32 1, ptr inttoptr (i64 4096 to ptr), ptr null }
 ]

 ; Check a constantexpr addrspacecast
@llvm.global_dtors = appending addrspace(1) global [1 x { i32, ptr, ptr }] [
-  { i32, ptr, ptr } { i32 1, ptr addrspacecast (ptr addrspace(1) @bar to ptr), i8* null }
+  { i32, ptr, ptr } { i32 1, ptr addrspacecast (ptr addrspace(1) @bar to ptr), ptr null }
 ]

@foo.alias = hidden alias void (), ptr @foo
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-1.mir
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-1.mir
@ -1,30 +1,30 @@
 # RUN: llc -mtriple=amdgcn -mcpu=gfx803 -run-pass si-memory-legalizer  %s -o - | FileCheck %s

 --- |
-  define amdgpu_kernel void @multiple_mem_operands(i32 addrspace(1)* %out, i32 %cond, i32 %if_offset, i32 %else_offset) #0 {
+  define amdgpu_kernel void @multiple_mem_operands(ptr addrspace(1) %out, i32 %cond, i32 %if_offset, i32 %else_offset) #0 {
  entry:
    %scratch0 = alloca [8192 x i32], addrspace(5)
    %scratch1 = alloca [8192 x i32], addrspace(5)
-    %scratchptr01 = bitcast [8192 x i32] addrspace(5)* %scratch0 to i32 addrspace(5)*
-    store i32 1, i32 addrspace(5)* %scratchptr01
-    %scratchptr12 = bitcast [8192 x i32] addrspace(5)* %scratch1 to i32 addrspace(5)*
-    store i32 2, i32 addrspace(5)* %scratchptr12
+    %scratchptr01 = bitcast ptr addrspace(5) %scratch0 to ptr addrspace(5)
+    store i32 1, ptr addrspace(5) %scratchptr01
+    %scratchptr12 = bitcast ptr addrspace(5) %scratch1 to ptr addrspace(5)
+    store i32 2, ptr addrspace(5) %scratchptr12
    %cmp = icmp eq i32 %cond, 0
    br i1 %cmp, label %if, label %else, !structurizecfg.uniform !0, !amdgpu.uniform !0

  if:                                               ; preds = %entry
-    %if_ptr = getelementptr [8192 x i32], [8192 x i32] addrspace(5)* %scratch0, i32 0, i32 %if_offset, !amdgpu.uniform !0
-    %if_value = load i32, i32 addrspace(5)* %if_ptr, align 4, !nontemporal !1
+    %if_ptr = getelementptr [8192 x i32], ptr addrspace(5) %scratch0, i32 0, i32 %if_offset, !amdgpu.uniform !0
+    %if_value = load i32, ptr addrspace(5) %if_ptr, align 4, !nontemporal !1
    br label %done, !structurizecfg.uniform !0

  else:                                             ; preds = %entry
-    %else_ptr = getelementptr [8192 x i32], [8192 x i32] addrspace(5)* %scratch1, i32 0, i32 %else_offset, !amdgpu.uniform !0
-    %else_value = load i32, i32 addrspace(5)* %else_ptr, align 4, !nontemporal !1
+    %else_ptr = getelementptr [8192 x i32], ptr addrspace(5) %scratch1, i32 0, i32 %else_offset, !amdgpu.uniform !0
+    %else_value = load i32, ptr addrspace(5) %else_ptr, align 4, !nontemporal !1
    br label %done, !structurizecfg.uniform !0

  done:                                             ; preds = %else, %if
    %value = phi i32 [ %if_value, %if ], [ %else_value, %else ]
-    store i32 %value, i32 addrspace(1)* %out
+    store i32 %value, ptr addrspace(1) %out
    ret void
  }

@ -110,9 +110,9 @@ body:             |
    successors: %bb.1.if(0x30000000), %bb.2.else(0x50000000)
    liveins: $sgpr0_sgpr1, $sgpr3

-    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 44, 0 :: (non-temporal dereferenceable invariant load (s32) from `i32 addrspace(4)* undef`)
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 44, 0 :: (non-temporal dereferenceable invariant load (s32) from `ptr addrspace(4) undef`)
    $sgpr8 = S_MOV_B32 &SCRATCH_RSRC_DWORD0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
-    $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 36, 0 :: (non-temporal dereferenceable invariant load (s64) from `i64 addrspace(4)* undef`)
+    $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 36, 0 :: (non-temporal dereferenceable invariant load (s64) from `ptr addrspace(4) undef`)
    $sgpr9 = S_MOV_B32 &SCRATCH_RSRC_DWORD1, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
    $sgpr10 = S_MOV_B32 4294967295, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
    $sgpr11 = S_MOV_B32 15204352, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
@ -130,7 +130,7 @@ body:             |
    successors: %bb.3.done(0x80000000)
    liveins: $sgpr0_sgpr1, $sgpr4_sgpr5, $sgpr3, $sgpr8_sgpr9_sgpr10_sgpr11

-    $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 52, 0 :: (non-temporal dereferenceable invariant load (s32) from `i32 addrspace(4)* undef`)
+    $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 52, 0 :: (non-temporal dereferenceable invariant load (s32) from `ptr addrspace(4) undef`)
    S_WAITCNT 3855
    $vgpr0 = V_MOV_B32_e32 32772, implicit $exec
    S_BRANCH %bb.3.done
@ -139,7 +139,7 @@ body:             |
    successors: %bb.3.done(0x80000000)
    liveins: $sgpr0_sgpr1, $sgpr4_sgpr5, $sgpr3, $sgpr8_sgpr9_sgpr10_sgpr11

-    $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 48, 0 :: (non-temporal dereferenceable invariant load (s32) from `i32 addrspace(4)* undef`)
+    $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 48, 0 :: (non-temporal dereferenceable invariant load (s32) from `ptr addrspace(4) undef`)
    S_WAITCNT 3855
    $vgpr0 = V_MOV_B32_e32 4, implicit $exec

--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-2.mir
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-2.mir
@ -2,30 +2,30 @@

 --- |

-  define amdgpu_kernel void @multiple_mem_operands(i32 addrspace(1)* %out, i32 %cond, i32 %if_offset, i32 %else_offset) #0 {
+  define amdgpu_kernel void @multiple_mem_operands(ptr addrspace(1) %out, i32 %cond, i32 %if_offset, i32 %else_offset) #0 {
  entry:
    %scratch0 = alloca [8192 x i32], addrspace(5)
    %scratch1 = alloca [8192 x i32], addrspace(5)
-    %scratchptr01 = bitcast [8192 x i32] addrspace(5)* %scratch0 to i32 addrspace(5)*
-    store i32 1, i32 addrspace(5)* %scratchptr01
-    %scratchptr12 = bitcast [8192 x i32] addrspace(5)* %scratch1 to i32 addrspace(5)*
-    store i32 2, i32 addrspace(5)* %scratchptr12
+    %scratchptr01 = bitcast ptr addrspace(5) %scratch0 to ptr addrspace(5)
+    store i32 1, ptr addrspace(5) %scratchptr01
+    %scratchptr12 = bitcast ptr addrspace(5) %scratch1 to ptr addrspace(5)
+    store i32 2, ptr addrspace(5) %scratchptr12
    %cmp = icmp eq i32 %cond, 0
    br i1 %cmp, label %if, label %else, !structurizecfg.uniform !0, !amdgpu.uniform !0

  if:                                               ; preds = %entry
-    %if_ptr = getelementptr [8192 x i32], [8192 x i32] addrspace(5)* %scratch0, i32 0, i32 %if_offset, !amdgpu.uniform !0
-    %if_value = load i32, i32 addrspace(5)* %if_ptr, align 4, !nontemporal !1
+    %if_ptr = getelementptr [8192 x i32], ptr addrspace(5) %scratch0, i32 0, i32 %if_offset, !amdgpu.uniform !0
+    %if_value = load i32, ptr addrspace(5) %if_ptr, align 4, !nontemporal !1
    br label %done, !structurizecfg.uniform !0

  else:                                             ; preds = %entry
-    %else_ptr = getelementptr [8192 x i32], [8192 x i32] addrspace(5)* %scratch1, i32 0, i32 %else_offset, !amdgpu.uniform !0
-    %else_value = load i32, i32 addrspace(5)* %else_ptr, align 4
+    %else_ptr = getelementptr [8192 x i32], ptr addrspace(5) %scratch1, i32 0, i32 %else_offset, !amdgpu.uniform !0
+    %else_value = load i32, ptr addrspace(5) %else_ptr, align 4
    br label %done, !structurizecfg.uniform !0

  done:                                             ; preds = %else, %if
    %value = phi i32 [ %if_value, %if ], [ %else_value, %else ]
-    store i32 %value, i32 addrspace(1)* %out
+    store i32 %value, ptr addrspace(1) %out
    ret void
  }

@ -90,9 +90,9 @@ body:             |
    successors: %bb.1.if(0x30000000), %bb.2.else(0x50000000)
    liveins: $sgpr0_sgpr1, $sgpr3

-    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 44, 0 :: (non-temporal dereferenceable invariant load (s32) from `i32 addrspace(4)* undef`)
+    $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 44, 0 :: (non-temporal dereferenceable invariant load (s32) from `ptr addrspace(4) undef`)
    $sgpr8 = S_MOV_B32 &SCRATCH_RSRC_DWORD0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
-    $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 36, 0 :: (non-temporal dereferenceable invariant load (s64) from `i64 addrspace(4)* undef`)
+    $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM $sgpr0_sgpr1, 36, 0 :: (non-temporal dereferenceable invariant load (s64) from `ptr addrspace(4) undef`)
    $sgpr9 = S_MOV_B32 &SCRATCH_RSRC_DWORD1, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
    $sgpr10 = S_MOV_B32 4294967295, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
    $sgpr11 = S_MOV_B32 15204352, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
@ -110,7 +110,7 @@ body:             |
    successors: %bb.3.done(0x80000000)
    liveins: $sgpr0_sgpr1, $sgpr4_sgpr5, $sgpr3, $sgpr8_sgpr9_sgpr10_sgpr11

-    $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 52, 0 :: (non-temporal dereferenceable invariant load (s32) from `i32 addrspace(4)* undef`)
+    $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 52, 0 :: (non-temporal dereferenceable invariant load (s32) from `ptr addrspace(4) undef`)
    S_WAITCNT 3855
    $vgpr0 = V_MOV_B32_e32 32772, implicit $exec
    S_BRANCH %bb.3.done
@ -119,7 +119,7 @@ body:             |
    successors: %bb.3.done(0x80000000)
    liveins: $sgpr0_sgpr1, $sgpr4_sgpr5, $sgpr3, $sgpr8_sgpr9_sgpr10_sgpr11

-    $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 48, 0 :: (non-temporal dereferenceable invariant load (s32) from `i32 addrspace(4)* undef`)
+    $sgpr0 = S_LOAD_DWORD_IMM killed $sgpr0_sgpr1, 48, 0 :: (non-temporal dereferenceable invariant load (s32) from `ptr addrspace(4) undef`)
    S_WAITCNT 3855
    $vgpr0 = V_MOV_B32_e32 4, implicit $exec

--- a/llvm/test/CodeGen/AMDGPU/merge-flat-load-store.mir
+++ b/llvm/test/CodeGen/AMDGPU/merge-flat-load-store.mir
@ -13,8 +13,8 @@ body:             |
    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[FLAT_LOAD_DWORDX2_]].sub1
    ; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]]
    %0:vreg_64_align2 = IMPLICIT_DEF
-    %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
-    %2:vgpr_32 = FLAT_LOAD_DWORD %0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
+    %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4)
+    %2:vgpr_32 = FLAT_LOAD_DWORD %0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4)
    S_NOP 0, implicit %1, implicit %2
 ...

@ -32,9 +32,9 @@ body:             |
    ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[COPY]].sub1
    ; GCN-NEXT: S_NOP 0, implicit [[COPY2]], implicit [[COPY3]], implicit [[COPY1]]
    %0:vreg_64_align2 = IMPLICIT_DEF
-    %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 1, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
-    %2:vgpr_32 = FLAT_LOAD_DWORD %0, 4, 1, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
-    %3:vgpr_32 = FLAT_LOAD_DWORD %0, 8, 1, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
+    %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 1, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4)
+    %2:vgpr_32 = FLAT_LOAD_DWORD %0, 4, 1, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4)
+    %3:vgpr_32 = FLAT_LOAD_DWORD %0, 8, 1, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4)
    S_NOP 0, implicit %1, implicit %2, implicit %3
 ...

@ -54,10 +54,10 @@ body:             |
    ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[COPY2]].sub1
    ; GCN-NEXT: S_NOP 0, implicit [[COPY4]], implicit [[COPY5]], implicit [[COPY3]], implicit [[COPY1]]
    %0:vreg_64_align2 = IMPLICIT_DEF
-    %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 2, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
-    %2:vgpr_32 = FLAT_LOAD_DWORD %0, 4, 2, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
-    %3:vgpr_32 = FLAT_LOAD_DWORD %0, 8, 2, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
-    %4:vgpr_32 = FLAT_LOAD_DWORD %0, 12, 2, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
+    %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 2, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4)
+    %2:vgpr_32 = FLAT_LOAD_DWORD %0, 4, 2, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4)
+    %3:vgpr_32 = FLAT_LOAD_DWORD %0, 8, 2, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4)
+    %4:vgpr_32 = FLAT_LOAD_DWORD %0, 12, 2, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4)
    S_NOP 0, implicit %1, implicit %2, implicit %3, implicit %4
 ...

@ -78,11 +78,11 @@ body:             |
    ; GCN-NEXT: [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[DEF]], 16, 3, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`)
    ; GCN-NEXT: S_NOP 0, implicit [[COPY4]], implicit [[COPY5]], implicit [[COPY3]], implicit [[COPY1]], implicit [[FLAT_LOAD_DWORD]]
    %0:vreg_64_align2 = IMPLICIT_DEF
-    %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 3, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
-    %2:vgpr_32 = FLAT_LOAD_DWORD %0, 4, 3, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
-    %3:vgpr_32 = FLAT_LOAD_DWORD %0, 8, 3, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
-    %4:vgpr_32 = FLAT_LOAD_DWORD %0, 12, 3, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
-    %5:vgpr_32 = FLAT_LOAD_DWORD %0, 16, 3, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
+    %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 3, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4)
+    %2:vgpr_32 = FLAT_LOAD_DWORD %0, 4, 3, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4)
+    %3:vgpr_32 = FLAT_LOAD_DWORD %0, 8, 3, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4)
+    %4:vgpr_32 = FLAT_LOAD_DWORD %0, 12, 3, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4)
+    %5:vgpr_32 = FLAT_LOAD_DWORD %0, 16, 3, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4)
    S_NOP 0, implicit %1, implicit %2, implicit %3, implicit %4, implicit %5
 ...

@ -105,12 +105,12 @@ body:             |
    ; GCN-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[FLAT_LOAD_DWORDX2_]].sub1
    ; GCN-NEXT: S_NOP 0, implicit [[COPY4]], implicit [[COPY5]], implicit [[COPY3]], implicit [[COPY1]], implicit [[COPY6]], implicit [[COPY7]]
    %0:vreg_64_align2 = IMPLICIT_DEF
-    %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
-    %2:vgpr_32 = FLAT_LOAD_DWORD %0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
-    %3:vgpr_32 = FLAT_LOAD_DWORD %0, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
-    %4:vgpr_32 = FLAT_LOAD_DWORD %0, 12, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
-    %5:vgpr_32 = FLAT_LOAD_DWORD %0, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
-    %6:vgpr_32 = FLAT_LOAD_DWORD %0, 20, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
+    %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4)
+    %2:vgpr_32 = FLAT_LOAD_DWORD %0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4)
+    %3:vgpr_32 = FLAT_LOAD_DWORD %0, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4)
+    %4:vgpr_32 = FLAT_LOAD_DWORD %0, 12, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4)
+    %5:vgpr_32 = FLAT_LOAD_DWORD %0, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4)
+    %6:vgpr_32 = FLAT_LOAD_DWORD %0, 20, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4)
    S_NOP 0, implicit %1, implicit %2, implicit %3, implicit %4, implicit %5, implicit %6
 ...

@ -126,8 +126,8 @@ body:             |
    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY killed [[FLAT_LOAD_DWORDX4_]].sub2_sub3
    ; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]]
    %0:vreg_64_align2 = IMPLICIT_DEF
-    %1:vreg_64_align2 = FLAT_LOAD_DWORDX2 %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s64) from `i64* undef`, align 4)
-    %2:vreg_64_align2 = FLAT_LOAD_DWORDX2 %0, 8, 0, implicit $exec, implicit $flat_scr :: (load (s64) from `i64* undef`, align 4)
+    %1:vreg_64_align2 = FLAT_LOAD_DWORDX2 %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s64) from `ptr undef`, align 4)
+    %2:vreg_64_align2 = FLAT_LOAD_DWORDX2 %0, 8, 0, implicit $exec, implicit $flat_scr :: (load (s64) from `ptr undef`, align 4)
    S_NOP 0, implicit %1, implicit %2
 ...

@ -143,8 +143,8 @@ body:             |
    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[FLAT_LOAD_DWORDX4_]].sub3
    ; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]]
    %0:vreg_64_align2 = IMPLICIT_DEF
-    %1:vreg_96_align2 = FLAT_LOAD_DWORDX3 %0, 12, 0, implicit $exec, implicit $flat_scr :: (load (s96) from `i128* undef`, align 8)
-    %2:vgpr_32 = FLAT_LOAD_DWORD %0, 24, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
+    %1:vreg_96_align2 = FLAT_LOAD_DWORDX3 %0, 12, 0, implicit $exec, implicit $flat_scr :: (load (s96) from `ptr undef`, align 8)
+    %2:vgpr_32 = FLAT_LOAD_DWORD %0, 24, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4)
    S_NOP 0, implicit %1, implicit %2
 ...

@ -160,8 +160,8 @@ body:             |
    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY killed [[FLAT_LOAD_DWORDX3_]].sub1_sub2
    ; GCN-NEXT: S_NOP 0, implicit [[COPY1]], implicit [[COPY]]
    %0:vreg_64_align2 = IMPLICIT_DEF
-    %2:vgpr_32 = FLAT_LOAD_DWORD %0, 12, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
-    %1:vreg_64_align2 = FLAT_LOAD_DWORDX2 %0, 16, 0, implicit $exec, implicit $flat_scr :: (load (s64) from `i64* undef`, align 8)
+    %2:vgpr_32 = FLAT_LOAD_DWORD %0, 12, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4)
+    %1:vreg_64_align2 = FLAT_LOAD_DWORDX2 %0, 16, 0, implicit $exec, implicit $flat_scr :: (load (s64) from `ptr undef`, align 8)
    S_NOP 0, implicit %1, implicit %2
 ...

@ -176,8 +176,8 @@ body:             |
    ; GCN-NEXT: [[FLAT_LOAD_DWORD1:%[0-9]+]]:agpr_32 = FLAT_LOAD_DWORD [[DEF]], 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`)
    ; GCN-NEXT: S_NOP 0, implicit [[FLAT_LOAD_DWORD]], implicit [[FLAT_LOAD_DWORD1]]
    %0:vreg_64_align2 = IMPLICIT_DEF
-    %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
-    %2:agpr_32 = FLAT_LOAD_DWORD %0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
+    %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4)
+    %2:agpr_32 = FLAT_LOAD_DWORD %0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4)
    S_NOP 0, implicit %1, implicit %2
 ...

@ -192,8 +192,8 @@ body:             |
    ; GCN-NEXT: [[FLAT_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[DEF]], 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`)
    ; GCN-NEXT: S_NOP 0, implicit [[FLAT_LOAD_DWORD]], implicit [[FLAT_LOAD_DWORD1]]
    %0:vreg_64_align2 = IMPLICIT_DEF
-    %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
-    %2:vgpr_32 = FLAT_LOAD_DWORD %0, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
+    %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4)
+    %2:vgpr_32 = FLAT_LOAD_DWORD %0, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4)
    S_NOP 0, implicit %1, implicit %2
 ...

@ -208,8 +208,8 @@ body:             |
    ; GCN-NEXT: [[FLAT_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[DEF]], 3, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`)
    ; GCN-NEXT: S_NOP 0, implicit [[FLAT_LOAD_DWORD]], implicit [[FLAT_LOAD_DWORD1]]
    %0:vreg_64_align2 = IMPLICIT_DEF
-    %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
-    %2:vgpr_32 = FLAT_LOAD_DWORD %0, 3, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`)
+    %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4)
+    %2:vgpr_32 = FLAT_LOAD_DWORD %0, 3, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`)
    S_NOP 0, implicit %1, implicit %2
 ...

@ -224,8 +224,8 @@ body:             |
    ; GCN-NEXT: [[FLAT_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[DEF]], 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`)
    ; GCN-NEXT: S_NOP 0, implicit [[FLAT_LOAD_DWORD]], implicit [[FLAT_LOAD_DWORD1]]
    %0:vreg_64_align2 = IMPLICIT_DEF
-    %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 1, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
-    %2:vgpr_32 = FLAT_LOAD_DWORD %0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef`, align 4)
+    %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 1, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4)
+    %2:vgpr_32 = FLAT_LOAD_DWORD %0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, align 4)
    S_NOP 0, implicit %1, implicit %2
 ...

@ -243,8 +243,8 @@ body:             |
    %0:vreg_64_align2 = IMPLICIT_DEF
    %1:vgpr_32 = IMPLICIT_DEF
    %2:vgpr_32 = IMPLICIT_DEF
-    FLAT_STORE_DWORD %0, killed %1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
-    FLAT_STORE_DWORD killed %0, killed %2, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
+    FLAT_STORE_DWORD %0, killed %1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4)
+    FLAT_STORE_DWORD killed %0, killed %2, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4)
 ...

 ---
@ -264,9 +264,9 @@ body:             |
    %1:vgpr_32 = IMPLICIT_DEF
    %2:vgpr_32 = IMPLICIT_DEF
    %3:vgpr_32 = IMPLICIT_DEF
-    FLAT_STORE_DWORD %0, killed %1, 4, 1, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
-    FLAT_STORE_DWORD %0, killed %2, 8, 1, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
-    FLAT_STORE_DWORD killed %0, killed %3, 12, 1, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
+    FLAT_STORE_DWORD %0, killed %1, 4, 1, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4)
+    FLAT_STORE_DWORD %0, killed %2, 8, 1, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4)
+    FLAT_STORE_DWORD killed %0, killed %3, 12, 1, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4)
 ...

 ---
@ -283,10 +283,10 @@ body:             |
    ; GCN-NEXT: FLAT_STORE_DWORDX4 [[DEF]], killed [[REG_SEQUENCE2]], 4, 2, implicit $exec, implicit $flat_scr :: (store (s128) into `ptr undef`, align 4)
    %0:vreg_64_align2 = IMPLICIT_DEF
    %1:vreg_128 = IMPLICIT_DEF
-    FLAT_STORE_DWORD %0, %1.sub1, 8, 2, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
-    FLAT_STORE_DWORD %0, %1.sub2, 12, 2, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
-    FLAT_STORE_DWORD %0, %1.sub3, 16, 2, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
-    FLAT_STORE_DWORD killed %0, %1.sub0, 4, 2, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
+    FLAT_STORE_DWORD %0, %1.sub1, 8, 2, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4)
+    FLAT_STORE_DWORD %0, %1.sub2, 12, 2, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4)
+    FLAT_STORE_DWORD %0, %1.sub3, 16, 2, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4)
+    FLAT_STORE_DWORD killed %0, %1.sub0, 4, 2, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4)
 ...

 ---
@ -312,11 +312,11 @@ body:             |
    %3:agpr_32 = IMPLICIT_DEF
    %4:agpr_32 = IMPLICIT_DEF
    %5:agpr_32 = IMPLICIT_DEF
-    FLAT_STORE_DWORD %0, %1, 4, 3, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
-    FLAT_STORE_DWORD %0, %2, 8, 3, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 8)
-    FLAT_STORE_DWORD %0, %3, 12, 3, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
-    FLAT_STORE_DWORD %0, %4, 16, 3, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
-    FLAT_STORE_DWORD %0, %5, 20, 3, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
+    FLAT_STORE_DWORD %0, %1, 4, 3, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4)
+    FLAT_STORE_DWORD %0, %2, 8, 3, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 8)
+    FLAT_STORE_DWORD %0, %3, 12, 3, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4)
+    FLAT_STORE_DWORD %0, %4, 16, 3, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4)
+    FLAT_STORE_DWORD %0, %5, 20, 3, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4)
 ...

 ---
@ -345,12 +345,12 @@ body:             |
    %4:vgpr_32 = IMPLICIT_DEF
    %5:vgpr_32 = IMPLICIT_DEF
    %6:vgpr_32 = IMPLICIT_DEF
-    FLAT_STORE_DWORD %0, %1, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 8)
-    FLAT_STORE_DWORD %0, %2, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
-    FLAT_STORE_DWORD %0, %3, 12, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
-    FLAT_STORE_DWORD %0, %4, 16, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
-    FLAT_STORE_DWORD %0, %5, 20, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
-    FLAT_STORE_DWORD %0, %6, 24, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
+    FLAT_STORE_DWORD %0, %1, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 8)
+    FLAT_STORE_DWORD %0, %2, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4)
+    FLAT_STORE_DWORD %0, %3, 12, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4)
+    FLAT_STORE_DWORD %0, %4, 16, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4)
+    FLAT_STORE_DWORD %0, %5, 20, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4)
+    FLAT_STORE_DWORD %0, %6, 24, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4)
 ...

 ---
@ -367,8 +367,8 @@ body:             |
    %0:vreg_64_align2 = IMPLICIT_DEF
    %1:vreg_64_align2 = IMPLICIT_DEF
    %2:vreg_64_align2 = IMPLICIT_DEF
-    FLAT_STORE_DWORDX2 %0, killed %1, 4, 0, implicit $exec, implicit $flat_scr :: (store (s64) into `i64* undef`, align 4)
-    FLAT_STORE_DWORDX2 %0, killed %2, 12, 0, implicit $exec, implicit $flat_scr :: (store (s64) into `i64* undef`, align 4)
+    FLAT_STORE_DWORDX2 %0, killed %1, 4, 0, implicit $exec, implicit $flat_scr :: (store (s64) into `ptr undef`, align 4)
+    FLAT_STORE_DWORDX2 %0, killed %2, 12, 0, implicit $exec, implicit $flat_scr :: (store (s64) into `ptr undef`, align 4)
 ...

 ---
@ -385,8 +385,8 @@ body:             |
    %0:vreg_64_align2 = IMPLICIT_DEF
    %1:vreg_96_align2 = IMPLICIT_DEF
    %2:vgpr_32 = IMPLICIT_DEF
-    FLAT_STORE_DWORDX3 %0, killed %1, 4, 0, implicit $exec, implicit $flat_scr :: (store (s96) into `i64* undef`, align 16)
-    FLAT_STORE_DWORD %0, killed %2, 16, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
+    FLAT_STORE_DWORDX3 %0, killed %1, 4, 0, implicit $exec, implicit $flat_scr :: (store (s96) into `ptr undef`, align 16)
+    FLAT_STORE_DWORD %0, killed %2, 16, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4)
 ...

 ---
@ -403,8 +403,8 @@ body:             |
    %0:vreg_64_align2 = IMPLICIT_DEF
    %1:agpr_32 = IMPLICIT_DEF
    %2:vgpr_32 = IMPLICIT_DEF
-    FLAT_STORE_DWORD %0, killed %1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
-    FLAT_STORE_DWORD killed %0, killed %2, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
+    FLAT_STORE_DWORD %0, killed %1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4)
+    FLAT_STORE_DWORD killed %0, killed %2, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4)
 ...

 ---
@ -421,8 +421,8 @@ body:             |
    %0:vreg_64_align2 = IMPLICIT_DEF
    %1:vgpr_32 = IMPLICIT_DEF
    %2:vgpr_32 = IMPLICIT_DEF
-    FLAT_STORE_DWORD %0, killed %1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
-    FLAT_STORE_DWORD killed %0, killed %2, 6, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
+    FLAT_STORE_DWORD %0, killed %1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4)
+    FLAT_STORE_DWORD killed %0, killed %2, 6, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4)
 ...

 ---
@ -439,8 +439,8 @@ body:             |
    %0:vreg_64_align2 = IMPLICIT_DEF
    %1:vgpr_32 = IMPLICIT_DEF
    %2:vgpr_32 = IMPLICIT_DEF
-    FLAT_STORE_DWORD %0, killed %1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
-    FLAT_STORE_DWORD killed %0, killed %2, 2, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 2)
+    FLAT_STORE_DWORD %0, killed %1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4)
+    FLAT_STORE_DWORD killed %0, killed %2, 2, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 2)
 ...

 ---
@ -457,8 +457,8 @@ body:             |
    %0:vreg_64_align2 = IMPLICIT_DEF
    %1:vgpr_32 = IMPLICIT_DEF
    %2:vgpr_32 = IMPLICIT_DEF
-    FLAT_STORE_DWORD %0, killed %1, 0, 1, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
-    FLAT_STORE_DWORD killed %0, killed %2, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
+    FLAT_STORE_DWORD %0, killed %1, 0, 1, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4)
+    FLAT_STORE_DWORD killed %0, killed %2, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4)
 ...

 ---
@ -475,6 +475,6 @@ body:             |
    %0:vreg_128_align2 = IMPLICIT_DEF
    %1:vgpr_32 = IMPLICIT_DEF
    %2:vgpr_32 = IMPLICIT_DEF
-    FLAT_STORE_DWORD %0.sub0_sub1, killed %1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
-    FLAT_STORE_DWORD %0.sub2_sub3, killed %2, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`, align 4)
+    FLAT_STORE_DWORD %0.sub0_sub1, killed %1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4)
+    FLAT_STORE_DWORD %0.sub2_sub3, killed %2, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`, align 4)
 ...
--- a/llvm/test/CodeGen/AMDGPU/merge-flat-with-global-load-store.mir
+++ b/llvm/test/CodeGen/AMDGPU/merge-flat-with-global-load-store.mir
@ -13,8 +13,8 @@ body:             |
    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[FLAT_LOAD_DWORDX2_]].sub1
    ; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]]
    %0:vreg_64_align2 = IMPLICIT_DEF
-    %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `float* undef` + 4, basealign 4)
-    %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 8, basealign 4, addrspace 1)
+    %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef` + 4, basealign 4)
+    %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef` + 8, basealign 4, addrspace 1)
    S_NOP 0, implicit %1, implicit %2
 ...

@ -30,8 +30,8 @@ body:             |
    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[FLAT_LOAD_DWORDX2_]].sub1
    ; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]]
    %0:vreg_64_align2 = IMPLICIT_DEF
-    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `float addrspace(1)* undef`, basealign 8, addrspace 1)
-    %2:vgpr_32 = FLAT_LOAD_DWORD %0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef` + 4, basealign 8)
+    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, basealign 8, addrspace 1)
+    %2:vgpr_32 = FLAT_LOAD_DWORD %0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef` + 4, basealign 8)
    S_NOP 0, implicit %1, implicit %2
 ...

@ -49,9 +49,9 @@ body:             |
    ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[COPY]].sub0
    ; GCN-NEXT: S_NOP 0, implicit [[COPY2]], implicit [[COPY3]]
    %0:vreg_64_align2 = IMPLICIT_DEF
-    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 4, basealign 8, addrspace 1)
-    %2:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `float* undef`, basealign 16)
-    %3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 8, basealign 8, addrspace 1)
+    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef` + 4, basealign 8, addrspace 1)
+    %2:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, basealign 16)
+    %3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef` + 8, basealign 8, addrspace 1)
    S_NOP 0, implicit %1, implicit %2
 ...

@ -71,10 +71,10 @@ body:             |
    ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[COPY2]].sub1
    ; GCN-NEXT: S_NOP 0, implicit [[COPY4]], implicit [[COPY5]], implicit [[COPY3]], implicit [[COPY1]]
    %0:vreg_64_align2 = IMPLICIT_DEF
-    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 4, basealign 8, addrspace 1)
-    %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 8, basealign 8, addrspace 1)
-    %3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 12, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 12, basealign 8, addrspace 1)
-    %4:vgpr_32 = FLAT_LOAD_DWORD %0, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `float* undef` + 16)
+    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef` + 4, basealign 8, addrspace 1)
+    %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef` + 8, basealign 8, addrspace 1)
+    %3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 12, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef` + 12, basealign 8, addrspace 1)
+    %4:vgpr_32 = FLAT_LOAD_DWORD %0, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef` + 16)
    S_NOP 0, implicit %1, implicit %2, implicit %3, implicit %4
 ...

@ -90,8 +90,8 @@ body:             |
    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY killed [[FLAT_LOAD_DWORDX4_]].sub2_sub3
    ; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]]
    %0:vreg_64_align2 = IMPLICIT_DEF
-    %1:vreg_64_align2 = FLAT_LOAD_DWORDX2 %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s64) from `double* undef`)
-    %2:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 %0, 8, 0, implicit $exec :: (load (s64) from `i64 addrspace(1)* undef`, addrspace 1)
+    %1:vreg_64_align2 = FLAT_LOAD_DWORDX2 %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s64) from `ptr undef`)
+    %2:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 %0, 8, 0, implicit $exec :: (load (s64) from `ptr addrspace(1) undef`, addrspace 1)
    S_NOP 0, implicit %1, implicit %2
 ...

@ -107,8 +107,8 @@ body:             |
    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_96_align2 = COPY killed [[FLAT_LOAD_DWORDX4_]].sub1_sub2_sub3
    ; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]]
    %0:vreg_64_align2 = IMPLICIT_DEF
-    %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `float* undef`)
-    %2:vreg_96_align2 = GLOBAL_LOAD_DWORDX3 %0, 4, 0, implicit $exec :: (load (s96) from `<3 x i32> addrspace(1)* undef`, addrspace 1)
+    %1:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`)
+    %2:vreg_96_align2 = GLOBAL_LOAD_DWORDX3 %0, 4, 0, implicit $exec :: (load (s96) from `ptr addrspace(1) undef`, addrspace 1)
    S_NOP 0, implicit %1, implicit %2
 ...

@ -124,8 +124,8 @@ body:             |
    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_96_align2 = COPY killed [[FLAT_LOAD_DWORDX4_]].sub1_sub2_sub3
    ; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]]
    %0:vreg_64_align2 = IMPLICIT_DEF
-    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, addrspace 1)
-    %2:vreg_96_align2 = FLAT_LOAD_DWORDX3 %0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s96) from `<3 x i32>* undef`)
+    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, addrspace 1)
+    %2:vreg_96_align2 = FLAT_LOAD_DWORDX3 %0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s96) from `ptr undef`)
    S_NOP 0, implicit %1, implicit %2
 ...

@ -144,9 +144,9 @@ body:             |
    ; GCN-NEXT: S_NOP 0, implicit [[FLAT_LOAD_DWORD]], implicit [[COPY]], implicit [[COPY1]]
    %0:vreg_64_align2 = IMPLICIT_DEF
    %1:sreg_64_xexec = IMPLICIT_DEF
-    %2:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `float* undef`, basealign 4)
-    %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %1, %0.sub0, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 4, basealign 4, addrspace 1)
-    %4:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %1, %0.sub0, 8, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 8, basealign 4, addrspace 1)
+    %2:vgpr_32 = FLAT_LOAD_DWORD %0, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef`, basealign 4)
+    %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %1, %0.sub0, 4, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef` + 4, basealign 4, addrspace 1)
+    %4:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %1, %0.sub0, 8, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef` + 8, basealign 4, addrspace 1)
    S_NOP 0, implicit %2, implicit %3, implicit %4
 ...

@ -165,9 +165,9 @@ body:             |
    ; GCN-NEXT: S_NOP 0, implicit [[GLOBAL_LOAD_DWORD_SADDR]], implicit [[COPY]], implicit [[COPY1]]
    %0:vreg_64_align2 = IMPLICIT_DEF
    %1:sreg_64_xexec = IMPLICIT_DEF
-    %2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %1, %0.sub0, 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, addrspace 1)
-    %3:vgpr_32 = FLAT_LOAD_DWORD %0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef` + 4)
-    %4:vgpr_32 = FLAT_LOAD_DWORD %0, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `i32* undef` + 8)
+    %2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %1, %0.sub0, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, addrspace 1)
+    %3:vgpr_32 = FLAT_LOAD_DWORD %0, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef` + 4)
+    %4:vgpr_32 = FLAT_LOAD_DWORD %0, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from `ptr undef` + 8)
    S_NOP 0, implicit %2, implicit %3, implicit %4
 ...

@ -184,8 +184,8 @@ body:             |
    %0:vreg_64_align2 = IMPLICIT_DEF
    %1:vgpr_32 = IMPLICIT_DEF
    %2:vgpr_32 = IMPLICIT_DEF
-    FLAT_STORE_DWORD %0, killed %1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`)
-    GLOBAL_STORE_DWORD killed %0, killed %2, 4, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1)
+    FLAT_STORE_DWORD %0, killed %1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`)
+    GLOBAL_STORE_DWORD killed %0, killed %2, 4, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
 ...

 ---
@ -201,8 +201,8 @@ body:             |
    %0:vreg_64_align2 = IMPLICIT_DEF
    %1:vgpr_32 = IMPLICIT_DEF
    %2:vgpr_32 = IMPLICIT_DEF
-    GLOBAL_STORE_DWORD %0, killed %1, 0, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1)
-    FLAT_STORE_DWORD %0, killed %2, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`)
+    GLOBAL_STORE_DWORD %0, killed %1, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
+    FLAT_STORE_DWORD %0, killed %2, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`)
 ...

 ---
@ -218,8 +218,8 @@ body:             |
    %0:vreg_64_align2 = IMPLICIT_DEF
    %1:vgpr_32 = IMPLICIT_DEF
    %2:vreg_64_align2 = IMPLICIT_DEF
-    FLAT_STORE_DWORD %0, %1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`)
-    GLOBAL_STORE_DWORDX2 %0, %2, 4, 0, implicit $exec :: (store (s64) into `i64 addrspace(1)* undef`, addrspace 1)
+    FLAT_STORE_DWORD %0, %1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`)
+    GLOBAL_STORE_DWORDX2 %0, %2, 4, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) undef`, addrspace 1)
 ...

 ---
@ -235,8 +235,8 @@ body:             |
    %0:vreg_64_align2 = IMPLICIT_DEF
    %1:vgpr_32 = IMPLICIT_DEF
    %2:vreg_96_align2 = IMPLICIT_DEF
-    FLAT_STORE_DWORD %0, %1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`)
-    GLOBAL_STORE_DWORDX3 %0, %2, 4, 0, implicit $exec :: (store (s96) into `<3 x i32> addrspace(1)* undef`, addrspace 1)
+    FLAT_STORE_DWORD %0, %1, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`)
+    GLOBAL_STORE_DWORDX3 %0, %2, 4, 0, implicit $exec :: (store (s96) into `ptr addrspace(1) undef`, addrspace 1)
 ...

 ---
@ -252,8 +252,8 @@ body:             |
    %0:vreg_64_align2 = IMPLICIT_DEF
    %1:vgpr_32 = IMPLICIT_DEF
    %2:vreg_64_align2 = IMPLICIT_DEF
-    GLOBAL_STORE_DWORD %0, %1, 8, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1)
-    FLAT_STORE_DWORDX2 %0, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into `i64* undef`)
+    GLOBAL_STORE_DWORD %0, %1, 8, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
+    FLAT_STORE_DWORDX2 %0, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s64) into `ptr undef`)
 ...

 ---
@ -269,8 +269,8 @@ body:             |
    %0:vreg_64_align2 = IMPLICIT_DEF
    %1:vgpr_32 = IMPLICIT_DEF
    %2:vreg_96_align2 = IMPLICIT_DEF
-    GLOBAL_STORE_DWORD %0, %1, 12, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1)
-    FLAT_STORE_DWORDX3 %0, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s96) into `<3 x i32>* undef`)
+    GLOBAL_STORE_DWORD %0, %1, 12, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
+    FLAT_STORE_DWORDX3 %0, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s96) into `ptr undef`)
 ...

 ---
@ -288,8 +288,8 @@ body:             |
    %1:sreg_64_xexec = IMPLICIT_DEF
    %2:vgpr_32 = IMPLICIT_DEF
    %3:vgpr_32 = IMPLICIT_DEF
-    FLAT_STORE_DWORD %0, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`)
-    GLOBAL_STORE_DWORD_SADDR %0.sub0, %3, %1, 4, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1)
+    FLAT_STORE_DWORD %0, %2, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`)
+    GLOBAL_STORE_DWORD_SADDR %0.sub0, %3, %1, 4, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
 ...

 ---
@ -307,6 +307,6 @@ body:             |
    %1:sreg_64_xexec = IMPLICIT_DEF
    %2:vgpr_32 = IMPLICIT_DEF
    %3:vgpr_32 = IMPLICIT_DEF
-    GLOBAL_STORE_DWORD_SADDR %0.sub0, %2, %1, 0, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, addrspace 1)
-    FLAT_STORE_DWORD %0, %3, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `i32* undef`)
+    GLOBAL_STORE_DWORD_SADDR %0.sub0, %2, %1, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, addrspace 1)
+    FLAT_STORE_DWORD %0, %3, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into `ptr undef`)
 ...
--- a/llvm/test/CodeGen/AMDGPU/merge-global-load-store.mir
+++ b/llvm/test/CodeGen/AMDGPU/merge-global-load-store.mir
@ -13,8 +13,8 @@ body:             |
    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[GLOBAL_LOAD_DWORDX2_]].sub1
    ; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]]
    %0:vreg_64_align2 = IMPLICIT_DEF
-    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `float addrspace(1)* undef` + 4, basealign 4, addrspace 1)
-    %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 8, basealign 4, addrspace 1)
+    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef` + 4, basealign 4, addrspace 1)
+    %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef` + 8, basealign 4, addrspace 1)
    S_NOP 0, implicit %1, implicit %2
 ...

@ -32,9 +32,9 @@ body:             |
    ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[COPY]].sub1
    ; GCN-NEXT: S_NOP 0, implicit [[COPY2]], implicit [[COPY3]], implicit [[COPY1]]
    %0:vreg_64_align2 = IMPLICIT_DEF
-    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 1, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 1, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    %3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 1, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 1, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+    %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 1, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+    %3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 1, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
    S_NOP 0, implicit %1, implicit %2, implicit %3
 ...

@ -54,10 +54,10 @@ body:             |
    ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[COPY2]].sub1
    ; GCN-NEXT: S_NOP 0, implicit [[COPY4]], implicit [[COPY5]], implicit [[COPY3]], implicit [[COPY1]]
    %0:vreg_64_align2 = IMPLICIT_DEF
-    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 2, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 2, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    %3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 2, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    %4:vgpr_32 = GLOBAL_LOAD_DWORD %0, 12, 2, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 2, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+    %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 2, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+    %3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 2, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+    %4:vgpr_32 = GLOBAL_LOAD_DWORD %0, 12, 2, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
    S_NOP 0, implicit %1, implicit %2, implicit %3, implicit %4
 ...

@ -78,11 +78,11 @@ body:             |
    ; GCN-NEXT: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 16, 3, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, addrspace 1)
    ; GCN-NEXT: S_NOP 0, implicit [[COPY4]], implicit [[COPY5]], implicit [[COPY3]], implicit [[COPY1]], implicit [[GLOBAL_LOAD_DWORD]]
    %0:vreg_64_align2 = IMPLICIT_DEF
-    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 3, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 3, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    %3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 3, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    %4:vgpr_32 = GLOBAL_LOAD_DWORD %0, 12, 3, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    %5:vgpr_32 = GLOBAL_LOAD_DWORD %0, 16, 3, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 3, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+    %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 3, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+    %3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 3, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+    %4:vgpr_32 = GLOBAL_LOAD_DWORD %0, 12, 3, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+    %5:vgpr_32 = GLOBAL_LOAD_DWORD %0, 16, 3, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
    S_NOP 0, implicit %1, implicit %2, implicit %3, implicit %4, implicit %5
 ...

@ -105,12 +105,12 @@ body:             |
    ; GCN-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[GLOBAL_LOAD_DWORDX2_]].sub1
    ; GCN-NEXT: S_NOP 0, implicit [[COPY4]], implicit [[COPY5]], implicit [[COPY3]], implicit [[COPY1]], implicit [[COPY6]], implicit [[COPY7]]
    %0:vreg_64_align2 = IMPLICIT_DEF
-    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    %3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    %4:vgpr_32 = GLOBAL_LOAD_DWORD %0, 12, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    %5:vgpr_32 = GLOBAL_LOAD_DWORD %0, 16, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    %6:vgpr_32 = GLOBAL_LOAD_DWORD %0, 20, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+    %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+    %3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+    %4:vgpr_32 = GLOBAL_LOAD_DWORD %0, 12, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+    %5:vgpr_32 = GLOBAL_LOAD_DWORD %0, 16, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+    %6:vgpr_32 = GLOBAL_LOAD_DWORD %0, 20, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
    S_NOP 0, implicit %1, implicit %2, implicit %3, implicit %4, implicit %5, implicit %6
 ...

@ -126,8 +126,8 @@ body:             |
    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY killed [[GLOBAL_LOAD_DWORDX4_]].sub2_sub3
    ; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]]
    %0:vreg_64_align2 = IMPLICIT_DEF
-    %1:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 %0, 0, 0, implicit $exec :: (load (s64) from `i64 addrspace(1)* undef`, align 4, addrspace 1)
-    %2:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 %0, 8, 0, implicit $exec :: (load (s64) from `i64 addrspace(1)* undef`, align 4, addrspace 1)
+    %1:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 %0, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+    %2:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 %0, 8, 0, implicit $exec :: (load (s64) from `ptr addrspace(1) undef`, align 4, addrspace 1)
    S_NOP 0, implicit %1, implicit %2
 ...

@ -143,8 +143,8 @@ body:             |
    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[GLOBAL_LOAD_DWORDX4_]].sub3
    ; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]]
    %0:vreg_64_align2 = IMPLICIT_DEF
-    %1:vreg_96_align2 = GLOBAL_LOAD_DWORDX3 %0, 12, 0, implicit $exec :: (load (s96) from `i128 addrspace(1)* undef`, align 8, addrspace 1)
-    %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 24, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    %1:vreg_96_align2 = GLOBAL_LOAD_DWORDX3 %0, 12, 0, implicit $exec :: (load (s96) from `ptr addrspace(1) undef`, align 8, addrspace 1)
+    %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 24, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
    S_NOP 0, implicit %1, implicit %2
 ...

@ -160,8 +160,8 @@ body:             |
    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_64_align2 = COPY killed [[GLOBAL_LOAD_DWORDX3_]].sub1_sub2
    ; GCN-NEXT: S_NOP 0, implicit [[COPY1]], implicit [[COPY]]
    %0:vreg_64_align2 = IMPLICIT_DEF
-    %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 12, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    %1:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 %0, 16, 0, implicit $exec :: (load (s64) from `i64 addrspace(1)* undef`, align 8, addrspace 1)
+    %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 12, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+    %1:vreg_64_align2 = GLOBAL_LOAD_DWORDX2 %0, 16, 0, implicit $exec :: (load (s64) from `ptr addrspace(1) undef`, align 8, addrspace 1)
    S_NOP 0, implicit %1, implicit %2
 ...

@ -176,8 +176,8 @@ body:             |
    ; GCN-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:agpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 4, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, addrspace 1)
    ; GCN-NEXT: S_NOP 0, implicit [[GLOBAL_LOAD_DWORD]], implicit [[GLOBAL_LOAD_DWORD1]]
    %0:vreg_64_align2 = IMPLICIT_DEF
-    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    %2:agpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+    %2:agpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
    S_NOP 0, implicit %1, implicit %2
 ...

@ -192,8 +192,8 @@ body:             |
    ; GCN-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 8, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, addrspace 1)
    ; GCN-NEXT: S_NOP 0, implicit [[GLOBAL_LOAD_DWORD]], implicit [[GLOBAL_LOAD_DWORD1]]
    %0:vreg_64_align2 = IMPLICIT_DEF
-    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+    %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
    S_NOP 0, implicit %1, implicit %2
 ...

@ -208,8 +208,8 @@ body:             |
    ; GCN-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 3, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, addrspace 1)
    ; GCN-NEXT: S_NOP 0, implicit [[GLOBAL_LOAD_DWORD]], implicit [[GLOBAL_LOAD_DWORD1]]
    %0:vreg_64_align2 = IMPLICIT_DEF
-    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 3, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, addrspace 1)
+    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+    %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 3, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, addrspace 1)
    S_NOP 0, implicit %1, implicit %2
 ...

@ -224,8 +224,8 @@ body:             |
    ; GCN-NEXT: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 4, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, addrspace 1)
    ; GCN-NEXT: S_NOP 0, implicit [[GLOBAL_LOAD_DWORD]], implicit [[GLOBAL_LOAD_DWORD1]]
    %0:vreg_64_align2 = IMPLICIT_DEF
-    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 1, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 1, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+    %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
    S_NOP 0, implicit %1, implicit %2
 ...

@ -243,8 +243,8 @@ body:             |
    ; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]]
    %0:sreg_64_xexec = IMPLICIT_DEF
    %1:vgpr_32 = IMPLICIT_DEF
-    %2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    %2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+    %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 4, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
    S_NOP 0, implicit %2, implicit %3
 ...

@ -264,9 +264,9 @@ body:             |
    ; GCN-NEXT: S_NOP 0, implicit [[COPY2]], implicit [[COPY3]], implicit [[COPY1]]
    %0:sreg_64_xexec = IMPLICIT_DEF
    %1:vgpr_32 = IMPLICIT_DEF
-    %2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 1, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 4, 1, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    %4:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 8, 1, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    %2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 1, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+    %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 4, 1, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+    %4:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 8, 1, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
    S_NOP 0, implicit %2, implicit %3, implicit %4
 ...

@ -288,10 +288,10 @@ body:             |
    ; GCN-NEXT: S_NOP 0, implicit [[COPY4]], implicit [[COPY5]], implicit [[COPY3]], implicit [[COPY1]]
    %0:sreg_64_xexec = IMPLICIT_DEF
    %1:vgpr_32 = IMPLICIT_DEF
-    %2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 2, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 4, 2, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    %4:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 8, 2, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    %5:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 12, 2, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    %2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 0, 2, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+    %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 4, 2, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+    %4:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 8, 2, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+    %5:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 12, 2, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
    S_NOP 0, implicit %2, implicit %3, implicit %4, implicit %5
 ...

@ -316,12 +316,12 @@ body:             |
    ; GCN-NEXT: S_NOP 0, implicit [[COPY4]], implicit [[COPY5]], implicit [[COPY3]], implicit [[COPY1]], implicit [[COPY6]], implicit [[COPY7]]
    %0:sreg_64_xexec = IMPLICIT_DEF
    %1:vgpr_32 = IMPLICIT_DEF
-    %2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 4, 3, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 8, 3, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    %4:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 12, 3, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    %5:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 16, 3, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    %6:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 20, 3, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    %7:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 24, 3, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    %2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 4, 3, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+    %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 8, 3, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+    %4:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 12, 3, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+    %5:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 16, 3, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+    %6:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 20, 3, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+    %7:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1, 24, 3, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
    S_NOP 0, implicit %2, implicit %3, implicit %4, implicit %5, implicit %6, implicit %7
 ...

@ -339,8 +339,8 @@ body:             |
    ; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]]
    %0:sreg_64_xexec = IMPLICIT_DEF
    %1:vgpr_32 = IMPLICIT_DEF
-    %2:vreg_64_align2 = GLOBAL_LOAD_DWORDX2_SADDR %0, %1, 0, 0, implicit $exec :: (load (s64) from `i64 addrspace(1)* undef`, align 4, addrspace 1)
-    %3:vreg_64_align2 = GLOBAL_LOAD_DWORDX2_SADDR %0, %1, 8, 0, implicit $exec :: (load (s64) from `i64 addrspace(1)* undef`, align 4, addrspace 1)
+    %2:vreg_64_align2 = GLOBAL_LOAD_DWORDX2_SADDR %0, %1, 0, 0, implicit $exec :: (load (s64) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+    %3:vreg_64_align2 = GLOBAL_LOAD_DWORDX2_SADDR %0, %1, 8, 0, implicit $exec :: (load (s64) from `ptr addrspace(1) undef`, align 4, addrspace 1)
    S_NOP 0, implicit %2, implicit %3
 ...

@ -357,8 +357,8 @@ body:             |
    ; GCN-NEXT: S_NOP 0, implicit [[GLOBAL_LOAD_DWORD]], implicit [[GLOBAL_LOAD_DWORD_SADDR]]
    %0:sreg_64_xexec = IMPLICIT_DEF
    %1:vreg_64_align2 = IMPLICIT_DEF
-    %2:vgpr_32 = GLOBAL_LOAD_DWORD %1, 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1.sub0, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    %2:vgpr_32 = GLOBAL_LOAD_DWORD %1, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+    %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1.sub0, 4, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
    S_NOP 0, implicit %2, implicit %3
 ...

@ -375,8 +375,8 @@ body:             |
    ; GCN-NEXT: S_NOP 0, implicit [[GLOBAL_LOAD_DWORD_SADDR]], implicit [[GLOBAL_LOAD_DWORD_SADDR1]]
    %0:sgpr_128 = IMPLICIT_DEF
    %1:vgpr_32 = IMPLICIT_DEF
-    %2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0.sub0_sub1, %1, 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0.sub2_sub3, %1, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    %2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0.sub0_sub1, %1, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+    %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0.sub2_sub3, %1, 4, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
    S_NOP 0, implicit %2, implicit %3
 ...

@ -393,8 +393,8 @@ body:             |
    ; GCN-NEXT: S_NOP 0, implicit [[GLOBAL_LOAD_DWORD_SADDR]], implicit [[GLOBAL_LOAD_DWORD_SADDR1]]
    %0:sreg_64_xexec = IMPLICIT_DEF
    %1:vreg_64_align2 = IMPLICIT_DEF
-    %2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1.sub0, 0, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1.sub1, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    %2:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1.sub0, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
+    %3:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %0, %1.sub1, 4, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
    S_NOP 0, implicit %2, implicit %3
 ...

@ -409,8 +409,8 @@ body:             |
    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed [[GLOBAL_LOAD_DWORDX2_]].sub0
    ; GCN-NEXT: S_NOP 0, implicit [[COPY]], implicit [[COPY1]]
    %0:vreg_64_align2 = IMPLICIT_DEF
-    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 4, basealign 8, addrspace 1)
-    %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `float addrspace(1)* undef`, align 4, addrspace 1)
+    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef` + 4, basealign 8, addrspace 1)
+    %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 4, addrspace 1)
    S_NOP 0, implicit %1, implicit %2
 ...

@ -428,9 +428,9 @@ body:             |
    ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[COPY]].sub0
    ; GCN-NEXT: S_NOP 0, implicit [[COPY2]], implicit [[COPY3]]
    %0:vreg_64_align2 = IMPLICIT_DEF
-    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 4, align 4, addrspace 1)
-    %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `float addrspace(1)* undef`, align 16, addrspace 1)
-    %3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 0, implicit $exec :: (load (s32) from `i32 addrspace(1)* undef` + 8, align 8, addrspace 1)
+    %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef` + 4, align 4, addrspace 1)
+    %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef`, align 16, addrspace 1)
+    %3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 0, implicit $exec :: (load (s32) from `ptr addrspace(1) undef` + 8, align 8, addrspace 1)
    S_NOP 0, implicit %1, implicit %2
 ...

@ -449,8 +449,8 @@ body:             |
    %0:vreg_64_align2 = IMPLICIT_DEF
    %1:vgpr_32 = IMPLICIT_DEF
    %2:vgpr_32 = IMPLICIT_DEF
-    GLOBAL_STORE_DWORD %0, killed %1, 0, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    GLOBAL_STORE_DWORD killed %0, killed %2, 4, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    GLOBAL_STORE_DWORD %0, killed %1, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
+    GLOBAL_STORE_DWORD killed %0, killed %2, 4, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
 ...

 ---
@ -470,9 +470,9 @@ body:             |
    %1:vgpr_32 = IMPLICIT_DEF
    %2:vgpr_32 = IMPLICIT_DEF
    %3:vgpr_32 = IMPLICIT_DEF
-    GLOBAL_STORE_DWORD %0, killed %1, 4, 1, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    GLOBAL_STORE_DWORD %0, killed %2, 8, 1, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    GLOBAL_STORE_DWORD killed %0, killed %3, 12, 1, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    GLOBAL_STORE_DWORD %0, killed %1, 4, 1, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
+    GLOBAL_STORE_DWORD %0, killed %2, 8, 1, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
+    GLOBAL_STORE_DWORD killed %0, killed %3, 12, 1, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
 ...

 ---
@ -489,10 +489,10 @@ body:             |
    ; GCN-NEXT: GLOBAL_STORE_DWORDX4 [[DEF]], killed [[REG_SEQUENCE2]], 4, 2, implicit $exec :: (store (s128) into `ptr addrspace(1) undef`, align 4, addrspace 1)
    %0:vreg_64_align2 = IMPLICIT_DEF
    %1:vreg_128 = IMPLICIT_DEF
-    GLOBAL_STORE_DWORD %0, %1.sub1, 8, 2, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    GLOBAL_STORE_DWORD %0, %1.sub2, 12, 2, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    GLOBAL_STORE_DWORD %0, %1.sub3, 16, 2, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    GLOBAL_STORE_DWORD killed %0, %1.sub0, 4, 2, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    GLOBAL_STORE_DWORD %0, %1.sub1, 8, 2, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
+    GLOBAL_STORE_DWORD %0, %1.sub2, 12, 2, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
+    GLOBAL_STORE_DWORD %0, %1.sub3, 16, 2, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
+    GLOBAL_STORE_DWORD killed %0, %1.sub0, 4, 2, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
 ...

 ---
@ -518,11 +518,11 @@ body:             |
    %3:agpr_32 = IMPLICIT_DEF
    %4:agpr_32 = IMPLICIT_DEF
    %5:agpr_32 = IMPLICIT_DEF
-    GLOBAL_STORE_DWORD %0, %1, 4, 3, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    GLOBAL_STORE_DWORD %0, %2, 8, 3, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 8, addrspace 1)
-    GLOBAL_STORE_DWORD %0, %3, 12, 3, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    GLOBAL_STORE_DWORD %0, %4, 16, 3, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    GLOBAL_STORE_DWORD %0, %5, 20, 3, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    GLOBAL_STORE_DWORD %0, %1, 4, 3, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
+    GLOBAL_STORE_DWORD %0, %2, 8, 3, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 8, addrspace 1)
+    GLOBAL_STORE_DWORD %0, %3, 12, 3, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
+    GLOBAL_STORE_DWORD %0, %4, 16, 3, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
+    GLOBAL_STORE_DWORD %0, %5, 20, 3, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
 ...

 ---
@ -551,12 +551,12 @@ body:             |
    %4:vgpr_32 = IMPLICIT_DEF
    %5:vgpr_32 = IMPLICIT_DEF
    %6:vgpr_32 = IMPLICIT_DEF
-    GLOBAL_STORE_DWORD %0, %1, 4, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 8, addrspace 1)
-    GLOBAL_STORE_DWORD %0, %2, 8, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    GLOBAL_STORE_DWORD %0, %3, 12, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    GLOBAL_STORE_DWORD %0, %4, 16, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    GLOBAL_STORE_DWORD %0, %5, 20, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    GLOBAL_STORE_DWORD %0, %6, 24, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    GLOBAL_STORE_DWORD %0, %1, 4, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 8, addrspace 1)
+    GLOBAL_STORE_DWORD %0, %2, 8, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
+    GLOBAL_STORE_DWORD %0, %3, 12, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
+    GLOBAL_STORE_DWORD %0, %4, 16, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
+    GLOBAL_STORE_DWORD %0, %5, 20, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
+    GLOBAL_STORE_DWORD %0, %6, 24, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
 ...

 ---
@ -573,8 +573,8 @@ body:             |
    %0:vreg_64_align2 = IMPLICIT_DEF
    %1:vreg_64_align2 = IMPLICIT_DEF
    %2:vreg_64_align2 = IMPLICIT_DEF
-    GLOBAL_STORE_DWORDX2 %0, killed %1, 4, 0, implicit $exec :: (store (s64) into `i64 addrspace(1)* undef`, align 4, addrspace 1)
-    GLOBAL_STORE_DWORDX2 %0, killed %2, 12, 0, implicit $exec :: (store (s64) into `i64 addrspace(1)* undef`, align 4, addrspace 1)
+    GLOBAL_STORE_DWORDX2 %0, killed %1, 4, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) undef`, align 4, addrspace 1)
+    GLOBAL_STORE_DWORDX2 %0, killed %2, 12, 0, implicit $exec :: (store (s64) into `ptr addrspace(1) undef`, align 4, addrspace 1)
 ...

 ---
@ -591,8 +591,8 @@ body:             |
    %0:vreg_64_align2 = IMPLICIT_DEF
    %1:vreg_96_align2 = IMPLICIT_DEF
    %2:vgpr_32 = IMPLICIT_DEF
-    GLOBAL_STORE_DWORDX3 %0, killed %1, 4, 0, implicit $exec :: (store (s96) into `i64 addrspace(1)* undef`, align 16, addrspace 1)
-    GLOBAL_STORE_DWORD %0, killed %2, 16, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    GLOBAL_STORE_DWORDX3 %0, killed %1, 4, 0, implicit $exec :: (store (s96) into `ptr addrspace(1) undef`, align 16, addrspace 1)
+    GLOBAL_STORE_DWORD %0, killed %2, 16, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
 ...

 ---
@ -609,8 +609,8 @@ body:             |
    %0:vreg_64_align2 = IMPLICIT_DEF
    %1:agpr_32 = IMPLICIT_DEF
    %2:vgpr_32 = IMPLICIT_DEF
-    GLOBAL_STORE_DWORD %0, killed %1, 0, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    GLOBAL_STORE_DWORD killed %0, killed %2, 4, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    GLOBAL_STORE_DWORD %0, killed %1, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
+    GLOBAL_STORE_DWORD killed %0, killed %2, 4, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
 ...

 ---
@ -627,8 +627,8 @@ body:             |
    %0:vreg_64_align2 = IMPLICIT_DEF
    %1:vgpr_32 = IMPLICIT_DEF
    %2:vgpr_32 = IMPLICIT_DEF
-    GLOBAL_STORE_DWORD %0, killed %1, 0, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    GLOBAL_STORE_DWORD killed %0, killed %2, 6, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    GLOBAL_STORE_DWORD %0, killed %1, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
+    GLOBAL_STORE_DWORD killed %0, killed %2, 6, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
 ...

 ---
@ -645,8 +645,8 @@ body:             |
    %0:vreg_64_align2 = IMPLICIT_DEF
    %1:vgpr_32 = IMPLICIT_DEF
    %2:vgpr_32 = IMPLICIT_DEF
-    GLOBAL_STORE_DWORD %0, killed %1, 0, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    GLOBAL_STORE_DWORD killed %0, killed %2, 2, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 2, addrspace 1)
+    GLOBAL_STORE_DWORD %0, killed %1, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
+    GLOBAL_STORE_DWORD killed %0, killed %2, 2, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 2, addrspace 1)
 ...

 ---
@ -663,8 +663,8 @@ body:             |
    %0:vreg_64_align2 = IMPLICIT_DEF
    %1:vgpr_32 = IMPLICIT_DEF
    %2:vgpr_32 = IMPLICIT_DEF
-    GLOBAL_STORE_DWORD %0, killed %1, 0, 1, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    GLOBAL_STORE_DWORD killed %0, killed %2, 4, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    GLOBAL_STORE_DWORD %0, killed %1, 0, 1, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
+    GLOBAL_STORE_DWORD killed %0, killed %2, 4, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
 ...

 ---
@ -681,8 +681,8 @@ body:             |
    %0:vreg_128_align2 = IMPLICIT_DEF
    %1:vgpr_32 = IMPLICIT_DEF
    %2:vgpr_32 = IMPLICIT_DEF
-    GLOBAL_STORE_DWORD %0.sub0_sub1, killed %1, 0, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    GLOBAL_STORE_DWORD %0.sub2_sub3, killed %2, 4, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    GLOBAL_STORE_DWORD %0.sub0_sub1, killed %1, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
+    GLOBAL_STORE_DWORD %0.sub2_sub3, killed %2, 4, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
 ...

 ---
@ -701,8 +701,8 @@ body:             |
    %1:vgpr_32 = IMPLICIT_DEF
    %2:vgpr_32 = IMPLICIT_DEF
    %3:vgpr_32 = IMPLICIT_DEF
-    GLOBAL_STORE_DWORD_SADDR %1, %2, %0, 0, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    GLOBAL_STORE_DWORD_SADDR %1, %3, %0, 4, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    GLOBAL_STORE_DWORD_SADDR %1, %2, %0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
+    GLOBAL_STORE_DWORD_SADDR %1, %3, %0, 4, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
 ...

 ---
@ -724,9 +724,9 @@ body:             |
    %2:vgpr_32 = IMPLICIT_DEF
    %3:vgpr_32 = IMPLICIT_DEF
    %4:vgpr_32 = IMPLICIT_DEF
-    GLOBAL_STORE_DWORD_SADDR %1, %2, %0, 4, 1, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    GLOBAL_STORE_DWORD_SADDR %1, %3, %0, 8, 1, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 12, 1, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    GLOBAL_STORE_DWORD_SADDR %1, %2, %0, 4, 1, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
+    GLOBAL_STORE_DWORD_SADDR %1, %3, %0, 8, 1, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
+    GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 12, 1, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
 ...

 ---
@ -751,10 +751,10 @@ body:             |
    %3:vgpr_32 = IMPLICIT_DEF
    %4:vgpr_32 = IMPLICIT_DEF
    %5:vgpr_32 = IMPLICIT_DEF
-    GLOBAL_STORE_DWORD_SADDR %1, %2, %0, 4, 2, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    GLOBAL_STORE_DWORD_SADDR %1, %3, %0, 8, 2, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 12, 2, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    GLOBAL_STORE_DWORD_SADDR %1, %5, %0, 16, 2, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    GLOBAL_STORE_DWORD_SADDR %1, %2, %0, 4, 2, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
+    GLOBAL_STORE_DWORD_SADDR %1, %3, %0, 8, 2, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
+    GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 12, 2, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
+    GLOBAL_STORE_DWORD_SADDR %1, %5, %0, 16, 2, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
 ...

 ---
@ -785,12 +785,12 @@ body:             |
    %5:vgpr_32 = IMPLICIT_DEF
    %6:vgpr_32 = IMPLICIT_DEF
    %7:vgpr_32 = IMPLICIT_DEF
-    GLOBAL_STORE_DWORD_SADDR %1, %2, %0, 4, 3, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    GLOBAL_STORE_DWORD_SADDR %1, %3, %0, 8, 3, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 12, 3, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    GLOBAL_STORE_DWORD_SADDR %1, %5, %0, 16, 3, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    GLOBAL_STORE_DWORD_SADDR %1, %6, %0, 20, 3, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    GLOBAL_STORE_DWORD_SADDR %1, %7, %0, 24, 3, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    GLOBAL_STORE_DWORD_SADDR %1, %2, %0, 4, 3, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
+    GLOBAL_STORE_DWORD_SADDR %1, %3, %0, 8, 3, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
+    GLOBAL_STORE_DWORD_SADDR %1, %4, %0, 12, 3, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
+    GLOBAL_STORE_DWORD_SADDR %1, %5, %0, 16, 3, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
+    GLOBAL_STORE_DWORD_SADDR %1, %6, %0, 20, 3, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
+    GLOBAL_STORE_DWORD_SADDR %1, %7, %0, 24, 3, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
 ...

 ---
@ -809,8 +809,8 @@ body:             |
    %1:vreg_64_align2 = IMPLICIT_DEF
    %2:vgpr_32 = IMPLICIT_DEF
    %3:vgpr_32 = IMPLICIT_DEF
-    GLOBAL_STORE_DWORD_SADDR %1.sub0, %2, %0, 0, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    GLOBAL_STORE_DWORD %1, %3, 4, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    GLOBAL_STORE_DWORD_SADDR %1.sub0, %2, %0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
+    GLOBAL_STORE_DWORD %1, %3, 4, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
 ...

 ---
@ -829,8 +829,8 @@ body:             |
    %1:vreg_64_align2 = IMPLICIT_DEF
    %2:vgpr_32 = IMPLICIT_DEF
    %3:vgpr_32 = IMPLICIT_DEF
-    GLOBAL_STORE_DWORD_SADDR %1.sub0, %2, %0, 0, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    GLOBAL_STORE_DWORD_SADDR %1.sub1, %3, %0, 4, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    GLOBAL_STORE_DWORD_SADDR %1.sub0, %2, %0, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
+    GLOBAL_STORE_DWORD_SADDR %1.sub1, %3, %0, 4, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
 ...

 ---
@ -849,6 +849,6 @@ body:             |
    %1:vgpr_32 = IMPLICIT_DEF
    %2:vgpr_32 = IMPLICIT_DEF
    %3:vgpr_32 = IMPLICIT_DEF
-    GLOBAL_STORE_DWORD_SADDR %1, %2, %0.sub0_sub1, 0, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
-    GLOBAL_STORE_DWORD_SADDR %1, %3, %0.sub2_sub3, 4, 0, implicit $exec :: (store (s32) into `i32 addrspace(1)* undef`, align 4, addrspace 1)
+    GLOBAL_STORE_DWORD_SADDR %1, %2, %0.sub0_sub1, 0, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
+    GLOBAL_STORE_DWORD_SADDR %1, %3, %0.sub2_sub3, 4, 0, implicit $exec :: (store (s32) into `ptr addrspace(1) undef`, align 4, addrspace 1)
 ...
--- a/llvm/test/CodeGen/AMDGPU/merge-load-store-vreg.mir
+++ b/llvm/test/CodeGen/AMDGPU/merge-load-store-vreg.mir
@ -29,10 +29,10 @@
      ret void

    bb2:
-      %tmp = getelementptr inbounds [256 x float], [256 x float] addrspace(3)* @0, i32 0, i32 0
-      %tmp1 = getelementptr inbounds float, float addrspace(3)* %tmp, i32 8
-      %tmp2 = getelementptr inbounds float, float addrspace(3)* %tmp, i32 16
-      %tmp3 = getelementptr inbounds float, float addrspace(3)* %tmp, i32 24
+      %tmp = getelementptr inbounds [256 x float], ptr addrspace(3) @0, i32 0, i32 0
+      %tmp1 = getelementptr inbounds float, ptr addrspace(3) %tmp, i32 8
+      %tmp2 = getelementptr inbounds float, ptr addrspace(3) %tmp, i32 16
+      %tmp3 = getelementptr inbounds float, ptr addrspace(3) %tmp, i32 24
      br label %bb1
    }

@ -44,10 +44,10 @@
      ret void

    bb2:
-      %tmp = getelementptr inbounds [256 x float], [256 x float] addrspace(3)* @0, i32 0, i32 0
-      %tmp1 = getelementptr inbounds float, float addrspace(3)* %tmp, i32 8
-      %tmp2 = getelementptr inbounds float, float addrspace(3)* %tmp, i32 16
-      %tmp3 = getelementptr inbounds float, float addrspace(3)* %tmp, i32 24
+      %tmp = getelementptr inbounds [256 x float], ptr addrspace(3) @0, i32 0, i32 0
+      %tmp1 = getelementptr inbounds float, ptr addrspace(3) %tmp, i32 8
+      %tmp2 = getelementptr inbounds float, ptr addrspace(3) %tmp, i32 16
+      %tmp3 = getelementptr inbounds float, ptr addrspace(3) %tmp, i32 24
      br label %bb1
    }

@ -59,10 +59,10 @@
      ret void

    bb2:
-      %tmp = getelementptr inbounds [256 x float], [256 x float] addrspace(3)* @0, i32 0, i32 0
-      %tmp1 = getelementptr inbounds float, float addrspace(3)* %tmp, i32 8
-      %tmp2 = getelementptr inbounds float, float addrspace(3)* %tmp, i32 16
-      %tmp3 = getelementptr inbounds float, float addrspace(3)* %tmp, i32 24
+      %tmp = getelementptr inbounds [256 x float], ptr addrspace(3) @0, i32 0, i32 0
+      %tmp1 = getelementptr inbounds float, ptr addrspace(3) %tmp, i32 8
+      %tmp2 = getelementptr inbounds float, ptr addrspace(3) %tmp, i32 16
+      %tmp3 = getelementptr inbounds float, ptr addrspace(3) %tmp, i32 24
      br label %bb1
    }
 ---
--- a/llvm/test/CodeGen/AMDGPU/omod.ll
+++ b/llvm/test/CodeGen/AMDGPU/omod.ll
@ -508,7 +508,7 @@ define amdgpu_ps void @v_omod_mul2_med3(float %x, float %y, float %z) #0 {
 ; GFX12-NEXT:    s_endpgm
  %fmed3 = call float @llvm.amdgcn.fmed3.f32(float %x, float %y, float %z)
  %div2 = fmul float %fmed3, 2.0
-  store float %div2, float addrspace(1)* undef
+  store float %div2, ptr addrspace(1) undef
  ret void
 }

--- a/llvm/test/CodeGen/AMDGPU/opencl-printf-unsupported.ll
+++ b/llvm/test/CodeGen/AMDGPU/opencl-printf-unsupported.ll
@ -54,8 +54,8 @@ define amdgpu_kernel void @poison_interposable_initializer_gv(i32 %n) {
 define amdgpu_kernel void @not_constant_gv(i32 %n) {
 entry:
  %str = alloca [9 x i8], align 1, addrspace(5)
-  %arraydecay = getelementptr inbounds [9 x i8], [9 x i8] addrspace(5)* %str, i32 0, i32 0
-  %call1 = call i32 (i8 addrspace(4)*, ...) @printf(i8 addrspace(4)* getelementptr inbounds ([6 x i8], [6 x i8] addrspace(4)* @not.constant, i32 0, i32 0), i8 addrspace(5)* %arraydecay, i32 %n)
+  %arraydecay = getelementptr inbounds [9 x i8], ptr addrspace(5) %str, i32 0, i32 0
+  %call1 = call i32 (ptr addrspace(4), ...) @printf(ptr addrspace(4) @not.constant, ptr addrspace(5) %arraydecay, i32 %n)
  ret void
 }

--- a/llvm/test/CodeGen/AMDGPU/opencl-printf.ll
+++ b/llvm/test/CodeGen/AMDGPU/opencl-printf.ll
@ -752,8 +752,8 @@ define amdgpu_kernel void @test_kernel_addrspacecasted_format_str(i32 %n) {
 ;
 entry:
  %str = alloca [9 x i8], align 1, addrspace(5)
-  %arraydecay = getelementptr inbounds [9 x i8], [9 x i8] addrspace(5)* %str, i32 0, i32 0
-  %call1 = call i32 (ptr addrspace(4), ...) @printf(ptr addrspace(4) addrspacecast (i8 addrspace(1)* getelementptr inbounds ([6 x i8], ptr addrspace(1) @str.as1, i32 0, i32 0) to ptr addrspace(4)), ptr addrspace(5) %arraydecay, i32 %n)
+  %arraydecay = getelementptr inbounds [9 x i8], ptr addrspace(5) %str, i32 0, i32 0
+  %call1 = call i32 (ptr addrspace(4), ...) @printf(ptr addrspace(4) addrspacecast (ptr addrspace(1) @str.as1 to ptr addrspace(4)), ptr addrspace(5) %arraydecay, i32 %n)
  ret void
 }

--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-aggregate.ll
@ -133,7 +133,7 @@ define amdgpu_vs void @promote_memmove_aggr() #0 {
  store float 1.0, ptr addrspace(5) %foo1
  %foo2 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 3
  store float 2.0, ptr addrspace(5) %foo2
-  call void @llvm.memmove.p5i8.p5i8.i32(ptr addrspace(5) align 4 %f1, ptr addrspace(5) align 4 %foo1, i32 16, i1 false)
+  call void @llvm.memmove.p5.p5.i32(ptr addrspace(5) align 4 %f1, ptr addrspace(5) align 4 %foo1, i32 16, i1 false)
  %foo3 = load float, ptr addrspace(5) %f1
  store float %foo3, ptr addrspace(1) @pv
  ret void
@ -160,7 +160,7 @@ define amdgpu_vs void @promote_memcpy_aggr() #0 {
  %foo5 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 %foo4
  store float 3.0, ptr addrspace(5) %foo5

-  call void @llvm.memcpy.p5i8.p5i8.i32(ptr addrspace(5) align 4 %f1, ptr addrspace(5) align 4 %foo2, i32 8, i1 false)
+  call void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) align 4 %f1, ptr addrspace(5) align 4 %foo2, i32 8, i1 false)
  %foo6 = load float, ptr addrspace(5) %f1
  store float %foo6, ptr addrspace(1) @pv
  ret void
@ -177,7 +177,7 @@ define amdgpu_vs void @promote_memcpy_identity_aggr() #0 {
  store float 1.0, ptr addrspace(5) %foo1
  %foo2 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 3
  store float 2.0, ptr addrspace(5) %foo2
-  call void @llvm.memcpy.p5i8.p5i8.i32(ptr addrspace(5) align 4 %f1, ptr addrspace(5) align 4 %f1, i32 20, i1 false)
+  call void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) align 4 %f1, ptr addrspace(5) align 4 %f1, i32 20, i1 false)
  %foo3 = load float, ptr addrspace(5) %f1
  store float %foo3, ptr addrspace(1) @pv
  ret void
@ -229,7 +229,7 @@ define amdgpu_vs void @promote_memcpy_two_aggrs() #0 {
  %foo5 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 %foo4
  store float 3.0, ptr addrspace(5) %foo5

-  call void @llvm.memcpy.p5i8.p5i8.i32(ptr addrspace(5) align 4 %f2, ptr addrspace(5) align 4 %f1, i32 8, i1 false)
+  call void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) align 4 %f2, ptr addrspace(5) align 4 %f1, i32 8, i1 false)

  %foo6 = getelementptr [5 x float], ptr addrspace(5) %f2, i32 0, i32 %foo4
  %foo7 = load float, ptr addrspace(5) %foo6
@ -266,7 +266,7 @@ define amdgpu_vs void @promote_memcpy_p1p5_aggr(ptr addrspace(1) inreg %src) #0
  %foo5 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 %foo4
  store float 3.0, ptr addrspace(5) %foo5

-  call void @llvm.memcpy.p1i8.p5i8.i32(ptr addrspace(1) align 4 @pv, ptr addrspace(5) align 4 %f1, i32 8, i1 false)
+  call void @llvm.memcpy.p1.p5.i32(ptr addrspace(1) align 4 @pv, ptr addrspace(5) align 4 %f1, i32 8, i1 false)
  ret void
 }

@ -289,16 +289,16 @@ define amdgpu_vs void @promote_memcpy_inline_aggr() #0 {
  %foo5 = getelementptr [5 x float], ptr addrspace(5) %f1, i32 0, i32 %foo4
  store float 3.0, ptr addrspace(5) %foo5

-  call void @llvm.memcpy.inline.p5i8.p5i8.i32(ptr addrspace(5) align 4 %f1, ptr addrspace(5) align 4 %foo2, i32 8, i1 false)
+  call void @llvm.memcpy.inline.p5.p5.i32(ptr addrspace(5) align 4 %f1, ptr addrspace(5) align 4 %foo2, i32 8, i1 false)
  %foo6 = load float, ptr addrspace(5) %f1
  store float %foo6, ptr addrspace(1) @pv
  ret void
 }

-declare void @llvm.memcpy.p5i8.p5i8.i32(ptr addrspace(5) nocapture writeonly, ptr addrspace(5) nocapture readonly, i32, i1 immarg)
-declare void @llvm.memcpy.p1i8.p5i8.i32(ptr addrspace(1) nocapture writeonly, ptr addrspace(5) nocapture readonly, i32, i1 immarg)
-declare void @llvm.memcpy.inline.p5i8.p5i8.i32(ptr addrspace(5) nocapture writeonly, ptr addrspace(5) nocapture readonly, i32, i1 immarg)
-declare void @llvm.memmove.p5i8.p5i8.i32(ptr addrspace(5) nocapture writeonly, ptr addrspace(5) nocapture readonly, i32, i1 immarg)
+declare void @llvm.memcpy.p5.p5.i32(ptr addrspace(5) nocapture writeonly, ptr addrspace(5) nocapture readonly, i32, i1 immarg)
+declare void @llvm.memcpy.p1.p5.i32(ptr addrspace(1) nocapture writeonly, ptr addrspace(5) nocapture readonly, i32, i1 immarg)
+declare void @llvm.memcpy.inline.p5.p5.i32(ptr addrspace(5) nocapture writeonly, ptr addrspace(5) nocapture readonly, i32, i1 immarg)
+declare void @llvm.memmove.p5.p5.i32(ptr addrspace(5) nocapture writeonly, ptr addrspace(5) nocapture readonly, i32, i1 immarg)

@tmp_g = external addrspace(1) global { [4 x double], <2 x double>, <3 x double>, <4 x double> }
@frag_color = external addrspace(1) global <4 x float>
--- a/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir
+++ b/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir
@ -11,15 +11,15 @@

  @sched_dbg_value_crash.tmp6 = internal unnamed_addr addrspace(3) global [256 x [16 x i8]] undef, align 16

-  define amdgpu_kernel void @sched_dbg_value_crash(i8 addrspace(1)* nocapture readonly %arg, i32 addrspace(1)* nocapture readonly %arg1, %struct.widget.0 addrspace(1)* nocapture readonly %arg2, %struct.baz addrspace(1)* nocapture readonly %arg3, %struct.snork addrspace(1)* nocapture %arg4) local_unnamed_addr #2 {
+  define amdgpu_kernel void @sched_dbg_value_crash(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture readonly %arg1, ptr addrspace(1) nocapture readonly %arg2, ptr addrspace(1) nocapture readonly %arg3, ptr addrspace(1) nocapture %arg4) local_unnamed_addr #2 {
  bb:
-    %0 = getelementptr i32, i32 addrspace(1)* %arg1, i64 0, !amdgpu.uniform !3, !amdgpu.noclobber !3
+    %0 = getelementptr i32, ptr addrspace(1) %arg1, i64 0, !amdgpu.uniform !3, !amdgpu.noclobber !3
    %tmp5 = alloca %struct.wombat, align 16, addrspace(5)
-    %1 = call noalias nonnull dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
-    %2 = bitcast i8 addrspace(4)* %1 to i32 addrspace(4)*
-    %3 = getelementptr inbounds i32, i32 addrspace(4)* %2, i64 1
-    %4 = bitcast i32 addrspace(4)* %3 to <2 x i32> addrspace(4)*, !amdgpu.uniform !3, !amdgpu.noclobber !3
-    %5 = load <2 x i32>, <2 x i32> addrspace(4)* %4, align 4, !invariant.load !3
+    %1 = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+    %2 = bitcast ptr addrspace(4) %1 to ptr addrspace(4)
+    %3 = getelementptr inbounds i32, ptr addrspace(4) %2, i64 1
+    %4 = bitcast ptr addrspace(4) %3 to ptr addrspace(4), !amdgpu.uniform !3, !amdgpu.noclobber !3
+    %5 = load <2 x i32>, ptr addrspace(4) %4, align 4, !invariant.load !3
    %6 = extractelement <2 x i32> %5, i32 0
    %7 = extractelement <2 x i32> %5, i32 1
    %8 = lshr i32 %6, 16
@ -31,69 +31,69 @@
    %14 = mul nuw nsw i32 %10, %7
    %15 = add i32 %13, %14
    %16 = add i32 %15, %11
-    %17 = getelementptr inbounds [256 x [16 x i8]], [256 x [16 x i8]] addrspace(3)* @sched_dbg_value_crash.tmp6, i32 0, i32 %16
-    %tmp7 = load i64, i64 addrspace(4)* null, align 536870912
+    %17 = getelementptr inbounds [256 x [16 x i8]], ptr addrspace(3) @sched_dbg_value_crash.tmp6, i32 0, i32 %16
+    %tmp7 = load i64, ptr addrspace(4) null, align 536870912
    %tmp8 = tail call i32 @llvm.amdgcn.workitem.id.x() #3, !range !4
    %tmp9 = zext i32 %tmp8 to i64
    %tmp10 = add i64 %tmp7, %tmp9
    %tmp11 = shl i64 %tmp10, 32
    %tmp12 = ashr exact i64 %tmp11, 32
-    %tmp13 = getelementptr inbounds %struct.widget.0, %struct.widget.0 addrspace(1)* %arg2, i64 %tmp12, i32 1
-    %tmp14 = load i32, i32 addrspace(1)* %tmp13, align 4
-    %tmp15 = getelementptr inbounds %struct.baz, %struct.baz addrspace(1)* %arg3, i64 %tmp12, i32 1
-    %tmp16 = load <4 x float>, <4 x float> addrspace(1)* %tmp15, align 16
+    %tmp13 = getelementptr inbounds %struct.widget.0, ptr addrspace(1) %arg2, i64 %tmp12, i32 1
+    %tmp14 = load i32, ptr addrspace(1) %tmp13, align 4
+    %tmp15 = getelementptr inbounds %struct.baz, ptr addrspace(1) %arg3, i64 %tmp12, i32 1
+    %tmp16 = load <4 x float>, ptr addrspace(1) %tmp15, align 16
    %tmp17 = sext i32 %tmp14 to i64
-    %tmp18 = load i32, i32 addrspace(1)* %0, align 4
+    %tmp18 = load i32, ptr addrspace(1) %0, align 4
    %tmp19 = zext i32 %tmp18 to i64
    %tmp20 = shl nuw nsw i64 %tmp19, 2
-    %tmp21 = getelementptr inbounds i8, i8 addrspace(1)* %arg, i64 %tmp20
-    %tmp22 = bitcast i8 addrspace(1)* %tmp21 to %struct.wombat.1 addrspace(1)*
-    %tmp23 = bitcast %struct.wombat addrspace(5)* %tmp5 to i8 addrspace(5)*
-    call void @llvm.lifetime.start.p5i8(i64 144, i8 addrspace(5)* nonnull %tmp23) #3
-    %tmp24 = getelementptr inbounds %struct.wombat, %struct.wombat addrspace(5)* %tmp5, i32 0, i32 6
-    %tmp25 = getelementptr i32, i32 addrspace(1)* %arg1, i64 3, !amdgpu.uniform !3, !amdgpu.noclobber !3
-    %tmp26 = load i32, i32 addrspace(1)* %tmp25, align 4
+    %tmp21 = getelementptr inbounds i8, ptr addrspace(1) %arg, i64 %tmp20
+    %tmp22 = bitcast ptr addrspace(1) %tmp21 to ptr addrspace(1)
+    %tmp23 = bitcast ptr addrspace(5) %tmp5 to ptr addrspace(5)
+    call void @llvm.lifetime.start.p5(i64 144, ptr addrspace(5) nonnull %tmp23) #3
+    %tmp24 = getelementptr inbounds %struct.wombat, ptr addrspace(5) %tmp5, i32 0, i32 6
+    %tmp25 = getelementptr i32, ptr addrspace(1) %arg1, i64 3, !amdgpu.uniform !3, !amdgpu.noclobber !3
+    %tmp26 = load i32, ptr addrspace(1) %tmp25, align 4
    %tmp27 = zext i32 %tmp26 to i64
    %tmp28 = shl nuw nsw i64 %tmp27, 2
-    %tmp29 = getelementptr inbounds i8, i8 addrspace(1)* %arg, i64 %tmp28
-    %tmp30 = bitcast i8 addrspace(1)* %tmp29 to <2 x float> addrspace(1)*
-    %tmp31 = getelementptr inbounds %struct.wombat.1, %struct.wombat.1 addrspace(1)* %tmp22, i64 %tmp17, i32 2, i64 0
-    %18 = bitcast i32 addrspace(1)* %tmp31 to <3 x i32> addrspace(1)*
-    %19 = load <3 x i32>, <3 x i32> addrspace(1)* %18, align 4
+    %tmp29 = getelementptr inbounds i8, ptr addrspace(1) %arg, i64 %tmp28
+    %tmp30 = bitcast ptr addrspace(1) %tmp29 to ptr addrspace(1)
+    %tmp31 = getelementptr inbounds %struct.wombat.1, ptr addrspace(1) %tmp22, i64 %tmp17, i32 2, i64 0
+    %18 = bitcast ptr addrspace(1) %tmp31 to ptr addrspace(1)
+    %19 = load <3 x i32>, ptr addrspace(1) %18, align 4
    %tmp325 = extractelement <3 x i32> %19, i32 0
    %tmp386 = extractelement <3 x i32> %19, i32 1
    %tmp447 = extractelement <3 x i32> %19, i32 2
    %tmp33 = sext i32 %tmp325 to i64
-    %tmp34 = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %tmp30, i64 %tmp33
-    %tmp35 = load <2 x float>, <2 x float> addrspace(1)* %tmp34, align 8
+    %tmp34 = getelementptr inbounds <2 x float>, ptr addrspace(1) %tmp30, i64 %tmp33
+    %tmp35 = load <2 x float>, ptr addrspace(1) %tmp34, align 8
    %tmp36 = extractelement <2 x float> %tmp35, i32 1
    %tmp39 = sext i32 %tmp386 to i64
-    %tmp40 = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %tmp30, i64 %tmp39
-    %tmp41 = load <2 x float>, <2 x float> addrspace(1)* %tmp40, align 8
+    %tmp40 = getelementptr inbounds <2 x float>, ptr addrspace(1) %tmp30, i64 %tmp39
+    %tmp41 = load <2 x float>, ptr addrspace(1) %tmp40, align 8
    %tmp42 = extractelement <2 x float> %tmp41, i32 1
    %tmp45 = sext i32 %tmp447 to i64
-    %tmp46 = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %tmp30, i64 %tmp45
-    %tmp47 = load <2 x float>, <2 x float> addrspace(1)* %tmp46, align 8
+    %tmp46 = getelementptr inbounds <2 x float>, ptr addrspace(1) %tmp30, i64 %tmp45
+    %tmp47 = load <2 x float>, ptr addrspace(1) %tmp46, align 8
    %tmp48 = extractelement <2 x float> %tmp47, i32 1
-    %tmp49 = getelementptr i32, i32 addrspace(1)* %arg1, i64 1, !amdgpu.uniform !3, !amdgpu.noclobber !3
-    %tmp50 = load i32, i32 addrspace(1)* %tmp49, align 4
+    %tmp49 = getelementptr i32, ptr addrspace(1) %arg1, i64 1, !amdgpu.uniform !3, !amdgpu.noclobber !3
+    %tmp50 = load i32, ptr addrspace(1) %tmp49, align 4
    %tmp51 = zext i32 %tmp50 to i64
    %tmp52 = shl nuw nsw i64 %tmp51, 2
-    %tmp53 = getelementptr inbounds i8, i8 addrspace(1)* %arg, i64 %tmp52
-    %tmp54 = bitcast i8 addrspace(1)* %tmp53 to <4 x float> addrspace(1)*
-    %tmp55 = getelementptr inbounds %struct.wombat.1, %struct.wombat.1 addrspace(1)* %tmp22, i64 %tmp17, i32 0, i64 0
-    %20 = bitcast i32 addrspace(1)* %tmp55 to <2 x i32> addrspace(1)*
-    %21 = load <2 x i32>, <2 x i32> addrspace(1)* %20, align 4
+    %tmp53 = getelementptr inbounds i8, ptr addrspace(1) %arg, i64 %tmp52
+    %tmp54 = bitcast ptr addrspace(1) %tmp53 to ptr addrspace(1)
+    %tmp55 = getelementptr inbounds %struct.wombat.1, ptr addrspace(1) %tmp22, i64 %tmp17, i32 0, i64 0
+    %20 = bitcast ptr addrspace(1) %tmp55 to ptr addrspace(1)
+    %21 = load <2 x i32>, ptr addrspace(1) %20, align 4
    %tmp568 = extractelement <2 x i32> %21, i32 0
    %tmp639 = extractelement <2 x i32> %21, i32 1
    %tmp57 = sext i32 %tmp568 to i64
-    %tmp58 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %tmp54, i64 %tmp57
-    %tmp59 = load <4 x float>, <4 x float> addrspace(1)* %tmp58, align 16
+    %tmp58 = getelementptr inbounds <4 x float>, ptr addrspace(1) %tmp54, i64 %tmp57
+    %tmp59 = load <4 x float>, ptr addrspace(1) %tmp58, align 16
    %tmp60 = extractelement <4 x float> %tmp59, i32 0
    %tmp61 = extractelement <4 x float> %tmp59, i32 1
    %tmp64 = sext i32 %tmp639 to i64
-    %tmp65 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %tmp54, i64 %tmp64
-    %tmp66 = load <4 x float>, <4 x float> addrspace(1)* %tmp65, align 16
+    %tmp65 = getelementptr inbounds <4 x float>, ptr addrspace(1) %tmp54, i64 %tmp64
+    %tmp66 = load <4 x float>, ptr addrspace(1) %tmp65, align 16
    %tmp67 = extractelement <4 x float> %tmp16, i64 0
    %tmp69 = fsub fast float -0.000000e+00, %tmp67
    %tmp70 = fmul float %tmp67, 0.000000e+00
@ -103,7 +103,7 @@
    %tmp74 = insertelement <4 x float> <float undef, float undef, float undef, float 0.000000e+00>, float %tmp69, i32 0
    %tmp75 = insertelement <4 x float> %tmp74, float %tmp71, i32 1
    %tmp76 = insertelement <4 x float> %tmp75, float %tmp73, i32 2
-    store <4 x float> %tmp76, <4 x float> addrspace(5)* %tmp24, align 16
+    store <4 x float> %tmp76, ptr addrspace(5) %tmp24, align 16
    %tmp77 = fsub float undef, %tmp60
    %tmp78 = fsub float undef, %tmp61
    %tmp79 = extractelement <4 x float> %tmp66, i32 2
@ -128,27 +128,27 @@
    %fadd = fadd <2 x float> %fmul, undef
    %extractelement = extractelement <2 x float> %fadd, i64 1
    %tmp96 = fsub float %extractelement, %tmp95
-    %tmp97 = getelementptr inbounds %struct.wombat, %struct.wombat addrspace(5)* %tmp5, i32 0, i32 8, i32 1
-    call void @func(float %tmp96, i64 0, i16 addrspace(5)* nonnull %tmp97) #3
-    %tmp984 = bitcast [16 x i8] addrspace(3)* %17 to i8 addrspace(3)*
-    %tmp99 = getelementptr inbounds %struct.snork, %struct.snork addrspace(1)* %arg4, i64 %tmp12, i32 8, i32 1, i64 0
-    call void @llvm.memcpy.p1i8.p3i8.i64(i8 addrspace(1)* %tmp99, i8 addrspace(3)* %tmp984, i64 16, i32 16, i1 false)
-    call void @llvm.lifetime.end.p5i8(i64 144, i8 addrspace(5)* nonnull %tmp23) #3
+    %tmp97 = getelementptr inbounds %struct.wombat, ptr addrspace(5) %tmp5, i32 0, i32 8, i32 1
+    call void @func(float %tmp96, i64 0, ptr addrspace(5) nonnull %tmp97) #3
+    %tmp984 = bitcast ptr addrspace(3) %17 to ptr addrspace(3)
+    %tmp99 = getelementptr inbounds %struct.snork, ptr addrspace(1) %arg4, i64 %tmp12, i32 8, i32 1, i64 0
+    call void @llvm.memcpy.p1.p3.i64(ptr addrspace(1) %tmp99, ptr addrspace(3) %tmp984, i64 16, i32 16, i1 false)
+    call void @llvm.lifetime.end.p5(i64 144, ptr addrspace(5) nonnull %tmp23) #3
    ret void
  }

-  declare void @func(float, i64, i16 addrspace(5)*)
-  declare void @llvm.lifetime.start.p5i8(i64, i8 addrspace(5)* nocapture) #0
+  declare void @func(float, i64, ptr addrspace(5))
+  declare void @llvm.lifetime.start.p5(i64, ptr addrspace(5) nocapture) #0
  declare float @llvm.fmuladd.f32(float, float, float) #1
-  declare void @llvm.lifetime.end.p5i8(i64, i8 addrspace(5)* nocapture) #0
+  declare void @llvm.lifetime.end.p5(i64, ptr addrspace(5) nocapture) #0
  declare <2 x float> @llvm.fmuladd.v2f32(<2 x float>, <2 x float>, <2 x float>) #1
  declare i32 @llvm.amdgcn.workitem.id.x() #1
  declare void @llvm.dbg.value(metadata, metadata, metadata) #1
-  declare i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #1
+  declare ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #1
  declare i32 @llvm.amdgcn.workitem.id.y() #1
  declare i32 @llvm.amdgcn.workitem.id.z() #1
-  declare void @llvm.memcpy.p1i8.p5i8.i64(i8 addrspace(1)* nocapture writeonly, i8 addrspace(5)* nocapture readonly, i64, i32, i1) #0
-  declare void @llvm.memcpy.p1i8.p3i8.i64(i8 addrspace(1)* nocapture writeonly, i8 addrspace(3)* nocapture readonly, i64, i32, i1) #0
+  declare void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) nocapture writeonly, ptr addrspace(5) nocapture readonly, i64, i32, i1) #0
+  declare void @llvm.memcpy.p1.p3.i64(ptr addrspace(1) nocapture writeonly, ptr addrspace(3) nocapture readonly, i64, i32, i1) #0

  attributes #0 = { argmemonly nounwind }
  attributes #1 = { nounwind readnone speculatable }
@ -203,9 +203,9 @@ body:             |
    %2:vgpr_32 = COPY $vgpr2
    %1:vgpr_32 = COPY $vgpr1
    %0:vgpr_32 = COPY $vgpr0
-    %5:sreg_64_xexec = S_LOAD_DWORDX2_IMM %4, 0, 0 :: (non-temporal dereferenceable invariant load (s64) from `i64 addrspace(4)* undef`)
-    %6:sreg_64_xexec = S_LOAD_DWORDX2_IMM %4, 8, 0 :: (non-temporal dereferenceable invariant load (s64) from `i64 addrspace(4)* undef`)
-    %7:sreg_64_xexec = S_LOAD_DWORDX2_IMM %4, 16, 0 :: (non-temporal dereferenceable invariant load (s64) from `i64 addrspace(4)* undef`)
+    %5:sreg_64_xexec = S_LOAD_DWORDX2_IMM %4, 0, 0 :: (non-temporal dereferenceable invariant load (s64) from `ptr addrspace(4) undef`)
+    %6:sreg_64_xexec = S_LOAD_DWORDX2_IMM %4, 8, 0 :: (non-temporal dereferenceable invariant load (s64) from `ptr addrspace(4) undef`)
+    %7:sreg_64_xexec = S_LOAD_DWORDX2_IMM %4, 16, 0 :: (non-temporal dereferenceable invariant load (s64) from `ptr addrspace(4) undef`)
    %8:sreg_64_xexec = S_LOAD_DWORDX2_IMM %4, 24, 0
    %9:sreg_64_xexec = S_LOAD_DWORDX2_IMM %4, 32, 0
    %10:sreg_64_xexec = S_LOAD_DWORDX2_IMM %3, 4, 0
--- a/llvm/test/CodeGen/AMDGPU/sdwa-scalar-ops.mir
+++ b/llvm/test/CodeGen/AMDGPU/sdwa-scalar-ops.mir
@ -26,7 +26,7 @@
  source_filename = "sdwa-scalar-ops.opt.ll"
  target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"

-  define amdgpu_kernel void @sdwa_imm_operand(i32 addrspace(1)* nocapture %arg) {
+  define amdgpu_kernel void @sdwa_imm_operand(ptr addrspace(1) nocapture %arg) {
  bb:
    br label %bb2

@ -35,29 +35,29 @@

  bb2:                                              ; preds = %bb2, %bb
    %lsr.iv = phi i64 [ %lsr.iv.next, %bb2 ], [ 0, %bb ]
-    %bc = bitcast i32 addrspace(1)* %arg to i8 addrspace(1)*
-    %uglygep4 = getelementptr i8, i8 addrspace(1)* %bc, i64 %lsr.iv
-    %uglygep45 = bitcast i8 addrspace(1)* %uglygep4 to i32 addrspace(1)*
-    %tmp5 = load i32, i32 addrspace(1)* %uglygep45, align 4
+    %bc = bitcast ptr addrspace(1) %arg to ptr addrspace(1)
+    %uglygep4 = getelementptr i8, ptr addrspace(1) %bc, i64 %lsr.iv
+    %uglygep45 = bitcast ptr addrspace(1) %uglygep4 to ptr addrspace(1)
+    %tmp5 = load i32, ptr addrspace(1) %uglygep45, align 4
    %tmp6 = lshr i32 %tmp5, 8
    %tmp7 = and i32 %tmp6, 255
    %tmp8 = zext i32 %tmp7 to i64
-    %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp8
-    store i32 1, i32 addrspace(1)* %tmp9, align 4
-    %scevgep = getelementptr i32, i32 addrspace(1)* %uglygep45, i64 1
-    %tmp13 = load i32, i32 addrspace(1)* %scevgep, align 4
+    %tmp9 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp8
+    store i32 1, ptr addrspace(1) %tmp9, align 4
+    %scevgep = getelementptr i32, ptr addrspace(1) %uglygep45, i64 1
+    %tmp13 = load i32, ptr addrspace(1) %scevgep, align 4
    %tmp14 = lshr i32 %tmp13, 8
    %tmp15 = and i32 %tmp14, 255
    %tmp16 = zext i32 %tmp15 to i64
-    %tmp17 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp16
-    store i32 1, i32 addrspace(1)* %tmp17, align 4
+    %tmp17 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp16
+    store i32 1, ptr addrspace(1) %tmp17, align 4
    %lsr.iv.next = add nuw nsw i64 %lsr.iv, 8
    %tmp1 = trunc i64 %lsr.iv.next to i32
    %tmp19 = icmp eq i32 %tmp1, 4096
    br i1 %tmp19, label %bb1, label %bb2
  }

-  define amdgpu_kernel void @sdwa_sgpr_operand(i32 addrspace(1)* nocapture %arg) {
+  define amdgpu_kernel void @sdwa_sgpr_operand(ptr addrspace(1) nocapture %arg) {
  bb:
    br label %bb2

@ -66,22 +66,22 @@

  bb2:                                              ; preds = %bb2, %bb
    %lsr.iv = phi i64 [ %lsr.iv.next, %bb2 ], [ 0, %bb ]
-    %bc = bitcast i32 addrspace(1)* %arg to i8 addrspace(1)*
-    %uglygep4 = getelementptr i8, i8 addrspace(1)* %bc, i64 %lsr.iv
-    %uglygep45 = bitcast i8 addrspace(1)* %uglygep4 to i32 addrspace(1)*
-    %tmp5 = load i32, i32 addrspace(1)* %uglygep45, align 4
+    %bc = bitcast ptr addrspace(1) %arg to ptr addrspace(1)
+    %uglygep4 = getelementptr i8, ptr addrspace(1) %bc, i64 %lsr.iv
+    %uglygep45 = bitcast ptr addrspace(1) %uglygep4 to ptr addrspace(1)
+    %tmp5 = load i32, ptr addrspace(1) %uglygep45, align 4
    %tmp6 = lshr i32 %tmp5, 8
    %tmp7 = and i32 %tmp6, 255
    %tmp8 = zext i32 %tmp7 to i64
-    %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp8
-    store i32 1, i32 addrspace(1)* %tmp9, align 4
-    %scevgep = getelementptr i32, i32 addrspace(1)* %uglygep45, i64 1
-    %tmp13 = load i32, i32 addrspace(1)* %scevgep, align 4
+    %tmp9 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp8
+    store i32 1, ptr addrspace(1) %tmp9, align 4
+    %scevgep = getelementptr i32, ptr addrspace(1) %uglygep45, i64 1
+    %tmp13 = load i32, ptr addrspace(1) %scevgep, align 4
    %tmp14 = lshr i32 %tmp13, 8
    %tmp15 = and i32 %tmp14, 255
    %tmp16 = zext i32 %tmp15 to i64
-    %tmp17 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp16
-    store i32 1, i32 addrspace(1)* %tmp17, align 4
+    %tmp17 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp16
+    store i32 1, ptr addrspace(1) %tmp17, align 4
    %lsr.iv.next = add nuw nsw i64 %lsr.iv, 8
    %tmp1 = trunc i64 %lsr.iv.next to i32
    %tmp19 = icmp eq i32 %tmp1, 4096
@ -203,7 +203,7 @@ body:             |
    liveins: $sgpr4_sgpr5

    %4 = COPY $sgpr4_sgpr5
-    %9 = S_LOAD_DWORDX2_IMM %4, 0, 0 :: (non-temporal dereferenceable invariant load (s64) from `i64 addrspace(4)* undef`)
+    %9 = S_LOAD_DWORDX2_IMM %4, 0, 0 :: (non-temporal dereferenceable invariant load (s64) from `ptr addrspace(4) undef`)
    %8 = S_MOV_B64 0
    %7 = COPY %9
    %30 = V_MOV_B32_e32 1, implicit $exec
@ -365,7 +365,7 @@ body:             |
    liveins: $sgpr4_sgpr5

    %4 = COPY $sgpr4_sgpr5
-    %9 = S_LOAD_DWORDX2_IMM %4, 0, 0 :: (non-temporal dereferenceable invariant load (s64) from `i64 addrspace(4)* undef`)
+    %9 = S_LOAD_DWORDX2_IMM %4, 0, 0 :: (non-temporal dereferenceable invariant load (s64) from `ptr addrspace(4) undef`)
    %8 = S_MOV_B64 0
    %7 = COPY %9
    %30 = V_MOV_B32_e32 1, implicit $exec
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll
@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12

-define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] neg_hi:[1,0,0]
@ -14,11 +14,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<8 x half> %A, <8 x half>
 bb:
  %fneg.A = fneg <8 x half> %A
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %fneg.A, <8 x half> %B, <8 x float> %C)
-  store <8 x float> %res, <8 x float> addrspace(1)* %out
+  store <8 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] neg_hi:[0,1,0]
@ -31,11 +31,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<8 x half> %A, <8 x half>
 bb:
  %fneg.B = fneg <8 x half> %B
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x float> %C)
-  store <8 x float> %res, <8 x float> addrspace(1)* %out
+  store <8 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
@ -48,11 +48,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<8 x half> %A, <8 x half>
 bb:
  %fneg.C = fneg <8 x float> %C
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.C)
-  store <8 x float> %res, <8 x float> addrspace(1)* %out
+  store <8 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1]
@ -65,11 +65,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<8 x half> %A, <8 x half>
 bb:
  %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %A, <8 x half> %B, <8 x float> %fabs.C)
-  store <8 x float> %res, <8 x float> addrspace(1)* %out
+  store <8 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
@ -82,11 +82,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<8 x i16> %A, <8 x i16>
 bb:
  %fneg.C = fneg <8 x float> %C
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x float> %fneg.C)
-  store <8 x float> %res, <8 x float> addrspace(1)* %out
+  store <8 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1]
@ -99,11 +99,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<8 x i16> %A, <8 x i16>
 bb:
  %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x float> %fabs.C)
-  store <8 x float> %res, <8 x float> addrspace(1)* %out
+  store <8 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] neg_hi:[1,0,0]
@ -114,11 +114,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<8 x half> %A, <8 x half>
 bb:
  %fneg.A = fneg <8 x half> %A
  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %fneg.A, <8 x half> %B, <8 x half> %C, i1 0)
-  store <8 x half> %res, <8 x half> addrspace(1)* %out
+  store <8 x half> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] neg_hi:[0,1,0]
@ -129,11 +129,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<8 x half> %A, <8 x half>
 bb:
  %fneg.B = fneg <8 x half> %B
  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> %C, i1 0)
-  store <8 x half> %res, <8 x half> addrspace(1)* %out
+  store <8 x half> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1]
@ -144,11 +144,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<8 x half> %A, <8 x half>
 bb:
  %fneg.C = fneg <8 x half> %C
  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.C, i1 0)
-  store <8 x half> %res, <8 x half> addrspace(1)* %out
+  store <8 x half> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1]
@ -159,11 +159,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<8 x half> %A, <8 x half>
 bb:
  %fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C)
  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fabs.C, i1 0)
-  store <8 x half> %res, <8 x half> addrspace(1)* %out
+  store <8 x half> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
@ -176,11 +176,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(<2 x i32> %A, <2 x i3
 bb:
  %fneg.C = fneg <8 x float> %C
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
-  store <8 x float> %res, <8 x float> addrspace(1)* %out
+  store <8 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
@ -193,11 +193,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(<2 x i32> %A, <2 x i3
 bb:
  %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
-  store <8 x float> %res, <8 x float> addrspace(1)* %out
+  store <8 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
@ -210,11 +210,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(<2 x i32> %A, <2 x i3
 bb:
  %fneg.C = fneg <8 x float> %C
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
-  store <8 x float> %res, <8 x float> addrspace(1)* %out
+  store <8 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
@ -227,11 +227,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(<2 x i32> %A, <2 x i3
 bb:
  %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
-  store <8 x float> %res, <8 x float> addrspace(1)* %out
+  store <8 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
@ -244,11 +244,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(<2 x i32> %A, <2 x i3
 bb:
  %fneg.C = fneg <8 x float> %C
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
-  store <8 x float> %res, <8 x float> addrspace(1)* %out
+  store <8 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
@ -261,11 +261,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(<2 x i32> %A, <2 x i3
 bb:
  %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
-  store <8 x float> %res, <8 x float> addrspace(1)* %out
+  store <8 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
@ -278,11 +278,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(<2 x i32> %A, <2 x i3
 bb:
  %fneg.C = fneg <8 x float> %C
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
-  store <8 x float> %res, <8 x float> addrspace(1)* %out
+  store <8 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
@ -295,11 +295,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(<2 x i32> %A, <2 x i3
 bb:
  %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
-  store <8 x float> %res, <8 x float> addrspace(1)* %out
+  store <8 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, <8 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] neg_hi:[1,0,0]
@ -312,11 +312,11 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<8 x half> %A, <16 x ha
 bb:
  %fneg.A = fneg <8 x half> %A
  %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x float> %C, i16 %Index)
-  store <8 x float> %res, <8 x float> addrspace(1)* %out
+  store <8 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, <8 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] neg_hi:[0,1,0]
@ -329,11 +329,11 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<8 x half> %A, <16 x ha
 bb:
  %fneg.B = fneg <16 x half> %B
  %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x float> %C, i16 %Index)
-  store <8 x float> %res, <8 x float> addrspace(1)* %out
+  store <8 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, <8 x half> addrspace(1)* %out) {
+define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] neg_hi:[1,0,0]
@ -344,11 +344,11 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<8 x half> %A, <16 x ha
 bb:
  %fneg.A = fneg <8 x half> %A
  %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x half> %C, i16 %Index)
-  store <8 x half> %res, <8 x half> addrspace(1)* %out
+  store <8 x half> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, <8 x half> addrspace(1)* %out) {
+define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] neg_hi:[0,1,0]
@ -359,13 +359,13 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<8 x half> %A, <16 x ha
 bb:
  %fneg.B = fneg <16 x half> %B
  %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x half> %C, i16 %Index)
-  store <8 x half> %res, <8 x half> addrspace(1)* %out
+  store <8 x half> %res, ptr addrspace(1) %out
  ret void
 }

 ; both neg and abs patterns (wmma matrix C f32 or f16 )

-define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] neg_hi:[0,0,1]
@ -379,11 +379,11 @@ bb:
  %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
  %fneg.fabs.C = fneg <8 x float> %fabs.C
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.fabs.C)
-  store <8 x float> %res, <8 x float> addrspace(1)* %out
+  store <8 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] neg_hi:[0,0,1]
@ -395,11 +395,11 @@ bb:
  %fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C)
  %fneg.fabs.C = fneg <8 x half> %fabs.C
  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.fabs.C, i1 0)
-  store <8 x half> %res, <8 x half> addrspace(1)* %out
+  store <8 x half> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_and_b32_e32 v11, 0x7fffffff, v11
@ -417,13 +417,13 @@ bb:
  %partial.fabs.C = insertelement <8 x float> %C, float %el3.fabs, i32 3
  %fneg.partial.fabs.C = fneg <8 x float> %partial.fabs.C
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.partial.fabs.C)
-  store <8 x float> %res, <8 x float> addrspace(1)* %out
+  store <8 x float> %res, ptr addrspace(1) %out
  ret void
 }

 ; A or B matrix modifier and constant in C

-define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0]
@ -436,11 +436,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<8 x half> %A,
 bb:
  %fneg.A = fneg <8 x half> %A
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> %fneg.A, <8 x half> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
-  store <8 x float> %res, <8 x float> addrspace(1)* %out
+  store <8 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0]
@ -451,13 +451,13 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<8 x half> %A,
 bb:
  %fneg.B = fneg <8 x half> %B
  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> <half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
-  store <8 x half> %res, <8 x half> addrspace(1)* %out
+  store <8 x half> %res, ptr addrspace(1) %out
  ret void
 }

 ; pack f16 elements with v_perm_b32 since they don't come from same b32

-define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<8 x half> %A, <8 x half> %B, ptr %Caddr, <8 x half> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<8 x half> %A, <8 x half> %B, ptr %Caddr, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    s_clause 0x1
@ -480,7 +480,7 @@ bb:
  %C_shuffle = shufflevector <16 x half> %C, <16 x half> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %fneg.C_shuffle = fneg <8 x half> %C_shuffle
  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.C_shuffle , i1 0)
-  store <8 x half> %res, <8 x half> addrspace(1)* %out
+  store <8 x half> %res, ptr addrspace(1) %out
  ret void
 }

--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll
@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12

-define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] neg_hi:[1,0,0]
@ -16,7 +16,7 @@ bb:
  ret void
 }

-define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] neg_hi:[0,1,0]
@ -27,11 +27,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<4 x half> %A, <4 x half>
 bb:
  %fneg.B = fneg <4 x half> %B
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %A, <4 x half> %fneg.B, <4 x float> %C)
-  store <4 x float> %res, <4 x float> addrspace(1)* %out
+  store <4 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
@ -42,11 +42,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<4 x half> %A, <4 x half>
 bb:
  %fneg.C = fneg <4 x float> %C
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.C)
-  store <4 x float> %res, <4 x float> addrspace(1)* %out
+  store <4 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
@ -57,11 +57,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<4 x half> %A, <4 x half>
 bb:
  %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %A, <4 x half> %B, <4 x float> %fabs.C)
-  store <4 x float> %res, <4 x float> addrspace(1)* %out
+  store <4 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
@ -72,11 +72,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<4 x i16> %A, <4 x i16>
 bb:
  %fneg.C = fneg <4 x float> %C
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x float> %fneg.C)
-  store <4 x float> %res, <4 x float> addrspace(1)* %out
+  store <4 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
@ -87,11 +87,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<4 x i16> %A, <4 x i16>
 bb:
  %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x float> %fabs.C)
-  store <4 x float> %res, <4 x float> addrspace(1)* %out
+  store <4 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] neg_hi:[1,0,0]
@ -102,11 +102,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<4 x half> %A, <4 x half>
 bb:
  %fneg.A = fneg <4 x half> %A
  %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %fneg.A, <4 x half> %B, <4 x half> %C, i1 0)
-  store <4 x half> %res, <4 x half> addrspace(1)* %out
+  store <4 x half> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] neg_hi:[0,1,0]
@ -117,11 +117,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<4 x half> %A, <4 x half>
 bb:
  %fneg.B = fneg <4 x half> %B
  %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> %C, i1 0)
-  store <4 x half> %res, <4 x half> addrspace(1)* %out
+  store <4 x half> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1]
@ -132,11 +132,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<4 x half> %A, <4 x half>
 bb:
  %fneg.C = fneg <4 x half> %C
  %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.C, i1 0)
-  store <4 x half> %res, <4 x half> addrspace(1)* %out
+  store <4 x half> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1]
@ -147,11 +147,11 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<4 x half> %A, <4 x half>
 bb:
  %fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C)
  %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fabs.C, i1 0)
-  store <4 x half> %res, <4 x half> addrspace(1)* %out
+  store <4 x half> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
@ -162,11 +162,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(i32 %A, i32 %B, <4 x
 bb:
  %fneg.C = fneg <4 x float> %C
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fneg.C)
-  store <4 x float> %res, <4 x float> addrspace(1)* %out
+  store <4 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
@ -177,11 +177,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(i32 %A, i32 %B, <4 x
 bb:
  %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fabs.C)
-  store <4 x float> %res, <4 x float> addrspace(1)* %out
+  store <4 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
@ -192,11 +192,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(i32 %A, i32 %B, <4 x
 bb:
  %fneg.C = fneg <4 x float> %C
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fneg.C)
-  store <4 x float> %res, <4 x float> addrspace(1)* %out
+  store <4 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
@ -207,11 +207,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(i32 %A, i32 %B, <4 x
 bb:
  %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fabs.C)
-  store <4 x float> %res, <4 x float> addrspace(1)* %out
+  store <4 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
@ -222,11 +222,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(i32 %A, i32 %B, <4 x
 bb:
  %fneg.C = fneg <4 x float> %C
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fneg.C)
-  store <4 x float> %res, <4 x float> addrspace(1)* %out
+  store <4 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
@ -237,11 +237,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(i32 %A, i32 %B, <4 x
 bb:
  %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fabs.C)
-  store <4 x float> %res, <4 x float> addrspace(1)* %out
+  store <4 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1]
@ -252,11 +252,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(i32 %A, i32 %B, <4 x
 bb:
  %fneg.C = fneg <4 x float> %C
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fneg.C)
-  store <4 x float> %res, <4 x float> addrspace(1)* %out
+  store <4 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1]
@ -267,11 +267,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(i32 %A, i32 %B, <4 x
 bb:
  %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32 %A, i32 %B, <4 x float> %fabs.C)
-  store <4 x float> %res, <4 x float> addrspace(1)* %out
+  store <4 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, <4 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] neg_hi:[1,0,0]
@ -282,11 +282,11 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<4 x half> %A, <8 x hal
 bb:
  %fneg.A = fneg <4 x half> %A
  %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x float> %C, i16 %Index)
-  store <4 x float> %res, <4 x float> addrspace(1)* %out
+  store <4 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, <4 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] neg_hi:[0,1,0]
@ -297,11 +297,11 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<4 x half> %A, <8 x hal
 bb:
  %fneg.B = fneg <8 x half> %B
  %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x float> %C, i16 %Index)
-  store <4 x float> %res, <4 x float> addrspace(1)* %out
+  store <4 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, <4 x half> addrspace(1)* %out) {
+define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] neg_hi:[1,0,0]
@ -312,11 +312,11 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<4 x half> %A, <8 x hal
 bb:
  %fneg.A = fneg <4 x half> %A
  %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x half> %C, i16 %Index)
-  store <4 x half> %res, <4 x half> addrspace(1)* %out
+  store <4 x half> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, <4 x half> addrspace(1)* %out) {
+define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] neg_hi:[0,1,0]
@ -327,13 +327,13 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<4 x half> %A, <8 x hal
 bb:
  %fneg.B = fneg <8 x half> %B
  %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x half> %C, i16 %Index)
-  store <4 x half> %res, <4 x half> addrspace(1)* %out
+  store <4 x half> %res, ptr addrspace(1) %out
  ret void
 }

 ; both neg and abs patterns (wmma matrix C f32 or f16 )

-define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] neg_hi:[0,0,1]
@ -345,11 +345,11 @@ bb:
  %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C)
  %fneg.fabs.C = fneg <4 x float> %fabs.C
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.fabs.C)
-  store <4 x float> %res, <4 x float> addrspace(1)* %out
+  store <4 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] neg_hi:[0,0,1]
@ -361,11 +361,11 @@ bb:
  %fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C)
  %fneg.fabs.C = fneg <4 x half> %fabs.C
  %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.fabs.C, i1 0)
-  store <4 x half> %res, <4 x half> addrspace(1)* %out
+  store <4 x half> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_and_b32_e32 v7, 0x7fffffff, v7
@ -381,13 +381,13 @@ bb:
  %partial.fabs.C = insertelement <4 x float> %C, float %el3.fabs, i32 3
  %fneg.partial.fabs.C = fneg <4 x float> %partial.fabs.C
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.partial.fabs.C)
-  store <4 x float> %res, <4 x float> addrspace(1)* %out
+  store <4 x float> %res, ptr addrspace(1) %out
  ret void
 }

 ; A or B matrix modifier and constant in C

-define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0]
@ -398,11 +398,11 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<4 x half> %A,
 bb:
  %fneg.A = fneg <4 x half> %A
  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> %fneg.A, <4 x half> %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
-  store <4 x float> %res, <4 x float> addrspace(1)* %out
+  store <4 x float> %res, ptr addrspace(1) %out
  ret void
 }

-define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0]
@ -413,13 +413,13 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<4 x half> %A,
 bb:
  %fneg.B = fneg <4 x half> %B
  %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
-  store <4 x half> %res, <4 x half> addrspace(1)* %out
+  store <4 x half> %res, ptr addrspace(1) %out
  ret void
 }

 ; pack f16 elements with v_perm_b32 since they don't come from same b32

-define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<4 x half> %A, <4 x half> %B, ptr %Caddr, <4 x half> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<4 x half> %A, <4 x half> %B, ptr %Caddr, ptr addrspace(1) %out) {
 ; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack:
 ; GFX12:       ; %bb.0: ; %bb
 ; GFX12-NEXT:    flat_load_b128 v[8:11], v[4:5]
@ -437,7 +437,7 @@ bb:
  %C_shuffle = shufflevector <8 x half> %C, <8 x half> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %fneg.C_shuffle = fneg <4 x half> %C_shuffle
  %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.C_shuffle , i1 0)
-  store <4 x half> %res, <4 x half> addrspace(1)* %out
+  store <4 x half> %res, ptr addrspace(1) %out
  ret void
 }