[NFC][AMDGPU] Update tests to use autogened CHECKs (#140311)

Chinmay Deshpande 2025-05-16 17:22:19 -07:00 committed by GitHub
parent 286ab11dc6
commit 437195efbf
9 changed files with 11027 additions and 1582 deletions
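
The CHECK lines below were generated mechanically rather than written by hand. As a rough sketch of the workflow (the build path and test name here are illustrative, not taken from this commit), regenerating a test looks like:

    llvm/utils/update_llc_test_checks.py --llc-binary=build/bin/llc \
        llvm/test/CodeGen/AMDGPU/some-test.ll

The script executes each RUN line, captures the llc output, and rewrites the ;-prefixed CHECK blocks in place; the "UTC_ARGS: --version 5" marker in each file's NOTE line records the options used to generate them.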

@@ -1,3 +1,4 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=SI,FUNC,GFX7 %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=SI,FUNC,GFX8 %s
@@ -10,21 +11,70 @@
; Instructions with B32, U32, and I32 in their name take 32-bit operands, while
; instructions with B64, U64, and I64 take 64-bit operands.
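; As an illustrative sketch (not part of this test): loading an i64 through the
; same LDS pointer would select the 64-bit form of the read instead, e.g.
;   %v = load i64, ptr addrspace(3) %in   ; -> ds_read_b64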
; FUNC-LABEL: {{^}}local_address_load:
; SI: v_mov_b32_e{{32|64}} [[PTR:v[0-9]]]
; SI: ds_read_b32 v{{[0-9]+}}, [[PTR]]
define amdgpu_kernel void @local_address_load(ptr addrspace(1) %out, ptr addrspace(3) %in) {
; GFX7-LABEL: local_address_load:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dword s2, s[4:5], 0xb
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: ds_read_b32 v0, v0
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: local_address_load:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dword s2, s[4:5], 0x2c
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: ds_read_b32 v0, v0
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
entry:
%0 = load i32, ptr addrspace(3) %in
store i32 %0, ptr addrspace(1) %out
ret void
}
; FUNC-LABEL: {{^}}local_address_gep:
; SI: s_add_i32 [[SPTR:s[0-9]]]
; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
; SI: ds_read_b32 [[VPTR]]
define amdgpu_kernel void @local_address_gep(ptr addrspace(1) %out, ptr addrspace(3) %in, i32 %offset) {
; GFX7-LABEL: local_address_gep:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b32 s3, s3, 2
; GFX7-NEXT: s_add_i32 s2, s2, s3
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: ds_read_b32 v0, v0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: local_address_gep:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b32 s3, s3, 2
; GFX8-NEXT: s_add_i32 s2, s2, s3
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: ds_read_b32 v0, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
entry:
%0 = getelementptr i32, ptr addrspace(3) %in, i32 %offset
%1 = load i32, ptr addrspace(3) %0
@@ -32,10 +82,34 @@ entry:
ret void
}
; FUNC-LABEL: {{^}}local_address_gep_const_offset:
; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], s{{[0-9]+}}
; SI: ds_read_b32 v{{[0-9]+}}, [[VPTR]] offset:4
define amdgpu_kernel void @local_address_gep_const_offset(ptr addrspace(1) %out, ptr addrspace(3) %in) {
; GFX7-LABEL: local_address_gep_const_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dword s2, s[4:5], 0xb
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: ds_read_b32 v0, v0 offset:4
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: local_address_gep_const_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dword s2, s[4:5], 0x2c
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: ds_read_b32 v0, v0 offset:4
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
entry:
%0 = getelementptr i32, ptr addrspace(3) %in, i32 1
%1 = load i32, ptr addrspace(3) %0
@@ -44,11 +118,36 @@ entry:
}
; Offset too large, can't fold into 16-bit immediate offset.
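; (Arithmetic check: the DS immediate offset is an unsigned 16-bit byte offset,
; so the largest foldable value is 65535; here the byte offset is
; 16385 * 4 = 65540 = 0x10004, which is why it is added with s_add_i32 instead.)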
; FUNC-LABEL: {{^}}local_address_gep_large_const_offset:
; SI: s_add_i32 [[SPTR:s[0-9]]], s{{[0-9]+}}, 0x10004
; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
; SI: ds_read_b32 [[VPTR]]
define amdgpu_kernel void @local_address_gep_large_const_offset(ptr addrspace(1) %out, ptr addrspace(3) %in) {
; GFX7-LABEL: local_address_gep_large_const_offset:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dword s2, s[4:5], 0xb
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_i32 s2, s2, 0x10004
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: ds_read_b32 v0, v0
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: local_address_gep_large_const_offset:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dword s2, s[4:5], 0x2c
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_i32 s2, s2, 0x10004
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: ds_read_b32 v0, v0
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
entry:
%0 = getelementptr i32, ptr addrspace(3) %in, i32 16385
%1 = load i32, ptr addrspace(3) %0
@@ -56,24 +155,70 @@ entry:
ret void
}
; FUNC-LABEL: {{^}}null_32bit_lds_ptr:
; GFX7: v_cmp_ne_u32
; GFX7: s_cselect_b32
; GFX8: s_cmp_lg_u32
; GFX8-NOT: v_cmp_ne_u32
; GFX8: s_cselect_b32
define amdgpu_kernel void @null_32bit_lds_ptr(ptr addrspace(1) %out, ptr addrspace(3) %lds) nounwind {
; GFX7-LABEL: null_32bit_lds_ptr:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_load_dword s6, s[4:5], 0xb
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX7-NEXT: s_movk_i32 s4, 0x7b
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_cmp_lg_u32 s6, 0
; GFX7-NEXT: s_cselect_b32 s4, s4, 0x1c8
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: null_32bit_lds_ptr:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s6, s[4:5], 0x2c
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8-NEXT: s_movk_i32 s4, 0x7b
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_cmp_lg_u32 s6, 0
; GFX8-NEXT: s_cselect_b32 s4, s4, 0x1c8
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
%cmp = icmp ne ptr addrspace(3) %lds, null
%x = select i1 %cmp, i32 123, i32 456
store i32 %x, ptr addrspace(1) %out
ret void
}
; FUNC-LABEL: {{^}}mul_32bit_ptr:
; SI: s_mul_i32
; SI-NEXT: s_add_i32
; SI: ds_read_b32
define amdgpu_kernel void @mul_32bit_ptr(ptr addrspace(1) %out, ptr addrspace(3) %lds, i32 %tid) {
; GFX7-LABEL: mul_32bit_ptr:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mul_i32 s3, s3, 12
; GFX7-NEXT: s_add_i32 s2, s2, s3
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: ds_read_b32 v0, v0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: mul_32bit_ptr:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_mul_i32 s3, s3, 12
; GFX8-NEXT: s_add_i32 s2, s2, s3
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: ds_read_b32 v0, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
%ptr = getelementptr [3 x float], ptr addrspace(3) %lds, i32 %tid, i32 0
%val = load float, ptr addrspace(3) %ptr
store float %val, ptr addrspace(1) %out
@@ -82,60 +227,156 @@ define amdgpu_kernel void @mul_32bit_ptr(ptr addrspace(1) %out, ptr addrspace(3)
@g_lds = addrspace(3) global float poison, align 4
; FUNC-LABEL: {{^}}infer_ptr_alignment_global_offset:
; SI: v_mov_b32_e32 [[PTR:v[0-9]+]], 0{{$}}
; SI: ds_read_b32 v{{[0-9]+}}, [[PTR]]
define amdgpu_kernel void @infer_ptr_alignment_global_offset(ptr addrspace(1) %out, i32 %tid) {
; GFX7-LABEL: infer_ptr_alignment_global_offset:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mov_b32_e32 v0, 0
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX7-NEXT: ds_read_b32 v0, v0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: infer_ptr_alignment_global_offset:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8-NEXT: ds_read_b32 v0, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
%val = load float, ptr addrspace(3) @g_lds
store float %val, ptr addrspace(1) %out
ret void
}
@ptr = addrspace(3) global ptr addrspace(3) poison
@dst = addrspace(3) global [16383 x i32] poison
; FUNC-LABEL: {{^}}global_ptr:
; SI: ds_write_b32
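; (To unpack the constants below, assuming @dst is placed at LDS offset 0 with
; @ptr allocated right after it: @dst spans 16383 * 4 = 65532 bytes, so @ptr
; sits at offset 65532, and the stored pointer value @dst + 16*4 is the
; literal 64.)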
define amdgpu_kernel void @global_ptr() nounwind {
; SI-LABEL: global_ptr:
; SI: ; %bb.0:
; SI-NEXT: v_mov_b32_e32 v0, 64
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 m0, -1
; SI-NEXT: ds_write_b32 v1, v0 offset:65532
; SI-NEXT: s_endpgm
store ptr addrspace(3) getelementptr ([16383 x i32], ptr addrspace(3) @dst, i32 0, i32 16), ptr addrspace(3) @ptr
ret void
}
; FUNC-LABEL: {{^}}local_address_store:
; SI: ds_write_b32
define amdgpu_kernel void @local_address_store(ptr addrspace(3) %out, i32 %val) {
; GFX7-LABEL: local_address_store:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: ds_write_b32 v0, v1
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: local_address_store:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: ds_write_b32 v0, v1
; GFX8-NEXT: s_endpgm
store i32 %val, ptr addrspace(3) %out
ret void
}
; FUNC-LABEL: {{^}}local_address_gep_store:
; SI: s_add_i32 [[SADDR:s[0-9]+]],
; SI: v_mov_b32_e32 [[ADDR:v[0-9]+]], [[SADDR]]
; SI: ds_write_b32 [[ADDR]], v{{[0-9]+}}
define amdgpu_kernel void @local_address_gep_store(ptr addrspace(3) %out, i32, i32 %val, i32 %offset) {
; GFX7-LABEL: local_address_gep_store:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xb
; GFX7-NEXT: s_load_dword s2, s[4:5], 0x9
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_lshl_b32 s1, s1, 2
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: s_add_i32 s0, s2, s1
; GFX7-NEXT: v_mov_b32_e32 v1, s0
; GFX7-NEXT: ds_write_b32 v1, v0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: local_address_gep_store:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2c
; GFX8-NEXT: s_load_dword s2, s[4:5], 0x24
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshl_b32 s1, s1, 2
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: s_add_i32 s0, s2, s1
; GFX8-NEXT: v_mov_b32_e32 v1, s0
; GFX8-NEXT: ds_write_b32 v1, v0
; GFX8-NEXT: s_endpgm
%gep = getelementptr i32, ptr addrspace(3) %out, i32 %offset
store i32 %val, ptr addrspace(3) %gep, align 4
ret void
}
; FUNC-LABEL: {{^}}local_address_gep_const_offset_store:
; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], s{{[0-9]+}}
; SI: v_mov_b32_e32 [[VAL:v[0-9]+]], s{{[0-9]+}}
; SI: ds_write_b32 [[VPTR]], [[VAL]] offset:4
define amdgpu_kernel void @local_address_gep_const_offset_store(ptr addrspace(3) %out, i32 %val) {
; GFX7-LABEL: local_address_gep_const_offset_store:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: ds_write_b32 v0, v1 offset:4
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: local_address_gep_const_offset_store:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: ds_write_b32 v0, v1 offset:4
; GFX8-NEXT: s_endpgm
%gep = getelementptr i32, ptr addrspace(3) %out, i32 1
store i32 %val, ptr addrspace(3) %gep, align 4
ret void
}
; Offset too large, can't fold into 16-bit immediate offset.
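; (As above: 16385 * 4 = 0x10004 exceeds the 16-bit maximum of 0xffff.)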
; FUNC-LABEL: {{^}}local_address_gep_large_const_offset_store:
; SI: s_add_i32 [[SPTR:s[0-9]]], s{{[0-9]+}}, 0x10004
; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
; SI: ds_write_b32 [[VPTR]], v{{[0-9]+$}}
define amdgpu_kernel void @local_address_gep_large_const_offset_store(ptr addrspace(3) %out, i32 %val) {
; GFX7-LABEL: local_address_gep_large_const_offset_store:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; GFX7-NEXT: s_mov_b32 m0, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_i32 s0, s0, 0x10004
; GFX7-NEXT: v_mov_b32_e32 v0, s1
; GFX7-NEXT: v_mov_b32_e32 v1, s0
; GFX7-NEXT: ds_write_b32 v1, v0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: local_address_gep_large_const_offset_store:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8-NEXT: s_mov_b32 m0, -1
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_add_i32 s0, s0, 0x10004
; GFX8-NEXT: v_mov_b32_e32 v0, s1
; GFX8-NEXT: v_mov_b32_e32 v1, s0
; GFX8-NEXT: ds_write_b32 v1, v0
; GFX8-NEXT: s_endpgm
%gep = getelementptr i32, ptr addrspace(3) %out, i32 16385
store i32 %val, ptr addrspace(3) %gep, align 4
ret void
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; FUNC: {{.*}}

@@ -1,11 +1,17 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
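; These tests cover the combine that folds a bitwise op with an all-zeros or
; all-ones select, e.g. (and %y, (select %c, 0, -1)) -> (select %c, 0, %y), so
; a scalar s_cselect_b32 is emitted and no v_and_b32/v_or_b32 remains.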
; GCN-LABEL: {{^}}select_and1:
; GCN: s_cselect_b32 [[SEL:s[0-9]+]], s{{[0-9]+}},
; GCN: v_mov_b32_e32 [[VSEL:v[0-9]+]], [[SEL]]
; GCN-NOT: v_and_b32
; GCN: store_dword v{{[0-9]+}}, [[VSEL]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @select_and1(ptr addrspace(1) %p, i32 %x, i32 %y) {
; GCN-LABEL: select_and1:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_gt_i32 s2, 10
; GCN-NEXT: s_cselect_b32 s2, s3, 0
; GCN-NEXT: v_mov_b32_e32 v1, s2
; GCN-NEXT: global_store_dword v0, v1, s[0:1]
; GCN-NEXT: s_endpgm
%c = icmp slt i32 %x, 11
%s = select i1 %c, i32 0, i32 -1
%a = and i32 %y, %s
@@ -13,12 +19,17 @@ define amdgpu_kernel void @select_and1(ptr addrspace(1) %p, i32 %x, i32 %y) {
ret void
}
; GCN-LABEL: {{^}}select_and2:
; GCN: s_cselect_b32 [[SEL:s[0-9]+]], s{{[0-9]+}},
; GCN: v_mov_b32_e32 [[VSEL:v[0-9]+]], [[SEL]]
; GCN-NOT: v_and_b32
; GCN: store_dword v{{[0-9]+}}, [[VSEL]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @select_and2(ptr addrspace(1) %p, i32 %x, i32 %y) {
; GCN-LABEL: select_and2:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_gt_i32 s2, 10
; GCN-NEXT: s_cselect_b32 s2, s3, 0
; GCN-NEXT: v_mov_b32_e32 v1, s2
; GCN-NEXT: global_store_dword v0, v1, s[0:1]
; GCN-NEXT: s_endpgm
%c = icmp slt i32 %x, 11
%s = select i1 %c, i32 0, i32 -1
%a = and i32 %s, %y
@@ -26,12 +37,17 @@ define amdgpu_kernel void @select_and2(ptr addrspace(1) %p, i32 %x, i32 %y) {
ret void
}
; GCN-LABEL: {{^}}select_and3:
; GCN: s_cselect_b32 [[SEL:s[0-9]+]], s{{[0-9]+}},
; GCN: v_mov_b32_e32 [[VSEL:v[0-9]+]], [[SEL]]
; GCN-NOT: v_and_b32
; GCN: store_dword v{{[0-9]+}}, [[VSEL]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @select_and3(ptr addrspace(1) %p, i32 %x, i32 %y) {
; GCN-LABEL: select_and3:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lt_i32 s2, 11
; GCN-NEXT: s_cselect_b32 s2, s3, 0
; GCN-NEXT: v_mov_b32_e32 v1, s2
; GCN-NEXT: global_store_dword v0, v1, s[0:1]
; GCN-NEXT: s_endpgm
%c = icmp slt i32 %x, 11
%s = select i1 %c, i32 -1, i32 0
%a = and i32 %y, %s
@@ -39,18 +55,25 @@ define amdgpu_kernel void @select_and3(ptr addrspace(1) %p, i32 %x, i32 %y) {
ret void
}
; GCN-LABEL: {{^}}select_and_v4:
; GCN: s_cselect_b32 s[[SEL0:[0-9]+]], s{{[0-9]+}}, 0
; GCN: s_cselect_b32 s[[SEL1:[0-9]+]], s{{[0-9]+}}, 0
; GCN: s_cselect_b32 s[[SEL2:[0-9]+]], s{{[0-9]+}}, 0
; GCN: s_cselect_b32 s[[SEL3:[0-9]+]], s{{[0-9]+}}, 0
; GCN: v_mov_b32_e32 v[[V0:[0-9]+]], s[[SEL3]]
; GCN: v_mov_b32_e32 v[[V1:[0-9]+]], s[[SEL2]]
; GCN: v_mov_b32_e32 v[[V2:[0-9]+]], s[[SEL1]]
; GCN: v_mov_b32_e32 v[[V3:[0-9]+]], s[[SEL0]]
; GCN-NOT: v_and_b32
; GCN: global_store_dwordx4 v{{[0-9]+}}, v[[[V0]]:[[V3]]]
define amdgpu_kernel void @select_and_v4(ptr addrspace(1) %p, i32 %x, <4 x i32> %y) {
; GCN-LABEL: select_and_v4:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s8, s[4:5], 0x2c
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v4, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_gt_i32 s8, 10
; GCN-NEXT: s_cselect_b32 s3, s3, 0
; GCN-NEXT: s_cselect_b32 s2, s2, 0
; GCN-NEXT: s_cselect_b32 s1, s1, 0
; GCN-NEXT: s_cselect_b32 s0, s0, 0
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v3, s3
; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GCN-NEXT: s_endpgm
%c = icmp slt i32 %x, 11
%s = select i1 %c, <4 x i32> zeroinitializer, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
%a = and <4 x i32> %s, %y
@@ -58,12 +81,17 @@ define amdgpu_kernel void @select_and_v4(ptr addrspace(1) %p, i32 %x, <4 x i32>
ret void
}
; GCN-LABEL: {{^}}select_or1:
; GCN: s_cselect_b32 [[SEL:s[0-9]+]], s{{[0-9]+}},
; GCN: v_mov_b32_e32 [[VSEL:v[0-9]+]], [[SEL]]
; GCN-NOT: v_or_b32
; GCN: store_dword v{{[0-9]+}}, [[VSEL]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @select_or1(ptr addrspace(1) %p, i32 %x, i32 %y) {
; GCN-LABEL: select_or1:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lt_i32 s2, 11
; GCN-NEXT: s_cselect_b32 s2, s3, -1
; GCN-NEXT: v_mov_b32_e32 v1, s2
; GCN-NEXT: global_store_dword v0, v1, s[0:1]
; GCN-NEXT: s_endpgm
%c = icmp slt i32 %x, 11
%s = select i1 %c, i32 0, i32 -1
%a = or i32 %y, %s
@@ -71,12 +99,17 @@ define amdgpu_kernel void @select_or1(ptr addrspace(1) %p, i32 %x, i32 %y) {
ret void
}
; GCN-LABEL: {{^}}select_or2:
; GCN: s_cselect_b32 [[SEL:s[0-9]+]], s{{[0-9]+}},
; GCN: v_mov_b32_e32 [[VSEL:v[0-9]+]], [[SEL]]
; GCN-NOT: v_or_b32
; GCN: store_dword v{{[0-9]+}}, [[VSEL]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @select_or2(ptr addrspace(1) %p, i32 %x, i32 %y) {
; GCN-LABEL: select_or2:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lt_i32 s2, 11
; GCN-NEXT: s_cselect_b32 s2, s3, -1
; GCN-NEXT: v_mov_b32_e32 v1, s2
; GCN-NEXT: global_store_dword v0, v1, s[0:1]
; GCN-NEXT: s_endpgm
%c = icmp slt i32 %x, 11
%s = select i1 %c, i32 0, i32 -1
%a = or i32 %s, %y
@@ -84,12 +117,17 @@ define amdgpu_kernel void @select_or2(ptr addrspace(1) %p, i32 %x, i32 %y) {
ret void
}
; GCN-LABEL: {{^}}select_or3:
; GCN: s_cselect_b32 [[SEL:s[0-9]+]], s{{[0-9]+}},
; GCN: v_mov_b32_e32 [[VSEL:v[0-9]+]], [[SEL]]
; GCN-NOT: v_or_b32
; GCN: store_dword v{{[0-9]+}}, [[VSEL]], s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @select_or3(ptr addrspace(1) %p, i32 %x, i32 %y) {
; GCN-LABEL: select_or3:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_gt_i32 s2, 10
; GCN-NEXT: s_cselect_b32 s2, s3, -1
; GCN-NEXT: v_mov_b32_e32 v1, s2
; GCN-NEXT: global_store_dword v0, v1, s[0:1]
; GCN-NEXT: s_endpgm
%c = icmp slt i32 %x, 11
%s = select i1 %c, i32 -1, i32 0
%a = or i32 %y, %s
@@ -97,18 +135,25 @@ define amdgpu_kernel void @select_or3(ptr addrspace(1) %p, i32 %x, i32 %y) {
ret void
}
; GCN-LABEL: {{^}}select_or_v4:
; GCN: s_cselect_b32 s[[SEL0:[0-9]+]], s{{[0-9]+}}, -1
; GCN: s_cselect_b32 s[[SEL1:[0-9]+]], s{{[0-9]+}}, -1
; GCN: s_cselect_b32 s[[SEL2:[0-9]+]], s{{[0-9]+}}, -1
; GCN: s_cselect_b32 s[[SEL3:[0-9]+]], s{{[0-9]+}}, -1
; GCN-NOT: v_or_b32
; GCN: v_mov_b32_e32 v[[V0:[0-9]+]], s[[SEL3]]
; GCN: v_mov_b32_e32 v[[V1:[0-9]+]], s[[SEL2]]
; GCN: v_mov_b32_e32 v[[V2:[0-9]+]], s[[SEL1]]
; GCN: v_mov_b32_e32 v[[V3:[0-9]+]], s[[SEL0]]
; GCN: global_store_dwordx4 v{{[0-9]+}}, v[[[V0]]:[[V3]]]
define amdgpu_kernel void @select_or_v4(ptr addrspace(1) %p, i32 %x, <4 x i32> %y) {
; GCN-LABEL: select_or_v4:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s8, s[4:5], 0x2c
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34
; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v4, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_lt_i32 s8, 11
; GCN-NEXT: s_cselect_b32 s3, s3, -1
; GCN-NEXT: s_cselect_b32 s2, s2, -1
; GCN-NEXT: s_cselect_b32 s1, s1, -1
; GCN-NEXT: s_cselect_b32 s0, s0, -1
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v3, s3
; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; GCN-NEXT: s_endpgm
%c = icmp slt i32 %x, 11
%s = select i1 %c, <4 x i32> zeroinitializer, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
%a = or <4 x i32> %s, %y
@@ -116,192 +161,360 @@ define amdgpu_kernel void @select_or_v4(ptr addrspace(1) %p, i32 %x, <4 x i32> %
ret void
}
; GCN-LABEL: {{^}}sel_constants_sub_constant_sel_constants:
; GCN: s_cselect_b32 s{{[0-9]+}}, 9, 2
define amdgpu_kernel void @sel_constants_sub_constant_sel_constants(ptr addrspace(1) %p, i1 %cond) {
; GCN-LABEL: sel_constants_sub_constant_sel_constants:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bitcmp1_b32 s2, 0
; GCN-NEXT: s_cselect_b32 s2, 9, 2
; GCN-NEXT: v_mov_b32_e32 v1, s2
; GCN-NEXT: global_store_dword v0, v1, s[0:1]
; GCN-NEXT: s_endpgm
%sel = select i1 %cond, i32 -4, i32 3
%bo = sub i32 5, %sel
store i32 %bo, ptr addrspace(1) %p, align 4
ret void
}
; GCN-LABEL: {{^}}sel_constants_sub_constant_sel_constants_i16:
; GCN: s_cselect_b32 s{{[0-9]+}}, 9, 2
define amdgpu_kernel void @sel_constants_sub_constant_sel_constants_i16(ptr addrspace(1) %p, i1 %cond) {
; GCN-LABEL: sel_constants_sub_constant_sel_constants_i16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bitcmp1_b32 s2, 0
; GCN-NEXT: s_cselect_b32 s2, 9, 2
; GCN-NEXT: v_mov_b32_e32 v1, s2
; GCN-NEXT: global_store_short v0, v1, s[0:1]
; GCN-NEXT: s_endpgm
%sel = select i1 %cond, i16 -4, i16 3
%bo = sub i16 5, %sel
store i16 %bo, ptr addrspace(1) %p, align 2
ret void
}
; GCN-LABEL: {{^}}sel_constants_sub_constant_sel_constants_i16_neg:
; GCN: s_cselect_b32 s[[SGPR:[0-9]+]], s[[SGPR]], 0xf449
define amdgpu_kernel void @sel_constants_sub_constant_sel_constants_i16_neg(ptr addrspace(1) %p, i1 %cond) {
; GCN-LABEL: sel_constants_sub_constant_sel_constants_i16_neg:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bitcmp1_b32 s2, 0
; GCN-NEXT: s_mov_b32 s2, 0xfffd
; GCN-NEXT: s_cselect_b32 s2, s2, 0xf449
; GCN-NEXT: v_mov_b32_e32 v1, s2
; GCN-NEXT: global_store_short v0, v1, s[0:1]
; GCN-NEXT: s_endpgm
%sel = select i1 %cond, i16 4, i16 3000
%bo = sub i16 1, %sel
store i16 %bo, ptr addrspace(1) %p, align 2
ret void
}
; GCN-LABEL: {{^}}sel_constants_sub_constant_sel_constants_v2i16:
; GCN-DAG: s_mov_b32 [[T:s[0-9]+]], 0x50009
; GCN: s_cselect_b32 s{{[0-9]+}}, [[T]], 0x60002
define amdgpu_kernel void @sel_constants_sub_constant_sel_constants_v2i16(ptr addrspace(1) %p, i1 %cond) {
; GCN-LABEL: sel_constants_sub_constant_sel_constants_v2i16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bitcmp1_b32 s2, 0
; GCN-NEXT: s_mov_b32 s2, 0x50009
; GCN-NEXT: s_cselect_b32 s2, s2, 0x60002
; GCN-NEXT: v_mov_b32_e32 v1, s2
; GCN-NEXT: global_store_dword v0, v1, s[0:1]
; GCN-NEXT: s_endpgm
%sel = select i1 %cond, <2 x i16> <i16 -4, i16 2>, <2 x i16> <i16 3, i16 1>
%bo = sub <2 x i16> <i16 5, i16 7>, %sel
store <2 x i16> %bo, ptr addrspace(1) %p, align 4
ret void
}
; GCN-LABEL: {{^}}sel_constants_sub_constant_sel_constants_v4i32:
; GCN: s_cselect_b32 s[[SEL0:[0-9]+]], 7, 14
; GCN: s_cselect_b32 s[[SEL1:[0-9]+]], 6, 10
; GCN: s_cselect_b32 s[[SEL2:[0-9]+]], 5, 6
; GCN: s_cselect_b32 s[[SEL3:[0-9]+]], 9, 2
; GCN: v_mov_b32_e32 v[[V0:[0-9]+]], s[[SEL3]]
; GCN: v_mov_b32_e32 v[[V1:[0-9]+]], s[[SEL2]]
; GCN: v_mov_b32_e32 v[[V2:[0-9]+]], s[[SEL1]]
; GCN: v_mov_b32_e32 v[[V3:[0-9]+]], s[[SEL0]]
; GCN: global_store_dwordx4 v{{[0-9]+}}, v[[[V0]]:[[V3]]]
define amdgpu_kernel void @sel_constants_sub_constant_sel_constants_v4i32(ptr addrspace(1) %p, i1 %cond) {
; GCN-LABEL: sel_constants_sub_constant_sel_constants_v4i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v4, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bitcmp1_b32 s2, 0
; GCN-NEXT: s_cselect_b32 s2, 7, 14
; GCN-NEXT: s_cselect_b32 s3, 6, 10
; GCN-NEXT: s_cselect_b32 s4, 5, 6
; GCN-NEXT: s_cselect_b32 s5, 9, 2
; GCN-NEXT: v_mov_b32_e32 v0, s5
; GCN-NEXT: v_mov_b32_e32 v1, s4
; GCN-NEXT: v_mov_b32_e32 v2, s3
; GCN-NEXT: v_mov_b32_e32 v3, s2
; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GCN-NEXT: s_endpgm
%sel = select i1 %cond, <4 x i32> <i32 -4, i32 2, i32 3, i32 4>, <4 x i32> <i32 3, i32 1, i32 -1, i32 -3>
%bo = sub <4 x i32> <i32 5, i32 7, i32 9, i32 11>, %sel
store <4 x i32> %bo, ptr addrspace(1) %p, align 32
ret void
}
; GCN-LABEL: {{^}}sdiv_constant_sel_constants_i64:
; GCN: s_cselect_b32 s{{[0-9]+}}, 0, 5
define amdgpu_kernel void @sdiv_constant_sel_constants_i64(ptr addrspace(1) %p, i1 %cond) {
; GCN-LABEL: sdiv_constant_sel_constants_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bitcmp1_b32 s2, 0
; GCN-NEXT: s_cselect_b32 s2, 0, 5
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1]
; GCN-NEXT: s_endpgm
%sel = select i1 %cond, i64 121, i64 23
%bo = sdiv i64 120, %sel
store i64 %bo, ptr addrspace(1) %p, align 8
ret void
}
; GCN-LABEL: {{^}}sdiv_constant_sel_constants_i32:
; GCN: s_cselect_b32 s{{[0-9]+}}, 26, 8
define amdgpu_kernel void @sdiv_constant_sel_constants_i32(ptr addrspace(1) %p, i1 %cond) {
; GCN-LABEL: sdiv_constant_sel_constants_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bitcmp1_b32 s2, 0
; GCN-NEXT: s_cselect_b32 s2, 26, 8
; GCN-NEXT: v_mov_b32_e32 v1, s2
; GCN-NEXT: global_store_dword v0, v1, s[0:1]
; GCN-NEXT: s_endpgm
%sel = select i1 %cond, i32 7, i32 23
%bo = sdiv i32 184, %sel
store i32 %bo, ptr addrspace(1) %p, align 8
ret void
}
; GCN-LABEL: {{^}}udiv_constant_sel_constants_i64:
; GCN: s_cselect_b32 s{{[0-9]+}}, 0, 5
define amdgpu_kernel void @udiv_constant_sel_constants_i64(ptr addrspace(1) %p, i1 %cond) {
; GCN-LABEL: udiv_constant_sel_constants_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bitcmp1_b32 s2, 0
; GCN-NEXT: s_cselect_b32 s2, 0, 5
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1]
; GCN-NEXT: s_endpgm
%sel = select i1 %cond, i64 -4, i64 23
%bo = udiv i64 120, %sel
store i64 %bo, ptr addrspace(1) %p, align 8
ret void
}
; GCN-LABEL: {{^}}srem_constant_sel_constants:
; GCN: s_cselect_b32 s{{[0-9]+}}, 33, 3
define amdgpu_kernel void @srem_constant_sel_constants(ptr addrspace(1) %p, i1 %cond) {
; GCN-LABEL: srem_constant_sel_constants:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bitcmp1_b32 s2, 0
; GCN-NEXT: s_cselect_b32 s2, 33, 3
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1]
; GCN-NEXT: s_endpgm
%sel = select i1 %cond, i64 34, i64 15
%bo = srem i64 33, %sel
store i64 %bo, ptr addrspace(1) %p, align 8
ret void
}
; GCN-LABEL: {{^}}urem_constant_sel_constants:
; GCN: s_cselect_b32 s{{[0-9]+}}, 33, 3
define amdgpu_kernel void @urem_constant_sel_constants(ptr addrspace(1) %p, i1 %cond) {
; GCN-LABEL: urem_constant_sel_constants:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bitcmp1_b32 s2, 0
; GCN-NEXT: s_cselect_b32 s2, 33, 3
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1]
; GCN-NEXT: s_endpgm
%sel = select i1 %cond, i64 34, i64 15
%bo = urem i64 33, %sel
store i64 %bo, ptr addrspace(1) %p, align 8
ret void
}
; GCN-LABEL: {{^}}shl_constant_sel_constants:
; GCN: s_cselect_b32 s{{[0-9]+}}, 4, 8
define amdgpu_kernel void @shl_constant_sel_constants(ptr addrspace(1) %p, i1 %cond) {
; GCN-LABEL: shl_constant_sel_constants:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bitcmp1_b32 s2, 0
; GCN-NEXT: s_cselect_b32 s2, 4, 8
; GCN-NEXT: v_mov_b32_e32 v1, s2
; GCN-NEXT: global_store_dword v0, v1, s[0:1]
; GCN-NEXT: s_endpgm
%sel = select i1 %cond, i32 2, i32 3
%bo = shl i32 1, %sel
store i32 %bo, ptr addrspace(1) %p, align 4
ret void
}
; GCN-LABEL: {{^}}lshr_constant_sel_constants:
; GCN: s_cselect_b32 s{{[0-9]+}}, 16, 8
define amdgpu_kernel void @lshr_constant_sel_constants(ptr addrspace(1) %p, i1 %cond) {
; GCN-LABEL: lshr_constant_sel_constants:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bitcmp1_b32 s2, 0
; GCN-NEXT: s_cselect_b32 s2, 16, 8
; GCN-NEXT: v_mov_b32_e32 v1, s2
; GCN-NEXT: global_store_dword v0, v1, s[0:1]
; GCN-NEXT: s_endpgm
%sel = select i1 %cond, i32 2, i32 3
%bo = lshr i32 64, %sel
store i32 %bo, ptr addrspace(1) %p, align 4
ret void
}
; GCN-LABEL: {{^}}ashr_constant_sel_constants:
; GCN: s_cselect_b32 s{{[0-9]+}}, 32, 16
define amdgpu_kernel void @ashr_constant_sel_constants(ptr addrspace(1) %p, i1 %cond) {
; GCN-LABEL: ashr_constant_sel_constants:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bitcmp1_b32 s2, 0
; GCN-NEXT: s_cselect_b32 s2, 32, 16
; GCN-NEXT: v_mov_b32_e32 v1, s2
; GCN-NEXT: global_store_dword v0, v1, s[0:1]
; GCN-NEXT: s_endpgm
%sel = select i1 %cond, i32 2, i32 3
%bo = ashr i32 128, %sel
store i32 %bo, ptr addrspace(1) %p, align 4
ret void
}
; GCN-LABEL: {{^}}fsub_constant_sel_constants:
; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, -4.0, 1.0,
define amdgpu_kernel void @fsub_constant_sel_constants(ptr addrspace(1) %p, i1 %cond) {
; GCN-LABEL: fsub_constant_sel_constants:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bitcmp1_b32 s2, 0
; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN-NEXT: v_cndmask_b32_e64 v1, -4.0, 1.0, s[2:3]
; GCN-NEXT: global_store_dword v0, v1, s[0:1]
; GCN-NEXT: s_endpgm
%sel = select i1 %cond, float -2.0, float 3.0
%bo = fsub float -1.0, %sel
store float %bo, ptr addrspace(1) %p, align 4
ret void
}
; GCN-LABEL: {{^}}fsub_constant_sel_constants_f16:
; TODO: it should be possible to fold these constants with OpSel
; GCN-DAG: v_mov_b32_e32 [[T:v[0-9]+]], 0x3c00
; GCN-DAG: v_mov_b32_e32 [[F:v[0-9]+]], 0xc400
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, [[F]], [[T]],
define amdgpu_kernel void @fsub_constant_sel_constants_f16(ptr addrspace(1) %p, i1 %cond) {
; GCN-LABEL: fsub_constant_sel_constants_f16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v1, 0xc400
; GCN-NEXT: v_mov_b32_e32 v2, 0x3c00
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bitcmp1_b32 s2, 0
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GCN-NEXT: global_store_short v0, v1, s[0:1]
; GCN-NEXT: s_endpgm
%sel = select i1 %cond, half -2.0, half 3.0
%bo = fsub half -1.0, %sel
store half %bo, ptr addrspace(1) %p, align 2
ret void
}
; GCN-LABEL: {{^}}fsub_constant_sel_constants_v2f16:
; GCN: s_cselect_b32 s{{[0-9]+}}, 0x45003c00, -2.0
define amdgpu_kernel void @fsub_constant_sel_constants_v2f16(ptr addrspace(1) %p, i1 %cond) {
; GCN-LABEL: fsub_constant_sel_constants_v2f16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bitcmp1_b32 s2, 0
; GCN-NEXT: s_cselect_b32 s2, 0x45003c00, -2.0
; GCN-NEXT: v_mov_b32_e32 v1, s2
; GCN-NEXT: global_store_dword v0, v1, s[0:1]
; GCN-NEXT: s_endpgm
%sel = select i1 %cond, <2 x half> <half -2.0, half -3.0>, <2 x half> <half -1.0, half 4.0>
%bo = fsub <2 x half> <half -1.0, half 2.0>, %sel
store <2 x half> %bo, ptr addrspace(1) %p, align 4
ret void
}
; GCN-LABEL: {{^}}fsub_constant_sel_constants_v4f32:
; GCN: s_mov_b32 [[T0:s[0-9]+]], 0x41500000
; GCN: s_cselect_b32 s[[SEL0:[0-9]+]], [[T0]], 0x40c00000
; GCN: s_cselect_b32 s[[SEL1:[0-9]+]], 0x41100000, 4.0
; GCN: s_cselect_b32 s[[SEL2:[0-9]+]], 0x40a00000, 2.0
; GCN: s_cselect_b32 s[[SEL3:[0-9]+]], 1.0, 0
; GCN: v_mov_b32_e32 v[[V0:[0-9]+]], s[[SEL3]]
; GCN: v_mov_b32_e32 v[[V1:[0-9]+]], s[[SEL2]]
; GCN: v_mov_b32_e32 v[[V2:[0-9]+]], s[[SEL1]]
; GCN: v_mov_b32_e32 v[[V3:[0-9]+]], s[[SEL0]]
; GCN: global_store_dwordx4 v{{[0-9]+}}, v[[[V0]]:[[V3]]]
define amdgpu_kernel void @fsub_constant_sel_constants_v4f32(ptr addrspace(1) %p, i1 %cond) {
; GCN-LABEL: fsub_constant_sel_constants_v4f32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: s_mov_b32 s3, 0x41500000
; GCN-NEXT: v_mov_b32_e32 v4, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bitcmp1_b32 s2, 0
; GCN-NEXT: s_cselect_b32 s2, s3, 0x40c00000
; GCN-NEXT: s_cselect_b32 s3, 0x41100000, 4.0
; GCN-NEXT: s_cselect_b32 s4, 0x40a00000, 2.0
; GCN-NEXT: s_cselect_b32 s5, 1.0, 0
; GCN-NEXT: v_mov_b32_e32 v0, s5
; GCN-NEXT: v_mov_b32_e32 v1, s4
; GCN-NEXT: v_mov_b32_e32 v2, s3
; GCN-NEXT: v_mov_b32_e32 v3, s2
; GCN-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GCN-NEXT: s_endpgm
%sel = select i1 %cond, <4 x float> <float -2.0, float -3.0, float -4.0, float -5.0>, <4 x float> <float -1.0, float 0.0, float 1.0, float 2.0>
%bo = fsub <4 x float> <float -1.0, float 2.0, float 5.0, float 8.0>, %sel
store <4 x float> %bo, ptr addrspace(1) %p, align 32
ret void
}
; GCN-LABEL: {{^}}fdiv_constant_sel_constants:
; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 4.0, -2.0,
define amdgpu_kernel void @fdiv_constant_sel_constants(ptr addrspace(1) %p, i1 %cond) {
; GCN-LABEL: fdiv_constant_sel_constants:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bitcmp1_b32 s2, 0
; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN-NEXT: v_cndmask_b32_e64 v1, 4.0, -2.0, s[2:3]
; GCN-NEXT: global_store_dword v0, v1, s[0:1]
; GCN-NEXT: s_endpgm
%sel = select i1 %cond, float -4.0, float 2.0
%bo = fdiv float 8.0, %sel
store float %bo, ptr addrspace(1) %p, align 4
ret void
}
; GCN-LABEL: {{^}}frem_constant_sel_constants:
; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 2.0, 1.0,
define amdgpu_kernel void @frem_constant_sel_constants(ptr addrspace(1) %p, i1 %cond) {
; GCN-LABEL: frem_constant_sel_constants:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s2, s[4:5], 0x2c
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bitcmp1_b32 s2, 0
; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0
; GCN-NEXT: v_cndmask_b32_e64 v1, 2.0, 1.0, s[2:3]
; GCN-NEXT: global_store_dword v0, v1, s[0:1]
; GCN-NEXT: s_endpgm
%sel = select i1 %cond, float -4.0, float 3.0
%bo = frem float 5.0, %sel
store float %bo, ptr addrspace(1) %p, align 4

@@ -1,3 +1,4 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI %s
declare i1 @llvm.amdgcn.class.f32(float, i32) #1
@@ -6,30 +7,40 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1
declare float @llvm.fabs.f32(float) #1
declare double @llvm.fabs.f64(double) #1
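; llvm.amdgcn.class.f32(%x, %mask) tests %x against a bitmask of floating-point
; classes and is selected to v_cmp_class_f32 (the f64 variant likewise to
; v_cmp_class_f64); the tests below check how source modifiers and mask
; operands are folded into the compare.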
; SI-LABEL: {{^}}test_class_f32:
; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; SI: v_cmp_class_f32_e32 vcc, [[SA]], [[VB]]
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_f32(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], i32 %b) #0 {
; SI-LABEL: test_class_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s6, s[4:5], 0x1c
; SI-NEXT: s_load_dword s7, s[4:5], 0x13
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: v_cmp_class_f32_e32 vcc, s7, v0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
%result = call i1 @llvm.amdgcn.class.f32(float %a, i32 %b) #1
%sext = sext i1 %result to i32
store i32 %sext, ptr addrspace(1) %out, align 4
ret void
}
; SI-LABEL: {{^}}test_class_fabs_f32:
; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |[[SA]]|, [[VB]]
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_fabs_f32(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], i32 %b) #0 {
; SI-LABEL: test_class_fabs_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s6, s[4:5], 0x1c
; SI-NEXT: s_load_dword s7, s[4:5], 0x13
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: v_cmp_class_f32_e64 s[4:5], |s7|, v0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
%a.fabs = call float @llvm.fabs.f32(float %a) #1
%result = call i1 @llvm.amdgcn.class.f32(float %a.fabs, i32 %b) #1
%sext = sext i1 %result to i32
@@ -37,15 +48,20 @@ define amdgpu_kernel void @test_class_fabs_f32(ptr addrspace(1) %out, [8 x i32],
ret void
}
; SI-LABEL: {{^}}test_class_fneg_f32:
; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -[[SA]], [[VB]]
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_fneg_f32(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], i32 %b) #0 {
; SI-LABEL: test_class_fneg_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s6, s[4:5], 0x1c
; SI-NEXT: s_load_dword s7, s[4:5], 0x13
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: v_cmp_class_f32_e64 s[4:5], -s7, v0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
%a.fneg = fsub float -0.0, %a
%result = call i1 @llvm.amdgcn.class.f32(float %a.fneg, i32 %b) #1
%sext = sext i1 %result to i32
@@ -53,15 +69,20 @@ define amdgpu_kernel void @test_class_fneg_f32(ptr addrspace(1) %out, [8 x i32],
ret void
}
; SI-LABEL: {{^}}test_class_fneg_fabs_f32:
; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|[[SA]]|, [[VB]]
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_fneg_fabs_f32(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], i32 %b) #0 {
; SI-LABEL: test_class_fneg_fabs_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s6, s[4:5], 0x1c
; SI-NEXT: s_load_dword s7, s[4:5], 0x13
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: v_cmp_class_f32_e64 s[4:5], -|s7|, v0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
%a.fabs = call float @llvm.fabs.f32(float %a) #1
%a.fneg.fabs = fsub float -0.0, %a.fabs
%result = call i1 @llvm.amdgcn.class.f32(float %a.fneg.fabs, i32 %b) #1
@@ -70,26 +91,36 @@ define amdgpu_kernel void @test_class_fneg_fabs_f32(ptr addrspace(1) %out, [8 x
ret void
}
; SI-LABEL: {{^}}test_class_1_f32:
; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; SI: v_cmp_class_f32_e64 [[COND:s\[[0-9]+:[0-9]+\]]], [[SA]], 1{{$}}
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[COND]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_1_f32(ptr addrspace(1) %out, float %a) #0 {
; SI-LABEL: test_class_1_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s6, s[4:5], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cmp_class_f32_e64 s[4:5], s6, 1
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
%result = call i1 @llvm.amdgcn.class.f32(float %a, i32 1) #1
%sext = sext i1 %result to i32
store i32 %sext, ptr addrspace(1) %out, align 4
ret void
}
; SI-LABEL: {{^}}test_class_64_f32:
; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; SI: v_cmp_class_f32_e64 [[COND:s\[[0-9]+:[0-9]+\]]], [[SA]], 64{{$}}
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[COND]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_64_f32(ptr addrspace(1) %out, float %a) #0 {
; SI-LABEL: test_class_64_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s6, s[4:5], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cmp_class_f32_e64 s[4:5], s6, 64
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
%result = call i1 @llvm.amdgcn.class.f32(float %a, i32 64) #1
%sext = sext i1 %result to i32
store i32 %sext, ptr addrspace(1) %out, align 4
@@ -97,42 +128,62 @@ define amdgpu_kernel void @test_class_64_f32(ptr addrspace(1) %out, float %a) #0
}
; Set all 10 bits of mask
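; (1023 = 0x3ff: one bit for each of the ten classes the instruction can test,
; i.e. signaling NaN, quiet NaN, +/-infinity, +/-normal, +/-subnormal, +/-zero.)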
; SI-LABEL: {{^}}test_class_full_mask_f32:
; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x3ff{{$}}
; SI: v_cmp_class_f32_e32 vcc, [[SA]], [[MASK]]
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_full_mask_f32(ptr addrspace(1) %out, float %a) #0 {
; SI-LABEL: test_class_full_mask_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_load_dword s4, s[4:5], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, 0x3ff
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
%result = call i1 @llvm.amdgcn.class.f32(float %a, i32 1023) #1
%sext = sext i1 %result to i32
store i32 %sext, ptr addrspace(1) %out, align 4
ret void
}
; SI-LABEL: {{^}}test_class_9bit_mask_f32:
; SI: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}}
; SI: v_cmp_class_f32_e32 vcc, [[SA]], [[MASK]]
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_9bit_mask_f32(ptr addrspace(1) %out, float %a) #0 {
; SI-LABEL: test_class_9bit_mask_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_load_dword s4, s[4:5], 0xb
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, 0x1ff
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
%result = call i1 @llvm.amdgcn.class.f32(float %a, i32 511) #1
%sext = sext i1 %result to i32
store i32 %sext, ptr addrspace(1) %out, align 4
ret void
}
; SI-LABEL: {{^}}v_test_class_full_mask_f32:
; SI-DAG: buffer_load_dword [[VA:v[0-9]+]]
; SI-DAG: s_movk_i32 [[MASK:s[0-9]+]], 0x1ff{{$}}
; SI: v_cmp_class_f32_e64 s[{{[0-9]}}:{{[0-9]}}], [[VA]], [[MASK]]
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, s[{{[0-9]}}:{{[0-9]}}]
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @v_test_class_full_mask_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: v_test_class_full_mask_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_movk_i32 s4, 0x1ff
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_class_f32_e64 s[4:5], v2, s4
; SI-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5]
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -144,13 +195,23 @@ define amdgpu_kernel void @v_test_class_full_mask_f32(ptr addrspace(1) %out, ptr
ret void
}
; SI-LABEL: {{^}}test_class_inline_imm_constant_dynamic_mask_f32:
; SI-DAG: buffer_load_dword [[VB:v[0-9]+]]
; SI: v_cmp_class_f32_e32 vcc, 1.0, [[VB]]
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_inline_imm_constant_dynamic_mask_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: test_class_inline_imm_constant_dynamic_mask_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_class_f32_e32 vcc, 1.0, v2
; SI-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -163,14 +224,24 @@ define amdgpu_kernel void @test_class_inline_imm_constant_dynamic_mask_f32(ptr a
}
; FIXME: Why isn't this using a literal constant operand?
; SI-LABEL: {{^}}test_class_lit_constant_dynamic_mask_f32:
; SI-DAG: buffer_load_dword [[VB:v[0-9]+]]
; SI-DAG: s_mov_b32 [[VK:s[0-9]+]], 0x44800000
; SI: v_cmp_class_f32_e32 vcc, [[VK]], [[VB]]
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_lit_constant_dynamic_mask_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: test_class_lit_constant_dynamic_mask_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b32 s4, 0x44800000
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_class_f32_e32 vcc, s4, v2
; SI-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -182,30 +253,40 @@ define amdgpu_kernel void @test_class_lit_constant_dynamic_mask_f32(ptr addrspac
ret void
}
; SI-LABEL: {{^}}test_class_f64:
; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1d
; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; SI: v_cmp_class_f64_e32 vcc, [[SA]], [[VB]]
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_f64(ptr addrspace(1) %out, [8 x i32], double %a, [8 x i32], i32 %b) #0 {
; SI-LABEL: test_class_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s8, s[4:5], 0x1d
; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: v_cmp_class_f64_e32 vcc, s[6:7], v0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
%result = call i1 @llvm.amdgcn.class.f64(double %a, i32 %b) #1
%sext = sext i1 %result to i32
store i32 %sext, ptr addrspace(1) %out, align 4
ret void
}
; SI-LABEL: {{^}}test_class_fabs_f64:
; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1d
; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |[[SA]]|, [[VB]]
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_fabs_f64(ptr addrspace(1) %out, [8 x i32], double %a, [8 x i32], i32 %b) #0 {
; SI-LABEL: test_class_fabs_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s8, s[4:5], 0x1d
; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: v_cmp_class_f64_e64 s[4:5], |s[6:7]|, v0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
%a.fabs = call double @llvm.fabs.f64(double %a) #1
%result = call i1 @llvm.amdgcn.class.f64(double %a.fabs, i32 %b) #1
%sext = sext i1 %result to i32
@@ -213,15 +294,20 @@ define amdgpu_kernel void @test_class_fabs_f64(ptr addrspace(1) %out, [8 x i32],
ret void
}
; SI-LABEL: {{^}}test_class_fneg_f64:
; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1d
; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -[[SA]], [[VB]]
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_fneg_f64(ptr addrspace(1) %out, [8 x i32], double %a, [8 x i32], i32 %b) #0 {
; SI-LABEL: test_class_fneg_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s8, s[4:5], 0x1d
; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: v_cmp_class_f64_e64 s[4:5], -s[6:7], v0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
%a.fneg = fsub double -0.0, %a
%result = call i1 @llvm.amdgcn.class.f64(double %a.fneg, i32 %b) #1
%sext = sext i1 %result to i32
@@ -229,15 +315,20 @@ define amdgpu_kernel void @test_class_fneg_f64(ptr addrspace(1) %out, [8 x i32],
ret void
}
; SI-LABEL: {{^}}test_class_fneg_fabs_f64:
; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1d
; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|[[SA]]|, [[VB]]
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_fneg_fabs_f64(ptr addrspace(1) %out, [8 x i32], double %a, [8 x i32], i32 %b) #0 {
; SI-LABEL: test_class_fneg_fabs_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s8, s[4:5], 0x1d
; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: v_cmp_class_f64_e64 s[4:5], -|s[6:7]|, v0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
%a.fabs = call double @llvm.fabs.f64(double %a) #1
%a.fneg.fabs = fsub double -0.0, %a.fabs
%result = call i1 @llvm.amdgcn.class.f64(double %a.fneg.fabs, i32 %b) #1
@@ -246,20 +337,38 @@ define amdgpu_kernel void @test_class_fneg_fabs_f64(ptr addrspace(1) %out, [8 x
ret void
}
; SI-LABEL: {{^}}test_class_1_f64:
; SI: v_cmp_class_f64_e64 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 1{{$}}
; SI: s_endpgm
define amdgpu_kernel void @test_class_1_f64(ptr addrspace(1) %out, double %a) #0 {
; SI-LABEL: test_class_1_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: v_cmp_class_f64_e64 s[0:1], s[2:3], 1
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
%result = call i1 @llvm.amdgcn.class.f64(double %a, i32 1) #1
%sext = sext i1 %result to i32
store i32 %sext, ptr addrspace(1) %out, align 4
ret void
}
; SI-LABEL: {{^}}test_class_64_f64:
; SI: v_cmp_class_f64_e64 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 64{{$}}
; SI: s_endpgm
define amdgpu_kernel void @test_class_64_f64(ptr addrspace(1) %out, double %a) #0 {
; SI-LABEL: test_class_64_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: v_cmp_class_f64_e64 s[0:1], s[2:3], 64
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
%result = call i1 @llvm.amdgcn.class.f64(double %a, i32 64) #1
%sext = sext i1 %result to i32
store i32 %sext, ptr addrspace(1) %out, align 4
@@ -267,30 +376,45 @@ define amdgpu_kernel void @test_class_64_f64(ptr addrspace(1) %out, double %a) #
}
; Set all 9 bits of mask
; SI-LABEL: {{^}}test_class_full_mask_f64:
; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}}
; SI: v_cmp_class_f64_e32 vcc, [[SA]], [[MASK]]
; SI-NOT: vcc
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_full_mask_f64(ptr addrspace(1) %out, [8 x i32], double %a) #0 {
; SI-LABEL: test_class_full_mask_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x13
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, 0x1ff
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cmp_class_f64_e32 vcc, s[4:5], v0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
%result = call i1 @llvm.amdgcn.class.f64(double %a, i32 511) #1
%sext = sext i1 %result to i32
store i32 %sext, ptr addrspace(1) %out, align 4
ret void
}
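; For reference, a sketch of the v_cmp_class mask encoding these tests rely on
; (per the GCN ISA's bit assignment; shown here as an illustrative aid):
;   bit 0 sNaN, bit 1 qNaN, bit 2 -inf, bit 3 -normal, bit 4 -denormal,
;   bit 5 -0.0, bit 6 +0.0, bit 7 +denormal, bit 8 +normal, bit 9 +inf
; 0x1ff (511) thus covers bits 0-8 and is too wide for an inline immediate,
; which is why the literal is materialized with v_mov_b32 above. A
; hypothetical mask combining two of the bits:
define i1 @class_mask_sketch(float %x) {
  %r = call i1 @llvm.amdgcn.class.f32(float %x, i32 520) ; (1<<9)|(1<<3)
  ret i1 %r
}
declare i1 @llvm.amdgcn.class.f32(float, i32)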
; SI-LABEL: {{^}}v_test_class_full_mask_f64:
; SI-DAG: buffer_load_dwordx2 [[VA:v\[[0-9]+:[0-9]+\]]]
; SI-DAG: s_movk_i32 [[MASK:s[0-9]+]], 0x1ff{{$}}
; SI: v_cmp_class_f64_e64 s[{{[0-9]}}:{{[0-9]}}], [[VA]], [[MASK]]
; SI-NOT: vcc
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, s[{{[0-9]}}:{{[0-9]}}]
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @v_test_class_full_mask_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: v_test_class_full_mask_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s2
; SI-NEXT: s_mov_b32 s5, s3
; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_movk_i32 s4, 0x1ff
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: s_mov_b32 s3, s7
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], s4
; SI-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5]
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -302,11 +426,23 @@ define amdgpu_kernel void @v_test_class_full_mask_f64(ptr addrspace(1) %out, ptr
ret void
}
; SI-LABEL: {{^}}test_class_inline_imm_constant_dynamic_mask_f64:
; XSI: v_cmp_class_f64_e32 vcc, 1.0,
; SI: v_cmp_class_f64_e32 vcc,
; SI: s_endpgm
define amdgpu_kernel void @test_class_inline_imm_constant_dynamic_mask_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: test_class_inline_imm_constant_dynamic_mask_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_class_f64_e32 vcc, 1.0, v2
; SI-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -318,10 +454,25 @@ define amdgpu_kernel void @test_class_inline_imm_constant_dynamic_mask_f64(ptr a
ret void
}
; SI-LABEL: {{^}}test_class_lit_constant_dynamic_mask_f64:
; SI: v_cmp_class_f64_e32 vcc, s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}
; SI: s_endpgm
define amdgpu_kernel void @test_class_lit_constant_dynamic_mask_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: test_class_lit_constant_dynamic_mask_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b32 s4, 0
; SI-NEXT: s_mov_b32 s5, 0x40900000
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_class_f64_e32 vcc, s[4:5], v2
; SI-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -333,12 +484,26 @@ define amdgpu_kernel void @test_class_lit_constant_dynamic_mask_f64(ptr addrspac
ret void
}
; SI-LABEL: {{^}}test_fold_or_class_f32_0:
; SI-NOT: v_cmp_class
; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 3{{$}}
; SI-NOT: v_cmp_class
; SI: s_endpgm
define amdgpu_kernel void @test_fold_or_class_f32_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: test_fold_or_class_f32_0:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_class_f32_e64 s[0:1], v0, 3
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -353,12 +518,26 @@ define amdgpu_kernel void @test_fold_or_class_f32_0(ptr addrspace(1) %out, ptr a
ret void
}
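; A minimal sketch (hypothetical function, masks assumed) of the fold being
; checked here: two class tests of the same value combine by OR-ing their
; masks, leaving the single v_cmp_class with mask 3 seen above.
define i1 @fold_or_class_sketch(float %x) {
  %c0 = call i1 @llvm.amdgcn.class.f32(float %x, i32 1)
  %c1 = call i1 @llvm.amdgcn.class.f32(float %x, i32 2)
  %or = or i1 %c0, %c1 ; folds to class(%x, 1|2)
  ret i1 %or
}
declare i1 @llvm.amdgcn.class.f32(float, i32)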
; SI-LABEL: {{^}}test_fold_or3_class_f32_0:
; SI-NOT: v_cmp_class
; SI: v_cmp_class_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 7{{$}}
; SI-NOT: v_cmp_class
; SI: s_endpgm
define amdgpu_kernel void @test_fold_or3_class_f32_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: test_fold_or3_class_f32_0:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_class_f32_e64 s[0:1], v0, 7
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -375,13 +554,27 @@ define amdgpu_kernel void @test_fold_or3_class_f32_0(ptr addrspace(1) %out, ptr
ret void
}
; SI-LABEL: {{^}}test_fold_or_all_tests_class_f32_0:
; SI-NOT: v_cmp_class
; SI: s_movk_i32 [[MASK:s[0-9]+]], 0x3ff{{$}}
; SI: v_cmp_class_f32_e64 s[0:1], v{{[0-9]+}}, [[MASK]]{{$}}
; SI-NOT: v_cmp_class
; SI: s_endpgm
define amdgpu_kernel void @test_fold_or_all_tests_class_f32_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: test_fold_or_all_tests_class_f32_0:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_movk_i32 s2, 0x3ff
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_class_f32_e64 s[0:1], v0, s2
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -411,12 +604,26 @@ define amdgpu_kernel void @test_fold_or_all_tests_class_f32_0(ptr addrspace(1) %
ret void
}
; SI-LABEL: {{^}}test_fold_or_class_f32_1:
; SI-NOT: v_cmp_class
; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 12{{$}}
; SI-NOT: v_cmp_class
; SI: s_endpgm
define amdgpu_kernel void @test_fold_or_class_f32_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: test_fold_or_class_f32_1:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_class_f32_e64 s[0:1], v0, 12
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -431,12 +638,26 @@ define amdgpu_kernel void @test_fold_or_class_f32_1(ptr addrspace(1) %out, ptr a
ret void
}
; SI-LABEL: {{^}}test_fold_or_class_f32_2:
; SI-NOT: v_cmp_class
; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 7{{$}}
; SI-NOT: v_cmp_class
; SI: s_endpgm
define amdgpu_kernel void @test_fold_or_class_f32_2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: test_fold_or_class_f32_2:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_class_f32_e64 s[0:1], v0, 7
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -451,12 +672,29 @@ define amdgpu_kernel void @test_fold_or_class_f32_2(ptr addrspace(1) %out, ptr a
ret void
}
; SI-LABEL: {{^}}test_no_fold_or_class_f32_0:
; SI-DAG: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 4{{$}}
; SI-DAG: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, 8{{$}}
; SI: s_or_b64
; SI: s_endpgm
define amdgpu_kernel void @test_no_fold_or_class_f32_0(ptr addrspace(1) %out, ptr addrspace(1) %in, float %b) #0 {
; SI-LABEL: test_no_fold_or_class_f32_0:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dword s12, s[4:5], 0xd
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[2:3]
; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: v_cmp_class_f32_e64 s[0:1], s12, 8
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cmp_class_f32_e64 s[2:3], v0, 4
; SI-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid
%gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@@ -471,72 +709,94 @@ define amdgpu_kernel void @test_no_fold_or_class_f32_0(ptr addrspace(1) %out, pt
ret void
}
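; Conversely, a sketch (hypothetical function) of why no fold happens above:
; the two class tests classify different values, so their masks cannot merge
; and the results are combined with s_or_b64 instead.
define i1 @no_fold_or_class_sketch(float %a, float %b) {
  %c0 = call i1 @llvm.amdgcn.class.f32(float %a, i32 4)
  %c1 = call i1 @llvm.amdgcn.class.f32(float %b, i32 8)
  %or = or i1 %c0, %c1
  ret i1 %or
}
declare i1 @llvm.amdgcn.class.f32(float, i32)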
; SI-LABEL: {{^}}test_class_0_f32:
; SI-NOT: v_cmp_class
; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_0_f32(ptr addrspace(1) %out, float %a) #0 {
; SI-LABEL: test_class_0_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
%result = call i1 @llvm.amdgcn.class.f32(float %a, i32 0) #1
%sext = sext i1 %result to i32
store i32 %sext, ptr addrspace(1) %out, align 4
ret void
}
; SI-LABEL: {{^}}test_class_0_f64:
; SI-NOT: v_cmp_class
; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_0_f64(ptr addrspace(1) %out, double %a) #0 {
; SI-LABEL: test_class_0_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, 0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
%result = call i1 @llvm.amdgcn.class.f64(double %a, i32 0) #1
%sext = sext i1 %result to i32
store i32 %sext, ptr addrspace(1) %out, align 4
ret void
}
; SI-LABEL: {{^}}test_class_undef_f32:
; SI-NOT: v_cmp_class
; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_undef_f32(ptr addrspace(1) %out, float %a, i32 %b) #0 {
; SI-LABEL: test_class_undef_f32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
%result = call i1 @llvm.amdgcn.class.f32(float poison, i32 %b) #1
%sext = sext i1 %result to i32
store i32 %sext, ptr addrspace(1) %out, align 4
ret void
}
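; The tests above check folding to a constant: an empty class mask can match
; nothing, and a class test of a poison value folds away entirely, so only the
; v_mov_b32 of 0 and the store remain. Hypothetical sketch:
define i1 @class_constant_fold_sketch(float %x) {
  %r = call i1 @llvm.amdgcn.class.f32(float %x, i32 0) ; mask 0: always false
  ret i1 %r
}
declare i1 @llvm.amdgcn.class.f32(float, i32)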
; SI-LABEL: {{^}}test_fold_and_ord:
; SI: s_waitcnt
; SI-NEXT: v_cmp_class_f32_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v0, 32{{$}}
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, [[COND]]
; SI-NEXT: s_setpc_b64
define i1 @test_fold_and_ord(float %a) {
; SI-LABEL: test_fold_and_ord:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_class_f32_e64 s[4:5], v0, 32
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; SI-NEXT: s_setpc_b64 s[30:31]
%class = call i1 @llvm.amdgcn.class.f32(float %a, i32 35) #1
%ord = fcmp ord float %a, %a
%and = and i1 %ord, %class
ret i1 %and
}
; SI-LABEL: {{^}}test_fold_and_unord:
; SI: s_waitcnt
; SI-NEXT: v_cmp_class_f32_e64 [[COND:s\[[0-9]+:[0-9]+\]]], v0, 3{{$}}
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, [[COND]]
; SI-NEXT: s_setpc_b64
define i1 @test_fold_and_unord(float %a) {
; SI-LABEL: test_fold_and_unord:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_class_f32_e64 s[4:5], v0, 3
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; SI-NEXT: s_setpc_b64 s[30:31]
%class = call i1 @llvm.amdgcn.class.f32(float %a, i32 35) #1
%ord = fcmp uno float %a, %a
%and = and i1 %ord, %class
ret i1 %and
}
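; Sketch of the mask arithmetic behind the two folds above (hypothetical
; function): fcmp ord is "neither operand is NaN" and fcmp uno is "either is
; NaN", so AND-ing with a class test just masks the NaN bits (0 and 1) out or
; in: 35 & ~3 == 32 for the ord case and 35 & 3 == 3 for the uno case,
; matching the immediates in the checks.
define i1 @fold_and_uno_sketch(float %x) {
  %class = call i1 @llvm.amdgcn.class.f32(float %x, i32 35)
  %isnan = fcmp uno float %x, %x
  %and = and i1 %isnan, %class ; folds to class(%x, 3)
  ret i1 %and
}
declare i1 @llvm.amdgcn.class.f32(float, i32)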
; SI-LABEL: {{^}}test_fold_and_ord_multi_use:
; SI: v_cmp_class
; SI-NOT: v_cmp_class
; SI: v_cmp_o
; SI: s_and_b64
define i1 @test_fold_and_ord_multi_use(float %a) {
; SI-LABEL: test_fold_and_ord_multi_use:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cmp_class_f32_e64 s[4:5], v0, 35
; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
; SI-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; SI-NEXT: buffer_store_byte v1, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT: s_setpc_b64 s[30:31]
%class = call i1 @llvm.amdgcn.class.f32(float %a, i32 35) #1
store volatile i1 %class, ptr addrspace(1) poison
  %ord = fcmp ord float %a, %a
  %and = and i1 %ord, %class
  ret i1 %and
}
File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large
@@ -1,3 +1,4 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck --check-prefixes=GCN-NOHSA,FUNC,SI-NOHSA %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amdhsa -mcpu=kaveri < %s | FileCheck --check-prefixes=GCN-HSA,FUNC,GCNX3-HSA %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GCN-NOHSA,FUNC,GCNX3-NOHSA %s
@@ -5,162 +6,766 @@
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck --check-prefixes=R600,FUNC %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=cayman < %s | FileCheck --check-prefixes=R600,FUNC %s
; FUNC-LABEL: {{^}}global_load_f32:
; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}
; GCN-HSA: flat_load_dword
; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
define amdgpu_kernel void @global_load_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-NOHSA-LABEL: global_load_f32:
; SI-NOHSA: ; %bb.0: ; %entry
; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
; SI-NOHSA-NEXT: s_mov_b32 s6, -1
; SI-NOHSA-NEXT: s_mov_b32 s10, s6
; SI-NOHSA-NEXT: s_mov_b32 s11, s7
; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; SI-NOHSA-NEXT: s_mov_b32 s8, s2
; SI-NOHSA-NEXT: s_mov_b32 s9, s3
; SI-NOHSA-NEXT: buffer_load_dword v0, off, s[8:11], 0
; SI-NOHSA-NEXT: s_mov_b32 s4, s0
; SI-NOHSA-NEXT: s_mov_b32 s5, s1
; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
; SI-NOHSA-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NOHSA-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_load_f32:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: flat_load_dword v2, v[0:1]
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: flat_store_dword v[0:1], v2
; GCN-HSA-NEXT: s_endpgm
;
; GCNX3-NOHSA-LABEL: global_load_f32:
; GCNX3-NOHSA: ; %bb.0: ; %entry
; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
; GCNX3-NOHSA-NEXT: buffer_load_dword v0, off, s[8:11], 0
; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GCNX3-NOHSA-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCNX3-NOHSA-NEXT: s_endpgm
entry:
%tmp0 = load float, ptr addrspace(1) %in
store float %tmp0, ptr addrspace(1) %out
ret void
}
; FUNC-LABEL: {{^}}global_load_v2f32:
; GCN-NOHSA: buffer_load_dwordx2
; GCN-HSA: flat_load_dwordx2
; R600: VTX_READ_64
define amdgpu_kernel void @global_load_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-NOHSA-LABEL: global_load_v2f32:
; SI-NOHSA: ; %bb.0: ; %entry
; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
; SI-NOHSA-NEXT: s_mov_b32 s6, -1
; SI-NOHSA-NEXT: s_mov_b32 s10, s6
; SI-NOHSA-NEXT: s_mov_b32 s11, s7
; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; SI-NOHSA-NEXT: s_mov_b32 s8, s2
; SI-NOHSA-NEXT: s_mov_b32 s9, s3
; SI-NOHSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; SI-NOHSA-NEXT: s_mov_b32 s4, s0
; SI-NOHSA-NEXT: s_mov_b32 s5, s1
; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
; SI-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NOHSA-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_load_v2f32:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GCN-HSA-NEXT: s_endpgm
;
; GCNX3-NOHSA-LABEL: global_load_v2f32:
; GCNX3-NOHSA: ; %bb.0: ; %entry
; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
; GCNX3-NOHSA-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCNX3-NOHSA-NEXT: s_endpgm
entry:
%tmp0 = load <2 x float>, ptr addrspace(1) %in
store <2 x float> %tmp0, ptr addrspace(1) %out
ret void
}
; FUNC-LABEL: {{^}}global_load_v3f32:
; SI-NOHSA: buffer_load_dwordx4
; GCNX3-NOHSA: buffer_load_dwordx3
; GCNX3-HSA: flat_load_dwordx3
; R600: VTX_READ_128
define amdgpu_kernel void @global_load_v3f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-NOHSA-LABEL: global_load_v3f32:
; SI-NOHSA: ; %bb.0: ; %entry
; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
; SI-NOHSA-NEXT: s_mov_b32 s6, -1
; SI-NOHSA-NEXT: s_mov_b32 s10, s6
; SI-NOHSA-NEXT: s_mov_b32 s11, s7
; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; SI-NOHSA-NEXT: s_mov_b32 s8, s2
; SI-NOHSA-NEXT: s_mov_b32 s9, s3
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NOHSA-NEXT: s_mov_b32 s4, s0
; SI-NOHSA-NEXT: s_mov_b32 s5, s1
; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
; SI-NOHSA-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:8
; SI-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NOHSA-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_load_v3f32:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: flat_load_dwordx3 v[0:2], v[0:1]
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s1
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
; GCN-HSA-NEXT: s_endpgm
;
; GCNX3-NOHSA-LABEL: global_load_v3f32:
; GCNX3-NOHSA: ; %bb.0: ; %entry
; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
; GCNX3-NOHSA-NEXT: buffer_load_dwordx3 v[0:2], off, s[8:11], 0
; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GCNX3-NOHSA-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0
; GCNX3-NOHSA-NEXT: s_endpgm
entry:
%tmp0 = load <3 x float>, ptr addrspace(1) %in
store <3 x float> %tmp0, ptr addrspace(1) %out
ret void
}
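; Note the SI-NOHSA lowering above: gfx6 lacks dwordx3 buffer operations, so
; the <3 x float> load overfetches with a dwordx4 and the store is split into
; dword + dwordx2, while the dwordx3-capable targets use single x3 operations.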
; FUNC-LABEL: {{^}}global_load_v4f32:
; GCN-NOHSA: buffer_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; R600: VTX_READ_128
define amdgpu_kernel void @global_load_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-NOHSA-LABEL: global_load_v4f32:
; SI-NOHSA: ; %bb.0: ; %entry
; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
; SI-NOHSA-NEXT: s_mov_b32 s6, -1
; SI-NOHSA-NEXT: s_mov_b32 s10, s6
; SI-NOHSA-NEXT: s_mov_b32 s11, s7
; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; SI-NOHSA-NEXT: s_mov_b32 s8, s2
; SI-NOHSA-NEXT: s_mov_b32 s9, s3
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NOHSA-NEXT: s_mov_b32 s4, s0
; SI-NOHSA-NEXT: s_mov_b32 s5, s1
; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NOHSA-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_load_v4f32:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: s_endpgm
;
; GCNX3-NOHSA-LABEL: global_load_v4f32:
; GCNX3-NOHSA: ; %bb.0: ; %entry
; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(0)
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GCNX3-NOHSA-NEXT: s_endpgm
entry:
%tmp0 = load <4 x float>, ptr addrspace(1) %in
store <4 x float> %tmp0, ptr addrspace(1) %out
ret void
}
; FUNC-LABEL: {{^}}global_load_v8f32:
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; R600: VTX_READ_128
; R600: VTX_READ_128
define amdgpu_kernel void @global_load_v8f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-NOHSA-LABEL: global_load_v8f32:
; SI-NOHSA: ; %bb.0: ; %entry
; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
; SI-NOHSA-NEXT: s_mov_b32 s6, -1
; SI-NOHSA-NEXT: s_mov_b32 s10, s6
; SI-NOHSA-NEXT: s_mov_b32 s11, s7
; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; SI-NOHSA-NEXT: s_mov_b32 s8, s2
; SI-NOHSA-NEXT: s_mov_b32 s9, s3
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0
; SI-NOHSA-NEXT: s_mov_b32 s4, s0
; SI-NOHSA-NEXT: s_mov_b32 s5, s1
; SI-NOHSA-NEXT: s_waitcnt vmcnt(1)
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16
; SI-NOHSA-NEXT: s_waitcnt vmcnt(1)
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
; SI-NOHSA-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_load_v8f32:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_add_u32 s4, s2, 16
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0
; GCN-HSA-NEXT: s_add_u32 s0, s0, 16
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0
; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
; GCN-HSA-NEXT: s_endpgm
;
; GCNX3-NOHSA-LABEL: global_load_v8f32:
; GCNX3-NOHSA: ; %bb.0: ; %entry
; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0
; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(1)
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(1)
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
; GCNX3-NOHSA-NEXT: s_endpgm
entry:
%tmp0 = load <8 x float>, ptr addrspace(1) %in
store <8 x float> %tmp0, ptr addrspace(1) %out
ret void
}
; FUNC-LABEL: {{^}}global_load_v9f32:
; GCN-NOHSA: buffer_load_dword
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dword
; GCN-HSA: flat_load_dwordx4
; R600: VTX_READ_128
; R600: VTX_READ_32
; R600: VTX_READ_128
define amdgpu_kernel void @global_load_v9f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-NOHSA-LABEL: global_load_v9f32:
; SI-NOHSA: ; %bb.0: ; %entry
; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
; SI-NOHSA-NEXT: s_mov_b32 s6, -1
; SI-NOHSA-NEXT: s_mov_b32 s10, s6
; SI-NOHSA-NEXT: s_mov_b32 s11, s7
; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; SI-NOHSA-NEXT: s_mov_b32 s8, s2
; SI-NOHSA-NEXT: s_mov_b32 s9, s3
; SI-NOHSA-NEXT: buffer_load_dword v8, off, s[8:11], 0 offset:32
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; SI-NOHSA-NEXT: s_mov_b32 s4, s0
; SI-NOHSA-NEXT: s_mov_b32 s5, s1
; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
; SI-NOHSA-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:32
; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
; SI-NOHSA-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_load_v9f32:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_add_u32 s4, s2, 16
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: s_add_u32 s2, s2, 32
; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v7, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5
; GCN-HSA-NEXT: v_mov_b32_e32 v6, s2
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GCN-HSA-NEXT: flat_load_dword v14, v[6:7]
; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; GCN-HSA-NEXT: s_add_u32 s2, s0, 32
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0
; GCN-HSA-NEXT: s_add_u32 s0, s0, 16
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0
; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
; GCN-HSA-NEXT: flat_store_dword v[10:11], v14
; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7]
; GCN-HSA-NEXT: s_endpgm
;
; GCNX3-NOHSA-LABEL: global_load_v9f32:
; GCNX3-NOHSA: ; %bb.0: ; %entry
; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
; GCNX3-NOHSA-NEXT: buffer_load_dword v8, off, s[8:11], 0 offset:32
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
; GCNX3-NOHSA-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:32
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
; GCNX3-NOHSA-NEXT: s_endpgm
entry:
%tmp0 = load <9 x float>, ptr addrspace(1) %in
store <9 x float> %tmp0, ptr addrspace(1) %out
ret void
}
; FUNC-LABEL: {{^}}global_load_v10f32:
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx2
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx2
; R600: VTX_READ_128
; R600: VTX_READ_128
; R600: VTX_READ_128
define amdgpu_kernel void @global_load_v10f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-NOHSA-LABEL: global_load_v10f32:
; SI-NOHSA: ; %bb.0: ; %entry
; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
; SI-NOHSA-NEXT: s_mov_b32 s6, -1
; SI-NOHSA-NEXT: s_mov_b32 s10, s6
; SI-NOHSA-NEXT: s_mov_b32 s11, s7
; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; SI-NOHSA-NEXT: s_mov_b32 s8, s2
; SI-NOHSA-NEXT: s_mov_b32 s9, s3
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; SI-NOHSA-NEXT: buffer_load_dwordx2 v[8:9], off, s[8:11], 0 offset:32
; SI-NOHSA-NEXT: s_mov_b32 s4, s0
; SI-NOHSA-NEXT: s_mov_b32 s5, s1
; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
; SI-NOHSA-NEXT: buffer_store_dwordx2 v[8:9], off, s[4:7], 0 offset:32
; SI-NOHSA-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_load_v10f32:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_add_u32 s4, s2, 32
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: s_add_u32 s2, s2, 16
; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; GCN-HSA-NEXT: flat_load_dwordx2 v[8:9], v[8:9]
; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0
; GCN-HSA-NEXT: s_add_u32 s0, s0, 32
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v14, s0
; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7]
; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
; GCN-HSA-NEXT: flat_store_dwordx2 v[14:15], v[8:9]
; GCN-HSA-NEXT: s_endpgm
;
; GCNX3-NOHSA-LABEL: global_load_v10f32:
; GCNX3-NOHSA: ; %bb.0: ; %entry
; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; GCNX3-NOHSA-NEXT: buffer_load_dwordx2 v[8:9], off, s[8:11], 0 offset:32
; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
; GCNX3-NOHSA-NEXT: buffer_store_dwordx2 v[8:9], off, s[4:7], 0 offset:32
; GCNX3-NOHSA-NEXT: s_endpgm
entry:
%tmp0 = load <10 x float>, ptr addrspace(1) %in
store <10 x float> %tmp0, ptr addrspace(1) %out
ret void
}
; FUNC-LABEL: {{^}}global_load_v11f32:
; SI-NOHSA: buffer_load_dwordx4
; SI-NOHSA: buffer_load_dwordx4
; SI-NOHSA: buffer_load_dwordx4
; GCNX3-NOHSA: buffer_load_dwordx4
; GCNX3-NOHSA: buffer_load_dwordx4
; GCNX3-NOHSA: buffer_load_dwordx3
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx3
; R600: VTX_READ_128
; R600: VTX_READ_128
; R600: VTX_READ_128
define amdgpu_kernel void @global_load_v11f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-NOHSA-LABEL: global_load_v11f32:
; SI-NOHSA: ; %bb.0: ; %entry
; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
; SI-NOHSA-NEXT: s_mov_b32 s6, -1
; SI-NOHSA-NEXT: s_mov_b32 s10, s6
; SI-NOHSA-NEXT: s_mov_b32 s11, s7
; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; SI-NOHSA-NEXT: s_mov_b32 s8, s2
; SI-NOHSA-NEXT: s_mov_b32 s9, s3
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:32
; SI-NOHSA-NEXT: s_mov_b32 s4, s0
; SI-NOHSA-NEXT: s_mov_b32 s5, s1
; SI-NOHSA-NEXT: s_waitcnt vmcnt(0)
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[3:6], off, s[8:11], 0
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[7:10], off, s[8:11], 0 offset:16
; SI-NOHSA-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:40
; SI-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 offset:32
; SI-NOHSA-NEXT: s_waitcnt vmcnt(3)
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[3:6], off, s[4:7], 0
; SI-NOHSA-NEXT: s_waitcnt vmcnt(3)
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[7:10], off, s[4:7], 0 offset:16
; SI-NOHSA-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_load_v11f32:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_add_u32 s4, s2, 32
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: s_add_u32 s2, s2, 16
; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; GCN-HSA-NEXT: flat_load_dwordx3 v[8:10], v[8:9]
; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s1
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v11, s0
; GCN-HSA-NEXT: s_add_u32 s0, s0, 32
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v14, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s0
; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
; GCN-HSA-NEXT: flat_store_dwordx4 v[11:12], v[0:3]
; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[4:7]
; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
; GCN-HSA-NEXT: flat_store_dwordx3 v[15:16], v[8:10]
; GCN-HSA-NEXT: s_endpgm
;
; GCNX3-NOHSA-LABEL: global_load_v11f32:
; GCNX3-NOHSA: ; %bb.0: ; %entry
; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; GCNX3-NOHSA-NEXT: buffer_load_dwordx3 v[8:10], off, s[8:11], 0 offset:32
; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
; GCNX3-NOHSA-NEXT: buffer_store_dwordx3 v[8:10], off, s[4:7], 0 offset:32
; GCNX3-NOHSA-NEXT: s_endpgm
entry:
%tmp0 = load <11 x float>, ptr addrspace(1) %in
store <11 x float> %tmp0, ptr addrspace(1) %out
ret void
}
; FUNC-LABEL: {{^}}global_load_v12f32:
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; R600: VTX_READ_128
; R600: VTX_READ_128
; R600: VTX_READ_128
define amdgpu_kernel void @global_load_v12f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-NOHSA-LABEL: global_load_v12f32:
; SI-NOHSA: ; %bb.0: ; %entry
; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
; SI-NOHSA-NEXT: s_mov_b32 s6, -1
; SI-NOHSA-NEXT: s_mov_b32 s10, s6
; SI-NOHSA-NEXT: s_mov_b32 s11, s7
; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; SI-NOHSA-NEXT: s_mov_b32 s8, s2
; SI-NOHSA-NEXT: s_mov_b32 s9, s3
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
; SI-NOHSA-NEXT: s_mov_b32 s4, s0
; SI-NOHSA-NEXT: s_mov_b32 s5, s1
; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
; SI-NOHSA-NEXT: s_waitcnt vmcnt(2)
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:32
; SI-NOHSA-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_load_v12f32:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_add_u32 s4, s2, 32
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: s_add_u32 s2, s2, 16
; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0
; GCN-HSA-NEXT: s_add_u32 s0, s0, 32
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0
; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7]
; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11]
; GCN-HSA-NEXT: s_endpgm
;
; GCNX3-NOHSA-LABEL: global_load_v12f32:
; GCNX3-NOHSA: ; %bb.0: ; %entry
; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(2)
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:32
; GCNX3-NOHSA-NEXT: s_endpgm
entry:
%tmp0 = load <12 x float>, ptr addrspace(1) %in
store <12 x float> %tmp0, ptr addrspace(1) %out
ret void
}
; FUNC-LABEL: {{^}}global_load_v16f32:
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; R600: VTX_READ_128
; R600: VTX_READ_128
; R600: VTX_READ_128
; R600: VTX_READ_128
define amdgpu_kernel void @global_load_v16f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-NOHSA-LABEL: global_load_v16f32:
; SI-NOHSA: ; %bb.0: ; %entry
; SI-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NOHSA-NEXT: s_mov_b32 s7, 0xf000
; SI-NOHSA-NEXT: s_mov_b32 s6, -1
; SI-NOHSA-NEXT: s_mov_b32 s10, s6
; SI-NOHSA-NEXT: s_mov_b32 s11, s7
; SI-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; SI-NOHSA-NEXT: s_mov_b32 s4, s0
; SI-NOHSA-NEXT: s_mov_b32 s5, s1
; SI-NOHSA-NEXT: s_mov_b32 s8, s2
; SI-NOHSA-NEXT: s_mov_b32 s9, s3
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:32
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0
; SI-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:16
; SI-NOHSA-NEXT: s_waitcnt vmcnt(3)
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:32
; SI-NOHSA-NEXT: s_waitcnt vmcnt(3)
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:48
; SI-NOHSA-NEXT: s_waitcnt vmcnt(3)
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
; SI-NOHSA-NEXT: s_waitcnt vmcnt(3)
; SI-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:16
; SI-NOHSA-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_load_v16f32:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_add_u32 s4, s2, 16
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
; GCN-HSA-NEXT: s_add_u32 s6, s2, 48
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3
; GCN-HSA-NEXT: s_addc_u32 s7, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2
; GCN-HSA-NEXT: s_add_u32 s2, s2, 32
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6
; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s7
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s5
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s4
; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13]
; GCN-HSA-NEXT: s_add_u32 s2, s0, 32
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: s_add_u32 s4, s0, 48
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1
; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0
; GCN-HSA-NEXT: v_mov_b32_e32 v19, s5
; GCN-HSA-NEXT: s_add_u32 s0, s0, 16
; GCN-HSA-NEXT: v_mov_b32_e32 v18, s4
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCN-HSA-NEXT: s_waitcnt vmcnt(3)
; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3]
; GCN-HSA-NEXT: s_nop 0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0
; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11]
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GCN-HSA-NEXT: s_waitcnt vmcnt(3)
; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[12:15]
; GCN-HSA-NEXT: s_endpgm
;
; GCNX3-NOHSA-LABEL: global_load_v16f32:
; GCNX3-NOHSA: ; %bb.0: ; %entry
; GCNX3-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCNX3-NOHSA-NEXT: s_mov_b32 s7, 0xf000
; GCNX3-NOHSA-NEXT: s_mov_b32 s6, -1
; GCNX3-NOHSA-NEXT: s_mov_b32 s10, s6
; GCNX3-NOHSA-NEXT: s_mov_b32 s11, s7
; GCNX3-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-NOHSA-NEXT: s_mov_b32 s8, s2
; GCNX3-NOHSA-NEXT: s_mov_b32 s9, s3
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:32
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0
; GCNX3-NOHSA-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:16
; GCNX3-NOHSA-NEXT: s_mov_b32 s4, s0
; GCNX3-NOHSA-NEXT: s_mov_b32 s5, s1
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(3)
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:32
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(3)
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:48
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(3)
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
; GCNX3-NOHSA-NEXT: s_waitcnt vmcnt(3)
; GCNX3-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:16
; GCNX3-NOHSA-NEXT: s_endpgm
entry:
%tmp0 = load <16 x float>, ptr addrspace(1) %in
store <16 x float> %tmp0, ptr addrspace(1) %out
@@ -168,3 +773,8 @@ entry:
}
attributes #0 = { nounwind }
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; FUNC: {{.*}}
; GCN-NOHSA: {{.*}}
; GCNX3-HSA: {{.*}}
; R600: {{.*}}


@@ -1,15 +1,29 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
; FIXME: Fails with -enable-var-scope
; Make sure 64-bit BFE pattern does a 32-bit BFE on the relevant half.
; Extract the high bit of the low half
; GCN-LABEL: {{^}}v_uextract_bit_31_i64:
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]]
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v[[[SHIFT]]:[[ZERO]]]
define amdgpu_kernel void @v_uextract_bit_31_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
; GCN-LABEL: v_uextract_bit_31_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_ashr_i32 s3, s2, 31
; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_mov_b32 s11, 0xf000
; GCN-NEXT: s_mov_b32 s10, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b64 s[8:9], s[6:7]
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
; GCN-NEXT: s_mov_b64 s[6:7], s[10:11]
; GCN-NEXT: v_mov_b32_e32 v3, 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshrrev_b32_e32 v2, 31, v2
; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_endpgm
%id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
%out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x
@@ -21,13 +35,24 @@ define amdgpu_kernel void @v_uextract_bit_31_i64(ptr addrspace(1) %out, ptr addr
}
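; A sketch of the source pattern (hypothetical function; the real tests derive
; the address from a workitem/workgroup id): a single-bit extract from an i64,
; (x >> k) & 1, which the checks show lowering to a 32-bit shift or BFE on
; whichever dword holds bit k, with the other half of the result tied to zero.
define i64 @uextract_bit_sketch(i64 %x) {
  %shr = lshr i64 %x, 31
  %bit = and i64 %shr, 1 ; bit 31 lives in the low dword
  ret i64 %bit
}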
; Extract the high bit of the high half
; GCN-LABEL: {{^}}v_uextract_bit_63_i64:
; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]]
; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]]
; GCN: buffer_store_dwordx2 v[[[SHIFT]]:[[ZERO1]]]
define amdgpu_kernel void @v_uextract_bit_63_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
; GCN-LABEL: v_uextract_bit_63_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b64 s[8:9], s[2:3]
; GCN-NEXT: s_mov_b64 s[10:11], s[6:7]
; GCN-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4
; GCN-NEXT: s_mov_b64 s[4:5], s[0:1]
; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshrrev_b32_e32 v2, 31, v2
; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_endpgm
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
%out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x
@@ -38,12 +63,25 @@ define amdgpu_kernel void @v_uextract_bit_63_i64(ptr addrspace(1) %out, ptr addr
ret void
}
; GCN-LABEL: {{^}}v_uextract_bit_1_i64:
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 1
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v[[[BFE]]:[[ZERO]]]
define amdgpu_kernel void @v_uextract_bit_1_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
; GCN-LABEL: v_uextract_bit_1_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_ashr_i32 s3, s2, 31
; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_mov_b32 s11, 0xf000
; GCN-NEXT: s_mov_b32 s10, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b64 s[8:9], s[6:7]
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
; GCN-NEXT: s_mov_b64 s[6:7], s[10:11]
; GCN-NEXT: v_mov_b32_e32 v3, 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_bfe_u32 v2, v2, 1, 1
; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_endpgm
%id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
%out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x
@@ -54,12 +92,25 @@ define amdgpu_kernel void @v_uextract_bit_1_i64(ptr addrspace(1) %out, ptr addrs
ret void
}
; GCN-LABEL: {{^}}v_uextract_bit_20_i64:
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 20, 1
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v[[[BFE]]:[[ZERO]]]
define amdgpu_kernel void @v_uextract_bit_20_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
; GCN-LABEL: v_uextract_bit_20_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_ashr_i32 s3, s2, 31
; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_mov_b32 s11, 0xf000
; GCN-NEXT: s_mov_b32 s10, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b64 s[8:9], s[6:7]
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
; GCN-NEXT: s_mov_b64 s[6:7], s[10:11]
; GCN-NEXT: v_mov_b32_e32 v3, 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_bfe_u32 v2, v2, 20, 1
; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_endpgm
%id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
%out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x
@ -70,13 +121,24 @@ define amdgpu_kernel void @v_uextract_bit_20_i64(ptr addrspace(1) %out, ptr addr
ret void
}
; GCN-LABEL: {{^}}v_uextract_bit_32_i64:
; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 1, [[VAL]]
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]]{{$}}
; GCN: buffer_store_dwordx2 v[[[AND]]:[[ZERO1]]]
define amdgpu_kernel void @v_uextract_bit_32_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
; GCN-LABEL: v_uextract_bit_32_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b64 s[8:9], s[2:3]
; GCN-NEXT: s_mov_b64 s[10:11], s[6:7]
; GCN-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4
; GCN-NEXT: s_mov_b64 s[4:5], s[0:1]
; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v2, 1, v2
; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_endpgm
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
%out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x
@ -87,13 +149,24 @@ define amdgpu_kernel void @v_uextract_bit_32_i64(ptr addrspace(1) %out, ptr addr
ret void
}
; GCN-LABEL: {{^}}v_uextract_bit_33_i64:
; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 1{{$}}
; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]]
; GCN: buffer_store_dwordx2 v[[[BFE]]:[[ZERO1]]]
define amdgpu_kernel void @v_uextract_bit_33_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
; GCN-LABEL: v_uextract_bit_33_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b64 s[8:9], s[2:3]
; GCN-NEXT: s_mov_b64 s[10:11], s[6:7]
; GCN-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4
; GCN-NEXT: s_mov_b64 s[4:5], s[0:1]
; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_bfe_u32 v2, v2, 1, 1
; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_endpgm
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
%out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x
@ -104,12 +177,25 @@ define amdgpu_kernel void @v_uextract_bit_33_i64(ptr addrspace(1) %out, ptr addr
ret void
}
; GCN-LABEL: {{^}}v_uextract_bit_20_21_i64:
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 20, 2
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v[[[BFE]]:[[ZERO]]]
define amdgpu_kernel void @v_uextract_bit_20_21_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
; GCN-LABEL: v_uextract_bit_20_21_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_ashr_i32 s3, s2, 31
; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_mov_b32 s11, 0xf000
; GCN-NEXT: s_mov_b32 s10, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b64 s[8:9], s[6:7]
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
; GCN-NEXT: s_mov_b64 s[6:7], s[10:11]
; GCN-NEXT: v_mov_b32_e32 v3, 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_bfe_u32 v2, v2, 20, 2
; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_endpgm
%id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
%out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x
@ -120,12 +206,25 @@ define amdgpu_kernel void @v_uextract_bit_20_21_i64(ptr addrspace(1) %out, ptr a
ret void
}
; GCN-LABEL: {{^}}v_uextract_bit_1_30_i64:
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 30
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v[[[BFE]]:[[ZERO]]]
define amdgpu_kernel void @v_uextract_bit_1_30_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
; GCN-LABEL: v_uextract_bit_1_30_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_ashr_i32 s3, s2, 31
; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_mov_b32 s11, 0xf000
; GCN-NEXT: s_mov_b32 s10, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b64 s[8:9], s[6:7]
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
; GCN-NEXT: s_mov_b64 s[6:7], s[10:11]
; GCN-NEXT: v_mov_b32_e32 v3, 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_bfe_u32 v2, v2, 1, 30
; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_endpgm
%id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
%out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x
@ -136,12 +235,25 @@ define amdgpu_kernel void @v_uextract_bit_1_30_i64(ptr addrspace(1) %out, ptr ad
ret void
}
; GCN-LABEL: {{^}}v_uextract_bit_1_31_i64:
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 1, [[VAL]]
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v[[[SHIFT]]:[[ZERO]]]
define amdgpu_kernel void @v_uextract_bit_1_31_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
; GCN-LABEL: v_uextract_bit_1_31_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_ashr_i32 s3, s2, 31
; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_mov_b32 s11, 0xf000
; GCN-NEXT: s_mov_b32 s10, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b64 s[8:9], s[6:7]
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64
; GCN-NEXT: s_mov_b64 s[6:7], s[10:11]
; GCN-NEXT: v_mov_b32_e32 v3, 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshrrev_b32_e32 v2, 1, v2
; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_endpgm
%id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
%out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x
@ -154,13 +266,26 @@ define amdgpu_kernel void @v_uextract_bit_1_31_i64(ptr addrspace(1) %out, ptr ad
; Spans the dword boundary, so requires a full 64-bit shift.
; Truncated after the shift, so only the low half of the shift result is used.
; GCN-LABEL: {{^}}v_uextract_bit_31_32_i64:
; GCN: buffer_load_dwordx2 v[[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]]
; GCN: v_alignbit_b32 v[[SHRLO:[0-9]+]], v[[VALHI]], v[[VALLO]], 31
; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 3, v[[SHRLO]]{{$}}
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v[[[AND]]:[[ZERO]]]
define amdgpu_kernel void @v_uextract_bit_31_32_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
; GCN-LABEL: v_uextract_bit_31_32_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_ashr_i32 s3, s2, 31
; GCN-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: s_mov_b32 s11, 0xf000
; GCN-NEXT: s_mov_b32 s10, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b64 s[8:9], s[6:7]
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64
; GCN-NEXT: s_mov_b64 s[6:7], s[10:11]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_alignbit_b32 v2, v3, v2, 31
; GCN-NEXT: v_and_b32_e32 v2, 3, v2
; GCN-NEXT: v_mov_b32_e32 v3, 0
; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_endpgm
%id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
%out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x
@ -171,13 +296,24 @@ define amdgpu_kernel void @v_uextract_bit_31_32_i64(ptr addrspace(1) %out, ptr a
ret void
}
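; Illustrative IR for the dword-spanning shape above (a sketch with a made-up
; name, not a checked test): bits 31 and 32 straddle the two 32-bit halves, so
; both dwords are loaded and combined with a funnel shift (v_alignbit_b32 in
; the checks) before the 2-bit mask is applied; the high half of the result is
; a constant zero.
;   define i64 @uextract_bit_31_32_sketch(i64 %x) {
;     %shr = lshr i64 %x, 31   ; shift crosses the dword boundary
;     %bits = and i64 %shr, 3  ; keep bits 31 and 32
;     ret i64 %bits
;   }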
; GCN-LABEL: {{^}}v_uextract_bit_32_33_i64:
; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 2
; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]]
; GCN: buffer_store_dwordx2 v[[[BFE]]:[[ZERO1]]]
define amdgpu_kernel void @v_uextract_bit_32_33_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
; GCN-LABEL: v_uextract_bit_32_33_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b64 s[8:9], s[2:3]
; GCN-NEXT: s_mov_b64 s[10:11], s[6:7]
; GCN-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4
; GCN-NEXT: s_mov_b64 s[4:5], s[0:1]
; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_bfe_u32 v2, v2, 1, 2
; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_endpgm
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
%out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x
@ -188,14 +324,24 @@ define amdgpu_kernel void @v_uextract_bit_32_33_i64(ptr addrspace(1) %out, ptr a
ret void
}
; GCN-LABEL: {{^}}v_uextract_bit_30_60_i64:
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_load_dwordx2 v[[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]]
; GCN: v_alignbit_b32 v[[SHRLO:[0-9]+]], v[[VALHI]], v[[VALLO]], 30
; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 0x3fffffff, v[[SHRLO]]{{$}}
; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]]
; GCN: buffer_store_dwordx2 v[[[AND]]:[[ZERO1]]]
define amdgpu_kernel void @v_uextract_bit_30_60_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
; GCN-LABEL: v_uextract_bit_30_60_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
; GCN-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_alignbit_b32 v2, v3, v2, 30
; GCN-NEXT: v_and_b32_e32 v2, 0x3fffffff, v2
; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; GCN-NEXT: s_endpgm
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
%out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x
@ -206,13 +352,24 @@ define amdgpu_kernel void @v_uextract_bit_30_60_i64(ptr addrspace(1) %out, ptr a
ret void
}
; GCN-LABEL: {{^}}v_uextract_bit_33_63_i64:
; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 30
; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]]
; GCN: buffer_store_dwordx2 v[[[BFE]]:[[ZERO1]]]
define amdgpu_kernel void @v_uextract_bit_33_63_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
; GCN-LABEL: v_uextract_bit_33_63_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b64 s[8:9], s[2:3]
; GCN-NEXT: s_mov_b64 s[10:11], s[6:7]
; GCN-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4
; GCN-NEXT: s_mov_b64 s[4:5], s[0:1]
; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_bfe_u32 v2, v2, 1, 30
; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_endpgm
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
%out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x
@ -223,12 +380,25 @@ define amdgpu_kernel void @v_uextract_bit_33_63_i64(ptr addrspace(1) %out, ptr a
ret void
}
; GCN-LABEL: {{^}}v_uextract_bit_31_63_i64:
; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_load_dwordx2 v[[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]]
; GCN: v_alignbit_b32 v[[SHRLO:[0-9]+]], v[[VALHI]], v[[VALLO]], 31
; GCN: buffer_store_dwordx2 v[[[SHRLO]]:[[ZERO]]]
define amdgpu_kernel void @v_uextract_bit_31_63_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
; GCN-LABEL: v_uextract_bit_31_63_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s10, 0
; GCN-NEXT: s_mov_b32 s11, s7
; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b64 s[8:9], s[2:3]
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_alignbit_b32 v0, v3, v2, 31
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NEXT: s_endpgm
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
%out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x
@ -240,11 +410,23 @@ define amdgpu_kernel void @v_uextract_bit_31_63_i64(ptr addrspace(1) %out, ptr a
}
; trunc applied before the 'and' mask
; GCN-LABEL: {{^}}v_uextract_bit_31_i64_trunc_i32:
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]]
; GCN: buffer_store_dword v[[SHIFT]]
define amdgpu_kernel void @v_uextract_bit_31_i64_trunc_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
; GCN-LABEL: v_uextract_bit_31_i64_trunc_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
; GCN-NEXT: buffer_load_dword v3, v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshrrev_b32_e32 v0, 31, v3
; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
; GCN-NEXT: s_endpgm
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
%out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
@ -256,11 +438,23 @@ define amdgpu_kernel void @v_uextract_bit_31_i64_trunc_i32(ptr addrspace(1) %out
ret void
}
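; A sketch of the trunc-before-mask shape tested above (illustrative only,
; name made up): because only an i32 result is kept, just the low half of the
; 64-bit shift is needed, which folds to a single 32-bit shift of the loaded
; low dword; the mask is then redundant, so no v_and appears in the checks.
;   define i32 @uextract_bit_31_trunc_sketch(i64 %x) {
;     %shr = lshr i64 %x, 31
;     %t = trunc i64 %shr to i32   ; trunc applied before the mask
;     %bit = and i32 %t, 1         ; folds away after the 31-bit shift
;     ret i32 %bit
;   }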
; GCN-LABEL: {{^}}v_uextract_bit_3_i64_trunc_i32:
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN: v_bfe_u32 [[BFE:v[0-9]+]], [[VAL]], 3, 1{{$}}
; GCN: buffer_store_dword [[BFE]]
define amdgpu_kernel void @v_uextract_bit_3_i64_trunc_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
; GCN-LABEL: v_uextract_bit_3_i64_trunc_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
; GCN-NEXT: buffer_load_dword v3, v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_bfe_u32 v0, v3, 3, 1
; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
; GCN-NEXT: s_endpgm
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
%out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
@ -272,11 +466,24 @@ define amdgpu_kernel void @v_uextract_bit_3_i64_trunc_i32(ptr addrspace(1) %out,
ret void
}
; GCN-LABEL: {{^}}v_uextract_bit_33_i64_trunc_i32:
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; GCN: v_bfe_u32 [[BFE:v[0-9]+]], [[VAL]], 1, 1{{$}}
; GCN: buffer_store_dword [[BFE]]
define amdgpu_kernel void @v_uextract_bit_33_i64_trunc_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
; GCN-LABEL: v_uextract_bit_33_i64_trunc_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b64 s[8:9], s[2:3]
; GCN-NEXT: s_mov_b64 s[10:11], s[6:7]
; GCN-NEXT: buffer_load_dword v3, v[1:2], s[8:11], 0 addr64 offset:4
; GCN-NEXT: s_mov_b64 s[4:5], s[0:1]
; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_bfe_u32 v0, v3, 1, 1
; GCN-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_endpgm
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
%out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
@ -288,13 +495,24 @@ define amdgpu_kernel void @v_uextract_bit_33_i64_trunc_i32(ptr addrspace(1) %out
ret void
}
; GCN-LABEL: {{^}}v_uextract_bit_31_32_i64_trunc_i32:
; GCN: buffer_load_dwordx2 v[[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]]
; GCN: v_alignbit_b32 v[[SHRLO:[0-9]+]], v[[VALHI]], v[[VALLO]], 31
; GCN-NEXT: v_and_b32_e32 v[[SHRLO]], 3, v[[SHRLO]]
; GCN-NOT: v[[SHRLO]]
; GCN: buffer_store_dword v[[SHRLO]]
define amdgpu_kernel void @v_uextract_bit_31_32_i64_trunc_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
; GCN-LABEL: v_uextract_bit_31_32_i64_trunc_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
; GCN-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_alignbit_b32 v0, v4, v3, 31
; GCN-NEXT: v_and_b32_e32 v0, 3, v0
; GCN-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
; GCN-NEXT: s_endpgm
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
%out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %id.x
@ -306,16 +524,24 @@ define amdgpu_kernel void @v_uextract_bit_31_32_i64_trunc_i32(ptr addrspace(1) %
ret void
}
; GCN-LABEL: {{^}}and_not_mask_i64:
; GCN-DAG: buffer_load_dword v[[VAL:[0-9]+]]
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN-DAG: v_mov_b32_e32 v[[SHRHI:[0-9]+]], v[[ZERO]]{{$}}
; GCN: v_lshrrev_b32_e32 [[SHR:v[0-9]+]], 20, v[[VAL]]
; GCN-DAG: v_and_b32_e32 v[[SHRLO:[0-9]+]], 4, [[SHR]]
; GCN-NOT: v[[SHRLO]]
; GCN-NOT: v[[SHRHI]]
; GCN: buffer_store_dwordx2 v[[[SHRLO]]:[[SHRHI]]]
define amdgpu_kernel void @and_not_mask_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
; GCN-LABEL: and_not_mask_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, 0
; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b64 s[4:5], s[2:3]
; GCN-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; GCN-NEXT: s_mov_b64 s[2:3], s[6:7]
; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshrrev_b32_e32 v2, 20, v2
; GCN-NEXT: v_and_b32_e32 v2, 4, v2
; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; GCN-NEXT: s_endpgm
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
%out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x
@ -328,15 +554,29 @@ define amdgpu_kernel void @and_not_mask_i64(ptr addrspace(1) %out, ptr addrspace
; The instruction count is the same with/without hasOneUse, but
; keeping the 32-bit 'and' has a smaller encoding size than the bfe.
; GCN-LABEL: {{^}}v_uextract_bit_27_29_multi_use_shift_i64:
; GCN-DAG: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
; GCN-DAG: v_lshr_b64 v[[[SHRLO:[0-9]+]]:[[SHRHI:[0-9]+]]], [[VAL]], 27
; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 3, v[[SHRLO]]
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v[[[SHRLO]]:[[SHRHI]]]
; GCN: buffer_store_dwordx2 v[[[AND]]:[[ZERO]]]
define amdgpu_kernel void @v_uextract_bit_27_29_multi_use_shift_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
; GCN-LABEL: v_uextract_bit_27_29_multi_use_shift_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s10, 0
; GCN-NEXT: s_mov_b32 s11, s7
; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b64 s[8:9], s[2:3]
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[8:11], 0 addr64
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshr_b64 v[2:3], v[2:3], 27
; GCN-NEXT: v_and_b32_e32 v0, 3, v2
; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_endpgm
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
%out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x
@ -348,15 +588,30 @@ define amdgpu_kernel void @v_uextract_bit_27_29_multi_use_shift_i64(ptr addrspac
ret void
}
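; Illustrative IR for the multi-use case above (a sketch, not a checked test;
; names are made up): the second use forces the full 64-bit shift to be
; materialized anyway (v_lshr_b64 in the checks), so the extract reuses its
; low half with a 32-bit 'and' instead of emitting a separate bfe.
;   define void @uextract_multi_use_sketch(i64 %x, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
;     %shr = lshr i64 %x, 27
;     store volatile i64 %shr, ptr addrspace(1) %out0  ; second use of the shift
;     %bits = and i64 %shr, 3
;     store volatile i64 %bits, ptr addrspace(1) %out1
;     ret void
;   }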
; GCN-LABEL: {{^}}v_uextract_bit_34_37_multi_use_shift_i64:
; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; GCN-DAG: v_mov_b32_e32 v[[ZERO_SHR:[0-9]+]], 0{{$}}
; GCN: v_mov_b32_e32 v[[ZERO_BFE:[0-9]+]], v[[ZERO_SHR]]
; GCN-DAG: v_lshrrev_b32_e32 v[[SHR:[0-9]+]], 2, [[VAL]]
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 2, 3
; GCN-DAG: buffer_store_dwordx2 v[[[SHR]]:[[ZERO_SHR]]]
; GCN: buffer_store_dwordx2 v[[[BFE]]:[[ZERO_BFE]]]
define amdgpu_kernel void @v_uextract_bit_34_37_multi_use_shift_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
; GCN-LABEL: v_uextract_bit_34_37_multi_use_shift_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_mov_b32 s10, 0
; GCN-NEXT: s_mov_b32 s11, s7
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b64 s[8:9], s[2:3]
; GCN-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: v_mov_b32_e32 v3, v1
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshrrev_b32_e32 v0, 2, v2
; GCN-NEXT: v_bfe_u32 v2, v2, 2, 3
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_endpgm
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
%out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %id.x
@ -368,13 +623,32 @@ define amdgpu_kernel void @v_uextract_bit_34_37_multi_use_shift_i64(ptr addrspac
ret void
}
; GCN-LABEL: {{^}}v_uextract_bit_33_36_use_upper_half_shift_i64:
; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 3
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v[[[BFE]]:{{[0-9]+\]}}
; GCN: buffer_store_dword v[[ZERO]]
define amdgpu_kernel void @v_uextract_bit_33_36_use_upper_half_shift_i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %in) #1 {
; GCN-LABEL: v_uextract_bit_33_36_use_upper_half_shift_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
; GCN-NEXT: s_mov_b32 s2, 0
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: s_mov_b64 s[6:7], s[2:3]
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: buffer_load_dword v5, v[1:2], s[4:7], 0 addr64 offset:4
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: v_mov_b32_e32 v6, v2
; GCN-NEXT: s_mov_b64 s[10:11], s[2:3]
; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0
; GCN-NEXT: v_mov_b32_e32 v4, v2
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b64 s[0:1], s[4:5]
; GCN-NEXT: s_mov_b64 s[8:9], s[6:7]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_bfe_u32 v5, v5, 1, 3
; GCN-NEXT: buffer_store_dwordx2 v[5:6], v[1:2], s[0:3], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_dword v2, v[3:4], s[8:11], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_endpgm
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %id.x
%out0.gep = getelementptr i64, ptr addrspace(1) %out0, i32 %id.x


@ -1,51 +1,150 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=cypress < %s | FileCheck -enable-var-scope -check-prefix=EG %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=EG %s
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
define amdgpu_kernel void @trunc_i64_to_i32_store(ptr addrspace(1) %out, [8 x i32], i64 %in) {
; GCN-LABEL: {{^}}trunc_i64_to_i32_store:
; GCN: s_load_dword [[SLOAD:s[0-9]+]], s[4:5],
; GCN: v_mov_b32_e32 [[VLOAD:v[0-9]+]], [[SLOAD]]
; SI: buffer_store_dword [[VLOAD]]
; VI: flat_store_dword v[{{[0-9:]+}}], [[VLOAD]]
; EG-LABEL: {{^}}trunc_i64_to_i32_store:
; EG: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG: LSHR
; EG-NEXT: 2(
; SI-LABEL: trunc_i64_to_i32_store:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s6, s[4:5], 0x13
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: trunc_i64_to_i32_store:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_load_dword s2, s[4:5], 0x4c
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; EG-LABEL: trunc_i64_to_i32_store:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT: MOV * T1.X, KC0[4].W,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%result = trunc i64 %in to i32
store i32 %result, ptr addrspace(1) %out, align 4
ret void
}
; GCN-LABEL: {{^}}trunc_load_shl_i64:
; GCN-DAG: s_load_dwordx2
; GCN-DAG: s_load_dword [[SREG:s[0-9]+]],
; GCN: s_lshl_b32 [[SHL:s[0-9]+]], [[SREG]], 2
; GCN: v_mov_b32_e32 [[VSHL:v[0-9]+]], [[SHL]]
; SI: buffer_store_dword [[VSHL]]
; VI: flat_store_dword v[{{[0-9:]+}}], [[VSHL]]
define amdgpu_kernel void @trunc_load_shl_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) {
; SI-LABEL: trunc_load_shl_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s6, s[4:5], 0x13
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshl_b32 s4, s6, 2
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: trunc_load_shl_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[4:5], 0x4c
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshl_b32 s2, s2, 2
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; EG-LABEL: trunc_load_shl_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT: LSHL * T1.X, KC0[4].W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%b = shl i64 %a, 2
%result = trunc i64 %b to i32
store i32 %result, ptr addrspace(1) %out, align 4
ret void
}
; GCN-LABEL: {{^}}trunc_shl_i64:
; SI: s_load_dwordx2 s[[[LO_SREG:[0-9]+]]:{{[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
; VI: s_load_dwordx2 s[[[LO_SREG:[0-9]+]]:{{[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34
; GCN: s_lshl_b64 s[[[LO_SHL:[0-9]+]]:{{[0-9]+\]}}, s[[[LO_SREG]]:{{[0-9]+\]}}, 2
; GCN: s_add_u32 s[[LO_SREG2:[0-9]+]], s[[LO_SHL]],
; GCN: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG2]]
; SI: buffer_store_dword v[[LO_VREG]],
; VI: flat_store_dword v[{{[0-9:]+}}], v[[LO_VREG]]
; GCN: v_mov_b32_e32
; GCN: v_mov_b32_e32
define amdgpu_kernel void @trunc_shl_i64(ptr addrspace(1) %out2, ptr addrspace(1) %out, i64 %a) {
; SI-LABEL: trunc_shl_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2
; SI-NEXT: s_add_u32 s8, s8, 0x3a8
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_mov_b32 s0, s2
; SI-NEXT: s_mov_b32 s1, s3
; SI-NEXT: s_mov_b32 s2, s6
; SI-NEXT: s_mov_b32 s3, s7
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: s_addc_u32 s9, s9, 0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s8
; SI-NEXT: v_mov_b32_e32 v1, s9
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: trunc_shl_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_lshl_b64 s[0:1], s[4:5], 2
; VI-NEXT: s_add_u32 s0, s0, 0x3a8
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: flat_store_dword v[2:3], v4
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; EG-LABEL: trunc_shl_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 10, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XY, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHL * T0.W, KC0[2].W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: BIT_ALIGN_INT T1.W, KC0[3].X, KC0[2].W, literal.x,
; EG-NEXT: ADDC_UINT * T2.W, PV.W, literal.y,
; EG-NEXT: 30(4.203895e-44), 936(1.311615e-42)
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT: ADD_INT T1.Y, PV.W, PS,
; EG-NEXT: ADD_INT * T1.X, T0.W, literal.y,
; EG-NEXT: 2(2.802597e-45), 936(1.311615e-42)
; EG-NEXT: LSHR * T2.X, KC0[2].Z, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%aa = add i64 %a, 234 ; Prevent shrinking store.
%b = shl i64 %aa, 2
%result = trunc i64 %b to i32
@ -54,9 +153,55 @@ define amdgpu_kernel void @trunc_shl_i64(ptr addrspace(1) %out2, ptr addrspace(1
ret void
}
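; Per the "Prevent shrinking store" note above, the add keeps the full i64
; value live; the trunc-of-shl itself still folds to a 32-bit shift, as in
; this illustrative sketch (not a checked test; the name is made up):
;   define void @trunc_shl_sketch(ptr addrspace(1) %out, i64 %a) {
;     %b = shl i64 %a, 2
;     %r = trunc i64 %b to i32   ; equivalent to shl i32 (trunc %a), 2
;     store i32 %r, ptr addrspace(1) %out
;     ret void
;   }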
; GCN-LABEL: {{^}}trunc_i32_to_i1:
; GCN: v_and_b32_e32 [[VREG:v[0-9]+]], 1, v{{[0-9]+}}
define amdgpu_kernel void @trunc_i32_to_i1(ptr addrspace(1) %out, ptr addrspace(1) %ptr) {
; SI-LABEL: trunc_i32_to_i1:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s10, s6
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v0, 1, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: trunc_i32_to_i1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_dword v2, v[0:1]
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_and_b32_e32 v2, 1, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; EG-LABEL: trunc_i32_to_i1:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: AND_INT T0.X, T0.X, 1,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%a = load i32, ptr addrspace(1) %ptr, align 4
%trunc = trunc i32 %a to i1
%result = select i1 %trunc, i32 1, i32 0
@ -64,9 +209,64 @@ define amdgpu_kernel void @trunc_i32_to_i1(ptr addrspace(1) %out, ptr addrspace(
ret void
}
; GCN-LABEL: {{^}}trunc_i8_to_i1:
; GCN: v_and_b32_e32 [[VREG:v[0-9]+]], 1, v{{[0-9]+}}
define amdgpu_kernel void @trunc_i8_to_i1(ptr addrspace(1) %out, ptr addrspace(1) %ptr) {
; SI-LABEL: trunc_i8_to_i1:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s10, s6
; SI-NEXT: s_mov_b32 s11, s7
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s2
; SI-NEXT: s_mov_b32 s9, s3
; SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v0, 1, v0
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: trunc_i8_to_i1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_ubyte v2, v[0:1]
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_and_b32_e32 v2, 1, v2
; VI-NEXT: flat_store_byte v[0:1], v2
; VI-NEXT: s_endpgm
;
; EG-LABEL: trunc_i8_to_i1:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x,
; EG-NEXT: AND_INT * T1.W, T0.X, 1,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: LSHL T0.X, T1.W, PV.W,
; EG-NEXT: LSHL * T0.W, literal.x, PV.W,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT: MOV T0.Y, 0.0,
; EG-NEXT: MOV * T0.Z, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%a = load i8, ptr addrspace(1) %ptr, align 4
%trunc = trunc i8 %a to i1
%result = select i1 %trunc, i8 1, i8 0
@ -74,43 +274,213 @@ define amdgpu_kernel void @trunc_i8_to_i1(ptr addrspace(1) %out, ptr addrspace(1
ret void
}
; GCN-LABEL: {{^}}sgpr_trunc_i16_to_i1:
; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1
define amdgpu_kernel void @sgpr_trunc_i16_to_i1(ptr addrspace(1) %out, i16 %a) {
; SI-LABEL: sgpr_trunc_i16_to_i1:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s6, s[4:5], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_and_b32 s4, s6, 1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: sgpr_trunc_i16_to_i1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s2, s2, 1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_endpgm
;
; EG-LABEL: sgpr_trunc_i16_to_i1:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_16 T0.X, T0.X, 40, #3
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, 0.0,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x,
; EG-NEXT: AND_INT * T1.W, T0.X, 1,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: LSHL T0.X, T1.W, PV.W,
; EG-NEXT: LSHL * T0.W, literal.x, PV.W,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: MOV T0.Y, 0.0,
; EG-NEXT: MOV * T0.Z, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%trunc = trunc i16 %a to i1
%result = select i1 %trunc, i16 1, i16 0
store i16 %result, ptr addrspace(1) %out, align 4
ret void
}
; GCN-LABEL: {{^}}sgpr_trunc_i32_to_i1:
; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1
define amdgpu_kernel void @sgpr_trunc_i32_to_i1(ptr addrspace(1) %out, i32 %a) {
; SI-LABEL: sgpr_trunc_i32_to_i1:
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s6, s[4:5], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_and_b32 s4, s6, 1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: sgpr_trunc_i32_to_i1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[4:5], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s2, s2, 1
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; EG-LABEL: sgpr_trunc_i32_to_i1:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT: AND_INT * T1.X, KC0[2].Z, 1,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%trunc = trunc i32 %a to i1
%result = select i1 %trunc, i32 1, i32 0
store i32 %result, ptr addrspace(1) %out, align 4
ret void
}
; GCN-LABEL: {{^}}s_trunc_i64_to_i1:
; SI: s_load_dwordx2 s[[[SLO:[0-9]+]]:{{[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x13
; VI: s_load_dwordx2 s[[[SLO:[0-9]+]]:{{[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x4c
; GCN: s_bitcmp1_b32 s[[SLO]], 0
; GCN: s_cselect_b32 {{s[0-9]+}}, 63, -12
define amdgpu_kernel void @s_trunc_i64_to_i1(ptr addrspace(1) %out, [8 x i32], i64 %x) {
; SI-LABEL: s_trunc_i64_to_i1:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bitcmp1_b32 s6, 0
; SI-NEXT: s_cselect_b32 s4, 63, -12
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_trunc_i64_to_i1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x4c
; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitcmp1_b32 s0, 0
; VI-NEXT: s_cselect_b32 s0, 63, -12
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; EG-LABEL: s_trunc_i64_to_i1:
; EG: ; %bb.0:
; EG-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: ALU clause starting at 4:
; EG-NEXT: MOV T0.W, literal.x,
; EG-NEXT: AND_INT * T1.W, KC0[4].W, 1,
; EG-NEXT: 63(8.828180e-44), 0(0.000000e+00)
; EG-NEXT: CNDE_INT T0.X, PS, literal.x, PV.W,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: -12(nan), 2(2.802597e-45)
%trunc = trunc i64 %x to i1
%sel = select i1 %trunc, i32 63, i32 -12
store i32 %sel, ptr addrspace(1) %out
ret void
}
; GCN-LABEL: {{^}}v_trunc_i64_to_i1:
; SI: buffer_load_dwordx2 v[[[VLO:[0-9]+]]:{{[0-9]+\]}}
; VI: flat_load_dwordx2 v[[[VLO:[0-9]+]]:{{[0-9]+\]}}
; GCN: v_and_b32_e32 [[MASKED:v[0-9]+]], 1, v[[VLO]]
; GCN: v_cmp_eq_u32_e32 vcc, 1, [[MASKED]]
; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, -12, 63, vcc
define amdgpu_kernel void @v_trunc_i64_to_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: v_trunc_i64_to_i1:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; SI-NEXT: v_mov_b32_e32 v2, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
; SI-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v0, 1, v3
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; SI-NEXT: v_cndmask_b32_e64 v0, -12, 63, vcc
; SI-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_trunc_i64_to_i1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc
; VI-NEXT: flat_load_dwordx2 v[1:2], v[1:2]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_and_b32_e32 v0, 1, v1
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; VI-NEXT: v_cndmask_b32_e64 v0, -12, 63, vcc
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_trunc_i64_to_i1:
; EG: ; %bb.0:
; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 8, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T1.X, KC0[2].Z, PV.W,
; EG-NEXT: ALU clause starting at 11:
; EG-NEXT: LSHL T0.Z, T0.X, literal.x,
; EG-NEXT: AND_INT T0.W, T1.X, 1, BS:VEC_120/SCL_212
; EG-NEXT: MOV * T1.W, literal.y,
; EG-NEXT: 2(2.802597e-45), 63(8.828180e-44)
; EG-NEXT: CNDE_INT T0.X, PV.W, literal.x, PS,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, PV.Z,
; EG-NEXT: -12(nan), 0(0.000000e+00)
; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
%out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %tid
@ -121,3 +491,5 @@ define amdgpu_kernel void @v_trunc_i64_to_i1(ptr addrspace(1) %out, ptr addrspac
store i32 %sel, ptr addrspace(1) %out.gep
ret void
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GCN: {{.*}}