AMDGPU: Define agpr versions of ds permute instructions (#156695)

Correctly model these without AV_* operands. This is another
step towards removing the special casing in
TargetInstrInfo::getRegClass. Also add some tests for this.
This commit is contained in:
Matt Arsenault 2025-09-04 15:13:59 +09:00 committed by GitHub
parent 573627fbc7
commit 76cb5fcfb6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 365 additions and 18 deletions

View File

@ -520,6 +520,19 @@ class DS_1A1D_PERMUTE <string opName, SDPatternOperator node = null_frag,
let has_gds = 0;
}
// Instantiates a DS_1A1D_PERMUTE instruction twice:
//   * NAME       - uses data_op and carries the selection pattern `node`.
//   * NAME_agpr  - uses the AGPR operand equivalent to data_op, has no
//                  selection pattern (null_frag), and is only available when
//                  isGFX90APlus holds.
// data_op must be a VGPR operand; the assert below enforces this.
multiclass DS_1A1D_PERMUTE_mc <string opName,
                               SDPatternOperator node = null_frag,
                               RegisterOperand data_op = VGPROp_32> {
  assert OperandIsVGPR<data_op>.ret,
         "DS with 2 data operands should be declared with VGPRs";

  // Base form: the record name is exactly the defm name.
  def "" : DS_1A1D_PERMUTE<opName, node, data_op>;

  // AGPR form of the same instruction.
  let SubtargetPredicate = isGFX90APlus in
  def _agpr : DS_1A1D_PERMUTE<opName, null_frag,
                              getEquivalentAGPROperand<data_op>.ret>;
}
class DSAtomicRetPat<DS_Pseudo inst, ValueType vt, PatFrag frag, int complexity = 0,
bit gds=0> : GCNPat <(frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value),
(inst $ptr, getVregSrcForVT<vt>.ret:$value, Offset:$offset, (i1 gds))> {
@ -837,10 +850,10 @@ def DS_NOP : DS_VOID<"ds_nop">;
let SubtargetPredicate = isGFX8Plus in {
let Uses = [EXEC] in {
// NOTE(review): DS_PERMUTE_B32 and DS_BPERMUTE_B32 each appear twice below,
// once as a plain `def` of DS_1A1D_PERMUTE and once as a `defm` of
// DS_1A1D_PERMUTE_mc. This looks like the before/after sides of a diff hunk
// whose +/- markers were lost; presumably only the `defm` forms remain in the
// real file -- confirm against the actual source before relying on this text.
def DS_PERMUTE_B32 : DS_1A1D_PERMUTE <"ds_permute_b32",
int_amdgcn_ds_permute>;
def DS_BPERMUTE_B32 : DS_1A1D_PERMUTE <"ds_bpermute_b32",
int_amdgcn_ds_bpermute>;
// The multiclass form defines the base record (with the intrinsic as its
// selection pattern) plus an `_agpr` record guarded by isGFX90APlus.
defm DS_PERMUTE_B32 : DS_1A1D_PERMUTE_mc<"ds_permute_b32",
int_amdgcn_ds_permute>;
defm DS_BPERMUTE_B32 : DS_1A1D_PERMUTE_mc<"ds_bpermute_b32",
int_amdgcn_ds_bpermute>;
}
} // let SubtargetPredicate = isGFX8Plus

View File

@ -0,0 +1,334 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s
; Try to stress ds.bpermute and ds.permute instructions with AGPR/AV
; inputs. It's not permissible to mix AGPR and VGPR data operands.
; ds.bpermute with both inputs defined in AGPRs ("=a") and the result consumed
; from an AGPR ("a"). The checks expect both inputs to be copied into VGPRs
; with v_accvgpr_read_b32 and the result to be copied back with
; v_accvgpr_write_b32.
define void @ds_bpermute_b32_a_a__use_a(ptr addrspace(3) %lds) #0 {
; CHECK-LABEL: ds_bpermute_b32_a_a__use_a:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def a0
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def a1
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: v_accvgpr_read_b32 v0, a0
; CHECK-NEXT: v_accvgpr_read_b32 v1, a1
; CHECK-NEXT: ds_bpermute_b32 v0, v0, v1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use a0
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_setpc_b64 s[30:31]
  %src0 = call i32 asm "; def $0", "=a"()
  %src1 = call i32 asm "; def $0", "=a"()
  %result = call i32 @llvm.amdgcn.ds.bpermute(i32 %src0, i32 %src1)
  call void asm sideeffect "; use $0", "a"(i32 %result)
  ret void
}
; ds.bpermute with the first input in a VGPR ("=v"), the second in an AGPR
; ("=a"), and the result consumed from an AGPR ("a"). The checks expect only
; the AGPR input to need a v_accvgpr_read_b32 copy.
define void @ds_bpermute_b32_v_a__use_a(ptr addrspace(3) %lds) #0 {
; CHECK-LABEL: ds_bpermute_b32_v_a__use_a:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def v0
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def a0
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: v_accvgpr_read_b32 v1, a0
; CHECK-NEXT: ds_bpermute_b32 v0, v0, v1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use a0
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_setpc_b64 s[30:31]
  %src0 = call i32 asm "; def $0", "=v"()
  %src1 = call i32 asm "; def $0", "=a"()
  %result = call i32 @llvm.amdgcn.ds.bpermute(i32 %src0, i32 %src1)
  call void asm sideeffect "; use $0", "a"(i32 %result)
  ret void
}
; ds.bpermute with the first input in an AGPR ("=a"), the second in a VGPR
; ("=v"), and the result consumed from an AGPR ("a"). The checks expect only
; the AGPR input to need a v_accvgpr_read_b32 copy.
define void @ds_bpermute_b32_a_v__use_a(ptr addrspace(3) %lds) #0 {
; CHECK-LABEL: ds_bpermute_b32_a_v__use_a:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def a0
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def v0
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: v_accvgpr_read_b32 v1, a0
; CHECK-NEXT: ds_bpermute_b32 v0, v1, v0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use a0
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_setpc_b64 s[30:31]
  %src0 = call i32 asm "; def $0", "=a"()
  %src1 = call i32 asm "; def $0", "=v"()
  %result = call i32 @llvm.amdgcn.ds.bpermute(i32 %src0, i32 %src1)
  call void asm sideeffect "; use $0", "a"(i32 %result)
  ret void
}
; ds.bpermute with both inputs defined in AGPRs ("=a") and the result consumed
; from a VGPR ("v"). The checks expect both inputs to be copied with
; v_accvgpr_read_b32 and no copy of the result.
define void @ds_bpermute_b32_a_a__use_v(ptr addrspace(3) %lds) #0 {
; CHECK-LABEL: ds_bpermute_b32_a_a__use_v:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def a0
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def a1
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: v_accvgpr_read_b32 v0, a0
; CHECK-NEXT: v_accvgpr_read_b32 v1, a1
; CHECK-NEXT: ds_bpermute_b32 v0, v0, v1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use v0
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_setpc_b64 s[30:31]
  %src0 = call i32 asm "; def $0", "=a"()
  %src1 = call i32 asm "; def $0", "=a"()
  %result = call i32 @llvm.amdgcn.ds.bpermute(i32 %src0, i32 %src1)
  call void asm sideeffect "; use $0", "v"(i32 %result)
  ret void
}
; ds.bpermute with both inputs defined in VGPRs ("=v") and the result consumed
; from an AGPR ("a"). The checks expect no input copies and a single
; v_accvgpr_write_b32 of the result.
define void @ds_bpermute_b32_v_v__use_a(ptr addrspace(3) %lds) #0 {
; CHECK-LABEL: ds_bpermute_b32_v_v__use_a:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def v0
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def v1
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ds_bpermute_b32 v0, v0, v1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use a0
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_setpc_b64 s[30:31]
  %src0 = call i32 asm "; def $0", "=v"()
  %src1 = call i32 asm "; def $0", "=v"()
  %result = call i32 @llvm.amdgcn.ds.bpermute(i32 %src0, i32 %src1)
  call void asm sideeffect "; use $0", "a"(i32 %result)
  ret void
}
; ds.bpermute where both inputs and the result use the "^VA" constraint. The
; checks expect v0/v1 to be chosen for the inputs and v0 for the result, with
; no accvgpr copies.
define void @ds_bpermute_b32_av_av__use_av(ptr addrspace(3) %lds) #0 {
; CHECK-LABEL: ds_bpermute_b32_av_av__use_av:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def v0
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def v1
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ds_bpermute_b32 v0, v0, v1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use v0
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_setpc_b64 s[30:31]
  %src0 = call i32 asm "; def $0", "=^VA"()
  %src1 = call i32 asm "; def $0", "=^VA"()
  %result = call i32 @llvm.amdgcn.ds.bpermute(i32 %src0, i32 %src1)
  call void asm sideeffect "; use $0", "^VA"(i32 %result)
  ret void
}
; ds.bpermute with both inputs using the "^VA" constraint while separate inline
; asm defines and later uses v[0:31] and v[32:63]. The checks expect the inputs
; to be defined in a0/a1 and copied to v0/v1 with v_accvgpr_read_b32 after the
; wide VGPR use, and v40-v47/v56-v63 to be saved to and restored from a2-a17.
; %lds is not referenced by the body.
define i32 @ds_bpermute_b32_av_av_no_vgprs(ptr addrspace(3) %lds) #0 {
; CHECK-LABEL: ds_bpermute_b32_av_av_no_vgprs:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_accvgpr_write_b32 a2, v40 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_write_b32 a3, v41 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_write_b32 a4, v42 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_write_b32 a5, v43 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_write_b32 a6, v44 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_write_b32 a7, v45 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_write_b32 a8, v46 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_write_b32 a9, v47 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_write_b32 a10, v56 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_write_b32 a11, v57 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_write_b32 a12, v58 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_write_b32 a13, v59 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_write_b32 a14, v60 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_write_b32 a15, v61 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_write_b32 a16, v62 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_write_b32 a17, v63 ; Reload Reuse
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def a0
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def a1
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def v[0:31]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use v[0:31]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: v_accvgpr_read_b32 v0, a0
; CHECK-NEXT: v_accvgpr_read_b32 v1, a1
; CHECK-NEXT: ds_bpermute_b32 v0, v0, v1
; CHECK-NEXT: v_accvgpr_read_b32 v63, a17 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_read_b32 v62, a16 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_read_b32 v61, a15 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_read_b32 v60, a14 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_read_b32 v59, a13 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_read_b32 v58, a12 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_read_b32 v57, a11 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_read_b32 v56, a10 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_read_b32 v47, a9 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_read_b32 v46, a8 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_read_b32 v45, a7 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_read_b32 v44, a6 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_read_b32 v43, a5 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_read_b32 v42, a4 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_read_b32 v41, a3 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_read_b32 v40, a2 ; Reload Reuse
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
  %op0 = call i32 asm sideeffect "; def $0", "=^VA"()
  %op1 = call i32 asm sideeffect "; def $0", "=^VA"()
  %vgpr.def = call { <32 x i32>, <32 x i32> } asm sideeffect "; def $0", "=${v[0:31]},=${v[32:63]}"()
  %vgpr.0 = extractvalue { <32 x i32>, <32 x i32> } %vgpr.def, 0
  %vgpr.1 = extractvalue { <32 x i32>, <32 x i32> } %vgpr.def, 1
  %permute = call i32 @llvm.amdgcn.ds.bpermute(i32 %op0, i32 %op1)
  call void asm sideeffect "; use $0", "{v[0:31]},{v[32:63]}"(<32 x i32> %vgpr.0, <32 x i32> %vgpr.1)
  ret i32 %permute
}
; ds.permute with both inputs defined in AGPRs ("=a") and the result consumed
; from an AGPR ("a"). The checks expect both inputs to be copied into VGPRs
; with v_accvgpr_read_b32 and the result to be copied back with
; v_accvgpr_write_b32.
define void @ds_permute_b32_a_a__use_a(ptr addrspace(3) %lds) #0 {
; CHECK-LABEL: ds_permute_b32_a_a__use_a:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def a0
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def a1
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: v_accvgpr_read_b32 v0, a0
; CHECK-NEXT: v_accvgpr_read_b32 v1, a1
; CHECK-NEXT: ds_permute_b32 v0, v0, v1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_accvgpr_write_b32 a0, v0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use a0
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_setpc_b64 s[30:31]
  %src0 = call i32 asm "; def $0", "=a"()
  %src1 = call i32 asm "; def $0", "=a"()
  %result = call i32 @llvm.amdgcn.ds.permute(i32 %src0, i32 %src1)
  call void asm sideeffect "; use $0", "a"(i32 %result)
  ret void
}
; ds.permute where both inputs and the result use the "^VA" constraint. The
; checks expect v0/v1 to be chosen for the inputs and v0 for the result, with
; no accvgpr copies.
define void @ds_permute_b32_av_av__use_av(ptr addrspace(3) %lds) #0 {
; CHECK-LABEL: ds_permute_b32_av_av__use_av:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def v0
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def v1
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ds_permute_b32 v0, v0, v1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use v0
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: s_setpc_b64 s[30:31]
  %src0 = call i32 asm "; def $0", "=^VA"()
  %src1 = call i32 asm "; def $0", "=^VA"()
  %result = call i32 @llvm.amdgcn.ds.permute(i32 %src0, i32 %src1)
  call void asm sideeffect "; use $0", "^VA"(i32 %result)
  ret void
}
; ds.permute with both inputs using the "^VA" constraint while separate inline
; asm defines and later uses v[0:31] and v[32:63]. The checks expect the inputs
; to be defined in a0/a1 and copied to v0/v1 with v_accvgpr_read_b32 after the
; wide VGPR use, and v40-v47/v56-v63 to be saved to and restored from a2-a17.
; %lds is not referenced by the body.
define i32 @ds_permute_b32_av_av_no_vgprs(ptr addrspace(3) %lds) #0 {
; CHECK-LABEL: ds_permute_b32_av_av_no_vgprs:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_accvgpr_write_b32 a2, v40 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_write_b32 a3, v41 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_write_b32 a4, v42 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_write_b32 a5, v43 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_write_b32 a6, v44 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_write_b32 a7, v45 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_write_b32 a8, v46 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_write_b32 a9, v47 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_write_b32 a10, v56 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_write_b32 a11, v57 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_write_b32 a12, v58 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_write_b32 a13, v59 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_write_b32 a14, v60 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_write_b32 a15, v61 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_write_b32 a16, v62 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_write_b32 a17, v63 ; Reload Reuse
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def a0
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def a1
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def v[0:31]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use v[0:31]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: v_accvgpr_read_b32 v0, a0
; CHECK-NEXT: v_accvgpr_read_b32 v1, a1
; CHECK-NEXT: ds_permute_b32 v0, v0, v1
; CHECK-NEXT: v_accvgpr_read_b32 v63, a17 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_read_b32 v62, a16 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_read_b32 v61, a15 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_read_b32 v60, a14 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_read_b32 v59, a13 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_read_b32 v58, a12 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_read_b32 v57, a11 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_read_b32 v56, a10 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_read_b32 v47, a9 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_read_b32 v46, a8 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_read_b32 v45, a7 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_read_b32 v44, a6 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_read_b32 v43, a5 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_read_b32 v42, a4 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_read_b32 v41, a3 ; Reload Reuse
; CHECK-NEXT: v_accvgpr_read_b32 v40, a2 ; Reload Reuse
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
  %op0 = call i32 asm sideeffect "; def $0", "=^VA"()
  %op1 = call i32 asm sideeffect "; def $0", "=^VA"()
  %vgpr.def = call { <32 x i32>, <32 x i32> } asm sideeffect "; def $0", "=${v[0:31]},=${v[32:63]}"()
  %vgpr.0 = extractvalue { <32 x i32>, <32 x i32> } %vgpr.def, 0
  %vgpr.1 = extractvalue { <32 x i32>, <32 x i32> } %vgpr.def, 1
  %permute = call i32 @llvm.amdgcn.ds.permute(i32 %op0, i32 %op1)
  call void asm sideeffect "; use $0", "{v[0:31]},{v[32:63]}"(<32 x i32> %vgpr.0, <32 x i32> %vgpr.1)
  ret i32 %permute
}
attributes #0 = { nounwind "amdgpu-waves-per-eu"="10,10" }

View File

@ -8282,59 +8282,59 @@ ds_swizzle_b32 a5, v1
ds_swizzle_b32 a5, v1 offset:swizzle(BITMASK_PERM,"00p00")
// GFX90A: ds_permute_b32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x7c,0xda,0x01,0x02,0x00,0x05]
// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
ds_permute_b32 a5, v1, a2 offset:65535
// GFX90A: ds_permute_b32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x7c,0xda,0x01,0x02,0x00,0xff]
// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
ds_permute_b32 a255, v1, a2 offset:65535
// GFX90A: ds_permute_b32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x7c,0xda,0xff,0x02,0x00,0x05]
// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
ds_permute_b32 a5, v255, a2 offset:65535
// GFX90A: ds_permute_b32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x7c,0xda,0x01,0xff,0x00,0x05]
// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
ds_permute_b32 a5, v1, a255 offset:65535
// GFX90A: ds_permute_b32 a5, v1, a2 ; encoding: [0x00,0x00,0x7c,0xda,0x01,0x02,0x00,0x05]
// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
ds_permute_b32 a5, v1, a2
// GFX90A: ds_permute_b32 a5, v1, a2 ; encoding: [0x00,0x00,0x7c,0xda,0x01,0x02,0x00,0x05]
// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
ds_permute_b32 a5, v1, a2
// GFX90A: ds_permute_b32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x7c,0xda,0x01,0x02,0x00,0x05]
// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
ds_permute_b32 a5, v1, a2 offset:4
// GFX90A: ds_bpermute_b32 a5, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x7e,0xda,0x01,0x02,0x00,0x05]
// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
ds_bpermute_b32 a5, v1, a2 offset:65535
// GFX90A: ds_bpermute_b32 a255, v1, a2 offset:65535 ; encoding: [0xff,0xff,0x7e,0xda,0x01,0x02,0x00,0xff]
// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
ds_bpermute_b32 a255, v1, a2 offset:65535
// GFX90A: ds_bpermute_b32 a5, v255, a2 offset:65535 ; encoding: [0xff,0xff,0x7e,0xda,0xff,0x02,0x00,0x05]
// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
ds_bpermute_b32 a5, v255, a2 offset:65535
// GFX90A: ds_bpermute_b32 a5, v1, a255 offset:65535 ; encoding: [0xff,0xff,0x7e,0xda,0x01,0xff,0x00,0x05]
// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
ds_bpermute_b32 a5, v1, a255 offset:65535
// GFX90A: ds_bpermute_b32 a5, v1, a2 ; encoding: [0x00,0x00,0x7e,0xda,0x01,0x02,0x00,0x05]
// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
ds_bpermute_b32 a5, v1, a2
// GFX90A: ds_bpermute_b32 a5, v1, a2 ; encoding: [0x00,0x00,0x7e,0xda,0x01,0x02,0x00,0x05]
// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
ds_bpermute_b32 a5, v1, a2
// GFX90A: ds_bpermute_b32 a5, v1, a2 offset:4 ; encoding: [0x04,0x00,0x7e,0xda,0x01,0x02,0x00,0x05]
// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: invalid register class: agpr loads and stores not supported on this GPU
// NOT-GFX90A: :[[@LINE+1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode
ds_bpermute_b32 a5, v1, a2 offset:4
// GFX90A: ds_add_u64 v1, a[2:3] offset:65535 ; encoding: [0xff,0xff,0x80,0xda,0x01,0x02,0x00,0x00]