
Similar to InstCombinerImpl::freezeOtherUses, attempt to ensure that we merge multiple frozen/unfrozen uses of an SDValue. This fixes a number of hasOneUse() problems when trying to push FREEZE nodes through the DAG. Remove the SimplifyMultipleUseDemandedBits handling of FREEZE nodes, as we now want to keep the common node rather than bypass it for some nodes just because of DemandedElts. Fixes #149799
9055 lines
412 KiB
LLVM
9055 lines
412 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
|
|
; RUN: llc -mtriple=amdgcn < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=SI %s
|
|
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-enable-ds128 < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=VI,VI-NO-DS128 %s
|
|
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-enable-ds128 < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GFX9,GFX9-NO-DS128 %s
|
|
; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=EG %s
|
|
|
|
; Testing for ds_read/write_b128
|
|
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=+enable-ds128 < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=VI,VI-DS128 %s
|
|
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+enable-ds128 < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GFX9,GFX9-DS128 %s
|
|
|
|
; Copy one i16 from %in to %out, both addrspace(3) pointers, using a plain
; load and store. The amdgcn checks expect ds_read_u16 feeding ds_write_b16;
; the r600 checks expect LDS_USHORT_READ_RET feeding LDS_SHORT_WRITE.
define amdgpu_kernel void @local_load_i16(ptr addrspace(3) %out, ptr addrspace(3) %in) {
; SI-LABEL: local_load_i16:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: s_mov_b32 m0, -1
; SI-NEXT: ds_read_u16 v0, v0
; SI-NEXT: v_mov_b32_e32 v1, s0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: ds_write_b16 v1, v0
; SI-NEXT: s_endpgm
;
; VI-LABEL: local_load_i16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_mov_b32 m0, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: ds_read_u16 v0, v0
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: ds_write_b16 v1, v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: local_load_i16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: ds_read_u16 v0, v0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_write_b16 v1, v0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: local_load_i16:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 4, @0, KC0[CB0:0-32], KC1[]
; EG-NEXT: MOV * T0.W, KC0[2].Z,
; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W
; EG-NEXT: MOV T0.X, OQAP,
; EG-NEXT: MOV * T0.W, KC0[2].Y,
; EG-NEXT: LDS_SHORT_WRITE * T0.W, T0.X,
; EG-NEXT: RETURN
entry:
  %val = load i16, ptr addrspace(3) %in
  store i16 %val, ptr addrspace(3) %out
  ret void
}
|
|
|
|
; Copy a <2 x i16> vector from %in to %out, both addrspace(3) pointers.
; The amdgcn checks expect one ds_read_b32 / ds_write_b32 pair; the r600
; checks expect one LDS_READ_RET / LDS_WRITE pair.
define amdgpu_kernel void @local_load_v2i16(ptr addrspace(3) %out, ptr addrspace(3) %in) {
; SI-LABEL: local_load_v2i16:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: s_mov_b32 m0, -1
; SI-NEXT: ds_read_b32 v0, v0
; SI-NEXT: v_mov_b32_e32 v1, s0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: ds_write_b32 v1, v0
; SI-NEXT: s_endpgm
;
; VI-LABEL: local_load_v2i16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_mov_b32 m0, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: ds_read_b32 v0, v0
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: ds_write_b32 v1, v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: local_load_v2i16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: ds_read_b32 v0, v0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_write_b32 v1, v0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: local_load_v2i16:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 4, @1, KC0[CB0:0-32], KC1[]
; EG-NEXT: MOV * T0.W, KC0[2].Z,
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
; EG-NEXT: MOV T0.X, OQAP,
; EG-NEXT: MOV * T0.W, KC0[2].Y,
; EG-NEXT: LDS_WRITE * T0.W, T0.X,
; EG-NEXT: RETURN
entry:
  %vec = load <2 x i16>, ptr addrspace(3) %in
  store <2 x i16> %vec, ptr addrspace(3) %out
  ret void
}
|
|
|
|
; Copy a <3 x i16> vector from %in to %out, both addrspace(3) pointers.
; The amdgcn checks expect a single ds_read_b64 and a split store
; (ds_write_b32 plus ds_write_b16 at offset 4, in target-dependent order).
; The r600 checks expect three LDS_USHORT_READ_RET reads that are repacked
; before an LDS_WRITE and an LDS_SHORT_WRITE.
define amdgpu_kernel void @local_load_v3i16(ptr addrspace(3) %out, ptr addrspace(3) %in) {
; SI-LABEL: local_load_v3i16:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: s_mov_b32 m0, -1
; SI-NEXT: ds_read_b64 v[0:1], v0
; SI-NEXT: v_mov_b32_e32 v2, s0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: ds_write_b32 v2, v0
; SI-NEXT: ds_write_b16 v2, v1 offset:4
; SI-NEXT: s_endpgm
;
; VI-LABEL: local_load_v3i16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_mov_b32 m0, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: ds_read_b64 v[0:1], v0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: ds_write_b16 v2, v1 offset:4
; VI-NEXT: ds_write_b32 v2, v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: local_load_v3i16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: ds_read_b64 v[0:1], v0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_write_b16 v2, v1 offset:4
; GFX9-NEXT: ds_write_b32 v2, v0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: local_load_v3i16:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 19, @2, KC0[CB0:0-32], KC1[]
; EG-NEXT: MOV * T0.W, KC0[2].Z,
; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W
; EG-NEXT: MOV T0.Y, OQAP,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W
; EG-NEXT: MOV * T0.Z, OQAP,
; EG-NEXT: LSHL T0.Z, PV.Z, literal.x,
; EG-NEXT: AND_INT T0.W, T0.Y, literal.y,
; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.z,
; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T1.W
; EG-NEXT: MOV T0.Y, OQAP,
; EG-NEXT: OR_INT T0.W, T0.Z, T0.W,
; EG-NEXT: MOV * T1.W, KC0[2].Y,
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
; EG-NEXT: LDS_SHORT_WRITE * T0.W, T0.Y,
; EG-NEXT: RETURN
entry:
  %vec = load <3 x i16>, ptr addrspace(3) %in
  store <3 x i16> %vec, ptr addrspace(3) %out
  ret void
}
|
|
|
|
; Copy a <4 x i16> vector from %in to %out, both addrspace(3) pointers.
; The amdgcn checks expect one ds_read_b64 / ds_write_b64 pair; the r600
; checks expect two LDS_READ_RET / LDS_WRITE pairs at offsets 0 and 4.
define amdgpu_kernel void @local_load_v4i16(ptr addrspace(3) %out, ptr addrspace(3) %in) {
; SI-LABEL: local_load_v4i16:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: s_mov_b32 m0, -1
; SI-NEXT: ds_read_b64 v[0:1], v0
; SI-NEXT: v_mov_b32_e32 v2, s0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: ds_write_b64 v2, v[0:1]
; SI-NEXT: s_endpgm
;
; VI-LABEL: local_load_v4i16:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_mov_b32 m0, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: ds_read_b64 v[0:1], v0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: ds_write_b64 v2, v[0:1]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: local_load_v4i16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: ds_read_b64 v[0:1], v0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_write_b64 v2, v[0:1]
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: local_load_v4i16:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 11, @3, KC0[CB0:0-32], KC1[]
; EG-NEXT: MOV * T0.W, KC0[2].Z,
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
; EG-NEXT: MOV T0.X, OQAP,
; EG-NEXT: MOV * T0.W, KC0[2].Y,
; EG-NEXT: LDS_WRITE * T0.W, T0.X,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
; EG-NEXT: MOV T0.X, OQAP,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
; EG-NEXT: LDS_WRITE * T0.W, T0.X,
; EG-NEXT: RETURN
entry:
  %vec = load <4 x i16>, ptr addrspace(3) %in
  store <4 x i16> %vec, ptr addrspace(3) %out
  ret void
}
|
|
|
|
; Copy an <8 x i16> vector from %in to %out, both addrspace(3) pointers.
; The checks for the +enable-ds128 runs expect ds_read_b128 / ds_write_b128;
; the other amdgcn checks expect ds_read2_b64 / ds_write2_b64. The r600
; checks expect four LDS_READ_RET / LDS_WRITE pairs at offsets 0 to 12.
define amdgpu_kernel void @local_load_v8i16(ptr addrspace(3) %out, ptr addrspace(3) %in) {
; SI-LABEL: local_load_v8i16:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: s_mov_b32 m0, -1
; SI-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
; SI-NEXT: v_mov_b32_e32 v4, s0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
; SI-NEXT: s_endpgm
;
; VI-NO-DS128-LABEL: local_load_v8i16:
; VI-NO-DS128: ; %bb.0: ; %entry
; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NO-DS128-NEXT: s_mov_b32 m0, -1
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, s1
; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
; VI-NO-DS128-NEXT: v_mov_b32_e32 v4, s0
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
; VI-NO-DS128-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
; VI-NO-DS128-NEXT: s_endpgm
;
; GFX9-NO-DS128-LABEL: local_load_v8i16:
; GFX9-NO-DS128: ; %bb.0: ; %entry
; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v4, s0
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NO-DS128-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
; GFX9-NO-DS128-NEXT: s_endpgm
;
; EG-LABEL: local_load_v8i16:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 25, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT: MOV * T0.W, KC0[2].Z,
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
; EG-NEXT: MOV T0.X, OQAP,
; EG-NEXT: MOV * T0.W, KC0[2].Y,
; EG-NEXT: LDS_WRITE * T0.W, T0.X,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
; EG-NEXT: MOV T0.X, OQAP,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
; EG-NEXT: LDS_WRITE * T0.W, T0.X,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
; EG-NEXT: MOV T0.X, OQAP,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: LDS_WRITE * T0.W, T0.X,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00)
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
; EG-NEXT: MOV T0.X, OQAP,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00)
; EG-NEXT: LDS_WRITE * T0.W, T0.X,
; EG-NEXT: RETURN
;
; VI-DS128-LABEL: local_load_v8i16:
; VI-DS128: ; %bb.0: ; %entry
; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-DS128-NEXT: s_mov_b32 m0, -1
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
; VI-DS128-NEXT: v_mov_b32_e32 v0, s1
; VI-DS128-NEXT: ds_read_b128 v[0:3], v0
; VI-DS128-NEXT: v_mov_b32_e32 v4, s0
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
; VI-DS128-NEXT: ds_write_b128 v4, v[0:3]
; VI-DS128-NEXT: s_endpgm
;
; GFX9-DS128-LABEL: local_load_v8i16:
; GFX9-DS128: ; %bb.0: ; %entry
; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DS128-NEXT: v_mov_b32_e32 v0, s1
; GFX9-DS128-NEXT: ds_read_b128 v[0:3], v0
; GFX9-DS128-NEXT: v_mov_b32_e32 v4, s0
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DS128-NEXT: ds_write_b128 v4, v[0:3]
; GFX9-DS128-NEXT: s_endpgm
entry:
  %vec = load <8 x i16>, ptr addrspace(3) %in
  store <8 x i16> %vec, ptr addrspace(3) %out
  ret void
}
|
|
|
|
; Copy a <16 x i16> vector from %in to %out, both addrspace(3) pointers.
; The checks for the +enable-ds128 runs expect two ds_read_b128 /
; ds_write_b128 pairs; the other amdgcn checks expect two ds_read2_b64 /
; ds_write2_b64 pairs. The r600 checks expect eight LDS_READ_RET /
; LDS_WRITE pairs at offsets 0 to 28.
define amdgpu_kernel void @local_load_v16i16(ptr addrspace(3) %out, ptr addrspace(3) %in) {
; SI-LABEL: local_load_v16i16:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v4, s1
; SI-NEXT: s_mov_b32 m0, -1
; SI-NEXT: ds_read2_b64 v[0:3], v4 offset0:2 offset1:3
; SI-NEXT: ds_read2_b64 v[4:7], v4 offset1:1
; SI-NEXT: v_mov_b32_e32 v8, s0
; SI-NEXT: s_waitcnt lgkmcnt(1)
; SI-NEXT: ds_write2_b64 v8, v[0:1], v[2:3] offset0:2 offset1:3
; SI-NEXT: s_waitcnt lgkmcnt(1)
; SI-NEXT: ds_write2_b64 v8, v[4:5], v[6:7] offset1:1
; SI-NEXT: s_endpgm
;
; VI-NO-DS128-LABEL: local_load_v16i16:
; VI-NO-DS128: ; %bb.0: ; %entry
; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NO-DS128-NEXT: s_mov_b32 m0, -1
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
; VI-NO-DS128-NEXT: v_mov_b32_e32 v4, s1
; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v4 offset0:2 offset1:3
; VI-NO-DS128-NEXT: ds_read2_b64 v[4:7], v4 offset1:1
; VI-NO-DS128-NEXT: v_mov_b32_e32 v8, s0
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
; VI-NO-DS128-NEXT: ds_write2_b64 v8, v[0:1], v[2:3] offset0:2 offset1:3
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
; VI-NO-DS128-NEXT: ds_write2_b64 v8, v[4:5], v[6:7] offset1:1
; VI-NO-DS128-NEXT: s_endpgm
;
; GFX9-NO-DS128-LABEL: local_load_v16i16:
; GFX9-NO-DS128: ; %bb.0: ; %entry
; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v4, s1
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v4 offset0:2 offset1:3
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[4:7], v4 offset1:1
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v8, s0
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
; GFX9-NO-DS128-NEXT: ds_write2_b64 v8, v[0:1], v[2:3] offset0:2 offset1:3
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
; GFX9-NO-DS128-NEXT: ds_write2_b64 v8, v[4:5], v[6:7] offset1:1
; GFX9-NO-DS128-NEXT: s_endpgm
;
; EG-LABEL: local_load_v16i16:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 53, @5, KC0[CB0:0-32], KC1[]
; EG-NEXT: MOV * T0.W, KC0[2].Z,
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
; EG-NEXT: MOV T0.X, OQAP,
; EG-NEXT: MOV * T0.W, KC0[2].Y,
; EG-NEXT: LDS_WRITE * T0.W, T0.X,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
; EG-NEXT: MOV T0.X, OQAP,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
; EG-NEXT: LDS_WRITE * T0.W, T0.X,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
; EG-NEXT: MOV T0.X, OQAP,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: LDS_WRITE * T0.W, T0.X,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00)
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
; EG-NEXT: MOV T0.X, OQAP,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00)
; EG-NEXT: LDS_WRITE * T0.W, T0.X,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
; EG-NEXT: MOV T0.X, OQAP,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: LDS_WRITE * T0.W, T0.X,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00)
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
; EG-NEXT: MOV T0.X, OQAP,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00)
; EG-NEXT: LDS_WRITE * T0.W, T0.X,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
; EG-NEXT: MOV T0.X, OQAP,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
; EG-NEXT: LDS_WRITE * T0.W, T0.X,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00)
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
; EG-NEXT: MOV T0.X, OQAP,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00)
; EG-NEXT: LDS_WRITE * T0.W, T0.X,
; EG-NEXT: RETURN
;
; VI-DS128-LABEL: local_load_v16i16:
; VI-DS128: ; %bb.0: ; %entry
; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-DS128-NEXT: s_mov_b32 m0, -1
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
; VI-DS128-NEXT: v_mov_b32_e32 v4, s1
; VI-DS128-NEXT: ds_read_b128 v[0:3], v4 offset:16
; VI-DS128-NEXT: ds_read_b128 v[4:7], v4
; VI-DS128-NEXT: v_mov_b32_e32 v8, s0
; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:16
; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
; VI-DS128-NEXT: ds_write_b128 v8, v[4:7]
; VI-DS128-NEXT: s_endpgm
;
; GFX9-DS128-LABEL: local_load_v16i16:
; GFX9-DS128: ; %bb.0: ; %entry
; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DS128-NEXT: v_mov_b32_e32 v4, s1
; GFX9-DS128-NEXT: ds_read_b128 v[0:3], v4 offset:16
; GFX9-DS128-NEXT: ds_read_b128 v[4:7], v4
; GFX9-DS128-NEXT: v_mov_b32_e32 v8, s0
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
; GFX9-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:16
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
; GFX9-DS128-NEXT: ds_write_b128 v8, v[4:7]
; GFX9-DS128-NEXT: s_endpgm
entry:
  %vec = load <16 x i16>, ptr addrspace(3) %in
  store <16 x i16> %vec, ptr addrspace(3) %out
  ret void
}
|
|
|
|
; Load an i16 from %in, zero-extend it to i32 and store the result to %out
; (both addrspace(3) pointers). The amdgcn checks expect the extension to be
; absorbed by ds_read_u16, with no separate masking before ds_write_b32.
define amdgpu_kernel void @local_zextload_i16_to_i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
; SI-LABEL: local_zextload_i16_to_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: s_mov_b32 m0, -1
; SI-NEXT: ds_read_u16 v0, v0
; SI-NEXT: v_mov_b32_e32 v1, s0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: ds_write_b32 v1, v0
; SI-NEXT: s_endpgm
;
; VI-LABEL: local_zextload_i16_to_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_mov_b32 m0, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: ds_read_u16 v0, v0
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: ds_write_b32 v1, v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: local_zextload_i16_to_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: ds_read_u16 v0, v0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_write_b32 v1, v0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: local_zextload_i16_to_i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 4, @6, KC0[CB0:0-32], KC1[]
; EG-NEXT: MOV * T0.W, KC0[2].Z,
; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W
; EG-NEXT: MOV T0.X, OQAP,
; EG-NEXT: MOV * T0.W, KC0[2].Y,
; EG-NEXT: LDS_WRITE * T0.W, T0.X,
; EG-NEXT: RETURN
  %narrow = load i16, ptr addrspace(3) %in
  %wide = zext i16 %narrow to i32
  store i32 %wide, ptr addrspace(3) %out
  ret void
}
|
|
|
|
; Load an i16 from %in, sign-extend it to i32 and store the result to %out
; (both addrspace(3) pointers). The amdgcn checks expect the extension to be
; absorbed by ds_read_i16; the r600 checks expect an unsigned 16-bit read
; followed by a BFE_INT.
define amdgpu_kernel void @local_sextload_i16_to_i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
; SI-LABEL: local_sextload_i16_to_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: s_mov_b32 m0, -1
; SI-NEXT: ds_read_i16 v0, v0
; SI-NEXT: v_mov_b32_e32 v1, s0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: ds_write_b32 v1, v0
; SI-NEXT: s_endpgm
;
; VI-LABEL: local_sextload_i16_to_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_mov_b32 m0, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: ds_read_i16 v0, v0
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: ds_write_b32 v1, v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: local_sextload_i16_to_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: ds_read_i16 v0, v0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_write_b32 v1, v0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: local_sextload_i16_to_i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 6, @7, KC0[CB0:0-32], KC1[]
; EG-NEXT: MOV * T0.W, KC0[2].Z,
; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W
; EG-NEXT: MOV * T0.X, OQAP,
; EG-NEXT: BFE_INT T0.W, PV.X, 0.0, literal.x,
; EG-NEXT: MOV * T1.W, KC0[2].Y,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
; EG-NEXT: RETURN
  %narrow = load i16, ptr addrspace(3) %in
  %wide = sext i16 %narrow to i32
  store i32 %wide, ptr addrspace(3) %out
  ret void
}
|
|
|
|
; Single-element vector variant of the zero-extending load: load <1 x i16>
; from %in, zext to <1 x i32>, store to %out (both addrspace(3) pointers).
; The expected amdgcn output is ds_read_u16 followed by ds_write_b32.
define amdgpu_kernel void @local_zextload_v1i16_to_v1i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
; SI-LABEL: local_zextload_v1i16_to_v1i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: s_mov_b32 m0, -1
; SI-NEXT: ds_read_u16 v0, v0
; SI-NEXT: v_mov_b32_e32 v1, s0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: ds_write_b32 v1, v0
; SI-NEXT: s_endpgm
;
; VI-LABEL: local_zextload_v1i16_to_v1i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_mov_b32 m0, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: ds_read_u16 v0, v0
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: ds_write_b32 v1, v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: local_zextload_v1i16_to_v1i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: ds_read_u16 v0, v0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_write_b32 v1, v0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: local_zextload_v1i16_to_v1i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 4, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: MOV * T0.W, KC0[2].Z,
; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W
; EG-NEXT: MOV T0.X, OQAP,
; EG-NEXT: MOV * T0.W, KC0[2].Y,
; EG-NEXT: LDS_WRITE * T0.W, T0.X,
; EG-NEXT: RETURN
  %narrow = load <1 x i16>, ptr addrspace(3) %in
  %wide = zext <1 x i16> %narrow to <1 x i32>
  store <1 x i32> %wide, ptr addrspace(3) %out
  ret void
}
|
|
|
|
; Single-element vector variant of the sign-extending load: load <1 x i16>
; from %in, sext to <1 x i32>, store to %out (both addrspace(3) pointers).
; The expected amdgcn output is ds_read_i16 followed by ds_write_b32; the
; r600 checks expect an unsigned 16-bit read followed by a BFE_INT.
define amdgpu_kernel void @local_sextload_v1i16_to_v1i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
; SI-LABEL: local_sextload_v1i16_to_v1i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: s_mov_b32 m0, -1
; SI-NEXT: ds_read_i16 v0, v0
; SI-NEXT: v_mov_b32_e32 v1, s0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: ds_write_b32 v1, v0
; SI-NEXT: s_endpgm
;
; VI-LABEL: local_sextload_v1i16_to_v1i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_mov_b32 m0, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: ds_read_i16 v0, v0
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: ds_write_b32 v1, v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: local_sextload_v1i16_to_v1i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: ds_read_i16 v0, v0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: ds_write_b32 v1, v0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: local_sextload_v1i16_to_v1i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 6, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MOV * T0.W, KC0[2].Z,
; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W
; EG-NEXT: MOV * T0.X, OQAP,
; EG-NEXT: BFE_INT T0.W, PV.X, 0.0, literal.x,
; EG-NEXT: MOV * T1.W, KC0[2].Y,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
; EG-NEXT: RETURN
  %narrow = load <1 x i16>, ptr addrspace(3) %in
  %wide = sext <1 x i16> %narrow to <1 x i32>
  store <1 x i32> %wide, ptr addrspace(3) %out
  ret void
}
|
|
|
|
; Load <2 x i16> from %in, zero-extend each lane to i32 and store the
; <2 x i32> result to %out (both addrspace(3) pointers). The amdgcn checks
; expect one ds_read_b32 whose halves are split with a 16-bit logical shift
; right and an and with 0xffff, then a single ds_write_b64.
define amdgpu_kernel void @local_zextload_v2i16_to_v2i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
; SI-LABEL: local_zextload_v2i16_to_v2i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: s_mov_b32 m0, -1
; SI-NEXT: ds_read_b32 v0, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT: v_mov_b32_e32 v2, s0
; SI-NEXT: ds_write_b64 v2, v[0:1]
; SI-NEXT: s_endpgm
;
; VI-LABEL: local_zextload_v2i16_to_v2i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_mov_b32 m0, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: ds_read_b32 v0, v0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; VI-NEXT: ds_write_b64 v2, v[0:1]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: local_zextload_v2i16_to_v2i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: ds_read_b32 v0, v0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: ds_write_b64 v2, v[0:1]
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: local_zextload_v2i16_to_v2i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 10, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: MOV * T0.W, KC0[2].Z,
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
; EG-NEXT: MOV * T0.Y, OQAP,
; EG-NEXT: AND_INT T0.W, PV.Y, literal.x,
; EG-NEXT: MOV * T1.W, KC0[2].Y,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
; EG-NEXT: LSHR T0.W, T0.Y, literal.x,
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45)
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
; EG-NEXT: RETURN
  %narrow = load <2 x i16>, ptr addrspace(3) %in
  %wide = zext <2 x i16> %narrow to <2 x i32>
  store <2 x i32> %wide, ptr addrspace(3) %out
  ret void
}
|
|
|
|
; Load <2 x i16> from %in, sign-extend each lane to i32 and store the
; <2 x i32> result to %out (both addrspace(3) pointers). The amdgcn checks
; expect one ds_read_b32 whose halves are split with a 16-bit arithmetic
; shift right and a v_bfe_i32, then a single ds_write_b64.
define amdgpu_kernel void @local_sextload_v2i16_to_v2i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
; SI-LABEL: local_sextload_v2i16_to_v2i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: s_mov_b32 m0, -1
; SI-NEXT: ds_read_b32 v0, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_ashrrev_i32_e32 v1, 16, v0
; SI-NEXT: v_bfe_i32 v0, v0, 0, 16
; SI-NEXT: v_mov_b32_e32 v2, s0
; SI-NEXT: ds_write_b64 v2, v[0:1]
; SI-NEXT: s_endpgm
;
; VI-LABEL: local_sextload_v2i16_to_v2i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_mov_b32 m0, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: ds_read_b32 v0, v0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_ashrrev_i32_e32 v1, 16, v0
; VI-NEXT: v_bfe_i32 v0, v0, 0, 16
; VI-NEXT: ds_write_b64 v2, v[0:1]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: local_sextload_v2i16_to_v2i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: ds_read_b32 v0, v0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 16, v0
; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX9-NEXT: ds_write_b64 v2, v[0:1]
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: local_sextload_v2i16_to_v2i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 12, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MOV * T0.W, KC0[2].Z,
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
; EG-NEXT: MOV * T0.Y, OQAP,
; EG-NEXT: LSHR * T0.W, PV.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45)
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
; EG-NEXT: BFE_INT T0.W, T0.Y, 0.0, literal.x,
; EG-NEXT: MOV * T1.W, KC0[2].Y,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
; EG-NEXT: RETURN
  %narrow = load <2 x i16>, ptr addrspace(3) %in
  %wide = sext <2 x i16> %narrow to <2 x i32>
  store <2 x i32> %wide, ptr addrspace(3) %out
  ret void
}
|
|
|
|
; Load <3 x i16> from %in, zero-extend each lane to i32 and store the
; <3 x i32> result to %out (both addrspace(3) pointers). The amdgcn checks
; expect one ds_read_b64; the store is a single ds_write_b96 in the tonga and
; gfx900 output and a ds_write_b32 plus ds_write_b64 pair in the default
; amdgcn output. The r600 checks expect three 16-bit reads and three
; 32-bit writes.
define amdgpu_kernel void @local_local_zextload_v3i16_to_v3i32(ptr addrspace(3) %out, ptr addrspace(3) %in) {
; SI-LABEL: local_local_zextload_v3i16_to_v3i32:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: s_mov_b32 m0, -1
; SI-NEXT: ds_read_b64 v[0:1], v0
; SI-NEXT: v_mov_b32_e32 v4, s0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v0
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1
; SI-NEXT: ds_write_b32 v4, v0 offset:8
; SI-NEXT: ds_write_b64 v4, v[2:3]
; SI-NEXT: s_endpgm
;
; VI-LABEL: local_local_zextload_v3i16_to_v3i32:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT: s_mov_b32 m0, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: ds_read_b64 v[0:1], v0
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_and_b32_e32 v2, 0xffff, v1
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; VI-NEXT: ds_write_b96 v3, v[0:2]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: local_local_zextload_v3i16_to_v3i32:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: ds_read_b64 v[0:1], v0
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v1
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: ds_write_b96 v3, v[0:2]
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: local_local_zextload_v3i16_to_v3i32:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 18, @12, KC0[CB0:0-32], KC1[]
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W
; EG-NEXT: MOV T0.X, OQAP,
; EG-NEXT: MOV * T0.W, KC0[2].Z,
; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W
; EG-NEXT: MOV T0.Y, OQAP,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W
; EG-NEXT: MOV T0.Z, OQAP,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: LDS_WRITE * T0.W, T0.Z,
; EG-NEXT: MOV * T0.W, KC0[2].Y,
; EG-NEXT: LDS_WRITE * T0.W, T0.Y,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
; EG-NEXT: LDS_WRITE * T0.W, T0.X,
; EG-NEXT: RETURN
entry:
  %narrow = load <3 x i16>, ptr addrspace(3) %in
  %wide = zext <3 x i16> %narrow to <3 x i32>
  store <3 x i32> %wide, ptr addrspace(3) %out
  ret void
}
|
|
|
|
; Loads a <3 x i16> vector from %in (address space 3), sign-extends each
; element to i32 and stores the resulting <3 x i32> to %out (address space 3).
; The per-target check lines below are produced by
; utils/update_llc_test_checks.py; rerun the script rather than editing them
; by hand.
define amdgpu_kernel void @local_local_sextload_v3i16_to_v3i32(ptr addrspace(3) %out, ptr addrspace(3) %in) {
|
|
; SI-LABEL: local_local_sextload_v3i16_to_v3i32:
|
|
; SI: ; %bb.0: ; %entry
|
|
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: v_mov_b32_e32 v0, s1
|
|
; SI-NEXT: s_mov_b32 m0, -1
|
|
; SI-NEXT: ds_read_b64 v[0:1], v0
|
|
; SI-NEXT: v_mov_b32_e32 v4, s0
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: v_ashrrev_i32_e32 v3, 16, v0
|
|
; SI-NEXT: v_bfe_i32 v2, v0, 0, 16
|
|
; SI-NEXT: v_bfe_i32 v0, v1, 0, 16
|
|
; SI-NEXT: ds_write_b32 v4, v0 offset:8
|
|
; SI-NEXT: ds_write_b64 v4, v[2:3]
|
|
; SI-NEXT: s_endpgm
|
|
;
|
|
; VI-LABEL: local_local_sextload_v3i16_to_v3i32:
|
|
; VI: ; %bb.0: ; %entry
|
|
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; VI-NEXT: s_mov_b32 m0, -1
|
|
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NEXT: v_mov_b32_e32 v0, s1
|
|
; VI-NEXT: ds_read_b64 v[3:4], v0
|
|
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NEXT: v_ashrrev_i32_e32 v1, 16, v3
|
|
; VI-NEXT: v_bfe_i32 v2, v4, 0, 16
|
|
; VI-NEXT: v_bfe_i32 v0, v3, 0, 16
|
|
; VI-NEXT: v_mov_b32_e32 v3, s0
|
|
; VI-NEXT: ds_write_b96 v3, v[0:2]
|
|
; VI-NEXT: s_endpgm
|
|
;
|
|
; GFX9-LABEL: local_local_sextload_v3i16_to_v3i32:
|
|
; GFX9: ; %bb.0: ; %entry
|
|
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NEXT: v_mov_b32_e32 v0, s1
|
|
; GFX9-NEXT: ds_read_b64 v[3:4], v0
|
|
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 16, v3
|
|
; GFX9-NEXT: v_bfe_i32 v2, v4, 0, 16
|
|
; GFX9-NEXT: v_bfe_i32 v0, v3, 0, 16
|
|
; GFX9-NEXT: v_mov_b32_e32 v3, s0
|
|
; GFX9-NEXT: ds_write_b96 v3, v[0:2]
|
|
; GFX9-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: local_local_sextload_v3i16_to_v3i32:
|
|
; EG: ; %bb.0: ; %entry
|
|
; EG-NEXT: ALU 22, @13, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: MOV * T0.W, KC0[2].Z,
|
|
; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T0.X, OQAP,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T0.Y, OQAP,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T0.Z, OQAP,
|
|
; EG-NEXT: BFE_INT T0.W, T0.Y, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: BFE_INT T0.W, T0.Z, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: BFE_INT T0.W, T0.X, 0.0, literal.x,
|
|
; EG-NEXT: MOV * T1.W, KC0[2].Y,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: RETURN
|
|
entry:
|
|
%ld = load <3 x i16>, ptr addrspace(3) %in
|
|
%ext = sext <3 x i16> %ld to <3 x i32>
|
|
store <3 x i32> %ext, ptr addrspace(3) %out
|
|
ret void
|
|
}
|
|
|
|
; Loads a <4 x i16> vector from %in (address space 3), zero-extends each
; element to i32 and stores the resulting <4 x i32> to %out (address space 3).
; The function carries attribute group #0, which is defined elsewhere in this
; file. The tonga and gfx900 expectations are split into ds128-disabled and
; ds128-enabled variants; the enabled variants expect one ds_write_b128 store
; where the disabled variants expect a ds_write2_b64.
; The per-target check lines below are produced by
; utils/update_llc_test_checks.py; rerun the script rather than editing them
; by hand.
define amdgpu_kernel void @local_local_zextload_v4i16_to_v4i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
|
|
; SI-LABEL: local_local_zextload_v4i16_to_v4i32:
|
|
; SI: ; %bb.0:
|
|
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: v_mov_b32_e32 v0, s1
|
|
; SI-NEXT: s_mov_b32 m0, -1
|
|
; SI-NEXT: ds_read_b64 v[0:1], v0
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0
|
|
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1
|
|
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v0
|
|
; SI-NEXT: v_and_b32_e32 v4, 0xffff, v1
|
|
; SI-NEXT: v_mov_b32_e32 v0, s0
|
|
; SI-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
|
|
; SI-NEXT: s_endpgm
|
|
;
|
|
; VI-NO-DS128-LABEL: local_local_zextload_v4i16_to_v4i32:
|
|
; VI-NO-DS128: ; %bb.0:
|
|
; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; VI-NO-DS128-NEXT: s_mov_b32 m0, -1
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, s1
|
|
; VI-NO-DS128-NEXT: ds_read_b64 v[0:1], v0
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v0
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v0
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v1
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v1
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, s0
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
|
|
; VI-NO-DS128-NEXT: s_endpgm
|
|
;
|
|
; GFX9-NO-DS128-LABEL: local_local_zextload_v4i16_to_v4i32:
|
|
; GFX9-NO-DS128: ; %bb.0:
|
|
; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s1
|
|
; GFX9-NO-DS128-NEXT: ds_read_b64 v[0:1], v0
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v0
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v0
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v1
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v1
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
|
|
; GFX9-NO-DS128-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: local_local_zextload_v4i16_to_v4i32:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: ALU 22, @14, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: MOV * T0.W, KC0[2].Z,
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T0.Y, OQAP,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T0.Z, OQAP,
|
|
; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
|
|
; EG-NEXT: MOV * T1.W, KC0[2].Y,
|
|
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: LSHR T0.W, T0.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: AND_INT T0.W, T0.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 8(1.121039e-44)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: LSHR T0.W, T0.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 12(1.681558e-44)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: RETURN
|
|
;
|
|
; VI-DS128-LABEL: local_local_zextload_v4i16_to_v4i32:
|
|
; VI-DS128: ; %bb.0:
|
|
; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; VI-DS128-NEXT: s_mov_b32 m0, -1
|
|
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v0, s1
|
|
; VI-DS128-NEXT: ds_read_b64 v[0:1], v0
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v4, s0
|
|
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v1
|
|
; VI-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v1
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v0
|
|
; VI-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
|
; VI-DS128-NEXT: ds_write_b128 v4, v[0:3]
|
|
; VI-DS128-NEXT: s_endpgm
|
|
;
|
|
; GFX9-DS128-LABEL: local_local_zextload_v4i16_to_v4i32:
|
|
; GFX9-DS128: ; %bb.0:
|
|
; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v0, s1
|
|
; GFX9-DS128-NEXT: ds_read_b64 v[0:1], v0
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v4, s0
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v1
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v1
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v0
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
|
; GFX9-DS128-NEXT: ds_write_b128 v4, v[0:3]
|
|
; GFX9-DS128-NEXT: s_endpgm
|
|
%load = load <4 x i16>, ptr addrspace(3) %in
|
|
%ext = zext <4 x i16> %load to <4 x i32>
|
|
store <4 x i32> %ext, ptr addrspace(3) %out
|
|
ret void
|
|
}
|
|
|
|
; Loads a <4 x i16> vector from %in (address space 3), sign-extends each
; element to i32 and stores the resulting <4 x i32> to %out (address space 3).
; The function carries attribute group #0, which is defined elsewhere in this
; file. The tonga and gfx900 expectations are split into ds128-disabled and
; ds128-enabled variants; the enabled variants expect one ds_write_b128 store
; where the disabled variants expect a ds_write2_b64.
; The per-target check lines below are produced by
; utils/update_llc_test_checks.py; rerun the script rather than editing them
; by hand.
define amdgpu_kernel void @local_sextload_v4i16_to_v4i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
|
|
; SI-LABEL: local_sextload_v4i16_to_v4i32:
|
|
; SI: ; %bb.0:
|
|
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: v_mov_b32_e32 v0, s1
|
|
; SI-NEXT: s_mov_b32 m0, -1
|
|
; SI-NEXT: ds_read_b64 v[0:1], v0
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: v_ashrrev_i32_e32 v3, 16, v0
|
|
; SI-NEXT: v_ashrrev_i32_e32 v5, 16, v1
|
|
; SI-NEXT: v_bfe_i32 v2, v0, 0, 16
|
|
; SI-NEXT: v_bfe_i32 v4, v1, 0, 16
|
|
; SI-NEXT: v_mov_b32_e32 v0, s0
|
|
; SI-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
|
|
; SI-NEXT: s_endpgm
|
|
;
|
|
; VI-NO-DS128-LABEL: local_sextload_v4i16_to_v4i32:
|
|
; VI-NO-DS128: ; %bb.0:
|
|
; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; VI-NO-DS128-NEXT: s_mov_b32 m0, -1
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, s1
|
|
; VI-NO-DS128-NEXT: ds_read_b64 v[0:1], v0
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v0
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v1
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v2, v0, 0, 16
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v4, v1, 0, 16
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, s0
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
|
|
; VI-NO-DS128-NEXT: s_endpgm
|
|
;
|
|
; GFX9-NO-DS128-LABEL: local_sextload_v4i16_to_v4i32:
|
|
; GFX9-NO-DS128: ; %bb.0:
|
|
; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s1
|
|
; GFX9-NO-DS128-NEXT: ds_read_b64 v[0:1], v0
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v0
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v1
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v0, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v1, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
|
|
; GFX9-NO-DS128-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: local_sextload_v4i16_to_v4i32:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: ALU 25, @15, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: MOV * T0.W, KC0[2].Z,
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T0.Y, OQAP,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T0.Z, OQAP,
|
|
; EG-NEXT: LSHR * T0.W, T0.Y, literal.x,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LSHR T1.Z, PV.Z, literal.x,
|
|
; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: BFE_INT T0.W, T1.Z, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 12(1.681558e-44)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: BFE_INT T0.W, T0.Y, 0.0, literal.x,
|
|
; EG-NEXT: MOV * T1.W, KC0[2].Y,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: BFE_INT T0.W, T0.Z, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: RETURN
|
|
;
|
|
; VI-DS128-LABEL: local_sextload_v4i16_to_v4i32:
|
|
; VI-DS128: ; %bb.0:
|
|
; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; VI-DS128-NEXT: s_mov_b32 m0, -1
|
|
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v0, s1
|
|
; VI-DS128-NEXT: ds_read_b64 v[4:5], v0
|
|
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v5
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v4
|
|
; VI-DS128-NEXT: v_bfe_i32 v2, v5, 0, 16
|
|
; VI-DS128-NEXT: v_bfe_i32 v0, v4, 0, 16
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v4, s0
|
|
; VI-DS128-NEXT: ds_write_b128 v4, v[0:3]
|
|
; VI-DS128-NEXT: s_endpgm
|
|
;
|
|
; GFX9-DS128-LABEL: local_sextload_v4i16_to_v4i32:
|
|
; GFX9-DS128: ; %bb.0:
|
|
; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v0, s1
|
|
; GFX9-DS128-NEXT: ds_read_b64 v[4:5], v0
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v5
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v4
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v2, v5, 0, 16
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v0, v4, 0, 16
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v4, s0
|
|
; GFX9-DS128-NEXT: ds_write_b128 v4, v[0:3]
|
|
; GFX9-DS128-NEXT: s_endpgm
|
|
%load = load <4 x i16>, ptr addrspace(3) %in
|
|
%ext = sext <4 x i16> %load to <4 x i32>
|
|
store <4 x i32> %ext, ptr addrspace(3) %out
|
|
ret void
|
|
}
|
|
|
|
; Loads an <8 x i16> vector from %in (address space 3), zero-extends each
; element to i32 and stores the resulting <8 x i32> to %out (address space 3).
; The function carries attribute group #0, which is defined elsewhere in this
; file. The tonga and gfx900 expectations are split into ds128-disabled and
; ds128-enabled variants; the enabled variants expect ds_read_b128 and
; ds_write_b128 where the disabled variants expect ds_read2_b64 and
; ds_write2_b64.
; The per-target check lines below are produced by
; utils/update_llc_test_checks.py; rerun the script rather than editing them
; by hand.
define amdgpu_kernel void @local_zextload_v8i16_to_v8i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
|
|
; SI-LABEL: local_zextload_v8i16_to_v8i32:
|
|
; SI: ; %bb.0:
|
|
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: v_mov_b32_e32 v0, s1
|
|
; SI-NEXT: s_mov_b32 m0, -1
|
|
; SI-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
|
|
; SI-NEXT: v_mov_b32_e32 v12, s0
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0
|
|
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1
|
|
; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v2
|
|
; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v3
|
|
; SI-NEXT: v_and_b32_e32 v4, 0xffff, v0
|
|
; SI-NEXT: v_and_b32_e32 v6, 0xffff, v1
|
|
; SI-NEXT: v_and_b32_e32 v8, 0xffff, v2
|
|
; SI-NEXT: v_and_b32_e32 v10, 0xffff, v3
|
|
; SI-NEXT: ds_write2_b64 v12, v[8:9], v[10:11] offset0:2 offset1:3
|
|
; SI-NEXT: ds_write2_b64 v12, v[4:5], v[6:7] offset1:1
|
|
; SI-NEXT: s_endpgm
|
|
;
|
|
; VI-NO-DS128-LABEL: local_zextload_v8i16_to_v8i32:
|
|
; VI-NO-DS128: ; %bb.0:
|
|
; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; VI-NO-DS128-NEXT: s_mov_b32 m0, -1
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, s1
|
|
; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v0
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v0
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v1
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v1
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v2
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v2
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v3
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v3
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v2, s0
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v2, v[0:1], v[8:9] offset0:2 offset1:3
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v2, v[4:5], v[6:7] offset1:1
|
|
; VI-NO-DS128-NEXT: s_endpgm
|
|
;
|
|
; GFX9-NO-DS128-LABEL: local_zextload_v8i16_to_v8i32:
|
|
; GFX9-NO-DS128: ; %bb.0:
|
|
; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s1
|
|
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v10, s0
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v0
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v0
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v1
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v1
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v2
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v2
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v3
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v3
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v10, v[0:1], v[8:9] offset0:2 offset1:3
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v10, v[4:5], v[6:7] offset1:1
|
|
; GFX9-NO-DS128-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: local_zextload_v8i16_to_v8i32:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: ALU 46, @16, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T0.Y, OQAP,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T0.Z, OQAP,
|
|
; EG-NEXT: MOV * T0.W, KC0[2].Z,
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T0.W, OQAP,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T1.W
|
|
; EG-NEXT: MOV T1.Y, OQAP,
|
|
; EG-NEXT: AND_INT T1.W, T0.W, literal.x,
|
|
; EG-NEXT: MOV * T2.W, KC0[2].Y,
|
|
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T2.W, T1.W,
|
|
; EG-NEXT: LSHR T0.W, T0.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: AND_INT T0.W, T0.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 8(1.121039e-44)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: LSHR T0.W, T0.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 12(1.681558e-44)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: LSHR T0.W, T0.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 20(2.802597e-44)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: AND_INT T0.W, T1.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 24(3.363116e-44)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: LSHR T0.W, T1.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 28(3.923636e-44)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: RETURN
|
|
;
|
|
; VI-DS128-LABEL: local_zextload_v8i16_to_v8i32:
|
|
; VI-DS128: ; %bb.0:
|
|
; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; VI-DS128-NEXT: s_mov_b32 m0, -1
|
|
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v0, s1
|
|
; VI-DS128-NEXT: ds_read_b128 v[0:3], v0
|
|
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v0
|
|
; VI-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v0
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v3
|
|
; VI-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v3
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v2
|
|
; VI-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v2
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v0, s0
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v1
|
|
; VI-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v1
|
|
; VI-DS128-NEXT: ds_write_b128 v0, v[8:11] offset:16
|
|
; VI-DS128-NEXT: ds_write_b128 v0, v[4:7]
|
|
; VI-DS128-NEXT: s_endpgm
|
|
;
|
|
; GFX9-DS128-LABEL: local_zextload_v8i16_to_v8i32:
|
|
; GFX9-DS128: ; %bb.0:
|
|
; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v0, s1
|
|
; GFX9-DS128-NEXT: ds_read_b128 v[0:3], v0
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v12, s0
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v3
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v3
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v2
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v2
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v1
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v1
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v0
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v0
|
|
; GFX9-DS128-NEXT: ds_write_b128 v12, v[8:11] offset:16
|
|
; GFX9-DS128-NEXT: ds_write_b128 v12, v[4:7]
|
|
; GFX9-DS128-NEXT: s_endpgm
|
|
%load = load <8 x i16>, ptr addrspace(3) %in
|
|
%ext = zext <8 x i16> %load to <8 x i32>
|
|
store <8 x i32> %ext, ptr addrspace(3) %out
|
|
ret void
|
|
}
|
|
|
|
; Loads an <8 x i16> vector from %in (address space 3), sign-extends each
; element to i32 and stores the resulting <8 x i32> to %out (address space 3).
; The function carries attribute group #0, which is defined elsewhere in this
; file. The tonga and gfx900 expectations are split into ds128-disabled and
; ds128-enabled variants; the enabled variants expect ds_read_b128 and
; ds_write_b128 where the disabled variants expect ds_read2_b64 and
; ds_write2_b64.
; The per-target check lines below are produced by
; utils/update_llc_test_checks.py; rerun the script rather than editing them
; by hand.
define amdgpu_kernel void @local_sextload_v8i16_to_v8i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
|
|
; SI-LABEL: local_sextload_v8i16_to_v8i32:
|
|
; SI: ; %bb.0:
|
|
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: v_mov_b32_e32 v0, s1
|
|
; SI-NEXT: s_mov_b32 m0, -1
|
|
; SI-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
|
|
; SI-NEXT: v_mov_b32_e32 v12, s0
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: v_ashrrev_i32_e32 v5, 16, v0
|
|
; SI-NEXT: v_ashrrev_i32_e32 v7, 16, v1
|
|
; SI-NEXT: v_ashrrev_i32_e32 v9, 16, v2
|
|
; SI-NEXT: v_ashrrev_i32_e32 v11, 16, v3
|
|
; SI-NEXT: v_bfe_i32 v4, v0, 0, 16
|
|
; SI-NEXT: v_bfe_i32 v6, v1, 0, 16
|
|
; SI-NEXT: v_bfe_i32 v8, v2, 0, 16
|
|
; SI-NEXT: v_bfe_i32 v10, v3, 0, 16
|
|
; SI-NEXT: ds_write2_b64 v12, v[8:9], v[10:11] offset0:2 offset1:3
|
|
; SI-NEXT: ds_write2_b64 v12, v[4:5], v[6:7] offset1:1
|
|
; SI-NEXT: s_endpgm
|
|
;
|
|
; VI-NO-DS128-LABEL: local_sextload_v8i16_to_v8i32:
|
|
; VI-NO-DS128: ; %bb.0:
|
|
; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; VI-NO-DS128-NEXT: s_mov_b32 m0, -1
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, s1
|
|
; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v0
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v2
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v3
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v4, v0, 0, 16
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v8, v2, 0, 16
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v10, v3, 0, 16
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, s0
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v1
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[8:9], v[10:11] offset0:2 offset1:3
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v0, v[4:5], v[6:7] offset1:1
|
|
; VI-NO-DS128-NEXT: s_endpgm
|
|
;
|
|
; GFX9-NO-DS128-LABEL: local_sextload_v8i16_to_v8i32:
|
|
; GFX9-NO-DS128: ; %bb.0:
|
|
; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s1
|
|
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v12, s0
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v2
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v3
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v8, v2, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v10, v3, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v0
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v1
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v0, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v12, v[8:9], v[10:11] offset0:2 offset1:3
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v12, v[4:5], v[6:7] offset1:1
|
|
; GFX9-NO-DS128-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: local_sextload_v8i16_to_v8i32:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: ALU 51, @17, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T0.Y, OQAP,
|
|
; EG-NEXT: MOV * T0.W, KC0[2].Z,
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T0.Z, OQAP,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T0.W, OQAP,
|
|
; EG-NEXT: LSHR * T1.W, T0.Z, literal.x,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T2.W
|
|
; EG-NEXT: MOV T1.Y, OQAP,
|
|
; EG-NEXT: LSHR T1.Z, T0.W, literal.x,
|
|
; EG-NEXT: BFE_INT T1.W, T1.W, 0.0, literal.x, BS:VEC_120/SCL_212
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45)
|
|
; EG-NEXT: LDS_WRITE * T2.W, T1.W,
|
|
; EG-NEXT: LSHR T2.Z, T0.Y, literal.x,
|
|
; EG-NEXT: BFE_INT T1.W, T1.Z, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 12(1.681558e-44)
|
|
; EG-NEXT: LDS_WRITE * T2.W, T1.W,
|
|
; EG-NEXT: LSHR T1.Z, T1.Y, literal.x,
|
|
; EG-NEXT: BFE_INT T1.W, T2.Z, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 20(2.802597e-44)
|
|
; EG-NEXT: LDS_WRITE * T2.W, T1.W,
|
|
; EG-NEXT: BFE_INT T1.W, T1.Z, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 28(3.923636e-44)
|
|
; EG-NEXT: LDS_WRITE * T2.W, T1.W,
|
|
; EG-NEXT: BFE_INT T1.W, T0.Z, 0.0, literal.x,
|
|
; EG-NEXT: MOV * T2.W, KC0[2].Y,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T2.W, T1.W,
|
|
; EG-NEXT: BFE_INT T0.W, T0.W, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: BFE_INT T0.W, T0.Y, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: BFE_INT T0.W, T1.Y, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: RETURN
|
|
;
|
|
; VI-DS128-LABEL: local_sextload_v8i16_to_v8i32:
|
|
; VI-DS128: ; %bb.0:
|
|
; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; VI-DS128-NEXT: s_mov_b32 m0, -1
|
|
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v0, s1
|
|
; VI-DS128-NEXT: ds_read_b128 v[0:3], v0
|
|
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v0
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v3
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v2
|
|
; VI-DS128-NEXT: v_bfe_i32 v4, v0, 0, 16
|
|
; VI-DS128-NEXT: v_bfe_i32 v10, v3, 0, 16
|
|
; VI-DS128-NEXT: v_bfe_i32 v8, v2, 0, 16
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v0, s0
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v1
|
|
; VI-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16
|
|
; VI-DS128-NEXT: ds_write_b128 v0, v[8:11] offset:16
|
|
; VI-DS128-NEXT: ds_write_b128 v0, v[4:7]
|
|
; VI-DS128-NEXT: s_endpgm
|
|
;
|
|
; GFX9-DS128-LABEL: local_sextload_v8i16_to_v8i32:
|
|
; GFX9-DS128: ; %bb.0:
|
|
; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v0, s1
|
|
; GFX9-DS128-NEXT: ds_read_b128 v[0:3], v0
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v12, s0
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v3
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v2
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v10, v3, 0, 16
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v8, v2, 0, 16
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v1
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v0
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v4, v0, 0, 16
|
|
; GFX9-DS128-NEXT: ds_write_b128 v12, v[8:11] offset:16
|
|
; GFX9-DS128-NEXT: ds_write_b128 v12, v[4:7]
|
|
; GFX9-DS128-NEXT: s_endpgm
|
|
%load = load <8 x i16>, ptr addrspace(3) %in
|
|
%ext = sext <8 x i16> %load to <8 x i32>
|
|
store <8 x i32> %ext, ptr addrspace(3) %out
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @local_zextload_v16i16_to_v16i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
|
|
; SI-LABEL: local_zextload_v16i16_to_v16i32:
|
|
; SI: ; %bb.0:
|
|
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: v_mov_b32_e32 v4, s1
|
|
; SI-NEXT: s_mov_b32 m0, -1
|
|
; SI-NEXT: ds_read2_b64 v[0:3], v4 offset1:1
|
|
; SI-NEXT: ds_read2_b64 v[4:7], v4 offset0:2 offset1:3
|
|
; SI-NEXT: s_waitcnt lgkmcnt(1)
|
|
; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v1
|
|
; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v0
|
|
; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v3
|
|
; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v2
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v5
|
|
; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4
|
|
; SI-NEXT: v_and_b32_e32 v8, 0xffff, v1
|
|
; SI-NEXT: v_and_b32_e32 v10, 0xffff, v0
|
|
; SI-NEXT: v_and_b32_e32 v12, 0xffff, v3
|
|
; SI-NEXT: v_and_b32_e32 v14, 0xffff, v2
|
|
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v7
|
|
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v6
|
|
; SI-NEXT: v_and_b32_e32 v16, 0xffff, v5
|
|
; SI-NEXT: v_and_b32_e32 v18, 0xffff, v4
|
|
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v7
|
|
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v6
|
|
; SI-NEXT: v_mov_b32_e32 v4, s0
|
|
; SI-NEXT: ds_write2_b64 v4, v[2:3], v[0:1] offset0:6 offset1:7
|
|
; SI-NEXT: ds_write2_b64 v4, v[18:19], v[16:17] offset0:4 offset1:5
|
|
; SI-NEXT: ds_write2_b64 v4, v[14:15], v[12:13] offset0:2 offset1:3
|
|
; SI-NEXT: ds_write2_b64 v4, v[10:11], v[8:9] offset1:1
|
|
; SI-NEXT: s_endpgm
|
|
;
|
|
; VI-NO-DS128-LABEL: local_zextload_v16i16_to_v16i32:
|
|
; VI-NO-DS128: ; %bb.0:
|
|
; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; VI-NO-DS128-NEXT: s_mov_b32 m0, -1
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v4, s1
|
|
; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v4 offset1:1
|
|
; VI-NO-DS128-NEXT: ds_read2_b64 v[4:7], v4 offset0:2 offset1:3
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v16, s0
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v1
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v7
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v7
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v6
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v6
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v1
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v0
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v3
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v3
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v2
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v5
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v5
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v4
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v4
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v16, v[6:7], v[14:15] offset0:6 offset1:7
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v16, v[4:5], v[12:13] offset0:4 offset1:5
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v16, v[2:3], v[10:11] offset0:2 offset1:3
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v16, v[0:1], v[8:9] offset1:1
|
|
; VI-NO-DS128-NEXT: s_endpgm
|
|
;
|
|
; GFX9-NO-DS128-LABEL: local_zextload_v16i16_to_v16i32:
|
|
; GFX9-NO-DS128: ; %bb.0:
|
|
; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v4, s1
|
|
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v4 offset1:1
|
|
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[4:7], v4 offset0:2 offset1:3
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v16, s0
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v1
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v7
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v7
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v6
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v6
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v1
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v0
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v3
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v3
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v2
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v5
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v5
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v4
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v4
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v16, v[6:7], v[14:15] offset0:6 offset1:7
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v16, v[4:5], v[12:13] offset0:4 offset1:5
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v16, v[2:3], v[10:11] offset0:2 offset1:3
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v16, v[0:1], v[8:9] offset1:1
|
|
; GFX9-NO-DS128-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: local_zextload_v16i16_to_v16i32:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: ALU 94, @18, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T0.Y, OQAP,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T0.Z, OQAP,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T0.W, OQAP,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T1.W
|
|
; EG-NEXT: MOV T1.Y, OQAP,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T1.W
|
|
; EG-NEXT: MOV T1.Z, OQAP,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T1.W
|
|
; EG-NEXT: MOV T1.W, OQAP,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T2.W
|
|
; EG-NEXT: MOV T2.Y, OQAP,
|
|
; EG-NEXT: MOV * T2.W, KC0[2].Z,
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T2.W
|
|
; EG-NEXT: MOV T2.Z, OQAP,
|
|
; EG-NEXT: LSHR T2.W, T2.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 12(1.681558e-44)
|
|
; EG-NEXT: LDS_WRITE * T3.W, T2.W,
|
|
; EG-NEXT: AND_INT T2.W, T2.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 8(1.121039e-44)
|
|
; EG-NEXT: LDS_WRITE * T3.W, T2.W,
|
|
; EG-NEXT: LSHR T2.W, T2.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45)
|
|
; EG-NEXT: LDS_WRITE * T3.W, T2.W,
|
|
; EG-NEXT: AND_INT T2.W, T2.Z, literal.x,
|
|
; EG-NEXT: MOV * T3.W, KC0[2].Y,
|
|
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T3.W, T2.W,
|
|
; EG-NEXT: LSHR T2.W, T1.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 28(3.923636e-44)
|
|
; EG-NEXT: LDS_WRITE * T3.W, T2.W,
|
|
; EG-NEXT: AND_INT T1.W, T1.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 24(3.363116e-44)
|
|
; EG-NEXT: LDS_WRITE * T2.W, T1.W,
|
|
; EG-NEXT: LSHR T1.W, T1.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 20(2.802597e-44)
|
|
; EG-NEXT: LDS_WRITE * T2.W, T1.W,
|
|
; EG-NEXT: AND_INT T1.W, T1.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
|
|
; EG-NEXT: LDS_WRITE * T2.W, T1.W,
|
|
; EG-NEXT: LSHR T1.W, T1.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 44(6.165713e-44)
|
|
; EG-NEXT: LDS_WRITE * T2.W, T1.W,
|
|
; EG-NEXT: AND_INT T1.W, T1.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 40(5.605194e-44)
|
|
; EG-NEXT: LDS_WRITE * T2.W, T1.W,
|
|
; EG-NEXT: LSHR T1.W, T0.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 36(5.044674e-44)
|
|
; EG-NEXT: LDS_WRITE * T2.W, T1.W,
|
|
; EG-NEXT: AND_INT T0.W, T0.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 32(4.484155e-44)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: LSHR T0.W, T0.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 60(8.407791e-44)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: AND_INT T0.W, T0.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 56(7.847271e-44)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: LSHR T0.W, T0.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 52(7.286752e-44)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 48(6.726233e-44)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: RETURN
|
|
;
|
|
; VI-DS128-LABEL: local_zextload_v16i16_to_v16i32:
|
|
; VI-DS128: ; %bb.0:
|
|
; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; VI-DS128-NEXT: s_mov_b32 m0, -1
|
|
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v4, s1
|
|
; VI-DS128-NEXT: ds_read_b128 v[0:3], v4
|
|
; VI-DS128-NEXT: ds_read_b128 v[4:7], v4 offset:16
|
|
; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v1
|
|
; VI-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v1
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v0
|
|
; VI-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v0
|
|
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v4
|
|
; VI-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v4
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v7
|
|
; VI-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v7
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v6
|
|
; VI-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v6
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v4, s0
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v3
|
|
; VI-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v3
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v2
|
|
; VI-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v2
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v5
|
|
; VI-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v5
|
|
; VI-DS128-NEXT: ds_write_b128 v4, v[16:19] offset:48
|
|
; VI-DS128-NEXT: ds_write_b128 v4, v[0:3] offset:32
|
|
; VI-DS128-NEXT: ds_write_b128 v4, v[12:15] offset:16
|
|
; VI-DS128-NEXT: ds_write_b128 v4, v[8:11]
|
|
; VI-DS128-NEXT: s_endpgm
|
|
;
|
|
; GFX9-DS128-LABEL: local_zextload_v16i16_to_v16i32:
|
|
; GFX9-DS128: ; %bb.0:
|
|
; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v4, s1
|
|
; GFX9-DS128-NEXT: ds_read_b128 v[0:3], v4
|
|
; GFX9-DS128-NEXT: ds_read_b128 v[4:7], v4 offset:16
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v1
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v1
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v0
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v0
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v4
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v4
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v7
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v7
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v6
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v6
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v4, s0
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v3
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v3
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v2
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v2
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v5
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v5
|
|
; GFX9-DS128-NEXT: ds_write_b128 v4, v[16:19] offset:48
|
|
; GFX9-DS128-NEXT: ds_write_b128 v4, v[0:3] offset:32
|
|
; GFX9-DS128-NEXT: ds_write_b128 v4, v[12:15] offset:16
|
|
; GFX9-DS128-NEXT: ds_write_b128 v4, v[8:11]
|
|
; GFX9-DS128-NEXT: s_endpgm
|
|
%load = load <16 x i16>, ptr addrspace(3) %in
|
|
%ext = zext <16 x i16> %load to <16 x i32>
|
|
store <16 x i32> %ext, ptr addrspace(3) %out
|
|
ret void
|
|
}
|
|
|
|
; Sign-extending vector load test: loads <16 x i16> from %in, sign-extends
; every element to i32, and stores the resulting <16 x i32> to %out. Both
; pointers are in addrspace(3). The prefixed lines inside the function are the
; expected llc output for each tested configuration; per the file header they
; are autogenerated by utils/update_llc_test_checks.py, so regenerate them
; with that script instead of editing them by hand.
define amdgpu_kernel void @local_sextload_v16i16_to_v16i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
|
|
; SI-LABEL: local_sextload_v16i16_to_v16i32:
|
|
; SI: ; %bb.0:
|
|
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: v_mov_b32_e32 v4, s1
|
|
; SI-NEXT: s_mov_b32 m0, -1
|
|
; SI-NEXT: ds_read2_b64 v[0:3], v4 offset1:1
|
|
; SI-NEXT: ds_read2_b64 v[4:7], v4 offset0:2 offset1:3
|
|
; SI-NEXT: s_waitcnt lgkmcnt(1)
|
|
; SI-NEXT: v_ashrrev_i32_e32 v9, 16, v1
|
|
; SI-NEXT: v_ashrrev_i32_e32 v11, 16, v0
|
|
; SI-NEXT: v_ashrrev_i32_e32 v13, 16, v3
|
|
; SI-NEXT: v_ashrrev_i32_e32 v15, 16, v2
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: v_ashrrev_i32_e32 v17, 16, v5
|
|
; SI-NEXT: v_ashrrev_i32_e32 v19, 16, v4
|
|
; SI-NEXT: v_bfe_i32 v8, v1, 0, 16
|
|
; SI-NEXT: v_bfe_i32 v10, v0, 0, 16
|
|
; SI-NEXT: v_bfe_i32 v12, v3, 0, 16
|
|
; SI-NEXT: v_bfe_i32 v14, v2, 0, 16
|
|
; SI-NEXT: v_ashrrev_i32_e32 v1, 16, v7
|
|
; SI-NEXT: v_ashrrev_i32_e32 v3, 16, v6
|
|
; SI-NEXT: v_bfe_i32 v16, v5, 0, 16
|
|
; SI-NEXT: v_bfe_i32 v18, v4, 0, 16
|
|
; SI-NEXT: v_bfe_i32 v0, v7, 0, 16
|
|
; SI-NEXT: v_bfe_i32 v2, v6, 0, 16
|
|
; SI-NEXT: v_mov_b32_e32 v4, s0
|
|
; SI-NEXT: ds_write2_b64 v4, v[2:3], v[0:1] offset0:6 offset1:7
|
|
; SI-NEXT: ds_write2_b64 v4, v[18:19], v[16:17] offset0:4 offset1:5
|
|
; SI-NEXT: ds_write2_b64 v4, v[14:15], v[12:13] offset0:2 offset1:3
|
|
; SI-NEXT: ds_write2_b64 v4, v[10:11], v[8:9] offset1:1
|
|
; SI-NEXT: s_endpgm
|
|
;
|
|
; VI-NO-DS128-LABEL: local_sextload_v16i16_to_v16i32:
|
|
; VI-NO-DS128: ; %bb.0:
|
|
; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; VI-NO-DS128-NEXT: s_mov_b32 m0, -1
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v4, s1
|
|
; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v4 offset1:1
|
|
; VI-NO-DS128-NEXT: ds_read2_b64 v[4:7], v4 offset0:2 offset1:3
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v1
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v0
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v3
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v2
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v4
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v8, v1, 0, 16
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v10, v0, 0, 16
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v12, v3, 0, 16
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v14, v2, 0, 16
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v7
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v6
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v18, v4, 0, 16
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v0, v7, 0, 16
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v2, v6, 0, 16
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v4, s0
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v5
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v16, v5, 0, 16
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v4, v[2:3], v[0:1] offset0:6 offset1:7
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v4, v[18:19], v[16:17] offset0:4 offset1:5
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v4, v[14:15], v[12:13] offset0:2 offset1:3
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v4, v[10:11], v[8:9] offset1:1
|
|
; VI-NO-DS128-NEXT: s_endpgm
|
|
;
|
|
; GFX9-NO-DS128-LABEL: local_sextload_v16i16_to_v16i32:
|
|
; GFX9-NO-DS128: ; %bb.0:
|
|
; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v4, s1
|
|
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v4 offset1:1
|
|
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[4:7], v4 offset0:2 offset1:3
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v1
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v0
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v3
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v2
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v4
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v8, v1, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v10, v0, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v12, v3, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v2, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v7
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v6
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v4, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v7, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v6, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v4, s0
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v5
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v5, 0, 16
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v4, v[2:3], v[0:1] offset0:6 offset1:7
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v4, v[18:19], v[16:17] offset0:4 offset1:5
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v4, v[14:15], v[12:13] offset0:2 offset1:3
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v4, v[10:11], v[8:9] offset1:1
|
|
; GFX9-NO-DS128-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: local_sextload_v16i16_to_v16i32:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: ALU 95, @19, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T0.Y, OQAP,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T0.Z, OQAP,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T0.W, OQAP,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T1.W
|
|
; EG-NEXT: MOV T1.Y, OQAP,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T1.W
|
|
; EG-NEXT: MOV T1.Z, OQAP,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T1.W
|
|
; EG-NEXT: MOV T1.W, OQAP,
|
|
; EG-NEXT: MOV * T2.W, KC0[2].Z,
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T2.W
|
|
; EG-NEXT: MOV T2.Y, OQAP,
|
|
; EG-NEXT: LSHR T2.W, T1.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T3.W
|
|
; EG-NEXT: MOV T2.Z, OQAP,
|
|
; EG-NEXT: LSHR * T3.Z, T2.Y, literal.x,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T2.W, T2.W, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 12(1.681558e-44)
|
|
; EG-NEXT: LDS_WRITE * T3.W, T2.W,
|
|
; EG-NEXT: LSHR T4.Z, T0.Y, literal.x,
|
|
; EG-NEXT: BFE_INT T2.W, T3.Z, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45)
|
|
; EG-NEXT: LDS_WRITE * T3.W, T2.W,
|
|
; EG-NEXT: LSHR T3.Z, T0.Z, literal.x,
|
|
; EG-NEXT: BFE_INT T2.W, T4.Z, 0.0, literal.x, BS:VEC_120/SCL_212
|
|
; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 28(3.923636e-44)
|
|
; EG-NEXT: LDS_WRITE * T3.W, T2.W,
|
|
; EG-NEXT: LSHR T4.Z, T0.W, literal.x,
|
|
; EG-NEXT: BFE_INT T2.W, T3.Z, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 20(2.802597e-44)
|
|
; EG-NEXT: LDS_WRITE * T3.W, T2.W,
|
|
; EG-NEXT: LSHR T3.Z, T1.Y, literal.x,
|
|
; EG-NEXT: BFE_INT T2.W, T4.Z, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 44(6.165713e-44)
|
|
; EG-NEXT: LDS_WRITE * T3.W, T2.W,
|
|
; EG-NEXT: LSHR T4.Z, T1.Z, literal.x,
|
|
; EG-NEXT: BFE_INT T2.W, T3.Z, 0.0, literal.x, BS:VEC_120/SCL_212
|
|
; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 36(5.044674e-44)
|
|
; EG-NEXT: LDS_WRITE * T3.W, T2.W,
|
|
; EG-NEXT: LSHR T3.Z, T2.Z, literal.x,
|
|
; EG-NEXT: BFE_INT T2.W, T4.Z, 0.0, literal.x, BS:VEC_120/SCL_212
|
|
; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 60(8.407791e-44)
|
|
; EG-NEXT: LDS_WRITE * T3.W, T2.W,
|
|
; EG-NEXT: BFE_INT T2.W, T3.Z, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 52(7.286752e-44)
|
|
; EG-NEXT: LDS_WRITE * T3.W, T2.W,
|
|
; EG-NEXT: BFE_INT T1.W, T1.W, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44)
|
|
; EG-NEXT: LDS_WRITE * T2.W, T1.W,
|
|
; EG-NEXT: BFE_INT T1.W, T2.Y, 0.0, literal.x,
|
|
; EG-NEXT: MOV * T2.W, KC0[2].Y,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T2.W, T1.W,
|
|
; EG-NEXT: BFE_INT T1.W, T0.Y, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
|
|
; EG-NEXT: LDS_WRITE * T2.W, T1.W,
|
|
; EG-NEXT: BFE_INT T1.W, T0.Z, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T2.W, T1.W,
|
|
; EG-NEXT: BFE_INT T0.W, T0.W, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 40(5.605194e-44)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: BFE_INT T0.W, T1.Y, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 32(4.484155e-44)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: ALU 7, @20, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: BFE_INT T0.W, T1.Z, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 56(7.847271e-44)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: BFE_INT T0.W, T2.Z, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 48(6.726233e-44)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: RETURN
|
|
;
|
|
; VI-DS128-LABEL: local_sextload_v16i16_to_v16i32:
|
|
; VI-DS128: ; %bb.0:
|
|
; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; VI-DS128-NEXT: s_mov_b32 m0, -1
|
|
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v4, s1
|
|
; VI-DS128-NEXT: ds_read_b128 v[0:3], v4
|
|
; VI-DS128-NEXT: ds_read_b128 v[4:7], v4 offset:16
|
|
; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v1
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v0
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v3
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v2
|
|
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v4
|
|
; VI-DS128-NEXT: v_bfe_i32 v10, v1, 0, 16
|
|
; VI-DS128-NEXT: v_bfe_i32 v8, v0, 0, 16
|
|
; VI-DS128-NEXT: v_bfe_i32 v14, v3, 0, 16
|
|
; VI-DS128-NEXT: v_bfe_i32 v12, v2, 0, 16
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v7
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v6
|
|
; VI-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16
|
|
; VI-DS128-NEXT: v_bfe_i32 v2, v7, 0, 16
|
|
; VI-DS128-NEXT: v_bfe_i32 v0, v6, 0, 16
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v4, s0
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v5
|
|
; VI-DS128-NEXT: v_bfe_i32 v18, v5, 0, 16
|
|
; VI-DS128-NEXT: ds_write_b128 v4, v[0:3] offset:48
|
|
; VI-DS128-NEXT: ds_write_b128 v4, v[16:19] offset:32
|
|
; VI-DS128-NEXT: ds_write_b128 v4, v[12:15] offset:16
|
|
; VI-DS128-NEXT: ds_write_b128 v4, v[8:11]
|
|
; VI-DS128-NEXT: s_endpgm
|
|
;
|
|
; GFX9-DS128-LABEL: local_sextload_v16i16_to_v16i32:
|
|
; GFX9-DS128: ; %bb.0:
|
|
; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v4, s1
|
|
; GFX9-DS128-NEXT: ds_read_b128 v[0:3], v4
|
|
; GFX9-DS128-NEXT: ds_read_b128 v[4:7], v4 offset:16
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v1
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v0
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v3
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v2
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v4
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v10, v1, 0, 16
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v8, v0, 0, 16
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v14, v3, 0, 16
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v12, v2, 0, 16
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v7
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v6
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v2, v7, 0, 16
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v0, v6, 0, 16
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v4, s0
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v5
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v18, v5, 0, 16
|
|
; GFX9-DS128-NEXT: ds_write_b128 v4, v[0:3] offset:48
|
|
; GFX9-DS128-NEXT: ds_write_b128 v4, v[16:19] offset:32
|
|
; GFX9-DS128-NEXT: ds_write_b128 v4, v[12:15] offset:16
|
|
; GFX9-DS128-NEXT: ds_write_b128 v4, v[8:11]
|
|
; GFX9-DS128-NEXT: s_endpgm
|
|
%load = load <16 x i16>, ptr addrspace(3) %in
|
|
%ext = sext <16 x i16> %load to <16 x i32>
|
|
store <16 x i32> %ext, ptr addrspace(3) %out
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @local_zextload_v32i16_to_v32i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
|
|
; SI-LABEL: local_zextload_v32i16_to_v32i32:
|
|
; SI: ; %bb.0:
|
|
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: v_mov_b32_e32 v12, s1
|
|
; SI-NEXT: s_mov_b32 m0, -1
|
|
; SI-NEXT: ds_read2_b64 v[0:3], v12 offset1:1
|
|
; SI-NEXT: ds_read2_b64 v[4:7], v12 offset0:2 offset1:3
|
|
; SI-NEXT: ds_read2_b64 v[8:11], v12 offset0:4 offset1:5
|
|
; SI-NEXT: ds_read2_b64 v[12:15], v12 offset0:6 offset1:7
|
|
; SI-NEXT: s_waitcnt lgkmcnt(3)
|
|
; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v1
|
|
; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v0
|
|
; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v3
|
|
; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v2
|
|
; SI-NEXT: v_and_b32_e32 v16, 0xffff, v1
|
|
; SI-NEXT: v_and_b32_e32 v18, 0xffff, v0
|
|
; SI-NEXT: v_and_b32_e32 v20, 0xffff, v3
|
|
; SI-NEXT: v_and_b32_e32 v22, 0xffff, v2
|
|
; SI-NEXT: s_waitcnt lgkmcnt(2)
|
|
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5
|
|
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4
|
|
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v5
|
|
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v4
|
|
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v7
|
|
; SI-NEXT: v_and_b32_e32 v4, 0xffff, v7
|
|
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v6
|
|
; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6
|
|
; SI-NEXT: s_waitcnt lgkmcnt(1)
|
|
; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v9
|
|
; SI-NEXT: v_and_b32_e32 v24, 0xffff, v9
|
|
; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8
|
|
; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8
|
|
; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v11
|
|
; SI-NEXT: v_and_b32_e32 v26, 0xffff, v11
|
|
; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10
|
|
; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v13
|
|
; SI-NEXT: v_and_b32_e32 v28, 0xffff, v13
|
|
; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12
|
|
; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12
|
|
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v15
|
|
; SI-NEXT: v_and_b32_e32 v30, 0xffff, v15
|
|
; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14
|
|
; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14
|
|
; SI-NEXT: v_mov_b32_e32 v32, s0
|
|
; SI-NEXT: ds_write2_b64 v32, v[14:15], v[30:31] offset0:14 offset1:15
|
|
; SI-NEXT: ds_write2_b64 v32, v[12:13], v[28:29] offset0:12 offset1:13
|
|
; SI-NEXT: ds_write2_b64 v32, v[10:11], v[26:27] offset0:10 offset1:11
|
|
; SI-NEXT: ds_write2_b64 v32, v[8:9], v[24:25] offset0:8 offset1:9
|
|
; SI-NEXT: ds_write2_b64 v32, v[6:7], v[4:5] offset0:6 offset1:7
|
|
; SI-NEXT: ds_write2_b64 v32, v[2:3], v[0:1] offset0:4 offset1:5
|
|
; SI-NEXT: ds_write2_b64 v32, v[22:23], v[20:21] offset0:2 offset1:3
|
|
; SI-NEXT: ds_write2_b64 v32, v[18:19], v[16:17] offset1:1
|
|
; SI-NEXT: s_endpgm
|
|
;
|
|
; VI-NO-DS128-LABEL: local_zextload_v32i16_to_v32i32:
|
|
; VI-NO-DS128: ; %bb.0:
|
|
; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; VI-NO-DS128-NEXT: s_mov_b32 m0, -1
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v24, s1
|
|
; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v24 offset1:1
|
|
; VI-NO-DS128-NEXT: ds_read2_b64 v[4:7], v24 offset0:2 offset1:3
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v32, s0
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v3
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v3
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v2
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v2
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v1
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v1
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v0
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v0
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v7
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v7
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v6
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v6
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v5
|
|
; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v24 offset0:4 offset1:5
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v5
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v4
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v4
|
|
; VI-NO-DS128-NEXT: ds_read2_b64 v[4:7], v24 offset0:6 offset1:7
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v25, 16, v1
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v1
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v0
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v5
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v5
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v4
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v4
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v27, 16, v3
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v26, 0xffff, v3
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v2
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v7
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v7
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v6
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v6
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[4:5], v[30:31] offset0:12 offset1:13
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[6:7], v[28:29] offset0:14 offset1:15
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[2:3], v[26:27] offset0:10 offset1:11
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[0:1], v[24:25] offset0:8 offset1:9
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[22:23], v[20:21] offset0:4 offset1:5
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[18:19], v[16:17] offset0:6 offset1:7
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[14:15], v[12:13] offset1:1
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[10:11], v[8:9] offset0:2 offset1:3
|
|
; VI-NO-DS128-NEXT: s_endpgm
|
|
;
|
|
; GFX9-NO-DS128-LABEL: local_zextload_v32i16_to_v32i32:
|
|
; GFX9-NO-DS128: ; %bb.0:
|
|
; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v24, s1
|
|
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v24 offset1:1
|
|
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[4:7], v24 offset0:2 offset1:3
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v32, s0
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v3
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v3
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v2
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v2
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v1
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v1
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v0
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v0
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v7
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v7
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v6
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v6
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v5
|
|
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v24 offset0:4 offset1:5
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v5
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v4
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v4
|
|
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[4:7], v24 offset0:6 offset1:7
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v25, 16, v1
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v1
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v0
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v5
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v5
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v4
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v4
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v27, 16, v3
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v26, 0xffff, v3
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v2
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v7
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v7
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v6
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v6
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[4:5], v[30:31] offset0:12 offset1:13
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[6:7], v[28:29] offset0:14 offset1:15
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[2:3], v[26:27] offset0:10 offset1:11
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[0:1], v[24:25] offset0:8 offset1:9
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[22:23], v[20:21] offset0:4 offset1:5
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[18:19], v[16:17] offset0:6 offset1:7
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[14:15], v[12:13] offset1:1
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[10:11], v[8:9] offset0:2 offset1:3
|
|
; GFX9-NO-DS128-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: local_zextload_v32i16_to_v32i32:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: ALU 105, @21, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T0.Y, OQAP,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 52(7.286752e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T0.Z, OQAP,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 56(7.847271e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T0.W, OQAP,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 60(8.407791e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T1.W
|
|
; EG-NEXT: MOV T1.Y, OQAP,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T1.W
|
|
; EG-NEXT: MOV T1.Z, OQAP,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 36(5.044674e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T1.W
|
|
; EG-NEXT: MOV T1.W, OQAP,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 40(5.605194e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T2.W
|
|
; EG-NEXT: MOV T2.Y, OQAP,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 44(6.165713e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T2.W
|
|
; EG-NEXT: MOV T2.Z, OQAP,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T2.W
|
|
; EG-NEXT: MOV T2.W, OQAP,
|
|
; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T3.W
|
|
; EG-NEXT: MOV T3.Y, OQAP,
|
|
; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T3.W
|
|
; EG-NEXT: MOV T3.Z, OQAP,
|
|
; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T3.W
|
|
; EG-NEXT: MOV T3.W, OQAP,
|
|
; EG-NEXT: MOV * T4.W, KC0[2].Z,
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T4.W
|
|
; EG-NEXT: MOV T4.Y, OQAP,
|
|
; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T4.W
|
|
; EG-NEXT: MOV T4.Z, OQAP,
|
|
; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T4.W
|
|
; EG-NEXT: MOV T4.W, OQAP,
|
|
; EG-NEXT: ADD_INT * T5.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T5.W
|
|
; EG-NEXT: MOV T5.Y, OQAP,
|
|
; EG-NEXT: LSHR T5.W, T4.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 28(3.923636e-44)
|
|
; EG-NEXT: LDS_WRITE * T6.W, T5.W,
|
|
; EG-NEXT: AND_INT T4.W, T4.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 24(3.363116e-44)
|
|
; EG-NEXT: LDS_WRITE * T5.W, T4.W,
|
|
; EG-NEXT: LSHR T4.W, T5.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 20(2.802597e-44)
|
|
; EG-NEXT: LDS_WRITE * T5.W, T4.W,
|
|
; EG-NEXT: AND_INT T4.W, T5.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
|
|
; EG-NEXT: LDS_WRITE * T5.W, T4.W,
|
|
; EG-NEXT: LSHR T4.W, T4.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 12(1.681558e-44)
|
|
; EG-NEXT: LDS_WRITE * T5.W, T4.W,
|
|
; EG-NEXT: AND_INT T4.W, T4.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 8(1.121039e-44)
|
|
; EG-NEXT: LDS_WRITE * T5.W, T4.W,
|
|
; EG-NEXT: LSHR T4.W, T4.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45)
|
|
; EG-NEXT: LDS_WRITE * T5.W, T4.W,
|
|
; EG-NEXT: AND_INT T4.W, T4.Y, literal.x,
|
|
; EG-NEXT: MOV * T5.W, KC0[2].Y,
|
|
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T5.W, T4.W,
|
|
; EG-NEXT: LSHR T4.W, T3.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 60(8.407791e-44)
|
|
; EG-NEXT: LDS_WRITE * T5.W, T4.W,
|
|
; EG-NEXT: AND_INT T3.W, T3.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 56(7.847271e-44)
|
|
; EG-NEXT: LDS_WRITE * T4.W, T3.W,
|
|
; EG-NEXT: LSHR T3.W, T3.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 52(7.286752e-44)
|
|
; EG-NEXT: ALU 84, @22, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: LDS_WRITE * T4.W, T3.W,
|
|
; EG-NEXT: AND_INT T3.W, T3.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 48(6.726233e-44)
|
|
; EG-NEXT: LDS_WRITE * T4.W, T3.W,
|
|
; EG-NEXT: LSHR T3.W, T3.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 44(6.165713e-44)
|
|
; EG-NEXT: LDS_WRITE * T4.W, T3.W,
|
|
; EG-NEXT: AND_INT T3.W, T3.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 40(5.605194e-44)
|
|
; EG-NEXT: LDS_WRITE * T4.W, T3.W,
|
|
; EG-NEXT: LSHR T3.W, T2.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 36(5.044674e-44)
|
|
; EG-NEXT: LDS_WRITE * T4.W, T3.W,
|
|
; EG-NEXT: AND_INT T2.W, T2.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 32(4.484155e-44)
|
|
; EG-NEXT: LDS_WRITE * T3.W, T2.W,
|
|
; EG-NEXT: LSHR T2.W, T2.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 92(1.289195e-43)
|
|
; EG-NEXT: LDS_WRITE * T3.W, T2.W,
|
|
; EG-NEXT: AND_INT T2.W, T2.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 88(1.233143e-43)
|
|
; EG-NEXT: LDS_WRITE * T3.W, T2.W,
|
|
; EG-NEXT: LSHR T2.W, T2.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 84(1.177091e-43)
|
|
; EG-NEXT: LDS_WRITE * T3.W, T2.W,
|
|
; EG-NEXT: AND_INT T2.W, T2.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 80(1.121039e-43)
|
|
; EG-NEXT: LDS_WRITE * T3.W, T2.W,
|
|
; EG-NEXT: LSHR T2.W, T1.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 76(1.064987e-43)
|
|
; EG-NEXT: LDS_WRITE * T3.W, T2.W,
|
|
; EG-NEXT: AND_INT T1.W, T1.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 72(1.008935e-43)
|
|
; EG-NEXT: LDS_WRITE * T2.W, T1.W,
|
|
; EG-NEXT: LSHR T1.W, T1.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 68(9.528830e-44)
|
|
; EG-NEXT: LDS_WRITE * T2.W, T1.W,
|
|
; EG-NEXT: AND_INT T1.W, T1.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 64(8.968310e-44)
|
|
; EG-NEXT: LDS_WRITE * T2.W, T1.W,
|
|
; EG-NEXT: LSHR T1.W, T1.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 124(1.737610e-43)
|
|
; EG-NEXT: LDS_WRITE * T2.W, T1.W,
|
|
; EG-NEXT: AND_INT T1.W, T1.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 120(1.681558e-43)
|
|
; EG-NEXT: LDS_WRITE * T2.W, T1.W,
|
|
; EG-NEXT: LSHR T1.W, T0.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 116(1.625506e-43)
|
|
; EG-NEXT: LDS_WRITE * T2.W, T1.W,
|
|
; EG-NEXT: AND_INT T0.W, T0.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 112(1.569454e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: LSHR T0.W, T0.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 108(1.513402e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: AND_INT T0.W, T0.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 104(1.457350e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: LSHR T0.W, T0.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 100(1.401298e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 96(1.345247e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: RETURN
|
|
;
|
|
; VI-DS128-LABEL: local_zextload_v32i16_to_v32i32:
|
|
; VI-DS128: ; %bb.0:
|
|
; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; VI-DS128-NEXT: s_mov_b32 m0, -1
|
|
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v20, s1
|
|
; VI-DS128-NEXT: ds_read_b128 v[0:3], v20
|
|
; VI-DS128-NEXT: ds_read_b128 v[4:7], v20 offset:16
|
|
; VI-DS128-NEXT: ds_read_b128 v[16:19], v20 offset:32
|
|
; VI-DS128-NEXT: ds_read_b128 v[20:23], v20 offset:48
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v32, s0
|
|
; VI-DS128-NEXT: s_waitcnt lgkmcnt(3)
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v3
|
|
; VI-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v3
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v2
|
|
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v23
|
|
; VI-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v23
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v22
|
|
; VI-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v22
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v21
|
|
; VI-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v21
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v20
|
|
; VI-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v20
|
|
; VI-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v2
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v1
|
|
; VI-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v1
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v0
|
|
; VI-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v7
|
|
; VI-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v7
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v6
|
|
; VI-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v6
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v5
|
|
; VI-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v5
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v4
|
|
; VI-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v4
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v27, 16, v19
|
|
; VI-DS128-NEXT: v_and_b32_e32 v26, 0xffff, v19
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v25, 16, v18
|
|
; VI-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v18
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v17
|
|
; VI-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v17
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v16
|
|
; VI-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v16
|
|
; VI-DS128-NEXT: ds_write_b128 v32, v[20:23] offset:96
|
|
; VI-DS128-NEXT: ds_write_b128 v32, v[28:31] offset:112
|
|
; VI-DS128-NEXT: ds_write_b128 v32, v[16:19] offset:64
|
|
; VI-DS128-NEXT: ds_write_b128 v32, v[24:27] offset:80
|
|
; VI-DS128-NEXT: ds_write_b128 v32, v[4:7] offset:32
|
|
; VI-DS128-NEXT: ds_write_b128 v32, v[12:15] offset:48
|
|
; VI-DS128-NEXT: ds_write_b128 v32, v[0:3]
|
|
; VI-DS128-NEXT: ds_write_b128 v32, v[8:11] offset:16
|
|
; VI-DS128-NEXT: s_endpgm
|
|
;
|
|
; GFX9-DS128-LABEL: local_zextload_v32i16_to_v32i32:
|
|
; GFX9-DS128: ; %bb.0:
|
|
; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v20, s1
|
|
; GFX9-DS128-NEXT: ds_read_b128 v[0:3], v20
|
|
; GFX9-DS128-NEXT: ds_read_b128 v[4:7], v20 offset:16
|
|
; GFX9-DS128-NEXT: ds_read_b128 v[16:19], v20 offset:32
|
|
; GFX9-DS128-NEXT: ds_read_b128 v[20:23], v20 offset:48
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v32, s0
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(3)
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v3
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v3
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v2
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v23
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v23
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v22
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v22
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v21
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v21
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v20
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v20
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v2
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v1
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v1
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v0
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v7
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v7
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v6
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v6
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v5
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v5
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v4
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v4
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v27, 16, v19
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v26, 0xffff, v19
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v25, 16, v18
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v18
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v17
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v17
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v16
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v16
|
|
; GFX9-DS128-NEXT: ds_write_b128 v32, v[20:23] offset:96
|
|
; GFX9-DS128-NEXT: ds_write_b128 v32, v[28:31] offset:112
|
|
; GFX9-DS128-NEXT: ds_write_b128 v32, v[16:19] offset:64
|
|
; GFX9-DS128-NEXT: ds_write_b128 v32, v[24:27] offset:80
|
|
; GFX9-DS128-NEXT: ds_write_b128 v32, v[4:7] offset:32
|
|
; GFX9-DS128-NEXT: ds_write_b128 v32, v[12:15] offset:48
|
|
; GFX9-DS128-NEXT: ds_write_b128 v32, v[0:3]
|
|
; GFX9-DS128-NEXT: ds_write_b128 v32, v[8:11] offset:16
|
|
; GFX9-DS128-NEXT: s_endpgm
|
|
%load = load <32 x i16>, ptr addrspace(3) %in
|
|
%ext = zext <32 x i16> %load to <32 x i32>
|
|
store <32 x i32> %ext, ptr addrspace(3) %out
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @local_sextload_v32i16_to_v32i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
|
|
; SI-LABEL: local_sextload_v32i16_to_v32i32:
|
|
; SI: ; %bb.0:
|
|
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: v_mov_b32_e32 v12, s1
|
|
; SI-NEXT: s_mov_b32 m0, -1
|
|
; SI-NEXT: ds_read2_b64 v[0:3], v12 offset1:1
|
|
; SI-NEXT: ds_read2_b64 v[4:7], v12 offset0:2 offset1:3
|
|
; SI-NEXT: ds_read2_b64 v[8:11], v12 offset0:4 offset1:5
|
|
; SI-NEXT: ds_read2_b64 v[12:15], v12 offset0:6 offset1:7
|
|
; SI-NEXT: s_waitcnt lgkmcnt(3)
|
|
; SI-NEXT: v_ashrrev_i32_e32 v17, 16, v1
|
|
; SI-NEXT: v_ashrrev_i32_e32 v19, 16, v0
|
|
; SI-NEXT: v_ashrrev_i32_e32 v21, 16, v3
|
|
; SI-NEXT: v_ashrrev_i32_e32 v23, 16, v2
|
|
; SI-NEXT: v_bfe_i32 v16, v1, 0, 16
|
|
; SI-NEXT: v_bfe_i32 v18, v0, 0, 16
|
|
; SI-NEXT: v_bfe_i32 v20, v3, 0, 16
|
|
; SI-NEXT: v_bfe_i32 v22, v2, 0, 16
|
|
; SI-NEXT: s_waitcnt lgkmcnt(2)
|
|
; SI-NEXT: v_ashrrev_i32_e32 v1, 16, v5
|
|
; SI-NEXT: v_ashrrev_i32_e32 v3, 16, v4
|
|
; SI-NEXT: v_bfe_i32 v0, v5, 0, 16
|
|
; SI-NEXT: v_bfe_i32 v2, v4, 0, 16
|
|
; SI-NEXT: v_ashrrev_i32_e32 v5, 16, v7
|
|
; SI-NEXT: v_bfe_i32 v4, v7, 0, 16
|
|
; SI-NEXT: v_ashrrev_i32_e32 v7, 16, v6
|
|
; SI-NEXT: v_bfe_i32 v6, v6, 0, 16
|
|
; SI-NEXT: s_waitcnt lgkmcnt(1)
|
|
; SI-NEXT: v_ashrrev_i32_e32 v25, 16, v9
|
|
; SI-NEXT: v_bfe_i32 v24, v9, 0, 16
|
|
; SI-NEXT: v_ashrrev_i32_e32 v9, 16, v8
|
|
; SI-NEXT: v_bfe_i32 v8, v8, 0, 16
|
|
; SI-NEXT: v_ashrrev_i32_e32 v27, 16, v11
|
|
; SI-NEXT: v_bfe_i32 v26, v11, 0, 16
|
|
; SI-NEXT: v_ashrrev_i32_e32 v11, 16, v10
|
|
; SI-NEXT: v_bfe_i32 v10, v10, 0, 16
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: v_ashrrev_i32_e32 v29, 16, v13
|
|
; SI-NEXT: v_bfe_i32 v28, v13, 0, 16
|
|
; SI-NEXT: v_ashrrev_i32_e32 v13, 16, v12
|
|
; SI-NEXT: v_bfe_i32 v12, v12, 0, 16
|
|
; SI-NEXT: v_ashrrev_i32_e32 v31, 16, v15
|
|
; SI-NEXT: v_bfe_i32 v30, v15, 0, 16
|
|
; SI-NEXT: v_ashrrev_i32_e32 v15, 16, v14
|
|
; SI-NEXT: v_bfe_i32 v14, v14, 0, 16
|
|
; SI-NEXT: v_mov_b32_e32 v32, s0
|
|
; SI-NEXT: ds_write2_b64 v32, v[14:15], v[30:31] offset0:14 offset1:15
|
|
; SI-NEXT: ds_write2_b64 v32, v[12:13], v[28:29] offset0:12 offset1:13
|
|
; SI-NEXT: ds_write2_b64 v32, v[10:11], v[26:27] offset0:10 offset1:11
|
|
; SI-NEXT: ds_write2_b64 v32, v[8:9], v[24:25] offset0:8 offset1:9
|
|
; SI-NEXT: ds_write2_b64 v32, v[6:7], v[4:5] offset0:6 offset1:7
|
|
; SI-NEXT: ds_write2_b64 v32, v[2:3], v[0:1] offset0:4 offset1:5
|
|
; SI-NEXT: ds_write2_b64 v32, v[22:23], v[20:21] offset0:2 offset1:3
|
|
; SI-NEXT: ds_write2_b64 v32, v[18:19], v[16:17] offset1:1
|
|
; SI-NEXT: s_endpgm
|
|
;
|
|
; VI-NO-DS128-LABEL: local_sextload_v32i16_to_v32i32:
|
|
; VI-NO-DS128: ; %bb.0:
|
|
; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; VI-NO-DS128-NEXT: s_mov_b32 m0, -1
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v24, s1
|
|
; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v24 offset1:1
|
|
; VI-NO-DS128-NEXT: ds_read2_b64 v[4:7], v24 offset0:2 offset1:3
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v32, s0
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v3
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v2
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v1
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v0
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v7
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v8, v3, 0, 16
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v10, v2, 0, 16
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v12, v1, 0, 16
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v14, v0, 0, 16
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v6
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v5
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v16, v7, 0, 16
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v18, v6, 0, 16
|
|
; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v24 offset0:4 offset1:5
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v20, v5, 0, 16
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v4
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v22, v4, 0, 16
|
|
; VI-NO-DS128-NEXT: ds_read2_b64 v[4:7], v24 offset0:6 offset1:7
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v1
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v24, v1, 0, 16
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v0
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v31, 16, v5
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v30, v5, 0, 16
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v4
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v4, v4, 0, 16
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v3
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v26, v3, 0, 16
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v2
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v7
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v28, v7, 0, 16
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v6
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v6, v6, 0, 16
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[4:5], v[30:31] offset0:12 offset1:13
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[6:7], v[28:29] offset0:14 offset1:15
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[2:3], v[26:27] offset0:10 offset1:11
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[0:1], v[24:25] offset0:8 offset1:9
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[22:23], v[20:21] offset0:4 offset1:5
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[18:19], v[16:17] offset0:6 offset1:7
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[14:15], v[12:13] offset1:1
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v32, v[10:11], v[8:9] offset0:2 offset1:3
|
|
; VI-NO-DS128-NEXT: s_endpgm
|
|
;
|
|
; GFX9-NO-DS128-LABEL: local_sextload_v32i16_to_v32i32:
|
|
; GFX9-NO-DS128: ; %bb.0:
|
|
; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v24, s1
|
|
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v24 offset1:1
|
|
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[4:7], v24 offset0:2 offset1:3
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v32, s0
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v3
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v2
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v1
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v0
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v7
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v8, v3, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v10, v2, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v12, v1, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v0, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v6
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v5
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v7, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v6, 0, 16
|
|
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v24 offset0:4 offset1:5
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v20, v5, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v4
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v22, v4, 0, 16
|
|
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[4:7], v24 offset0:6 offset1:7
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v1
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v24, v1, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v0
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v31, 16, v5
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v30, v5, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v4
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v4, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v3
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v26, v3, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v2
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v7
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v28, v7, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v6
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v6, v6, 0, 16
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[4:5], v[30:31] offset0:12 offset1:13
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[6:7], v[28:29] offset0:14 offset1:15
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[2:3], v[26:27] offset0:10 offset1:11
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[0:1], v[24:25] offset0:8 offset1:9
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[22:23], v[20:21] offset0:4 offset1:5
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[18:19], v[16:17] offset0:6 offset1:7
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[14:15], v[12:13] offset1:1
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v32, v[10:11], v[8:9] offset0:2 offset1:3
|
|
; GFX9-NO-DS128-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: local_sextload_v32i16_to_v32i32:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: ALU 101, @23, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T0.Y, OQAP,
|
|
; EG-NEXT: MOV * T0.W, KC0[2].Z,
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T0.Z, OQAP,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T0.W, OQAP,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T1.W
|
|
; EG-NEXT: MOV T1.Y, OQAP,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T1.W
|
|
; EG-NEXT: MOV T1.Z, OQAP,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T1.W
|
|
; EG-NEXT: MOV T1.W, OQAP,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 44(6.165713e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T2.W
|
|
; EG-NEXT: MOV T2.Y, OQAP,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 40(5.605194e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T2.W
|
|
; EG-NEXT: MOV T2.Z, OQAP,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 36(5.044674e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T2.W
|
|
; EG-NEXT: MOV T2.W, OQAP,
|
|
; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T3.W
|
|
; EG-NEXT: MOV T3.Y, OQAP,
|
|
; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 60(8.407791e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T3.W
|
|
; EG-NEXT: MOV T3.Z, OQAP,
|
|
; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 56(7.847271e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T3.W
|
|
; EG-NEXT: MOV T3.W, OQAP,
|
|
; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 52(7.286752e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T4.W
|
|
; EG-NEXT: MOV T4.Y, OQAP,
|
|
; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T4.W
|
|
; EG-NEXT: MOV T4.Z, OQAP,
|
|
; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T4.W
|
|
; EG-NEXT: MOV T4.W, OQAP,
|
|
; EG-NEXT: LSHR * T5.W, T4.Z, literal.x,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: ADD_INT * T6.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T6.W
|
|
; EG-NEXT: MOV T5.Y, OQAP,
|
|
; EG-NEXT: LSHR T5.Z, T4.W, literal.x,
|
|
; EG-NEXT: BFE_INT T5.W, T5.W, 0.0, literal.x, BS:VEC_120/SCL_212
|
|
; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 28(3.923636e-44)
|
|
; EG-NEXT: LDS_WRITE * T6.W, T5.W,
|
|
; EG-NEXT: LSHR T6.Z, T0.Y, literal.x,
|
|
; EG-NEXT: BFE_INT T5.W, T5.Z, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 20(2.802597e-44)
|
|
; EG-NEXT: LDS_WRITE * T6.W, T5.W,
|
|
; EG-NEXT: LSHR T5.Z, T0.Z, literal.x,
|
|
; EG-NEXT: BFE_INT T5.W, T6.Z, 0.0, literal.x, BS:VEC_120/SCL_212
|
|
; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 12(1.681558e-44)
|
|
; EG-NEXT: LDS_WRITE * T6.W, T5.W,
|
|
; EG-NEXT: LSHR T6.Z, T0.W, literal.x,
|
|
; EG-NEXT: BFE_INT T5.W, T5.Z, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45)
|
|
; EG-NEXT: LDS_WRITE * T6.W, T5.W,
|
|
; EG-NEXT: LSHR T5.Z, T1.Y, literal.x,
|
|
; EG-NEXT: BFE_INT T5.W, T6.Z, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 60(8.407791e-44)
|
|
; EG-NEXT: LDS_WRITE * T6.W, T5.W,
|
|
; EG-NEXT: LSHR T6.Z, T1.Z, literal.x,
|
|
; EG-NEXT: BFE_INT T5.W, T5.Z, 0.0, literal.x, BS:VEC_120/SCL_212
|
|
; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 52(7.286752e-44)
|
|
; EG-NEXT: LDS_WRITE * T6.W, T5.W,
|
|
; EG-NEXT: LSHR T5.Z, T1.W, literal.x,
|
|
; EG-NEXT: BFE_INT T5.W, T6.Z, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 44(6.165713e-44)
|
|
; EG-NEXT: LDS_WRITE * T6.W, T5.W,
|
|
; EG-NEXT: LSHR * T6.Z, T2.Y, literal.x,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: ALU 89, @24, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: BFE_INT T5.W, T5.Z, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 36(5.044674e-44)
|
|
; EG-NEXT: LDS_WRITE * T6.W, T5.W,
|
|
; EG-NEXT: LSHR T5.Z, T2.Z, literal.x,
|
|
; EG-NEXT: BFE_INT T5.W, T6.Z, 0.0, literal.x, BS:VEC_120/SCL_212
|
|
; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 92(1.289195e-43)
|
|
; EG-NEXT: LDS_WRITE * T6.W, T5.W,
|
|
; EG-NEXT: LSHR T6.Z, T2.W, literal.x,
|
|
; EG-NEXT: BFE_INT T5.W, T5.Z, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 84(1.177091e-43)
|
|
; EG-NEXT: LDS_WRITE * T6.W, T5.W,
|
|
; EG-NEXT: LSHR T5.Z, T3.Y, literal.x,
|
|
; EG-NEXT: BFE_INT T5.W, T6.Z, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 76(1.064987e-43)
|
|
; EG-NEXT: LDS_WRITE * T6.W, T5.W,
|
|
; EG-NEXT: LSHR T6.Z, T3.Z, literal.x,
|
|
; EG-NEXT: BFE_INT T5.W, T5.Z, 0.0, literal.x, BS:VEC_120/SCL_212
|
|
; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 68(9.528830e-44)
|
|
; EG-NEXT: LDS_WRITE * T6.W, T5.W,
|
|
; EG-NEXT: LSHR T5.Z, T3.W, literal.x,
|
|
; EG-NEXT: BFE_INT T5.W, T6.Z, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 124(1.737610e-43)
|
|
; EG-NEXT: LDS_WRITE * T6.W, T5.W,
|
|
; EG-NEXT: LSHR T6.Z, T4.Y, literal.x,
|
|
; EG-NEXT: BFE_INT T5.W, T5.Z, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 116(1.625506e-43)
|
|
; EG-NEXT: LDS_WRITE * T6.W, T5.W,
|
|
; EG-NEXT: LSHR T5.Z, T5.Y, literal.x,
|
|
; EG-NEXT: BFE_INT T5.W, T6.Z, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 108(1.513402e-43)
|
|
; EG-NEXT: LDS_WRITE * T6.W, T5.W,
|
|
; EG-NEXT: BFE_INT T5.W, T5.Z, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 100(1.401298e-43)
|
|
; EG-NEXT: LDS_WRITE * T6.W, T5.W,
|
|
; EG-NEXT: BFE_INT T5.W, T4.Z, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
|
|
; EG-NEXT: LDS_WRITE * T6.W, T5.W,
|
|
; EG-NEXT: BFE_INT T4.W, T4.W, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T5.W, T4.W,
|
|
; EG-NEXT: BFE_INT T4.W, T0.Y, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44)
|
|
; EG-NEXT: LDS_WRITE * T5.W, T4.W,
|
|
; EG-NEXT: BFE_INT T4.W, T0.Z, 0.0, literal.x,
|
|
; EG-NEXT: MOV * T5.W, KC0[2].Y,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T5.W, T4.W,
|
|
; EG-NEXT: BFE_INT T0.W, T0.W, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 56(7.847271e-44)
|
|
; EG-NEXT: LDS_WRITE * T4.W, T0.W,
|
|
; EG-NEXT: BFE_INT T0.W, T1.Y, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 48(6.726233e-44)
|
|
; EG-NEXT: LDS_WRITE * T4.W, T0.W,
|
|
; EG-NEXT: BFE_INT T0.W, T1.Z, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 40(5.605194e-44)
|
|
; EG-NEXT: LDS_WRITE * T4.W, T0.W,
|
|
; EG-NEXT: BFE_INT T0.W, T1.W, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 32(4.484155e-44)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: BFE_INT T0.W, T2.Y, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 88(1.233143e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: BFE_INT T0.W, T2.Z, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 80(1.121039e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: BFE_INT T0.W, T2.W, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 72(1.008935e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: BFE_INT T0.W, T3.Y, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 64(8.968310e-44)
|
|
; EG-NEXT: ALU 16, @25, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: BFE_INT T0.W, T3.Z, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 120(1.681558e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: BFE_INT T0.W, T3.W, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 112(1.569454e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: BFE_INT T0.W, T4.Y, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 104(1.457350e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: BFE_INT T0.W, T5.Y, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 96(1.345247e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: RETURN
|
|
;
|
|
; VI-DS128-LABEL: local_sextload_v32i16_to_v32i32:
|
|
; VI-DS128: ; %bb.0:
|
|
; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; VI-DS128-NEXT: s_mov_b32 m0, -1
|
|
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v24, s1
|
|
; VI-DS128-NEXT: ds_read_b128 v[0:3], v24
|
|
; VI-DS128-NEXT: ds_read_b128 v[4:7], v24 offset:16
|
|
; VI-DS128-NEXT: ds_read_b128 v[20:23], v24 offset:32
|
|
; VI-DS128-NEXT: s_waitcnt lgkmcnt(2)
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v3
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v2
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v1
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v0
|
|
; VI-DS128-NEXT: v_bfe_i32 v10, v3, 0, 16
|
|
; VI-DS128-NEXT: v_bfe_i32 v8, v2, 0, 16
|
|
; VI-DS128-NEXT: v_bfe_i32 v14, v1, 0, 16
|
|
; VI-DS128-NEXT: v_bfe_i32 v12, v0, 0, 16
|
|
; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v7
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v6
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v5
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v4
|
|
; VI-DS128-NEXT: v_bfe_i32 v2, v7, 0, 16
|
|
; VI-DS128-NEXT: v_bfe_i32 v0, v6, 0, 16
|
|
; VI-DS128-NEXT: v_bfe_i32 v18, v5, 0, 16
|
|
; VI-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16
|
|
; VI-DS128-NEXT: ds_read_b128 v[4:7], v24 offset:48
|
|
; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v26, 16, v23
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v24, 16, v22
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v30, 16, v21
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v28, 16, v20
|
|
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v38, 16, v5
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v36, 16, v4
|
|
; VI-DS128-NEXT: v_bfe_i32 v37, v5, 0, 16
|
|
; VI-DS128-NEXT: v_bfe_i32 v35, v4, 0, 16
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v4, s0
|
|
; VI-DS128-NEXT: v_bfe_i32 v25, v23, 0, 16
|
|
; VI-DS128-NEXT: v_bfe_i32 v23, v22, 0, 16
|
|
; VI-DS128-NEXT: v_bfe_i32 v29, v21, 0, 16
|
|
; VI-DS128-NEXT: v_bfe_i32 v27, v20, 0, 16
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v34, 16, v7
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v32, 16, v6
|
|
; VI-DS128-NEXT: v_bfe_i32 v33, v7, 0, 16
|
|
; VI-DS128-NEXT: v_bfe_i32 v31, v6, 0, 16
|
|
; VI-DS128-NEXT: ds_write_b128 v4, v[35:38] offset:96
|
|
; VI-DS128-NEXT: ds_write_b128 v4, v[31:34] offset:112
|
|
; VI-DS128-NEXT: ds_write_b128 v4, v[27:30] offset:64
|
|
; VI-DS128-NEXT: ds_write_b128 v4, v[23:26] offset:80
|
|
; VI-DS128-NEXT: ds_write_b128 v4, v[16:19] offset:32
|
|
; VI-DS128-NEXT: ds_write_b128 v4, v[0:3] offset:48
|
|
; VI-DS128-NEXT: ds_write_b128 v4, v[12:15]
|
|
; VI-DS128-NEXT: ds_write_b128 v4, v[8:11] offset:16
|
|
; VI-DS128-NEXT: s_endpgm
|
|
;
|
|
; GFX9-DS128-LABEL: local_sextload_v32i16_to_v32i32:
|
|
; GFX9-DS128: ; %bb.0:
|
|
; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v24, s1
|
|
; GFX9-DS128-NEXT: ds_read_b128 v[0:3], v24
|
|
; GFX9-DS128-NEXT: ds_read_b128 v[4:7], v24 offset:16
|
|
; GFX9-DS128-NEXT: ds_read_b128 v[20:23], v24 offset:32
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2)
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v3
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v2
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v1
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v0
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v10, v3, 0, 16
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v8, v2, 0, 16
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v14, v1, 0, 16
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v12, v0, 0, 16
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v7
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v6
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v5
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v4
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v2, v7, 0, 16
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v0, v6, 0, 16
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v18, v5, 0, 16
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16
|
|
; GFX9-DS128-NEXT: ds_read_b128 v[4:7], v24 offset:48
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v26, 16, v23
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v24, 16, v22
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v30, 16, v21
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v28, 16, v20
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v38, 16, v5
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v36, 16, v4
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v37, v5, 0, 16
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v35, v4, 0, 16
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v4, s0
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v25, v23, 0, 16
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v23, v22, 0, 16
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v29, v21, 0, 16
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v27, v20, 0, 16
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v34, 16, v7
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v32, 16, v6
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v33, v7, 0, 16
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v31, v6, 0, 16
|
|
; GFX9-DS128-NEXT: ds_write_b128 v4, v[35:38] offset:96
|
|
; GFX9-DS128-NEXT: ds_write_b128 v4, v[31:34] offset:112
|
|
; GFX9-DS128-NEXT: ds_write_b128 v4, v[27:30] offset:64
|
|
; GFX9-DS128-NEXT: ds_write_b128 v4, v[23:26] offset:80
|
|
; GFX9-DS128-NEXT: ds_write_b128 v4, v[16:19] offset:32
|
|
; GFX9-DS128-NEXT: ds_write_b128 v4, v[0:3] offset:48
|
|
; GFX9-DS128-NEXT: ds_write_b128 v4, v[12:15]
|
|
; GFX9-DS128-NEXT: ds_write_b128 v4, v[8:11] offset:16
|
|
; GFX9-DS128-NEXT: s_endpgm
|
|
%load = load <32 x i16>, ptr addrspace(3) %in
|
|
%ext = sext <32 x i16> %load to <32 x i32>
|
|
store <32 x i32> %ext, ptr addrspace(3) %out
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
|
|
; SI-LABEL: local_zextload_v64i16_to_v64i32:
|
|
; SI: ; %bb.0:
|
|
; SI-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
|
|
; SI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
|
|
; SI-NEXT: s_mov_b32 s14, -1
|
|
; SI-NEXT: s_mov_b32 s15, 0xe8f000
|
|
; SI-NEXT: s_add_u32 s12, s12, s11
|
|
; SI-NEXT: s_addc_u32 s13, s13, 0
|
|
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: v_mov_b32_e32 v24, s1
|
|
; SI-NEXT: s_mov_b32 m0, -1
|
|
; SI-NEXT: ds_read2_b64 v[0:3], v24 offset0:8 offset1:9
|
|
; SI-NEXT: ds_read2_b64 v[4:7], v24 offset0:10 offset1:11
|
|
; SI-NEXT: ds_read2_b64 v[12:15], v24 offset0:12 offset1:13
|
|
; SI-NEXT: ds_read2_b64 v[8:11], v24 offset0:14 offset1:15
|
|
; SI-NEXT: ds_read2_b64 v[20:23], v24 offset1:1
|
|
; SI-NEXT: ds_read2_b64 v[16:19], v24 offset0:2 offset1:3
|
|
; SI-NEXT: ds_read2_b64 v[34:37], v24 offset0:4 offset1:5
|
|
; SI-NEXT: ds_read2_b64 v[38:41], v24 offset0:6 offset1:7
|
|
; SI-NEXT: s_waitcnt lgkmcnt(7)
|
|
; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v1
|
|
; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v0
|
|
; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v3
|
|
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v2
|
|
; SI-NEXT: s_waitcnt lgkmcnt(6)
|
|
; SI-NEXT: v_lshrrev_b32_e32 v33, 16, v5
|
|
; SI-NEXT: v_and_b32_e32 v24, 0xffff, v1
|
|
; SI-NEXT: buffer_store_dword v24, off, s[12:15], 0 ; 4-byte Folded Spill
|
|
; SI-NEXT: buffer_store_dword v25, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
|
|
; SI-NEXT: v_and_b32_e32 v26, 0xffff, v0
|
|
; SI-NEXT: v_and_b32_e32 v28, 0xffff, v3
|
|
; SI-NEXT: v_and_b32_e32 v30, 0xffff, v2
|
|
; SI-NEXT: s_waitcnt expcnt(0)
|
|
; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v4
|
|
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v7
|
|
; SI-NEXT: v_and_b32_e32 v32, 0xffff, v5
|
|
; SI-NEXT: v_and_b32_e32 v24, 0xffff, v4
|
|
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v7
|
|
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v6
|
|
; SI-NEXT: v_and_b32_e32 v4, 0xffff, v6
|
|
; SI-NEXT: s_waitcnt lgkmcnt(5)
|
|
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v13
|
|
; SI-NEXT: v_and_b32_e32 v6, 0xffff, v13
|
|
; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v12
|
|
; SI-NEXT: v_and_b32_e32 v12, 0xffff, v12
|
|
; SI-NEXT: v_lshrrev_b32_e32 v43, 16, v15
|
|
; SI-NEXT: v_and_b32_e32 v42, 0xffff, v15
|
|
; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14
|
|
; SI-NEXT: v_and_b32_e32 v14, 0xffff, v14
|
|
; SI-NEXT: s_waitcnt lgkmcnt(4)
|
|
; SI-NEXT: v_lshrrev_b32_e32 v45, 16, v9
|
|
; SI-NEXT: v_and_b32_e32 v44, 0xffff, v9
|
|
; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v8
|
|
; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8
|
|
; SI-NEXT: v_lshrrev_b32_e32 v47, 16, v11
|
|
; SI-NEXT: v_and_b32_e32 v46, 0xffff, v11
|
|
; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v10
|
|
; SI-NEXT: v_and_b32_e32 v10, 0xffff, v10
|
|
; SI-NEXT: s_waitcnt lgkmcnt(3)
|
|
; SI-NEXT: v_lshrrev_b32_e32 v49, 16, v21
|
|
; SI-NEXT: v_and_b32_e32 v48, 0xffff, v21
|
|
; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v20
|
|
; SI-NEXT: v_and_b32_e32 v20, 0xffff, v20
|
|
; SI-NEXT: v_lshrrev_b32_e32 v51, 16, v23
|
|
; SI-NEXT: v_and_b32_e32 v50, 0xffff, v23
|
|
; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v22
|
|
; SI-NEXT: v_and_b32_e32 v22, 0xffff, v22
|
|
; SI-NEXT: s_waitcnt lgkmcnt(2)
|
|
; SI-NEXT: v_lshrrev_b32_e32 v53, 16, v17
|
|
; SI-NEXT: v_and_b32_e32 v52, 0xffff, v17
|
|
; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v16
|
|
; SI-NEXT: v_and_b32_e32 v16, 0xffff, v16
|
|
; SI-NEXT: v_lshrrev_b32_e32 v55, 16, v19
|
|
; SI-NEXT: v_and_b32_e32 v54, 0xffff, v19
|
|
; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v18
|
|
; SI-NEXT: v_and_b32_e32 v18, 0xffff, v18
|
|
; SI-NEXT: s_waitcnt lgkmcnt(1)
|
|
; SI-NEXT: v_lshrrev_b32_e32 v57, 16, v35
|
|
; SI-NEXT: v_and_b32_e32 v56, 0xffff, v35
|
|
; SI-NEXT: v_lshrrev_b32_e32 v35, 16, v34
|
|
; SI-NEXT: v_and_b32_e32 v34, 0xffff, v34
|
|
; SI-NEXT: v_lshrrev_b32_e32 v59, 16, v37
|
|
; SI-NEXT: v_and_b32_e32 v58, 0xffff, v37
|
|
; SI-NEXT: v_lshrrev_b32_e32 v37, 16, v36
|
|
; SI-NEXT: v_and_b32_e32 v36, 0xffff, v36
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: v_lshrrev_b32_e32 v61, 16, v39
|
|
; SI-NEXT: v_and_b32_e32 v60, 0xffff, v39
|
|
; SI-NEXT: v_lshrrev_b32_e32 v39, 16, v38
|
|
; SI-NEXT: v_and_b32_e32 v38, 0xffff, v38
|
|
; SI-NEXT: v_lshrrev_b32_e32 v63, 16, v41
|
|
; SI-NEXT: v_and_b32_e32 v62, 0xffff, v41
|
|
; SI-NEXT: v_lshrrev_b32_e32 v41, 16, v40
|
|
; SI-NEXT: v_and_b32_e32 v40, 0xffff, v40
|
|
; SI-NEXT: v_mov_b32_e32 v0, s0
|
|
; SI-NEXT: ds_write2_b64 v0, v[40:41], v[62:63] offset0:14 offset1:15
|
|
; SI-NEXT: ds_write2_b64 v0, v[38:39], v[60:61] offset0:12 offset1:13
|
|
; SI-NEXT: ds_write2_b64 v0, v[36:37], v[58:59] offset0:10 offset1:11
|
|
; SI-NEXT: ds_write2_b64 v0, v[34:35], v[56:57] offset0:8 offset1:9
|
|
; SI-NEXT: ds_write2_b64 v0, v[18:19], v[54:55] offset0:6 offset1:7
|
|
; SI-NEXT: ds_write2_b64 v0, v[16:17], v[52:53] offset0:4 offset1:5
|
|
; SI-NEXT: ds_write2_b64 v0, v[22:23], v[50:51] offset0:2 offset1:3
|
|
; SI-NEXT: ds_write2_b64 v0, v[20:21], v[48:49] offset1:1
|
|
; SI-NEXT: ds_write2_b64 v0, v[10:11], v[46:47] offset0:30 offset1:31
|
|
; SI-NEXT: ds_write2_b64 v0, v[8:9], v[44:45] offset0:28 offset1:29
|
|
; SI-NEXT: ds_write2_b64 v0, v[14:15], v[42:43] offset0:26 offset1:27
|
|
; SI-NEXT: ds_write2_b64 v0, v[12:13], v[6:7] offset0:24 offset1:25
|
|
; SI-NEXT: ds_write2_b64 v0, v[4:5], v[2:3] offset0:22 offset1:23
|
|
; SI-NEXT: ds_write2_b64 v0, v[24:25], v[32:33] offset0:20 offset1:21
|
|
; SI-NEXT: ds_write2_b64 v0, v[30:31], v[28:29] offset0:18 offset1:19
|
|
; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 ; 4-byte Folded Reload
|
|
; SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: ds_write2_b64 v0, v[26:27], v[1:2] offset0:16 offset1:17
|
|
; SI-NEXT: s_endpgm
|
|
;
|
|
; VI-NO-DS128-LABEL: local_zextload_v64i16_to_v64i32:
|
|
; VI-NO-DS128: ; %bb.0:
|
|
; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; VI-NO-DS128-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
|
|
; VI-NO-DS128-NEXT: s_mov_b32 m0, -1
|
|
; VI-NO-DS128-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
|
|
; VI-NO-DS128-NEXT: s_mov_b32 s90, -1
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v16, s1
|
|
; VI-NO-DS128-NEXT: ds_read2_b64 v[10:13], v16 offset1:1
|
|
; VI-NO-DS128-NEXT: ds_read2_b64 v[17:20], v16 offset0:2 offset1:3
|
|
; VI-NO-DS128-NEXT: s_mov_b32 s91, 0xe80000
|
|
; VI-NO-DS128-NEXT: s_add_u32 s88, s88, s11
|
|
; VI-NO-DS128-NEXT: s_addc_u32 s89, s89, 0
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v11
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v11
|
|
; VI-NO-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill
|
|
; VI-NO-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v10
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v13
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v12
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v18
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v10
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v13
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v12
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v17
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v20
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v18
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v17
|
|
; VI-NO-DS128-NEXT: ds_read2_b64 v[21:24], v16 offset0:4 offset1:5
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v20
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v19
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v19
|
|
; VI-NO-DS128-NEXT: ds_read2_b64 v[17:20], v16 offset0:6 offset1:7
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v26, 16, v22
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v25, 0xffff, v22
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v28, 16, v21
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v27, 0xffff, v21
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v30, 16, v24
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v29, 0xffff, v24
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v32, 16, v23
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v31, 0xffff, v23
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v34, 16, v18
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v33, 0xffff, v18
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v36, 16, v17
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v35, 0xffff, v17
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v38, 16, v20
|
|
; VI-NO-DS128-NEXT: ds_read2_b64 v[21:24], v16 offset0:8 offset1:9
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v37, 0xffff, v20
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v40, 16, v19
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v39, 0xffff, v19
|
|
; VI-NO-DS128-NEXT: ds_read2_b64 v[17:20], v16 offset0:10 offset1:11
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v42, 16, v22
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v41, 0xffff, v22
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v44, 16, v21
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v43, 0xffff, v21
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v24
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v24
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v48, 16, v23
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v47, 0xffff, v23
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v50, 16, v18
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v49, 0xffff, v18
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v52, 16, v17
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v51, 0xffff, v17
|
|
; VI-NO-DS128-NEXT: ds_read2_b64 v[21:24], v16 offset0:12 offset1:13
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v56, 16, v19
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v55, 0xffff, v19
|
|
; VI-NO-DS128-NEXT: ds_read2_b64 v[16:19], v16 offset0:14 offset1:15
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v54, 16, v20
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v53, 0xffff, v20
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v58, 16, v22
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v57, 0xffff, v22
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v20, 16, v19
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v19, 0xffff, v19
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v18
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v18
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v18, s0
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v22, 16, v21
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v21, 0xffff, v21
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v60, 16, v24
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v59, 0xffff, v24
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v24, 16, v23
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v23, 0xffff, v23
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v62, 16, v17
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v61, 0xffff, v17
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v16
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v16
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[0:1], v[19:20] offset0:30 offset1:31
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[16:17], v[61:62] offset0:28 offset1:29
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[23:24], v[59:60] offset0:26 offset1:27
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[21:22], v[57:58] offset0:24 offset1:25
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[55:56], v[53:54] offset0:22 offset1:23
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[51:52], v[49:50] offset0:20 offset1:21
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[47:48], v[45:46] offset0:18 offset1:19
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[43:44], v[41:42] offset0:16 offset1:17
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[39:40], v[37:38] offset0:14 offset1:15
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[35:36], v[33:34] offset0:12 offset1:13
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[31:32], v[29:30] offset0:10 offset1:11
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[27:28], v[25:26] offset0:8 offset1:9
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[14:15], v[12:13] offset0:6 offset1:7
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[10:11], v[8:9] offset0:4 offset1:5
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[6:7], v[4:5] offset0:2 offset1:3
|
|
; VI-NO-DS128-NEXT: buffer_load_dword v0, off, s[88:91], 0 ; 4-byte Folded Reload
|
|
; VI-NO-DS128-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload
|
|
; VI-NO-DS128-NEXT: s_waitcnt vmcnt(0)
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v18, v[2:3], v[0:1] offset1:1
|
|
; VI-NO-DS128-NEXT: s_endpgm
|
|
;
|
|
; GFX9-NO-DS128-LABEL: local_zextload_v64i16_to_v64i32:
|
|
; GFX9-NO-DS128: ; %bb.0:
|
|
; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX9-NO-DS128-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
|
|
; GFX9-NO-DS128-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
|
|
; GFX9-NO-DS128-NEXT: s_mov_b32 s14, -1
|
|
; GFX9-NO-DS128-NEXT: s_mov_b32 s15, 0xe00000
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v56, s1
|
|
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[10:13], v56 offset1:1
|
|
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[14:17], v56 offset0:2 offset1:3
|
|
; GFX9-NO-DS128-NEXT: s_add_u32 s12, s12, s11
|
|
; GFX9-NO-DS128-NEXT: s_addc_u32 s13, s13, 0
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v11
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v11
|
|
; GFX9-NO-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
|
|
; GFX9-NO-DS128-NEXT: s_nop 0
|
|
; GFX9-NO-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
|
|
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[18:21], v56 offset0:4 offset1:5
|
|
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[22:25], v56 offset0:6 offset1:7
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v10
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v13
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v12
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(2)
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v15
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v10
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v13
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v12
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v14
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v17
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v15
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v14
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v17
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v16
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v16
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v27, 16, v19
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v26, 0xffff, v19
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v18
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v18
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v31, 16, v21
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v21
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v33, 16, v20
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v32, 0xffff, v20
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v35, 16, v23
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v34, 0xffff, v23
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v37, 16, v22
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v36, 0xffff, v22
|
|
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[16:19], v56 offset0:8 offset1:9
|
|
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v56 offset0:10 offset1:11
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s0
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v39, 16, v25
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v38, 0xffff, v25
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v41, 16, v17
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v40, 0xffff, v17
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v43, 16, v16
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v42, 0xffff, v16
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v45, 16, v19
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v44, 0xffff, v19
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v47, 16, v18
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v46, 0xffff, v18
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v49, 16, v21
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v48, 0xffff, v21
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v51, 16, v20
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v50, 0xffff, v20
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v53, 16, v23
|
|
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[16:19], v56 offset0:12 offset1:13
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v52, 0xffff, v23
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v55, 16, v22
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v54, 0xffff, v22
|
|
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v56 offset0:14 offset1:15
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v25, 16, v24
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v24
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v57, 16, v17
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v56, 0xffff, v17
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v63, 16, v23
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v62, 0xffff, v23
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v22
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v22
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v16
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v16
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v59, 16, v19
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v58, 0xffff, v19
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v18
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v18
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v61, 16, v21
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v60, 0xffff, v21
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v20
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v20
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[22:23], v[62:63] offset0:30 offset1:31
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[20:21], v[60:61] offset0:28 offset1:29
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[18:19], v[58:59] offset0:26 offset1:27
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[16:17], v[56:57] offset0:24 offset1:25
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[54:55], v[52:53] offset0:22 offset1:23
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[50:51], v[48:49] offset0:20 offset1:21
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[46:47], v[44:45] offset0:18 offset1:19
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[42:43], v[40:41] offset0:16 offset1:17
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[24:25], v[38:39] offset0:14 offset1:15
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[36:37], v[34:35] offset0:12 offset1:13
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[32:33], v[30:31] offset0:10 offset1:11
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[28:29], v[26:27] offset0:8 offset1:9
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[14:15], v[12:13] offset0:6 offset1:7
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[10:11], v[8:9] offset0:4 offset1:5
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[6:7], v[4:5] offset0:2 offset1:3
|
|
; GFX9-NO-DS128-NEXT: buffer_load_dword v4, off, s[12:15], 0 ; 4-byte Folded Reload
|
|
; GFX9-NO-DS128-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1
|
|
; GFX9-NO-DS128-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: local_zextload_v64i16_to_v64i32:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: ALU 116, @26, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 112(1.569454e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T0.Y, OQAP,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 116(1.625506e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T0.Z, OQAP,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 120(1.681558e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T0.W, OQAP,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 124(1.737610e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T1.W
|
|
; EG-NEXT: MOV T1.Y, OQAP,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 96(1.345247e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T1.W
|
|
; EG-NEXT: MOV T1.Z, OQAP,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 100(1.401298e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T1.W
|
|
; EG-NEXT: MOV T1.W, OQAP,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 104(1.457350e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T2.W
|
|
; EG-NEXT: MOV T2.Y, OQAP,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 108(1.513402e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T2.W
|
|
; EG-NEXT: MOV T2.Z, OQAP,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 80(1.121039e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T2.W
|
|
; EG-NEXT: MOV T2.W, OQAP,
|
|
; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 84(1.177091e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T3.W
|
|
; EG-NEXT: MOV T3.Y, OQAP,
|
|
; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 88(1.233143e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T3.W
|
|
; EG-NEXT: MOV T3.Z, OQAP,
|
|
; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 92(1.289195e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T3.W
|
|
; EG-NEXT: MOV T3.W, OQAP,
|
|
; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 64(8.968310e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T4.W
|
|
; EG-NEXT: MOV T4.Y, OQAP,
|
|
; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 68(9.528830e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T4.W
|
|
; EG-NEXT: MOV T4.Z, OQAP,
|
|
; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 72(1.008935e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T4.W
|
|
; EG-NEXT: MOV T4.W, OQAP,
|
|
; EG-NEXT: ADD_INT * T5.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 76(1.064987e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T5.W
|
|
; EG-NEXT: MOV T5.Y, OQAP,
|
|
; EG-NEXT: ADD_INT * T5.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T5.W
|
|
; EG-NEXT: MOV T5.Z, OQAP,
|
|
; EG-NEXT: ADD_INT * T5.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 52(7.286752e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T5.W
|
|
; EG-NEXT: MOV T5.W, OQAP,
|
|
; EG-NEXT: ADD_INT * T6.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 56(7.847271e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T6.W
|
|
; EG-NEXT: MOV T6.Y, OQAP,
|
|
; EG-NEXT: ADD_INT * T6.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 60(8.407791e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T6.W
|
|
; EG-NEXT: MOV T6.Z, OQAP,
|
|
; EG-NEXT: ADD_INT * T6.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T6.W
|
|
; EG-NEXT: MOV T6.W, OQAP,
|
|
; EG-NEXT: ADD_INT * T7.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 36(5.044674e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T7.W
|
|
; EG-NEXT: MOV T7.Y, OQAP,
|
|
; EG-NEXT: ADD_INT * T7.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 40(5.605194e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T7.W
|
|
; EG-NEXT: MOV T7.Z, OQAP,
|
|
; EG-NEXT: ADD_INT * T7.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 44(6.165713e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T7.W
|
|
; EG-NEXT: MOV T7.W, OQAP,
|
|
; EG-NEXT: ADD_INT * T8.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T8.W
|
|
; EG-NEXT: MOV T8.Y, OQAP,
|
|
; EG-NEXT: ADD_INT * T8.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T8.W
|
|
; EG-NEXT: MOV T8.Z, OQAP,
|
|
; EG-NEXT: ADD_INT * T8.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T8.W
|
|
; EG-NEXT: MOV T8.W, OQAP,
|
|
; EG-NEXT: ADD_INT * T9.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T9.W
|
|
; EG-NEXT: MOV T9.Y, OQAP,
|
|
; EG-NEXT: MOV * T9.W, KC0[2].Z,
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T9.W
|
|
; EG-NEXT: MOV T9.Z, OQAP,
|
|
; EG-NEXT: ADD_INT * T9.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
|
|
; EG-NEXT: ALU 95, @27, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T9.W
|
|
; EG-NEXT: MOV T9.W, OQAP,
|
|
; EG-NEXT: ADD_INT * T10.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T10.W
|
|
; EG-NEXT: MOV T10.Y, OQAP,
|
|
; EG-NEXT: ADD_INT * T10.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T10.W
|
|
; EG-NEXT: MOV T10.Z, OQAP,
|
|
; EG-NEXT: LSHR T10.W, T10.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 28(3.923636e-44)
|
|
; EG-NEXT: LDS_WRITE * T11.W, T10.W,
|
|
; EG-NEXT: AND_INT T10.W, T10.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 24(3.363116e-44)
|
|
; EG-NEXT: LDS_WRITE * T11.W, T10.W,
|
|
; EG-NEXT: LSHR T10.W, T10.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 20(2.802597e-44)
|
|
; EG-NEXT: LDS_WRITE * T11.W, T10.W,
|
|
; EG-NEXT: AND_INT T10.W, T10.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
|
|
; EG-NEXT: LDS_WRITE * T11.W, T10.W,
|
|
; EG-NEXT: LSHR T10.W, T9.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 12(1.681558e-44)
|
|
; EG-NEXT: LDS_WRITE * T11.W, T10.W,
|
|
; EG-NEXT: AND_INT T9.W, T9.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T10.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 8(1.121039e-44)
|
|
; EG-NEXT: LDS_WRITE * T10.W, T9.W,
|
|
; EG-NEXT: LSHR T9.W, T9.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T10.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45)
|
|
; EG-NEXT: LDS_WRITE * T10.W, T9.W,
|
|
; EG-NEXT: AND_INT T9.W, T9.Z, literal.x,
|
|
; EG-NEXT: MOV * T10.W, KC0[2].Y,
|
|
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T10.W, T9.W,
|
|
; EG-NEXT: LSHR T9.W, T9.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T10.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 60(8.407791e-44)
|
|
; EG-NEXT: LDS_WRITE * T10.W, T9.W,
|
|
; EG-NEXT: AND_INT T9.W, T9.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T10.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 56(7.847271e-44)
|
|
; EG-NEXT: LDS_WRITE * T10.W, T9.W,
|
|
; EG-NEXT: LSHR T9.W, T8.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T10.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 52(7.286752e-44)
|
|
; EG-NEXT: LDS_WRITE * T10.W, T9.W,
|
|
; EG-NEXT: AND_INT T8.W, T8.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T9.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 48(6.726233e-44)
|
|
; EG-NEXT: LDS_WRITE * T9.W, T8.W,
|
|
; EG-NEXT: LSHR T8.W, T8.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T9.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 44(6.165713e-44)
|
|
; EG-NEXT: LDS_WRITE * T9.W, T8.W,
|
|
; EG-NEXT: AND_INT T8.W, T8.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T9.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 40(5.605194e-44)
|
|
; EG-NEXT: LDS_WRITE * T9.W, T8.W,
|
|
; EG-NEXT: LSHR T8.W, T8.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T9.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 36(5.044674e-44)
|
|
; EG-NEXT: LDS_WRITE * T9.W, T8.W,
|
|
; EG-NEXT: AND_INT T8.W, T8.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T9.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 32(4.484155e-44)
|
|
; EG-NEXT: LDS_WRITE * T9.W, T8.W,
|
|
; EG-NEXT: LSHR T8.W, T7.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T9.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 92(1.289195e-43)
|
|
; EG-NEXT: LDS_WRITE * T9.W, T8.W,
|
|
; EG-NEXT: AND_INT T7.W, T7.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T8.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 88(1.233143e-43)
|
|
; EG-NEXT: LDS_WRITE * T8.W, T7.W,
|
|
; EG-NEXT: LSHR T7.W, T7.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T8.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 84(1.177091e-43)
|
|
; EG-NEXT: LDS_WRITE * T8.W, T7.W,
|
|
; EG-NEXT: AND_INT T7.W, T7.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T8.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 80(1.121039e-43)
|
|
; EG-NEXT: LDS_WRITE * T8.W, T7.W,
|
|
; EG-NEXT: LSHR T7.W, T7.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T8.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 76(1.064987e-43)
|
|
; EG-NEXT: LDS_WRITE * T8.W, T7.W,
|
|
; EG-NEXT: AND_INT * T7.W, T7.Y, literal.x,
|
|
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
|
|
; EG-NEXT: ALU 93, @28, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: ADD_INT * T8.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 72(1.008935e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T8.W, T7.W,
|
|
; EG-NEXT: LSHR T7.W, T6.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T8.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 68(9.528830e-44)
|
|
; EG-NEXT: LDS_WRITE * T8.W, T7.W,
|
|
; EG-NEXT: AND_INT T6.W, T6.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 64(8.968310e-44)
|
|
; EG-NEXT: LDS_WRITE * T7.W, T6.W,
|
|
; EG-NEXT: LSHR T6.W, T6.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 124(1.737610e-43)
|
|
; EG-NEXT: LDS_WRITE * T7.W, T6.W,
|
|
; EG-NEXT: AND_INT T6.W, T6.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 120(1.681558e-43)
|
|
; EG-NEXT: LDS_WRITE * T7.W, T6.W,
|
|
; EG-NEXT: LSHR T6.W, T6.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 116(1.625506e-43)
|
|
; EG-NEXT: LDS_WRITE * T7.W, T6.W,
|
|
; EG-NEXT: AND_INT T6.W, T6.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 112(1.569454e-43)
|
|
; EG-NEXT: LDS_WRITE * T7.W, T6.W,
|
|
; EG-NEXT: LSHR T6.W, T5.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 108(1.513402e-43)
|
|
; EG-NEXT: LDS_WRITE * T7.W, T6.W,
|
|
; EG-NEXT: AND_INT T5.W, T5.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 104(1.457350e-43)
|
|
; EG-NEXT: LDS_WRITE * T6.W, T5.W,
|
|
; EG-NEXT: LSHR T5.W, T5.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 100(1.401298e-43)
|
|
; EG-NEXT: LDS_WRITE * T6.W, T5.W,
|
|
; EG-NEXT: AND_INT T5.W, T5.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 96(1.345247e-43)
|
|
; EG-NEXT: LDS_WRITE * T6.W, T5.W,
|
|
; EG-NEXT: LSHR T5.W, T5.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 156(2.186026e-43)
|
|
; EG-NEXT: LDS_WRITE * T6.W, T5.W,
|
|
; EG-NEXT: AND_INT T5.W, T5.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 152(2.129974e-43)
|
|
; EG-NEXT: LDS_WRITE * T6.W, T5.W,
|
|
; EG-NEXT: LSHR T5.W, T4.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 148(2.073922e-43)
|
|
; EG-NEXT: LDS_WRITE * T6.W, T5.W,
|
|
; EG-NEXT: AND_INT T4.W, T4.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 144(2.017870e-43)
|
|
; EG-NEXT: LDS_WRITE * T5.W, T4.W,
|
|
; EG-NEXT: LSHR T4.W, T4.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 140(1.961818e-43)
|
|
; EG-NEXT: LDS_WRITE * T5.W, T4.W,
|
|
; EG-NEXT: AND_INT T4.W, T4.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 136(1.905766e-43)
|
|
; EG-NEXT: LDS_WRITE * T5.W, T4.W,
|
|
; EG-NEXT: LSHR T4.W, T4.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 132(1.849714e-43)
|
|
; EG-NEXT: LDS_WRITE * T5.W, T4.W,
|
|
; EG-NEXT: AND_INT T4.W, T4.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 128(1.793662e-43)
|
|
; EG-NEXT: LDS_WRITE * T5.W, T4.W,
|
|
; EG-NEXT: LSHR T4.W, T3.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 188(2.634441e-43)
|
|
; EG-NEXT: LDS_WRITE * T5.W, T4.W,
|
|
; EG-NEXT: AND_INT T3.W, T3.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 184(2.578389e-43)
|
|
; EG-NEXT: LDS_WRITE * T4.W, T3.W,
|
|
; EG-NEXT: LSHR T3.W, T3.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 180(2.522337e-43)
|
|
; EG-NEXT: LDS_WRITE * T4.W, T3.W,
|
|
; EG-NEXT: AND_INT T3.W, T3.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 176(2.466285e-43)
|
|
; EG-NEXT: LDS_WRITE * T4.W, T3.W,
|
|
; EG-NEXT: LSHR T3.W, T3.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 172(2.410233e-43)
|
|
; EG-NEXT: ALU 76, @29, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: LDS_WRITE * T4.W, T3.W,
|
|
; EG-NEXT: AND_INT T3.W, T3.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 168(2.354181e-43)
|
|
; EG-NEXT: LDS_WRITE * T4.W, T3.W,
|
|
; EG-NEXT: LSHR T3.W, T2.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 164(2.298129e-43)
|
|
; EG-NEXT: LDS_WRITE * T4.W, T3.W,
|
|
; EG-NEXT: AND_INT T2.W, T2.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 160(2.242078e-43)
|
|
; EG-NEXT: LDS_WRITE * T3.W, T2.W,
|
|
; EG-NEXT: LSHR T2.W, T2.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 220(3.082857e-43)
|
|
; EG-NEXT: LDS_WRITE * T3.W, T2.W,
|
|
; EG-NEXT: AND_INT T2.W, T2.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 216(3.026805e-43)
|
|
; EG-NEXT: LDS_WRITE * T3.W, T2.W,
|
|
; EG-NEXT: LSHR T2.W, T2.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 212(2.970753e-43)
|
|
; EG-NEXT: LDS_WRITE * T3.W, T2.W,
|
|
; EG-NEXT: AND_INT T2.W, T2.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 208(2.914701e-43)
|
|
; EG-NEXT: LDS_WRITE * T3.W, T2.W,
|
|
; EG-NEXT: LSHR T2.W, T1.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 204(2.858649e-43)
|
|
; EG-NEXT: LDS_WRITE * T3.W, T2.W,
|
|
; EG-NEXT: AND_INT T1.W, T1.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 200(2.802597e-43)
|
|
; EG-NEXT: LDS_WRITE * T2.W, T1.W,
|
|
; EG-NEXT: LSHR T1.W, T1.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 196(2.746545e-43)
|
|
; EG-NEXT: LDS_WRITE * T2.W, T1.W,
|
|
; EG-NEXT: AND_INT T1.W, T1.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 192(2.690493e-43)
|
|
; EG-NEXT: LDS_WRITE * T2.W, T1.W,
|
|
; EG-NEXT: LSHR T1.W, T1.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 252(3.531272e-43)
|
|
; EG-NEXT: LDS_WRITE * T2.W, T1.W,
|
|
; EG-NEXT: AND_INT T1.W, T1.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 248(3.475220e-43)
|
|
; EG-NEXT: LDS_WRITE * T2.W, T1.W,
|
|
; EG-NEXT: LSHR T1.W, T0.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 244(3.419168e-43)
|
|
; EG-NEXT: LDS_WRITE * T2.W, T1.W,
|
|
; EG-NEXT: AND_INT T0.W, T0.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 240(3.363116e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: LSHR T0.W, T0.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 236(3.307064e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: AND_INT T0.W, T0.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 232(3.251012e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: LSHR T0.W, T0.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 228(3.194960e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 224(3.138909e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: RETURN
|
|
;
|
|
; VI-DS128-LABEL: local_zextload_v64i16_to_v64i32:
|
|
; VI-DS128: ; %bb.0:
|
|
; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; VI-DS128-NEXT: s_mov_b32 m0, -1
|
|
; VI-DS128-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
|
|
; VI-DS128-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
|
|
; VI-DS128-NEXT: s_mov_b32 s90, -1
|
|
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v0, s1
|
|
; VI-DS128-NEXT: ds_read_b128 v[8:11], v0
|
|
; VI-DS128-NEXT: ds_read_b128 v[16:19], v0 offset:16
|
|
; VI-DS128-NEXT: s_mov_b32 s91, 0xe80000
|
|
; VI-DS128-NEXT: s_add_u32 s88, s88, s11
|
|
; VI-DS128-NEXT: s_addc_u32 s89, s89, 0
|
|
; VI-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32
|
|
; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v19
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v18
|
|
; VI-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v19
|
|
; VI-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v18
|
|
; VI-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 ; 4-byte Folded Spill
|
|
; VI-DS128-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
|
|
; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill
|
|
; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v17
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v16
|
|
; VI-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v17
|
|
; VI-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v16
|
|
; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill
|
|
; VI-DS128-NEXT: buffer_store_dword v5, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill
|
|
; VI-DS128-NEXT: buffer_store_dword v6, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill
|
|
; VI-DS128-NEXT: buffer_store_dword v7, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill
|
|
; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48
|
|
; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v23
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v22
|
|
; VI-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v23
|
|
; VI-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v22
|
|
; VI-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:32 ; 4-byte Folded Spill
|
|
; VI-DS128-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:36 ; 4-byte Folded Spill
|
|
; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:40 ; 4-byte Folded Spill
|
|
; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:44 ; 4-byte Folded Spill
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v21
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v20
|
|
; VI-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v21
|
|
; VI-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v20
|
|
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v27
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v26
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v35, 16, v25
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v33, 16, v24
|
|
; VI-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v27
|
|
; VI-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64
|
|
; VI-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v26
|
|
; VI-DS128-NEXT: v_and_b32_e32 v34, 0xffff, v25
|
|
; VI-DS128-NEXT: v_and_b32_e32 v32, 0xffff, v24
|
|
; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:80
|
|
; VI-DS128-NEXT: ds_read_b128 v[55:58], v0 offset:96
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v11
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v10
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v31, v15
|
|
; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v50, 16, v27
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v48, 16, v26
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v54, 16, v25
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v52, 16, v24
|
|
; VI-DS128-NEXT: v_and_b32_e32 v49, 0xffff, v27
|
|
; VI-DS128-NEXT: v_and_b32_e32 v47, 0xffff, v26
|
|
; VI-DS128-NEXT: v_and_b32_e32 v53, 0xffff, v25
|
|
; VI-DS128-NEXT: v_and_b32_e32 v51, 0xffff, v24
|
|
; VI-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:112
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v9
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v8
|
|
; VI-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v11
|
|
; VI-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v10
|
|
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v25
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v24
|
|
; VI-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v25
|
|
; VI-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v24
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v24, s0
|
|
; VI-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v9
|
|
; VI-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v8
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v42, 16, v39
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v40, 16, v38
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v44, 16, v36
|
|
; VI-DS128-NEXT: v_and_b32_e32 v41, 0xffff, v39
|
|
; VI-DS128-NEXT: v_and_b32_e32 v39, 0xffff, v38
|
|
; VI-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37
|
|
; VI-DS128-NEXT: v_and_b32_e32 v43, 0xffff, v36
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v61, 16, v58
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v59, 16, v57
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v56
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v55
|
|
; VI-DS128-NEXT: v_and_b32_e32 v60, 0xffff, v58
|
|
; VI-DS128-NEXT: v_and_b32_e32 v58, 0xffff, v57
|
|
; VI-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v56
|
|
; VI-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v55
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v27
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v26
|
|
; VI-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v27
|
|
; VI-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v26
|
|
; VI-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:224
|
|
; VI-DS128-NEXT: ds_write_b128 v24, v[4:7] offset:240
|
|
; VI-DS128-NEXT: ds_write_b128 v24, v[8:11] offset:192
|
|
; VI-DS128-NEXT: ds_write_b128 v24, v[58:61] offset:208
|
|
; VI-DS128-NEXT: ds_write_b128 v24, v[51:54] offset:160
|
|
; VI-DS128-NEXT: ds_write_b128 v24, v[47:50] offset:176
|
|
; VI-DS128-NEXT: ds_write_b128 v24, v[43:46] offset:128
|
|
; VI-DS128-NEXT: ds_write_b128 v24, v[39:42] offset:144
|
|
; VI-DS128-NEXT: ds_write_b128 v24, v[32:35] offset:96
|
|
; VI-DS128-NEXT: ds_write_b128 v24, v[20:23] offset:112
|
|
; VI-DS128-NEXT: ds_write_b128 v24, v[16:19] offset:64
|
|
; VI-DS128-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:32 ; 4-byte Folded Reload
|
|
; VI-DS128-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:36 ; 4-byte Folded Reload
|
|
; VI-DS128-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:40 ; 4-byte Folded Reload
|
|
; VI-DS128-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:44 ; 4-byte Folded Reload
|
|
; VI-DS128-NEXT: s_waitcnt vmcnt(0)
|
|
; VI-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:80
|
|
; VI-DS128-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:16 ; 4-byte Folded Reload
|
|
; VI-DS128-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:20 ; 4-byte Folded Reload
|
|
; VI-DS128-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:24 ; 4-byte Folded Reload
|
|
; VI-DS128-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:28 ; 4-byte Folded Reload
|
|
; VI-DS128-NEXT: s_waitcnt vmcnt(0)
|
|
; VI-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:32
|
|
; VI-DS128-NEXT: buffer_load_dword v0, off, s[88:91], 0 ; 4-byte Folded Reload
|
|
; VI-DS128-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload
|
|
; VI-DS128-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:8 ; 4-byte Folded Reload
|
|
; VI-DS128-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:12 ; 4-byte Folded Reload
|
|
; VI-DS128-NEXT: s_waitcnt vmcnt(0)
|
|
; VI-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:48
|
|
; VI-DS128-NEXT: ds_write_b128 v24, v[12:15]
|
|
; VI-DS128-NEXT: ds_write_b128 v24, v[28:31] offset:16
|
|
; VI-DS128-NEXT: s_endpgm
|
|
;
|
|
; GFX9-DS128-LABEL: local_zextload_v64i16_to_v64i32:
|
|
; GFX9-DS128: ; %bb.0:
|
|
; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX9-DS128-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
|
|
; GFX9-DS128-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
|
|
; GFX9-DS128-NEXT: s_mov_b32 s14, -1
|
|
; GFX9-DS128-NEXT: s_mov_b32 s15, 0xe00000
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v0, s1
|
|
; GFX9-DS128-NEXT: ds_read_b128 v[8:11], v0
|
|
; GFX9-DS128-NEXT: ds_read_b128 v[16:19], v0 offset:16
|
|
; GFX9-DS128-NEXT: s_add_u32 s12, s12, s11
|
|
; GFX9-DS128-NEXT: s_addc_u32 s13, s13, 0
|
|
; GFX9-DS128-NEXT: ds_read_b128 v[20:23], v0 offset:32
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2)
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v11
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v19
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v18
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v19
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v18
|
|
; GFX9-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 ; 4-byte Folded Spill
|
|
; GFX9-DS128-NEXT: s_nop 0
|
|
; GFX9-DS128-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
|
|
; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
|
|
; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v17
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v16
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v17
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v16
|
|
; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
|
|
; GFX9-DS128-NEXT: s_nop 0
|
|
; GFX9-DS128-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
|
|
; GFX9-DS128-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
|
|
; GFX9-DS128-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
|
|
; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:48
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v23
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v22
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v23
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v22
|
|
; GFX9-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill
|
|
; GFX9-DS128-NEXT: s_nop 0
|
|
; GFX9-DS128-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill
|
|
; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill
|
|
; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v21
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v20
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v21
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v16, 0xffff, v20
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v27
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v21, 16, v26
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v35, 16, v25
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v33, 16, v24
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v22, 0xffff, v27
|
|
; GFX9-DS128-NEXT: ds_read_b128 v[36:39], v0 offset:64
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v26
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v34, 0xffff, v25
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v32, 0xffff, v24
|
|
; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:80
|
|
; GFX9-DS128-NEXT: ds_read_b128 v[55:58], v0 offset:96
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v29, 16, v10
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v31, v15
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v9
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v50, 16, v27
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v48, 16, v26
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v54, 16, v25
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v52, 16, v24
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v49, 0xffff, v27
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v47, 0xffff, v26
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v53, 0xffff, v25
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v51, 0xffff, v24
|
|
; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v0 offset:112
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v8
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v11
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v28, 0xffff, v10
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v9
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v25
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v24
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v25
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v24
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v24, s0
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v8
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v42, 16, v39
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v40, 16, v38
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v46, 16, v37
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v44, 16, v36
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v41, 0xffff, v39
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v39, 0xffff, v38
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v37
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v43, 0xffff, v36
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v61, 16, v58
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v59, 16, v57
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v56
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v55
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v60, 0xffff, v58
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v58, 0xffff, v57
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v56
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v55
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v27
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v26
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v27
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v26
|
|
; GFX9-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:224
|
|
; GFX9-DS128-NEXT: ds_write_b128 v24, v[4:7] offset:240
|
|
; GFX9-DS128-NEXT: ds_write_b128 v24, v[8:11] offset:192
|
|
; GFX9-DS128-NEXT: ds_write_b128 v24, v[58:61] offset:208
|
|
; GFX9-DS128-NEXT: ds_write_b128 v24, v[51:54] offset:160
|
|
; GFX9-DS128-NEXT: ds_write_b128 v24, v[47:50] offset:176
|
|
; GFX9-DS128-NEXT: ds_write_b128 v24, v[43:46] offset:128
|
|
; GFX9-DS128-NEXT: ds_write_b128 v24, v[39:42] offset:144
|
|
; GFX9-DS128-NEXT: ds_write_b128 v24, v[32:35] offset:96
|
|
; GFX9-DS128-NEXT: ds_write_b128 v24, v[20:23] offset:112
|
|
; GFX9-DS128-NEXT: ds_write_b128 v24, v[16:19] offset:64
|
|
; GFX9-DS128-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload
|
|
; GFX9-DS128-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:36 ; 4-byte Folded Reload
|
|
; GFX9-DS128-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:40 ; 4-byte Folded Reload
|
|
; GFX9-DS128-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:44 ; 4-byte Folded Reload
|
|
; GFX9-DS128-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX9-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:80
|
|
; GFX9-DS128-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
|
|
; GFX9-DS128-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
|
|
; GFX9-DS128-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
|
|
; GFX9-DS128-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
|
|
; GFX9-DS128-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX9-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:32
|
|
; GFX9-DS128-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
|
|
; GFX9-DS128-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
|
|
; GFX9-DS128-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
|
|
; GFX9-DS128-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
|
|
; GFX9-DS128-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX9-DS128-NEXT: ds_write_b128 v24, v[0:3] offset:48
|
|
; GFX9-DS128-NEXT: ds_write_b128 v24, v[12:15]
|
|
; GFX9-DS128-NEXT: ds_write_b128 v24, v[28:31] offset:16
|
|
; GFX9-DS128-NEXT: s_endpgm
|
|
%load = load <64 x i16>, ptr addrspace(3) %in
|
|
%ext = zext <64 x i16> %load to <64 x i32>
|
|
store <64 x i32> %ext, ptr addrspace(3) %out
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
|
|
; SI-LABEL: local_sextload_v64i16_to_v64i32:
|
|
; SI: ; %bb.0:
|
|
; SI-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
|
|
; SI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
|
|
; SI-NEXT: s_mov_b32 s14, -1
|
|
; SI-NEXT: s_mov_b32 s15, 0xe8f000
|
|
; SI-NEXT: s_add_u32 s12, s12, s11
|
|
; SI-NEXT: s_addc_u32 s13, s13, 0
|
|
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: v_mov_b32_e32 v20, s1
|
|
; SI-NEXT: s_mov_b32 m0, -1
|
|
; SI-NEXT: ds_read2_b64 v[4:7], v20 offset0:8 offset1:9
|
|
; SI-NEXT: ds_read2_b64 v[0:3], v20 offset0:10 offset1:11
|
|
; SI-NEXT: ds_read2_b64 v[8:11], v20 offset0:12 offset1:13
|
|
; SI-NEXT: ds_read2_b64 v[12:15], v20 offset0:14 offset1:15
|
|
; SI-NEXT: ds_read2_b64 v[16:19], v20 offset1:1
|
|
; SI-NEXT: ds_read2_b64 v[30:33], v20 offset0:2 offset1:3
|
|
; SI-NEXT: ds_read2_b64 v[34:37], v20 offset0:4 offset1:5
|
|
; SI-NEXT: ds_read2_b64 v[38:41], v20 offset0:6 offset1:7
|
|
; SI-NEXT: s_waitcnt lgkmcnt(7)
|
|
; SI-NEXT: v_ashrrev_i32_e32 v21, 16, v5
|
|
; SI-NEXT: v_ashrrev_i32_e32 v23, 16, v4
|
|
; SI-NEXT: v_ashrrev_i32_e32 v25, 16, v7
|
|
; SI-NEXT: v_ashrrev_i32_e32 v27, 16, v6
|
|
; SI-NEXT: s_waitcnt lgkmcnt(6)
|
|
; SI-NEXT: v_ashrrev_i32_e32 v29, 16, v1
|
|
; SI-NEXT: v_bfe_i32 v20, v5, 0, 16
|
|
; SI-NEXT: buffer_store_dword v20, off, s[12:15], 0 ; 4-byte Folded Spill
|
|
; SI-NEXT: buffer_store_dword v21, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
|
|
; SI-NEXT: v_bfe_i32 v22, v4, 0, 16
|
|
; SI-NEXT: v_bfe_i32 v24, v7, 0, 16
|
|
; SI-NEXT: v_bfe_i32 v26, v6, 0, 16
|
|
; SI-NEXT: s_waitcnt expcnt(0)
|
|
; SI-NEXT: v_ashrrev_i32_e32 v21, 16, v0
|
|
; SI-NEXT: v_ashrrev_i32_e32 v7, 16, v3
|
|
; SI-NEXT: v_bfe_i32 v28, v1, 0, 16
|
|
; SI-NEXT: v_bfe_i32 v20, v0, 0, 16
|
|
; SI-NEXT: v_bfe_i32 v6, v3, 0, 16
|
|
; SI-NEXT: v_ashrrev_i32_e32 v5, 16, v2
|
|
; SI-NEXT: v_bfe_i32 v4, v2, 0, 16
|
|
; SI-NEXT: s_waitcnt lgkmcnt(5)
|
|
; SI-NEXT: v_ashrrev_i32_e32 v3, 16, v9
|
|
; SI-NEXT: v_bfe_i32 v2, v9, 0, 16
|
|
; SI-NEXT: v_ashrrev_i32_e32 v9, 16, v8
|
|
; SI-NEXT: v_bfe_i32 v8, v8, 0, 16
|
|
; SI-NEXT: v_ashrrev_i32_e32 v43, 16, v11
|
|
; SI-NEXT: v_bfe_i32 v42, v11, 0, 16
|
|
; SI-NEXT: v_ashrrev_i32_e32 v11, 16, v10
|
|
; SI-NEXT: v_bfe_i32 v10, v10, 0, 16
|
|
; SI-NEXT: s_waitcnt lgkmcnt(4)
|
|
; SI-NEXT: v_ashrrev_i32_e32 v45, 16, v13
|
|
; SI-NEXT: v_bfe_i32 v44, v13, 0, 16
|
|
; SI-NEXT: v_ashrrev_i32_e32 v13, 16, v12
|
|
; SI-NEXT: v_bfe_i32 v12, v12, 0, 16
|
|
; SI-NEXT: v_ashrrev_i32_e32 v47, 16, v15
|
|
; SI-NEXT: v_bfe_i32 v46, v15, 0, 16
|
|
; SI-NEXT: v_ashrrev_i32_e32 v15, 16, v14
|
|
; SI-NEXT: v_bfe_i32 v14, v14, 0, 16
|
|
; SI-NEXT: s_waitcnt lgkmcnt(3)
|
|
; SI-NEXT: v_ashrrev_i32_e32 v49, 16, v17
|
|
; SI-NEXT: v_bfe_i32 v48, v17, 0, 16
|
|
; SI-NEXT: v_ashrrev_i32_e32 v17, 16, v16
|
|
; SI-NEXT: v_bfe_i32 v16, v16, 0, 16
|
|
; SI-NEXT: v_ashrrev_i32_e32 v51, 16, v19
|
|
; SI-NEXT: v_bfe_i32 v50, v19, 0, 16
|
|
; SI-NEXT: v_ashrrev_i32_e32 v19, 16, v18
|
|
; SI-NEXT: v_bfe_i32 v18, v18, 0, 16
|
|
; SI-NEXT: s_waitcnt lgkmcnt(2)
|
|
; SI-NEXT: v_ashrrev_i32_e32 v53, 16, v31
|
|
; SI-NEXT: v_bfe_i32 v52, v31, 0, 16
|
|
; SI-NEXT: v_ashrrev_i32_e32 v31, 16, v30
|
|
; SI-NEXT: v_bfe_i32 v30, v30, 0, 16
|
|
; SI-NEXT: v_ashrrev_i32_e32 v55, 16, v33
|
|
; SI-NEXT: v_bfe_i32 v54, v33, 0, 16
|
|
; SI-NEXT: v_ashrrev_i32_e32 v33, 16, v32
|
|
; SI-NEXT: v_bfe_i32 v32, v32, 0, 16
|
|
; SI-NEXT: s_waitcnt lgkmcnt(1)
|
|
; SI-NEXT: v_ashrrev_i32_e32 v57, 16, v35
|
|
; SI-NEXT: v_bfe_i32 v56, v35, 0, 16
|
|
; SI-NEXT: v_ashrrev_i32_e32 v35, 16, v34
|
|
; SI-NEXT: v_bfe_i32 v34, v34, 0, 16
|
|
; SI-NEXT: v_ashrrev_i32_e32 v59, 16, v37
|
|
; SI-NEXT: v_bfe_i32 v58, v37, 0, 16
|
|
; SI-NEXT: v_ashrrev_i32_e32 v37, 16, v36
|
|
; SI-NEXT: v_bfe_i32 v36, v36, 0, 16
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: v_ashrrev_i32_e32 v61, 16, v39
|
|
; SI-NEXT: v_bfe_i32 v60, v39, 0, 16
|
|
; SI-NEXT: v_ashrrev_i32_e32 v39, 16, v38
|
|
; SI-NEXT: v_bfe_i32 v38, v38, 0, 16
|
|
; SI-NEXT: v_ashrrev_i32_e32 v63, 16, v41
|
|
; SI-NEXT: v_bfe_i32 v62, v41, 0, 16
|
|
; SI-NEXT: v_ashrrev_i32_e32 v41, 16, v40
|
|
; SI-NEXT: v_bfe_i32 v40, v40, 0, 16
|
|
; SI-NEXT: v_mov_b32_e32 v0, s0
|
|
; SI-NEXT: ds_write2_b64 v0, v[40:41], v[62:63] offset0:14 offset1:15
|
|
; SI-NEXT: ds_write2_b64 v0, v[38:39], v[60:61] offset0:12 offset1:13
|
|
; SI-NEXT: ds_write2_b64 v0, v[36:37], v[58:59] offset0:10 offset1:11
|
|
; SI-NEXT: ds_write2_b64 v0, v[34:35], v[56:57] offset0:8 offset1:9
|
|
; SI-NEXT: ds_write2_b64 v0, v[32:33], v[54:55] offset0:6 offset1:7
|
|
; SI-NEXT: ds_write2_b64 v0, v[30:31], v[52:53] offset0:4 offset1:5
|
|
; SI-NEXT: ds_write2_b64 v0, v[18:19], v[50:51] offset0:2 offset1:3
|
|
; SI-NEXT: ds_write2_b64 v0, v[16:17], v[48:49] offset1:1
|
|
; SI-NEXT: ds_write2_b64 v0, v[14:15], v[46:47] offset0:30 offset1:31
|
|
; SI-NEXT: ds_write2_b64 v0, v[12:13], v[44:45] offset0:28 offset1:29
|
|
; SI-NEXT: ds_write2_b64 v0, v[10:11], v[42:43] offset0:26 offset1:27
|
|
; SI-NEXT: ds_write2_b64 v0, v[8:9], v[2:3] offset0:24 offset1:25
|
|
; SI-NEXT: ds_write2_b64 v0, v[4:5], v[6:7] offset0:22 offset1:23
|
|
; SI-NEXT: ds_write2_b64 v0, v[20:21], v[28:29] offset0:20 offset1:21
|
|
; SI-NEXT: ds_write2_b64 v0, v[26:27], v[24:25] offset0:18 offset1:19
|
|
; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 ; 4-byte Folded Reload
|
|
; SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
|
|
; SI-NEXT: s_waitcnt vmcnt(0)
|
|
; SI-NEXT: ds_write2_b64 v0, v[22:23], v[1:2] offset0:16 offset1:17
|
|
; SI-NEXT: s_endpgm
|
|
;
|
|
; VI-NO-DS128-LABEL: local_sextload_v64i16_to_v64i32:
|
|
; VI-NO-DS128: ; %bb.0:
|
|
; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; VI-NO-DS128-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
|
|
; VI-NO-DS128-NEXT: s_mov_b32 m0, -1
|
|
; VI-NO-DS128-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
|
|
; VI-NO-DS128-NEXT: s_mov_b32 s90, -1
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v28, s1
|
|
; VI-NO-DS128-NEXT: ds_read2_b64 v[10:13], v28 offset1:1
|
|
; VI-NO-DS128-NEXT: ds_read2_b64 v[14:17], v28 offset0:2 offset1:3
|
|
; VI-NO-DS128-NEXT: s_mov_b32 s91, 0xe80000
|
|
; VI-NO-DS128-NEXT: s_add_u32 s88, s88, s11
|
|
; VI-NO-DS128-NEXT: s_addc_u32 s89, s89, 0
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v11
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16
|
|
; VI-NO-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill
|
|
; VI-NO-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
|
|
; VI-NO-DS128-NEXT: ds_read2_b64 v[20:23], v28 offset0:4 offset1:5
|
|
; VI-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:6 offset1:7
|
|
; VI-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:8 offset1:9
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v10
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v13
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v12
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v30
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v24, v30, 0, 16
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v29
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v26, v29, 0, 16
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v38, 16, v32
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v37, v32, 0, 16
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v40, 16, v31
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v39, v31, 0, 16
|
|
; VI-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:10 offset1:11
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v42, 16, v34
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v41, v34, 0, 16
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v44, 16, v33
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v43, v33, 0, 16
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v46, 16, v36
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v45, v36, 0, 16
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v48, 16, v35
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v47, v35, 0, 16
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v50, 16, v30
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v49, v30, 0, 16
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v52, 16, v29
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v51, v29, 0, 16
|
|
; VI-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:12 offset1:13
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v56, 16, v31
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v55, v31, 0, 16
|
|
; VI-NO-DS128-NEXT: ds_read2_b64 v[28:31], v28 offset0:14 offset1:15
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v54, 16, v32
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v53, v32, 0, 16
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v15
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v2, v10, 0, 16
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v32, 16, v31
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v31, v31, 0, 16
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v30
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v0, v30, 0, 16
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v30, s0
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v4, v13, 0, 16
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v6, v12, 0, 16
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v14
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v17
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v8, v15, 0, 16
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v10, v14, 0, 16
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v12, v17, 0, 16
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v16
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v14, v16, 0, 16
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v21
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v16, v21, 0, 16
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v20
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v18, v20, 0, 16
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v23
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v20, v23, 0, 16
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v22
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v22, v22, 0, 16
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v58, 16, v34
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v57, v34, 0, 16
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v34, 16, v33
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v33, v33, 0, 16
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v60, 16, v36
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v59, v36, 0, 16
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v36, 16, v35
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v35, v35, 0, 16
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v62, 16, v29
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v61, v29, 0, 16
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v28
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v28, v28, 0, 16
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[0:1], v[31:32] offset0:30 offset1:31
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[28:29], v[61:62] offset0:28 offset1:29
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[35:36], v[59:60] offset0:26 offset1:27
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[33:34], v[57:58] offset0:24 offset1:25
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[55:56], v[53:54] offset0:22 offset1:23
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[51:52], v[49:50] offset0:20 offset1:21
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[47:48], v[45:46] offset0:18 offset1:19
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[43:44], v[41:42] offset0:16 offset1:17
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[39:40], v[37:38] offset0:14 offset1:15
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[26:27], v[24:25] offset0:12 offset1:13
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[22:23], v[20:21] offset0:10 offset1:11
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[18:19], v[16:17] offset0:8 offset1:9
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[14:15], v[12:13] offset0:6 offset1:7
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[10:11], v[8:9] offset0:4 offset1:5
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[6:7], v[4:5] offset0:2 offset1:3
|
|
; VI-NO-DS128-NEXT: buffer_load_dword v0, off, s[88:91], 0 ; 4-byte Folded Reload
|
|
; VI-NO-DS128-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload
|
|
; VI-NO-DS128-NEXT: s_waitcnt vmcnt(0)
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v30, v[2:3], v[0:1] offset1:1
|
|
; VI-NO-DS128-NEXT: s_endpgm
|
|
;
|
|
; GFX9-NO-DS128-LABEL: local_sextload_v64i16_to_v64i32:
|
|
; GFX9-NO-DS128: ; %bb.0:
|
|
; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX9-NO-DS128-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
|
|
; GFX9-NO-DS128-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
|
|
; GFX9-NO-DS128-NEXT: s_mov_b32 s14, -1
|
|
; GFX9-NO-DS128-NEXT: s_mov_b32 s15, 0xe00000
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v28, s1
|
|
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[10:13], v28 offset1:1
|
|
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[14:17], v28 offset0:2 offset1:3
|
|
; GFX9-NO-DS128-NEXT: s_add_u32 s12, s12, s11
|
|
; GFX9-NO-DS128-NEXT: s_addc_u32 s13, s13, 0
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v11
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16
|
|
; GFX9-NO-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
|
|
; GFX9-NO-DS128-NEXT: s_nop 0
|
|
; GFX9-NO-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
|
|
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[20:23], v28 offset0:4 offset1:5
|
|
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:6 offset1:7
|
|
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:8 offset1:9
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v10
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v13
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v12
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v30
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v24, v30, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v29
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v26, v29, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v38, 16, v32
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v37, v32, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v40, 16, v31
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v39, v31, 0, 16
|
|
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[29:32], v28 offset0:10 offset1:11
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v42, 16, v34
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v41, v34, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v44, 16, v33
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v43, v33, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v46, 16, v36
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v45, v36, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v48, 16, v35
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v47, v35, 0, 16
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v50, 16, v30
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v49, v30, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v52, 16, v29
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v51, v29, 0, 16
|
|
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[33:36], v28 offset0:12 offset1:13
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v56, 16, v31
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v55, v31, 0, 16
|
|
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[28:31], v28 offset0:14 offset1:15
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v54, 16, v32
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v53, v32, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v15
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v10, 0, 16
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v32, 16, v31
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v31, v31, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v30
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v30, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v30, s0
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v13, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v6, v12, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v14
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v17
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v8, v15, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v10, v14, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v12, v17, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v16
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v16, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v21
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v21, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v20
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v20, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v23
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v20, v23, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v23, 16, v22
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v22, v22, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v58, 16, v34
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v57, v34, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v34, 16, v33
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v33, v33, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v60, 16, v36
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v59, v36, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v36, 16, v35
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v35, v35, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v62, 16, v29
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v61, v29, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v28
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v28, v28, 0, 16
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[0:1], v[31:32] offset0:30 offset1:31
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[28:29], v[61:62] offset0:28 offset1:29
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[35:36], v[59:60] offset0:26 offset1:27
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[33:34], v[57:58] offset0:24 offset1:25
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[55:56], v[53:54] offset0:22 offset1:23
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[51:52], v[49:50] offset0:20 offset1:21
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[47:48], v[45:46] offset0:18 offset1:19
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[43:44], v[41:42] offset0:16 offset1:17
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[39:40], v[37:38] offset0:14 offset1:15
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[26:27], v[24:25] offset0:12 offset1:13
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[22:23], v[20:21] offset0:10 offset1:11
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[18:19], v[16:17] offset0:8 offset1:9
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[14:15], v[12:13] offset0:6 offset1:7
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[10:11], v[8:9] offset0:4 offset1:5
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[6:7], v[4:5] offset0:2 offset1:3
|
|
; GFX9-NO-DS128-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
|
|
; GFX9-NO-DS128-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v30, v[2:3], v[0:1] offset1:1
|
|
; GFX9-NO-DS128-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: local_sextload_v64i16_to_v64i32:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: ALU 116, @30, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T0.Y, OQAP,
|
|
; EG-NEXT: MOV * T0.W, KC0[2].Z,
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T0.Z, OQAP,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T0.W, OQAP,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T1.W
|
|
; EG-NEXT: MOV T1.Y, OQAP,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T1.W
|
|
; EG-NEXT: MOV T1.Z, OQAP,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T1.W
|
|
; EG-NEXT: MOV T1.W, OQAP,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 44(6.165713e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T2.W
|
|
; EG-NEXT: MOV T2.Y, OQAP,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 40(5.605194e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T2.W
|
|
; EG-NEXT: MOV T2.Z, OQAP,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 36(5.044674e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T2.W
|
|
; EG-NEXT: MOV T2.W, OQAP,
|
|
; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T3.W
|
|
; EG-NEXT: MOV T3.Y, OQAP,
|
|
; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 60(8.407791e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T3.W
|
|
; EG-NEXT: MOV T3.Z, OQAP,
|
|
; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 56(7.847271e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T3.W
|
|
; EG-NEXT: MOV T3.W, OQAP,
|
|
; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 52(7.286752e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T4.W
|
|
; EG-NEXT: MOV T4.Y, OQAP,
|
|
; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T4.W
|
|
; EG-NEXT: MOV T4.Z, OQAP,
|
|
; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 76(1.064987e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T4.W
|
|
; EG-NEXT: MOV T4.W, OQAP,
|
|
; EG-NEXT: ADD_INT * T5.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 72(1.008935e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T5.W
|
|
; EG-NEXT: MOV T5.Y, OQAP,
|
|
; EG-NEXT: ADD_INT * T5.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 68(9.528830e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T5.W
|
|
; EG-NEXT: MOV T5.Z, OQAP,
|
|
; EG-NEXT: ADD_INT * T5.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 64(8.968310e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T5.W
|
|
; EG-NEXT: MOV T5.W, OQAP,
|
|
; EG-NEXT: ADD_INT * T6.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 92(1.289195e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T6.W
|
|
; EG-NEXT: MOV T6.Y, OQAP,
|
|
; EG-NEXT: ADD_INT * T6.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 88(1.233143e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T6.W
|
|
; EG-NEXT: MOV T6.Z, OQAP,
|
|
; EG-NEXT: ADD_INT * T6.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 84(1.177091e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T6.W
|
|
; EG-NEXT: MOV T6.W, OQAP,
|
|
; EG-NEXT: ADD_INT * T7.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 80(1.121039e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T7.W
|
|
; EG-NEXT: MOV T7.Y, OQAP,
|
|
; EG-NEXT: ADD_INT * T7.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 108(1.513402e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T7.W
|
|
; EG-NEXT: MOV T7.Z, OQAP,
|
|
; EG-NEXT: ADD_INT * T7.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 104(1.457350e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T7.W
|
|
; EG-NEXT: MOV T7.W, OQAP,
|
|
; EG-NEXT: ADD_INT * T8.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 100(1.401298e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T8.W
|
|
; EG-NEXT: MOV T8.Y, OQAP,
|
|
; EG-NEXT: ADD_INT * T8.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 96(1.345247e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T8.W
|
|
; EG-NEXT: MOV T8.Z, OQAP,
|
|
; EG-NEXT: ADD_INT * T8.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 124(1.737610e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T8.W
|
|
; EG-NEXT: MOV T8.W, OQAP,
|
|
; EG-NEXT: ADD_INT * T9.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 120(1.681558e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T9.W
|
|
; EG-NEXT: MOV T9.Y, OQAP,
|
|
; EG-NEXT: ADD_INT * T9.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 116(1.625506e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T9.W
|
|
; EG-NEXT: MOV T9.Z, OQAP,
|
|
; EG-NEXT: ADD_INT * T9.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00)
|
|
; EG-NEXT: ALU 85, @31, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T9.W
|
|
; EG-NEXT: MOV T9.W, OQAP,
|
|
; EG-NEXT: ADD_INT * T10.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T10.W
|
|
; EG-NEXT: MOV T10.Y, OQAP,
|
|
; EG-NEXT: LSHR T10.W, T9.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T11.W, KC0[2].Z, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 112(1.569454e-43)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T11.W
|
|
; EG-NEXT: MOV T10.Z, OQAP,
|
|
; EG-NEXT: LSHR * T11.Z, T10.Y, literal.x,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T10.W, T10.W, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 28(3.923636e-44)
|
|
; EG-NEXT: LDS_WRITE * T11.W, T10.W,
|
|
; EG-NEXT: LSHR T12.Z, T0.Y, literal.x,
|
|
; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 20(2.802597e-44)
|
|
; EG-NEXT: LDS_WRITE * T11.W, T10.W,
|
|
; EG-NEXT: LSHR T11.Z, T0.Z, literal.x,
|
|
; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x, BS:VEC_120/SCL_212
|
|
; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 12(1.681558e-44)
|
|
; EG-NEXT: LDS_WRITE * T11.W, T10.W,
|
|
; EG-NEXT: LSHR T12.Z, T0.W, literal.x,
|
|
; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 4(5.605194e-45)
|
|
; EG-NEXT: LDS_WRITE * T11.W, T10.W,
|
|
; EG-NEXT: LSHR T11.Z, T1.Y, literal.x,
|
|
; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 60(8.407791e-44)
|
|
; EG-NEXT: LDS_WRITE * T11.W, T10.W,
|
|
; EG-NEXT: LSHR T12.Z, T1.Z, literal.x,
|
|
; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x, BS:VEC_120/SCL_212
|
|
; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 52(7.286752e-44)
|
|
; EG-NEXT: LDS_WRITE * T11.W, T10.W,
|
|
; EG-NEXT: LSHR T11.Z, T1.W, literal.x,
|
|
; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 44(6.165713e-44)
|
|
; EG-NEXT: LDS_WRITE * T11.W, T10.W,
|
|
; EG-NEXT: LSHR T12.Z, T2.Y, literal.x,
|
|
; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 36(5.044674e-44)
|
|
; EG-NEXT: LDS_WRITE * T11.W, T10.W,
|
|
; EG-NEXT: LSHR T11.Z, T2.Z, literal.x,
|
|
; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x, BS:VEC_120/SCL_212
|
|
; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 92(1.289195e-43)
|
|
; EG-NEXT: LDS_WRITE * T11.W, T10.W,
|
|
; EG-NEXT: LSHR T12.Z, T2.W, literal.x,
|
|
; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 84(1.177091e-43)
|
|
; EG-NEXT: LDS_WRITE * T11.W, T10.W,
|
|
; EG-NEXT: LSHR T11.Z, T3.Y, literal.x,
|
|
; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 76(1.064987e-43)
|
|
; EG-NEXT: LDS_WRITE * T11.W, T10.W,
|
|
; EG-NEXT: LSHR T12.Z, T3.Z, literal.x,
|
|
; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x, BS:VEC_120/SCL_212
|
|
; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 68(9.528830e-44)
|
|
; EG-NEXT: LDS_WRITE * T11.W, T10.W,
|
|
; EG-NEXT: LSHR T11.Z, T3.W, literal.x,
|
|
; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 124(1.737610e-43)
|
|
; EG-NEXT: LDS_WRITE * T11.W, T10.W,
|
|
; EG-NEXT: LSHR T12.Z, T4.Y, literal.x,
|
|
; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 116(1.625506e-43)
|
|
; EG-NEXT: LDS_WRITE * T11.W, T10.W,
|
|
; EG-NEXT: LSHR T11.Z, T4.Z, literal.x,
|
|
; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x, BS:VEC_120/SCL_212
|
|
; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 108(1.513402e-43)
|
|
; EG-NEXT: ALU 83, @32, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: LDS_WRITE * T11.W, T10.W,
|
|
; EG-NEXT: LSHR T12.Z, T4.W, literal.x,
|
|
; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 100(1.401298e-43)
|
|
; EG-NEXT: LDS_WRITE * T11.W, T10.W,
|
|
; EG-NEXT: LSHR T11.Z, T5.Y, literal.x,
|
|
; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 156(2.186026e-43)
|
|
; EG-NEXT: LDS_WRITE * T11.W, T10.W,
|
|
; EG-NEXT: LSHR T12.Z, T5.Z, literal.x,
|
|
; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x, BS:VEC_120/SCL_212
|
|
; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 148(2.073922e-43)
|
|
; EG-NEXT: LDS_WRITE * T11.W, T10.W,
|
|
; EG-NEXT: LSHR T11.Z, T5.W, literal.x,
|
|
; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 140(1.961818e-43)
|
|
; EG-NEXT: LDS_WRITE * T11.W, T10.W,
|
|
; EG-NEXT: LSHR T12.Z, T6.Y, literal.x,
|
|
; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 132(1.849714e-43)
|
|
; EG-NEXT: LDS_WRITE * T11.W, T10.W,
|
|
; EG-NEXT: LSHR T11.Z, T6.Z, literal.x,
|
|
; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x, BS:VEC_120/SCL_212
|
|
; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 188(2.634441e-43)
|
|
; EG-NEXT: LDS_WRITE * T11.W, T10.W,
|
|
; EG-NEXT: LSHR T12.Z, T6.W, literal.x,
|
|
; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 180(2.522337e-43)
|
|
; EG-NEXT: LDS_WRITE * T11.W, T10.W,
|
|
; EG-NEXT: LSHR T11.Z, T7.Y, literal.x,
|
|
; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 172(2.410233e-43)
|
|
; EG-NEXT: LDS_WRITE * T11.W, T10.W,
|
|
; EG-NEXT: LSHR T12.Z, T7.Z, literal.x,
|
|
; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x, BS:VEC_120/SCL_212
|
|
; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 164(2.298129e-43)
|
|
; EG-NEXT: LDS_WRITE * T11.W, T10.W,
|
|
; EG-NEXT: LSHR T11.Z, T7.W, literal.x,
|
|
; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 220(3.082857e-43)
|
|
; EG-NEXT: LDS_WRITE * T11.W, T10.W,
|
|
; EG-NEXT: LSHR T12.Z, T8.Y, literal.x,
|
|
; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 212(2.970753e-43)
|
|
; EG-NEXT: LDS_WRITE * T11.W, T10.W,
|
|
; EG-NEXT: LSHR T11.Z, T8.Z, literal.x,
|
|
; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x, BS:VEC_120/SCL_212
|
|
; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 204(2.858649e-43)
|
|
; EG-NEXT: LDS_WRITE * T11.W, T10.W,
|
|
; EG-NEXT: LSHR T12.Z, T8.W, literal.x,
|
|
; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 196(2.746545e-43)
|
|
; EG-NEXT: LDS_WRITE * T11.W, T10.W,
|
|
; EG-NEXT: LSHR T11.Z, T9.Y, literal.x,
|
|
; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 252(3.531272e-43)
|
|
; EG-NEXT: LDS_WRITE * T11.W, T10.W,
|
|
; EG-NEXT: LSHR T12.Z, T9.Z, literal.x,
|
|
; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x, BS:VEC_120/SCL_212
|
|
; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 244(3.419168e-43)
|
|
; EG-NEXT: LDS_WRITE * T11.W, T10.W,
|
|
; EG-NEXT: LSHR T11.Z, T10.Z, literal.x,
|
|
; EG-NEXT: BFE_INT T10.W, T12.Z, 0.0, literal.x, BS:VEC_120/SCL_212
|
|
; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 236(3.307064e-43)
|
|
; EG-NEXT: LDS_WRITE * T11.W, T10.W,
|
|
; EG-NEXT: BFE_INT T10.W, T11.Z, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T11.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 228(3.194960e-43)
|
|
; EG-NEXT: ALU 94, @33, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: LDS_WRITE * T11.W, T10.W,
|
|
; EG-NEXT: BFE_INT T9.W, T9.W, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T10.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
|
|
; EG-NEXT: LDS_WRITE * T10.W, T9.W,
|
|
; EG-NEXT: BFE_INT T9.W, T10.Y, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T10.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T10.W, T9.W,
|
|
; EG-NEXT: BFE_INT T9.W, T0.Y, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T10.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44)
|
|
; EG-NEXT: LDS_WRITE * T10.W, T9.W,
|
|
; EG-NEXT: BFE_INT T9.W, T0.Z, 0.0, literal.x,
|
|
; EG-NEXT: MOV * T10.W, KC0[2].Y,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T10.W, T9.W,
|
|
; EG-NEXT: BFE_INT T0.W, T0.W, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T9.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 56(7.847271e-44)
|
|
; EG-NEXT: LDS_WRITE * T9.W, T0.W,
|
|
; EG-NEXT: BFE_INT T0.W, T1.Y, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T9.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 48(6.726233e-44)
|
|
; EG-NEXT: LDS_WRITE * T9.W, T0.W,
|
|
; EG-NEXT: BFE_INT T0.W, T1.Z, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T9.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 40(5.605194e-44)
|
|
; EG-NEXT: LDS_WRITE * T9.W, T0.W,
|
|
; EG-NEXT: BFE_INT T0.W, T1.W, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 32(4.484155e-44)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: BFE_INT T0.W, T2.Y, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 88(1.233143e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: BFE_INT T0.W, T2.Z, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 80(1.121039e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: BFE_INT T0.W, T2.W, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 72(1.008935e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: BFE_INT T0.W, T3.Y, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 64(8.968310e-44)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: BFE_INT T0.W, T3.Z, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 120(1.681558e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: BFE_INT T0.W, T3.W, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 112(1.569454e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: BFE_INT T0.W, T4.Y, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 104(1.457350e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: BFE_INT T0.W, T4.Z, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 96(1.345247e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: BFE_INT T0.W, T4.W, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 152(2.129974e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: BFE_INT T0.W, T5.Y, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 144(2.017870e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: BFE_INT T0.W, T5.Z, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 136(1.905766e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: BFE_INT T0.W, T5.W, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 128(1.793662e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: BFE_INT T0.W, T6.Y, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 184(2.578389e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: BFE_INT T0.W, T6.Z, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 176(2.466285e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: BFE_INT T0.W, T6.W, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 168(2.354181e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: BFE_INT * T0.W, T7.Y, 0.0, literal.x,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: ALU 34, @34, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 160(2.242078e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: BFE_INT T0.W, T7.Z, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 216(3.026805e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: BFE_INT T0.W, T7.W, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 208(2.914701e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: BFE_INT T0.W, T8.Y, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 200(2.802597e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: BFE_INT T0.W, T8.Z, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 192(2.690493e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: BFE_INT T0.W, T8.W, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 248(3.475220e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: BFE_INT T0.W, T9.Y, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 240(3.363116e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: BFE_INT T0.W, T9.Z, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 232(3.251012e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: BFE_INT T0.W, T10.Z, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 224(3.138909e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: RETURN
|
|
;
|
|
; VI-DS128-LABEL: local_sextload_v64i16_to_v64i32:
|
|
; VI-DS128: ; %bb.0:
|
|
; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; VI-DS128-NEXT: s_mov_b32 m0, -1
|
|
; VI-DS128-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
|
|
; VI-DS128-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
|
|
; VI-DS128-NEXT: s_mov_b32 s90, -1
|
|
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v32, s1
|
|
; VI-DS128-NEXT: ds_read_b128 v[8:11], v32
|
|
; VI-DS128-NEXT: ds_read_b128 v[16:19], v32 offset:16
|
|
; VI-DS128-NEXT: s_mov_b32 s91, 0xe80000
|
|
; VI-DS128-NEXT: s_add_u32 s88, s88, s11
|
|
; VI-DS128-NEXT: s_addc_u32 s89, s89, 0
|
|
; VI-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32
|
|
; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v19
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v18
|
|
; VI-DS128-NEXT: v_bfe_i32 v2, v19, 0, 16
|
|
; VI-DS128-NEXT: v_bfe_i32 v0, v18, 0, 16
|
|
; VI-DS128-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill
|
|
; VI-DS128-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
|
|
; VI-DS128-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill
|
|
; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v6, 16, v17
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v16
|
|
; VI-DS128-NEXT: v_bfe_i32 v5, v17, 0, 16
|
|
; VI-DS128-NEXT: v_bfe_i32 v3, v16, 0, 16
|
|
; VI-DS128-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill
|
|
; VI-DS128-NEXT: buffer_store_dword v4, off, s[88:91], 0 offset:20 ; 4-byte Folded Spill
|
|
; VI-DS128-NEXT: buffer_store_dword v5, off, s[88:91], 0 offset:24 ; 4-byte Folded Spill
|
|
; VI-DS128-NEXT: buffer_store_dword v6, off, s[88:91], 0 offset:28 ; 4-byte Folded Spill
|
|
; VI-DS128-NEXT: ds_read_b128 v[33:36], v32 offset:48
|
|
; VI-DS128-NEXT: ds_read_b128 v[40:43], v32 offset:80
|
|
; VI-DS128-NEXT: s_waitcnt lgkmcnt(2)
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v27
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v26
|
|
; VI-DS128-NEXT: v_bfe_i32 v18, v27, 0, 16
|
|
; VI-DS128-NEXT: v_bfe_i32 v16, v26, 0, 16
|
|
; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v36
|
|
; VI-DS128-NEXT: v_bfe_i32 v26, v36, 0, 16
|
|
; VI-DS128-NEXT: ds_read_b128 v[36:39], v32 offset:64
|
|
; VI-DS128-NEXT: ds_read_b128 v[56:59], v32 offset:96
|
|
; VI-DS128-NEXT: s_waitcnt lgkmcnt(2)
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v53, 16, v40
|
|
; VI-DS128-NEXT: v_bfe_i32 v52, v40, 0, 16
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v11
|
|
; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v47, 16, v39
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v45, 16, v38
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v51, 16, v37
|
|
; VI-DS128-NEXT: v_bfe_i32 v46, v39, 0, 16
|
|
; VI-DS128-NEXT: v_bfe_i32 v44, v38, 0, 16
|
|
; VI-DS128-NEXT: v_bfe_i32 v50, v37, 0, 16
|
|
; VI-DS128-NEXT: ds_read_b128 v[37:40], v32 offset:112
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v32, s0
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v10
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v23, v15
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v9
|
|
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v38
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v37
|
|
; VI-DS128-NEXT: v_bfe_i32 v2, v38, 0, 16
|
|
; VI-DS128-NEXT: v_bfe_i32 v0, v37, 0, 16
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v8
|
|
; VI-DS128-NEXT: v_bfe_i32 v22, v11, 0, 16
|
|
; VI-DS128-NEXT: v_bfe_i32 v20, v10, 0, 16
|
|
; VI-DS128-NEXT: v_bfe_i32 v14, v9, 0, 16
|
|
; VI-DS128-NEXT: v_bfe_i32 v12, v8, 0, 16
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v25
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v24
|
|
; VI-DS128-NEXT: v_bfe_i32 v10, v25, 0, 16
|
|
; VI-DS128-NEXT: v_bfe_i32 v8, v24, 0, 16
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v31, 16, v34
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v33
|
|
; VI-DS128-NEXT: v_bfe_i32 v24, v35, 0, 16
|
|
; VI-DS128-NEXT: v_bfe_i32 v30, v34, 0, 16
|
|
; VI-DS128-NEXT: v_bfe_i32 v28, v33, 0, 16
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v49, 16, v36
|
|
; VI-DS128-NEXT: v_bfe_i32 v48, v36, 0, 16
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v36, 16, v43
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v34, 16, v42
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v55, 16, v41
|
|
; VI-DS128-NEXT: v_bfe_i32 v35, v43, 0, 16
|
|
; VI-DS128-NEXT: v_bfe_i32 v33, v42, 0, 16
|
|
; VI-DS128-NEXT: v_bfe_i32 v54, v41, 0, 16
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v62, 16, v59
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v60, 16, v58
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v57
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v56
|
|
; VI-DS128-NEXT: v_bfe_i32 v61, v59, 0, 16
|
|
; VI-DS128-NEXT: v_bfe_i32 v59, v58, 0, 16
|
|
; VI-DS128-NEXT: v_bfe_i32 v6, v57, 0, 16
|
|
; VI-DS128-NEXT: v_bfe_i32 v4, v56, 0, 16
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v43, 16, v40
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v41, 16, v39
|
|
; VI-DS128-NEXT: v_bfe_i32 v42, v40, 0, 16
|
|
; VI-DS128-NEXT: v_bfe_i32 v40, v39, 0, 16
|
|
; VI-DS128-NEXT: ds_write_b128 v32, v[0:3] offset:224
|
|
; VI-DS128-NEXT: ds_write_b128 v32, v[40:43] offset:240
|
|
; VI-DS128-NEXT: ds_write_b128 v32, v[4:7] offset:192
|
|
; VI-DS128-NEXT: ds_write_b128 v32, v[59:62] offset:208
|
|
; VI-DS128-NEXT: ds_write_b128 v32, v[52:55] offset:160
|
|
; VI-DS128-NEXT: ds_write_b128 v32, v[33:36] offset:176
|
|
; VI-DS128-NEXT: ds_write_b128 v32, v[48:51] offset:128
|
|
; VI-DS128-NEXT: ds_write_b128 v32, v[44:47] offset:144
|
|
; VI-DS128-NEXT: ds_write_b128 v32, v[28:31] offset:96
|
|
; VI-DS128-NEXT: ds_write_b128 v32, v[24:27] offset:112
|
|
; VI-DS128-NEXT: ds_write_b128 v32, v[8:11] offset:64
|
|
; VI-DS128-NEXT: ds_write_b128 v32, v[16:19] offset:80
|
|
; VI-DS128-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:16 ; 4-byte Folded Reload
|
|
; VI-DS128-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:20 ; 4-byte Folded Reload
|
|
; VI-DS128-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:24 ; 4-byte Folded Reload
|
|
; VI-DS128-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:28 ; 4-byte Folded Reload
|
|
; VI-DS128-NEXT: s_waitcnt vmcnt(0)
|
|
; VI-DS128-NEXT: ds_write_b128 v32, v[0:3] offset:32
|
|
; VI-DS128-NEXT: buffer_load_dword v0, off, s[88:91], 0 ; 4-byte Folded Reload
|
|
; VI-DS128-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload
|
|
; VI-DS128-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:8 ; 4-byte Folded Reload
|
|
; VI-DS128-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:12 ; 4-byte Folded Reload
|
|
; VI-DS128-NEXT: s_waitcnt vmcnt(0)
|
|
; VI-DS128-NEXT: ds_write_b128 v32, v[0:3] offset:48
|
|
; VI-DS128-NEXT: ds_write_b128 v32, v[12:15]
|
|
; VI-DS128-NEXT: ds_write_b128 v32, v[20:23] offset:16
|
|
; VI-DS128-NEXT: s_endpgm
|
|
;
|
|
; GFX9-DS128-LABEL: local_sextload_v64i16_to_v64i32:
|
|
; GFX9-DS128: ; %bb.0:
|
|
; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX9-DS128-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
|
|
; GFX9-DS128-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
|
|
; GFX9-DS128-NEXT: s_mov_b32 s14, -1
|
|
; GFX9-DS128-NEXT: s_mov_b32 s15, 0xe00000
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v32, s1
|
|
; GFX9-DS128-NEXT: ds_read_b128 v[8:11], v32
|
|
; GFX9-DS128-NEXT: ds_read_b128 v[16:19], v32 offset:16
|
|
; GFX9-DS128-NEXT: s_add_u32 s12, s12, s11
|
|
; GFX9-DS128-NEXT: s_addc_u32 s13, s13, 0
|
|
; GFX9-DS128-NEXT: ds_read_b128 v[24:27], v32 offset:32
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2)
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v11
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v19
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v18
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v2, v19, 0, 16
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v0, v18, 0, 16
|
|
; GFX9-DS128-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
|
|
; GFX9-DS128-NEXT: s_nop 0
|
|
; GFX9-DS128-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
|
|
; GFX9-DS128-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
|
|
; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v6, 16, v17
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v4, 16, v16
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v5, v17, 0, 16
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v3, v16, 0, 16
|
|
; GFX9-DS128-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
|
|
; GFX9-DS128-NEXT: s_nop 0
|
|
; GFX9-DS128-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
|
|
; GFX9-DS128-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
|
|
; GFX9-DS128-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
|
|
; GFX9-DS128-NEXT: ds_read_b128 v[33:36], v32 offset:48
|
|
; GFX9-DS128-NEXT: ds_read_b128 v[40:43], v32 offset:80
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2)
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v19, 16, v27
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v17, 16, v26
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v18, v27, 0, 16
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v16, v26, 0, 16
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v27, 16, v36
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v26, v36, 0, 16
|
|
; GFX9-DS128-NEXT: ds_read_b128 v[36:39], v32 offset:64
|
|
; GFX9-DS128-NEXT: ds_read_b128 v[56:59], v32 offset:96
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(2)
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v53, 16, v40
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v52, v40, 0, 16
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v21, 16, v10
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v47, 16, v39
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v45, 16, v38
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v51, 16, v37
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v46, v39, 0, 16
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v44, v38, 0, 16
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v50, v37, 0, 16
|
|
; GFX9-DS128-NEXT: ds_read_b128 v[37:40], v32 offset:112
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v32, s0
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v23, v15
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v15, 16, v9
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v13, 16, v8
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 16, v38
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 16, v37
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v2, v38, 0, 16
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v0, v37, 0, 16
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v22, v11, 0, 16
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v20, v10, 0, 16
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v14, v9, 0, 16
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v12, v8, 0, 16
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v11, 16, v25
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v9, 16, v24
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v10, v25, 0, 16
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v8, v24, 0, 16
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v25, 16, v35
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v31, 16, v34
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v29, 16, v33
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v24, v35, 0, 16
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v30, v34, 0, 16
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v28, v33, 0, 16
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v49, 16, v36
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v48, v36, 0, 16
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v36, 16, v43
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v34, 16, v42
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v55, 16, v41
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v35, v43, 0, 16
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v33, v42, 0, 16
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v54, v41, 0, 16
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v62, 16, v59
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v60, 16, v58
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 16, v57
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 16, v56
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v61, v59, 0, 16
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v59, v58, 0, 16
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v6, v57, 0, 16
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v4, v56, 0, 16
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v43, 16, v40
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v41, 16, v39
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v42, v40, 0, 16
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v40, v39, 0, 16
|
|
; GFX9-DS128-NEXT: ds_write_b128 v32, v[0:3] offset:224
|
|
; GFX9-DS128-NEXT: ds_write_b128 v32, v[40:43] offset:240
|
|
; GFX9-DS128-NEXT: ds_write_b128 v32, v[4:7] offset:192
|
|
; GFX9-DS128-NEXT: ds_write_b128 v32, v[59:62] offset:208
|
|
; GFX9-DS128-NEXT: ds_write_b128 v32, v[52:55] offset:160
|
|
; GFX9-DS128-NEXT: ds_write_b128 v32, v[33:36] offset:176
|
|
; GFX9-DS128-NEXT: ds_write_b128 v32, v[48:51] offset:128
|
|
; GFX9-DS128-NEXT: ds_write_b128 v32, v[44:47] offset:144
|
|
; GFX9-DS128-NEXT: ds_write_b128 v32, v[28:31] offset:96
|
|
; GFX9-DS128-NEXT: ds_write_b128 v32, v[24:27] offset:112
|
|
; GFX9-DS128-NEXT: ds_write_b128 v32, v[8:11] offset:64
|
|
; GFX9-DS128-NEXT: ds_write_b128 v32, v[16:19] offset:80
|
|
; GFX9-DS128-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
|
|
; GFX9-DS128-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
|
|
; GFX9-DS128-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
|
|
; GFX9-DS128-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
|
|
; GFX9-DS128-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX9-DS128-NEXT: ds_write_b128 v32, v[0:3] offset:32
|
|
; GFX9-DS128-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
|
|
; GFX9-DS128-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
|
|
; GFX9-DS128-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
|
|
; GFX9-DS128-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
|
|
; GFX9-DS128-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX9-DS128-NEXT: ds_write_b128 v32, v[0:3] offset:48
|
|
; GFX9-DS128-NEXT: ds_write_b128 v32, v[12:15]
|
|
; GFX9-DS128-NEXT: ds_write_b128 v32, v[20:23] offset:16
|
|
; GFX9-DS128-NEXT: s_endpgm
|
|
%load = load <64 x i16>, ptr addrspace(3) %in
|
|
%ext = sext <64 x i16> %load to <64 x i32>
|
|
store <64 x i32> %ext, ptr addrspace(3) %out
|
|
ret void
|
|
}
|
|
|
|
; Scalar zero-extending load: reads a single i16 from %in, widens it to i64
; with zext, and stores the i64 to %out. Both pointers are in addrspace(3).
; The check lines inside the function are autogenerated (see the NOTE at the
; top of this file); regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
define amdgpu_kernel void @local_zextload_i16_to_i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
|
|
; SI-LABEL: local_zextload_i16_to_i64:
|
|
; SI: ; %bb.0:
|
|
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: v_mov_b32_e32 v0, s1
|
|
; SI-NEXT: s_mov_b32 m0, -1
|
|
; SI-NEXT: ds_read_u16 v0, v0
|
|
; SI-NEXT: v_mov_b32_e32 v1, 0
|
|
; SI-NEXT: v_mov_b32_e32 v2, s0
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: ds_write_b64 v2, v[0:1]
|
|
; SI-NEXT: s_endpgm
|
|
;
|
|
; VI-LABEL: local_zextload_i16_to_i64:
|
|
; VI: ; %bb.0:
|
|
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; VI-NEXT: s_mov_b32 m0, -1
|
|
; VI-NEXT: v_mov_b32_e32 v1, 0
|
|
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NEXT: v_mov_b32_e32 v0, s1
|
|
; VI-NEXT: ds_read_u16 v0, v0
|
|
; VI-NEXT: v_mov_b32_e32 v2, s0
|
|
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
|
; VI-NEXT: ds_write_b64 v2, v[0:1]
|
|
; VI-NEXT: s_endpgm
|
|
;
|
|
; GFX9-LABEL: local_zextload_i16_to_i64:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX9-NEXT: v_mov_b32_e32 v1, 0
|
|
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NEXT: v_mov_b32_e32 v0, s1
|
|
; GFX9-NEXT: ds_read_u16 v0, v0
|
|
; GFX9-NEXT: v_mov_b32_e32 v2, s0
|
|
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
|
; GFX9-NEXT: ds_write_b64 v2, v[0:1]
|
|
; GFX9-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: local_zextload_i16_to_i64:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: ALU 8, @35, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: MOV * T0.W, KC0[2].Z,
|
|
; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T0.X, OQAP,
|
|
; EG-NEXT: MOV * T0.W, KC0[2].Y,
|
|
; EG-NEXT: LDS_WRITE * T0.W, T0.X,
|
|
; EG-NEXT: MOV T0.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 0(0.000000e+00), 4(5.605194e-45)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: RETURN
|
|
%a = load i16, ptr addrspace(3) %in
|
|
%ext = zext i16 %a to i64
|
|
store i64 %ext, ptr addrspace(3) %out
|
|
ret void
|
|
}
|
|
|
|
; FIXME: Need to optimize this sequence to avoid an extra shift.
|
|
; t25: i32,ch = load<LD2[%in(addrspace=3)], anyext from i16> t12, t10, undef:i32
|
|
; t28: i64 = any_extend t25
|
|
; t30: i64 = sign_extend_inreg t28, ValueType:ch:i16
|
|
; Scalar sign-extending load: reads a single i16 from %in, widens it to i64
; with sext, and stores the i64 to %out. Both pointers are in addrspace(3).
; As currently captured by the checks, SI selects ds_read_i16 directly, while
; VI and GFX9 use ds_read_u16 followed by v_bfe_i32; every target then derives
; the high half with an arithmetic shift right by 31.
; The check lines inside the function are autogenerated (see the NOTE at the
; top of this file); regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
define amdgpu_kernel void @local_sextload_i16_to_i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
|
|
; SI-LABEL: local_sextload_i16_to_i64:
|
|
; SI: ; %bb.0:
|
|
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: v_mov_b32_e32 v0, s1
|
|
; SI-NEXT: s_mov_b32 m0, -1
|
|
; SI-NEXT: ds_read_i16 v0, v0
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
|
|
; SI-NEXT: v_mov_b32_e32 v2, s0
|
|
; SI-NEXT: ds_write_b64 v2, v[0:1]
|
|
; SI-NEXT: s_endpgm
|
|
;
|
|
; VI-LABEL: local_sextload_i16_to_i64:
|
|
; VI: ; %bb.0:
|
|
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; VI-NEXT: s_mov_b32 m0, -1
|
|
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NEXT: v_mov_b32_e32 v0, s1
|
|
; VI-NEXT: ds_read_u16 v0, v0
|
|
; VI-NEXT: v_mov_b32_e32 v2, s0
|
|
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NEXT: v_bfe_i32 v0, v0, 0, 16
|
|
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
|
|
; VI-NEXT: ds_write_b64 v2, v[0:1]
|
|
; VI-NEXT: s_endpgm
|
|
;
|
|
; GFX9-LABEL: local_sextload_i16_to_i64:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NEXT: v_mov_b32_e32 v0, s1
|
|
; GFX9-NEXT: ds_read_u16 v0, v0
|
|
; GFX9-NEXT: v_mov_b32_e32 v2, s0
|
|
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16
|
|
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
|
|
; GFX9-NEXT: ds_write_b64 v2, v[0:1]
|
|
; GFX9-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: local_sextload_i16_to_i64:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: ALU 10, @36, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: MOV * T0.W, KC0[2].Z,
|
|
; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV * T0.X, OQAP,
|
|
; EG-NEXT: BFE_INT * T0.W, PV.X, 0.0, literal.x,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: ASHR T1.W, PV.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 31(4.344025e-44), 4(5.605194e-45)
|
|
; EG-NEXT: LDS_WRITE * T2.W, T1.W,
|
|
; EG-NEXT: MOV * T1.W, KC0[2].Y,
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: RETURN
|
|
%a = load i16, ptr addrspace(3) %in
|
|
%ext = sext i16 %a to i64
|
|
store i64 %ext, ptr addrspace(3) %out
|
|
ret void
|
|
}
|
|
|
|
; Single-element vector zero-extending load: reads a <1 x i16> from %in,
; widens it to <1 x i64> with zext, and stores the result to %out. Both
; pointers are in addrspace(3).
; The check lines inside the function are autogenerated (see the NOTE at the
; top of this file); regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
define amdgpu_kernel void @local_zextload_v1i16_to_v1i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
|
|
; SI-LABEL: local_zextload_v1i16_to_v1i64:
|
|
; SI: ; %bb.0:
|
|
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: v_mov_b32_e32 v0, s1
|
|
; SI-NEXT: s_mov_b32 m0, -1
|
|
; SI-NEXT: ds_read_u16 v0, v0
|
|
; SI-NEXT: v_mov_b32_e32 v1, 0
|
|
; SI-NEXT: v_mov_b32_e32 v2, s0
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: ds_write_b64 v2, v[0:1]
|
|
; SI-NEXT: s_endpgm
|
|
;
|
|
; VI-LABEL: local_zextload_v1i16_to_v1i64:
|
|
; VI: ; %bb.0:
|
|
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; VI-NEXT: s_mov_b32 m0, -1
|
|
; VI-NEXT: v_mov_b32_e32 v1, 0
|
|
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NEXT: v_mov_b32_e32 v0, s1
|
|
; VI-NEXT: ds_read_u16 v0, v0
|
|
; VI-NEXT: v_mov_b32_e32 v2, s0
|
|
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
|
; VI-NEXT: ds_write_b64 v2, v[0:1]
|
|
; VI-NEXT: s_endpgm
|
|
;
|
|
; GFX9-LABEL: local_zextload_v1i16_to_v1i64:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX9-NEXT: v_mov_b32_e32 v1, 0
|
|
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NEXT: v_mov_b32_e32 v0, s1
|
|
; GFX9-NEXT: ds_read_u16 v0, v0
|
|
; GFX9-NEXT: v_mov_b32_e32 v2, s0
|
|
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
|
; GFX9-NEXT: ds_write_b64 v2, v[0:1]
|
|
; GFX9-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: local_zextload_v1i16_to_v1i64:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: ALU 8, @37, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: MOV * T0.W, KC0[2].Z,
|
|
; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T0.X, OQAP,
|
|
; EG-NEXT: MOV * T0.W, KC0[2].Y,
|
|
; EG-NEXT: LDS_WRITE * T0.W, T0.X,
|
|
; EG-NEXT: MOV T0.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 0(0.000000e+00), 4(5.605194e-45)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: RETURN
|
|
%load = load <1 x i16>, ptr addrspace(3) %in
|
|
%ext = zext <1 x i16> %load to <1 x i64>
|
|
store <1 x i64> %ext, ptr addrspace(3) %out
|
|
ret void
|
|
}
|
|
|
|
; Single-element vector sign-extending load: reads a <1 x i16> from %in,
; widens it to <1 x i64> with sext, and stores the result to %out. Both
; pointers are in addrspace(3).
; The check lines inside the function are autogenerated (see the NOTE at the
; top of this file); regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
define amdgpu_kernel void @local_sextload_v1i16_to_v1i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
|
|
; SI-LABEL: local_sextload_v1i16_to_v1i64:
|
|
; SI: ; %bb.0:
|
|
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: v_mov_b32_e32 v0, s1
|
|
; SI-NEXT: s_mov_b32 m0, -1
|
|
; SI-NEXT: ds_read_i16 v0, v0
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
|
|
; SI-NEXT: v_mov_b32_e32 v2, s0
|
|
; SI-NEXT: ds_write_b64 v2, v[0:1]
|
|
; SI-NEXT: s_endpgm
|
|
;
|
|
; VI-LABEL: local_sextload_v1i16_to_v1i64:
|
|
; VI: ; %bb.0:
|
|
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; VI-NEXT: s_mov_b32 m0, -1
|
|
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NEXT: v_mov_b32_e32 v0, s1
|
|
; VI-NEXT: ds_read_u16 v0, v0
|
|
; VI-NEXT: v_mov_b32_e32 v2, s0
|
|
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NEXT: v_bfe_i32 v0, v0, 0, 16
|
|
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
|
|
; VI-NEXT: ds_write_b64 v2, v[0:1]
|
|
; VI-NEXT: s_endpgm
|
|
;
|
|
; GFX9-LABEL: local_sextload_v1i16_to_v1i64:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NEXT: v_mov_b32_e32 v0, s1
|
|
; GFX9-NEXT: ds_read_u16 v0, v0
|
|
; GFX9-NEXT: v_mov_b32_e32 v2, s0
|
|
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16
|
|
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
|
|
; GFX9-NEXT: ds_write_b64 v2, v[0:1]
|
|
; GFX9-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: local_sextload_v1i16_to_v1i64:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: ALU 10, @38, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: MOV * T0.W, KC0[2].Z,
|
|
; EG-NEXT: LDS_USHORT_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV * T0.X, OQAP,
|
|
; EG-NEXT: BFE_INT * T0.W, PV.X, 0.0, literal.x,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: ASHR T1.W, PV.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 31(4.344025e-44), 4(5.605194e-45)
|
|
; EG-NEXT: LDS_WRITE * T2.W, T1.W,
|
|
; EG-NEXT: MOV * T1.W, KC0[2].Y,
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: RETURN
|
|
%load = load <1 x i16>, ptr addrspace(3) %in
|
|
%ext = sext <1 x i16> %load to <1 x i64>
|
|
store <1 x i64> %ext, ptr addrspace(3) %out
|
|
ret void
|
|
}
|
|
|
|
; Two-element vector zero-extending load: reads a <2 x i16> from %in, widens
; it to <2 x i64> with zext, and stores the result to %out. Both pointers are
; in addrspace(3).
; The VI and GFX9 expectations are split by prefix here because the output
; differs between the RUN lines: the *-NO-DS128 checks store with
; ds_write2_b64, while the *-DS128 checks store with a single ds_write_b128.
; The check lines inside the function are autogenerated (see the NOTE at the
; top of this file); regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
define amdgpu_kernel void @local_zextload_v2i16_to_v2i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
|
|
; SI-LABEL: local_zextload_v2i16_to_v2i64:
|
|
; SI: ; %bb.0:
|
|
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: v_mov_b32_e32 v0, s1
|
|
; SI-NEXT: s_mov_b32 m0, -1
|
|
; SI-NEXT: ds_read_b32 v2, v0
|
|
; SI-NEXT: v_mov_b32_e32 v1, 0
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v2
|
|
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
|
; SI-NEXT: v_mov_b32_e32 v4, s0
|
|
; SI-NEXT: v_mov_b32_e32 v3, v1
|
|
; SI-NEXT: ds_write2_b64 v4, v[2:3], v[0:1] offset1:1
|
|
; SI-NEXT: s_endpgm
|
|
;
|
|
; VI-NO-DS128-LABEL: local_zextload_v2i16_to_v2i64:
|
|
; VI-NO-DS128: ; %bb.0:
|
|
; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; VI-NO-DS128-NEXT: s_mov_b32 m0, -1
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v1, 0
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v3, v1
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, s1
|
|
; VI-NO-DS128-NEXT: ds_read_b32 v0, v0
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v4, s0
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v0
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v4, v[2:3], v[0:1] offset1:1
|
|
; VI-NO-DS128-NEXT: s_endpgm
|
|
;
|
|
; GFX9-NO-DS128-LABEL: local_zextload_v2i16_to_v2i64:
|
|
; GFX9-NO-DS128: ; %bb.0:
|
|
; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v1, 0
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v3, v1
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s1
|
|
; GFX9-NO-DS128-NEXT: ds_read_b32 v0, v0
|
|
; GFX9-NO-DS128-NEXT: s_mov_b32 s1, 0xffff
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v4, s0
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_sdwa v2, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
|
|
; GFX9-NO-DS128-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: local_zextload_v2i16_to_v2i64:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: ALU 17, @39, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: MOV * T0.W, KC0[2].Z,
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV * T0.Y, OQAP,
|
|
; EG-NEXT: AND_INT T0.W, PV.Y, literal.x,
|
|
; EG-NEXT: MOV * T1.W, KC0[2].Y,
|
|
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: LSHR T0.W, T0.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: MOV * T1.W, literal.y,
|
|
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.W,
|
|
; EG-NEXT: RETURN
|
|
;
|
|
; VI-DS128-LABEL: local_zextload_v2i16_to_v2i64:
|
|
; VI-DS128: ; %bb.0:
|
|
; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; VI-DS128-NEXT: s_mov_b32 m0, -1
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v1, 0
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v3, v1
|
|
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v0, s1
|
|
; VI-DS128-NEXT: ds_read_b32 v0, v0
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v4, s0
|
|
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v0
|
|
; VI-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
|
; VI-DS128-NEXT: ds_write_b128 v4, v[0:3]
|
|
; VI-DS128-NEXT: s_endpgm
|
|
;
|
|
; GFX9-DS128-LABEL: local_zextload_v2i16_to_v2i64:
|
|
; GFX9-DS128: ; %bb.0:
|
|
; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v1, 0
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v3, v1
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v0, s1
|
|
; GFX9-DS128-NEXT: ds_read_b32 v2, v0
|
|
; GFX9-DS128-NEXT: s_mov_b32 s1, 0xffff
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v4, s0
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v2
|
|
; GFX9-DS128-NEXT: v_and_b32_sdwa v2, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
|
; GFX9-DS128-NEXT: ds_write_b128 v4, v[0:3]
|
|
; GFX9-DS128-NEXT: s_endpgm
|
|
%load = load <2 x i16>, ptr addrspace(3) %in
|
|
%ext = zext <2 x i16> %load to <2 x i64>
|
|
store <2 x i64> %ext, ptr addrspace(3) %out
|
|
ret void
|
|
}
|
|
|
|
; Kernel under test - loads <2 x i16> from the addrspace(3) pointer %in,
; sign-extends every element to i64 and stores the <2 x i64> result through
; the addrspace(3) pointer %out.
; The assertions inside the body are autogenerated (see the NOTE at the top of
; this file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
; Expected shape of the output, as recorded by the assertions below -
;   * SI and the NO-DS128 runs finish with one ds_write2_b64,
;   * the DS128 runs finish with one ds_write_b128,
;   * the EG run issues four LDS_WRITE operations.
define amdgpu_kernel void @local_sextload_v2i16_to_v2i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
|
|
; SI-LABEL: local_sextload_v2i16_to_v2i64:
|
|
; SI: ; %bb.0:
|
|
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: v_mov_b32_e32 v0, s1
|
|
; SI-NEXT: s_mov_b32 m0, -1
|
|
; SI-NEXT: ds_read_b32 v0, v0
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
|
|
; SI-NEXT: v_bfe_i32 v0, v0, 0, 16
|
|
; SI-NEXT: v_bfe_i32 v2, v1, 0, 16
|
|
; SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
|
|
; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
|
|
; SI-NEXT: v_mov_b32_e32 v4, s0
|
|
; SI-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
|
|
; SI-NEXT: s_endpgm
|
|
;
|
|
; VI-NO-DS128-LABEL: local_sextload_v2i16_to_v2i64:
|
|
; VI-NO-DS128: ; %bb.0:
|
|
; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; VI-NO-DS128-NEXT: s_mov_b32 m0, -1
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, s1
|
|
; VI-NO-DS128-NEXT: ds_read_b32 v0, v0
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v4, s0
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v0
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v2, v1, 0, 16
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
|
|
; VI-NO-DS128-NEXT: s_endpgm
|
|
;
|
|
; GFX9-NO-DS128-LABEL: local_sextload_v2i16_to_v2i64:
|
|
; GFX9-NO-DS128: ; %bb.0:
|
|
; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s1
|
|
; GFX9-NO-DS128-NEXT: ds_read_b32 v0, v0
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v4, s0
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v0
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v1, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
|
|
; GFX9-NO-DS128-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: local_sextload_v2i16_to_v2i64:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: ALU 18, @40, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: MOV * T0.W, KC0[2].Z,
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV * T0.Y, OQAP,
|
|
; EG-NEXT: BFE_INT * T0.W, PV.Y, 0.0, literal.x,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: ASHR T1.W, PV.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 31(4.344025e-44), 4(5.605194e-45)
|
|
; EG-NEXT: LDS_WRITE * T2.W, T1.W,
|
|
; EG-NEXT: MOV * T1.W, KC0[2].Y,
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: ASHR T0.W, T0.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: ASHR T0.W, T0.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 31(4.344025e-44), 12(1.681558e-44)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: RETURN
|
|
;
|
|
; VI-DS128-LABEL: local_sextload_v2i16_to_v2i64:
|
|
; VI-DS128: ; %bb.0:
|
|
; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; VI-DS128-NEXT: s_mov_b32 m0, -1
|
|
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v0, s1
|
|
; VI-DS128-NEXT: ds_read_b32 v1, v0
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v4, s0
|
|
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v1
|
|
; VI-DS128-NEXT: v_bfe_i32 v0, v1, 0, 16
|
|
; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2
|
|
; VI-DS128-NEXT: ds_write_b128 v4, v[0:3]
|
|
; VI-DS128-NEXT: s_endpgm
|
|
;
|
|
; GFX9-DS128-LABEL: local_sextload_v2i16_to_v2i64:
|
|
; GFX9-DS128: ; %bb.0:
|
|
; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v0, s1
|
|
; GFX9-DS128-NEXT: ds_read_b32 v1, v0
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v4, s0
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v1
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v0, v1, 0, 16
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2
|
|
; GFX9-DS128-NEXT: ds_write_b128 v4, v[0:3]
|
|
; GFX9-DS128-NEXT: s_endpgm
|
|
; IR being compiled - vector load, sext to 64-bit lanes, vector store.
%load = load <2 x i16>, ptr addrspace(3) %in
|
|
%ext = sext <2 x i16> %load to <2 x i64>
|
|
store <2 x i64> %ext, ptr addrspace(3) %out
|
|
ret void
|
|
}
|
|
|
|
; Kernel under test - loads <4 x i16> from the addrspace(3) pointer %in,
; zero-extends every element to i64 and stores the <4 x i64> result through
; the addrspace(3) pointer %out.
; The assertions inside the body are autogenerated (see the NOTE at the top of
; this file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
; Expected shape of the output, as recorded by the assertions below -
;   * every GCN run reads the source with one ds_read_b64,
;   * SI and the NO-DS128 runs store with two ds_write2_b64,
;   * the DS128 runs store with two ds_write_b128,
;   * the EG run issues two LDS_READ_RET and eight LDS_WRITE operations.
define amdgpu_kernel void @local_zextload_v4i16_to_v4i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
|
|
; SI-LABEL: local_zextload_v4i16_to_v4i64:
|
|
; SI: ; %bb.0:
|
|
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: v_mov_b32_e32 v0, s1
|
|
; SI-NEXT: s_mov_b32 m0, -1
|
|
; SI-NEXT: ds_read_b64 v[0:1], v0
|
|
; SI-NEXT: v_mov_b32_e32 v3, 0
|
|
; SI-NEXT: v_mov_b32_e32 v5, v3
|
|
; SI-NEXT: v_mov_b32_e32 v7, v3
|
|
; SI-NEXT: v_mov_b32_e32 v9, v3
|
|
; SI-NEXT: v_mov_b32_e32 v10, s0
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
|
|
; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v0
|
|
; SI-NEXT: v_and_b32_e32 v6, 0xffff, v0
|
|
; SI-NEXT: v_and_b32_e32 v4, 0xffff, v1
|
|
; SI-NEXT: ds_write2_b64 v10, v[4:5], v[2:3] offset0:2 offset1:3
|
|
; SI-NEXT: ds_write2_b64 v10, v[6:7], v[8:9] offset1:1
|
|
; SI-NEXT: s_endpgm
|
|
;
|
|
; VI-NO-DS128-LABEL: local_zextload_v4i16_to_v4i64:
|
|
; VI-NO-DS128: ; %bb.0:
|
|
; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; VI-NO-DS128-NEXT: s_mov_b32 m0, -1
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v2, 0
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v4, v2
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v6, v2
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, s1
|
|
; VI-NO-DS128-NEXT: ds_read_b64 v[0:1], v0
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v9, s0
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v8, v2
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v7, 0xffff, v0
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v0
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v1
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v9, v[3:4], v[1:2] offset0:2 offset1:3
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v9, v[7:8], v[5:6] offset1:1
|
|
; VI-NO-DS128-NEXT: s_endpgm
|
|
;
|
|
; GFX9-NO-DS128-LABEL: local_zextload_v4i16_to_v4i64:
|
|
; GFX9-NO-DS128: ; %bb.0:
|
|
; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX9-NO-DS128-NEXT: s_mov_b32 s2, 0xffff
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v4, v2
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v6, v2
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s1
|
|
; GFX9-NO-DS128-NEXT: ds_read_b64 v[0:1], v0
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v9, s0
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v8, v2
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_sdwa v7, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_sdwa v5, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v1
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v0
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v9, v[3:4], v[5:6] offset0:2 offset1:3
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v9, v[1:2], v[7:8] offset1:1
|
|
; GFX9-NO-DS128-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: local_zextload_v4i16_to_v4i64:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: ALU 35, @41, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: MOV * T0.W, KC0[2].Z,
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T0.Y, OQAP,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T0.Z, OQAP,
|
|
; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
|
|
; EG-NEXT: MOV * T1.W, KC0[2].Y,
|
|
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: LSHR T0.W, T0.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: AND_INT T0.W, T0.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: LSHR T0.W, T0.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: MOV * T1.W, literal.y,
|
|
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.W,
|
|
; EG-NEXT: RETURN
|
|
;
|
|
; VI-DS128-LABEL: local_zextload_v4i16_to_v4i64:
|
|
; VI-DS128: ; %bb.0:
|
|
; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; VI-DS128-NEXT: s_mov_b32 m0, -1
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v1, 0
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v3, v1
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v5, v1
|
|
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v0, s1
|
|
; VI-DS128-NEXT: ds_read_b64 v[7:8], v0
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v9, s0
|
|
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v7
|
|
; VI-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v7
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v8
|
|
; VI-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v8
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v7, v1
|
|
; VI-DS128-NEXT: ds_write_b128 v9, v[0:3] offset:16
|
|
; VI-DS128-NEXT: ds_write_b128 v9, v[4:7]
|
|
; VI-DS128-NEXT: s_endpgm
|
|
;
|
|
; GFX9-DS128-LABEL: local_zextload_v4i16_to_v4i64:
|
|
; GFX9-DS128: ; %bb.0:
|
|
; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX9-DS128-NEXT: s_mov_b32 s2, 0xffff
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v1, 0
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v3, v1
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v5, v1
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v0, s1
|
|
; GFX9-DS128-NEXT: ds_read_b64 v[6:7], v0
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v8, s0
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v6
|
|
; GFX9-DS128-NEXT: v_and_b32_sdwa v6, s2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v7
|
|
; GFX9-DS128-NEXT: v_and_b32_sdwa v2, s2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v7, v1
|
|
; GFX9-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:16
|
|
; GFX9-DS128-NEXT: ds_write_b128 v8, v[4:7]
|
|
; GFX9-DS128-NEXT: s_endpgm
|
|
; IR being compiled - vector load, zext to 64-bit lanes, vector store.
%load = load <4 x i16>, ptr addrspace(3) %in
|
|
%ext = zext <4 x i16> %load to <4 x i64>
|
|
store <4 x i64> %ext, ptr addrspace(3) %out
|
|
ret void
|
|
}
|
|
|
|
; Kernel under test - loads <4 x i16> from the addrspace(3) pointer %in,
; sign-extends every element to i64 and stores the <4 x i64> result through
; the addrspace(3) pointer %out.
; The assertions inside the body are autogenerated (see the NOTE at the top of
; this file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
; Expected shape of the output, as recorded by the assertions below -
;   * every GCN run reads the source with one ds_read_b64,
;   * SI and the NO-DS128 runs store with two ds_write2_b64,
;   * the DS128 runs store with two ds_write_b128,
;   * the EG run issues two LDS_READ_RET and eight LDS_WRITE operations.
define amdgpu_kernel void @local_sextload_v4i16_to_v4i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
|
|
; SI-LABEL: local_sextload_v4i16_to_v4i64:
|
|
; SI: ; %bb.0:
|
|
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: v_mov_b32_e32 v0, s1
|
|
; SI-NEXT: s_mov_b32 m0, -1
|
|
; SI-NEXT: ds_read_b64 v[0:1], v0
|
|
; SI-NEXT: v_mov_b32_e32 v9, s0
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: v_mov_b32_e32 v3, v1
|
|
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0
|
|
; SI-NEXT: v_ashrrev_i32_e32 v2, 31, v1
|
|
; SI-NEXT: v_ashrrev_i32_e32 v1, 16, v1
|
|
; SI-NEXT: v_bfe_i32 v3, v3, 0, 16
|
|
; SI-NEXT: v_bfe_i32 v5, v0, 0, 16
|
|
; SI-NEXT: v_bfe_i32 v7, v4, 0, 16
|
|
; SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3
|
|
; SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5
|
|
; SI-NEXT: v_ashrrev_i32_e32 v8, 31, v7
|
|
; SI-NEXT: ds_write2_b64 v9, v[3:4], v[1:2] offset0:2 offset1:3
|
|
; SI-NEXT: ds_write2_b64 v9, v[5:6], v[7:8] offset1:1
|
|
; SI-NEXT: s_endpgm
|
|
;
|
|
; VI-NO-DS128-LABEL: local_sextload_v4i16_to_v4i64:
|
|
; VI-NO-DS128: ; %bb.0:
|
|
; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; VI-NO-DS128-NEXT: s_mov_b32 m0, -1
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, s1
|
|
; VI-NO-DS128-NEXT: ds_read_b64 v[0:1], v0
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v8, s0
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v1
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v0
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v4, v3, 0, 16
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v8, v[6:7], v[4:5] offset0:2 offset1:3
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v8, v[0:1], v[2:3] offset1:1
|
|
; VI-NO-DS128-NEXT: s_endpgm
|
|
;
|
|
; GFX9-NO-DS128-LABEL: local_sextload_v4i16_to_v4i64:
|
|
; GFX9-NO-DS128: ; %bb.0:
|
|
; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s1
|
|
; GFX9-NO-DS128-NEXT: ds_read_b64 v[0:1], v0
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v8, s0
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v1
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v0
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v3, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v8, v[6:7], v[4:5] offset0:2 offset1:3
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v8, v[0:1], v[2:3] offset1:1
|
|
; GFX9-NO-DS128-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: local_sextload_v4i16_to_v4i64:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: ALU 39, @42, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: MOV * T0.W, KC0[2].Z,
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T0.Y, OQAP,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T0.Z, OQAP,
|
|
; EG-NEXT: BFE_INT * T0.W, T0.Y, 0.0, literal.x,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T1.Z, PV.Z, 0.0, literal.x,
|
|
; EG-NEXT: ASHR T1.W, PV.W, literal.y,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.z,
|
|
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
|
|
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T2.W, T1.W,
|
|
; EG-NEXT: ASHR T1.W, T1.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 31(4.344025e-44), 20(2.802597e-44)
|
|
; EG-NEXT: LDS_WRITE * T2.W, T1.W,
|
|
; EG-NEXT: MOV * T1.W, KC0[2].Y,
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: ASHR T0.W, T0.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: ASHR T0.W, T0.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 31(4.344025e-44), 12(1.681558e-44)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.Z,
|
|
; EG-NEXT: ASHR T0.W, T0.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: ASHR T0.W, T0.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 31(4.344025e-44), 28(3.923636e-44)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: RETURN
|
|
;
|
|
; VI-DS128-LABEL: local_sextload_v4i16_to_v4i64:
|
|
; VI-DS128: ; %bb.0:
|
|
; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; VI-DS128-NEXT: s_mov_b32 m0, -1
|
|
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v0, s1
|
|
; VI-DS128-NEXT: ds_read_b64 v[0:1], v0
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v8, s0
|
|
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v3, v1
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v0
|
|
; VI-DS128-NEXT: v_bfe_i32 v4, v3, 0, 16
|
|
; VI-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16
|
|
; VI-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16
|
|
; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0
|
|
; VI-DS128-NEXT: ds_write_b128 v8, v[4:7] offset:16
|
|
; VI-DS128-NEXT: ds_write_b128 v8, v[0:3]
|
|
; VI-DS128-NEXT: s_endpgm
|
|
;
|
|
; GFX9-DS128-LABEL: local_sextload_v4i16_to_v4i64:
|
|
; GFX9-DS128: ; %bb.0:
|
|
; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v0, s1
|
|
; GFX9-DS128-NEXT: ds_read_b64 v[0:1], v0
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v8, s0
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v3, v1
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v0
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v4, v3, 0, 16
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2
|
|
; GFX9-DS128-NEXT: ds_write_b128 v8, v[4:7] offset:16
|
|
; GFX9-DS128-NEXT: ds_write_b128 v8, v[0:3]
|
|
; GFX9-DS128-NEXT: s_endpgm
|
|
; IR being compiled - vector load, sext to 64-bit lanes, vector store.
%load = load <4 x i16>, ptr addrspace(3) %in
|
|
%ext = sext <4 x i16> %load to <4 x i64>
|
|
store <4 x i64> %ext, ptr addrspace(3) %out
|
|
ret void
|
|
}
|
|
|
|
; Kernel under test - loads <8 x i16> from the addrspace(3) pointer %in,
; zero-extends every element to i64 and stores the <8 x i64> result through
; the addrspace(3) pointer %out.
; The assertions inside the body are autogenerated (see the NOTE at the top of
; this file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
; Expected shape of the output, as recorded by the assertions below -
;   * SI and the NO-DS128 runs read with one ds_read2_b64 and store with
;     four ds_write2_b64,
;   * the DS128 runs read with one ds_read_b128 and store with four
;     ds_write_b128,
;   * the EG run issues four LDS_READ_RET and sixteen LDS_WRITE operations.
define amdgpu_kernel void @local_zextload_v8i16_to_v8i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
|
|
; SI-LABEL: local_zextload_v8i16_to_v8i64:
|
|
; SI: ; %bb.0:
|
|
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: v_mov_b32_e32 v0, s1
|
|
; SI-NEXT: s_mov_b32 m0, -1
|
|
; SI-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
|
|
; SI-NEXT: v_mov_b32_e32 v5, 0
|
|
; SI-NEXT: v_mov_b32_e32 v7, v5
|
|
; SI-NEXT: v_mov_b32_e32 v9, v5
|
|
; SI-NEXT: v_mov_b32_e32 v11, v5
|
|
; SI-NEXT: v_mov_b32_e32 v13, v5
|
|
; SI-NEXT: v_mov_b32_e32 v15, v5
|
|
; SI-NEXT: v_mov_b32_e32 v17, v5
|
|
; SI-NEXT: v_mov_b32_e32 v19, v5
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v1
|
|
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v3
|
|
; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v2
|
|
; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0
|
|
; SI-NEXT: v_and_b32_e32 v14, 0xffff, v0
|
|
; SI-NEXT: v_and_b32_e32 v12, 0xffff, v1
|
|
; SI-NEXT: v_and_b32_e32 v10, 0xffff, v2
|
|
; SI-NEXT: v_and_b32_e32 v8, 0xffff, v3
|
|
; SI-NEXT: v_mov_b32_e32 v0, s0
|
|
; SI-NEXT: ds_write2_b64 v0, v[8:9], v[6:7] offset0:6 offset1:7
|
|
; SI-NEXT: ds_write2_b64 v0, v[12:13], v[4:5] offset0:2 offset1:3
|
|
; SI-NEXT: ds_write2_b64 v0, v[10:11], v[16:17] offset0:4 offset1:5
|
|
; SI-NEXT: ds_write2_b64 v0, v[14:15], v[18:19] offset1:1
|
|
; SI-NEXT: s_endpgm
|
|
;
|
|
; VI-NO-DS128-LABEL: local_zextload_v8i16_to_v8i64:
|
|
; VI-NO-DS128: ; %bb.0:
|
|
; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; VI-NO-DS128-NEXT: s_mov_b32 m0, -1
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, s1
|
|
; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v11, s0
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v3
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v3
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v3, 0
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v9, 0xffff, v2
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v2
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v10, v3
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[9:10], v[2:3] offset0:4 offset1:5
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v9, v3
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v7, v3
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v5, 0xffff, v1
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[6:7], v[8:9] offset0:6 offset1:7
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v2, v3
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v6, v3
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v0
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[5:6], v[1:2] offset0:2 offset1:3
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v1, v3
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v5, v3
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[4:5], v[0:1] offset1:1
|
|
; VI-NO-DS128-NEXT: s_endpgm
|
|
;
|
|
; GFX9-NO-DS128-LABEL: local_zextload_v8i16_to_v8i64:
|
|
; GFX9-NO-DS128: ; %bb.0:
|
|
; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v12, 0
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v8, v12
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v10, v12
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s1
|
|
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
|
|
; GFX9-NO-DS128-NEXT: s_mov_b32 s1, 0xffff
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v13, s0
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_sdwa v7, s1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v11, 0xffff, v3
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_sdwa v6, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v3, v12
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v13, v[11:12], v[7:8] offset0:6 offset1:7
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v7, v12
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_sdwa v5, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v9, 0xffff, v1
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v13, v[2:3], v[6:7] offset0:4 offset1:5
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v6, v12
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_sdwa v4, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v1, v12
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v13, v[9:10], v[5:6] offset0:2 offset1:3
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v5, v12
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v13, v[0:1], v[4:5] offset1:1
|
|
; GFX9-NO-DS128-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: local_zextload_v8i16_to_v8i64:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: ALU 71, @43, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T0.Y, OQAP,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T0.Z, OQAP,
|
|
; EG-NEXT: MOV * T0.W, KC0[2].Z,
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T0.W, OQAP,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T1.W
|
|
; EG-NEXT: MOV T1.Y, OQAP,
|
|
; EG-NEXT: AND_INT T1.W, T0.W, literal.x,
|
|
; EG-NEXT: MOV * T2.W, KC0[2].Y,
|
|
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T2.W, T1.W,
|
|
; EG-NEXT: LSHR T0.W, T0.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: AND_INT T0.W, T0.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: LSHR T0.W, T0.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 32(4.484155e-44)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: LSHR T0.W, T0.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 40(5.605194e-44)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: AND_INT T0.W, T1.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 48(6.726233e-44)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: LSHR T0.W, T1.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 56(7.847271e-44)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: MOV * T1.W, literal.y,
|
|
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 36(5.044674e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 44(6.165713e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 52(7.286752e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 60(8.407791e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.W,
|
|
; EG-NEXT: RETURN
|
|
;
|
|
; VI-DS128-LABEL: local_zextload_v8i16_to_v8i64:
|
|
; VI-DS128: ; %bb.0:
|
|
; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; VI-DS128-NEXT: s_mov_b32 m0, -1
|
|
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v0, s1
|
|
; VI-DS128-NEXT: ds_read_b128 v[0:3], v0
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v14, s0
|
|
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v1
|
|
; VI-DS128-NEXT: v_and_b32_e32 v7, 0xffff, v1
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v1, 0
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v12, 16, v2
|
|
; VI-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v2
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v11, v1
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v13, v1
|
|
; VI-DS128-NEXT: ds_write_b128 v14, v[10:13] offset:32
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v8, v1
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v10, v1
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v0
|
|
; VI-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v0
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v3
|
|
; VI-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v3
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v3, v1
|
|
; VI-DS128-NEXT: ds_write_b128 v14, v[7:10] offset:16
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v5, v1
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v7, v1
|
|
; VI-DS128-NEXT: ds_write_b128 v14, v[0:3] offset:48
|
|
; VI-DS128-NEXT: ds_write_b128 v14, v[4:7]
|
|
; VI-DS128-NEXT: s_endpgm
|
|
;
|
|
; GFX9-DS128-LABEL: local_zextload_v8i16_to_v8i64:
|
|
; GFX9-DS128: ; %bb.0:
|
|
; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v11, 0
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v13, v11
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v8, v11
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v5, v11
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v0, s1
|
|
; GFX9-DS128-NEXT: ds_read_b128 v[0:3], v0
|
|
; GFX9-DS128-NEXT: s_mov_b32 s1, 0xffff
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v14, s0
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v3
|
|
; GFX9-DS128-NEXT: v_and_b32_sdwa v12, s1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v7, 0xffff, v1
|
|
; GFX9-DS128-NEXT: v_and_b32_sdwa v9, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
|
; GFX9-DS128-NEXT: ds_write_b128 v14, v[10:13] offset:48
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v10, v11
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v0
|
|
; GFX9-DS128-NEXT: v_and_b32_sdwa v6, s1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v2
|
|
; GFX9-DS128-NEXT: v_and_b32_sdwa v2, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v1, v11
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v3, v11
|
|
; GFX9-DS128-NEXT: ds_write_b128 v14, v[7:10] offset:16
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v7, v11
|
|
; GFX9-DS128-NEXT: ds_write_b128 v14, v[0:3] offset:32
|
|
; GFX9-DS128-NEXT: ds_write_b128 v14, v[4:7]
|
|
; GFX9-DS128-NEXT: s_endpgm
|
|
; IR being compiled - vector load, zext to 64-bit lanes, vector store.
%load = load <8 x i16>, ptr addrspace(3) %in
|
|
%ext = zext <8 x i16> %load to <8 x i64>
|
|
store <8 x i64> %ext, ptr addrspace(3) %out
|
|
ret void
|
|
}
|
|
|
|
; Loads an <8 x i16> vector through the addrspace(3) pointer %in, sign-extends
; every element to i64 and stores the resulting <8 x i64> through the
; addrspace(3) pointer %out.
; The check lines inside the body are produced by utils/update_llc_test_checks.py
; (see the first line of this file); regenerate them with that script instead of
; editing them by hand.
; In the expected output the two -DS128 configurations use ds_read_b128 and
; ds_write_b128, the SI and -NO-DS128 configurations use ds_read2_b64 and
; ds_write2_b64, and the EG configuration uses LDS_READ_RET and LDS_WRITE.
define amdgpu_kernel void @local_sextload_v8i16_to_v8i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
|
|
; SI-LABEL: local_sextload_v8i16_to_v8i64:
|
|
; SI: ; %bb.0:
|
|
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: v_mov_b32_e32 v0, s1
|
|
; SI-NEXT: s_mov_b32 m0, -1
|
|
; SI-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
|
|
; SI-NEXT: v_mov_b32_e32 v16, s0
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: v_mov_b32_e32 v9, v3
|
|
; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v2
|
|
; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0
|
|
; SI-NEXT: v_ashrrev_i32_e32 v5, 31, v1
|
|
; SI-NEXT: v_ashrrev_i32_e32 v4, 16, v1
|
|
; SI-NEXT: v_ashrrev_i32_e32 v7, 31, v3
|
|
; SI-NEXT: v_ashrrev_i32_e32 v6, 16, v3
|
|
; SI-NEXT: v_bfe_i32 v0, v0, 0, 16
|
|
; SI-NEXT: v_bfe_i32 v8, v1, 0, 16
|
|
; SI-NEXT: v_bfe_i32 v2, v2, 0, 16
|
|
; SI-NEXT: v_bfe_i32 v10, v9, 0, 16
|
|
; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
|
|
; SI-NEXT: v_bfe_i32 v12, v12, 0, 16
|
|
; SI-NEXT: v_ashrrev_i32_e32 v9, 31, v8
|
|
; SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
|
|
; SI-NEXT: v_bfe_i32 v14, v11, 0, 16
|
|
; SI-NEXT: v_ashrrev_i32_e32 v11, 31, v10
|
|
; SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12
|
|
; SI-NEXT: v_ashrrev_i32_e32 v15, 31, v14
|
|
; SI-NEXT: ds_write2_b64 v16, v[10:11], v[6:7] offset0:6 offset1:7
|
|
; SI-NEXT: ds_write2_b64 v16, v[8:9], v[4:5] offset0:2 offset1:3
|
|
; SI-NEXT: ds_write2_b64 v16, v[2:3], v[14:15] offset0:4 offset1:5
|
|
; SI-NEXT: ds_write2_b64 v16, v[0:1], v[12:13] offset1:1
|
|
; SI-NEXT: s_endpgm
|
|
;
|
|
; VI-NO-DS128-LABEL: local_sextload_v8i16_to_v8i64:
|
|
; VI-NO-DS128: ; %bb.0:
|
|
; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; VI-NO-DS128-NEXT: s_mov_b32 m0, -1
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, s1
|
|
; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v16, s0
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v2
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v10, 16, v3
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v14, v2, 0, 16
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v2, v3
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v0
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v1
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v10, v10, 0, 16
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v4, v4, 0, 16
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v6, v5, 0, 16
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v8, v7, 0, 16
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 31, v10
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v12, v0, 0, 16
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v0, v1, 0, 16
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 31, v12
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v16, v[2:3], v[10:11] offset0:6 offset1:7
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v16, v[14:15], v[8:9] offset0:4 offset1:5
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v16, v[0:1], v[6:7] offset0:2 offset1:3
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v16, v[12:13], v[4:5] offset1:1
|
|
; VI-NO-DS128-NEXT: s_endpgm
|
|
;
|
|
; GFX9-NO-DS128-LABEL: local_sextload_v8i16_to_v8i64:
|
|
; GFX9-NO-DS128: ; %bb.0:
|
|
; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s1
|
|
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v16, s0
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v2
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v3
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v2, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v2, v3
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v0
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v1
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v10, v9, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v4, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v6, v5, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v8, v7, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 31, v10
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v12, v0, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v1, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 31, v12
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v16, v[2:3], v[10:11] offset0:6 offset1:7
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v16, v[14:15], v[8:9] offset0:4 offset1:5
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v16, v[0:1], v[6:7] offset0:2 offset1:3
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v16, v[12:13], v[4:5] offset1:1
|
|
; GFX9-NO-DS128-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: local_sextload_v8i16_to_v8i64:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: ALU 80, @44, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T0.Y, OQAP,
|
|
; EG-NEXT: MOV * T0.W, KC0[2].Z,
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T0.Z, OQAP,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV * T0.W, OQAP,
|
|
; EG-NEXT: BFE_INT T1.W, T0.Z, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 12(1.681558e-44)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T2.W
|
|
; EG-NEXT: MOV T1.Y, OQAP,
|
|
; EG-NEXT: BFE_INT T1.Z, T0.W, 0.0, literal.x,
|
|
; EG-NEXT: ASHR T2.W, T1.W, literal.y, BS:VEC_120/SCL_212
|
|
; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.z,
|
|
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
|
|
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T3.W, T2.W,
|
|
; EG-NEXT: BFE_INT T2.Z, T0.Y, 0.0, literal.x,
|
|
; EG-NEXT: ASHR T2.W, T1.Z, literal.y,
|
|
; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.z,
|
|
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
|
|
; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T3.W, T2.W,
|
|
; EG-NEXT: BFE_INT T3.Z, T1.Y, 0.0, literal.x,
|
|
; EG-NEXT: ASHR T2.W, T2.Z, literal.y,
|
|
; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.z,
|
|
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
|
|
; EG-NEXT: 36(5.044674e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T3.W, T2.W,
|
|
; EG-NEXT: ASHR T2.W, T3.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 31(4.344025e-44), 52(7.286752e-44)
|
|
; EG-NEXT: LDS_WRITE * T3.W, T2.W,
|
|
; EG-NEXT: MOV * T2.W, KC0[2].Y,
|
|
; EG-NEXT: LDS_WRITE * T2.W, T1.W,
|
|
; EG-NEXT: ASHR T1.W, T0.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44)
|
|
; EG-NEXT: LDS_WRITE * T2.W, T1.W,
|
|
; EG-NEXT: ASHR T1.W, T0.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 31(4.344025e-44), 12(1.681558e-44)
|
|
; EG-NEXT: LDS_WRITE * T2.W, T1.W,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T1.Z,
|
|
; EG-NEXT: ASHR T1.W, T0.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
|
|
; EG-NEXT: LDS_WRITE * T2.W, T1.W,
|
|
; EG-NEXT: ASHR T0.W, T0.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 31(4.344025e-44), 28(3.923636e-44)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T2.Z,
|
|
; EG-NEXT: ASHR T0.W, T0.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 40(5.605194e-44)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: ASHR T0.W, T0.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 31(4.344025e-44), 44(6.165713e-44)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T3.Z,
|
|
; EG-NEXT: ASHR T0.W, T1.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 56(7.847271e-44)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: ASHR T0.W, T1.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 31(4.344025e-44), 60(8.407791e-44)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: RETURN
|
|
;
|
|
; VI-DS128-LABEL: local_sextload_v8i16_to_v8i64:
|
|
; VI-DS128: ; %bb.0:
|
|
; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; VI-DS128-NEXT: s_mov_b32 m0, -1
|
|
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v0, s1
|
|
; VI-DS128-NEXT: ds_read_b128 v[0:3], v0
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v16, s0
|
|
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-DS128-NEXT: v_bfe_i32 v4, v0, 0, 16
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
|
; VI-DS128-NEXT: v_bfe_i32 v6, v0, 0, 16
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v2
|
|
; VI-DS128-NEXT: v_bfe_i32 v12, v2, 0, 16
|
|
; VI-DS128-NEXT: v_bfe_i32 v14, v0, 0, 16
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v0, v3
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v3
|
|
; VI-DS128-NEXT: v_bfe_i32 v8, v1, 0, 16
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
|
; VI-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16
|
|
; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16
|
|
; VI-DS128-NEXT: v_bfe_i32 v10, v1, 0, 16
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v11, 31, v10
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v13, 31, v12
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14
|
|
; VI-DS128-NEXT: ds_write_b128 v16, v[0:3] offset:48
|
|
; VI-DS128-NEXT: ds_write_b128 v16, v[12:15] offset:32
|
|
; VI-DS128-NEXT: ds_write_b128 v16, v[8:11] offset:16
|
|
; VI-DS128-NEXT: ds_write_b128 v16, v[4:7]
|
|
; VI-DS128-NEXT: s_endpgm
|
|
;
|
|
; GFX9-DS128-LABEL: local_sextload_v8i16_to_v8i64:
|
|
; GFX9-DS128: ; %bb.0:
|
|
; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v0, s1
|
|
; GFX9-DS128-NEXT: ds_read_b128 v[0:3], v0
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v16, s0
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v4, v0, 0, 16
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v6, v0, 0, 16
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v2
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v12, v2, 0, 16
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v14, v0, 0, 16
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v0, v3
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v3
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v8, v1, 0, 16
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v10, v1, 0, 16
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v11, 31, v10
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v13, 31, v12
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14
|
|
; GFX9-DS128-NEXT: ds_write_b128 v16, v[0:3] offset:48
|
|
; GFX9-DS128-NEXT: ds_write_b128 v16, v[12:15] offset:32
|
|
; GFX9-DS128-NEXT: ds_write_b128 v16, v[8:11] offset:16
|
|
; GFX9-DS128-NEXT: ds_write_b128 v16, v[4:7]
|
|
; GFX9-DS128-NEXT: s_endpgm
|
|
%load = load <8 x i16>, ptr addrspace(3) %in
|
|
%ext = sext <8 x i16> %load to <8 x i64>
|
|
store <8 x i64> %ext, ptr addrspace(3) %out
|
|
ret void
|
|
}
|
|
|
|
; Loads a <16 x i16> vector through the addrspace(3) pointer %in, zero-extends
; every element to i64 and stores the resulting <16 x i64> through the
; addrspace(3) pointer %out.
; The check lines inside the body are produced by utils/update_llc_test_checks.py
; (see the first line of this file); regenerate them with that script instead of
; editing them by hand.
; In the expected output the two -DS128 configurations use ds_read_b128 and
; ds_write_b128, the SI and -NO-DS128 configurations use ds_read2_b64 and
; ds_write2_b64, and the EG configuration uses LDS_READ_RET and LDS_WRITE.
define amdgpu_kernel void @local_zextload_v16i16_to_v16i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
|
|
; SI-LABEL: local_zextload_v16i16_to_v16i64:
|
|
; SI: ; %bb.0:
|
|
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: v_mov_b32_e32 v4, s1
|
|
; SI-NEXT: s_mov_b32 m0, -1
|
|
; SI-NEXT: ds_read2_b64 v[0:3], v4 offset0:2 offset1:3
|
|
; SI-NEXT: v_mov_b32_e32 v9, 0
|
|
; SI-NEXT: ds_read2_b64 v[4:7], v4 offset1:1
|
|
; SI-NEXT: v_mov_b32_e32 v11, v9
|
|
; SI-NEXT: v_mov_b32_e32 v13, v9
|
|
; SI-NEXT: v_mov_b32_e32 v15, v9
|
|
; SI-NEXT: v_mov_b32_e32 v17, v9
|
|
; SI-NEXT: v_mov_b32_e32 v20, s0
|
|
; SI-NEXT: s_waitcnt lgkmcnt(1)
|
|
; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v1
|
|
; SI-NEXT: v_and_b32_e32 v16, 0xffff, v1
|
|
; SI-NEXT: ds_write2_b64 v20, v[16:17], v[14:15] offset0:10 offset1:11
|
|
; SI-NEXT: v_mov_b32_e32 v16, v9
|
|
; SI-NEXT: s_waitcnt lgkmcnt(1)
|
|
; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v5
|
|
; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v7
|
|
; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v3
|
|
; SI-NEXT: v_and_b32_e32 v14, 0xffff, v3
|
|
; SI-NEXT: ds_write2_b64 v20, v[14:15], v[12:13] offset0:14 offset1:15
|
|
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6
|
|
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4
|
|
; SI-NEXT: v_and_b32_e32 v15, 0xffff, v7
|
|
; SI-NEXT: ds_write2_b64 v20, v[15:16], v[10:11] offset0:6 offset1:7
|
|
; SI-NEXT: v_and_b32_e32 v7, 0xffff, v4
|
|
; SI-NEXT: v_and_b32_e32 v4, 0xffff, v5
|
|
; SI-NEXT: v_and_b32_e32 v10, 0xffff, v6
|
|
; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v0
|
|
; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2
|
|
; SI-NEXT: v_and_b32_e32 v16, 0xffff, v2
|
|
; SI-NEXT: v_and_b32_e32 v18, 0xffff, v0
|
|
; SI-NEXT: v_mov_b32_e32 v5, v9
|
|
; SI-NEXT: ds_write2_b64 v20, v[4:5], v[8:9] offset0:2 offset1:3
|
|
; SI-NEXT: v_mov_b32_e32 v19, v9
|
|
; SI-NEXT: v_mov_b32_e32 v8, v9
|
|
; SI-NEXT: v_mov_b32_e32 v15, v9
|
|
; SI-NEXT: v_mov_b32_e32 v2, v9
|
|
; SI-NEXT: v_mov_b32_e32 v4, v9
|
|
; SI-NEXT: ds_write2_b64 v20, v[18:19], v[12:13] offset0:8 offset1:9
|
|
; SI-NEXT: ds_write2_b64 v20, v[16:17], v[14:15] offset0:12 offset1:13
|
|
; SI-NEXT: ds_write2_b64 v20, v[10:11], v[1:2] offset0:4 offset1:5
|
|
; SI-NEXT: ds_write2_b64 v20, v[7:8], v[3:4] offset1:1
|
|
; SI-NEXT: s_endpgm
|
|
;
|
|
; VI-NO-DS128-LABEL: local_zextload_v16i16_to_v16i64:
|
|
; VI-NO-DS128: ; %bb.0:
|
|
; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; VI-NO-DS128-NEXT: s_mov_b32 m0, -1
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v8, 0
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v10, v8
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v13, v8
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v4, s1
|
|
; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v4 offset1:1
|
|
; VI-NO-DS128-NEXT: ds_read2_b64 v[4:7], v4 offset0:2 offset1:3
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v14, s0
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v0
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v5
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v5
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[12:13], v[9:10] offset0:10 offset1:11
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v5, 0xffff, v6
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v6
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v6, v8
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[5:6], v[9:10] offset0:12 offset1:13
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v5, 0xffff, v7
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v7
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[5:6], v[9:10] offset0:14 offset1:15
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v3
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v4
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v4
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v4, v8
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v2
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[12:13], v[7:8] offset0:8 offset1:9
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[3:4], v[9:10] offset0:6 offset1:7
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v3, v8
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v7, v8
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v1
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[2:3], v[6:7] offset0:4 offset1:5
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v2, v8
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v6, v8
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[1:2], v[5:6] offset0:2 offset1:3
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v1, v8
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v12, v8
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v14, v[0:1], v[11:12] offset1:1
|
|
; VI-NO-DS128-NEXT: s_endpgm
|
|
;
|
|
; GFX9-NO-DS128-LABEL: local_zextload_v16i16_to_v16i64:
|
|
; GFX9-NO-DS128: ; %bb.0:
|
|
; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v8, 0
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v10, v8
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v12, v8
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v14, v8
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v4, s1
|
|
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v4 offset1:1
|
|
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[4:7], v4 offset0:2 offset1:3
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v15, s0
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v0
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v11, 0xffff, v5
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v5
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[11:12], v[9:10] offset0:10 offset1:11
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v5, 0xffff, v6
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v6
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v6, v8
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[5:6], v[9:10] offset0:12 offset1:13
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v5, 0xffff, v7
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v7
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[5:6], v[9:10] offset0:14 offset1:15
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v4
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v4
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v11, v8
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v3
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[10:11], v[7:8] offset0:8 offset1:9
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v4, v8
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v10, v8
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v2
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[3:4], v[9:10] offset0:6 offset1:7
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v3, v8
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v7, v8
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v1
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v1
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[2:3], v[6:7] offset0:4 offset1:5
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v2, v8
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v6, v8
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[1:2], v[5:6] offset0:2 offset1:3
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v1, v8
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[0:1], v[13:14] offset1:1
|
|
; GFX9-NO-DS128-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: local_zextload_v16i16_to_v16i64:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: ALU 100, @45, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T0.Y, OQAP,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T0.Z, OQAP,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T0.W, OQAP,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T1.W
|
|
; EG-NEXT: MOV T1.Y, OQAP,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T1.W
|
|
; EG-NEXT: MOV T1.Z, OQAP,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T1.W
|
|
; EG-NEXT: MOV T1.W, OQAP,
|
|
; EG-NEXT: MOV * T2.W, KC0[2].Z,
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T2.W
|
|
; EG-NEXT: MOV T2.Y, OQAP,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T2.W
|
|
; EG-NEXT: MOV T2.Z, OQAP,
|
|
; EG-NEXT: LSHR T2.W, T2.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44)
|
|
; EG-NEXT: LDS_WRITE * T3.W, T2.W,
|
|
; EG-NEXT: AND_INT T2.W, T2.Y, literal.x,
|
|
; EG-NEXT: MOV * T3.W, KC0[2].Y,
|
|
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T3.W, T2.W,
|
|
; EG-NEXT: LSHR T2.W, T2.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
|
|
; EG-NEXT: LDS_WRITE * T3.W, T2.W,
|
|
; EG-NEXT: AND_INT T2.W, T2.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
|
|
; EG-NEXT: LDS_WRITE * T3.W, T2.W,
|
|
; EG-NEXT: LSHR T2.W, T1.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 40(5.605194e-44)
|
|
; EG-NEXT: LDS_WRITE * T3.W, T2.W,
|
|
; EG-NEXT: AND_INT T1.W, T1.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 32(4.484155e-44)
|
|
; EG-NEXT: LDS_WRITE * T2.W, T1.W,
|
|
; EG-NEXT: LSHR T1.W, T1.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 56(7.847271e-44)
|
|
; EG-NEXT: LDS_WRITE * T2.W, T1.W,
|
|
; EG-NEXT: AND_INT T1.W, T1.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 48(6.726233e-44)
|
|
; EG-NEXT: LDS_WRITE * T2.W, T1.W,
|
|
; EG-NEXT: LSHR T1.W, T1.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 72(1.008935e-43)
|
|
; EG-NEXT: LDS_WRITE * T2.W, T1.W,
|
|
; EG-NEXT: AND_INT T1.W, T1.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 64(8.968310e-44)
|
|
; EG-NEXT: LDS_WRITE * T2.W, T1.W,
|
|
; EG-NEXT: LSHR T1.W, T0.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 88(1.233143e-43)
|
|
; EG-NEXT: LDS_WRITE * T2.W, T1.W,
|
|
; EG-NEXT: AND_INT T0.W, T0.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 80(1.121039e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: LSHR T0.W, T0.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 104(1.457350e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: AND_INT T0.W, T0.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 96(1.345247e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: LSHR T0.W, T0.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 120(1.681558e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 112(1.569454e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: MOV * T1.W, literal.y,
|
|
; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
|
|
; EG-NEXT: ALU 42, @46, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 44(6.165713e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 36(5.044674e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 60(8.407791e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 52(7.286752e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 76(1.064987e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 68(9.528830e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 92(1.289195e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 84(1.177091e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 108(1.513402e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 100(1.401298e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 124(1.737610e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 116(1.625506e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.W,
|
|
; EG-NEXT: RETURN
|
|
;
|
|
; VI-DS128-LABEL: local_zextload_v16i16_to_v16i64:
|
|
; VI-DS128: ; %bb.0:
|
|
; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; VI-DS128-NEXT: s_mov_b32 m0, -1
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v26, 0
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v22, v26
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v24, v26
|
|
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v5, s1
|
|
; VI-DS128-NEXT: ds_read_b128 v[0:3], v5
|
|
; VI-DS128-NEXT: ds_read_b128 v[13:16], v5 offset:16
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v11, v26
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v19, v26
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v8, v26
|
|
; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v12, 16, v2
|
|
; VI-DS128-NEXT: v_and_b32_e32 v10, 0xffff, v2
|
|
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v23, 16, v13
|
|
; VI-DS128-NEXT: v_and_b32_e32 v21, 0xffff, v13
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v27, 16, v14
|
|
; VI-DS128-NEXT: v_and_b32_e32 v25, 0xffff, v14
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v14, s0
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v13, v26
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v1
|
|
; VI-DS128-NEXT: v_and_b32_e32 v7, 0xffff, v1
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v20, 16, v16
|
|
; VI-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v16
|
|
; VI-DS128-NEXT: ds_write_b128 v14, v[21:24] offset:64
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v21, v26
|
|
; VI-DS128-NEXT: ds_write_b128 v14, v[10:13] offset:32
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v10, v26
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v0
|
|
; VI-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v0
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v3
|
|
; VI-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v3
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v15
|
|
; VI-DS128-NEXT: v_and_b32_e32 v15, 0xffff, v15
|
|
; VI-DS128-NEXT: ds_write_b128 v14, v[18:21] offset:112
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v16, v26
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v18, v26
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v1, v26
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v3, v26
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v28, v26
|
|
; VI-DS128-NEXT: ds_write_b128 v14, v[7:10] offset:16
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v5, v26
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v7, v26
|
|
; VI-DS128-NEXT: ds_write_b128 v14, v[15:18] offset:96
|
|
; VI-DS128-NEXT: ds_write_b128 v14, v[0:3] offset:48
|
|
; VI-DS128-NEXT: ds_write_b128 v14, v[25:28] offset:80
|
|
; VI-DS128-NEXT: ds_write_b128 v14, v[4:7]
|
|
; VI-DS128-NEXT: s_endpgm
|
|
;
|
|
; GFX9-DS128-LABEL: local_zextload_v16i16_to_v16i64:
|
|
; GFX9-DS128: ; %bb.0:
|
|
; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v25, 0
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v21, v25
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v23, v25
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v18, v25
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v4, s1
|
|
; GFX9-DS128-NEXT: ds_read_b128 v[0:3], v4
|
|
; GFX9-DS128-NEXT: ds_read_b128 v[4:7], v4 offset:16
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v28, s0
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v15, v25
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v12, v25
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v2
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v22, 16, v7
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v7
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v6
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v17, 0xffff, v6
|
|
; GFX9-DS128-NEXT: ds_write_b128 v28, v[20:23] offset:112
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v20, v25
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v2
|
|
; GFX9-DS128-NEXT: ds_write_b128 v28, v[17:20] offset:96
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v17, v25
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v1
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v11, 0xffff, v1
|
|
; GFX9-DS128-NEXT: ds_write_b128 v28, v[14:17] offset:32
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v14, v25
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v10, 16, v0
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v0
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v3
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v3
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v4
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v4
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v26, 16, v5
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v24, 0xffff, v5
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v5, v25
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v7, v25
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v1, v25
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v3, v25
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v27, v25
|
|
; GFX9-DS128-NEXT: ds_write_b128 v28, v[11:14] offset:16
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v9, v25
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v11, v25
|
|
; GFX9-DS128-NEXT: ds_write_b128 v28, v[4:7] offset:64
|
|
; GFX9-DS128-NEXT: ds_write_b128 v28, v[0:3] offset:48
|
|
; GFX9-DS128-NEXT: ds_write_b128 v28, v[24:27] offset:80
|
|
; GFX9-DS128-NEXT: ds_write_b128 v28, v[8:11]
|
|
; GFX9-DS128-NEXT: s_endpgm
|
|
%load = load <16 x i16>, ptr addrspace(3) %in
|
|
%ext = zext <16 x i16> %load to <16 x i64>
|
|
store <16 x i64> %ext, ptr addrspace(3) %out
|
|
ret void
|
|
}
|
|
|
|
; Sign-extending local (addrspace(3)) load: reads <16 x i16> from %in, widens
; every element to i64 with sext, and stores the <16 x i64> result to %out.
; The assertion lines inside the function are autogenerated (see the NOTE on
; the first line of this file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_kernel void @local_sextload_v16i16_to_v16i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
; SI-LABEL: local_sextload_v16i16_to_v16i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v4, s1
; SI-NEXT: s_mov_b32 m0, -1
; SI-NEXT: ds_read2_b64 v[0:3], v4 offset0:2 offset1:3
; SI-NEXT: ds_read2_b64 v[4:7], v4 offset1:1
; SI-NEXT: v_mov_b32_e32 v18, s0
; SI-NEXT: s_waitcnt lgkmcnt(1)
; SI-NEXT: v_mov_b32_e32 v12, v3
; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v2
; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v14, v7
; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v6
; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v4
; SI-NEXT: v_ashrrev_i32_e32 v9, 31, v5
; SI-NEXT: v_ashrrev_i32_e32 v8, 16, v5
; SI-NEXT: v_ashrrev_i32_e32 v11, 31, v3
; SI-NEXT: v_ashrrev_i32_e32 v10, 16, v3
; SI-NEXT: v_bfe_i32 v12, v12, 0, 16
; SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12
; SI-NEXT: ds_write2_b64 v18, v[12:13], v[10:11] offset0:14 offset1:15
; SI-NEXT: v_ashrrev_i32_e32 v11, 31, v1
; SI-NEXT: v_ashrrev_i32_e32 v10, 16, v1
; SI-NEXT: v_bfe_i32 v12, v1, 0, 16
; SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12
; SI-NEXT: ds_write2_b64 v18, v[12:13], v[10:11] offset0:10 offset1:11
; SI-NEXT: v_ashrrev_i32_e32 v11, 31, v7
; SI-NEXT: v_ashrrev_i32_e32 v10, 16, v7
; SI-NEXT: v_bfe_i32 v12, v14, 0, 16
; SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12
; SI-NEXT: ds_write2_b64 v18, v[12:13], v[10:11] offset0:6 offset1:7
; SI-NEXT: v_bfe_i32 v1, v4, 0, 16
; SI-NEXT: v_bfe_i32 v3, v5, 0, 16
; SI-NEXT: v_bfe_i32 v5, v6, 0, 16
; SI-NEXT: v_bfe_i32 v10, v0, 0, 16
; SI-NEXT: v_bfe_i32 v7, v2, 0, 16
; SI-NEXT: v_bfe_i32 v12, v19, 0, 16
; SI-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; SI-NEXT: v_bfe_i32 v14, v17, 0, 16
; SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5
; SI-NEXT: v_bfe_i32 v16, v16, 0, 16
; SI-NEXT: v_ashrrev_i32_e32 v11, 31, v10
; SI-NEXT: ds_write2_b64 v18, v[3:4], v[8:9] offset0:2 offset1:3
; SI-NEXT: v_bfe_i32 v3, v15, 0, 16
; SI-NEXT: v_ashrrev_i32_e32 v8, 31, v7
; SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12
; SI-NEXT: v_ashrrev_i32_e32 v15, 31, v14
; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v16
; SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; SI-NEXT: ds_write2_b64 v18, v[7:8], v[3:4] offset0:12 offset1:13
; SI-NEXT: ds_write2_b64 v18, v[10:11], v[16:17] offset0:8 offset1:9
; SI-NEXT: ds_write2_b64 v18, v[5:6], v[14:15] offset0:4 offset1:5
; SI-NEXT: ds_write2_b64 v18, v[1:2], v[12:13] offset1:1
; SI-NEXT: s_endpgm
;
; VI-NO-DS128-LABEL: local_sextload_v16i16_to_v16i64:
; VI-NO-DS128: ; %bb.0:
; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NO-DS128-NEXT: s_mov_b32 m0, -1
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
; VI-NO-DS128-NEXT: v_mov_b32_e32 v4, s1
; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v4 offset1:1
; VI-NO-DS128-NEXT: ds_read2_b64 v[4:7], v4 offset0:2 offset1:3
; VI-NO-DS128-NEXT: v_mov_b32_e32 v19, s0
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v18, 16, v3
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v14, 16, v4
; VI-NO-DS128-NEXT: v_bfe_i32 v14, v14, 0, 16
; VI-NO-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v5
; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[16:17], v[14:15] offset0:8 offset1:9
; VI-NO-DS128-NEXT: v_bfe_i32 v14, v4, 0, 16
; VI-NO-DS128-NEXT: v_bfe_i32 v4, v5, 0, 16
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[4:5], v[14:15] offset0:10 offset1:11
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v6
; VI-NO-DS128-NEXT: v_bfe_i32 v4, v4, 0, 16
; VI-NO-DS128-NEXT: v_bfe_i32 v14, v6, 0, 16
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14
; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[14:15], v[4:5] offset0:12 offset1:13
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v14, 16, v7
; VI-NO-DS128-NEXT: v_mov_b32_e32 v16, v7
; VI-NO-DS128-NEXT: v_bfe_i32 v14, v14, 0, 16
; VI-NO-DS128-NEXT: v_bfe_i32 v16, v16, 0, 16
; VI-NO-DS128-NEXT: v_bfe_i32 v4, v18, 0, 16
; VI-NO-DS128-NEXT: v_mov_b32_e32 v18, v3
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v0
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v1
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v2
; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[16:17], v[14:15] offset0:14 offset1:15
; VI-NO-DS128-NEXT: v_bfe_i32 v14, v18, 0, 16
; VI-NO-DS128-NEXT: v_bfe_i32 v8, v8, 0, 16
; VI-NO-DS128-NEXT: v_bfe_i32 v10, v9, 0, 16
; VI-NO-DS128-NEXT: v_bfe_i32 v12, v11, 0, 16
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; VI-NO-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16
; VI-NO-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16
; VI-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 31, v10
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 31, v12
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[14:15], v[4:5] offset0:6 offset1:7
; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[2:3], v[12:13] offset0:4 offset1:5
; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[6:7], v[10:11] offset0:2 offset1:3
; VI-NO-DS128-NEXT: ds_write2_b64 v19, v[0:1], v[8:9] offset1:1
; VI-NO-DS128-NEXT: s_endpgm
;
; GFX9-NO-DS128-LABEL: local_sextload_v16i16_to_v16i64:
; GFX9-NO-DS128: ; %bb.0:
; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v4, s1
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v4 offset1:1
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[4:7], v4 offset0:2 offset1:3
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v19, s0
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v18, 16, v3
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v14, 16, v4
; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v14, 0, 16
; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v5
; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[16:17], v[14:15] offset0:8 offset1:9
; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v4, 0, 16
; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v5, 0, 16
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[4:5], v[14:15] offset0:10 offset1:11
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v6
; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v4, 0, 16
; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v6, 0, 16
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14
; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[14:15], v[4:5] offset0:12 offset1:13
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v14, 16, v7
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v16, v7
; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v14, 0, 16
; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v16, 0, 16
; GFX9-NO-DS128-NEXT: v_bfe_i32 v4, v18, 0, 16
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v18, v3
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v0
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v1
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v2
; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[16:17], v[14:15] offset0:14 offset1:15
; GFX9-NO-DS128-NEXT: v_bfe_i32 v14, v18, 0, 16
; GFX9-NO-DS128-NEXT: v_bfe_i32 v8, v8, 0, 16
; GFX9-NO-DS128-NEXT: v_bfe_i32 v10, v9, 0, 16
; GFX9-NO-DS128-NEXT: v_bfe_i32 v12, v11, 0, 16
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; GFX9-NO-DS128-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX9-NO-DS128-NEXT: v_bfe_i32 v6, v1, 0, 16
; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 31, v10
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 31, v12
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[14:15], v[4:5] offset0:6 offset1:7
; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[2:3], v[12:13] offset0:4 offset1:5
; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[6:7], v[10:11] offset0:2 offset1:3
; GFX9-NO-DS128-NEXT: ds_write2_b64 v19, v[0:1], v[8:9] offset1:1
; GFX9-NO-DS128-NEXT: s_endpgm
;
; EG-LABEL: local_sextload_v16i16_to_v16i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 101, @47, KC0[CB0:0-32], KC1[]
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
; EG-NEXT: MOV T0.Y, OQAP,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00)
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
; EG-NEXT: MOV T0.Z, OQAP,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
; EG-NEXT: MOV T0.W, OQAP,
; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x,
; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00)
; EG-NEXT: LDS_READ_RET * OQAP, T1.W
; EG-NEXT: MOV T1.Y, OQAP,
; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x,
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
; EG-NEXT: LDS_READ_RET * OQAP, T1.W
; EG-NEXT: MOV T1.Z, OQAP,
; EG-NEXT: MOV * T1.W, KC0[2].Z,
; EG-NEXT: LDS_READ_RET * OQAP, T1.W
; EG-NEXT: MOV T1.W, OQAP,
; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x,
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
; EG-NEXT: LDS_READ_RET * OQAP, T2.W
; EG-NEXT: MOV T2.Y, OQAP,
; EG-NEXT: BFE_INT T2.W, T1.W, 0.0, literal.x,
; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.y,
; EG-NEXT: 16(2.242078e-44), 28(3.923636e-44)
; EG-NEXT: LDS_READ_RET * OQAP, T3.W
; EG-NEXT: MOV * T2.Z, OQAP,
; EG-NEXT: BFE_INT T3.Z, T2.Y, 0.0, literal.x,
; EG-NEXT: ASHR T3.W, T2.W, literal.y,
; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.z,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
; EG-NEXT: LDS_WRITE * T4.W, T3.W,
; EG-NEXT: BFE_INT T4.Z, T0.Y, 0.0, literal.x,
; EG-NEXT: ASHR T3.W, T3.Z, literal.y,
; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.z,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00)
; EG-NEXT: LDS_WRITE * T4.W, T3.W,
; EG-NEXT: BFE_INT T5.Z, T0.Z, 0.0, literal.x,
; EG-NEXT: ASHR T3.W, T4.Z, literal.y, BS:VEC_120/SCL_212
; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.z,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; EG-NEXT: 36(5.044674e-44), 0(0.000000e+00)
; EG-NEXT: LDS_WRITE * T4.W, T3.W,
; EG-NEXT: BFE_INT T6.Z, T0.W, 0.0, literal.x,
; EG-NEXT: ASHR T3.W, T5.Z, literal.y,
; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.z,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; EG-NEXT: 52(7.286752e-44), 0(0.000000e+00)
; EG-NEXT: LDS_WRITE * T4.W, T3.W,
; EG-NEXT: BFE_INT T7.Z, T1.Y, 0.0, literal.x,
; EG-NEXT: ASHR T3.W, T6.Z, literal.y,
; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.z,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; EG-NEXT: 68(9.528830e-44), 0(0.000000e+00)
; EG-NEXT: LDS_WRITE * T4.W, T3.W,
; EG-NEXT: BFE_INT T8.Z, T1.Z, 0.0, literal.x,
; EG-NEXT: ASHR T3.W, T7.Z, literal.y, BS:VEC_120/SCL_212
; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.z,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; EG-NEXT: 84(1.177091e-43), 0(0.000000e+00)
; EG-NEXT: LDS_WRITE * T4.W, T3.W,
; EG-NEXT: BFE_INT T9.Z, T2.Z, 0.0, literal.x,
; EG-NEXT: ASHR T3.W, T8.Z, literal.y, BS:VEC_120/SCL_212
; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.z,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; EG-NEXT: 100(1.401298e-43), 0(0.000000e+00)
; EG-NEXT: LDS_WRITE * T4.W, T3.W,
; EG-NEXT: ASHR T3.W, T9.Z, literal.x,
; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y,
; EG-NEXT: 31(4.344025e-44), 116(1.625506e-43)
; EG-NEXT: LDS_WRITE * T4.W, T3.W,
; EG-NEXT: ASHR T3.W, T1.W, literal.x,
; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y,
; EG-NEXT: 31(4.344025e-44), 12(1.681558e-44)
; EG-NEXT: LDS_WRITE * T4.W, T3.W,
; EG-NEXT: ASHR T1.W, T1.W, literal.x,
; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44)
; EG-NEXT: LDS_WRITE * T3.W, T1.W,
; EG-NEXT: MOV * T1.W, KC0[2].Y,
; EG-NEXT: LDS_WRITE * T1.W, T2.W,
; EG-NEXT: ASHR T1.W, T2.Y, literal.x,
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
; EG-NEXT: 31(4.344025e-44), 28(3.923636e-44)
; EG-NEXT: LDS_WRITE * T2.W, T1.W,
; EG-NEXT: ASHR T1.W, T2.Y, literal.x,
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
; EG-NEXT: LDS_WRITE * T2.W, T1.W,
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: LDS_WRITE * T1.W, T3.Z,
; EG-NEXT: ASHR T1.W, T0.Y, literal.x,
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
; EG-NEXT: 31(4.344025e-44), 44(6.165713e-44)
; EG-NEXT: ALU 62, @48, KC0[CB0:0-32], KC1[]
; EG-NEXT: LDS_WRITE * T2.W, T1.W,
; EG-NEXT: ASHR T1.W, T0.Y, literal.x,
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 40(5.605194e-44)
; EG-NEXT: LDS_WRITE * T2.W, T1.W,
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: LDS_WRITE * T1.W, T4.Z,
; EG-NEXT: ASHR T1.W, T0.Z, literal.x,
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
; EG-NEXT: 31(4.344025e-44), 60(8.407791e-44)
; EG-NEXT: LDS_WRITE * T2.W, T1.W,
; EG-NEXT: ASHR T1.W, T0.Z, literal.x,
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 56(7.847271e-44)
; EG-NEXT: LDS_WRITE * T2.W, T1.W,
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
; EG-NEXT: LDS_WRITE * T1.W, T5.Z,
; EG-NEXT: ASHR T1.W, T0.W, literal.x,
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
; EG-NEXT: 31(4.344025e-44), 76(1.064987e-43)
; EG-NEXT: LDS_WRITE * T2.W, T1.W,
; EG-NEXT: ASHR T0.W, T0.W, literal.x,
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 72(1.008935e-43)
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 64(8.968310e-44), 0(0.000000e+00)
; EG-NEXT: LDS_WRITE * T0.W, T6.Z,
; EG-NEXT: ASHR T0.W, T1.Y, literal.x,
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT: 31(4.344025e-44), 92(1.289195e-43)
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
; EG-NEXT: ASHR T0.W, T1.Y, literal.x,
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 88(1.233143e-43)
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 80(1.121039e-43), 0(0.000000e+00)
; EG-NEXT: LDS_WRITE * T0.W, T7.Z,
; EG-NEXT: ASHR T0.W, T1.Z, literal.x,
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT: 31(4.344025e-44), 108(1.513402e-43)
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
; EG-NEXT: ASHR T0.W, T1.Z, literal.x,
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 104(1.457350e-43)
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 96(1.345247e-43), 0(0.000000e+00)
; EG-NEXT: LDS_WRITE * T0.W, T8.Z,
; EG-NEXT: ASHR T0.W, T2.Z, literal.x,
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT: 31(4.344025e-44), 124(1.737610e-43)
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
; EG-NEXT: ASHR T0.W, T2.Z, literal.x,
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 120(1.681558e-43)
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 112(1.569454e-43), 0(0.000000e+00)
; EG-NEXT: LDS_WRITE * T0.W, T9.Z,
; EG-NEXT: RETURN
;
; VI-DS128-LABEL: local_sextload_v16i16_to_v16i64:
; VI-DS128: ; %bb.0:
; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-DS128-NEXT: s_mov_b32 m0, -1
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
; VI-DS128-NEXT: v_mov_b32_e32 v0, s1
; VI-DS128-NEXT: ds_read_b128 v[3:6], v0
; VI-DS128-NEXT: ds_read_b128 v[7:10], v0 offset:16
; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
; VI-DS128-NEXT: v_mov_b32_e32 v18, v6
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
; VI-DS128-NEXT: v_bfe_i32 v11, v8, 0, 16
; VI-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; VI-DS128-NEXT: v_bfe_i32 v13, v8, 0, 16
; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11
; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
; VI-DS128-NEXT: v_mov_b32_e32 v8, s0
; VI-DS128-NEXT: ds_write_b128 v8, v[11:14] offset:80
; VI-DS128-NEXT: v_bfe_i32 v11, v7, 0, 16
; VI-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; VI-DS128-NEXT: v_bfe_i32 v13, v7, 0, 16
; VI-DS128-NEXT: v_mov_b32_e32 v15, v10
; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11
; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
; VI-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v10
; VI-DS128-NEXT: ds_write_b128 v8, v[11:14] offset:64
; VI-DS128-NEXT: v_bfe_i32 v11, v15, 0, 16
; VI-DS128-NEXT: v_bfe_i32 v13, v7, 0, 16
; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11
; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
; VI-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v9
; VI-DS128-NEXT: ds_write_b128 v8, v[11:14] offset:112
; VI-DS128-NEXT: v_bfe_i32 v14, v9, 0, 16
; VI-DS128-NEXT: v_bfe_i32 v16, v16, 0, 16
; VI-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v6
; VI-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14
; VI-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16
; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v3
; VI-DS128-NEXT: v_bfe_i32 v10, v4, 0, 16
; VI-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; VI-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v5
; VI-DS128-NEXT: ds_write_b128 v8, v[14:17] offset:96
; VI-DS128-NEXT: v_bfe_i32 v14, v18, 0, 16
; VI-DS128-NEXT: v_bfe_i32 v16, v19, 0, 16
; VI-DS128-NEXT: v_bfe_i32 v0, v3, 0, 16
; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16
; VI-DS128-NEXT: v_bfe_i32 v12, v4, 0, 16
; VI-DS128-NEXT: v_bfe_i32 v4, v5, 0, 16
; VI-DS128-NEXT: v_bfe_i32 v6, v7, 0, 16
; VI-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14
; VI-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16
; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; VI-DS128-NEXT: v_ashrrev_i32_e32 v11, 31, v10
; VI-DS128-NEXT: v_ashrrev_i32_e32 v13, 31, v12
; VI-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; VI-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6
; VI-DS128-NEXT: ds_write_b128 v8, v[14:17] offset:48
; VI-DS128-NEXT: ds_write_b128 v8, v[4:7] offset:32
; VI-DS128-NEXT: ds_write_b128 v8, v[10:13] offset:16
; VI-DS128-NEXT: ds_write_b128 v8, v[0:3]
; VI-DS128-NEXT: s_endpgm
;
; GFX9-DS128-LABEL: local_sextload_v16i16_to_v16i64:
; GFX9-DS128: ; %bb.0:
; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DS128-NEXT: v_mov_b32_e32 v0, s1
; GFX9-DS128-NEXT: ds_read_b128 v[3:6], v0
; GFX9-DS128-NEXT: ds_read_b128 v[7:10], v0 offset:16
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
; GFX9-DS128-NEXT: v_bfe_i32 v0, v3, 0, 16
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v3
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v8
; GFX9-DS128-NEXT: v_bfe_i32 v11, v8, 0, 16
; GFX9-DS128-NEXT: v_bfe_i32 v13, v3, 0, 16
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
; GFX9-DS128-NEXT: v_mov_b32_e32 v8, s0
; GFX9-DS128-NEXT: ds_write_b128 v8, v[11:14] offset:80
; GFX9-DS128-NEXT: v_bfe_i32 v11, v7, 0, 16
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GFX9-DS128-NEXT: v_bfe_i32 v13, v7, 0, 16
; GFX9-DS128-NEXT: v_mov_b32_e32 v15, v10
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v10
; GFX9-DS128-NEXT: ds_write_b128 v8, v[11:14] offset:64
; GFX9-DS128-NEXT: v_bfe_i32 v11, v15, 0, 16
; GFX9-DS128-NEXT: v_bfe_i32 v13, v7, 0, 16
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v9
; GFX9-DS128-NEXT: ds_write_b128 v8, v[11:14] offset:112
; GFX9-DS128-NEXT: v_bfe_i32 v14, v9, 0, 16
; GFX9-DS128-NEXT: v_bfe_i32 v16, v16, 0, 16
; GFX9-DS128-NEXT: v_mov_b32_e32 v18, v6
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v6
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16
; GFX9-DS128-NEXT: v_bfe_i32 v10, v4, 0, 16
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v5
; GFX9-DS128-NEXT: ds_write_b128 v8, v[14:17] offset:96
; GFX9-DS128-NEXT: v_bfe_i32 v14, v18, 0, 16
; GFX9-DS128-NEXT: v_bfe_i32 v16, v19, 0, 16
; GFX9-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16
; GFX9-DS128-NEXT: v_bfe_i32 v12, v4, 0, 16
; GFX9-DS128-NEXT: v_bfe_i32 v4, v5, 0, 16
; GFX9-DS128-NEXT: v_bfe_i32 v6, v7, 0, 16
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v11, 31, v10
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v13, 31, v12
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6
; GFX9-DS128-NEXT: ds_write_b128 v8, v[14:17] offset:48
; GFX9-DS128-NEXT: ds_write_b128 v8, v[4:7] offset:32
; GFX9-DS128-NEXT: ds_write_b128 v8, v[10:13] offset:16
; GFX9-DS128-NEXT: ds_write_b128 v8, v[0:3]
; GFX9-DS128-NEXT: s_endpgm
  %load = load <16 x i16>, ptr addrspace(3) %in
  %ext = sext <16 x i16> %load to <16 x i64>
  store <16 x i64> %ext, ptr addrspace(3) %out
  ret void
}
|
|
|
|
define amdgpu_kernel void @local_zextload_v32i16_to_v32i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
|
|
; SI-LABEL: local_zextload_v32i16_to_v32i64:
|
|
; SI: ; %bb.0:
|
|
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: v_mov_b32_e32 v0, s1
|
|
; SI-NEXT: s_mov_b32 m0, -1
|
|
; SI-NEXT: ds_read2_b64 v[2:5], v0 offset0:2 offset1:3
|
|
; SI-NEXT: v_mov_b32_e32 v1, 0
|
|
; SI-NEXT: ds_read2_b64 v[6:9], v0 offset1:1
|
|
; SI-NEXT: v_mov_b32_e32 v19, v1
|
|
; SI-NEXT: v_mov_b32_e32 v21, v1
|
|
; SI-NEXT: v_mov_b32_e32 v22, s0
|
|
; SI-NEXT: s_waitcnt lgkmcnt(1)
|
|
; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v5
|
|
; SI-NEXT: v_and_b32_e32 v20, 0xffff, v5
|
|
; SI-NEXT: ds_read2_b64 v[10:13], v0 offset0:4 offset1:5
|
|
; SI-NEXT: ds_read2_b64 v[14:17], v0 offset0:6 offset1:7
|
|
; SI-NEXT: ds_write2_b64 v22, v[20:21], v[18:19] offset0:14 offset1:15
|
|
; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v3
|
|
; SI-NEXT: v_and_b32_e32 v20, 0xffff, v3
|
|
; SI-NEXT: ds_write2_b64 v22, v[20:21], v[18:19] offset0:10 offset1:11
|
|
; SI-NEXT: s_waitcnt lgkmcnt(4)
|
|
; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v9
|
|
; SI-NEXT: v_and_b32_e32 v20, 0xffff, v9
|
|
; SI-NEXT: ds_write2_b64 v22, v[20:21], v[18:19] offset0:6 offset1:7
|
|
; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v7
|
|
; SI-NEXT: v_and_b32_e32 v20, 0xffff, v7
|
|
; SI-NEXT: ds_write2_b64 v22, v[20:21], v[18:19] offset0:2 offset1:3
|
|
; SI-NEXT: s_waitcnt lgkmcnt(4)
|
|
; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v17
|
|
; SI-NEXT: v_and_b32_e32 v20, 0xffff, v17
|
|
; SI-NEXT: ds_write2_b64 v22, v[20:21], v[18:19] offset0:30 offset1:31
|
|
; SI-NEXT: v_mov_b32_e32 v18, v1
|
|
; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v15
|
|
; SI-NEXT: v_mov_b32_e32 v20, v1
|
|
; SI-NEXT: v_and_b32_e32 v19, 0xffff, v15
|
|
; SI-NEXT: ds_write2_b64 v22, v[19:20], v[17:18] offset0:26 offset1:27
|
|
; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v13
|
|
; SI-NEXT: v_and_b32_e32 v19, 0xffff, v13
|
|
; SI-NEXT: ds_write2_b64 v22, v[19:20], v[17:18] offset0:22 offset1:23
|
|
; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v4
|
|
; SI-NEXT: v_mov_b32_e32 v5, v1
|
|
; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4
|
|
; SI-NEXT: ds_write2_b64 v22, v[4:5], v[17:18] offset0:12 offset1:13
|
|
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
|
|
; SI-NEXT: v_and_b32_e32 v17, 0xffff, v2
|
|
; SI-NEXT: v_mov_b32_e32 v4, v1
|
|
; SI-NEXT: ds_write2_b64 v22, v[17:18], v[3:4] offset0:8 offset1:9
|
|
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v8
|
|
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v6
|
|
; SI-NEXT: v_and_b32_e32 v6, 0xffff, v6
|
|
; SI-NEXT: v_and_b32_e32 v8, 0xffff, v8
|
|
; SI-NEXT: v_mov_b32_e32 v9, v1
|
|
; SI-NEXT: v_mov_b32_e32 v7, v1
|
|
; SI-NEXT: v_mov_b32_e32 v3, v1
|
|
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v11
|
|
; SI-NEXT: ds_write2_b64 v22, v[8:9], v[2:3] offset0:4 offset1:5
|
|
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v12
|
|
; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v10
|
|
; SI-NEXT: ds_write2_b64 v22, v[6:7], v[4:5] offset1:1
|
|
; SI-NEXT: v_and_b32_e32 v4, 0xffff, v10
|
|
; SI-NEXT: v_and_b32_e32 v5, 0xffff, v11
|
|
; SI-NEXT: v_and_b32_e32 v10, 0xffff, v12
|
|
; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v16
|
|
; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v14
|
|
; SI-NEXT: v_and_b32_e32 v17, 0xffff, v14
|
|
; SI-NEXT: v_and_b32_e32 v19, 0xffff, v16
|
|
; SI-NEXT: v_mov_b32_e32 v6, v1
|
|
; SI-NEXT: ds_write2_b64 v22, v[5:6], v[0:1] offset0:18 offset1:19
|
|
; SI-NEXT: v_mov_b32_e32 v11, v1
|
|
; SI-NEXT: v_mov_b32_e32 v5, v1
|
|
; SI-NEXT: v_mov_b32_e32 v13, v1
|
|
; SI-NEXT: v_mov_b32_e32 v16, v1
|
|
; SI-NEXT: ds_write2_b64 v22, v[19:20], v[12:13] offset0:28 offset1:29
|
|
; SI-NEXT: ds_write2_b64 v22, v[17:18], v[15:16] offset0:24 offset1:25
|
|
; SI-NEXT: ds_write2_b64 v22, v[10:11], v[2:3] offset0:20 offset1:21
|
|
; SI-NEXT: ds_write2_b64 v22, v[4:5], v[8:9] offset0:16 offset1:17
|
|
; SI-NEXT: s_endpgm
|
|
;
|
|
; VI-NO-DS128-LABEL: local_zextload_v32i16_to_v32i64:
|
|
; VI-NO-DS128: ; %bb.0:
|
|
; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; VI-NO-DS128-NEXT: s_mov_b32 m0, -1
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v5, 0
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v19, v5
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v21, v5
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v4, s1
|
|
; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v4 offset0:6 offset1:7
|
|
; VI-NO-DS128-NEXT: ds_read2_b64 v[6:9], v4 offset0:4 offset1:5
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v22, s0
|
|
; VI-NO-DS128-NEXT: ds_read2_b64 v[10:13], v4 offset0:2 offset1:3
|
|
; VI-NO-DS128-NEXT: ds_read2_b64 v[14:17], v4 offset1:1
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(3)
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v20, 16, v2
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v2
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[18:19], v[20:21] offset0:28 offset1:29
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v1
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v2, v5
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[18:19], v[1:2] offset0:26 offset1:27
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v0
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v18, 16, v0
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[1:2], v[18:19] offset0:24 offset1:25
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(5)
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v9
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v1, v5
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v18, 16, v9
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[0:1], v[18:19] offset0:22 offset1:23
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v8
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v8
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v9, v5
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[0:1], v[8:9] offset0:20 offset1:21
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v7
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v7
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v8, v5
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[0:1], v[7:8] offset0:18 offset1:19
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v6
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v6
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v7, v5
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[0:1] offset0:16 offset1:17
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(8)
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v13
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v13
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[0:1] offset0:14 offset1:15
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v12
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v12
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[0:1] offset0:12 offset1:13
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v11
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v11
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[0:1] offset0:10 offset1:11
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(10)
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v14
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v14
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v3
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v3
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v14, v5
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v10
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v10
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v10, 16, v17
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v17
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[4:5], v[13:14] offset0:30 offset1:31
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v13, v5
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v11, v5
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[1:2] offset0:8 offset1:9
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v16
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v9, 0xffff, v16
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[12:13], v[10:11] offset0:6 offset1:7
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v10, v5
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v3, v5
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v15
|
|
; VI-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v15
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[9:10], v[2:3] offset0:4 offset1:5
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v2, v5
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[1:2] offset0:2 offset1:3
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v9, v5
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v1, v5
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v22, v[8:9], v[0:1] offset1:1
|
|
; VI-NO-DS128-NEXT: s_endpgm
|
|
;
|
|
; GFX9-NO-DS128-LABEL: local_zextload_v32i16_to_v32i64:
|
|
; GFX9-NO-DS128: ; %bb.0:
|
|
; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v5, 0
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v19, v5
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v21, v5
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v4, s1
|
|
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[6:9], v4 offset0:4 offset1:5
|
|
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v4 offset0:6 offset1:7
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v22, s0
|
|
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[10:13], v4 offset1:1
|
|
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[14:17], v4 offset0:2 offset1:3
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(2)
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v20, 16, v2
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v2
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[18:19], v[20:21] offset0:28 offset1:29
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v18, 0xffff, v1
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v2, v5
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[18:19], v[1:2] offset0:26 offset1:27
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v1, 0xffff, v0
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v18, 16, v0
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[1:2], v[18:19] offset0:24 offset1:25
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v9
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v1, v5
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v18, 16, v9
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[0:1], v[18:19] offset0:22 offset1:23
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v8
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v8, 16, v8
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v9, v5
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[0:1], v[8:9] offset0:20 offset1:21
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v7
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v7
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v8, v5
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[0:1], v[7:8] offset0:18 offset1:19
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v6
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v6
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v7, v5
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[0:1] offset0:16 offset1:17
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(7)
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v17
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v17
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[0:1] offset0:14 offset1:15
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v16
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v16
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[0:1] offset0:12 offset1:13
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v15
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v15
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[0:1] offset0:10 offset1:11
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v14
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v14
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v10
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v8, 0xffff, v10
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[1:2] offset0:8 offset1:9
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v12
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v9, 0xffff, v12
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v10, 16, v13
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v12, 0xffff, v13
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v3
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v3
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v14, v5
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v11
|
|
; GFX9-NO-DS128-NEXT: v_and_b32_e32 v6, 0xffff, v11
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[4:5], v[13:14] offset0:30 offset1:31
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v13, v5
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v11, v5
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[12:13], v[10:11] offset0:6 offset1:7
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v10, v5
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v3, v5
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[9:10], v[2:3] offset0:4 offset1:5
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v2, v5
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[6:7], v[1:2] offset0:2 offset1:3
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v9, v5
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v1, v5
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v22, v[8:9], v[0:1] offset1:1
|
|
; GFX9-NO-DS128-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: local_zextload_v32i16_to_v32i64:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: ALU 105, @49, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 56(7.847271e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T0.Y, OQAP,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 60(8.407791e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T0.Z, OQAP,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T0.W, OQAP,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 52(7.286752e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T1.W
|
|
; EG-NEXT: MOV T1.Y, OQAP,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 40(5.605194e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T1.W
|
|
; EG-NEXT: MOV T1.Z, OQAP,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 44(6.165713e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T1.W
|
|
; EG-NEXT: MOV T1.W, OQAP,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T2.W
|
|
; EG-NEXT: MOV T2.Y, OQAP,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 36(5.044674e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T2.W
|
|
; EG-NEXT: MOV T2.Z, OQAP,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T2.W
|
|
; EG-NEXT: MOV T2.W, OQAP,
|
|
; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T3.W
|
|
; EG-NEXT: MOV T3.Y, OQAP,
|
|
; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T3.W
|
|
; EG-NEXT: MOV T3.Z, OQAP,
|
|
; EG-NEXT: ADD_INT * T3.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T3.W
|
|
; EG-NEXT: MOV T3.W, OQAP,
|
|
; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T4.W
|
|
; EG-NEXT: MOV T4.Y, OQAP,
|
|
; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T4.W
|
|
; EG-NEXT: MOV T4.Z, OQAP,
|
|
; EG-NEXT: ADD_INT * T4.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T4.W
|
|
; EG-NEXT: MOV T4.W, OQAP,
|
|
; EG-NEXT: MOV * T5.W, KC0[2].Z,
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T5.W
|
|
; EG-NEXT: MOV T5.Y, OQAP,
|
|
; EG-NEXT: LSHR T5.W, T4.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
|
|
; EG-NEXT: LDS_WRITE * T6.W, T5.W,
|
|
; EG-NEXT: AND_INT T4.W, T4.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
|
|
; EG-NEXT: LDS_WRITE * T5.W, T4.W,
|
|
; EG-NEXT: LSHR T4.W, T5.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44)
|
|
; EG-NEXT: LDS_WRITE * T5.W, T4.W,
|
|
; EG-NEXT: AND_INT T4.W, T5.Y, literal.x,
|
|
; EG-NEXT: MOV * T5.W, KC0[2].Y,
|
|
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T5.W, T4.W,
|
|
; EG-NEXT: LSHR T4.W, T4.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 56(7.847271e-44)
|
|
; EG-NEXT: LDS_WRITE * T5.W, T4.W,
|
|
; EG-NEXT: AND_INT T4.W, T4.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 48(6.726233e-44)
|
|
; EG-NEXT: LDS_WRITE * T5.W, T4.W,
|
|
; EG-NEXT: LSHR T4.W, T4.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 40(5.605194e-44)
|
|
; EG-NEXT: LDS_WRITE * T5.W, T4.W,
|
|
; EG-NEXT: AND_INT T4.W, T4.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 32(4.484155e-44)
|
|
; EG-NEXT: LDS_WRITE * T5.W, T4.W,
|
|
; EG-NEXT: LSHR T4.W, T3.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 88(1.233143e-43)
|
|
; EG-NEXT: LDS_WRITE * T5.W, T4.W,
|
|
; EG-NEXT: AND_INT T3.W, T3.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 80(1.121039e-43)
|
|
; EG-NEXT: LDS_WRITE * T4.W, T3.W,
|
|
; EG-NEXT: LSHR T3.W, T3.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 72(1.008935e-43)
|
|
; EG-NEXT: ALU 93, @50, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: LDS_WRITE * T4.W, T3.W,
|
|
; EG-NEXT: AND_INT T3.W, T3.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 64(8.968310e-44)
|
|
; EG-NEXT: LDS_WRITE * T4.W, T3.W,
|
|
; EG-NEXT: LSHR T3.W, T3.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 120(1.681558e-43)
|
|
; EG-NEXT: LDS_WRITE * T4.W, T3.W,
|
|
; EG-NEXT: AND_INT T3.W, T3.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 112(1.569454e-43)
|
|
; EG-NEXT: LDS_WRITE * T4.W, T3.W,
|
|
; EG-NEXT: LSHR T3.W, T2.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 104(1.457350e-43)
|
|
; EG-NEXT: LDS_WRITE * T4.W, T3.W,
|
|
; EG-NEXT: AND_INT T2.W, T2.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 96(1.345247e-43)
|
|
; EG-NEXT: LDS_WRITE * T3.W, T2.W,
|
|
; EG-NEXT: LSHR T2.W, T2.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 152(2.129974e-43)
|
|
; EG-NEXT: LDS_WRITE * T3.W, T2.W,
|
|
; EG-NEXT: AND_INT T2.W, T2.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 144(2.017870e-43)
|
|
; EG-NEXT: LDS_WRITE * T3.W, T2.W,
|
|
; EG-NEXT: LSHR T2.W, T2.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 136(1.905766e-43)
|
|
; EG-NEXT: LDS_WRITE * T3.W, T2.W,
|
|
; EG-NEXT: AND_INT T2.W, T2.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 128(1.793662e-43)
|
|
; EG-NEXT: LDS_WRITE * T3.W, T2.W,
|
|
; EG-NEXT: LSHR T2.W, T1.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 184(2.578389e-43)
|
|
; EG-NEXT: LDS_WRITE * T3.W, T2.W,
|
|
; EG-NEXT: AND_INT T1.W, T1.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 176(2.466285e-43)
|
|
; EG-NEXT: LDS_WRITE * T2.W, T1.W,
|
|
; EG-NEXT: LSHR T1.W, T1.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 168(2.354181e-43)
|
|
; EG-NEXT: LDS_WRITE * T2.W, T1.W,
|
|
; EG-NEXT: AND_INT T1.W, T1.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 160(2.242078e-43)
|
|
; EG-NEXT: LDS_WRITE * T2.W, T1.W,
|
|
; EG-NEXT: LSHR T1.W, T1.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 216(3.026805e-43)
|
|
; EG-NEXT: LDS_WRITE * T2.W, T1.W,
|
|
; EG-NEXT: AND_INT T1.W, T1.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 208(2.914701e-43)
|
|
; EG-NEXT: LDS_WRITE * T2.W, T1.W,
|
|
; EG-NEXT: LSHR T1.W, T0.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 200(2.802597e-43)
|
|
; EG-NEXT: LDS_WRITE * T2.W, T1.W,
|
|
; EG-NEXT: AND_INT T0.W, T0.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 192(2.690493e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: LSHR T0.W, T0.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 248(3.475220e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: AND_INT T0.W, T0.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 240(3.363116e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: LSHR T0.W, T0.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 232(3.251012e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: AND_INT T0.W, T0.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 224(3.138909e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: MOV * T1.W, literal.y,
|
|
; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00)
|
|
; EG-NEXT: ALU 87, @51, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 60(8.407791e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 52(7.286752e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 44(6.165713e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 36(5.044674e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 92(1.289195e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 84(1.177091e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 76(1.064987e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 68(9.528830e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 124(1.737610e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 116(1.625506e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 108(1.513402e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 100(1.401298e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 156(2.186026e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 148(2.073922e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 140(1.961818e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 132(1.849714e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 188(2.634441e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 180(2.522337e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 172(2.410233e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 164(2.298129e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 220(3.082857e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 212(2.970753e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 204(2.858649e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 196(2.746545e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 252(3.531272e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 244(3.419168e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 236(3.307064e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 228(3.194960e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T1.W,
|
|
; EG-NEXT: RETURN
|
|
;
|
|
; VI-DS128-LABEL: local_zextload_v32i16_to_v32i64:
|
|
; VI-DS128: ; %bb.0:
|
|
; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; VI-DS128-NEXT: s_mov_b32 m0, -1
|
|
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v1, s1
|
|
; VI-DS128-NEXT: ds_read_b128 v[3:6], v1
|
|
; VI-DS128-NEXT: ds_read_b128 v[7:10], v1 offset:16
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v52, s0
|
|
; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v6
|
|
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v8
|
|
; VI-DS128-NEXT: v_and_b32_e32 v17, 0xffff, v8
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v22, 16, v7
|
|
; VI-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v7
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v25, 16, v10
|
|
; VI-DS128-NEXT: v_and_b32_e32 v23, 0xffff, v10
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v28, 16, v9
|
|
; VI-DS128-NEXT: v_and_b32_e32 v26, 0xffff, v9
|
|
; VI-DS128-NEXT: ds_read_b128 v[7:10], v1 offset:32
|
|
; VI-DS128-NEXT: ds_read_b128 v[29:32], v1 offset:48
|
|
; VI-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v6
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v4
|
|
; VI-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v4
|
|
; VI-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v38, 16, v7
|
|
; VI-DS128-NEXT: v_and_b32_e32 v36, 0xffff, v7
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v44, 16, v9
|
|
; VI-DS128-NEXT: v_and_b32_e32 v42, 0xffff, v9
|
|
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v30
|
|
; VI-DS128-NEXT: v_and_b32_e32 v7, 0xffff, v30
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v50, 16, v32
|
|
; VI-DS128-NEXT: v_and_b32_e32 v48, 0xffff, v32
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v32, 16, v31
|
|
; VI-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v31
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v31, 0
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v49, v31
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v51, v31
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v47, 16, v29
|
|
; VI-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v29
|
|
; VI-DS128-NEXT: ds_write_b128 v52, v[48:51] offset:240
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v46, v31
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v48, v31
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v27, v31
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v29, v31
|
|
; VI-DS128-NEXT: ds_write_b128 v52, v[45:48] offset:192
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v43, v31
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v45, v31
|
|
; VI-DS128-NEXT: ds_write_b128 v52, v[26:29] offset:96
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v24, v31
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v26, v31
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v41, 16, v10
|
|
; VI-DS128-NEXT: v_and_b32_e32 v39, 0xffff, v10
|
|
; VI-DS128-NEXT: ds_write_b128 v52, v[42:45] offset:160
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v40, v31
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v42, v31
|
|
; VI-DS128-NEXT: ds_write_b128 v52, v[23:26] offset:112
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v21, v31
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v23, v31
|
|
; VI-DS128-NEXT: ds_write_b128 v52, v[39:42] offset:176
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v37, v31
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v39, v31
|
|
; VI-DS128-NEXT: ds_write_b128 v52, v[20:23] offset:64
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v18, v31
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v20, v31
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v35, 16, v8
|
|
; VI-DS128-NEXT: v_and_b32_e32 v33, 0xffff, v8
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v8, v31
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v10, v31
|
|
; VI-DS128-NEXT: ds_write_b128 v52, v[36:39] offset:128
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v34, v31
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v36, v31
|
|
; VI-DS128-NEXT: ds_write_b128 v52, v[17:20] offset:80
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v15, v31
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v17, v31
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v3
|
|
; VI-DS128-NEXT: v_and_b32_e32 v11, 0xffff, v3
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v5
|
|
; VI-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v5
|
|
; VI-DS128-NEXT: ds_write_b128 v52, v[7:10] offset:208
|
|
; VI-DS128-NEXT: ds_write_b128 v52, v[33:36] offset:144
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v5, v31
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v7, v31
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v33, v31
|
|
; VI-DS128-NEXT: ds_write_b128 v52, v[14:17] offset:48
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v12, v31
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v14, v31
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v1, v31
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v3, v31
|
|
; VI-DS128-NEXT: ds_write_b128 v52, v[4:7] offset:32
|
|
; VI-DS128-NEXT: ds_write_b128 v52, v[30:33] offset:224
|
|
; VI-DS128-NEXT: ds_write_b128 v52, v[11:14]
|
|
; VI-DS128-NEXT: ds_write_b128 v52, v[0:3] offset:16
|
|
; VI-DS128-NEXT: s_endpgm
|
|
;
|
|
; GFX9-DS128-LABEL: local_zextload_v32i16_to_v32i64:
|
|
; GFX9-DS128: ; %bb.0:
|
|
; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v1, s1
|
|
; GFX9-DS128-NEXT: ds_read_b128 v[3:6], v1
|
|
; GFX9-DS128-NEXT: ds_read_b128 v[7:10], v1 offset:16
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v52, s0
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v6
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v8
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v17, 0xffff, v8
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v22, 16, v7
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v20, 0xffff, v7
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v25, 16, v10
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v23, 0xffff, v10
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v28, 16, v9
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v26, 0xffff, v9
|
|
; GFX9-DS128-NEXT: ds_read_b128 v[7:10], v1 offset:32
|
|
; GFX9-DS128-NEXT: ds_read_b128 v[29:32], v1 offset:48
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v14, 0xffff, v6
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v4
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v0, 0xffff, v4
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v38, 16, v7
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v36, 0xffff, v7
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v44, 16, v9
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v42, 0xffff, v9
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v30
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v7, 0xffff, v30
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v50, 16, v32
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v48, 0xffff, v32
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v32, 16, v31
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v30, 0xffff, v31
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v31, 0
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v49, v31
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v51, v31
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v47, 16, v29
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v45, 0xffff, v29
|
|
; GFX9-DS128-NEXT: ds_write_b128 v52, v[48:51] offset:240
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v46, v31
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v48, v31
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v27, v31
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v29, v31
|
|
; GFX9-DS128-NEXT: ds_write_b128 v52, v[45:48] offset:192
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v43, v31
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v45, v31
|
|
; GFX9-DS128-NEXT: ds_write_b128 v52, v[26:29] offset:96
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v24, v31
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v26, v31
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v41, 16, v10
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v39, 0xffff, v10
|
|
; GFX9-DS128-NEXT: ds_write_b128 v52, v[42:45] offset:160
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v40, v31
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v42, v31
|
|
; GFX9-DS128-NEXT: ds_write_b128 v52, v[23:26] offset:112
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v21, v31
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v23, v31
|
|
; GFX9-DS128-NEXT: ds_write_b128 v52, v[39:42] offset:176
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v37, v31
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v39, v31
|
|
; GFX9-DS128-NEXT: ds_write_b128 v52, v[20:23] offset:64
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v18, v31
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v20, v31
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v35, 16, v8
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v33, 0xffff, v8
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v8, v31
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v10, v31
|
|
; GFX9-DS128-NEXT: ds_write_b128 v52, v[36:39] offset:128
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v34, v31
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v36, v31
|
|
; GFX9-DS128-NEXT: ds_write_b128 v52, v[17:20] offset:80
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v15, v31
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v17, v31
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v13, 16, v3
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v11, 0xffff, v3
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v5
|
|
; GFX9-DS128-NEXT: v_and_b32_e32 v4, 0xffff, v5
|
|
; GFX9-DS128-NEXT: ds_write_b128 v52, v[7:10] offset:208
|
|
; GFX9-DS128-NEXT: ds_write_b128 v52, v[33:36] offset:144
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v5, v31
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v7, v31
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v33, v31
|
|
; GFX9-DS128-NEXT: ds_write_b128 v52, v[14:17] offset:48
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v12, v31
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v14, v31
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v1, v31
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v3, v31
|
|
; GFX9-DS128-NEXT: ds_write_b128 v52, v[4:7] offset:32
|
|
; GFX9-DS128-NEXT: ds_write_b128 v52, v[30:33] offset:224
|
|
; GFX9-DS128-NEXT: ds_write_b128 v52, v[11:14]
|
|
; GFX9-DS128-NEXT: ds_write_b128 v52, v[0:3] offset:16
|
|
; GFX9-DS128-NEXT: s_endpgm
|
|
%load = load <32 x i16>, ptr addrspace(3) %in
|
|
%ext = zext <32 x i16> %load to <32 x i64>
|
|
store <32 x i64> %ext, ptr addrspace(3) %out
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
|
|
; SI-LABEL: local_sextload_v32i16_to_v32i64:
|
|
; SI: ; %bb.0:
|
|
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: v_mov_b32_e32 v12, s1
|
|
; SI-NEXT: s_mov_b32 m0, -1
|
|
; SI-NEXT: ds_read2_b64 v[4:7], v12 offset0:2 offset1:3
|
|
; SI-NEXT: ds_read2_b64 v[0:3], v12 offset1:1
|
|
; SI-NEXT: ds_read2_b64 v[8:11], v12 offset0:6 offset1:7
|
|
; SI-NEXT: ds_read2_b64 v[12:15], v12 offset0:4 offset1:5
|
|
; SI-NEXT: s_waitcnt lgkmcnt(3)
|
|
; SI-NEXT: v_mov_b32_e32 v18, v7
|
|
; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v7
|
|
; SI-NEXT: v_ashrrev_i32_e32 v16, 16, v7
|
|
; SI-NEXT: v_bfe_i32 v18, v18, 0, 16
|
|
; SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18
|
|
; SI-NEXT: v_mov_b32_e32 v7, s0
|
|
; SI-NEXT: ds_write2_b64 v7, v[18:19], v[16:17] offset0:14 offset1:15
|
|
; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v5
|
|
; SI-NEXT: v_ashrrev_i32_e32 v16, 16, v5
|
|
; SI-NEXT: v_bfe_i32 v18, v5, 0, 16
|
|
; SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18
|
|
; SI-NEXT: ds_write2_b64 v7, v[18:19], v[16:17] offset0:10 offset1:11
|
|
; SI-NEXT: s_waitcnt lgkmcnt(4)
|
|
; SI-NEXT: v_mov_b32_e32 v5, v3
|
|
; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v3
|
|
; SI-NEXT: v_ashrrev_i32_e32 v16, 16, v3
|
|
; SI-NEXT: v_bfe_i32 v18, v5, 0, 16
|
|
; SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18
|
|
; SI-NEXT: ds_write2_b64 v7, v[18:19], v[16:17] offset0:6 offset1:7
|
|
; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v1
|
|
; SI-NEXT: v_ashrrev_i32_e32 v16, 16, v1
|
|
; SI-NEXT: v_bfe_i32 v18, v1, 0, 16
|
|
; SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18
|
|
; SI-NEXT: ds_write2_b64 v7, v[18:19], v[16:17] offset0:2 offset1:3
|
|
; SI-NEXT: s_waitcnt lgkmcnt(5)
|
|
; SI-NEXT: v_mov_b32_e32 v1, v11
|
|
; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v11
|
|
; SI-NEXT: v_ashrrev_i32_e32 v16, 16, v11
|
|
; SI-NEXT: v_bfe_i32 v18, v1, 0, 16
|
|
; SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18
|
|
; SI-NEXT: ds_write2_b64 v7, v[18:19], v[16:17] offset0:30 offset1:31
|
|
; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v9
|
|
; SI-NEXT: v_ashrrev_i32_e32 v16, 16, v9
|
|
; SI-NEXT: v_bfe_i32 v18, v9, 0, 16
|
|
; SI-NEXT: v_ashrrev_i32_e32 v19, 31, v18
|
|
; SI-NEXT: ds_write2_b64 v7, v[18:19], v[16:17] offset0:26 offset1:27
|
|
; SI-NEXT: s_waitcnt lgkmcnt(6)
|
|
; SI-NEXT: v_mov_b32_e32 v1, v15
|
|
; SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15
|
|
; SI-NEXT: v_ashrrev_i32_e32 v15, 16, v15
|
|
; SI-NEXT: v_bfe_i32 v17, v1, 0, 16
|
|
; SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17
|
|
; SI-NEXT: ds_write2_b64 v7, v[17:18], v[15:16] offset0:22 offset1:23
|
|
; SI-NEXT: v_ashrrev_i32_e32 v16, 31, v13
|
|
; SI-NEXT: v_ashrrev_i32_e32 v15, 16, v13
|
|
; SI-NEXT: v_bfe_i32 v17, v13, 0, 16
|
|
; SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17
|
|
; SI-NEXT: ds_write2_b64 v7, v[17:18], v[15:16] offset0:18 offset1:19
|
|
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6
|
|
; SI-NEXT: v_bfe_i32 v5, v6, 0, 16
|
|
; SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5
|
|
; SI-NEXT: v_bfe_i32 v15, v1, 0, 16
|
|
; SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15
|
|
; SI-NEXT: ds_write2_b64 v7, v[5:6], v[15:16] offset0:12 offset1:13
|
|
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4
|
|
; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v2
|
|
; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v10
|
|
; SI-NEXT: v_bfe_i32 v3, v4, 0, 16
|
|
; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v8
|
|
; SI-NEXT: v_bfe_i32 v5, v1, 0, 16
|
|
; SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3
|
|
; SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5
|
|
; SI-NEXT: ds_write2_b64 v7, v[3:4], v[5:6] offset0:8 offset1:9
|
|
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v14
|
|
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v12
|
|
; SI-NEXT: v_bfe_i32 v1, v12, 0, 16
|
|
; SI-NEXT: v_bfe_i32 v3, v14, 0, 16
|
|
; SI-NEXT: v_bfe_i32 v5, v8, 0, 16
|
|
; SI-NEXT: v_bfe_i32 v8, v10, 0, 16
|
|
; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v0
|
|
; SI-NEXT: v_bfe_i32 v9, v0, 0, 16
|
|
; SI-NEXT: v_bfe_i32 v10, v2, 0, 16
|
|
; SI-NEXT: v_bfe_i32 v12, v11, 0, 16
|
|
; SI-NEXT: v_ashrrev_i32_e32 v11, 31, v10
|
|
; SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12
|
|
; SI-NEXT: ds_write2_b64 v7, v[10:11], v[12:13] offset0:4 offset1:5
|
|
; SI-NEXT: v_bfe_i32 v11, v6, 0, 16
|
|
; SI-NEXT: v_ashrrev_i32_e32 v2, 31, v1
|
|
; SI-NEXT: v_bfe_i32 v13, v4, 0, 16
|
|
; SI-NEXT: v_ashrrev_i32_e32 v4, 31, v3
|
|
; SI-NEXT: v_bfe_i32 v15, v15, 0, 16
|
|
; SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5
|
|
; SI-NEXT: v_bfe_i32 v16, v14, 0, 16
|
|
; SI-NEXT: v_ashrrev_i32_e32 v10, 31, v9
|
|
; SI-NEXT: v_ashrrev_i32_e32 v17, 31, v16
|
|
; SI-NEXT: ds_write2_b64 v7, v[9:10], v[16:17] offset1:1
|
|
; SI-NEXT: v_bfe_i32 v17, v18, 0, 16
|
|
; SI-NEXT: v_ashrrev_i32_e32 v9, 31, v8
|
|
; SI-NEXT: v_ashrrev_i32_e32 v12, 31, v11
|
|
; SI-NEXT: v_ashrrev_i32_e32 v14, 31, v13
|
|
; SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15
|
|
; SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17
|
|
; SI-NEXT: ds_write2_b64 v7, v[8:9], v[17:18] offset0:28 offset1:29
|
|
; SI-NEXT: ds_write2_b64 v7, v[5:6], v[15:16] offset0:24 offset1:25
|
|
; SI-NEXT: ds_write2_b64 v7, v[3:4], v[13:14] offset0:20 offset1:21
|
|
; SI-NEXT: ds_write2_b64 v7, v[1:2], v[11:12] offset0:16 offset1:17
|
|
; SI-NEXT: s_endpgm
|
|
;
|
|
; VI-NO-DS128-LABEL: local_sextload_v32i16_to_v32i64:
|
|
; VI-NO-DS128: ; %bb.0:
|
|
; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; VI-NO-DS128-NEXT: s_mov_b32 m0, -1
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v7, s1
|
|
; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v7 offset0:6 offset1:7
|
|
; VI-NO-DS128-NEXT: ds_read2_b64 v[12:15], v7 offset0:4 offset1:5
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v11, s0
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v3
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v18, v3, 0, 16
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18
|
|
; VI-NO-DS128-NEXT: ds_read2_b64 v[3:6], v7 offset0:2 offset1:3
|
|
; VI-NO-DS128-NEXT: ds_read2_b64 v[7:10], v7 offset1:1
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[18:19], v[16:17] offset0:30 offset1:31
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v2
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v16, v16, 0, 16
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v18, v2, 0, 16
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v1
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[18:19], v[16:17] offset0:28 offset1:29
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v16, v2, 0, 16
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v1, v1, 0, 16
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v2, 31, v1
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[1:2], v[16:17] offset0:26 offset1:27
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v0
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v16, v0, 0, 16
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v18, v17, 0, 16
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[16:17], v[18:19] offset0:24 offset1:25
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(6)
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v15
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v16, v16, 0, 16
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v18, v15, 0, 16
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v14
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[18:19], v[16:17] offset0:22 offset1:23
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v15, v15, 0, 16
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v17, v14, 0, 16
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v14, 16, v13
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[17:18], v[15:16] offset0:20 offset1:21
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v14, v14, 0, 16
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v16, v13, 0, 16
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[16:17], v[14:15] offset0:18 offset1:19
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v12
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v15, v12, 0, 16
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v17, v16, 0, 16
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[15:16], v[17:18] offset0:16 offset1:17
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(9)
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v15, 16, v6
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v15, v15, 0, 16
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v17, v6, 0, 16
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v5
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[17:18], v[15:16] offset0:14 offset1:15
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v15, v6, 0, 16
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v5, v5, 0, 16
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v12, 16, v4
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v6, 31, v5
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(9)
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v8
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[5:6], v[15:16] offset0:12 offset1:13
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v5, v12, 0, 16
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v15, v4, 0, 16
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v13, v0, 0, 16
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v9
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v6, 31, v5
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[15:16], v[5:6] offset0:10 offset1:11
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v15, v0, 0, 16
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v3
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v17, v3, 0, 16
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v21, v0, 0, 16
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v10
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v22, 31, v21
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17
|
|
; VI-NO-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v7
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v19, v19, 0, 16
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[17:18], v[21:22] offset0:8 offset1:9
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v17, v10, 0, 16
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v1, v1, 0, 16
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v4, v7, 0, 16
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v6, v8, 0, 16
|
|
; VI-NO-DS128-NEXT: v_bfe_i32 v8, v9, 0, 16
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v2, 31, v1
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6
|
|
; VI-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[17:18], v[19:20] offset0:6 offset1:7
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[8:9], v[15:16] offset0:4 offset1:5
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[6:7], v[13:14] offset0:2 offset1:3
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v11, v[4:5], v[1:2] offset1:1
|
|
; VI-NO-DS128-NEXT: s_endpgm
|
|
;
|
|
; GFX9-NO-DS128-LABEL: local_sextload_v32i16_to_v32i64:
|
|
; GFX9-NO-DS128: ; %bb.0:
|
|
; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v8, s1
|
|
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[4:7], v8 offset0:6 offset1:7
|
|
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v8 offset0:4 offset1:5
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v15, s0
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(1)
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v9, 16, v7
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v9, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v7, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18
|
|
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[11:14], v8 offset1:1
|
|
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[7:10], v8 offset0:2 offset1:3
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[18:19], v[16:17] offset0:30 offset1:31
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v16, 16, v6
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v16, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v6, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v5
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[18:19], v[16:17] offset0:28 offset1:29
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v6, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v5, v5, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v6, 31, v5
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[5:6], v[16:17] offset0:26 offset1:27
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v17, 16, v4
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v17, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(5)
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v3
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[16:17], v[18:19] offset0:24 offset1:25
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v3, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[3:4], v[16:17] offset0:22 offset1:23
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v2
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v3, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v2, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v1
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[16:17], v[3:4] offset0:20 offset1:21
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v1, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v0
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[16:17], v[2:3] offset0:18 offset1:19
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v0, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v4, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[3:4], v[16:17] offset0:16 offset1:17
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(8)
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v10
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v3, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v10, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[16:17], v[3:4] offset0:14 offset1:15
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v9
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v3, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v9, v9, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v8
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[9:10], v[3:4] offset0:12 offset1:13
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v0, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v8, v8, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v20, 16, v12
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[8:9], v[3:4] offset0:10 offset1:11
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v7
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v1, v20, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v7, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v20, v4, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v18, 16, v13
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v19, 16, v14
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, v14
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v21, 31, v20
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16
|
|
; GFX9-NO-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v11
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v8, v12, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v12, v18, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v18, v19, 0, 16
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[16:17], v[20:21] offset0:8 offset1:9
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v16, v0, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v5, v5, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v3, v11, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_bfe_i32 v10, v13, 0, 16
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v19, 31, v18
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v6, 31, v5
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v2, 31, v1
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v13, 31, v12
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v4, 31, v3
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8
|
|
; GFX9-NO-DS128-NEXT: v_ashrrev_i32_e32 v11, 31, v10
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[16:17], v[18:19] offset0:6 offset1:7
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[10:11], v[12:13] offset0:4 offset1:5
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[8:9], v[1:2] offset0:2 offset1:3
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v15, v[3:4], v[5:6] offset1:1
|
|
; GFX9-NO-DS128-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: local_sextload_v32i16_to_v32i64:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: ALU 107, @52, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T0.Y, OQAP,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T1.Y, OQAP,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T1.Z, OQAP,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T1.W, OQAP,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 28(3.923636e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T2.Y, OQAP,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T2.Z, OQAP,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 36(5.044674e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T2.W, OQAP,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T3.Y, OQAP,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 44(6.165713e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T3.Z, OQAP,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 40(5.605194e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T3.W, OQAP,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 52(7.286752e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T4.Y, OQAP,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T4.Z, OQAP,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 60(8.407791e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T4.W, OQAP,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T5.Y, OQAP,
|
|
; EG-NEXT: MOV * T0.W, KC0[2].Z,
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T5.Z, OQAP,
|
|
; EG-NEXT: BFE_INT T0.W, T5.Y, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T5.W, KC0[2].Z, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 56(7.847271e-44)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T5.W
|
|
; EG-NEXT: MOV * T5.W, OQAP,
|
|
; EG-NEXT: BFE_INT T0.Z, T5.Z, 0.0, literal.x,
|
|
; EG-NEXT: ASHR T6.W, T0.W, literal.y,
|
|
; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z,
|
|
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
|
|
; EG-NEXT: 20(2.802597e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T7.W, T6.W,
|
|
; EG-NEXT: BFE_INT T6.Z, T0.Y, 0.0, literal.x,
|
|
; EG-NEXT: ASHR T6.W, T0.Z, literal.y,
|
|
; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z,
|
|
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
|
|
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T7.W, T6.W,
|
|
; EG-NEXT: BFE_INT T7.Z, T1.Y, 0.0, literal.x,
|
|
; EG-NEXT: ASHR T6.W, T6.Z, literal.y,
|
|
; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z,
|
|
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
|
|
; EG-NEXT: 52(7.286752e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T7.W, T6.W,
|
|
; EG-NEXT: BFE_INT T8.Z, T1.Z, 0.0, literal.x,
|
|
; EG-NEXT: ASHR T6.W, T7.Z, literal.y, BS:VEC_120/SCL_212
|
|
; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z,
|
|
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
|
|
; EG-NEXT: 36(5.044674e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T7.W, T6.W,
|
|
; EG-NEXT: BFE_INT T9.Z, T1.W, 0.0, literal.x,
|
|
; EG-NEXT: ASHR T6.W, T8.Z, literal.y,
|
|
; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z,
|
|
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
|
|
; EG-NEXT: 84(1.177091e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T7.W, T6.W,
|
|
; EG-NEXT: BFE_INT T10.Z, T2.Y, 0.0, literal.x,
|
|
; EG-NEXT: ASHR T6.W, T9.Z, literal.y,
|
|
; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z,
|
|
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
|
|
; EG-NEXT: 68(9.528830e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T7.W, T6.W,
|
|
; EG-NEXT: BFE_INT T11.Z, T2.Z, 0.0, literal.x,
|
|
; EG-NEXT: ASHR T6.W, T10.Z, literal.y, BS:VEC_120/SCL_212
|
|
; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z,
|
|
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
|
|
; EG-NEXT: 116(1.625506e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T7.W, T6.W,
|
|
; EG-NEXT: BFE_INT * T12.Z, T2.W, 0.0, literal.x,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: ALU 98, @53, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: ASHR T6.W, T11.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 31(4.344025e-44), 100(1.401298e-43)
|
|
; EG-NEXT: LDS_WRITE * T7.W, T6.W,
|
|
; EG-NEXT: BFE_INT T13.Z, T3.Y, 0.0, literal.x,
|
|
; EG-NEXT: ASHR T6.W, T12.Z, literal.y,
|
|
; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z,
|
|
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
|
|
; EG-NEXT: 148(2.073922e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T7.W, T6.W,
|
|
; EG-NEXT: BFE_INT T14.Z, T3.Z, 0.0, literal.x,
|
|
; EG-NEXT: ASHR T6.W, T13.Z, literal.y, BS:VEC_120/SCL_212
|
|
; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z,
|
|
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
|
|
; EG-NEXT: 132(1.849714e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T7.W, T6.W,
|
|
; EG-NEXT: BFE_INT T15.Z, T3.W, 0.0, literal.x,
|
|
; EG-NEXT: ASHR T6.W, T14.Z, literal.y,
|
|
; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z,
|
|
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
|
|
; EG-NEXT: 180(2.522337e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T7.W, T6.W,
|
|
; EG-NEXT: BFE_INT T16.Z, T4.Y, 0.0, literal.x,
|
|
; EG-NEXT: ASHR T6.W, T15.Z, literal.y,
|
|
; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z,
|
|
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
|
|
; EG-NEXT: 164(2.298129e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T7.W, T6.W,
|
|
; EG-NEXT: BFE_INT T17.Z, T4.Z, 0.0, literal.x,
|
|
; EG-NEXT: ASHR T6.W, T16.Z, literal.y, BS:VEC_120/SCL_212
|
|
; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z,
|
|
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
|
|
; EG-NEXT: 212(2.970753e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T7.W, T6.W,
|
|
; EG-NEXT: BFE_INT T18.Z, T4.W, 0.0, literal.x,
|
|
; EG-NEXT: ASHR T6.W, T17.Z, literal.y,
|
|
; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z,
|
|
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
|
|
; EG-NEXT: 196(2.746545e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T7.W, T6.W,
|
|
; EG-NEXT: BFE_INT T19.Z, T5.W, 0.0, literal.x,
|
|
; EG-NEXT: ASHR T6.W, T18.Z, literal.y,
|
|
; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.z,
|
|
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
|
|
; EG-NEXT: 244(3.419168e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T7.W, T6.W,
|
|
; EG-NEXT: ASHR T6.W, T19.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 31(4.344025e-44), 228(3.194960e-43)
|
|
; EG-NEXT: LDS_WRITE * T7.W, T6.W,
|
|
; EG-NEXT: ASHR T6.W, T5.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 31(4.344025e-44), 28(3.923636e-44)
|
|
; EG-NEXT: LDS_WRITE * T7.W, T6.W,
|
|
; EG-NEXT: ASHR T6.W, T5.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
|
|
; EG-NEXT: LDS_WRITE * T7.W, T6.W,
|
|
; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T6.W, T0.W,
|
|
; EG-NEXT: ASHR T0.W, T5.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 31(4.344025e-44), 12(1.681558e-44)
|
|
; EG-NEXT: LDS_WRITE * T6.W, T0.W,
|
|
; EG-NEXT: ASHR T0.W, T5.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44)
|
|
; EG-NEXT: LDS_WRITE * T6.W, T0.W,
|
|
; EG-NEXT: MOV * T0.W, KC0[2].Y,
|
|
; EG-NEXT: LDS_WRITE * T0.W, T0.Z,
|
|
; EG-NEXT: ASHR T0.W, T0.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 31(4.344025e-44), 60(8.407791e-44)
|
|
; EG-NEXT: LDS_WRITE * T6.W, T0.W,
|
|
; EG-NEXT: ASHR T0.W, T0.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 56(7.847271e-44)
|
|
; EG-NEXT: LDS_WRITE * T6.W, T0.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T6.Z,
|
|
; EG-NEXT: ASHR T0.W, T1.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 31(4.344025e-44), 44(6.165713e-44)
|
|
; EG-NEXT: LDS_WRITE * T6.W, T0.W,
|
|
; EG-NEXT: ASHR T0.W, T1.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 40(5.605194e-44)
|
|
; EG-NEXT: LDS_WRITE * T6.W, T0.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T7.Z,
|
|
; EG-NEXT: ASHR T0.W, T1.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 31(4.344025e-44), 92(1.289195e-43)
|
|
; EG-NEXT: LDS_WRITE * T6.W, T0.W,
|
|
; EG-NEXT: ASHR * T0.W, T1.Z, literal.x,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: ALU 99, @54, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 88(1.233143e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T6.W, T0.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 80(1.121039e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T8.Z,
|
|
; EG-NEXT: ASHR T0.W, T1.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 31(4.344025e-44), 76(1.064987e-43)
|
|
; EG-NEXT: LDS_WRITE * T6.W, T0.W,
|
|
; EG-NEXT: ASHR T0.W, T1.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 72(1.008935e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 64(8.968310e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T9.Z,
|
|
; EG-NEXT: ASHR T0.W, T2.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 31(4.344025e-44), 124(1.737610e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: ASHR T0.W, T2.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 120(1.681558e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 112(1.569454e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T10.Z,
|
|
; EG-NEXT: ASHR T0.W, T2.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 31(4.344025e-44), 108(1.513402e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: ASHR T0.W, T2.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 104(1.457350e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 96(1.345247e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T11.Z,
|
|
; EG-NEXT: ASHR T0.W, T2.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 31(4.344025e-44), 156(2.186026e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: ASHR T0.W, T2.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 152(2.129974e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 144(2.017870e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T12.Z,
|
|
; EG-NEXT: ASHR T0.W, T3.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 31(4.344025e-44), 140(1.961818e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: ASHR T0.W, T3.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 136(1.905766e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 128(1.793662e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T13.Z,
|
|
; EG-NEXT: ASHR T0.W, T3.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 31(4.344025e-44), 188(2.634441e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: ASHR T0.W, T3.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 184(2.578389e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 176(2.466285e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T14.Z,
|
|
; EG-NEXT: ASHR T0.W, T3.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 31(4.344025e-44), 172(2.410233e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: ASHR T0.W, T3.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 168(2.354181e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 160(2.242078e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T15.Z,
|
|
; EG-NEXT: ASHR T0.W, T4.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 31(4.344025e-44), 220(3.082857e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: ASHR T0.W, T4.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 216(3.026805e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 208(2.914701e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T16.Z,
|
|
; EG-NEXT: ASHR T0.W, T4.Z, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 31(4.344025e-44), 204(2.858649e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: ASHR * T0.W, T4.Z, literal.x,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: ALU 27, @55, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 200(2.802597e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 192(2.690493e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T17.Z,
|
|
; EG-NEXT: ASHR T0.W, T4.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 31(4.344025e-44), 252(3.531272e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: ASHR T0.W, T4.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 248(3.475220e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 240(3.363116e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T18.Z,
|
|
; EG-NEXT: ASHR T0.W, T5.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 31(4.344025e-44), 236(3.307064e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: ASHR T0.W, T5.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 232(3.251012e-43)
|
|
; EG-NEXT: LDS_WRITE * T1.W, T0.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 224(3.138909e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T19.Z,
|
|
; EG-NEXT: RETURN
|
|
;
|
|
; VI-DS128-LABEL: local_sextload_v32i16_to_v32i64:
|
|
; VI-DS128: ; %bb.0:
|
|
; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; VI-DS128-NEXT: s_mov_b32 m0, -1
|
|
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v4, s1
|
|
; VI-DS128-NEXT: ds_read_b128 v[0:3], v4 offset:48
|
|
; VI-DS128-NEXT: ds_read_b128 v[9:12], v4 offset:32
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v8, s0
|
|
; VI-DS128-NEXT: ds_read_b128 v[17:20], v4 offset:16
|
|
; VI-DS128-NEXT: ds_read_b128 v[4:7], v4
|
|
; VI-DS128-NEXT: s_waitcnt lgkmcnt(3)
|
|
; VI-DS128-NEXT: v_bfe_i32 v13, v2, 0, 16
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v2
|
|
; VI-DS128-NEXT: v_bfe_i32 v15, v2, 0, 16
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v2, v3
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v3, 16, v3
|
|
; VI-DS128-NEXT: ds_write_b128 v8, v[13:16] offset:224
|
|
; VI-DS128-NEXT: v_bfe_i32 v13, v2, 0, 16
|
|
; VI-DS128-NEXT: v_bfe_i32 v15, v3, 0, 16
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v0
|
|
; VI-DS128-NEXT: ds_write_b128 v8, v[13:16] offset:240
|
|
; VI-DS128-NEXT: v_bfe_i32 v15, v2, 0, 16
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v1
|
|
; VI-DS128-NEXT: v_bfe_i32 v13, v0, 0, 16
|
|
; VI-DS128-NEXT: v_bfe_i32 v0, v1, 0, 16
|
|
; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2
|
|
; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:208
|
|
; VI-DS128-NEXT: s_waitcnt lgkmcnt(5)
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v11
|
|
; VI-DS128-NEXT: v_bfe_i32 v0, v11, 0, 16
|
|
; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2
|
|
; VI-DS128-NEXT: ds_write_b128 v8, v[13:16] offset:192
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v13, v12
|
|
; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:160
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v12
|
|
; VI-DS128-NEXT: v_bfe_i32 v0, v13, 0, 16
|
|
; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v10
|
|
; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:176
|
|
; VI-DS128-NEXT: v_bfe_i32 v0, v9, 0, 16
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v9
|
|
; VI-DS128-NEXT: v_bfe_i32 v9, v10, 0, 16
|
|
; VI-DS128-NEXT: v_bfe_i32 v11, v11, 0, 16
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11
|
|
; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16
|
|
; VI-DS128-NEXT: ds_write_b128 v8, v[9:12] offset:144
|
|
; VI-DS128-NEXT: s_waitcnt lgkmcnt(8)
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v11, 16, v19
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2
|
|
; VI-DS128-NEXT: v_bfe_i32 v9, v19, 0, 16
|
|
; VI-DS128-NEXT: v_bfe_i32 v11, v11, 0, 16
|
|
; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:128
|
|
; VI-DS128-NEXT: s_waitcnt lgkmcnt(8)
|
|
; VI-DS128-NEXT: v_bfe_i32 v0, v5, 0, 16
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v5
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v5, v20
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11
|
|
; VI-DS128-NEXT: ds_write_b128 v8, v[9:12] offset:96
|
|
; VI-DS128-NEXT: v_bfe_i32 v9, v5, 0, 16
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v20
|
|
; VI-DS128-NEXT: v_bfe_i32 v11, v5, 0, 16
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v17
|
|
; VI-DS128-NEXT: ds_write_b128 v8, v[9:12] offset:112
|
|
; VI-DS128-NEXT: v_bfe_i32 v9, v17, 0, 16
|
|
; VI-DS128-NEXT: v_bfe_i32 v11, v5, 0, 16
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v5, 16, v18
|
|
; VI-DS128-NEXT: ds_write_b128 v8, v[9:12] offset:64
|
|
; VI-DS128-NEXT: v_bfe_i32 v9, v4, 0, 16
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v4
|
|
; VI-DS128-NEXT: v_bfe_i32 v13, v18, 0, 16
|
|
; VI-DS128-NEXT: v_bfe_i32 v15, v5, 0, 16
|
|
; VI-DS128-NEXT: v_bfe_i32 v11, v4, 0, 16
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v4, v7
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15
|
|
; VI-DS128-NEXT: ds_write_b128 v8, v[13:16] offset:80
|
|
; VI-DS128-NEXT: v_bfe_i32 v13, v4, 0, 16
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v4, 16, v7
|
|
; VI-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v6
|
|
; VI-DS128-NEXT: v_bfe_i32 v15, v4, 0, 16
|
|
; VI-DS128-NEXT: v_bfe_i32 v4, v6, 0, 16
|
|
; VI-DS128-NEXT: v_bfe_i32 v6, v7, 0, 16
|
|
; VI-DS128-NEXT: v_bfe_i32 v2, v2, 0, 16
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v1, 31, v0
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v10, 31, v9
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v12, 31, v11
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
|
|
; VI-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15
|
|
; VI-DS128-NEXT: ds_write_b128 v8, v[4:7] offset:32
|
|
; VI-DS128-NEXT: ds_write_b128 v8, v[13:16] offset:48
|
|
; VI-DS128-NEXT: ds_write_b128 v8, v[9:12]
|
|
; VI-DS128-NEXT: ds_write_b128 v8, v[0:3] offset:16
|
|
; VI-DS128-NEXT: s_endpgm
|
|
;
|
|
; GFX9-DS128-LABEL: local_sextload_v32i16_to_v32i64:
|
|
; GFX9-DS128: ; %bb.0:
|
|
; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v13, s1
|
|
; GFX9-DS128-NEXT: ds_read_b128 v[4:7], v13 offset:48
|
|
; GFX9-DS128-NEXT: ds_read_b128 v[0:3], v13 offset:32
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v12, s0
|
|
; GFX9-DS128-NEXT: ds_read_b128 v[8:11], v13
|
|
; GFX9-DS128-NEXT: ds_read_b128 v[18:21], v13 offset:16
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(3)
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v14, v6, 0, 16
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v6
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v16, v6, 0, 16
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v15, 31, v14
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v17, 31, v16
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v6, v7
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v7, 16, v7
|
|
; GFX9-DS128-NEXT: ds_write_b128 v12, v[14:17] offset:224
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v13, v6, 0, 16
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v15, v7, 0, 16
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v4
|
|
; GFX9-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:240
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v15, v6, 0, 16
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v5
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v13, v4, 0, 16
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v4, v5, 0, 16
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v6, v6, 0, 16
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6
|
|
; GFX9-DS128-NEXT: ds_write_b128 v12, v[4:7] offset:208
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(5)
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v4, v2, 0, 16
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v2
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v6, v2, 0, 16
|
|
; GFX9-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:192
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v13, v3
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v2, 16, v3
|
|
; GFX9-DS128-NEXT: ds_write_b128 v12, v[4:7] offset:160
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v4, v13, 0, 16
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v6, v2, 0, 16
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6
|
|
; GFX9-DS128-NEXT: ds_write_b128 v12, v[4:7] offset:176
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v6, 16, v1
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v2, v0, 0, 16
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v13, v1, 0, 16
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v15, v6, 0, 16
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v4, v0, 0, 16
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(6)
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v20
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v9
|
|
; GFX9-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:144
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v13, v20, 0, 16
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v15, v1, 0, 16
|
|
; GFX9-DS128-NEXT: ds_write_b128 v12, v[2:5] offset:128
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v4, v0, 0, 16
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v0, v21
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15
|
|
; GFX9-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:96
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v13, v0, 0, 16
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v21
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v15, v0, 0, 16
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v18
|
|
; GFX9-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:112
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v13, v18, 0, 16
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v15, v0, 0, 16
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v1, 16, v19
|
|
; GFX9-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:64
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v8
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v13, v19, 0, 16
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v15, v1, 0, 16
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v6, v8, 0, 16
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v8, v0, 0, 16
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v0, v11
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15
|
|
; GFX9-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:80
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v13, v0, 0, 16
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v11
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v15, v0, 0, 16
|
|
; GFX9-DS128-NEXT: v_lshrrev_b32_e32 v0, 16, v10
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v17, v10, 0, 16
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v19, v0, 0, 16
|
|
; GFX9-DS128-NEXT: v_bfe_i32 v2, v9, 0, 16
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v18, 31, v17
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v20, 31, v19
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v3, 31, v2
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v5, 31, v4
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v7, 31, v6
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v9, 31, v8
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v14, 31, v13
|
|
; GFX9-DS128-NEXT: v_ashrrev_i32_e32 v16, 31, v15
|
|
; GFX9-DS128-NEXT: ds_write_b128 v12, v[17:20] offset:32
|
|
; GFX9-DS128-NEXT: ds_write_b128 v12, v[13:16] offset:48
|
|
; GFX9-DS128-NEXT: ds_write_b128 v12, v[6:9]
|
|
; GFX9-DS128-NEXT: ds_write_b128 v12, v[2:5] offset:16
|
|
; GFX9-DS128-NEXT: s_endpgm
|
|
%load = load <32 x i16>, ptr addrspace(3) %in
|
|
%ext = sext <32 x i16> %load to <32 x i64>
|
|
store <32 x i64> %ext, ptr addrspace(3) %out
|
|
ret void
|
|
}
|
|
|
|
; ; XFUNC-LABEL: {{^}}local_zextload_v64i16_to_v64i64:
|
|
; define amdgpu_kernel void @local_zextload_v64i16_to_v64i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
|
|
; %load = load <64 x i16>, ptr addrspace(3) %in
|
|
; %ext = zext <64 x i16> %load to <64 x i64>
|
|
; store <64 x i64> %ext, ptr addrspace(3) %out
|
|
; ret void
|
|
; }
|
|
|
|
; ; XFUNC-LABEL: {{^}}local_sextload_v64i16_to_v64i64:
|
|
; define amdgpu_kernel void @local_sextload_v64i16_to_v64i64(ptr addrspace(3) %out, ptr addrspace(3) %in) #0 {
|
|
; %load = load <64 x i16>, ptr addrspace(3) %in
|
|
; %ext = sext <64 x i16> %load to <64 x i64>
|
|
; store <64 x i64> %ext, ptr addrspace(3) %out
|
|
; ret void
|
|
; }
|
|
|
|
; Tests that ds_read_b128/ds_write_b128 are generated for a 16-byte-aligned load and store when ds128 is enabled.
|
|
; Copies one <8 x i16> value from %in to %out (both addrspace(3) pointers);
; the load and the store each carry align 16.
; Expected code per configuration, as pinned by the assertions below:
;   - ds128-enabled tonga/gfx900 runs: one ds_read_b128 and one ds_write_b128
;   - SI and the ds128-disabled tonga/gfx900 runs: ds_read2_b64 / ds_write2_b64
;   - r600/redwood run: four LDS_READ_RET / LDS_WRITE pairs
define amdgpu_kernel void @local_v8i16_to_128(ptr addrspace(3) %out, ptr addrspace(3) %in) {
|
|
; SI-LABEL: local_v8i16_to_128:
|
|
; SI: ; %bb.0:
|
|
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: v_mov_b32_e32 v0, s1
|
|
; SI-NEXT: s_mov_b32 m0, -1
|
|
; SI-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
|
|
; SI-NEXT: v_mov_b32_e32 v4, s0
|
|
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; SI-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
|
|
; SI-NEXT: s_endpgm
|
|
;
|
|
; VI-NO-DS128-LABEL: local_v8i16_to_128:
|
|
; VI-NO-DS128: ; %bb.0:
|
|
; VI-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; VI-NO-DS128-NEXT: s_mov_b32 m0, -1
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v0, s1
|
|
; VI-NO-DS128-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
|
|
; VI-NO-DS128-NEXT: v_mov_b32_e32 v4, s0
|
|
; VI-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-NO-DS128-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
|
|
; VI-NO-DS128-NEXT: s_endpgm
|
|
;
|
|
; GFX9-NO-DS128-LABEL: local_v8i16_to_128:
|
|
; GFX9-NO-DS128: ; %bb.0:
|
|
; GFX9-NO-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v0, s1
|
|
; GFX9-NO-DS128-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
|
|
; GFX9-NO-DS128-NEXT: v_mov_b32_e32 v4, s0
|
|
; GFX9-NO-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NO-DS128-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
|
|
; GFX9-NO-DS128-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: local_v8i16_to_128:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: ALU 25, @56, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: MOV * T0.W, KC0[2].Z,
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T0.X, OQAP,
|
|
; EG-NEXT: MOV * T0.W, KC0[2].Y,
|
|
; EG-NEXT: LDS_WRITE * T0.W, T0.X,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T0.X, OQAP,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T0.X,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T0.X, OQAP,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T0.X,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Z, literal.x,
|
|
; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_READ_RET * OQAP, T0.W
|
|
; EG-NEXT: MOV T0.X, OQAP,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 12(1.681558e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LDS_WRITE * T0.W, T0.X,
|
|
; EG-NEXT: RETURN
|
|
;
|
|
; VI-DS128-LABEL: local_v8i16_to_128:
|
|
; VI-DS128: ; %bb.0:
|
|
; VI-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; VI-DS128-NEXT: s_mov_b32 m0, -1
|
|
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v0, s1
|
|
; VI-DS128-NEXT: ds_read_b128 v[0:3], v0
|
|
; VI-DS128-NEXT: v_mov_b32_e32 v4, s0
|
|
; VI-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; VI-DS128-NEXT: ds_write_b128 v4, v[0:3]
|
|
; VI-DS128-NEXT: s_endpgm
|
|
;
|
|
; GFX9-DS128-LABEL: local_v8i16_to_128:
|
|
; GFX9-DS128: ; %bb.0:
|
|
; GFX9-DS128-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v0, s1
|
|
; GFX9-DS128-NEXT: ds_read_b128 v[0:3], v0
|
|
; GFX9-DS128-NEXT: v_mov_b32_e32 v4, s0
|
|
; GFX9-DS128-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-DS128-NEXT: ds_write_b128 v4, v[0:3]
|
|
; GFX9-DS128-NEXT: s_endpgm
|
|
; The explicit align 16 on both accesses is the property this test exercises.
%ld = load <8 x i16>, ptr addrspace(3) %in, align 16
|
|
store <8 x i16> %ld, ptr addrspace(3) %out, align 16
|
|
ret void
|
|
}
|
|
|
|
; Attribute group #0: nounwind. In the nearby part of the file it is referenced
; only by the commented-out v64i16 tests; other uses may exist elsewhere.
attributes #0 = { nounwind }
|