
Similar to InstCombinerImpl::freezeOtherUses, attempt to ensure that we merge multiple frozen/unfrozen uses of an SDValue. This fixes a number of hasOneUse() problems when trying to push FREEZE nodes through the DAG. Remove the SimplifyMultipleUseDemandedBits handling of FREEZE nodes, as we now want to keep the common node rather than bypass it for some nodes just because of DemandedElts. Fixes #149799
13663 lines
606 KiB
LLVM
13663 lines
606 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
|
|
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck -allow-deprecated-dag-overlap --check-prefix=GCN-NOHSA-SI %s
|
|
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck -allow-deprecated-dag-overlap --check-prefix=GCN-HSA %s
|
|
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -allow-deprecated-dag-overlap --check-prefix=GCN-NOHSA-VI %s
|
|
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck -allow-deprecated-dag-overlap --check-prefix=EG %s
|
|
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=cayman < %s | FileCheck -allow-deprecated-dag-overlap --check-prefix=CM %s
|
|
|
|
; TODO: NOT AND
|
|
; Loads a single i8 from %in and stores it unchanged to %out (both pointers are
; in addrspace(1)). The CHECK lines are autogenerated (see the NOTE at the top
; of the file); regenerate them with the script instead of editing by hand.
define amdgpu_kernel void @global_load_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
|
; GCN-NOHSA-SI-LABEL: global_load_i8:
|
|
; GCN-NOHSA-SI: ; %bb.0: ; %entry
|
|
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
|
|
; GCN-NOHSA-SI-NEXT: s_endpgm
|
|
;
|
|
; GCN-HSA-LABEL: global_load_i8:
|
|
; GCN-HSA: ; %bb.0: ; %entry
|
|
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
|
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
|
|
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
|
|
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
|
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN-HSA-NEXT: flat_load_ubyte v2, v[0:1]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-HSA-NEXT: flat_store_byte v[0:1], v2
|
|
; GCN-HSA-NEXT: s_endpgm
|
|
;
|
|
; GCN-NOHSA-VI-LABEL: global_load_i8:
|
|
; GCN-NOHSA-VI: ; %bb.0: ; %entry
|
|
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NOHSA-VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
|
|
; GCN-NOHSA-VI-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: global_load_i8:
|
|
; EG: ; %bb.0: ; %entry
|
|
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: TEX 0 @6
|
|
; EG-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
|
|
; EG-NEXT: CF_END
|
|
; EG-NEXT: PAD
|
|
; EG-NEXT: Fetch clause starting at 6:
|
|
; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
|
|
; EG-NEXT: ALU clause starting at 8:
|
|
; EG-NEXT: MOV * T0.X, KC0[2].Z,
|
|
; EG-NEXT: ALU clause starting at 9:
|
|
; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: AND_INT * T1.W, T0.X, literal.y,
|
|
; EG-NEXT: 3(4.203895e-45), 255(3.573311e-43)
|
|
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
|
|
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
|
|
; EG-NEXT: LSHL T0.X, T1.W, PV.W,
|
|
; EG-NEXT: LSHL * T0.W, literal.x, PV.W,
|
|
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
|
|
; EG-NEXT: MOV T0.Y, 0.0,
|
|
; EG-NEXT: MOV * T0.Z, 0.0,
|
|
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
;
|
|
; CM-LABEL: global_load_i8:
|
|
; CM: ; %bb.0: ; %entry
|
|
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: TEX 0 @6
|
|
; CM-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: MEM_RAT MSKOR T0.XW, T1.X
|
|
; CM-NEXT: CF_END
|
|
; CM-NEXT: PAD
|
|
; CM-NEXT: Fetch clause starting at 6:
|
|
; CM-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
|
|
; CM-NEXT: ALU clause starting at 8:
|
|
; CM-NEXT: MOV * T0.X, KC0[2].Z,
|
|
; CM-NEXT: ALU clause starting at 9:
|
|
; CM-NEXT: AND_INT * T0.W, KC0[2].Y, literal.x,
|
|
; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, T0.X, literal.x,
|
|
; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
|
|
; CM-NEXT: 255(3.573311e-43), 3(4.203895e-45)
|
|
; CM-NEXT: LSHL T0.X, PV.Z, PV.W,
|
|
; CM-NEXT: LSHL * T0.W, literal.x, PV.W,
|
|
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
|
|
; CM-NEXT: MOV T0.Y, 0.0,
|
|
; CM-NEXT: MOV * T0.Z, 0.0,
|
|
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
|
|
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
entry:
|
|
%ld = load i8, ptr addrspace(1) %in
|
|
store i8 %ld, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
; Loads a <2 x i8> vector from %in and stores it unchanged to %out (both
; pointers are in addrspace(1)). The CHECK lines are autogenerated (see the NOTE
; at the top of the file); regenerate them instead of editing by hand.
define amdgpu_kernel void @global_load_v2i8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
|
; GCN-NOHSA-SI-LABEL: global_load_v2i8:
|
|
; GCN-NOHSA-SI: ; %bb.0: ; %entry
|
|
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_short v0, off, s[4:7], 0
|
|
; GCN-NOHSA-SI-NEXT: s_endpgm
|
|
;
|
|
; GCN-HSA-LABEL: global_load_v2i8:
|
|
; GCN-HSA: ; %bb.0: ; %entry
|
|
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
|
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
|
|
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
|
|
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
|
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN-HSA-NEXT: flat_load_ushort v2, v[0:1]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-HSA-NEXT: flat_store_short v[0:1], v2
|
|
; GCN-HSA-NEXT: s_endpgm
|
|
;
|
|
; GCN-NOHSA-VI-LABEL: global_load_v2i8:
|
|
; GCN-NOHSA-VI: ; %bb.0: ; %entry
|
|
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_short v0, off, s[4:7], 0
|
|
; GCN-NOHSA-VI-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: global_load_v2i8:
|
|
; EG: ; %bb.0: ; %entry
|
|
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: TEX 0 @6
|
|
; EG-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
|
|
; EG-NEXT: CF_END
|
|
; EG-NEXT: PAD
|
|
; EG-NEXT: Fetch clause starting at 6:
|
|
; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
|
|
; EG-NEXT: ALU clause starting at 8:
|
|
; EG-NEXT: MOV * T0.X, KC0[2].Z,
|
|
; EG-NEXT: ALU clause starting at 9:
|
|
; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: AND_INT * T1.W, T0.X, literal.y,
|
|
; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41)
|
|
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
|
|
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
|
|
; EG-NEXT: LSHL T0.X, T1.W, PV.W,
|
|
; EG-NEXT: LSHL * T0.W, literal.x, PV.W,
|
|
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
|
|
; EG-NEXT: MOV T0.Y, 0.0,
|
|
; EG-NEXT: MOV * T0.Z, 0.0,
|
|
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
;
|
|
; CM-LABEL: global_load_v2i8:
|
|
; CM: ; %bb.0: ; %entry
|
|
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: TEX 0 @6
|
|
; CM-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: MEM_RAT MSKOR T0.XW, T1.X
|
|
; CM-NEXT: CF_END
|
|
; CM-NEXT: PAD
|
|
; CM-NEXT: Fetch clause starting at 6:
|
|
; CM-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
|
|
; CM-NEXT: ALU clause starting at 8:
|
|
; CM-NEXT: MOV * T0.X, KC0[2].Z,
|
|
; CM-NEXT: ALU clause starting at 9:
|
|
; CM-NEXT: AND_INT * T0.W, KC0[2].Y, literal.x,
|
|
; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, T0.X, literal.x,
|
|
; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
|
|
; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
|
|
; CM-NEXT: LSHL T0.X, PV.Z, PV.W,
|
|
; CM-NEXT: LSHL * T0.W, literal.x, PV.W,
|
|
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
|
|
; CM-NEXT: MOV T0.Y, 0.0,
|
|
; CM-NEXT: MOV * T0.Z, 0.0,
|
|
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
|
|
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
entry:
|
|
%ld = load <2 x i8>, ptr addrspace(1) %in
|
|
store <2 x i8> %ld, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
; Loads a <3 x i8> vector from %in and stores it unchanged to %out (both
; pointers are in addrspace(1)). In the GCN checks below the load is a single
; dword while the store is split into a short plus a byte at offset 2.
; The CHECK lines are autogenerated (see the NOTE at the top of the file);
; regenerate them instead of editing by hand.
define amdgpu_kernel void @global_load_v3i8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
|
; GCN-NOHSA-SI-LABEL: global_load_v3i8:
|
|
; GCN-NOHSA-SI: ; %bb.0: ; %entry
|
|
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_short v0, off, s[4:7], 0
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_byte v1, off, s[4:7], 0 offset:2
|
|
; GCN-NOHSA-SI-NEXT: s_endpgm
|
|
;
|
|
; GCN-HSA-LABEL: global_load_v3i8:
|
|
; GCN-HSA: ; %bb.0: ; %entry
|
|
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
|
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
|
|
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
|
|
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
|
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN-HSA-NEXT: flat_load_dword v2, v[0:1]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN-HSA-NEXT: s_add_u32 s0, s0, 2
|
|
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
|
|
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-HSA-NEXT: flat_store_short v[0:1], v2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN-HSA-NEXT: flat_store_byte v[0:1], v3
|
|
; GCN-HSA-NEXT: s_endpgm
|
|
;
|
|
; GCN-NOHSA-VI-LABEL: global_load_v3i8:
|
|
; GCN-NOHSA-VI: ; %bb.0: ; %entry
|
|
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_short v0, off, s[4:7], 0
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_byte v1, off, s[4:7], 0 offset:2
|
|
; GCN-NOHSA-VI-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: global_load_v3i8:
|
|
; EG: ; %bb.0: ; %entry
|
|
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: TEX 0 @6
|
|
; EG-NEXT: ALU 27, @9, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: MEM_RAT MSKOR T6.XW, T8.X
|
|
; EG-NEXT: MEM_RAT MSKOR T5.XW, T7.X
|
|
; EG-NEXT: CF_END
|
|
; EG-NEXT: Fetch clause starting at 6:
|
|
; EG-NEXT: VTX_READ_32 T5.X, T5.X, 0, #1
|
|
; EG-NEXT: ALU clause starting at 8:
|
|
; EG-NEXT: MOV * T5.X, KC0[2].Z,
|
|
; EG-NEXT: ALU clause starting at 9:
|
|
; EG-NEXT: MOV * T2.X, T5.X,
|
|
; EG-NEXT: MOV T0.Y, PV.X,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T1.W, PV.W, literal.x,
|
|
; EG-NEXT: MOV * T2.W, literal.y,
|
|
; EG-NEXT: 3(4.203895e-45), 8(1.121039e-44)
|
|
; EG-NEXT: BFE_UINT T2.W, T0.Y, literal.x, PS,
|
|
; EG-NEXT: LSHL * T1.W, PV.W, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 3(4.203895e-45)
|
|
; EG-NEXT: LSHL T6.X, PV.W, PS,
|
|
; EG-NEXT: LSHL * T6.W, literal.x, PS,
|
|
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
|
|
; EG-NEXT: MOV T6.Y, 0.0,
|
|
; EG-NEXT: AND_INT T1.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: AND_INT * T2.W, T5.X, literal.y,
|
|
; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41)
|
|
; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
|
|
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
|
|
; EG-NEXT: LSHL T5.X, T2.W, PV.W,
|
|
; EG-NEXT: LSHL * T5.W, literal.x, PV.W,
|
|
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
|
|
; EG-NEXT: MOV T5.Y, 0.0,
|
|
; EG-NEXT: MOV T6.Z, 0.0,
|
|
; EG-NEXT: MOV * T5.Z, 0.0,
|
|
; EG-NEXT: LSHR T7.X, KC0[2].Y, literal.x,
|
|
; EG-NEXT: LSHR * T8.X, T0.W, literal.x,
|
|
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
;
|
|
; CM-LABEL: global_load_v3i8:
|
|
; CM: ; %bb.0: ; %entry
|
|
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: TEX 0 @6
|
|
; CM-NEXT: ALU 28, @9, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: MEM_RAT MSKOR T6.XW, T8.X
|
|
; CM-NEXT: MEM_RAT MSKOR T5.XW, T7.X
|
|
; CM-NEXT: CF_END
|
|
; CM-NEXT: Fetch clause starting at 6:
|
|
; CM-NEXT: VTX_READ_32 T5.X, T5.X, 0, #1
|
|
; CM-NEXT: ALU clause starting at 8:
|
|
; CM-NEXT: MOV * T5.X, KC0[2].Z,
|
|
; CM-NEXT: ALU clause starting at 9:
|
|
; CM-NEXT: MOV * T2.X, T5.X,
|
|
; CM-NEXT: MOV T0.Y, PV.X,
|
|
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, PV.W, literal.x,
|
|
; CM-NEXT: MOV * T1.W, literal.y,
|
|
; CM-NEXT: 3(4.203895e-45), 8(1.121039e-44)
|
|
; CM-NEXT: BFE_UINT T1.Z, T0.Y, literal.x, PV.W,
|
|
; CM-NEXT: LSHL * T1.W, PV.Z, literal.y,
|
|
; CM-NEXT: 16(2.242078e-44), 3(4.203895e-45)
|
|
; CM-NEXT: LSHL T6.X, PV.Z, PV.W,
|
|
; CM-NEXT: LSHL * T6.W, literal.x, PV.W,
|
|
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
|
|
; CM-NEXT: MOV T6.Y, 0.0,
|
|
; CM-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
|
|
; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, T5.X, literal.x,
|
|
; CM-NEXT: LSHL * T1.W, PV.W, literal.y,
|
|
; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
|
|
; CM-NEXT: LSHL T5.X, PV.Z, PV.W,
|
|
; CM-NEXT: LSHL * T5.W, literal.x, PV.W,
|
|
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
|
|
; CM-NEXT: MOV T5.Y, 0.0,
|
|
; CM-NEXT: MOV * T6.Z, 0.0,
|
|
; CM-NEXT: MOV * T5.Z, 0.0,
|
|
; CM-NEXT: LSHR * T7.X, KC0[2].Y, literal.x,
|
|
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR * T8.X, T0.W, literal.x,
|
|
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
entry:
|
|
%ld = load <3 x i8>, ptr addrspace(1) %in
|
|
store <3 x i8> %ld, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
; Loads a <4 x i8> vector from %in and stores it unchanged to %out (both
; pointers are in addrspace(1)). The CHECK lines are autogenerated (see the NOTE
; at the top of the file); regenerate them instead of editing by hand.
define amdgpu_kernel void @global_load_v4i8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
|
; GCN-NOHSA-SI-LABEL: global_load_v4i8:
|
|
; GCN-NOHSA-SI: ; %bb.0: ; %entry
|
|
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
|
; GCN-NOHSA-SI-NEXT: s_endpgm
|
|
;
|
|
; GCN-HSA-LABEL: global_load_v4i8:
|
|
; GCN-HSA: ; %bb.0: ; %entry
|
|
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
|
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
|
|
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
|
|
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
|
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN-HSA-NEXT: flat_load_dword v2, v[0:1]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-HSA-NEXT: flat_store_dword v[0:1], v2
|
|
; GCN-HSA-NEXT: s_endpgm
|
|
;
|
|
; GCN-NOHSA-VI-LABEL: global_load_v4i8:
|
|
; GCN-NOHSA-VI: ; %bb.0: ; %entry
|
|
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
|
; GCN-NOHSA-VI-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: global_load_v4i8:
|
|
; EG: ; %bb.0: ; %entry
|
|
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: TEX 0 @6
|
|
; EG-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
|
|
; EG-NEXT: CF_END
|
|
; EG-NEXT: PAD
|
|
; EG-NEXT: Fetch clause starting at 6:
|
|
; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
|
|
; EG-NEXT: ALU clause starting at 8:
|
|
; EG-NEXT: MOV * T0.X, KC0[2].Z,
|
|
; EG-NEXT: ALU clause starting at 9:
|
|
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
;
|
|
; CM-LABEL: global_load_v4i8:
|
|
; CM: ; %bb.0: ; %entry
|
|
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: TEX 0 @6
|
|
; CM-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
|
|
; CM-NEXT: CF_END
|
|
; CM-NEXT: PAD
|
|
; CM-NEXT: Fetch clause starting at 6:
|
|
; CM-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
|
|
; CM-NEXT: ALU clause starting at 8:
|
|
; CM-NEXT: MOV * T0.X, KC0[2].Z,
|
|
; CM-NEXT: ALU clause starting at 9:
|
|
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
|
|
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
entry:
|
|
%ld = load <4 x i8>, ptr addrspace(1) %in
|
|
store <4 x i8> %ld, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
; Loads an <8 x i8> vector from %in and stores it unchanged to %out (both
; pointers are in addrspace(1)). The CHECK lines are autogenerated (see the NOTE
; at the top of the file); regenerate them instead of editing by hand.
define amdgpu_kernel void @global_load_v8i8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
|
; GCN-NOHSA-SI-LABEL: global_load_v8i8:
|
|
; GCN-NOHSA-SI: ; %bb.0: ; %entry
|
|
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
|
; GCN-NOHSA-SI-NEXT: s_endpgm
|
|
;
|
|
; GCN-HSA-LABEL: global_load_v8i8:
|
|
; GCN-HSA: ; %bb.0: ; %entry
|
|
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
|
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
|
|
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
|
|
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
|
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN-HSA-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1
|
|
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-HSA-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
|
; GCN-HSA-NEXT: s_endpgm
|
|
;
|
|
; GCN-NOHSA-VI-LABEL: global_load_v8i8:
|
|
; GCN-NOHSA-VI: ; %bb.0: ; %entry
|
|
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
|
; GCN-NOHSA-VI-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: global_load_v8i8:
|
|
; EG: ; %bb.0: ; %entry
|
|
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: TEX 0 @6
|
|
; EG-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
|
|
; EG-NEXT: CF_END
|
|
; EG-NEXT: PAD
|
|
; EG-NEXT: Fetch clause starting at 6:
|
|
; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
|
|
; EG-NEXT: ALU clause starting at 8:
|
|
; EG-NEXT: MOV * T0.X, KC0[2].Z,
|
|
; EG-NEXT: ALU clause starting at 9:
|
|
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
;
|
|
; CM-LABEL: global_load_v8i8:
|
|
; CM: ; %bb.0: ; %entry
|
|
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: TEX 0 @6
|
|
; CM-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
|
|
; CM-NEXT: CF_END
|
|
; CM-NEXT: PAD
|
|
; CM-NEXT: Fetch clause starting at 6:
|
|
; CM-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
|
|
; CM-NEXT: ALU clause starting at 8:
|
|
; CM-NEXT: MOV * T0.X, KC0[2].Z,
|
|
; CM-NEXT: ALU clause starting at 9:
|
|
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
|
|
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
entry:
|
|
%ld = load <8 x i8>, ptr addrspace(1) %in
|
|
store <8 x i8> %ld, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
; Loads a <16 x i8> vector from %in and stores it unchanged to %out (both
; pointers are in addrspace(1)). The CHECK lines are autogenerated (see the NOTE
; at the top of the file); regenerate them instead of editing by hand.
define amdgpu_kernel void @global_load_v16i8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
|
; GCN-NOHSA-SI-LABEL: global_load_v16i8:
|
|
; GCN-NOHSA-SI: ; %bb.0: ; %entry
|
|
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
|
|
; GCN-NOHSA-SI-NEXT: s_endpgm
|
|
;
|
|
; GCN-HSA-LABEL: global_load_v16i8:
|
|
; GCN-HSA: ; %bb.0: ; %entry
|
|
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
|
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
|
|
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
|
|
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
|
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
|
|
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
|
|
; GCN-HSA-NEXT: s_endpgm
|
|
;
|
|
; GCN-NOHSA-VI-LABEL: global_load_v16i8:
|
|
; GCN-NOHSA-VI: ; %bb.0: ; %entry
|
|
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
|
|
; GCN-NOHSA-VI-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: global_load_v16i8:
|
|
; EG: ; %bb.0: ; %entry
|
|
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: TEX 0 @6
|
|
; EG-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
|
|
; EG-NEXT: CF_END
|
|
; EG-NEXT: PAD
|
|
; EG-NEXT: Fetch clause starting at 6:
|
|
; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
|
|
; EG-NEXT: ALU clause starting at 8:
|
|
; EG-NEXT: MOV * T0.X, KC0[2].Z,
|
|
; EG-NEXT: ALU clause starting at 9:
|
|
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
;
|
|
; CM-LABEL: global_load_v16i8:
|
|
; CM: ; %bb.0: ; %entry
|
|
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: TEX 0 @6
|
|
; CM-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
|
|
; CM-NEXT: CF_END
|
|
; CM-NEXT: PAD
|
|
; CM-NEXT: Fetch clause starting at 6:
|
|
; CM-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
|
|
; CM-NEXT: ALU clause starting at 8:
|
|
; CM-NEXT: MOV * T0.X, KC0[2].Z,
|
|
; CM-NEXT: ALU clause starting at 9:
|
|
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
|
|
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
entry:
|
|
%ld = load <16 x i8>, ptr addrspace(1) %in
|
|
store <16 x i8> %ld, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
; Loads an i8 from %in, zero-extends it to i32, and stores the i32 to %out
; (both pointers are in addrspace(1)). Unlike the plain-load tests above, this
; function has no explicit entry label. The CHECK lines are autogenerated (see
; the NOTE at the top of the file); regenerate them instead of editing by hand.
define amdgpu_kernel void @global_zextload_i8_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
|
; GCN-NOHSA-SI-LABEL: global_zextload_i8_to_i32:
|
|
; GCN-NOHSA-SI: ; %bb.0:
|
|
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
|
; GCN-NOHSA-SI-NEXT: s_endpgm
|
|
;
|
|
; GCN-HSA-LABEL: global_zextload_i8_to_i32:
|
|
; GCN-HSA: ; %bb.0:
|
|
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
|
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
|
|
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
|
|
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
|
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN-HSA-NEXT: flat_load_ubyte v2, v[0:1]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-HSA-NEXT: flat_store_dword v[0:1], v2
|
|
; GCN-HSA-NEXT: s_endpgm
|
|
;
|
|
; GCN-NOHSA-VI-LABEL: global_zextload_i8_to_i32:
|
|
; GCN-NOHSA-VI: ; %bb.0:
|
|
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NOHSA-VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
|
; GCN-NOHSA-VI-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: global_zextload_i8_to_i32:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: TEX 0 @6
|
|
; EG-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
|
|
; EG-NEXT: CF_END
|
|
; EG-NEXT: PAD
|
|
; EG-NEXT: Fetch clause starting at 6:
|
|
; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
|
|
; EG-NEXT: ALU clause starting at 8:
|
|
; EG-NEXT: MOV * T0.X, KC0[2].Z,
|
|
; EG-NEXT: ALU clause starting at 9:
|
|
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
;
|
|
; CM-LABEL: global_zextload_i8_to_i32:
|
|
; CM: ; %bb.0:
|
|
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: TEX 0 @6
|
|
; CM-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
|
|
; CM-NEXT: CF_END
|
|
; CM-NEXT: PAD
|
|
; CM-NEXT: Fetch clause starting at 6:
|
|
; CM-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
|
|
; CM-NEXT: ALU clause starting at 8:
|
|
; CM-NEXT: MOV * T0.X, KC0[2].Z,
|
|
; CM-NEXT: ALU clause starting at 9:
|
|
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
|
|
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
%a = load i8, ptr addrspace(1) %in
|
|
%ext = zext i8 %a to i32
|
|
store i32 %ext, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
; Test case: load a single i8 from global memory (%in), sign-extend it to i32
; and store the result to %out.
; Per the autogenerated checks below, the three GCN configurations expect a
; signed byte load (buffer_load_sbyte or flat_load_sbyte) with no separate
; extension instruction, while the two r600 configurations expect VTX_READ_8
; followed by BFE_INT.
define amdgpu_kernel void @global_sextload_i8_to_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
|
; GCN-NOHSA-SI-LABEL: global_sextload_i8_to_i32:
|
|
; GCN-NOHSA-SI: ; %bb.0:
|
|
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_sbyte v0, off, s[8:11], 0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
|
; GCN-NOHSA-SI-NEXT: s_endpgm
|
|
;
|
|
; GCN-HSA-LABEL: global_sextload_i8_to_i32:
|
|
; GCN-HSA: ; %bb.0:
|
|
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
|
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
|
|
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
|
|
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
|
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN-HSA-NEXT: flat_load_sbyte v2, v[0:1]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-HSA-NEXT: flat_store_dword v[0:1], v2
|
|
; GCN-HSA-NEXT: s_endpgm
|
|
;
|
|
; GCN-NOHSA-VI-LABEL: global_sextload_i8_to_i32:
|
|
; GCN-NOHSA-VI: ; %bb.0:
|
|
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NOHSA-VI-NEXT: buffer_load_sbyte v0, off, s[8:11], 0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
|
; GCN-NOHSA-VI-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: global_sextload_i8_to_i32:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: TEX 0 @6
|
|
; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
|
|
; EG-NEXT: CF_END
|
|
; EG-NEXT: PAD
|
|
; EG-NEXT: Fetch clause starting at 6:
|
|
; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
|
|
; EG-NEXT: ALU clause starting at 8:
|
|
; EG-NEXT: MOV * T0.X, KC0[2].Z,
|
|
; EG-NEXT: ALU clause starting at 9:
|
|
; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, literal.x,
|
|
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)
|
|
;
|
|
; CM-LABEL: global_sextload_i8_to_i32:
|
|
; CM: ; %bb.0:
|
|
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: TEX 0 @6
|
|
; CM-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
|
|
; CM-NEXT: CF_END
|
|
; CM-NEXT: PAD
|
|
; CM-NEXT: Fetch clause starting at 6:
|
|
; CM-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
|
|
; CM-NEXT: ALU clause starting at 8:
|
|
; CM-NEXT: MOV * T0.X, KC0[2].Z,
|
|
; CM-NEXT: ALU clause starting at 9:
|
|
; CM-NEXT: BFE_INT * T0.X, T0.X, 0.0, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
|
|
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
; Input IR that the checks above are generated from.
%ld = load i8, ptr addrspace(1) %in
|
|
%ext = sext i8 %ld to i32
|
|
store i32 %ext, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
; Test case: load a <1 x i8> vector from global memory (%in), zero-extend it
; to <1 x i32> and store the result to %out.
; Per the autogenerated checks below, the three GCN configurations expect an
; unsigned byte load (buffer_load_ubyte or flat_load_ubyte) and the two r600
; configurations expect VTX_READ_8; no separate extension instruction appears
; in any of the expected outputs.
define amdgpu_kernel void @global_zextload_v1i8_to_v1i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
|
; GCN-NOHSA-SI-LABEL: global_zextload_v1i8_to_v1i32:
|
|
; GCN-NOHSA-SI: ; %bb.0:
|
|
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
|
; GCN-NOHSA-SI-NEXT: s_endpgm
|
|
;
|
|
; GCN-HSA-LABEL: global_zextload_v1i8_to_v1i32:
|
|
; GCN-HSA: ; %bb.0:
|
|
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
|
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
|
|
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
|
|
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
|
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN-HSA-NEXT: flat_load_ubyte v2, v[0:1]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-HSA-NEXT: flat_store_dword v[0:1], v2
|
|
; GCN-HSA-NEXT: s_endpgm
|
|
;
|
|
; GCN-NOHSA-VI-LABEL: global_zextload_v1i8_to_v1i32:
|
|
; GCN-NOHSA-VI: ; %bb.0:
|
|
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NOHSA-VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
|
; GCN-NOHSA-VI-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: global_zextload_v1i8_to_v1i32:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: TEX 0 @6
|
|
; EG-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
|
|
; EG-NEXT: CF_END
|
|
; EG-NEXT: PAD
|
|
; EG-NEXT: Fetch clause starting at 6:
|
|
; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
|
|
; EG-NEXT: ALU clause starting at 8:
|
|
; EG-NEXT: MOV * T0.X, KC0[2].Z,
|
|
; EG-NEXT: ALU clause starting at 9:
|
|
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
;
|
|
; CM-LABEL: global_zextload_v1i8_to_v1i32:
|
|
; CM: ; %bb.0:
|
|
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: TEX 0 @6
|
|
; CM-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
|
|
; CM-NEXT: CF_END
|
|
; CM-NEXT: PAD
|
|
; CM-NEXT: Fetch clause starting at 6:
|
|
; CM-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
|
|
; CM-NEXT: ALU clause starting at 8:
|
|
; CM-NEXT: MOV * T0.X, KC0[2].Z,
|
|
; CM-NEXT: ALU clause starting at 9:
|
|
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
|
|
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
; Input IR that the checks above are generated from.
%load = load <1 x i8>, ptr addrspace(1) %in
|
|
%ext = zext <1 x i8> %load to <1 x i32>
|
|
store <1 x i32> %ext, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
; Test case: load a <1 x i8> vector from global memory (%in), sign-extend it
; to <1 x i32> and store the result to %out.
; Per the autogenerated checks below, the three GCN configurations expect a
; signed byte load (buffer_load_sbyte or flat_load_sbyte), while the two r600
; configurations expect VTX_READ_8 followed by BFE_INT.
define amdgpu_kernel void @global_sextload_v1i8_to_v1i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
|
; GCN-NOHSA-SI-LABEL: global_sextload_v1i8_to_v1i32:
|
|
; GCN-NOHSA-SI: ; %bb.0:
|
|
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_sbyte v0, off, s[8:11], 0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
|
; GCN-NOHSA-SI-NEXT: s_endpgm
|
|
;
|
|
; GCN-HSA-LABEL: global_sextload_v1i8_to_v1i32:
|
|
; GCN-HSA: ; %bb.0:
|
|
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
|
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
|
|
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
|
|
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
|
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN-HSA-NEXT: flat_load_sbyte v2, v[0:1]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-HSA-NEXT: flat_store_dword v[0:1], v2
|
|
; GCN-HSA-NEXT: s_endpgm
|
|
;
|
|
; GCN-NOHSA-VI-LABEL: global_sextload_v1i8_to_v1i32:
|
|
; GCN-NOHSA-VI: ; %bb.0:
|
|
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NOHSA-VI-NEXT: buffer_load_sbyte v0, off, s[8:11], 0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
|
; GCN-NOHSA-VI-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: global_sextload_v1i8_to_v1i32:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: TEX 0 @6
|
|
; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
|
|
; EG-NEXT: CF_END
|
|
; EG-NEXT: PAD
|
|
; EG-NEXT: Fetch clause starting at 6:
|
|
; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
|
|
; EG-NEXT: ALU clause starting at 8:
|
|
; EG-NEXT: MOV * T0.X, KC0[2].Z,
|
|
; EG-NEXT: ALU clause starting at 9:
|
|
; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, literal.x,
|
|
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)
|
|
;
|
|
; CM-LABEL: global_sextload_v1i8_to_v1i32:
|
|
; CM: ; %bb.0:
|
|
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: TEX 0 @6
|
|
; CM-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
|
|
; CM-NEXT: CF_END
|
|
; CM-NEXT: PAD
|
|
; CM-NEXT: Fetch clause starting at 6:
|
|
; CM-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
|
|
; CM-NEXT: ALU clause starting at 8:
|
|
; CM-NEXT: MOV * T0.X, KC0[2].Z,
|
|
; CM-NEXT: ALU clause starting at 9:
|
|
; CM-NEXT: BFE_INT * T0.X, T0.X, 0.0, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
|
|
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
; Input IR that the checks above are generated from.
%load = load <1 x i8>, ptr addrspace(1) %in
|
|
%ext = sext <1 x i8> %load to <1 x i32>
|
|
store <1 x i32> %ext, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
; TODO: EG/CM should use DST, but for some there are redundant MOVs
|
|
; Test case: load a <2 x i8> vector from global memory (%in), zero-extend each
; element to i32 and store the <2 x i32> result to %out.
; Per the autogenerated checks below, the three GCN configurations expect one
; 16-bit load (buffer_load_ushort or flat_load_ushort) followed by a
; v_lshrrev_b32 (the SDWA form on the tonga configuration) and a v_and_b32
; with 0xff, then a single dwordx2 store. The two r600 configurations expect
; VTX_READ_16 followed by an AND_INT/OR_INT/MOV sequence, BFE_UINT and AND_INT.
define amdgpu_kernel void @global_zextload_v2i8_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
|
; GCN-NOHSA-SI-LABEL: global_zextload_v2i8_to_v2i32:
|
|
; GCN-NOHSA-SI: ; %bb.0:
|
|
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xff, v0
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
|
; GCN-NOHSA-SI-NEXT: s_endpgm
|
|
;
|
|
; GCN-HSA-LABEL: global_zextload_v2i8_to_v2i32:
|
|
; GCN-HSA: ; %bb.0:
|
|
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
|
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
|
|
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
|
|
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
|
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN-HSA-NEXT: flat_load_ushort v2, v[0:1]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 8, v2
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xff, v2
|
|
; GCN-HSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
|
; GCN-HSA-NEXT: s_endpgm
|
|
;
|
|
; GCN-NOHSA-VI-LABEL: global_zextload_v2i8_to_v2i32:
|
|
; GCN-NOHSA-VI: ; %bb.0:
|
|
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 8
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
|
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xff, v0
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
|
; GCN-NOHSA-VI-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: global_zextload_v2i8_to_v2i32:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: TEX 0 @6
|
|
; EG-NEXT: ALU 12, @10, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XY, T5.X, 1
|
|
; EG-NEXT: CF_END
|
|
; EG-NEXT: PAD
|
|
; EG-NEXT: Fetch clause starting at 6:
|
|
; EG-NEXT: VTX_READ_16 T4.X, T4.X, 0, #1
|
|
; EG-NEXT: ALU clause starting at 8:
|
|
; EG-NEXT: MOV * T0.Y, T2.X,
|
|
; EG-NEXT: MOV * T4.X, KC0[2].Z,
|
|
; EG-NEXT: ALU clause starting at 10:
|
|
; EG-NEXT: AND_INT T0.W, T4.X, literal.x,
|
|
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), -65536(nan)
|
|
; EG-NEXT: OR_INT * T0.W, PS, PV.W,
|
|
; EG-NEXT: MOV * T2.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, PV.X,
|
|
; EG-NEXT: MOV * T1.W, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_UINT * T4.Y, PV.Y, literal.x, PV.W,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T4.X, T0.W, literal.x,
|
|
; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 255(3.573311e-43), 2(2.802597e-45)
|
|
;
|
|
; CM-LABEL: global_zextload_v2i8_to_v2i32:
|
|
; CM: ; %bb.0:
|
|
; CM-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: TEX 0 @6
|
|
; CM-NEXT: ALU 13, @10, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4, T5.X
|
|
; CM-NEXT: CF_END
|
|
; CM-NEXT: PAD
|
|
; CM-NEXT: Fetch clause starting at 6:
|
|
; CM-NEXT: VTX_READ_16 T4.X, T4.X, 0, #1
|
|
; CM-NEXT: ALU clause starting at 8:
|
|
; CM-NEXT: MOV * T0.Y, T2.X,
|
|
; CM-NEXT: MOV * T4.X, KC0[2].Z,
|
|
; CM-NEXT: ALU clause starting at 10:
|
|
; CM-NEXT: AND_INT T0.Z, T4.X, literal.x,
|
|
; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y,
|
|
; CM-NEXT: 65535(9.183409e-41), -65536(nan)
|
|
; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z,
|
|
; CM-NEXT: MOV * T2.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, PV.X,
|
|
; CM-NEXT: MOV * T1.W, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_UINT * T4.Y, PV.Y, literal.x, PV.W,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT * T4.X, T0.W, literal.x,
|
|
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
|
|
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
; Input IR that the checks above are generated from.
%load = load <2 x i8>, ptr addrspace(1) %in
|
|
%ext = zext <2 x i8> %load to <2 x i32>
|
|
store <2 x i32> %ext, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
; TODO: EG/CM should use DST, but for some there are redundant MOVs
|
|
; Test case: load a <2 x i8> vector from global memory (%in), sign-extend each
; element to i32 and store the <2 x i32> result to %out.
; Per the autogenerated checks below, the three GCN configurations expect one
; 16-bit load (buffer_load_ushort or flat_load_ushort) followed by two
; v_bfe_i32 instructions and a single dwordx2 store. The two r600
; configurations expect VTX_READ_16 followed by an AND_INT/OR_INT/MOV
; sequence, an LSHR and two BFE_INT instructions.
define amdgpu_kernel void @global_sextload_v2i8_to_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
|
; GCN-NOHSA-SI-LABEL: global_sextload_v2i8_to_v2i32:
|
|
; GCN-NOHSA-SI: ; %bb.0:
|
|
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v1, v0, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v0, 0, 8
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
|
; GCN-NOHSA-SI-NEXT: s_endpgm
|
|
;
|
|
; GCN-HSA-LABEL: global_sextload_v2i8_to_v2i32:
|
|
; GCN-HSA: ; %bb.0:
|
|
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
|
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
|
|
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
|
|
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
|
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN-HSA-NEXT: flat_load_ushort v2, v[0:1]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-HSA-NEXT: v_bfe_i32 v3, v2, 8, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v2, v2, 0, 8
|
|
; GCN-HSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
|
; GCN-HSA-NEXT: s_endpgm
|
|
;
|
|
; GCN-NOHSA-VI-LABEL: global_sextload_v2i8_to_v2i32:
|
|
; GCN-NOHSA-VI: ; %bb.0:
|
|
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v1, v0, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v0, 0, 8
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
|
; GCN-NOHSA-VI-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: global_sextload_v2i8_to_v2i32:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: TEX 0 @6
|
|
; EG-NEXT: ALU 11, @10, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XY, T5.X, 1
|
|
; EG-NEXT: CF_END
|
|
; EG-NEXT: PAD
|
|
; EG-NEXT: Fetch clause starting at 6:
|
|
; EG-NEXT: VTX_READ_16 T4.X, T4.X, 0, #1
|
|
; EG-NEXT: ALU clause starting at 8:
|
|
; EG-NEXT: MOV * T0.Y, T2.X,
|
|
; EG-NEXT: MOV * T4.X, KC0[2].Z,
|
|
; EG-NEXT: ALU clause starting at 10:
|
|
; EG-NEXT: AND_INT T0.W, T4.X, literal.x,
|
|
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), -65536(nan)
|
|
; EG-NEXT: OR_INT * T0.W, PS, PV.W,
|
|
; EG-NEXT: MOV * T2.X, PV.W,
|
|
; EG-NEXT: MOV * T0.Y, PV.X,
|
|
; EG-NEXT: BFE_INT T4.X, T0.W, 0.0, literal.x,
|
|
; EG-NEXT: LSHR T0.W, PV.Y, literal.x,
|
|
; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)
|
|
; EG-NEXT: BFE_INT * T4.Y, PV.W, 0.0, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
;
|
|
; CM-LABEL: global_sextload_v2i8_to_v2i32:
|
|
; CM: ; %bb.0:
|
|
; CM-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: TEX 0 @6
|
|
; CM-NEXT: ALU 11, @10, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4, T5.X
|
|
; CM-NEXT: CF_END
|
|
; CM-NEXT: PAD
|
|
; CM-NEXT: Fetch clause starting at 6:
|
|
; CM-NEXT: VTX_READ_16 T4.X, T4.X, 0, #1
|
|
; CM-NEXT: ALU clause starting at 8:
|
|
; CM-NEXT: MOV * T0.Y, T2.X,
|
|
; CM-NEXT: MOV * T4.X, KC0[2].Z,
|
|
; CM-NEXT: ALU clause starting at 10:
|
|
; CM-NEXT: AND_INT T0.Z, T4.X, literal.x,
|
|
; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y,
|
|
; CM-NEXT: 65535(9.183409e-41), -65536(nan)
|
|
; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z,
|
|
; CM-NEXT: MOV * T2.X, PV.W,
|
|
; CM-NEXT: MOV * T0.Y, PV.X,
|
|
; CM-NEXT: BFE_INT T4.X, T0.W, 0.0, literal.x,
|
|
; CM-NEXT: LSHR * T0.W, PV.Y, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR T5.X, KC0[2].Y, literal.x,
|
|
; CM-NEXT: BFE_INT * T4.Y, PV.W, 0.0, literal.y,
|
|
; CM-NEXT: 2(2.802597e-45), 8(1.121039e-44)
|
|
; Input IR that the checks above are generated from.
%load = load <2 x i8>, ptr addrspace(1) %in
|
|
%ext = sext <2 x i8> %load to <2 x i32>
|
|
store <2 x i32> %ext, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
; TODO: EG/CM should use DST, but for some there are redundant MOVs
|
|
; Test case: load a <3 x i8> vector from global memory (%in), zero-extend each
; element to i32 and store the <3 x i32> result to %out.
; Per the autogenerated checks below, the three GCN configurations expect one
; dword load followed by two v_bfe_u32 instructions and a v_and_b32 with 0xff.
; The first GCN configuration expects the result to be written with a dword
; store at offset 8 plus a dwordx2 store, while the other two expect a single
; dwordx3 store. The two r600 configurations expect VTX_READ_32, BFE_UINT and
; AND_INT, and two MEM_RAT_CACHELESS stores.
define amdgpu_kernel void @global_zextload_v3i8_to_v3i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
|
; GCN-NOHSA-SI-LABEL: global_zextload_v3i8_to_v3i32:
|
|
; GCN-NOHSA-SI: ; %bb.0: ; %entry
|
|
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[8:11], 0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v1, v2, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xff, v2
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v2, v2, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:8
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
|
; GCN-NOHSA-SI-NEXT: s_endpgm
|
|
;
|
|
; GCN-HSA-LABEL: global_zextload_v3i8_to_v3i32:
|
|
; GCN-HSA: ; %bb.0: ; %entry
|
|
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
|
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
|
|
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
|
|
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
|
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN-HSA-NEXT: flat_load_dword v2, v[0:1]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s1
|
|
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-HSA-NEXT: v_bfe_u32 v1, v2, 8, 8
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xff, v2
|
|
; GCN-HSA-NEXT: v_bfe_u32 v2, v2, 16, 8
|
|
; GCN-HSA-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
|
|
; GCN-HSA-NEXT: s_endpgm
|
|
;
|
|
; GCN-NOHSA-VI-LABEL: global_zextload_v3i8_to_v3i32:
|
|
; GCN-NOHSA-VI: ; %bb.0: ; %entry
|
|
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[8:11], 0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v1, v2, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xff, v2
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v2, v2, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0
|
|
; GCN-NOHSA-VI-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: global_zextload_v3i8_to_v3i32:
|
|
; EG: ; %bb.0: ; %entry
|
|
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: TEX 0 @6
|
|
; EG-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T7.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XY, T6.X, 1
|
|
; EG-NEXT: CF_END
|
|
; EG-NEXT: Fetch clause starting at 6:
|
|
; EG-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1
|
|
; EG-NEXT: ALU clause starting at 8:
|
|
; EG-NEXT: MOV * T4.X, KC0[2].Z,
|
|
; EG-NEXT: ALU clause starting at 9:
|
|
; EG-NEXT: MOV * T0.W, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_UINT * T5.Y, T4.X, literal.x, PV.W,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T5.X, T4.X, literal.x,
|
|
; EG-NEXT: LSHR * T6.X, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 255(3.573311e-43), 2(2.802597e-45)
|
|
; EG-NEXT: BFE_UINT T4.X, T4.X, literal.x, T0.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44)
|
|
; EG-NEXT: LSHR * T7.X, PV.W, literal.x,
|
|
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
;
|
|
; CM-LABEL: global_zextload_v3i8_to_v3i32:
|
|
; CM: ; %bb.0: ; %entry
|
|
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: TEX 0 @6
|
|
; CM-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4, T7.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5.X, T6.X
|
|
; CM-NEXT: CF_END
|
|
; CM-NEXT: Fetch clause starting at 6:
|
|
; CM-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1
|
|
; CM-NEXT: ALU clause starting at 8:
|
|
; CM-NEXT: MOV * T4.X, KC0[2].Z,
|
|
; CM-NEXT: ALU clause starting at 9:
|
|
; CM-NEXT: MOV * T0.W, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_UINT T5.X, T4.X, literal.x, PV.W,
|
|
; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; CM-NEXT: 16(2.242078e-44), 8(1.121039e-44)
|
|
; CM-NEXT: LSHR T6.X, PV.W, literal.x,
|
|
; CM-NEXT: BFE_UINT * T4.Y, T4.X, literal.y, T0.W,
|
|
; CM-NEXT: 2(2.802597e-45), 8(1.121039e-44)
|
|
; CM-NEXT: AND_INT * T4.X, T4.X, literal.x,
|
|
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR * T7.X, KC0[2].Y, literal.x,
|
|
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
; Input IR that the checks above are generated from. The block label below is
; echoed in the "%entry" annotations of the expected output.
entry:
|
|
%ld = load <3 x i8>, ptr addrspace(1) %in
|
|
%ext = zext <3 x i8> %ld to <3 x i32>
|
|
store <3 x i32> %ext, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
; TODO: EG/CM should use DST, but for some there are redundant MOVs
|
|
; Test case: load a <3 x i8> vector from global memory (%in), sign-extend each
; element to i32 and store the <3 x i32> result to %out.
; Per the autogenerated checks below, the three GCN configurations expect one
; dword load followed by three v_bfe_i32 instructions. The first GCN
; configuration expects the result to be written with a dword store at
; offset 8 plus a dwordx2 store, while the other two expect a single dwordx3
; store. The two r600 configurations expect VTX_READ_32, LSHR and BFE_INT
; instructions, and two MEM_RAT_CACHELESS stores.
define amdgpu_kernel void @global_sextload_v3i8_to_v3i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
|
; GCN-NOHSA-SI-LABEL: global_sextload_v3i8_to_v3i32:
|
|
; GCN-NOHSA-SI: ; %bb.0: ; %entry
|
|
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[8:11], 0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v1, v2, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v2, 0, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v2, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:8
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
|
; GCN-NOHSA-SI-NEXT: s_endpgm
|
|
;
|
|
; GCN-HSA-LABEL: global_sextload_v3i8_to_v3i32:
|
|
; GCN-HSA: ; %bb.0: ; %entry
|
|
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
|
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
|
|
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
|
|
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
|
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN-HSA-NEXT: flat_load_dword v0, v[0:1]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s1
|
|
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-HSA-NEXT: v_bfe_i32 v2, v0, 16, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v1, v0, 8, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 8
|
|
; GCN-HSA-NEXT: flat_store_dwordx3 v[3:4], v[0:2]
|
|
; GCN-HSA-NEXT: s_endpgm
|
|
;
|
|
; GCN-NOHSA-VI-LABEL: global_sextload_v3i8_to_v3i32:
|
|
; GCN-NOHSA-VI: ; %bb.0: ; %entry
|
|
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v0, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v1, v0, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v0, 0, 8
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[4:7], 0
|
|
; GCN-NOHSA-VI-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: global_sextload_v3i8_to_v3i32:
|
|
; EG: ; %bb.0: ; %entry
|
|
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: TEX 0 @6
|
|
; EG-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.X, T4.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XY, T5.X, 1
|
|
; EG-NEXT: CF_END
|
|
; EG-NEXT: Fetch clause starting at 6:
|
|
; EG-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1
|
|
; EG-NEXT: ALU clause starting at 8:
|
|
; EG-NEXT: MOV * T4.X, KC0[2].Z,
|
|
; EG-NEXT: ALU clause starting at 9:
|
|
; EG-NEXT: LSHR T5.X, KC0[2].Y, literal.x,
|
|
; EG-NEXT: LSHR * T0.W, T4.X, literal.y,
|
|
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; EG-NEXT: BFE_INT * T6.X, PV.W, 0.0, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T7.X, T4.X, 0.0, literal.x,
|
|
; EG-NEXT: LSHR T0.W, T4.X, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LSHR T4.X, PS, literal.x,
|
|
; EG-NEXT: BFE_INT * T7.Y, PV.W, 0.0, literal.y,
|
|
; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
|
|
;
|
|
; CM-LABEL: global_sextload_v3i8_to_v3i32:
|
|
; CM: ; %bb.0: ; %entry
|
|
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: TEX 0 @6
|
|
; CM-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T7, T4.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5.X, T6.X
|
|
; CM-NEXT: CF_END
|
|
; CM-NEXT: Fetch clause starting at 6:
|
|
; CM-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1
|
|
; CM-NEXT: ALU clause starting at 8:
|
|
; CM-NEXT: MOV * T4.X, KC0[2].Z,
|
|
; CM-NEXT: ALU clause starting at 9:
|
|
; CM-NEXT: LSHR * T0.W, T4.X, literal.x,
|
|
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT T5.X, PV.W, 0.0, literal.x,
|
|
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR * T6.X, PV.W, literal.x,
|
|
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT T7.X, T4.X, 0.0, literal.x,
|
|
; CM-NEXT: LSHR * T0.W, T4.X, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR T4.X, KC0[2].Y, literal.x,
|
|
; CM-NEXT: BFE_INT * T7.Y, PV.W, 0.0, literal.y,
|
|
; CM-NEXT: 2(2.802597e-45), 8(1.121039e-44)
|
|
; Input IR that the checks above are generated from. The block label below is
; echoed in the "%entry" annotations of the expected output.
entry:
|
|
%ld = load <3 x i8>, ptr addrspace(1) %in
|
|
%ext = sext <3 x i8> %ld to <3 x i32>
|
|
store <3 x i32> %ext, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
; TODO: EG/CM should use DST, but for some there are redundant MOVs
|
|
; Kernel under test: loads a <4 x i8> vector through the addrspace(1) pointer
; %in, zero-extends every lane to i32 and stores the <4 x i32> result through
; %out. The prefixed comment lines in the body are the autogenerated expected
; llc output, one group per RUN-line check prefix; do not edit them by hand.
define amdgpu_kernel void @global_zextload_v4i8_to_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
|
; GCN-NOHSA-SI-LABEL: global_zextload_v4i8_to_v4i32:
|
|
; GCN-NOHSA-SI: ; %bb.0:
|
|
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[8:11], 0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 24, v2
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v1, v2, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xff, v2
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v2, v2, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
|
|
; GCN-NOHSA-SI-NEXT: s_endpgm
|
|
;
|
|
; GCN-HSA-LABEL: global_zextload_v4i8_to_v4i32:
|
|
; GCN-HSA: ; %bb.0:
|
|
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
|
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
|
|
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
|
|
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
|
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN-HSA-NEXT: flat_load_dword v2, v[0:1]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
|
|
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 24, v2
|
|
; GCN-HSA-NEXT: v_bfe_u32 v1, v2, 8, 8
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xff, v2
|
|
; GCN-HSA-NEXT: v_bfe_u32 v2, v2, 16, 8
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
|
|
; GCN-HSA-NEXT: s_endpgm
|
|
;
|
|
; GCN-NOHSA-VI-LABEL: global_zextload_v4i8_to_v4i32:
|
|
; GCN-NOHSA-VI: ; %bb.0:
|
|
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[8:11], 0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 24, v2
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v1, v2, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xff, v2
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v2, v2, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
|
|
; GCN-NOHSA-VI-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: global_zextload_v4i8_to_v4i32:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: TEX 0 @6
|
|
; EG-NEXT: ALU 9, @9, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T5.X, 1
|
|
; EG-NEXT: CF_END
|
|
; EG-NEXT: PAD
|
|
; EG-NEXT: Fetch clause starting at 6:
|
|
; EG-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1
|
|
; EG-NEXT: ALU clause starting at 8:
|
|
; EG-NEXT: MOV * T4.X, KC0[2].Z,
|
|
; EG-NEXT: ALU clause starting at 9:
|
|
; EG-NEXT: MOV * T0.W, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_UINT * T4.Z, T4.X, literal.x, PV.W,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_UINT T4.Y, T4.X, literal.x, T0.W,
|
|
; EG-NEXT: LSHR * T4.W, T4.X, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
|
|
; EG-NEXT: AND_INT T4.X, T4.X, literal.x,
|
|
; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 255(3.573311e-43), 2(2.802597e-45)
|
|
;
|
|
; CM-LABEL: global_zextload_v4i8_to_v4i32:
|
|
; CM: ; %bb.0:
|
|
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: TEX 0 @6
|
|
; CM-NEXT: ALU 10, @9, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4, T5.X
|
|
; CM-NEXT: CF_END
|
|
; CM-NEXT: PAD
|
|
; CM-NEXT: Fetch clause starting at 6:
|
|
; CM-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1
|
|
; CM-NEXT: ALU clause starting at 8:
|
|
; CM-NEXT: MOV * T4.X, KC0[2].Z,
|
|
; CM-NEXT: ALU clause starting at 9:
|
|
; CM-NEXT: MOV * T0.W, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_UINT * T4.Z, T4.X, literal.x, PV.W,
|
|
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_UINT T4.Y, T4.X, literal.x, T0.W,
|
|
; CM-NEXT: LSHR * T4.W, T4.X, literal.y,
|
|
; CM-NEXT: 8(1.121039e-44), 24(3.363116e-44)
|
|
; CM-NEXT: AND_INT * T4.X, T4.X, literal.x,
|
|
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
|
|
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
; IR body: one vector load, a lane-wise zext to i32, then one vector store.
 %load = load <4 x i8>, ptr addrspace(1) %in
|
|
 %ext = zext <4 x i8> %load to <4 x i32>
|
|
 store <4 x i32> %ext, ptr addrspace(1) %out
|
|
 ret void
|
|
}
|
|
|
|
; TODO: EG/CM should use DST, but for some reason there are redundant MOVs
|
|
; Kernel under test: loads a <4 x i8> vector through the addrspace(1) pointer
; %in, sign-extends every lane to i32 and stores the <4 x i32> result through
; %out. The prefixed comment lines in the body are the autogenerated expected
; llc output, one group per RUN-line check prefix; do not edit them by hand.
define amdgpu_kernel void @global_sextload_v4i8_to_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
|
; GCN-NOHSA-SI-LABEL: global_sextload_v4i8_to_v4i32:
|
|
; GCN-NOHSA-SI: ; %bb.0:
|
|
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 24, v0
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v0, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v1, v0, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v0, 0, 8
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
|
|
; GCN-NOHSA-SI-NEXT: s_endpgm
|
|
;
|
|
; GCN-HSA-LABEL: global_sextload_v4i8_to_v4i32:
|
|
; GCN-HSA: ; %bb.0:
|
|
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
|
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
|
|
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
|
|
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
|
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN-HSA-NEXT: flat_load_dword v0, v[0:1]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
|
|
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 24, v0
|
|
; GCN-HSA-NEXT: v_bfe_i32 v2, v0, 16, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v1, v0, 8, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 8
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
|
|
; GCN-HSA-NEXT: s_endpgm
|
|
;
|
|
; GCN-NOHSA-VI-LABEL: global_sextload_v4i8_to_v4i32:
|
|
; GCN-NOHSA-VI: ; %bb.0:
|
|
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 24, v0
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v0, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v1, v0, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v0, 0, 8
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
|
|
; GCN-NOHSA-VI-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: global_sextload_v4i8_to_v4i32:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: TEX 0 @6
|
|
; EG-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T4.X, 1
|
|
; EG-NEXT: CF_END
|
|
; EG-NEXT: PAD
|
|
; EG-NEXT: Fetch clause starting at 6:
|
|
; EG-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1
|
|
; EG-NEXT: ALU clause starting at 8:
|
|
; EG-NEXT: MOV * T4.X, KC0[2].Z,
|
|
; EG-NEXT: ALU clause starting at 9:
|
|
; EG-NEXT: BFE_INT T5.X, T4.X, 0.0, literal.x,
|
|
; EG-NEXT: LSHR * T0.W, T4.X, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
|
|
; EG-NEXT: BFE_INT T5.W, PV.W, 0.0, literal.x,
|
|
; EG-NEXT: LSHR * T0.W, T4.X, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44)
|
|
; EG-NEXT: BFE_INT T5.Z, PS, 0.0, literal.x,
|
|
; EG-NEXT: LSHR * T0.W, T4.X, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LSHR T4.X, KC0[2].Y, literal.x,
|
|
; EG-NEXT: BFE_INT * T5.Y, PV.W, 0.0, literal.y,
|
|
; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
|
|
;
|
|
; CM-LABEL: global_sextload_v4i8_to_v4i32:
|
|
; CM: ; %bb.0:
|
|
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: TEX 0 @6
|
|
; CM-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5, T4.X
|
|
; CM-NEXT: CF_END
|
|
; CM-NEXT: PAD
|
|
; CM-NEXT: Fetch clause starting at 6:
|
|
; CM-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1
|
|
; CM-NEXT: ALU clause starting at 8:
|
|
; CM-NEXT: MOV * T4.X, KC0[2].Z,
|
|
; CM-NEXT: ALU clause starting at 9:
|
|
; CM-NEXT: BFE_INT T5.X, T4.X, 0.0, literal.x,
|
|
; CM-NEXT: LSHR * T0.W, T4.X, literal.y,
|
|
; CM-NEXT: 8(1.121039e-44), 24(3.363116e-44)
|
|
; CM-NEXT: LSHR T0.Z, T4.X, literal.x,
|
|
; CM-NEXT: BFE_INT * T5.W, PV.W, 0.0, literal.y,
|
|
; CM-NEXT: 16(2.242078e-44), 8(1.121039e-44)
|
|
; CM-NEXT: BFE_INT T5.Z, PV.Z, 0.0, literal.x,
|
|
; CM-NEXT: LSHR * T0.W, T4.X, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR T4.X, KC0[2].Y, literal.x,
|
|
; CM-NEXT: BFE_INT * T5.Y, PV.W, 0.0, literal.y,
|
|
; CM-NEXT: 2(2.802597e-45), 8(1.121039e-44)
|
|
; IR body: one vector load, a lane-wise sext to i32, then one vector store.
 %load = load <4 x i8>, ptr addrspace(1) %in
|
|
 %ext = sext <4 x i8> %load to <4 x i32>
|
|
 store <4 x i32> %ext, ptr addrspace(1) %out
|
|
 ret void
|
|
}
|
|
|
|
; TODO: EG/CM should use DST, but for some reason there are redundant MOVs
|
|
; Kernel under test: loads a <8 x i8> vector through the addrspace(1) pointer
; %in, zero-extends every lane to i32 and stores the <8 x i32> result through
; %out. The prefixed comment lines in the body are the autogenerated expected
; llc output, one group per RUN-line check prefix; do not edit them by hand.
define amdgpu_kernel void @global_zextload_v8i8_to_v8i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
|
; GCN-NOHSA-SI-LABEL: global_zextload_v8i8_to_v8i32:
|
|
; GCN-NOHSA-SI: ; %bb.0:
|
|
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx2 v[8:9], off, s[8:11], 0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 24, v8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v1, v8, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 24, v9
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v5, v9, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xff, v8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v2, v8, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xff, v9
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v6, v9, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
|
|
; GCN-NOHSA-SI-NEXT: s_endpgm
|
|
;
|
|
; GCN-HSA-LABEL: global_zextload_v8i8_to_v8i32:
|
|
; GCN-HSA: ; %bb.0:
|
|
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
|
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
|
|
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
|
|
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
|
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN-HSA-NEXT: flat_load_dwordx2 v[8:9], v[0:1]
|
|
; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
|
|
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0
|
|
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 24, v9
|
|
; GCN-HSA-NEXT: v_bfe_u32 v5, v9, 8, 8
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xff, v9
|
|
; GCN-HSA-NEXT: v_bfe_u32 v6, v9, 16, 8
|
|
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 24, v8
|
|
; GCN-HSA-NEXT: v_bfe_u32 v1, v8, 8, 8
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xff, v8
|
|
; GCN-HSA-NEXT: v_bfe_u32 v2, v8, 16, 8
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7]
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
|
|
; GCN-HSA-NEXT: s_endpgm
|
|
;
|
|
; GCN-NOHSA-VI-LABEL: global_zextload_v8i8_to_v8i32:
|
|
; GCN-NOHSA-VI: ; %bb.0:
|
|
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[8:9], off, s[8:11], 0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 24, v9
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v5, v9, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xff, v9
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v6, v9, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 24, v8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v1, v8, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xff, v8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v2, v8, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
|
|
; GCN-NOHSA-VI-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: global_zextload_v8i8_to_v8i32:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: TEX 0 @6
|
|
; EG-NEXT: ALU 20, @9, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T5.X, 1
|
|
; EG-NEXT: CF_END
|
|
; EG-NEXT: Fetch clause starting at 6:
|
|
; EG-NEXT: VTX_READ_64 T5.XY, T5.X, 0, #1
|
|
; EG-NEXT: ALU clause starting at 8:
|
|
; EG-NEXT: MOV * T5.X, KC0[2].Z,
|
|
; EG-NEXT: ALU clause starting at 9:
|
|
; EG-NEXT: MOV * T0.W, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_UINT * T6.Z, T5.X, literal.x, PV.W,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_UINT T6.Y, T5.X, literal.x, T0.W,
|
|
; EG-NEXT: BFE_UINT T7.Z, T5.Y, literal.y, T0.W,
|
|
; EG-NEXT: LSHR * T6.W, T5.X, literal.z,
|
|
; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44)
|
|
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T6.X, T5.X, literal.x,
|
|
; EG-NEXT: BFE_UINT T7.Y, T5.Y, literal.y, T0.W,
|
|
; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.z,
|
|
; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44)
|
|
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
; EG-NEXT: LSHR * T7.W, T5.Y, literal.x,
|
|
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T7.X, T5.Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 255(3.573311e-43), 16(2.242078e-44)
|
|
; EG-NEXT: LSHR * T8.X, PV.W, literal.x,
|
|
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
;
|
|
; CM-LABEL: global_zextload_v8i8_to_v8i32:
|
|
; CM: ; %bb.0:
|
|
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: TEX 0 @6
|
|
; CM-NEXT: ALU 20, @9, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5, T8.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T6, T7.X
|
|
; CM-NEXT: CF_END
|
|
; CM-NEXT: Fetch clause starting at 6:
|
|
; CM-NEXT: VTX_READ_64 T5.XY, T5.X, 0, #1
|
|
; CM-NEXT: ALU clause starting at 8:
|
|
; CM-NEXT: MOV * T5.X, KC0[2].Z,
|
|
; CM-NEXT: ALU clause starting at 9:
|
|
; CM-NEXT: MOV * T0.W, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_UINT * T6.Z, T5.Y, literal.x, PV.W,
|
|
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_UINT T6.Y, T5.Y, literal.x, T0.W,
|
|
; CM-NEXT: BFE_UINT T5.Z, T5.X, literal.y, T0.W,
|
|
; CM-NEXT: LSHR * T6.W, T5.Y, literal.z,
|
|
; CM-NEXT: 8(1.121039e-44), 16(2.242078e-44)
|
|
; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T6.X, T5.Y, literal.x,
|
|
; CM-NEXT: BFE_UINT T5.Y, T5.X, literal.y, T0.W,
|
|
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
|
|
; CM-NEXT: 255(3.573311e-43), 8(1.121039e-44)
|
|
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR T7.X, PV.W, literal.x,
|
|
; CM-NEXT: LSHR * T5.W, T5.X, literal.y,
|
|
; CM-NEXT: 2(2.802597e-45), 24(3.363116e-44)
|
|
; CM-NEXT: AND_INT * T5.X, T5.X, literal.x,
|
|
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR * T8.X, KC0[2].Y, literal.x,
|
|
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
; IR body: one vector load, a lane-wise zext to i32, then one vector store.
 %load = load <8 x i8>, ptr addrspace(1) %in
|
|
 %ext = zext <8 x i8> %load to <8 x i32>
|
|
 store <8 x i32> %ext, ptr addrspace(1) %out
|
|
 ret void
|
|
}
|
|
|
|
; TODO: EG/CM should use DST, but for some reason there are redundant MOVs
|
|
; Kernel under test: loads a <8 x i8> vector through the addrspace(1) pointer
; %in, sign-extends every lane to i32 and stores the <8 x i32> result through
; %out. The prefixed comment lines in the body are the autogenerated expected
; llc output, one group per RUN-line check prefix; do not edit them by hand.
define amdgpu_kernel void @global_sextload_v8i8_to_v8i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
|
; GCN-NOHSA-SI-LABEL: global_sextload_v8i8_to_v8i32:
|
|
; GCN-NOHSA-SI: ; %bb.0:
|
|
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx2 v[7:8], off, s[8:11], 0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 24, v7
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v7, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v1, v7, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v7, 0, 8
|
|
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 24, v8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v8, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v5, v8, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v8, 0, 8
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
|
|
; GCN-NOHSA-SI-NEXT: s_endpgm
|
|
;
|
|
; GCN-HSA-LABEL: global_sextload_v8i8_to_v8i32:
|
|
; GCN-HSA: ; %bb.0:
|
|
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
|
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
|
|
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
|
|
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
|
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN-HSA-NEXT: flat_load_dwordx2 v[7:8], v[0:1]
|
|
; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
|
|
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s3
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v10, s1
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v11, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s0
|
|
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 24, v7
|
|
; GCN-HSA-NEXT: v_bfe_i32 v2, v7, 16, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v1, v7, 8, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v0, v7, 0, 8
|
|
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 24, v8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v6, v8, 16, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v5, v8, 8, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v4, v8, 0, 8
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[11:12], v[4:7]
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[0:3]
|
|
; GCN-HSA-NEXT: s_endpgm
|
|
;
|
|
; GCN-NOHSA-VI-LABEL: global_sextload_v8i8_to_v8i32:
|
|
; GCN-NOHSA-VI: ; %bb.0:
|
|
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[7:8], off, s[8:11], 0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 24, v7
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v7, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v1, v7, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v7, 0, 8
|
|
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 24, v8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v8, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v5, v8, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v8, 0, 8
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
|
|
; GCN-NOHSA-VI-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: global_sextload_v8i8_to_v8i32:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: TEX 0 @6
|
|
; EG-NEXT: ALU 23, @9, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T5.X, 1
|
|
; EG-NEXT: CF_END
|
|
; EG-NEXT: Fetch clause starting at 6:
|
|
; EG-NEXT: VTX_READ_64 T5.XY, T5.X, 0, #1
|
|
; EG-NEXT: ALU clause starting at 8:
|
|
; EG-NEXT: MOV * T5.X, KC0[2].Z,
|
|
; EG-NEXT: ALU clause starting at 9:
|
|
; EG-NEXT: BFE_INT T6.X, T5.X, 0.0, literal.x,
|
|
; EG-NEXT: LSHR * T0.W, T5.X, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
|
|
; EG-NEXT: BFE_INT T7.X, T5.Y, 0.0, literal.x,
|
|
; EG-NEXT: LSHR T0.Z, T5.Y, literal.y,
|
|
; EG-NEXT: BFE_INT T6.W, PV.W, 0.0, literal.x,
|
|
; EG-NEXT: LSHR * T0.W, T5.X, literal.z,
|
|
; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LSHR T0.Y, T5.Y, literal.x,
|
|
; EG-NEXT: BFE_INT T6.Z, PS, 0.0, literal.y,
|
|
; EG-NEXT: BFE_INT T7.W, PV.Z, 0.0, literal.y,
|
|
; EG-NEXT: LSHR * T0.W, T5.X, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44)
|
|
; EG-NEXT: LSHR T5.X, KC0[2].Y, literal.x,
|
|
; EG-NEXT: BFE_INT T6.Y, PS, 0.0, literal.y,
|
|
; EG-NEXT: BFE_INT T7.Z, PV.Y, 0.0, literal.y,
|
|
; EG-NEXT: LSHR T0.W, T5.Y, literal.y,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
|
|
; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LSHR T8.X, PS, literal.x,
|
|
; EG-NEXT: BFE_INT * T7.Y, PV.W, 0.0, literal.y,
|
|
; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
|
|
;
|
|
; CM-LABEL: global_sextload_v8i8_to_v8i32:
|
|
; CM: ; %bb.0:
|
|
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: TEX 0 @6
|
|
; CM-NEXT: ALU 23, @9, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T7, T5.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T6, T9.X
|
|
; CM-NEXT: CF_END
|
|
; CM-NEXT: Fetch clause starting at 6:
|
|
; CM-NEXT: VTX_READ_64 T5.XY, T5.X, 0, #1
|
|
; CM-NEXT: ALU clause starting at 8:
|
|
; CM-NEXT: MOV * T5.X, KC0[2].Z,
|
|
; CM-NEXT: ALU clause starting at 9:
|
|
; CM-NEXT: BFE_INT T6.X, T5.Y, 0.0, literal.x,
|
|
; CM-NEXT: LSHR T0.Z, T5.X, literal.y,
|
|
; CM-NEXT: LSHR * T0.W, T5.Y, literal.z,
|
|
; CM-NEXT: 8(1.121039e-44), 16(2.242078e-44)
|
|
; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT T7.X, T5.X, 0.0, literal.x,
|
|
; CM-NEXT: LSHR T0.Y, T5.X, literal.y,
|
|
; CM-NEXT: LSHR T1.Z, T5.Y, literal.z,
|
|
; CM-NEXT: BFE_INT * T6.W, PV.W, 0.0, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 24(3.363116e-44)
|
|
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR T8.X, T5.Y, literal.x,
|
|
; CM-NEXT: ADD_INT T1.Y, KC0[2].Y, literal.y,
|
|
; CM-NEXT: BFE_INT T6.Z, PV.Z, 0.0, literal.x,
|
|
; CM-NEXT: BFE_INT * T7.W, PV.Y, 0.0, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 16(2.242078e-44)
|
|
; CM-NEXT: LSHR T9.X, PV.Y, literal.x,
|
|
; CM-NEXT: BFE_INT T6.Y, PV.X, 0.0, literal.y,
|
|
; CM-NEXT: BFE_INT T7.Z, T0.Z, 0.0, literal.y,
|
|
; CM-NEXT: LSHR * T0.W, T5.X, literal.y,
|
|
; CM-NEXT: 2(2.802597e-45), 8(1.121039e-44)
|
|
; CM-NEXT: LSHR T5.X, KC0[2].Y, literal.x,
|
|
; CM-NEXT: BFE_INT * T7.Y, PV.W, 0.0, literal.y,
|
|
; CM-NEXT: 2(2.802597e-45), 8(1.121039e-44)
|
|
; IR body: one vector load, a lane-wise sext to i32, then one vector store.
 %load = load <8 x i8>, ptr addrspace(1) %in
|
|
 %ext = sext <8 x i8> %load to <8 x i32>
|
|
 store <8 x i32> %ext, ptr addrspace(1) %out
|
|
 ret void
|
|
}
|
|
|
|
; TODO: EG/CM should use DST, but for some reason there are redundant MOVs
|
|
define amdgpu_kernel void @global_zextload_v16i8_to_v16i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
|
; GCN-NOHSA-SI-LABEL: global_zextload_v16i8_to_v16i32:
|
|
; GCN-NOHSA-SI: ; %bb.0:
|
|
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 24, v0
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v5, v0, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v11, 24, v1
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v9, v1, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v15, 24, v2
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v13, v2, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 24, v3
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v17, v3, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xff, v0
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v6, v0, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xff, v1
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v10, v1, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xff, v2
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v14, v2, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xff, v3
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v18, v3, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
|
|
; GCN-NOHSA-SI-NEXT: s_endpgm
|
|
;
|
|
; GCN-HSA-LABEL: global_zextload_v16i8_to_v16i32:
|
|
; GCN-HSA: ; %bb.0:
|
|
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
|
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
|
|
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
|
|
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
|
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
|
|
; GCN-HSA-NEXT: s_add_u32 s2, s0, 48
|
|
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s3
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v11, s2
|
|
; GCN-HSA-NEXT: s_add_u32 s2, s0, 32
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s1
|
|
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3
|
|
; GCN-HSA-NEXT: s_add_u32 s0, s0, 16
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s2
|
|
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
|
|
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 24, v3
|
|
; GCN-HSA-NEXT: v_bfe_u32 v8, v3, 8, 8
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff, v3
|
|
; GCN-HSA-NEXT: v_bfe_u32 v9, v3, 16, 8
|
|
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 24, v0
|
|
; GCN-HSA-NEXT: v_bfe_u32 v4, v0, 8, 8
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[11:12], v[7:10]
|
|
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 24, v2
|
|
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 24, v1
|
|
; GCN-HSA-NEXT: v_bfe_u32 v8, v1, 8, 8
|
|
; GCN-HSA-NEXT: v_bfe_u32 v12, v2, 8, 8
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xff, v0
|
|
; GCN-HSA-NEXT: v_bfe_u32 v5, v0, 16, 8
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff, v1
|
|
; GCN-HSA-NEXT: v_bfe_u32 v9, v1, 16, 8
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xff, v2
|
|
; GCN-HSA-NEXT: v_bfe_u32 v13, v2, 16, 8
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[17:18], v[11:14]
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[7:10]
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[3:6]
|
|
; GCN-HSA-NEXT: s_endpgm
|
|
;
|
|
; GCN-NOHSA-VI-LABEL: global_zextload_v16i8_to_v16i32:
|
|
; GCN-NOHSA-VI: ; %bb.0:
|
|
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
|
|
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 24, v3
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v17, v3, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, 0xff, v3
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v18, v3, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 24, v0
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v5, v0, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 24, v1
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v9, v1, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 24, v2
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v13, v2, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xff, v0
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v6, v0, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xff, v1
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v10, v1, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xff, v2
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v14, v2, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
|
|
; GCN-NOHSA-VI-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: global_zextload_v16i8_to_v16i32:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: TEX 0 @8
|
|
; EG-NEXT: ALU 39, @11, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T14.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T13.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T11.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T7.X, 1
|
|
; EG-NEXT: CF_END
|
|
; EG-NEXT: Fetch clause starting at 8:
|
|
; EG-NEXT: VTX_READ_128 T7.XYZW, T7.X, 0, #1
|
|
; EG-NEXT: ALU clause starting at 10:
|
|
; EG-NEXT: MOV * T7.X, KC0[2].Z,
|
|
; EG-NEXT: ALU clause starting at 11:
|
|
; EG-NEXT: MOV * T0.W, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_UINT * T8.Z, T7.X, literal.x, PV.W,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_UINT T8.Y, T7.X, literal.x, T0.W,
|
|
; EG-NEXT: BFE_UINT T9.Z, T7.Y, literal.y, T0.W,
|
|
; EG-NEXT: LSHR * T8.W, T7.X, literal.z,
|
|
; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44)
|
|
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T8.X, T7.X, literal.x,
|
|
; EG-NEXT: BFE_UINT T9.Y, T7.Y, literal.y, T0.W,
|
|
; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.z,
|
|
; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44)
|
|
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_UINT T10.Z, T7.Z, literal.x, T0.W,
|
|
; EG-NEXT: LSHR * T9.W, T7.Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
|
|
; EG-NEXT: AND_INT T9.X, T7.Y, literal.x,
|
|
; EG-NEXT: BFE_UINT T10.Y, T7.Z, literal.y, T0.W,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
|
|
; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44)
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LSHR T11.X, PV.W, literal.x,
|
|
; EG-NEXT: BFE_UINT T12.Z, T7.W, literal.y, T0.W,
|
|
; EG-NEXT: LSHR T10.W, T7.Z, literal.z,
|
|
; EG-NEXT: AND_INT * T10.X, T7.Z, literal.w,
|
|
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43)
|
|
; EG-NEXT: BFE_UINT T12.Y, T7.W, literal.x, T0.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), 32(4.484155e-44)
|
|
; EG-NEXT: LSHR T13.X, PV.W, literal.x,
|
|
; EG-NEXT: LSHR T12.W, T7.W, literal.y,
|
|
; EG-NEXT: AND_INT * T12.X, T7.W, literal.z,
|
|
; EG-NEXT: 2(2.802597e-45), 24(3.363116e-44)
|
|
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LSHR * T14.X, PV.W, literal.x,
|
|
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
;
|
|
; CM-LABEL: global_zextload_v16i8_to_v16i32:
|
|
; CM: ; %bb.0:
|
|
; CM-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: TEX 0 @8
|
|
; CM-NEXT: ALU 40, @11, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T7, T14.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T11, T13.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T9, T12.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T8, T10.X
|
|
; CM-NEXT: CF_END
|
|
; CM-NEXT: Fetch clause starting at 8:
|
|
; CM-NEXT: VTX_READ_128 T7.XYZW, T7.X, 0, #1
|
|
; CM-NEXT: ALU clause starting at 10:
|
|
; CM-NEXT: MOV * T7.X, KC0[2].Z,
|
|
; CM-NEXT: ALU clause starting at 11:
|
|
; CM-NEXT: MOV * T0.W, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_UINT * T8.Z, T7.W, literal.x, PV.W,
|
|
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_UINT T8.Y, T7.W, literal.x, T0.W,
|
|
; CM-NEXT: BFE_UINT T9.Z, T7.Z, literal.y, T0.W,
|
|
; CM-NEXT: LSHR * T8.W, T7.W, literal.z,
|
|
; CM-NEXT: 8(1.121039e-44), 16(2.242078e-44)
|
|
; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T8.X, T7.W, literal.x,
|
|
; CM-NEXT: BFE_UINT T9.Y, T7.Z, literal.y, T0.W,
|
|
; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
|
|
; CM-NEXT: 255(3.573311e-43), 8(1.121039e-44)
|
|
; CM-NEXT: 48(6.726233e-44), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR T10.X, PV.W, literal.x,
|
|
; CM-NEXT: BFE_UINT T11.Z, T7.Y, literal.y, T0.W,
|
|
; CM-NEXT: LSHR * T9.W, T7.Z, literal.z,
|
|
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T9.X, T7.Z, literal.x,
|
|
; CM-NEXT: BFE_UINT T11.Y, T7.Y, literal.y, T0.W,
|
|
; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
|
|
; CM-NEXT: 255(3.573311e-43), 8(1.121039e-44)
|
|
; CM-NEXT: 32(4.484155e-44), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR T12.X, PV.W, literal.x,
|
|
; CM-NEXT: BFE_UINT T7.Z, T7.X, literal.y, T0.W,
|
|
; CM-NEXT: LSHR * T11.W, T7.Y, literal.z,
|
|
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T11.X, T7.Y, literal.x,
|
|
; CM-NEXT: BFE_UINT T7.Y, T7.X, literal.y, T0.W,
|
|
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
|
|
; CM-NEXT: 255(3.573311e-43), 8(1.121039e-44)
|
|
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR T13.X, PV.W, literal.x,
|
|
; CM-NEXT: LSHR * T7.W, T7.X, literal.y,
|
|
; CM-NEXT: 2(2.802597e-45), 24(3.363116e-44)
|
|
; CM-NEXT: AND_INT * T7.X, T7.X, literal.x,
|
|
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR * T14.X, KC0[2].Y, literal.x,
|
|
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
%load = load <16 x i8>, ptr addrspace(1) %in
|
|
%ext = zext <16 x i8> %load to <16 x i32>
|
|
store <16 x i32> %ext, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
; TODO: EG/CM should use DST, but for some there are redundant MOVs
|
|
; Test kernel global_sextload_v16i8_to_v16i32: loads a <16 x i8> vector from %in,
; sign-extends every element to i32, and stores the resulting <16 x i32> vector
; to %out. The per-prefix assertion lines inside the body are autogenerated (see
; the NOTE at the top of this file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_kernel void @global_sextload_v16i8_to_v16i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
|
; GCN-NOHSA-SI-LABEL: global_sextload_v16i8_to_v16i32:
|
|
; GCN-NOHSA-SI: ; %bb.0:
|
|
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 24, v0
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v0, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v5, v0, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v0, 0, 8
|
|
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v11, 24, v1
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v10, v1, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v9, v1, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v8, v1, 0, 8
|
|
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v15, 24, v2
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v14, v2, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v13, v2, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v12, v2, 0, 8
|
|
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v19, 24, v3
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v18, v3, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v3, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v3, 0, 8
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
|
|
; GCN-NOHSA-SI-NEXT: s_endpgm
|
|
;
|
|
; GCN-HSA-LABEL: global_sextload_v16i8_to_v16i32:
|
|
; GCN-HSA: ; %bb.0:
|
|
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
|
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
|
|
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
|
|
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
|
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
|
|
; GCN-HSA-NEXT: s_add_u32 s2, s0, 48
|
|
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2
|
|
; GCN-HSA-NEXT: s_add_u32 s2, s0, 32
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1
|
|
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3
|
|
; GCN-HSA-NEXT: s_add_u32 s0, s0, 16
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2
|
|
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
|
|
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 24, v3
|
|
; GCN-HSA-NEXT: v_bfe_i32 v10, v3, 16, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v9, v3, 8, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v8, v3, 0, 8
|
|
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 24, v0
|
|
; GCN-HSA-NEXT: v_bfe_i32 v6, v0, 16, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v5, v0, 8, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v4, v0, 0, 8
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
|
|
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 24, v2
|
|
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 24, v1
|
|
; GCN-HSA-NEXT: v_bfe_i32 v10, v1, 16, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v9, v1, 8, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v8, v1, 0, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v14, v2, 16, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v13, v2, 8, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v12, v2, 0, 8
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15]
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[4:7]
|
|
; GCN-HSA-NEXT: s_endpgm
|
|
;
|
|
; GCN-NOHSA-VI-LABEL: global_sextload_v16i8_to_v16i32:
|
|
; GCN-NOHSA-VI: ; %bb.0:
|
|
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
|
|
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 24, v3
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v3, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v3, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v3, 0, 8
|
|
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 24, v0
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v0, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v5, v0, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v0, 0, 8
|
|
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 24, v1
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v1, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v9, v1, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v1, 0, 8
|
|
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 24, v2
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v2, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v2, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v2, 0, 8
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
|
|
; GCN-NOHSA-VI-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: global_sextload_v16i8_to_v16i32:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: TEX 0 @8
|
|
; EG-NEXT: ALU 47, @11, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T14.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T13.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T7.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T8.X, 1
|
|
; EG-NEXT: CF_END
|
|
; EG-NEXT: Fetch clause starting at 8:
|
|
; EG-NEXT: VTX_READ_128 T7.XYZW, T7.X, 0, #1
|
|
; EG-NEXT: ALU clause starting at 10:
|
|
; EG-NEXT: MOV * T7.X, KC0[2].Z,
|
|
; EG-NEXT: ALU clause starting at 11:
|
|
; EG-NEXT: LSHR T8.X, KC0[2].Y, literal.x,
|
|
; EG-NEXT: LSHR T0.W, T7.W, literal.y,
|
|
; EG-NEXT: LSHR * T1.W, T7.Z, literal.z,
|
|
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T9.X, T7.X, 0.0, literal.x,
|
|
; EG-NEXT: LSHR T0.Y, T7.W, literal.y,
|
|
; EG-NEXT: LSHR T0.Z, T7.Z, literal.z,
|
|
; EG-NEXT: LSHR T2.W, T7.Y, literal.x,
|
|
; EG-NEXT: LSHR * T3.W, T7.X, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T10.X, T7.Y, 0.0, literal.x,
|
|
; EG-NEXT: LSHR T1.Y, T7.Z, literal.y,
|
|
; EG-NEXT: LSHR T1.Z, T7.Y, literal.y,
|
|
; EG-NEXT: BFE_INT T9.W, PS, 0.0, literal.x,
|
|
; EG-NEXT: LSHR * T3.W, T7.X, literal.z,
|
|
; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T11.X, T7.Z, 0.0, literal.x,
|
|
; EG-NEXT: LSHR T2.Y, T7.Y, literal.y,
|
|
; EG-NEXT: BFE_INT T9.Z, PS, 0.0, literal.x,
|
|
; EG-NEXT: BFE_INT T10.W, PV.Z, 0.0, literal.x,
|
|
; EG-NEXT: LSHR * T3.W, T7.X, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44)
|
|
; EG-NEXT: BFE_INT T12.X, T7.W, 0.0, literal.x,
|
|
; EG-NEXT: BFE_INT T9.Y, PS, 0.0, literal.x,
|
|
; EG-NEXT: BFE_INT T10.Z, PV.Y, 0.0, literal.x,
|
|
; EG-NEXT: BFE_INT T11.W, T1.Y, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44)
|
|
; EG-NEXT: LSHR T7.X, PS, literal.x,
|
|
; EG-NEXT: BFE_INT T10.Y, T2.W, 0.0, literal.y,
|
|
; EG-NEXT: BFE_INT T11.Z, T0.Z, 0.0, literal.y,
|
|
; EG-NEXT: BFE_INT T12.W, T0.Y, 0.0, literal.y,
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.z,
|
|
; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
|
|
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LSHR T13.X, PS, literal.x,
|
|
; EG-NEXT: BFE_INT T11.Y, T1.W, 0.0, literal.y,
|
|
; EG-NEXT: BFE_INT T12.Z, T0.W, 0.0, literal.y, BS:VEC_120/SCL_212
|
|
; EG-NEXT: LSHR T0.W, T7.W, literal.y, BS:VEC_201
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
|
|
; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
|
|
; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LSHR T14.X, PS, literal.x,
|
|
; EG-NEXT: BFE_INT * T12.Y, PV.W, 0.0, literal.y,
|
|
; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
|
|
;
|
|
; CM-LABEL: global_sextload_v16i8_to_v16i32:
|
|
; CM: ; %bb.0:
|
|
; CM-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: TEX 0 @8
|
|
; CM-NEXT: ALU 48, @11, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T13, T7.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T12, T15.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T11, T14.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T10, T9.X
|
|
; CM-NEXT: CF_END
|
|
; CM-NEXT: Fetch clause starting at 8:
|
|
; CM-NEXT: VTX_READ_128 T7.XYZW, T7.X, 0, #1
|
|
; CM-NEXT: ALU clause starting at 10:
|
|
; CM-NEXT: MOV * T7.X, KC0[2].Z,
|
|
; CM-NEXT: ALU clause starting at 11:
|
|
; CM-NEXT: LSHR * T0.W, T7.X, literal.x,
|
|
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR T8.X, T7.Y, literal.x,
|
|
; CM-NEXT: ADD_INT T0.Y, KC0[2].Y, literal.y,
|
|
; CM-NEXT: LSHR T0.Z, T7.X, literal.z,
|
|
; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.w,
|
|
; CM-NEXT: 8(1.121039e-44), 16(2.242078e-44)
|
|
; CM-NEXT: 24(3.363116e-44), 48(6.726233e-44)
|
|
; CM-NEXT: LSHR T9.X, PV.W, literal.x,
|
|
; CM-NEXT: LSHR T1.Y, T7.Y, literal.y,
|
|
; CM-NEXT: LSHR T1.Z, T7.Z, literal.z,
|
|
; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.w,
|
|
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; CM-NEXT: 8(1.121039e-44), 32(4.484155e-44)
|
|
; CM-NEXT: BFE_INT T10.X, T7.W, 0.0, literal.x,
|
|
; CM-NEXT: LSHR T2.Y, T7.Y, literal.y,
|
|
; CM-NEXT: LSHR T2.Z, T7.Z, literal.z,
|
|
; CM-NEXT: LSHR * T2.W, T7.W, literal.y,
|
|
; CM-NEXT: 8(1.121039e-44), 24(3.363116e-44)
|
|
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT T11.X, T7.Z, 0.0, literal.x,
|
|
; CM-NEXT: LSHR T3.Y, T7.Z, literal.y,
|
|
; CM-NEXT: LSHR T3.Z, T7.W, literal.z,
|
|
; CM-NEXT: BFE_INT * T10.W, PV.W, 0.0, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 24(3.363116e-44)
|
|
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT T12.X, T7.Y, 0.0, literal.x,
|
|
; CM-NEXT: LSHR T4.Y, T7.W, literal.x,
|
|
; CM-NEXT: BFE_INT T10.Z, PV.Z, 0.0, literal.x,
|
|
; CM-NEXT: BFE_INT * T11.W, PV.Y, 0.0, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT T13.X, T7.X, 0.0, literal.x,
|
|
; CM-NEXT: BFE_INT T10.Y, PV.Y, 0.0, literal.x,
|
|
; CM-NEXT: BFE_INT T11.Z, T2.Z, 0.0, literal.x,
|
|
; CM-NEXT: BFE_INT * T12.W, T2.Y, 0.0, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR T14.X, T1.W, literal.x,
|
|
; CM-NEXT: BFE_INT T11.Y, T1.Z, 0.0, literal.y,
|
|
; CM-NEXT: BFE_INT T12.Z, T1.Y, 0.0, literal.y,
|
|
; CM-NEXT: BFE_INT * T13.W, T0.Z, 0.0, literal.y, BS:VEC_120/SCL_212
|
|
; CM-NEXT: 2(2.802597e-45), 8(1.121039e-44)
|
|
; CM-NEXT: LSHR T15.X, T0.Y, literal.x,
|
|
; CM-NEXT: BFE_INT T12.Y, T8.X, 0.0, literal.y,
|
|
; CM-NEXT: BFE_INT T13.Z, T0.W, 0.0, literal.y,
|
|
; CM-NEXT: LSHR * T0.W, T7.X, literal.y, BS:VEC_120/SCL_212
|
|
; CM-NEXT: 2(2.802597e-45), 8(1.121039e-44)
|
|
; CM-NEXT: LSHR T7.X, KC0[2].Y, literal.x,
|
|
; CM-NEXT: BFE_INT * T13.Y, PV.W, 0.0, literal.y,
|
|
; CM-NEXT: 2(2.802597e-45), 8(1.121039e-44)
|
|
%load = load <16 x i8>, ptr addrspace(1) %in
|
|
%ext = sext <16 x i8> %load to <16 x i32>
|
|
store <16 x i32> %ext, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
; TODO: EG/CM should use DST, but for some there are redundant MOVs
|
|
define amdgpu_kernel void @global_zextload_v32i8_to_v32i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
|
; GCN-NOHSA-SI-LABEL: global_zextload_v32i8_to_v32i32:
|
|
; GCN-NOHSA-SI: ; %bb.0:
|
|
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1)
|
|
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v11, 24, v0
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v9, v0, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v15, 24, v1
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v13, v1, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 24, v2
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v17, v2, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v23, 24, v3
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v21, v3, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xff, v0
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v10, v0, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xff, v1
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v14, v1, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xff, v2
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v18, v2, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, 0xff, v3
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v22, v3, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 24, v4
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v1, v4, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v27, 24, v5
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v25, v5, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v31, 24, v6
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v29, v6, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v35, 24, v7
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v33, v7, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xff, v4
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v2, v4, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v24, 0xff, v5
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v26, v5, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v28, 0xff, v6
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v30, v6, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v32, 0xff, v7
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v34, v7, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:112
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:96
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:80
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:48
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0
|
|
; GCN-NOHSA-SI-NEXT: s_endpgm
|
|
;
|
|
; GCN-HSA-LABEL: global_zextload_v32i8_to_v32i32:
|
|
; GCN-HSA: ; %bb.0:
|
|
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
|
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
|
|
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
|
|
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
|
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
|
|
; GCN-HSA-NEXT: s_add_u32 s2, s2, 16
|
|
; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
|
|
; GCN-HSA-NEXT: s_add_u32 s2, s0, 48
|
|
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2
|
|
; GCN-HSA-NEXT: s_add_u32 s2, s0, 32
|
|
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2
|
|
; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
|
|
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2
|
|
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70
|
|
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0
|
|
; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
|
|
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 24, v7
|
|
; GCN-HSA-NEXT: v_bfe_u32 v9, v7, 8, 8
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff, v7
|
|
; GCN-HSA-NEXT: v_bfe_u32 v10, v7, 16, 8
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2
|
|
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60
|
|
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
|
|
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 24, v6
|
|
; GCN-HSA-NEXT: v_bfe_u32 v8, v6, 8, 8
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff, v6
|
|
; GCN-HSA-NEXT: v_bfe_u32 v9, v6, 16, 8
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[7:10]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2
|
|
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50
|
|
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 24, v5
|
|
; GCN-HSA-NEXT: v_bfe_u32 v7, v5, 8, 8
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xff, v5
|
|
; GCN-HSA-NEXT: v_bfe_u32 v8, v5, 16, 8
|
|
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[6:9]
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xff, v4
|
|
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v8, 24, v4
|
|
; GCN-HSA-NEXT: v_bfe_u32 v6, v4, 8, 8
|
|
; GCN-HSA-NEXT: v_bfe_u32 v7, v4, 16, 8
|
|
; GCN-HSA-NEXT: s_add_u32 s0, s0, 64
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[5:8]
|
|
; GCN-HSA-NEXT: s_waitcnt vmcnt(4)
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xff, v3
|
|
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 24, v3
|
|
; GCN-HSA-NEXT: v_bfe_u32 v5, v3, 8, 8
|
|
; GCN-HSA-NEXT: v_bfe_u32 v6, v3, 16, 8
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3
|
|
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
|
|
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 24, v0
|
|
; GCN-HSA-NEXT: v_bfe_u32 v9, v0, 8, 8
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7]
|
|
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 24, v2
|
|
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 24, v1
|
|
; GCN-HSA-NEXT: v_bfe_u32 v4, v1, 8, 8
|
|
; GCN-HSA-NEXT: v_bfe_u32 v13, v2, 8, 8
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff, v0
|
|
; GCN-HSA-NEXT: v_bfe_u32 v10, v0, 16, 8
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xff, v1
|
|
; GCN-HSA-NEXT: v_bfe_u32 v5, v1, 16, 8
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xff, v2
|
|
; GCN-HSA-NEXT: v_bfe_u32 v14, v2, 16, 8
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[3:6]
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
|
|
; GCN-HSA-NEXT: s_endpgm
|
|
;
|
|
; GCN-NOHSA-VI-LABEL: global_zextload_v32i8_to_v32i32:
|
|
; GCN-NOHSA-VI: ; %bb.0:
|
|
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
|
|
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
|
|
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1)
|
|
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 24, v0
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v35, 24, v7
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v33, v7, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v32, 0xff, v7
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v34, v7, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v9, v0, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 24, v1
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v13, v1, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 24, v2
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v17, v2, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v23, 24, v3
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v21, v3, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xff, v0
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v10, v0, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xff, v1
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v14, v1, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, 0xff, v2
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v18, v2, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xff, v3
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v22, v3, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 24, v4
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v1, v4, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v27, 24, v5
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v25, v5, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v31, 24, v6
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v29, v6, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xff, v4
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v2, v4, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v24, 0xff, v5
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v26, v5, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v28, 0xff, v6
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v30, v6, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:112
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:96
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:80
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:48
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0
|
|
; GCN-NOHSA-VI-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: global_zextload_v32i8_to_v32i32:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: ALU 0, @16, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: TEX 1 @12
|
|
; EG-NEXT: ALU 75, @17, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T26.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T25.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T23.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T12.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T20.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T18.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T16.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T11.X, 1
|
|
; EG-NEXT: CF_END
|
|
; EG-NEXT: Fetch clause starting at 12:
|
|
; EG-NEXT: VTX_READ_128 T12.XYZW, T11.X, 16, #1
|
|
; EG-NEXT: VTX_READ_128 T11.XYZW, T11.X, 0, #1
|
|
; EG-NEXT: ALU clause starting at 16:
|
|
; EG-NEXT: MOV * T11.X, KC0[2].Z,
|
|
; EG-NEXT: ALU clause starting at 17:
|
|
; EG-NEXT: MOV * T0.W, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_UINT * T13.Z, T11.X, literal.x, PV.W,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_UINT T13.Y, T11.X, literal.x, T0.W,
|
|
; EG-NEXT: BFE_UINT T14.Z, T11.Y, literal.y, T0.W,
|
|
; EG-NEXT: LSHR * T13.W, T11.X, literal.z,
|
|
; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44)
|
|
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T13.X, T11.X, literal.x,
|
|
; EG-NEXT: BFE_UINT T14.Y, T11.Y, literal.y, T0.W,
|
|
; EG-NEXT: LSHR * T11.X, KC0[2].Y, literal.z,
|
|
; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44)
|
|
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_UINT T15.Z, T11.Z, literal.x, T0.W,
|
|
; EG-NEXT: LSHR * T14.W, T11.Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
|
|
; EG-NEXT: AND_INT T14.X, T11.Y, literal.x,
|
|
; EG-NEXT: BFE_UINT T15.Y, T11.Z, literal.y, T0.W,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
|
|
; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44)
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LSHR T16.X, PV.W, literal.x,
|
|
; EG-NEXT: BFE_UINT T17.Z, T11.W, literal.y, T0.W,
|
|
; EG-NEXT: LSHR T15.W, T11.Z, literal.z,
|
|
; EG-NEXT: AND_INT * T15.X, T11.Z, literal.w,
|
|
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43)
|
|
; EG-NEXT: BFE_UINT T17.Y, T11.W, literal.x, T0.W,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), 32(4.484155e-44)
|
|
; EG-NEXT: LSHR T18.X, PV.W, literal.x,
|
|
; EG-NEXT: BFE_UINT T19.Z, T12.X, literal.y, T0.W, BS:VEC_021/SCL_122
|
|
; EG-NEXT: LSHR T17.W, T11.W, literal.z,
|
|
; EG-NEXT: AND_INT * T17.X, T11.W, literal.w,
|
|
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43)
|
|
; EG-NEXT: BFE_UINT T19.Y, T12.X, literal.x, T0.W,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), 48(6.726233e-44)
|
|
; EG-NEXT: LSHR T20.X, PV.W, literal.x,
|
|
; EG-NEXT: BFE_UINT T21.Z, T12.Y, literal.y, T0.W,
|
|
; EG-NEXT: LSHR T19.W, T12.X, literal.z,
|
|
; EG-NEXT: AND_INT * T19.X, T12.X, literal.w,
|
|
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43)
|
|
; EG-NEXT: BFE_UINT T21.Y, T12.Y, literal.x, T0.W,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), 64(8.968310e-44)
|
|
; EG-NEXT: LSHR T12.X, PV.W, literal.x,
|
|
; EG-NEXT: BFE_UINT T22.Z, T12.Z, literal.y, T0.W,
|
|
; EG-NEXT: LSHR T21.W, T12.Y, literal.z,
|
|
; EG-NEXT: AND_INT * T21.X, T12.Y, literal.w,
|
|
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43)
|
|
; EG-NEXT: BFE_UINT T22.Y, T12.Z, literal.x, T0.W,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), 80(1.121039e-43)
|
|
; EG-NEXT: LSHR T23.X, PV.W, literal.x,
|
|
; EG-NEXT: BFE_UINT T24.Z, T12.W, literal.y, T0.W,
|
|
; EG-NEXT: LSHR T22.W, T12.Z, literal.z,
|
|
; EG-NEXT: AND_INT * T22.X, T12.Z, literal.w,
|
|
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43)
|
|
; EG-NEXT: BFE_UINT T24.Y, T12.W, literal.x, T0.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), 96(1.345247e-43)
|
|
; EG-NEXT: LSHR T25.X, PV.W, literal.x,
|
|
; EG-NEXT: LSHR T24.W, T12.W, literal.y,
|
|
; EG-NEXT: AND_INT * T24.X, T12.W, literal.z,
|
|
; EG-NEXT: 2(2.802597e-45), 24(3.363116e-44)
|
|
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 112(1.569454e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LSHR * T26.X, PV.W, literal.x,
|
|
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
;
|
|
; CM-LABEL: global_zextload_v32i8_to_v32i32:
|
|
; CM: ; %bb.0:
|
|
; CM-NEXT: ALU 0, @16, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: TEX 1 @12
|
|
; CM-NEXT: ALU 80, @17, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T12, T26.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T23, T25.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T21, T24.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T19, T22.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T11, T20.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T16, T18.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T14, T17.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T13, T15.X
|
|
; CM-NEXT: CF_END
|
|
; CM-NEXT: Fetch clause starting at 12:
|
|
; CM-NEXT: VTX_READ_128 T12.XYZW, T11.X, 0, #1
|
|
; CM-NEXT: VTX_READ_128 T11.XYZW, T11.X, 16, #1
|
|
; CM-NEXT: ALU clause starting at 16:
|
|
; CM-NEXT: MOV * T11.X, KC0[2].Z,
|
|
; CM-NEXT: ALU clause starting at 17:
|
|
; CM-NEXT: MOV * T0.W, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_UINT * T13.Z, T11.W, literal.x, PV.W,
|
|
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_UINT T13.Y, T11.W, literal.x, T0.W,
|
|
; CM-NEXT: BFE_UINT T14.Z, T11.Z, literal.y, T0.W,
|
|
; CM-NEXT: LSHR * T13.W, T11.W, literal.z,
|
|
; CM-NEXT: 8(1.121039e-44), 16(2.242078e-44)
|
|
; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T13.X, T11.W, literal.x,
|
|
; CM-NEXT: BFE_UINT T14.Y, T11.Z, literal.y, T0.W,
|
|
; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
|
|
; CM-NEXT: 255(3.573311e-43), 8(1.121039e-44)
|
|
; CM-NEXT: 112(1.569454e-43), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR T15.X, PV.W, literal.x,
|
|
; CM-NEXT: BFE_UINT T16.Z, T11.Y, literal.y, T0.W,
|
|
; CM-NEXT: LSHR * T14.W, T11.Z, literal.z,
|
|
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T14.X, T11.Z, literal.x,
|
|
; CM-NEXT: BFE_UINT T16.Y, T11.Y, literal.y, T0.W,
|
|
; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
|
|
; CM-NEXT: 255(3.573311e-43), 8(1.121039e-44)
|
|
; CM-NEXT: 96(1.345247e-43), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR T17.X, PV.W, literal.x,
|
|
; CM-NEXT: BFE_UINT T11.Z, T11.X, literal.y, T0.W,
|
|
; CM-NEXT: LSHR * T16.W, T11.Y, literal.z,
|
|
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T16.X, T11.Y, literal.x,
|
|
; CM-NEXT: BFE_UINT T11.Y, T11.X, literal.y, T0.W,
|
|
; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
|
|
; CM-NEXT: 255(3.573311e-43), 8(1.121039e-44)
|
|
; CM-NEXT: 80(1.121039e-43), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR T18.X, PV.W, literal.x,
|
|
; CM-NEXT: BFE_UINT T19.Z, T12.W, literal.y, T0.W,
|
|
; CM-NEXT: LSHR * T11.W, T11.X, literal.z,
|
|
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T11.X, T11.X, literal.x,
|
|
; CM-NEXT: BFE_UINT T19.Y, T12.W, literal.y, T0.W,
|
|
; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
|
|
; CM-NEXT: 255(3.573311e-43), 8(1.121039e-44)
|
|
; CM-NEXT: 64(8.968310e-44), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR T20.X, PV.W, literal.x,
|
|
; CM-NEXT: BFE_UINT T21.Z, T12.Z, literal.y, T0.W,
|
|
; CM-NEXT: LSHR * T19.W, T12.W, literal.z,
|
|
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T19.X, T12.W, literal.x,
|
|
; CM-NEXT: BFE_UINT T21.Y, T12.Z, literal.y, T0.W,
|
|
; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
|
|
; CM-NEXT: 255(3.573311e-43), 8(1.121039e-44)
|
|
; CM-NEXT: 48(6.726233e-44), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR T22.X, PV.W, literal.x,
|
|
; CM-NEXT: BFE_UINT T23.Z, T12.Y, literal.y, T0.W,
|
|
; CM-NEXT: LSHR * T21.W, T12.Z, literal.z,
|
|
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T21.X, T12.Z, literal.x,
|
|
; CM-NEXT: BFE_UINT T23.Y, T12.Y, literal.y, T0.W,
|
|
; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
|
|
; CM-NEXT: 255(3.573311e-43), 8(1.121039e-44)
|
|
; CM-NEXT: 32(4.484155e-44), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR T24.X, PV.W, literal.x,
|
|
; CM-NEXT: BFE_UINT T12.Z, T12.X, literal.y, T0.W,
|
|
; CM-NEXT: LSHR * T23.W, T12.Y, literal.z,
|
|
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T23.X, T12.Y, literal.x,
|
|
; CM-NEXT: BFE_UINT T12.Y, T12.X, literal.y, T0.W,
|
|
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
|
|
; CM-NEXT: 255(3.573311e-43), 8(1.121039e-44)
|
|
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR T25.X, PV.W, literal.x,
|
|
; CM-NEXT: LSHR * T12.W, T12.X, literal.y,
|
|
; CM-NEXT: 2(2.802597e-45), 24(3.363116e-44)
|
|
; CM-NEXT: AND_INT * T12.X, T12.X, literal.x,
|
|
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR * T26.X, KC0[2].Y, literal.x,
|
|
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
%load = load <32 x i8>, ptr addrspace(1) %in
|
|
%ext = zext <32 x i8> %load to <32 x i32>
|
|
store <32 x i32> %ext, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
; TODO: EG/CM should use DST, but for some there are redundant MOVs
|
|
; Kernel under test: loads a <32 x i8> vector from %in (addrspace 1),
; sign-extends every lane to i32, and stores the resulting <32 x i32> vector
; to %out (addrspace 1).
; The per-target check lines that follow are autogenerated (see the NOTE at
; the top of this file); regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
define amdgpu_kernel void @global_sextload_v32i8_to_v32i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
|
; GCN-NOHSA-SI-LABEL: global_sextload_v32i8_to_v32i32:
|
|
; GCN-NOHSA-SI: ; %bb.0:
|
|
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1)
|
|
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v11, 24, v0
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v10, v0, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v9, v0, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v8, v0, 0, 8
|
|
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v15, 24, v1
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v14, v1, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v13, v1, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v12, v1, 0, 8
|
|
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v19, 24, v2
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v18, v2, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v2, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v2, 0, 8
|
|
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 24, v3
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v3, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v21, v3, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v20, v3, 0, 8
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 24, v4
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v4, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v1, v4, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v4, 0, 8
|
|
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v27, 24, v5
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v26, v5, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v25, v5, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v24, v5, 0, 8
|
|
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v31, 24, v6
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v30, v6, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v29, v6, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v28, v6, 0, 8
|
|
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v35, 24, v7
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v34, v7, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v33, v7, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v32, v7, 0, 8
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:112
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:96
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:80
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:48
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0
|
|
; GCN-NOHSA-SI-NEXT: s_endpgm
|
|
;
|
|
; GCN-HSA-LABEL: global_sextload_v32i8_to_v32i32:
|
|
; GCN-HSA: ; %bb.0:
|
|
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
|
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
|
|
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
|
|
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
|
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
|
|
; GCN-HSA-NEXT: s_add_u32 s2, s2, 16
|
|
; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
|
|
; GCN-HSA-NEXT: s_add_u32 s2, s0, 48
|
|
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v14, s3
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s2
|
|
; GCN-HSA-NEXT: s_add_u32 s2, s0, 32
|
|
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s3
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s2
|
|
; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
|
|
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s2
|
|
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70
|
|
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s1
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v11, s0
|
|
; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
|
|
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 24, v7
|
|
; GCN-HSA-NEXT: v_bfe_i32 v9, v7, 16, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v8, v7, 8, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v7, v7, 0, 8
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[7:10]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v14, s3
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s2
|
|
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60
|
|
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
|
|
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 24, v6
|
|
; GCN-HSA-NEXT: v_bfe_i32 v8, v6, 16, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v7, v6, 8, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v6, v6, 0, 8
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v20, s3
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[6:9]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v19, s2
|
|
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50
|
|
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 24, v5
|
|
; GCN-HSA-NEXT: v_bfe_i32 v7, v5, 16, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v6, v5, 8, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v5, v5, 0, 8
|
|
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[17:18], v[5:8]
|
|
; GCN-HSA-NEXT: s_add_u32 s0, s0, 64
|
|
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 24, v4
|
|
; GCN-HSA-NEXT: v_bfe_i32 v6, v4, 16, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v5, v4, 8, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v4, v4, 0, 8
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[11:12], v[4:7]
|
|
; GCN-HSA-NEXT: s_waitcnt vmcnt(4)
|
|
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 24, v3
|
|
; GCN-HSA-NEXT: v_bfe_i32 v10, v3, 16, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v9, v3, 8, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v8, v3, 0, 8
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3
|
|
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
|
|
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 24, v0
|
|
; GCN-HSA-NEXT: v_bfe_i32 v6, v0, 16, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v5, v0, 8, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v4, v0, 0, 8
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[8:11]
|
|
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 24, v2
|
|
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 24, v1
|
|
; GCN-HSA-NEXT: v_bfe_i32 v10, v1, 16, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v9, v1, 8, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v8, v1, 0, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v14, v2, 16, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v13, v2, 8, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v12, v2, 0, 8
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[12:15]
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11]
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
|
|
; GCN-HSA-NEXT: s_endpgm
|
|
;
|
|
; GCN-NOHSA-VI-LABEL: global_sextload_v32i8_to_v32i32:
|
|
; GCN-NOHSA-VI: ; %bb.0:
|
|
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
|
|
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
|
|
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1)
|
|
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 24, v0
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v35, 24, v7
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v34, v7, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v33, v7, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v32, v7, 0, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v0, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v9, v0, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v0, 0, 8
|
|
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 24, v1
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v1, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v1, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v1, 0, 8
|
|
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 24, v2
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v2, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v2, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v2, 0, 8
|
|
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v23, 24, v3
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v22, v3, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v21, v3, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v3, 0, 8
|
|
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 24, v4
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v4, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v1, v4, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v4, 0, 8
|
|
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v27, 24, v5
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v26, v5, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v25, v5, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v24, v5, 0, 8
|
|
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v31, 24, v6
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v30, v6, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v29, v6, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v28, v6, 0, 8
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:112
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:96
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:80
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:48
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0
|
|
; GCN-NOHSA-VI-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: global_sextload_v32i8_to_v32i32:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: ALU 0, @18, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: TEX 0 @14
|
|
; EG-NEXT: ALU 18, @19, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: TEX 0 @16
|
|
; EG-NEXT: ALU 75, @38, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T26.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T25.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T12.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T17.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T16.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T15.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T14.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T18.XYZW, T13.X, 1
|
|
; EG-NEXT: CF_END
|
|
; EG-NEXT: Fetch clause starting at 14:
|
|
; EG-NEXT: VTX_READ_128 T12.XYZW, T11.X, 16, #1
|
|
; EG-NEXT: Fetch clause starting at 16:
|
|
; EG-NEXT: VTX_READ_128 T11.XYZW, T11.X, 0, #1
|
|
; EG-NEXT: ALU clause starting at 18:
|
|
; EG-NEXT: MOV * T11.X, KC0[2].Z,
|
|
; EG-NEXT: ALU clause starting at 19:
|
|
; EG-NEXT: LSHR T13.X, KC0[2].Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; EG-NEXT: LSHR T14.X, PV.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
|
|
; EG-NEXT: LSHR T15.X, PV.W, literal.x,
|
|
; EG-NEXT: LSHR T0.Z, T12.W, literal.y,
|
|
; EG-NEXT: LSHR T0.W, T12.Z, literal.z,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.w,
|
|
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; EG-NEXT: 8(1.121039e-44), 48(6.726233e-44)
|
|
; EG-NEXT: LSHR T16.X, PS, literal.x,
|
|
; EG-NEXT: LSHR T0.Y, T12.W, literal.y,
|
|
; EG-NEXT: LSHR T1.Z, T12.Z, literal.z,
|
|
; EG-NEXT: LSHR T1.W, T12.Y, literal.w,
|
|
; EG-NEXT: LSHR * T2.W, T12.Z, literal.y,
|
|
; EG-NEXT: 2(2.802597e-45), 24(3.363116e-44)
|
|
; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44)
|
|
; EG-NEXT: ALU clause starting at 38:
|
|
; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 64(8.968310e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LSHR T17.X, PV.W, literal.x,
|
|
; EG-NEXT: LSHR T1.Y, T12.Y, literal.y,
|
|
; EG-NEXT: LSHR T2.Z, T12.Y, literal.z,
|
|
; EG-NEXT: LSHR T3.W, T12.X, literal.y,
|
|
; EG-NEXT: LSHR * T4.W, T12.X, literal.z,
|
|
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T18.X, T11.X, 0.0, literal.x,
|
|
; EG-NEXT: LSHR T2.Y, T11.W, literal.y,
|
|
; EG-NEXT: LSHR T3.Z, T11.W, literal.z,
|
|
; EG-NEXT: LSHR T5.W, T11.Z, literal.y,
|
|
; EG-NEXT: LSHR * T6.W, T11.X, literal.z,
|
|
; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44)
|
|
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T19.X, T11.Y, 0.0, literal.x,
|
|
; EG-NEXT: LSHR T3.Y, T11.Z, literal.y,
|
|
; EG-NEXT: LSHR T4.Z, T11.Y, literal.y,
|
|
; EG-NEXT: BFE_INT T18.W, PS, 0.0, literal.x,
|
|
; EG-NEXT: LSHR * T6.W, T11.X, literal.z,
|
|
; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T20.X, T11.Z, 0.0, literal.x,
|
|
; EG-NEXT: LSHR T4.Y, T11.Y, literal.y,
|
|
; EG-NEXT: BFE_INT T18.Z, PS, 0.0, literal.x,
|
|
; EG-NEXT: BFE_INT T19.W, PV.Z, 0.0, literal.x,
|
|
; EG-NEXT: LSHR * T6.W, T11.X, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44)
|
|
; EG-NEXT: BFE_INT T21.X, T11.W, 0.0, literal.x,
|
|
; EG-NEXT: BFE_INT T18.Y, PS, 0.0, literal.x,
|
|
; EG-NEXT: BFE_INT T19.Z, PV.Y, 0.0, literal.x,
|
|
; EG-NEXT: BFE_INT T20.W, T3.Y, 0.0, literal.x,
|
|
; EG-NEXT: LSHR * T6.W, T11.Y, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T22.X, T12.X, 0.0, literal.x,
|
|
; EG-NEXT: BFE_INT T19.Y, PS, 0.0, literal.x,
|
|
; EG-NEXT: BFE_INT T20.Z, T5.W, 0.0, literal.x,
|
|
; EG-NEXT: BFE_INT T21.W, T3.Z, 0.0, literal.x,
|
|
; EG-NEXT: LSHR * T5.W, T11.Z, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T11.X, T12.Y, 0.0, literal.x,
|
|
; EG-NEXT: BFE_INT T20.Y, PS, 0.0, literal.x,
|
|
; EG-NEXT: BFE_INT T21.Z, T2.Y, 0.0, literal.x, BS:VEC_120/SCL_212
|
|
; EG-NEXT: BFE_INT T22.W, T4.W, 0.0, literal.x,
|
|
; EG-NEXT: LSHR * T4.W, T11.W, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T23.X, T12.Z, 0.0, literal.x,
|
|
; EG-NEXT: BFE_INT T21.Y, PS, 0.0, literal.x,
|
|
; EG-NEXT: BFE_INT T22.Z, T3.W, 0.0, literal.x,
|
|
; EG-NEXT: BFE_INT T11.W, T2.Z, 0.0, literal.x, BS:VEC_120/SCL_212
|
|
; EG-NEXT: LSHR * T3.W, T12.X, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T24.X, T12.W, 0.0, literal.x,
|
|
; EG-NEXT: BFE_INT T22.Y, PS, 0.0, literal.x,
|
|
; EG-NEXT: BFE_INT T11.Z, T1.Y, 0.0, literal.x,
|
|
; EG-NEXT: BFE_INT T23.W, T2.W, 0.0, literal.x, BS:VEC_120/SCL_212
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), 80(1.121039e-43)
|
|
; EG-NEXT: LSHR T12.X, PS, literal.x,
|
|
; EG-NEXT: BFE_INT T11.Y, T1.W, 0.0, literal.y,
|
|
; EG-NEXT: BFE_INT T23.Z, T1.Z, 0.0, literal.y,
|
|
; EG-NEXT: BFE_INT T24.W, T0.Y, 0.0, literal.y,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
|
|
; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
|
|
; EG-NEXT: 96(1.345247e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LSHR T25.X, PS, literal.x,
|
|
; EG-NEXT: BFE_INT T23.Y, T0.W, 0.0, literal.y,
|
|
; EG-NEXT: BFE_INT T24.Z, T0.Z, 0.0, literal.y,
|
|
; EG-NEXT: LSHR T0.W, T12.W, literal.y, BS:VEC_120/SCL_212
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
|
|
; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
|
|
; EG-NEXT: 112(1.569454e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LSHR T26.X, PS, literal.x,
|
|
; EG-NEXT: BFE_INT * T24.Y, PV.W, 0.0, literal.y,
|
|
; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
|
|
;
|
|
; CM-LABEL: global_sextload_v32i8_to_v32i32:
|
|
; CM: ; %bb.0:
|
|
; CM-NEXT: ALU 0, @18, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: TEX 0 @14
|
|
; CM-NEXT: ALU 19, @19, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: TEX 0 @16
|
|
; CM-NEXT: ALU 78, @39, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T26, T12.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T25, T14.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T24, T27.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T23, T19.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T11, T18.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T22, T17.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T21, T16.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T20, T15.X
|
|
; CM-NEXT: CF_END
|
|
; CM-NEXT: Fetch clause starting at 14:
|
|
; CM-NEXT: VTX_READ_128 T12.XYZW, T11.X, 0, #1
|
|
; CM-NEXT: Fetch clause starting at 16:
|
|
; CM-NEXT: VTX_READ_128 T11.XYZW, T11.X, 16, #1
|
|
; CM-NEXT: ALU clause starting at 18:
|
|
; CM-NEXT: MOV * T11.X, KC0[2].Z,
|
|
; CM-NEXT: ALU clause starting at 19:
|
|
; CM-NEXT: LSHR * T0.W, T12.X, literal.x,
|
|
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR T13.X, T12.Y, literal.x,
|
|
; CM-NEXT: ADD_INT T0.Y, KC0[2].Y, literal.y,
|
|
; CM-NEXT: LSHR T0.Z, T12.X, literal.z,
|
|
; CM-NEXT: LSHR * T1.W, T12.Y, literal.y,
|
|
; CM-NEXT: 8(1.121039e-44), 16(2.242078e-44)
|
|
; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR T14.X, T12.Z, literal.x,
|
|
; CM-NEXT: ADD_INT T1.Y, KC0[2].Y, literal.y,
|
|
; CM-NEXT: LSHR T1.Z, T12.Y, literal.z,
|
|
; CM-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.w,
|
|
; CM-NEXT: 8(1.121039e-44), 32(4.484155e-44)
|
|
; CM-NEXT: 24(3.363116e-44), 112(1.569454e-43)
|
|
; CM-NEXT: LSHR T15.X, PV.W, literal.x,
|
|
; CM-NEXT: LSHR T2.Y, T12.Z, literal.y,
|
|
; CM-NEXT: LSHR T2.Z, T12.W, literal.z,
|
|
; CM-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.w,
|
|
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; CM-NEXT: 8(1.121039e-44), 96(1.345247e-43)
|
|
; CM-NEXT: ALU clause starting at 39:
|
|
; CM-NEXT: LSHR T16.X, T2.W, literal.x,
|
|
; CM-NEXT: LSHR T3.Y, T12.Z, literal.y,
|
|
; CM-NEXT: LSHR T3.Z, T12.W, literal.z, BS:VEC_120/SCL_212
|
|
; CM-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.w,
|
|
; CM-NEXT: 2(2.802597e-45), 24(3.363116e-44)
|
|
; CM-NEXT: 16(2.242078e-44), 80(1.121039e-43)
|
|
; CM-NEXT: LSHR T17.X, PV.W, literal.x,
|
|
; CM-NEXT: LSHR T4.Y, T11.X, literal.y,
|
|
; CM-NEXT: LSHR T4.Z, T12.W, literal.z,
|
|
; CM-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.w,
|
|
; CM-NEXT: 2(2.802597e-45), 8(1.121039e-44)
|
|
; CM-NEXT: 24(3.363116e-44), 64(8.968310e-44)
|
|
; CM-NEXT: LSHR T18.X, PV.W, literal.x,
|
|
; CM-NEXT: LSHR T5.Y, T11.X, literal.y,
|
|
; CM-NEXT: LSHR T5.Z, T11.Y, literal.z,
|
|
; CM-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.w,
|
|
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; CM-NEXT: 8(1.121039e-44), 48(6.726233e-44)
|
|
; CM-NEXT: LSHR T19.X, PV.W, literal.x,
|
|
; CM-NEXT: LSHR T6.Y, T11.X, literal.y,
|
|
; CM-NEXT: LSHR T6.Z, T11.Y, literal.z,
|
|
; CM-NEXT: LSHR * T2.W, T11.Z, literal.w,
|
|
; CM-NEXT: 2(2.802597e-45), 24(3.363116e-44)
|
|
; CM-NEXT: 16(2.242078e-44), 8(1.121039e-44)
|
|
; CM-NEXT: BFE_INT T20.X, T11.W, 0.0, literal.x,
|
|
; CM-NEXT: LSHR T7.Y, T11.Y, literal.y,
|
|
; CM-NEXT: LSHR T7.Z, T11.Z, literal.z,
|
|
; CM-NEXT: LSHR * T3.W, T11.W, literal.y,
|
|
; CM-NEXT: 8(1.121039e-44), 24(3.363116e-44)
|
|
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT T21.X, T11.Z, 0.0, literal.x,
|
|
; CM-NEXT: LSHR T8.Y, T11.Z, literal.y,
|
|
; CM-NEXT: LSHR T8.Z, T11.W, literal.z,
|
|
; CM-NEXT: BFE_INT * T20.W, PV.W, 0.0, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 24(3.363116e-44)
|
|
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT T22.X, T11.Y, 0.0, literal.x,
|
|
; CM-NEXT: LSHR T9.Y, T11.W, literal.x,
|
|
; CM-NEXT: BFE_INT T20.Z, PV.Z, 0.0, literal.x,
|
|
; CM-NEXT: BFE_INT * T21.W, PV.Y, 0.0, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT T11.X, T11.X, 0.0, literal.x,
|
|
; CM-NEXT: BFE_INT T20.Y, PV.Y, 0.0, literal.x,
|
|
; CM-NEXT: BFE_INT T21.Z, T7.Z, 0.0, literal.x,
|
|
; CM-NEXT: BFE_INT * T22.W, T7.Y, 0.0, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT T23.X, T12.W, 0.0, literal.x,
|
|
; CM-NEXT: BFE_INT T21.Y, T2.W, 0.0, literal.x, BS:VEC_120/SCL_212
|
|
; CM-NEXT: BFE_INT T22.Z, T6.Z, 0.0, literal.x,
|
|
; CM-NEXT: BFE_INT * T11.W, T6.Y, 0.0, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT T24.X, T12.Z, 0.0, literal.x,
|
|
; CM-NEXT: BFE_INT T22.Y, T5.Z, 0.0, literal.x, BS:VEC_120/SCL_212
|
|
; CM-NEXT: BFE_INT T11.Z, T5.Y, 0.0, literal.x,
|
|
; CM-NEXT: BFE_INT * T23.W, T4.Z, 0.0, literal.x, BS:VEC_201
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT T25.X, T12.Y, 0.0, literal.x,
|
|
; CM-NEXT: BFE_INT T11.Y, T4.Y, 0.0, literal.x, BS:VEC_120/SCL_212
|
|
; CM-NEXT: BFE_INT T23.Z, T3.Z, 0.0, literal.x,
|
|
; CM-NEXT: BFE_INT * T24.W, T3.Y, 0.0, literal.x, BS:VEC_201
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT T26.X, T12.X, 0.0, literal.x,
|
|
; CM-NEXT: BFE_INT T23.Y, T2.Z, 0.0, literal.x,
|
|
; CM-NEXT: BFE_INT T24.Z, T2.Y, 0.0, literal.x,
|
|
; CM-NEXT: BFE_INT * T25.W, T1.Z, 0.0, literal.x, BS:VEC_120/SCL_212
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR T27.X, T1.Y, literal.x,
|
|
; CM-NEXT: BFE_INT T24.Y, T14.X, 0.0, literal.y,
|
|
; CM-NEXT: BFE_INT T25.Z, T1.W, 0.0, literal.y,
|
|
; CM-NEXT: BFE_INT * T26.W, T0.Z, 0.0, literal.y,
|
|
; CM-NEXT: 2(2.802597e-45), 8(1.121039e-44)
|
|
; CM-NEXT: LSHR T14.X, T0.Y, literal.x,
|
|
; CM-NEXT: BFE_INT T25.Y, T13.X, 0.0, literal.y,
|
|
; CM-NEXT: BFE_INT T26.Z, T0.W, 0.0, literal.y,
|
|
; CM-NEXT: LSHR * T0.W, T12.X, literal.y, BS:VEC_120/SCL_212
|
|
; CM-NEXT: 2(2.802597e-45), 8(1.121039e-44)
|
|
; CM-NEXT: LSHR T12.X, KC0[2].Y, literal.x,
|
|
; CM-NEXT: BFE_INT * T26.Y, PV.W, 0.0, literal.y,
|
|
; CM-NEXT: 2(2.802597e-45), 8(1.121039e-44)
|
|
%load = load <32 x i8>, ptr addrspace(1) %in
|
|
%ext = sext <32 x i8> %load to <32 x i32>
|
|
store <32 x i32> %ext, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @global_zextload_v64i8_to_v64i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
|
; GCN-NOHSA-SI-LABEL: global_zextload_v64i8_to_v64i32:
|
|
; GCN-NOHSA-SI: ; %bb.0:
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s14, -1
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s15, 0xe8f000
|
|
; GCN-NOHSA-SI-NEXT: s_add_u32 s12, s12, s11
|
|
; GCN-NOHSA-SI-NEXT: s_addc_u32 s13, s13, 0
|
|
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:32
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:48
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(3)
|
|
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 24, v13
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v5, v13, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v11, 24, v12
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v9, v12, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 24, v15
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v17, v15, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v23, 24, v14
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v21, v14, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xff, v13
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v6, v13, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dword v4, off, s[12:15], 0 ; 4-byte Folded Spill
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xff, v12
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v10, v12, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xff, v15
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v18, v15, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, 0xff, v14
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v22, v14, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(6)
|
|
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v15, 24, v1
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v13, v1, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v35, 24, v0
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v33, v0, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v39, 24, v3
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v37, v3, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v43, 24, v2
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v41, v2, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xff, v1
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v14, v1, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v32, 0xff, v0
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v34, v0, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v36, 0xff, v3
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v38, v3, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v40, 0xff, v2
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v42, v2, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(5) expcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 24, v25
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v5, v25, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v47, 24, v24
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v45, v24, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v51, 24, v27
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v49, v27, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v55, 24, v26
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v53, v26, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xff, v25
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v6, v25, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v44, 0xff, v24
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v46, v24, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v48, 0xff, v27
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v50, v27, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v52, 0xff, v26
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v54, v26, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(4)
|
|
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v27, 24, v29
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v25, v29, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v59, 24, v28
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v57, v28, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v63, 24, v31
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v61, v31, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 24, v30
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v1, v30, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v24, 0xff, v29
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v26, v29, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v56, 0xff, v28
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v58, v28, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v60, 0xff, v31
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v62, v31, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xff, v30
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v2, v30, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:240
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:192
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:208
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:160
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:176
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:128
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:144
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:96
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:112
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:64
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:32
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
|
|
; GCN-NOHSA-SI-NEXT: s_endpgm
|
|
;
|
|
; GCN-HSA-LABEL: global_zextload_v64i8_to_v64i32:
|
|
; GCN-HSA: ; %bb.0:
|
|
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
|
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
|
|
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
|
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
|
|
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-HSA-NEXT: s_add_u32 s4, s2, 16
|
|
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
|
|
; GCN-HSA-NEXT: s_add_u32 s6, s2, 32
|
|
; GCN-HSA-NEXT: s_addc_u32 s7, s3, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
|
|
; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[4:5]
|
|
; GCN-HSA-NEXT: s_add_u32 s2, s2, 48
|
|
; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
|
|
; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[4:5]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5
|
|
; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
|
|
; GCN-HSA-NEXT: s_add_u32 s2, s0, 32
|
|
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
|
|
; GCN-HSA-NEXT: s_add_u32 s4, s0, 48
|
|
; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0
|
|
; GCN-HSA-NEXT: s_add_u32 s6, s0, 16
|
|
; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0
|
|
; GCN-HSA-NEXT: s_add_u32 s8, s0, 0xe0
|
|
; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0
|
|
; GCN-HSA-NEXT: s_add_u32 s10, s0, 0xf0
|
|
; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0
|
|
; GCN-HSA-NEXT: s_add_u32 s12, s0, 0xc0
|
|
; GCN-HSA-NEXT: s_addc_u32 s13, s1, 0
|
|
; GCN-HSA-NEXT: s_add_u32 s14, s0, 0xd0
|
|
; GCN-HSA-NEXT: s_addc_u32 s15, s1, 0
|
|
; GCN-HSA-NEXT: s_add_u32 s16, s0, 0xa0
|
|
; GCN-HSA-NEXT: s_addc_u32 s17, s1, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v21, s17
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v20, s16
|
|
; GCN-HSA-NEXT: s_waitcnt vmcnt(3)
|
|
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 24, v2
|
|
; GCN-HSA-NEXT: v_bfe_u32 v17, v2, 8, 8
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xff, v2
|
|
; GCN-HSA-NEXT: v_bfe_u32 v18, v2, 16, 8
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3
|
|
; GCN-HSA-NEXT: s_waitcnt vmcnt(3)
|
|
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 24, v14
|
|
; GCN-HSA-NEXT: v_bfe_u32 v17, v14, 8, 8
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xff, v14
|
|
; GCN-HSA-NEXT: v_bfe_u32 v18, v14, 16, 8
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19]
|
|
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0
|
|
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 24, v15
|
|
; GCN-HSA-NEXT: v_bfe_u32 v17, v15, 8, 8
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xff, v15
|
|
; GCN-HSA-NEXT: v_bfe_u32 v18, v15, 16, 8
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[16:19]
|
|
; GCN-HSA-NEXT: v_bfe_u32 v15, v12, 8, 8
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v19, s1
|
|
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 24, v12
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xff, v12
|
|
; GCN-HSA-NEXT: v_bfe_u32 v16, v12, 16, 8
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v18, s0
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[14:17]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v19, s9
|
|
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 24, v13
|
|
; GCN-HSA-NEXT: v_bfe_u32 v15, v13, 8, 8
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xff, v13
|
|
; GCN-HSA-NEXT: v_bfe_u32 v16, v13, 16, 8
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s7
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s6
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v18, s8
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[14:17]
|
|
; GCN-HSA-NEXT: s_waitcnt vmcnt(6)
|
|
; GCN-HSA-NEXT: v_bfe_u32 v13, v10, 8, 8
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s11
|
|
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 24, v10
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xff, v10
|
|
; GCN-HSA-NEXT: v_bfe_u32 v14, v10, 16, 8
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s10
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v19, s13
|
|
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 24, v11
|
|
; GCN-HSA-NEXT: v_bfe_u32 v13, v11, 8, 8
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xff, v11
|
|
; GCN-HSA-NEXT: v_bfe_u32 v14, v11, 16, 8
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v18, s12
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
|
|
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
|
|
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 24, v8
|
|
; GCN-HSA-NEXT: v_bfe_u32 v11, v8, 8, 8
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xff, v8
|
|
; GCN-HSA-NEXT: v_bfe_u32 v12, v8, 16, 8
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v21, s15
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[10:13]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v20, s14
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s2
|
|
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 24, v9
|
|
; GCN-HSA-NEXT: v_bfe_u32 v11, v9, 8, 8
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xff, v9
|
|
; GCN-HSA-NEXT: v_bfe_u32 v12, v9, 16, 8
|
|
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[10:13]
|
|
; GCN-HSA-NEXT: v_bfe_u32 v9, v3, 8, 8
|
|
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 24, v3
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff, v3
|
|
; GCN-HSA-NEXT: v_bfe_u32 v10, v3, 16, 8
|
|
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3
|
|
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[17:18], v[8:11]
|
|
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 24, v1
|
|
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 24, v0
|
|
; GCN-HSA-NEXT: v_bfe_u32 v9, v0, 8, 8
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff, v0
|
|
; GCN-HSA-NEXT: v_bfe_u32 v10, v0, 16, 8
|
|
; GCN-HSA-NEXT: v_bfe_u32 v18, v1, 8, 8
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xff, v1
|
|
; GCN-HSA-NEXT: v_bfe_u32 v19, v1, 16, 8
|
|
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60
|
|
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
|
|
; GCN-HSA-NEXT: s_waitcnt vmcnt(10)
|
|
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 24, v5
|
|
; GCN-HSA-NEXT: v_bfe_u32 v14, v5, 8, 8
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[8:11]
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xff, v5
|
|
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 24, v4
|
|
; GCN-HSA-NEXT: v_bfe_u32 v9, v4, 8, 8
|
|
; GCN-HSA-NEXT: v_bfe_u32 v15, v5, 16, 8
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff, v4
|
|
; GCN-HSA-NEXT: v_bfe_u32 v10, v4, 16, 8
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
|
|
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[17:20]
|
|
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
|
|
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v20, 24, v6
|
|
; GCN-HSA-NEXT: v_bfe_u32 v18, v6, 8, 8
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xff, v6
|
|
; GCN-HSA-NEXT: v_bfe_u32 v19, v6, 16, 8
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[17:20]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
|
|
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 24, v7
|
|
; GCN-HSA-NEXT: v_bfe_u32 v1, v7, 8, 8
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xff, v7
|
|
; GCN-HSA-NEXT: v_bfe_u32 v2, v7, 16, 8
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
|
|
; GCN-HSA-NEXT: s_add_u32 s2, s0, 64
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
|
|
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN-HSA-NEXT: s_add_u32 s0, s0, 0x50
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
|
|
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[13:16]
|
|
; GCN-HSA-NEXT: s_endpgm
|
|
;
|
|
; GCN-NOHSA-VI-LABEL: global_zextload_v64i8_to_v64i32:
|
|
; GCN-NOHSA-VI: ; %bb.0:
|
|
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s90, -1
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s91, 0xe80000
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
|
|
; GCN-NOHSA-VI-NEXT: s_add_u32 s88, s88, s11
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
|
|
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0
|
|
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:16
|
|
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[32:35], off, s[8:11], 0 offset:32
|
|
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[36:39], off, s[8:11], 0 offset:48
|
|
; GCN-NOHSA-VI-NEXT: s_addc_u32 s89, s89, 0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3)
|
|
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 24, v17
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v1, v17, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xff, v17
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v2, v17, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(4)
|
|
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 24, v38
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v1, v38, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xff, v38
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v2, v38, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 24, v16
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v5, v16, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 24, v19
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v9, v19, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 24, v18
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v13, v18, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xff, v16
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v6, v16, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, 0xff, v19
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v10, v19, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xff, v18
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v14, v18, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 24, v29
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v17, v29, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v23, 24, v28
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v21, v28, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v27, 24, v31
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v25, v31, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v43, 24, v30
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v41, v30, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, 0xff, v29
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v18, v29, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xff, v28
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v22, v28, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v24, 0xff, v31
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v26, v31, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v40, 0xff, v30
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v42, v30, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v31, 24, v33
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v29, v33, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v47, 24, v32
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v45, v32, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v51, 24, v35
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v49, v35, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v55, 24, v34
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v53, v34, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v28, 0xff, v33
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v30, v33, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v44, 0xff, v32
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v46, v32, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v48, 0xff, v35
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v50, v35, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v52, 0xff, v34
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v54, v34, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v35, 24, v37
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v33, v37, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v59, 24, v36
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v57, v36, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v63, 24, v39
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v61, v39, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v32, 0xff, v37
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v34, v37, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v56, 0xff, v36
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v58, v36, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v60, 0xff, v39
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v62, v39, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:240
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:192
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:208
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:160
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:176
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:128
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:144
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:96
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:112
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
|
|
; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 ; 4-byte Folded Reload
|
|
; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload
|
|
; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:8 ; 4-byte Folded Reload
|
|
; GCN-NOHSA-VI-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:12 ; 4-byte Folded Reload
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
|
|
; GCN-NOHSA-VI-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: global_zextload_v64i8_to_v64i32:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: ALU 0, @30, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: TEX 1 @22
|
|
; EG-NEXT: ALU 59, @31, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: TEX 1 @26
|
|
; EG-NEXT: ALU 88, @91, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T48.XYZW, T50.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T47.XYZW, T49.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T45.XYZW, T32.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T43.XYZW, T46.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T41.XYZW, T44.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T40.XYZW, T42.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T33.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T39.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T34.XYZW, T37.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T35.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T30.XYZW, T22.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T28.XYZW, T31.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T26.XYZW, T29.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T27.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T23.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T24.X, 1
|
|
; EG-NEXT: CF_END
|
|
; EG-NEXT: Fetch clause starting at 22:
|
|
; EG-NEXT: VTX_READ_128 T22.XYZW, T21.X, 16, #1
|
|
; EG-NEXT: VTX_READ_128 T23.XYZW, T21.X, 0, #1
|
|
; EG-NEXT: Fetch clause starting at 26:
|
|
; EG-NEXT: VTX_READ_128 T32.XYZW, T21.X, 48, #1
|
|
; EG-NEXT: VTX_READ_128 T33.XYZW, T21.X, 32, #1
|
|
; EG-NEXT: ALU clause starting at 30:
|
|
; EG-NEXT: MOV * T21.X, KC0[2].Z,
|
|
; EG-NEXT: ALU clause starting at 31:
|
|
; EG-NEXT: MOV * T0.W, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_UINT * T19.Z, T23.Y, literal.x, PV.W,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_UINT T19.Y, T23.Y, literal.x, T0.W,
|
|
; EG-NEXT: BFE_UINT T20.Z, T23.X, literal.y, T0.W,
|
|
; EG-NEXT: LSHR * T19.W, T23.Y, literal.z,
|
|
; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44)
|
|
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T19.X, T23.Y, literal.x,
|
|
; EG-NEXT: BFE_UINT T20.Y, T23.X, literal.y, T0.W,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
|
|
; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44)
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LSHR T24.X, PV.W, literal.x,
|
|
; EG-NEXT: BFE_UINT T25.Z, T23.W, literal.y, T0.W,
|
|
; EG-NEXT: LSHR T20.W, T23.X, literal.z,
|
|
; EG-NEXT: AND_INT * T20.X, T23.X, literal.w,
|
|
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43)
|
|
; EG-NEXT: BFE_UINT * T25.Y, T23.W, literal.x, T0.W,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LSHR T23.X, KC0[2].Y, literal.x,
|
|
; EG-NEXT: BFE_UINT T26.Z, T23.Z, literal.y, T0.W, BS:VEC_021/SCL_122
|
|
; EG-NEXT: LSHR T25.W, T23.W, literal.z,
|
|
; EG-NEXT: AND_INT * T25.X, T23.W, literal.w,
|
|
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43)
|
|
; EG-NEXT: BFE_UINT T26.Y, T23.Z, literal.x, T0.W,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), 48(6.726233e-44)
|
|
; EG-NEXT: LSHR T27.X, PV.W, literal.x,
|
|
; EG-NEXT: BFE_UINT T28.Z, T22.Y, literal.y, T0.W,
|
|
; EG-NEXT: LSHR T26.W, T23.Z, literal.z,
|
|
; EG-NEXT: AND_INT * T26.X, T23.Z, literal.w,
|
|
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43)
|
|
; EG-NEXT: BFE_UINT T28.Y, T22.Y, literal.x, T0.W,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), 32(4.484155e-44)
|
|
; EG-NEXT: LSHR T29.X, PV.W, literal.x,
|
|
; EG-NEXT: BFE_UINT T30.Z, T22.X, literal.y, T0.W,
|
|
; EG-NEXT: LSHR T28.W, T22.Y, literal.z,
|
|
; EG-NEXT: AND_INT * T28.X, T22.Y, literal.w,
|
|
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43)
|
|
; EG-NEXT: BFE_UINT T30.Y, T22.X, literal.x, T0.W,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), 80(1.121039e-43)
|
|
; EG-NEXT: LSHR T31.X, PV.W, literal.x,
|
|
; EG-NEXT: LSHR T30.W, T22.X, literal.y,
|
|
; EG-NEXT: AND_INT * T30.X, T22.X, literal.z,
|
|
; EG-NEXT: 2(2.802597e-45), 24(3.363116e-44)
|
|
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_UINT T21.Z, T22.W, literal.x, T0.W,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 64(8.968310e-44)
|
|
; EG-NEXT: LSHR T22.X, PV.W, literal.x,
|
|
; EG-NEXT: BFE_UINT * T21.Y, T22.W, literal.y, T0.W,
|
|
; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
|
|
; EG-NEXT: ALU clause starting at 91:
|
|
; EG-NEXT: BFE_UINT T34.Z, T22.Z, literal.x, T0.W,
|
|
; EG-NEXT: LSHR * T21.W, T22.W, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
|
|
; EG-NEXT: AND_INT T21.X, T22.W, literal.x,
|
|
; EG-NEXT: BFE_UINT T34.Y, T22.Z, literal.y, T0.W,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
|
|
; EG-NEXT: 255(3.573311e-43), 8(1.121039e-44)
|
|
; EG-NEXT: 112(1.569454e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LSHR T35.X, PV.W, literal.x,
|
|
; EG-NEXT: BFE_UINT T36.Z, T33.Y, literal.y, T0.W,
|
|
; EG-NEXT: LSHR T34.W, T22.Z, literal.z,
|
|
; EG-NEXT: AND_INT * T34.X, T22.Z, literal.w,
|
|
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43)
|
|
; EG-NEXT: BFE_UINT T36.Y, T33.Y, literal.x, T0.W,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), 96(1.345247e-43)
|
|
; EG-NEXT: LSHR T37.X, PV.W, literal.x,
|
|
; EG-NEXT: BFE_UINT T38.Z, T33.X, literal.y, T0.W,
|
|
; EG-NEXT: LSHR T36.W, T33.Y, literal.z,
|
|
; EG-NEXT: AND_INT * T36.X, T33.Y, literal.w,
|
|
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43)
|
|
; EG-NEXT: BFE_UINT T38.Y, T33.X, literal.x, T0.W,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), 144(2.017870e-43)
|
|
; EG-NEXT: LSHR T39.X, PV.W, literal.x,
|
|
; EG-NEXT: BFE_UINT T40.Z, T33.W, literal.y, T0.W,
|
|
; EG-NEXT: LSHR T38.W, T33.X, literal.z,
|
|
; EG-NEXT: AND_INT * T38.X, T33.X, literal.w,
|
|
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43)
|
|
; EG-NEXT: BFE_UINT T40.Y, T33.W, literal.x, T0.W,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), 128(1.793662e-43)
|
|
; EG-NEXT: LSHR T33.X, PV.W, literal.x,
|
|
; EG-NEXT: BFE_UINT T41.Z, T33.Z, literal.y, T0.W, BS:VEC_021/SCL_122
|
|
; EG-NEXT: LSHR T40.W, T33.W, literal.z,
|
|
; EG-NEXT: AND_INT * T40.X, T33.W, literal.w,
|
|
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43)
|
|
; EG-NEXT: BFE_UINT T41.Y, T33.Z, literal.x, T0.W,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), 176(2.466285e-43)
|
|
; EG-NEXT: LSHR T42.X, PV.W, literal.x,
|
|
; EG-NEXT: BFE_UINT T43.Z, T32.Y, literal.y, T0.W,
|
|
; EG-NEXT: LSHR T41.W, T33.Z, literal.z,
|
|
; EG-NEXT: AND_INT * T41.X, T33.Z, literal.w,
|
|
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43)
|
|
; EG-NEXT: BFE_UINT T43.Y, T32.Y, literal.x, T0.W,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), 160(2.242078e-43)
|
|
; EG-NEXT: LSHR T44.X, PV.W, literal.x,
|
|
; EG-NEXT: BFE_UINT T45.Z, T32.X, literal.y, T0.W,
|
|
; EG-NEXT: LSHR T43.W, T32.Y, literal.z,
|
|
; EG-NEXT: AND_INT * T43.X, T32.Y, literal.w,
|
|
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43)
|
|
; EG-NEXT: BFE_UINT T45.Y, T32.X, literal.x, T0.W,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), 208(2.914701e-43)
|
|
; EG-NEXT: LSHR T46.X, PV.W, literal.x,
|
|
; EG-NEXT: BFE_UINT T47.Z, T32.W, literal.y, T0.W,
|
|
; EG-NEXT: LSHR T45.W, T32.X, literal.z,
|
|
; EG-NEXT: AND_INT * T45.X, T32.X, literal.w,
|
|
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43)
|
|
; EG-NEXT: BFE_UINT T47.Y, T32.W, literal.x, T0.W,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), 192(2.690493e-43)
|
|
; EG-NEXT: LSHR T32.X, PV.W, literal.x,
|
|
; EG-NEXT: BFE_UINT T48.Z, T32.Z, literal.y, T0.W, BS:VEC_021/SCL_122
|
|
; EG-NEXT: LSHR T47.W, T32.W, literal.z,
|
|
; EG-NEXT: AND_INT * T47.X, T32.W, literal.w,
|
|
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; EG-NEXT: 24(3.363116e-44), 255(3.573311e-43)
|
|
; EG-NEXT: BFE_UINT T48.Y, T32.Z, literal.x, T0.W,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), 240(3.363116e-43)
|
|
; EG-NEXT: LSHR T49.X, PV.W, literal.x,
|
|
; EG-NEXT: LSHR T48.W, T32.Z, literal.y,
|
|
; EG-NEXT: AND_INT * T48.X, T32.Z, literal.z,
|
|
; EG-NEXT: 2(2.802597e-45), 24(3.363116e-44)
|
|
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 224(3.138909e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LSHR * T50.X, PV.W, literal.x,
|
|
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
;
|
|
; CM-LABEL: global_zextload_v64i8_to_v64i32:
|
|
; CM: ; %bb.0:
|
|
; CM-NEXT: ALU 0, @30, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: TEX 1 @22
|
|
; CM-NEXT: ALU 63, @31, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: TEX 1 @26
|
|
; CM-NEXT: ALU 95, @95, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T49, T50.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T47, T32.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T45, T48.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T43, T46.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T42, T44.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T40, T33.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T38, T41.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T36, T39.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T35, T37.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T19, T20.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T30, T34.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T28, T31.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T27, T29.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T25, T21.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T23, T26.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T22, T24.X
|
|
; CM-NEXT: CF_END
|
|
; CM-NEXT: Fetch clause starting at 22:
|
|
; CM-NEXT: VTX_READ_128 T20.XYZW, T19.X, 32, #1
|
|
; CM-NEXT: VTX_READ_128 T21.XYZW, T19.X, 48, #1
|
|
; CM-NEXT: Fetch clause starting at 26:
|
|
; CM-NEXT: VTX_READ_128 T32.XYZW, T19.X, 0, #1
|
|
; CM-NEXT: VTX_READ_128 T33.XYZW, T19.X, 16, #1
|
|
; CM-NEXT: ALU clause starting at 30:
|
|
; CM-NEXT: MOV * T19.X, KC0[2].Z,
|
|
; CM-NEXT: ALU clause starting at 31:
|
|
; CM-NEXT: MOV * T0.W, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_UINT * T22.Z, T21.Z, literal.x, PV.W,
|
|
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_UINT T22.Y, T21.Z, literal.x, T0.W,
|
|
; CM-NEXT: BFE_UINT T23.Z, T21.W, literal.y, T0.W,
|
|
; CM-NEXT: LSHR * T22.W, T21.Z, literal.z,
|
|
; CM-NEXT: 8(1.121039e-44), 16(2.242078e-44)
|
|
; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T22.X, T21.Z, literal.x,
|
|
; CM-NEXT: BFE_UINT T23.Y, T21.W, literal.y, T0.W,
|
|
; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
|
|
; CM-NEXT: 255(3.573311e-43), 8(1.121039e-44)
|
|
; CM-NEXT: 224(3.138909e-43), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR T24.X, PV.W, literal.x,
|
|
; CM-NEXT: BFE_UINT T25.Z, T21.X, literal.y, T0.W,
|
|
; CM-NEXT: LSHR * T23.W, T21.W, literal.z,
|
|
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T23.X, T21.W, literal.x,
|
|
; CM-NEXT: BFE_UINT T25.Y, T21.X, literal.y, T0.W,
|
|
; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
|
|
; CM-NEXT: 255(3.573311e-43), 8(1.121039e-44)
|
|
; CM-NEXT: 240(3.363116e-43), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR T26.X, PV.W, literal.x,
|
|
; CM-NEXT: BFE_UINT T27.Z, T21.Y, literal.y, T0.W,
|
|
; CM-NEXT: LSHR * T25.W, T21.X, literal.z,
|
|
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T25.X, T21.X, literal.x,
|
|
; CM-NEXT: BFE_UINT T27.Y, T21.Y, literal.y, T0.W,
|
|
; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
|
|
; CM-NEXT: 255(3.573311e-43), 8(1.121039e-44)
|
|
; CM-NEXT: 192(2.690493e-43), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR T21.X, PV.W, literal.x,
|
|
; CM-NEXT: BFE_UINT T28.Z, T20.Z, literal.y, T0.W,
|
|
; CM-NEXT: LSHR * T27.W, T21.Y, literal.z,
|
|
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T27.X, T21.Y, literal.x,
|
|
; CM-NEXT: BFE_UINT T28.Y, T20.Z, literal.y, T0.W,
|
|
; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
|
|
; CM-NEXT: 255(3.573311e-43), 8(1.121039e-44)
|
|
; CM-NEXT: 208(2.914701e-43), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR T29.X, PV.W, literal.x,
|
|
; CM-NEXT: BFE_UINT T30.Z, T20.W, literal.y, T0.W,
|
|
; CM-NEXT: LSHR * T28.W, T20.Z, literal.z,
|
|
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T28.X, T20.Z, literal.x,
|
|
; CM-NEXT: BFE_UINT T30.Y, T20.W, literal.y, T0.W,
|
|
; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
|
|
; CM-NEXT: 255(3.573311e-43), 8(1.121039e-44)
|
|
; CM-NEXT: 160(2.242078e-43), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR T31.X, PV.W, literal.x,
|
|
; CM-NEXT: BFE_UINT T19.Z, T20.X, literal.y, T0.W,
|
|
; CM-NEXT: LSHR * T30.W, T20.W, literal.z,
|
|
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T30.X, T20.W, literal.x,
|
|
; CM-NEXT: BFE_UINT T19.Y, T20.X, literal.y, T0.W,
|
|
; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
|
|
; CM-NEXT: 255(3.573311e-43), 8(1.121039e-44)
|
|
; CM-NEXT: 176(2.466285e-43), 0(0.000000e+00)
|
|
; CM-NEXT: ALU clause starting at 95:
|
|
; CM-NEXT: LSHR T34.X, T1.W, literal.x,
|
|
; CM-NEXT: BFE_UINT T35.Z, T20.Y, literal.y, T0.W,
|
|
; CM-NEXT: LSHR * T19.W, T20.X, literal.z,
|
|
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T19.X, T20.X, literal.x,
|
|
; CM-NEXT: BFE_UINT T35.Y, T20.Y, literal.y, T0.W,
|
|
; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
|
|
; CM-NEXT: 255(3.573311e-43), 8(1.121039e-44)
|
|
; CM-NEXT: 128(1.793662e-43), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR T20.X, PV.W, literal.x,
|
|
; CM-NEXT: BFE_UINT T36.Z, T33.Z, literal.y, T0.W,
|
|
; CM-NEXT: LSHR * T35.W, T20.Y, literal.z,
|
|
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T35.X, T20.Y, literal.x,
|
|
; CM-NEXT: BFE_UINT T36.Y, T33.Z, literal.y, T0.W,
|
|
; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
|
|
; CM-NEXT: 255(3.573311e-43), 8(1.121039e-44)
|
|
; CM-NEXT: 144(2.017870e-43), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR T37.X, PV.W, literal.x,
|
|
; CM-NEXT: BFE_UINT T38.Z, T33.W, literal.y, T0.W,
|
|
; CM-NEXT: LSHR * T36.W, T33.Z, literal.z,
|
|
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T36.X, T33.Z, literal.x,
|
|
; CM-NEXT: BFE_UINT T38.Y, T33.W, literal.y, T0.W,
|
|
; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
|
|
; CM-NEXT: 255(3.573311e-43), 8(1.121039e-44)
|
|
; CM-NEXT: 96(1.345247e-43), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR T39.X, PV.W, literal.x,
|
|
; CM-NEXT: BFE_UINT T40.Z, T33.X, literal.y, T0.W,
|
|
; CM-NEXT: LSHR * T38.W, T33.W, literal.z,
|
|
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T38.X, T33.W, literal.x,
|
|
; CM-NEXT: BFE_UINT T40.Y, T33.X, literal.y, T0.W,
|
|
; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
|
|
; CM-NEXT: 255(3.573311e-43), 8(1.121039e-44)
|
|
; CM-NEXT: 112(1.569454e-43), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR T41.X, PV.W, literal.x,
|
|
; CM-NEXT: BFE_UINT T42.Z, T33.Y, literal.y, T0.W,
|
|
; CM-NEXT: LSHR * T40.W, T33.X, literal.z,
|
|
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T40.X, T33.X, literal.x,
|
|
; CM-NEXT: BFE_UINT T42.Y, T33.Y, literal.y, T0.W,
|
|
; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
|
|
; CM-NEXT: 255(3.573311e-43), 8(1.121039e-44)
|
|
; CM-NEXT: 64(8.968310e-44), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR T33.X, PV.W, literal.x,
|
|
; CM-NEXT: BFE_UINT T43.Z, T32.Z, literal.y, T0.W,
|
|
; CM-NEXT: LSHR * T42.W, T33.Y, literal.z,
|
|
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T42.X, T33.Y, literal.x,
|
|
; CM-NEXT: BFE_UINT T43.Y, T32.Z, literal.y, T0.W,
|
|
; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
|
|
; CM-NEXT: 255(3.573311e-43), 8(1.121039e-44)
|
|
; CM-NEXT: 80(1.121039e-43), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR T44.X, PV.W, literal.x,
|
|
; CM-NEXT: BFE_UINT T45.Z, T32.W, literal.y, T0.W,
|
|
; CM-NEXT: LSHR * T43.W, T32.Z, literal.z,
|
|
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T43.X, T32.Z, literal.x,
|
|
; CM-NEXT: BFE_UINT T45.Y, T32.W, literal.y, T0.W,
|
|
; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
|
|
; CM-NEXT: 255(3.573311e-43), 8(1.121039e-44)
|
|
; CM-NEXT: 32(4.484155e-44), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR T46.X, PV.W, literal.x,
|
|
; CM-NEXT: BFE_UINT T47.Z, T32.X, literal.y, T0.W,
|
|
; CM-NEXT: LSHR * T45.W, T32.W, literal.z,
|
|
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T45.X, T32.W, literal.x,
|
|
; CM-NEXT: BFE_UINT T47.Y, T32.X, literal.y, T0.W,
|
|
; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
|
|
; CM-NEXT: 255(3.573311e-43), 8(1.121039e-44)
|
|
; CM-NEXT: 48(6.726233e-44), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR T48.X, PV.W, literal.x,
|
|
; CM-NEXT: BFE_UINT T49.Z, T32.Y, literal.y, T0.W,
|
|
; CM-NEXT: LSHR * T47.W, T32.X, literal.z,
|
|
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T47.X, T32.X, literal.x,
|
|
; CM-NEXT: BFE_UINT * T49.Y, T32.Y, literal.y, T0.W,
|
|
; CM-NEXT: 255(3.573311e-43), 8(1.121039e-44)
|
|
; CM-NEXT: LSHR T32.X, KC0[2].Y, literal.x,
|
|
; CM-NEXT: LSHR * T49.W, T32.Y, literal.y,
|
|
; CM-NEXT: 2(2.802597e-45), 24(3.363116e-44)
|
|
; CM-NEXT: AND_INT T49.X, T32.Y, literal.x,
|
|
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; CM-NEXT: 255(3.573311e-43), 16(2.242078e-44)
|
|
; CM-NEXT: LSHR * T50.X, PV.W, literal.x,
|
|
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
%load = load <64 x i8>, ptr addrspace(1) %in
|
|
%ext = zext <64 x i8> %load to <64 x i32>
|
|
store <64 x i32> %ext, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @global_sextload_v64i8_to_v64i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
|
; GCN-NOHSA-SI-LABEL: global_sextload_v64i8_to_v64i32:
|
|
; GCN-NOHSA-SI: ; %bb.0:
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s14, -1
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s15, 0xe8f000
|
|
; GCN-NOHSA-SI-NEXT: s_add_u32 s12, s12, s11
|
|
; GCN-NOHSA-SI-NEXT: s_addc_u32 s13, s13, 0
|
|
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s6
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s7
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, s2
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, s3
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[10:13], off, s[4:7], 0
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:16
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:32
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:48
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(3)
|
|
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 24, v11
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v11, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v1, v11, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v11, 0, 8
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dword v0, off, s[12:15], 0 ; 4-byte Folded Spill
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
|
|
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 24, v10
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v10, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v5, v10, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v10, 0, 8
|
|
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v11, 24, v13
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v10, v13, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v9, v13, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v8, v13, 0, 8
|
|
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v15, 24, v12
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v14, v12, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v13, v12, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v12, v12, 0, 8
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(6)
|
|
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v31, 24, v17
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v30, v17, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v29, v17, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v28, v17, 0, 8
|
|
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v35, 24, v16
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v34, v16, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v33, v16, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v32, v16, 0, 8
|
|
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v39, 24, v19
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v38, v19, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v37, v19, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v36, v19, 0, 8
|
|
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v43, 24, v18
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v42, v18, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v41, v18, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v40, v18, 0, 8
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(5)
|
|
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v19, 24, v21
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v18, v21, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v21, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v21, 0, 8
|
|
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v47, 24, v20
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v46, v20, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v45, v20, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v44, v20, 0, 8
|
|
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v51, 24, v23
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v50, v23, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v49, v23, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v48, v23, 0, 8
|
|
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v55, 24, v22
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v54, v22, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v53, v22, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v52, v22, 0, 8
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(4)
|
|
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 24, v25
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v25, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v21, v25, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v20, v25, 0, 8
|
|
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v59, 24, v24
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v58, v24, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v57, v24, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v56, v24, 0, 8
|
|
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v63, 24, v27
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v62, v27, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v61, v27, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v60, v27, 0, 8
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 24, v26
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v26, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v1, v26, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v26, 0, 8
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:240
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:192
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:208
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:160
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:176
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:128
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:144
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:96
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:112
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:64
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:80
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 ; 4-byte Folded Reload
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
|
|
; GCN-NOHSA-SI-NEXT: s_endpgm
|
|
;
|
|
; GCN-HSA-LABEL: global_sextload_v64i8_to_v64i32:
|
|
; GCN-HSA: ; %bb.0:
|
|
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
|
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
|
|
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
|
|
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
|
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1]
|
|
; GCN-HSA-NEXT: s_add_u32 s4, s2, 48
|
|
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5
|
|
; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[0:1]
|
|
; GCN-HSA-NEXT: s_add_u32 s4, s2, 32
|
|
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5
|
|
; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
|
|
; GCN-HSA-NEXT: s_add_u32 s2, s2, 16
|
|
; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
|
|
; GCN-HSA-NEXT: s_add_u32 s2, s0, 32
|
|
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2
|
|
; GCN-HSA-NEXT: s_add_u32 s2, s0, 48
|
|
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
|
|
; GCN-HSA-NEXT: s_waitcnt vmcnt(3)
|
|
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 24, v14
|
|
; GCN-HSA-NEXT: v_bfe_i32 v18, v14, 16, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v17, v14, 8, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v16, v14, 0, 8
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v20, s3
|
|
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 24, v15
|
|
; GCN-HSA-NEXT: v_bfe_i32 v17, v15, 16, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v16, v15, 8, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v15, v15, 0, 8
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v19, s2
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[15:18]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v19, s1
|
|
; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
|
|
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 24, v12
|
|
; GCN-HSA-NEXT: v_bfe_i32 v16, v12, 16, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v15, v12, 8, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v14, v12, 0, 8
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v18, s0
|
|
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[14:17]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s2
|
|
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0
|
|
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v20, s3
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v19, s2
|
|
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0
|
|
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v16, 24, v13
|
|
; GCN-HSA-NEXT: v_bfe_i32 v15, v13, 16, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v14, v13, 8, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v13, v13, 0, 8
|
|
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[17:18], v[13:16]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2
|
|
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0
|
|
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
|
|
; GCN-HSA-NEXT: s_waitcnt vmcnt(6)
|
|
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 24, v10
|
|
; GCN-HSA-NEXT: v_bfe_i32 v14, v10, 16, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v13, v10, 8, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v12, v10, 0, 8
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[12:15]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2
|
|
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xd0
|
|
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
|
|
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 24, v11
|
|
; GCN-HSA-NEXT: v_bfe_i32 v13, v11, 16, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v12, v11, 8, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v11, v11, 0, 8
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[11:14]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2
|
|
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0
|
|
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
|
|
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 24, v8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v12, v8, 16, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v11, v8, 8, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v10, v8, 0, 8
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[10:13]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2
|
|
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 24, v9
|
|
; GCN-HSA-NEXT: v_bfe_i32 v11, v9, 16, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v10, v9, 8, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v9, v9, 0, 8
|
|
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[9:12]
|
|
; GCN-HSA-NEXT: s_waitcnt vmcnt(9)
|
|
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 24, v6
|
|
; GCN-HSA-NEXT: v_bfe_i32 v13, v6, 16, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v12, v6, 8, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v11, v6, 0, 8
|
|
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[11:14]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s3
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s2
|
|
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80
|
|
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 24, v7
|
|
; GCN-HSA-NEXT: v_bfe_i32 v9, v7, 16, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v8, v7, 8, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v7, v7, 0, 8
|
|
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[7:10]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s3
|
|
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 24, v4
|
|
; GCN-HSA-NEXT: v_bfe_i32 v8, v4, 16, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v7, v4, 8, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v6, v4, 0, 8
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s2
|
|
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90
|
|
; GCN-HSA-NEXT: s_waitcnt vmcnt(10)
|
|
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 24, v1
|
|
; GCN-HSA-NEXT: v_bfe_i32 v13, v1, 16, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v12, v1, 8, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v11, v1, 0, 8
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[6:9]
|
|
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
|
|
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 24, v0
|
|
; GCN-HSA-NEXT: v_bfe_i32 v8, v0, 16, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v7, v0, 8, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v6, v0, 0, 8
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 24, v5
|
|
; GCN-HSA-NEXT: v_bfe_i32 v17, v5, 16, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v16, v5, 8, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v15, v5, 0, 8
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[15:18]
|
|
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v18, 24, v3
|
|
; GCN-HSA-NEXT: v_bfe_i32 v17, v3, 16, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v16, v3, 8, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v15, v3, 0, 8
|
|
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 24, v2
|
|
; GCN-HSA-NEXT: v_bfe_i32 v4, v2, 16, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v3, v2, 8, 8
|
|
; GCN-HSA-NEXT: v_bfe_i32 v2, v2, 0, 8
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
|
|
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN-HSA-NEXT: s_add_u32 s2, s0, 64
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[15:18]
|
|
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN-HSA-NEXT: s_add_u32 s0, s0, 0x50
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[6:9]
|
|
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[11:14]
|
|
; GCN-HSA-NEXT: s_endpgm
|
|
;
|
|
; GCN-NOHSA-VI-LABEL: global_sextload_v64i8_to_v64i32:
|
|
; GCN-NOHSA-VI: ; %bb.0:
|
|
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s90, -1
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s91, 0xe80000
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
|
|
; GCN-NOHSA-VI-NEXT: s_add_u32 s88, s88, s11
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
|
|
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[10:13], off, s[8:11], 0
|
|
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[26:29], off, s[8:11], 0 offset:16
|
|
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[30:33], off, s[8:11], 0 offset:32
|
|
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[34:37], off, s[8:11], 0 offset:48
|
|
; GCN-NOHSA-VI-NEXT: s_addc_u32 s89, s89, 0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3)
|
|
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 24, v11
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v11, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v1, v11, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v11, 0, 8
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[88:91], 0 ; 4-byte Folded Spill
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dword v2, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dword v3, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(4)
|
|
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 24, v36
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v36, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v1, v36, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v36, 0, 8
|
|
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 24, v10
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v10, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v5, v10, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v10, 0, 8
|
|
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 24, v13
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v13, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v9, v13, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v13, 0, 8
|
|
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 24, v12
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v12, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v12, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v12, 0, 8
|
|
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 24, v27
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v27, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v17, v27, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v27, 0, 8
|
|
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v23, 24, v26
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v22, v26, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v21, v26, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v26, 0, 8
|
|
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v27, 24, v29
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v26, v29, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v25, v29, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v24, v29, 0, 8
|
|
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v41, 24, v28
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v40, v28, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v39, v28, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v38, v28, 0, 8
|
|
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v45, 24, v31
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v44, v31, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v43, v31, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v42, v31, 0, 8
|
|
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v49, 24, v30
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v48, v30, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v47, v30, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v46, v30, 0, 8
|
|
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v31, 24, v33
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v30, v33, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v29, v33, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v28, v33, 0, 8
|
|
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v53, 24, v32
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v52, v32, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v51, v32, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v50, v32, 0, 8
|
|
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v57, 24, v35
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v56, v35, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v55, v35, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v54, v35, 0, 8
|
|
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v61, 24, v34
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v60, v34, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v59, v34, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v58, v34, 0, 8
|
|
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v35, 24, v37
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v34, v37, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v33, v37, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v32, v37, 0, 8
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:240
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[58:61], off, s[0:3], 0 offset:192
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[54:57], off, s[0:3], 0 offset:208
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[50:53], off, s[0:3], 0 offset:160
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:176
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[46:49], off, s[0:3], 0 offset:128
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[42:45], off, s[0:3], 0 offset:144
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[38:41], off, s[0:3], 0 offset:96
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:112
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
|
|
; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 ; 4-byte Folded Reload
|
|
; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload
|
|
; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:8 ; 4-byte Folded Reload
|
|
; GCN-NOHSA-VI-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:12 ; 4-byte Folded Reload
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
|
|
; GCN-NOHSA-VI-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: global_sextload_v64i8_to_v64i32:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: ALU 0, @32, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: TEX 1 @24
|
|
; EG-NEXT: ALU 41, @33, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: TEX 1 @28
|
|
; EG-NEXT: ALU 76, @75, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: ALU 72, @152, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T48.XYZW, T50.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T47.XYZW, T49.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T46.XYZW, T19.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T35.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T45.XYZW, T34.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T44.XYZW, T33.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T43.XYZW, T32.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T30.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T42.XYZW, T29.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T41.XYZW, T28.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T40.XYZW, T27.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T31.XYZW, T26.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T39.XYZW, T25.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T24.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T23.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T22.X, 1
|
|
; EG-NEXT: CF_END
|
|
; EG-NEXT: PAD
|
|
; EG-NEXT: Fetch clause starting at 24:
|
|
; EG-NEXT: VTX_READ_128 T20.XYZW, T21.X, 32, #1
|
|
; EG-NEXT: VTX_READ_128 T19.XYZW, T21.X, 48, #1
|
|
; EG-NEXT: Fetch clause starting at 28:
|
|
; EG-NEXT: VTX_READ_128 T31.XYZW, T21.X, 0, #1
|
|
; EG-NEXT: VTX_READ_128 T21.XYZW, T21.X, 16, #1
|
|
; EG-NEXT: ALU clause starting at 32:
|
|
; EG-NEXT: MOV * T21.X, KC0[2].Z,
|
|
; EG-NEXT: ALU clause starting at 33:
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LSHR T22.X, PV.W, literal.x,
|
|
; EG-NEXT: LSHR * T23.X, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LSHR T24.X, PV.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
|
|
; EG-NEXT: LSHR T25.X, PV.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43)
|
|
; EG-NEXT: LSHR T26.X, PV.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 2(2.802597e-45), 64(8.968310e-44)
|
|
; EG-NEXT: LSHR T27.X, PV.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 2(2.802597e-45), 112(1.569454e-43)
|
|
; EG-NEXT: LSHR T28.X, PV.W, literal.x,
|
|
; EG-NEXT: LSHR T0.Y, T19.Z, literal.y,
|
|
; EG-NEXT: LSHR T0.Z, T19.W, literal.z,
|
|
; EG-NEXT: LSHR * T0.W, T19.Z, literal.w,
|
|
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 96(1.345247e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LSHR T29.X, PV.W, literal.x,
|
|
; EG-NEXT: LSHR T1.Y, T19.W, literal.y,
|
|
; EG-NEXT: LSHR T1.Z, T19.X, literal.z,
|
|
; EG-NEXT: LSHR * T1.W, T19.W, literal.w,
|
|
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
|
|
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 144(2.017870e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LSHR T30.X, PV.W, literal.x,
|
|
; EG-NEXT: LSHR T2.Y, T19.X, literal.y,
|
|
; EG-NEXT: LSHR T2.Z, T19.X, literal.z,
|
|
; EG-NEXT: LSHR T2.W, T19.Y, literal.y,
|
|
; EG-NEXT: LSHR * T3.W, T19.Y, literal.z,
|
|
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; EG-NEXT: ALU clause starting at 75:
|
|
; EG-NEXT: LSHR T3.Y, T20.Z, literal.x,
|
|
; EG-NEXT: LSHR T3.Z, T20.Z, literal.y,
|
|
; EG-NEXT: LSHR T4.W, T20.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T5.W, KC0[2].Y, literal.z,
|
|
; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
|
|
; EG-NEXT: 128(1.793662e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LSHR T32.X, PS, literal.x,
|
|
; EG-NEXT: LSHR T4.Y, T20.W, literal.y,
|
|
; EG-NEXT: LSHR T4.Z, T20.X, literal.z,
|
|
; EG-NEXT: LSHR T5.W, T20.X, literal.y,
|
|
; EG-NEXT: ADD_INT * T6.W, KC0[2].Y, literal.w,
|
|
; EG-NEXT: 2(2.802597e-45), 24(3.363116e-44)
|
|
; EG-NEXT: 16(2.242078e-44), 176(2.466285e-43)
|
|
; EG-NEXT: LSHR T33.X, PS, literal.x,
|
|
; EG-NEXT: LSHR T5.Y, T20.Y, literal.y,
|
|
; EG-NEXT: LSHR T5.Z, T20.Y, literal.z,
|
|
; EG-NEXT: LSHR T6.W, T21.Z, literal.y,
|
|
; EG-NEXT: ADD_INT * T7.W, KC0[2].Y, literal.w,
|
|
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; EG-NEXT: 24(3.363116e-44), 160(2.242078e-43)
|
|
; EG-NEXT: LSHR T34.X, PS, literal.x,
|
|
; EG-NEXT: LSHR T6.Y, T21.Z, literal.y,
|
|
; EG-NEXT: LSHR T6.Z, T21.W, literal.z,
|
|
; EG-NEXT: LSHR T7.W, T21.W, literal.y,
|
|
; EG-NEXT: ADD_INT * T8.W, KC0[2].Y, literal.w,
|
|
; EG-NEXT: 2(2.802597e-45), 24(3.363116e-44)
|
|
; EG-NEXT: 16(2.242078e-44), 208(2.914701e-43)
|
|
; EG-NEXT: LSHR T35.X, PS, literal.x,
|
|
; EG-NEXT: LSHR T7.Y, T21.X, literal.y,
|
|
; EG-NEXT: LSHR T7.Z, T21.X, literal.z,
|
|
; EG-NEXT: LSHR T8.W, T21.Y, literal.y,
|
|
; EG-NEXT: LSHR * T9.W, T21.Y, literal.z,
|
|
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T36.X, T31.Y, 0.0, literal.x,
|
|
; EG-NEXT: LSHR T8.Y, T31.Z, literal.y,
|
|
; EG-NEXT: LSHR T8.Z, T31.Z, literal.z,
|
|
; EG-NEXT: LSHR T10.W, T31.W, literal.y,
|
|
; EG-NEXT: LSHR * T11.W, T31.Y, literal.z,
|
|
; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44)
|
|
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T37.X, T31.X, 0.0, literal.x,
|
|
; EG-NEXT: LSHR T9.Y, T31.W, literal.y,
|
|
; EG-NEXT: LSHR T9.Z, T31.X, literal.y,
|
|
; EG-NEXT: BFE_INT T36.W, PS, 0.0, literal.x,
|
|
; EG-NEXT: LSHR * T11.W, T31.Y, literal.z,
|
|
; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T38.X, T31.W, 0.0, literal.x,
|
|
; EG-NEXT: LSHR T10.Y, T31.X, literal.y,
|
|
; EG-NEXT: BFE_INT T36.Z, PS, 0.0, literal.x,
|
|
; EG-NEXT: BFE_INT T37.W, PV.Z, 0.0, literal.x,
|
|
; EG-NEXT: LSHR * T11.W, T31.Y, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44)
|
|
; EG-NEXT: BFE_INT T39.X, T31.Z, 0.0, literal.x,
|
|
; EG-NEXT: BFE_INT T36.Y, PS, 0.0, literal.x,
|
|
; EG-NEXT: BFE_INT T37.Z, PV.Y, 0.0, literal.x,
|
|
; EG-NEXT: BFE_INT T38.W, T9.Y, 0.0, literal.x,
|
|
; EG-NEXT: LSHR * T11.W, T31.X, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T31.X, T21.Y, 0.0, literal.x,
|
|
; EG-NEXT: BFE_INT T37.Y, PS, 0.0, literal.x,
|
|
; EG-NEXT: BFE_INT T38.Z, T10.W, 0.0, literal.x,
|
|
; EG-NEXT: BFE_INT T39.W, T8.Z, 0.0, literal.x,
|
|
; EG-NEXT: LSHR * T10.W, T31.W, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T40.X, T21.X, 0.0, literal.x,
|
|
; EG-NEXT: BFE_INT T38.Y, PS, 0.0, literal.x,
|
|
; EG-NEXT: BFE_INT T39.Z, T8.Y, 0.0, literal.x,
|
|
; EG-NEXT: BFE_INT T31.W, T9.W, 0.0, literal.x,
|
|
; EG-NEXT: LSHR * T9.W, T31.Z, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T41.X, T21.W, 0.0, literal.x,
|
|
; EG-NEXT: BFE_INT T39.Y, PS, 0.0, literal.x,
|
|
; EG-NEXT: BFE_INT T31.Z, T8.W, 0.0, literal.x, BS:VEC_120/SCL_212
|
|
; EG-NEXT: BFE_INT * T40.W, T7.Z, 0.0, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: ALU clause starting at 152:
|
|
; EG-NEXT: LSHR * T8.W, T21.Y, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T42.X, T21.Z, 0.0, literal.x,
|
|
; EG-NEXT: BFE_INT T31.Y, PV.W, 0.0, literal.x,
|
|
; EG-NEXT: BFE_INT T40.Z, T7.Y, 0.0, literal.x,
|
|
; EG-NEXT: BFE_INT T41.W, T7.W, 0.0, literal.x,
|
|
; EG-NEXT: LSHR * T7.W, T21.X, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T21.X, T20.Y, 0.0, literal.x,
|
|
; EG-NEXT: BFE_INT T40.Y, PS, 0.0, literal.x,
|
|
; EG-NEXT: BFE_INT T41.Z, T6.Z, 0.0, literal.x,
|
|
; EG-NEXT: BFE_INT T42.W, T6.Y, 0.0, literal.x, BS:VEC_120/SCL_212
|
|
; EG-NEXT: LSHR * T7.W, T21.W, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T43.X, T20.X, 0.0, literal.x,
|
|
; EG-NEXT: BFE_INT T41.Y, PS, 0.0, literal.x,
|
|
; EG-NEXT: BFE_INT T42.Z, T6.W, 0.0, literal.x,
|
|
; EG-NEXT: BFE_INT T21.W, T5.Z, 0.0, literal.x,
|
|
; EG-NEXT: LSHR * T6.W, T21.Z, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T44.X, T20.W, 0.0, literal.x,
|
|
; EG-NEXT: BFE_INT T42.Y, PS, 0.0, literal.x,
|
|
; EG-NEXT: BFE_INT T21.Z, T5.Y, 0.0, literal.x,
|
|
; EG-NEXT: BFE_INT T43.W, T5.W, 0.0, literal.x, BS:VEC_120/SCL_212
|
|
; EG-NEXT: LSHR * T5.W, T20.Y, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T45.X, T20.Z, 0.0, literal.x,
|
|
; EG-NEXT: BFE_INT T21.Y, PS, 0.0, literal.x,
|
|
; EG-NEXT: BFE_INT T43.Z, T4.Z, 0.0, literal.x, BS:VEC_120/SCL_212
|
|
; EG-NEXT: BFE_INT T44.W, T4.Y, 0.0, literal.x,
|
|
; EG-NEXT: LSHR * T5.W, T20.X, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T20.X, T19.Y, 0.0, literal.x,
|
|
; EG-NEXT: BFE_INT T43.Y, PS, 0.0, literal.x,
|
|
; EG-NEXT: BFE_INT T44.Z, T4.W, 0.0, literal.x,
|
|
; EG-NEXT: BFE_INT T45.W, T3.Z, 0.0, literal.x,
|
|
; EG-NEXT: LSHR * T4.W, T20.W, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T46.X, T19.X, 0.0, literal.x,
|
|
; EG-NEXT: BFE_INT T44.Y, PS, 0.0, literal.x,
|
|
; EG-NEXT: BFE_INT T45.Z, T3.Y, 0.0, literal.x,
|
|
; EG-NEXT: BFE_INT T20.W, T3.W, 0.0, literal.x,
|
|
; EG-NEXT: LSHR * T3.W, T20.Z, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T47.X, T19.W, 0.0, literal.x,
|
|
; EG-NEXT: BFE_INT T45.Y, PS, 0.0, literal.x,
|
|
; EG-NEXT: BFE_INT T20.Z, T2.W, 0.0, literal.x, BS:VEC_120/SCL_212
|
|
; EG-NEXT: BFE_INT T46.W, T2.Z, 0.0, literal.x,
|
|
; EG-NEXT: LSHR * T2.W, T19.Y, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T48.X, T19.Z, 0.0, literal.x,
|
|
; EG-NEXT: BFE_INT T20.Y, PS, 0.0, literal.x,
|
|
; EG-NEXT: BFE_INT T46.Z, T2.Y, 0.0, literal.x,
|
|
; EG-NEXT: BFE_INT T47.W, T1.W, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), 192(2.690493e-43)
|
|
; EG-NEXT: LSHR T19.X, PS, literal.x,
|
|
; EG-NEXT: BFE_INT T46.Y, T1.Z, 0.0, literal.y,
|
|
; EG-NEXT: BFE_INT T47.Z, T1.Y, 0.0, literal.y,
|
|
; EG-NEXT: BFE_INT T48.W, T0.W, 0.0, literal.y,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
|
|
; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
|
|
; EG-NEXT: 240(3.363116e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LSHR T49.X, PS, literal.x,
|
|
; EG-NEXT: BFE_INT T47.Y, T0.Z, 0.0, literal.y,
|
|
; EG-NEXT: BFE_INT T48.Z, T0.Y, 0.0, literal.y,
|
|
; EG-NEXT: LSHR T0.W, T19.Z, literal.y, BS:VEC_120/SCL_212
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
|
|
; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
|
|
; EG-NEXT: 224(3.138909e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LSHR T50.X, PS, literal.x,
|
|
; EG-NEXT: BFE_INT * T48.Y, PV.W, 0.0, literal.y,
|
|
; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
|
|
;
|
|
; CM-LABEL: global_sextload_v64i8_to_v64i32:
|
|
; CM: ; %bb.0:
|
|
; CM-NEXT: ALU 0, @32, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: TEX 1 @24
|
|
; CM-NEXT: ALU 39, @33, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: TEX 1 @28
|
|
; CM-NEXT: ALU 84, @73, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: ALU 73, @158, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T24, T50.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T19, T21.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T49, T23.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T48, T38.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T47, T37.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T20, T36.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T46, T35.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T45, T34.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T44, T33.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T22, T32.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T43, T31.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T42, T30.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T41, T29.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T28, T27.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T40, T26.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T39, T25.X
|
|
; CM-NEXT: CF_END
|
|
; CM-NEXT: PAD
|
|
; CM-NEXT: Fetch clause starting at 24:
|
|
; CM-NEXT: VTX_READ_128 T19.XYZW, T22.X, 0, #1
|
|
; CM-NEXT: VTX_READ_128 T20.XYZW, T22.X, 16, #1
|
|
; CM-NEXT: Fetch clause starting at 28:
|
|
; CM-NEXT: VTX_READ_128 T28.XYZW, T22.X, 48, #1
|
|
; CM-NEXT: VTX_READ_128 T22.XYZW, T22.X, 32, #1
|
|
; CM-NEXT: ALU clause starting at 32:
|
|
; CM-NEXT: MOV * T22.X, KC0[2].Z,
|
|
; CM-NEXT: ALU clause starting at 33:
|
|
; CM-NEXT: LSHR T0.Y, T19.Y, literal.x,
|
|
; CM-NEXT: LSHR T0.Z, T19.Y, literal.y,
|
|
; CM-NEXT: LSHR * T0.W, T19.X, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 16(2.242078e-44)
|
|
; CM-NEXT: LSHR T21.X, T19.Y, literal.x,
|
|
; CM-NEXT: LSHR T1.Y, T19.X, literal.y,
|
|
; CM-NEXT: LSHR T1.Z, T19.W, literal.z,
|
|
; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.w,
|
|
; CM-NEXT: 24(3.363116e-44), 16(2.242078e-44)
|
|
; CM-NEXT: 8(1.121039e-44), 48(6.726233e-44)
|
|
; CM-NEXT: LSHR T23.X, T19.X, literal.x,
|
|
; CM-NEXT: LSHR T2.Y, T19.W, literal.y,
|
|
; CM-NEXT: LSHR T2.Z, T19.Z, literal.z,
|
|
; CM-NEXT: LSHR * T2.W, T19.W, literal.x,
|
|
; CM-NEXT: 24(3.363116e-44), 16(2.242078e-44)
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR T24.X, T19.Z, literal.x,
|
|
; CM-NEXT: LSHR T3.Y, T20.Y, literal.y,
|
|
; CM-NEXT: LSHR T3.Z, T19.Z, literal.z,
|
|
; CM-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.w,
|
|
; CM-NEXT: 16(2.242078e-44), 8(1.121039e-44)
|
|
; CM-NEXT: 24(3.363116e-44), 224(3.138909e-43)
|
|
; CM-NEXT: LSHR T25.X, PV.W, literal.x,
|
|
; CM-NEXT: LSHR T4.Y, T20.Y, literal.y,
|
|
; CM-NEXT: LSHR T4.Z, T20.X, literal.z,
|
|
; CM-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.w,
|
|
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; CM-NEXT: 8(1.121039e-44), 240(3.363116e-43)
|
|
; CM-NEXT: LSHR T26.X, PV.W, literal.x,
|
|
; CM-NEXT: LSHR T5.Y, T20.Y, literal.y,
|
|
; CM-NEXT: LSHR T5.Z, T20.X, literal.z,
|
|
; CM-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.w,
|
|
; CM-NEXT: 2(2.802597e-45), 24(3.363116e-44)
|
|
; CM-NEXT: 16(2.242078e-44), 192(2.690493e-43)
|
|
; CM-NEXT: LSHR T27.X, PV.W, literal.x,
|
|
; CM-NEXT: LSHR T6.Y, T20.W, literal.y,
|
|
; CM-NEXT: LSHR T6.Z, T20.X, literal.z,
|
|
; CM-NEXT: LSHR * T3.W, T20.W, literal.w,
|
|
; CM-NEXT: 2(2.802597e-45), 8(1.121039e-44)
|
|
; CM-NEXT: 24(3.363116e-44), 16(2.242078e-44)
|
|
; CM-NEXT: ALU clause starting at 73:
|
|
; CM-NEXT: LSHR T7.Z, T20.Z, literal.x,
|
|
; CM-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.y,
|
|
; CM-NEXT: 8(1.121039e-44), 208(2.914701e-43)
|
|
; CM-NEXT: LSHR T29.X, PV.W, literal.x,
|
|
; CM-NEXT: LSHR T7.Y, T20.W, literal.y,
|
|
; CM-NEXT: LSHR T8.Z, T20.Z, literal.z,
|
|
; CM-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.w,
|
|
; CM-NEXT: 2(2.802597e-45), 24(3.363116e-44)
|
|
; CM-NEXT: 16(2.242078e-44), 160(2.242078e-43)
|
|
; CM-NEXT: LSHR T30.X, PV.W, literal.x,
|
|
; CM-NEXT: LSHR T8.Y, T22.Y, literal.y,
|
|
; CM-NEXT: LSHR T9.Z, T20.Z, literal.z,
|
|
; CM-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.w,
|
|
; CM-NEXT: 2(2.802597e-45), 8(1.121039e-44)
|
|
; CM-NEXT: 24(3.363116e-44), 176(2.466285e-43)
|
|
; CM-NEXT: LSHR T31.X, PV.W, literal.x,
|
|
; CM-NEXT: LSHR T9.Y, T22.Y, literal.y,
|
|
; CM-NEXT: LSHR T10.Z, T22.X, literal.z,
|
|
; CM-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.w,
|
|
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; CM-NEXT: 8(1.121039e-44), 128(1.793662e-43)
|
|
; CM-NEXT: LSHR T32.X, PV.W, literal.x,
|
|
; CM-NEXT: LSHR T10.Y, T22.Y, literal.y,
|
|
; CM-NEXT: LSHR T11.Z, T22.X, literal.z,
|
|
; CM-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.w,
|
|
; CM-NEXT: 2(2.802597e-45), 24(3.363116e-44)
|
|
; CM-NEXT: 16(2.242078e-44), 144(2.017870e-43)
|
|
; CM-NEXT: LSHR T33.X, PV.W, literal.x,
|
|
; CM-NEXT: LSHR T11.Y, T22.W, literal.y,
|
|
; CM-NEXT: LSHR T12.Z, T22.X, literal.z,
|
|
; CM-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.w,
|
|
; CM-NEXT: 2(2.802597e-45), 8(1.121039e-44)
|
|
; CM-NEXT: 24(3.363116e-44), 96(1.345247e-43)
|
|
; CM-NEXT: LSHR T34.X, PV.W, literal.x,
|
|
; CM-NEXT: LSHR T12.Y, T22.W, literal.y,
|
|
; CM-NEXT: LSHR T13.Z, T22.Z, literal.z,
|
|
; CM-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.w,
|
|
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; CM-NEXT: 8(1.121039e-44), 112(1.569454e-43)
|
|
; CM-NEXT: LSHR T35.X, PV.W, literal.x,
|
|
; CM-NEXT: LSHR T13.Y, T22.W, literal.y,
|
|
; CM-NEXT: LSHR T14.Z, T22.Z, literal.z,
|
|
; CM-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.w,
|
|
; CM-NEXT: 2(2.802597e-45), 24(3.363116e-44)
|
|
; CM-NEXT: 16(2.242078e-44), 64(8.968310e-44)
|
|
; CM-NEXT: LSHR T36.X, PV.W, literal.x,
|
|
; CM-NEXT: LSHR T14.Y, T28.Y, literal.y,
|
|
; CM-NEXT: LSHR T15.Z, T22.Z, literal.z,
|
|
; CM-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.w,
|
|
; CM-NEXT: 2(2.802597e-45), 8(1.121039e-44)
|
|
; CM-NEXT: 24(3.363116e-44), 80(1.121039e-43)
|
|
; CM-NEXT: LSHR T37.X, PV.W, literal.x,
|
|
; CM-NEXT: LSHR T15.Y, T28.Y, literal.y,
|
|
; CM-NEXT: LSHR T16.Z, T28.X, literal.z,
|
|
; CM-NEXT: ADD_INT * T4.W, KC0[2].Y, literal.w,
|
|
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; CM-NEXT: 8(1.121039e-44), 32(4.484155e-44)
|
|
; CM-NEXT: LSHR T38.X, PV.W, literal.x,
|
|
; CM-NEXT: LSHR T16.Y, T28.Y, literal.y,
|
|
; CM-NEXT: LSHR T17.Z, T28.X, literal.z,
|
|
; CM-NEXT: LSHR * T4.W, T28.W, literal.w,
|
|
; CM-NEXT: 2(2.802597e-45), 24(3.363116e-44)
|
|
; CM-NEXT: 16(2.242078e-44), 8(1.121039e-44)
|
|
; CM-NEXT: BFE_INT T39.X, T28.Z, 0.0, literal.x,
|
|
; CM-NEXT: LSHR T17.Y, T28.X, literal.y,
|
|
; CM-NEXT: LSHR T18.Z, T28.W, literal.z,
|
|
; CM-NEXT: LSHR * T5.W, T28.Z, literal.y,
|
|
; CM-NEXT: 8(1.121039e-44), 24(3.363116e-44)
|
|
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT T40.X, T28.W, 0.0, literal.x,
|
|
; CM-NEXT: LSHR T18.Y, T28.W, literal.y,
|
|
; CM-NEXT: LSHR T21.Z, T28.Z, literal.z,
|
|
; CM-NEXT: BFE_INT * T39.W, PV.W, 0.0, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 24(3.363116e-44)
|
|
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT T28.X, T28.X, 0.0, literal.x,
|
|
; CM-NEXT: LSHR T21.Y, T28.Z, literal.x,
|
|
; CM-NEXT: BFE_INT T39.Z, PV.Z, 0.0, literal.x,
|
|
; CM-NEXT: BFE_INT * T40.W, PV.Y, 0.0, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT T41.X, T28.Y, 0.0, literal.x,
|
|
; CM-NEXT: BFE_INT T39.Y, PV.Y, 0.0, literal.x,
|
|
; CM-NEXT: BFE_INT T40.Z, T18.Z, 0.0, literal.x,
|
|
; CM-NEXT: BFE_INT * T28.W, T17.Y, 0.0, literal.x, BS:VEC_120/SCL_212
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: ALU clause starting at 158:
|
|
; CM-NEXT: BFE_INT T42.X, T22.Z, 0.0, literal.x,
|
|
; CM-NEXT: BFE_INT T40.Y, T4.W, 0.0, literal.x,
|
|
; CM-NEXT: BFE_INT T28.Z, T17.Z, 0.0, literal.x, BS:VEC_120/SCL_212
|
|
; CM-NEXT: BFE_INT * T41.W, T16.Y, 0.0, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT T43.X, T22.W, 0.0, literal.x,
|
|
; CM-NEXT: BFE_INT T28.Y, T16.Z, 0.0, literal.x,
|
|
; CM-NEXT: BFE_INT T41.Z, T15.Y, 0.0, literal.x,
|
|
; CM-NEXT: BFE_INT * T42.W, T15.Z, 0.0, literal.x, BS:VEC_120/SCL_212
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT T22.X, T22.X, 0.0, literal.x,
|
|
; CM-NEXT: BFE_INT T41.Y, T14.Y, 0.0, literal.x,
|
|
; CM-NEXT: BFE_INT T42.Z, T14.Z, 0.0, literal.x,
|
|
; CM-NEXT: BFE_INT * T43.W, T13.Y, 0.0, literal.x, BS:VEC_120/SCL_212
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT T44.X, T22.Y, 0.0, literal.x,
|
|
; CM-NEXT: BFE_INT T42.Y, T13.Z, 0.0, literal.x,
|
|
; CM-NEXT: BFE_INT T43.Z, T12.Y, 0.0, literal.x, BS:VEC_120/SCL_212
|
|
; CM-NEXT: BFE_INT * T22.W, T12.Z, 0.0, literal.x, BS:VEC_120/SCL_212
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT T45.X, T20.Z, 0.0, literal.x,
|
|
; CM-NEXT: BFE_INT T43.Y, T11.Y, 0.0, literal.x,
|
|
; CM-NEXT: BFE_INT T22.Z, T11.Z, 0.0, literal.x, BS:VEC_120/SCL_212
|
|
; CM-NEXT: BFE_INT * T44.W, T10.Y, 0.0, literal.x, BS:VEC_120/SCL_212
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT T46.X, T20.W, 0.0, literal.x,
|
|
; CM-NEXT: BFE_INT T22.Y, T10.Z, 0.0, literal.x,
|
|
; CM-NEXT: BFE_INT T44.Z, T9.Y, 0.0, literal.x,
|
|
; CM-NEXT: BFE_INT * T45.W, T9.Z, 0.0, literal.x, BS:VEC_120/SCL_212
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT T20.X, T20.X, 0.0, literal.x,
|
|
; CM-NEXT: BFE_INT T44.Y, T8.Y, 0.0, literal.x,
|
|
; CM-NEXT: BFE_INT T45.Z, T8.Z, 0.0, literal.x,
|
|
; CM-NEXT: BFE_INT * T46.W, T7.Y, 0.0, literal.x, BS:VEC_120/SCL_212
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT T47.X, T20.Y, 0.0, literal.x,
|
|
; CM-NEXT: BFE_INT T45.Y, T7.Z, 0.0, literal.x,
|
|
; CM-NEXT: BFE_INT T46.Z, T3.W, 0.0, literal.x,
|
|
; CM-NEXT: BFE_INT * T20.W, T6.Z, 0.0, literal.x, BS:VEC_120/SCL_212
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT T48.X, T19.Z, 0.0, literal.x,
|
|
; CM-NEXT: BFE_INT T46.Y, T6.Y, 0.0, literal.x,
|
|
; CM-NEXT: BFE_INT T20.Z, T5.Z, 0.0, literal.x, BS:VEC_120/SCL_212
|
|
; CM-NEXT: BFE_INT * T47.W, T5.Y, 0.0, literal.x, BS:VEC_120/SCL_212
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT T49.X, T19.W, 0.0, literal.x,
|
|
; CM-NEXT: BFE_INT T20.Y, T4.Z, 0.0, literal.x,
|
|
; CM-NEXT: BFE_INT T47.Z, T4.Y, 0.0, literal.x,
|
|
; CM-NEXT: BFE_INT * T48.W, T3.Z, 0.0, literal.x, BS:VEC_120/SCL_212
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT T19.X, T19.X, 0.0, literal.x,
|
|
; CM-NEXT: BFE_INT T47.Y, T3.Y, 0.0, literal.x,
|
|
; CM-NEXT: BFE_INT T48.Z, T24.X, 0.0, literal.x, BS:VEC_120/SCL_212
|
|
; CM-NEXT: BFE_INT * T49.W, T2.W, 0.0, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT T24.X, T19.Y, 0.0, literal.x,
|
|
; CM-NEXT: BFE_INT T48.Y, T2.Z, 0.0, literal.x,
|
|
; CM-NEXT: BFE_INT T49.Z, T2.Y, 0.0, literal.x, BS:VEC_120/SCL_212
|
|
; CM-NEXT: BFE_INT * T19.W, T23.X, 0.0, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR T23.X, T1.W, literal.x,
|
|
; CM-NEXT: BFE_INT T49.Y, T1.Z, 0.0, literal.y,
|
|
; CM-NEXT: BFE_INT T19.Z, T1.Y, 0.0, literal.y,
|
|
; CM-NEXT: BFE_INT * T24.W, T21.X, 0.0, literal.y,
|
|
; CM-NEXT: 2(2.802597e-45), 8(1.121039e-44)
|
|
; CM-NEXT: LSHR T21.X, KC0[2].Y, literal.x,
|
|
; CM-NEXT: BFE_INT T19.Y, T0.W, 0.0, literal.y,
|
|
; CM-NEXT: BFE_INT T24.Z, T0.Z, 0.0, literal.y,
|
|
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
|
|
; CM-NEXT: 2(2.802597e-45), 8(1.121039e-44)
|
|
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR T50.X, PV.W, literal.x,
|
|
; CM-NEXT: BFE_INT * T24.Y, T0.Y, 0.0, literal.y,
|
|
; CM-NEXT: 2(2.802597e-45), 8(1.121039e-44)
|
|
%load = load <64 x i8>, ptr addrspace(1) %in
|
|
%ext = sext <64 x i8> %load to <64 x i32>
|
|
store <64 x i32> %ext, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
; Test kernel: loads a single i8 from %in (addrspace 1), zero-extends it to
; i64 and stores the result to %out (addrspace 1).
; The assertion lines inside the body are the autogenerated expected llc
; output for the check prefixes GCN-NOHSA-SI, GCN-HSA, GCN-NOHSA-VI, EG and CM;
; regenerate them with utils/update_llc_test_checks.py rather than editing
; them by hand.
define amdgpu_kernel void @global_zextload_i8_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
|
; GCN-NOHSA-SI-LABEL: global_zextload_i8_to_i64:
|
|
; GCN-NOHSA-SI: ; %bb.0:
|
|
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, 0
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
|
; GCN-NOHSA-SI-NEXT: s_endpgm
|
|
;
|
|
; GCN-HSA-LABEL: global_zextload_i8_to_i64:
|
|
; GCN-HSA: ; %bb.0:
|
|
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
|
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
|
|
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
|
|
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
|
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN-HSA-NEXT: flat_load_ubyte v0, v[0:1]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0
|
|
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-HSA-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
|
; GCN-HSA-NEXT: s_endpgm
|
|
;
|
|
; GCN-NOHSA-VI-LABEL: global_zextload_i8_to_i64:
|
|
; GCN-NOHSA-VI: ; %bb.0:
|
|
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NOHSA-VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
|
; GCN-NOHSA-VI-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: global_zextload_i8_to_i64:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: TEX 0 @6
|
|
; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
|
|
; EG-NEXT: CF_END
|
|
; EG-NEXT: PAD
|
|
; EG-NEXT: Fetch clause starting at 6:
|
|
; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
|
|
; EG-NEXT: ALU clause starting at 8:
|
|
; EG-NEXT: MOV * T0.X, KC0[2].Z,
|
|
; EG-NEXT: ALU clause starting at 9:
|
|
; EG-NEXT: MOV * T0.Y, 0.0,
|
|
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
;
|
|
; CM-LABEL: global_zextload_i8_to_i64:
|
|
; CM: ; %bb.0:
|
|
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: TEX 0 @6
|
|
; CM-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
|
|
; CM-NEXT: CF_END
|
|
; CM-NEXT: PAD
|
|
; CM-NEXT: Fetch clause starting at 6:
|
|
; CM-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
|
|
; CM-NEXT: ALU clause starting at 8:
|
|
; CM-NEXT: MOV * T0.X, KC0[2].Z,
|
|
; CM-NEXT: ALU clause starting at 9:
|
|
; CM-NEXT: MOV * T0.Y, 0.0,
|
|
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
|
|
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
; IR under test: scalar i8 load -> zext to i64 -> i64 store.
%a = load i8, ptr addrspace(1) %in
|
|
%ext = zext i8 %a to i64
|
|
store i64 %ext, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
; TODO: On EG, why is the ASHR shift literal 31 rather than 7?
|
|
; Test kernel: loads a single i8 from %in (addrspace 1), sign-extends it to
; i64 and stores the result to %out (addrspace 1).
; The assertion lines inside the body are the autogenerated expected llc
; output for the check prefixes GCN-NOHSA-SI, GCN-HSA, GCN-NOHSA-VI, EG and CM;
; regenerate them with utils/update_llc_test_checks.py rather than editing
; them by hand.
define amdgpu_kernel void @global_sextload_i8_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
|
; GCN-NOHSA-SI-LABEL: global_sextload_i8_to_i64:
|
|
; GCN-NOHSA-SI: ; %bb.0:
|
|
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_sbyte v0, off, s[8:11], 0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
|
; GCN-NOHSA-SI-NEXT: s_endpgm
|
|
;
|
|
; GCN-HSA-LABEL: global_sextload_i8_to_i64:
|
|
; GCN-HSA: ; %bb.0:
|
|
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
|
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
|
|
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
|
|
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
|
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN-HSA-NEXT: flat_load_sbyte v0, v[0:1]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1
|
|
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
|
|
; GCN-HSA-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
|
; GCN-HSA-NEXT: s_endpgm
|
|
;
|
|
; GCN-NOHSA-VI-LABEL: global_sextload_i8_to_i64:
|
|
; GCN-NOHSA-VI: ; %bb.0:
|
|
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NOHSA-VI-NEXT: buffer_load_sbyte v0, off, s[8:11], 0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v0, 0, 16
|
|
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
|
; GCN-NOHSA-VI-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: global_sextload_i8_to_i64:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: TEX 0 @6
|
|
; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
|
|
; EG-NEXT: CF_END
|
|
; EG-NEXT: PAD
|
|
; EG-NEXT: Fetch clause starting at 6:
|
|
; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
|
|
; EG-NEXT: ALU clause starting at 8:
|
|
; EG-NEXT: MOV * T0.X, KC0[2].Z,
|
|
; EG-NEXT: ALU clause starting at 9:
|
|
; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, literal.x,
|
|
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)
|
|
; EG-NEXT: ASHR * T0.Y, PV.X, literal.x,
|
|
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
|
|
;
|
|
; CM-LABEL: global_sextload_i8_to_i64:
|
|
; CM: ; %bb.0:
|
|
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: TEX 0 @6
|
|
; CM-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
|
|
; CM-NEXT: CF_END
|
|
; CM-NEXT: PAD
|
|
; CM-NEXT: Fetch clause starting at 6:
|
|
; CM-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
|
|
; CM-NEXT: ALU clause starting at 8:
|
|
; CM-NEXT: MOV * T0.X, KC0[2].Z,
|
|
; CM-NEXT: ALU clause starting at 9:
|
|
; CM-NEXT: BFE_INT * T0.X, T0.X, 0.0, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
|
|
; CM-NEXT: ASHR * T0.Y, PV.X, literal.y,
|
|
; CM-NEXT: 2(2.802597e-45), 31(4.344025e-44)
|
|
; IR under test: scalar i8 load -> sext to i64 -> i64 store.
%a = load i8, ptr addrspace(1) %in
|
|
%ext = sext i8 %a to i64
|
|
store i64 %ext, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
; Test kernel: loads a <1 x i8> vector from %in (addrspace 1), zero-extends it
; to <1 x i64> and stores the result to %out (addrspace 1).
; The assertion lines inside the body are the autogenerated expected llc
; output for the check prefixes GCN-NOHSA-SI, GCN-HSA, GCN-NOHSA-VI, EG and CM;
; regenerate them with utils/update_llc_test_checks.py rather than editing
; them by hand.
define amdgpu_kernel void @global_zextload_v1i8_to_v1i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
|
; GCN-NOHSA-SI-LABEL: global_zextload_v1i8_to_v1i64:
|
|
; GCN-NOHSA-SI: ; %bb.0:
|
|
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, 0
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
|
; GCN-NOHSA-SI-NEXT: s_endpgm
|
|
;
|
|
; GCN-HSA-LABEL: global_zextload_v1i8_to_v1i64:
|
|
; GCN-HSA: ; %bb.0:
|
|
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
|
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
|
|
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
|
|
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
|
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN-HSA-NEXT: flat_load_ubyte v0, v[0:1]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0
|
|
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-HSA-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
|
; GCN-HSA-NEXT: s_endpgm
|
|
;
|
|
; GCN-NOHSA-VI-LABEL: global_zextload_v1i8_to_v1i64:
|
|
; GCN-NOHSA-VI: ; %bb.0:
|
|
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NOHSA-VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
|
; GCN-NOHSA-VI-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: global_zextload_v1i8_to_v1i64:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: TEX 0 @6
|
|
; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
|
|
; EG-NEXT: CF_END
|
|
; EG-NEXT: PAD
|
|
; EG-NEXT: Fetch clause starting at 6:
|
|
; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
|
|
; EG-NEXT: ALU clause starting at 8:
|
|
; EG-NEXT: MOV * T0.X, KC0[2].Z,
|
|
; EG-NEXT: ALU clause starting at 9:
|
|
; EG-NEXT: MOV * T0.Y, 0.0,
|
|
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
;
|
|
; CM-LABEL: global_zextload_v1i8_to_v1i64:
|
|
; CM: ; %bb.0:
|
|
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: TEX 0 @6
|
|
; CM-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
|
|
; CM-NEXT: CF_END
|
|
; CM-NEXT: PAD
|
|
; CM-NEXT: Fetch clause starting at 6:
|
|
; CM-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
|
|
; CM-NEXT: ALU clause starting at 8:
|
|
; CM-NEXT: MOV * T0.X, KC0[2].Z,
|
|
; CM-NEXT: ALU clause starting at 9:
|
|
; CM-NEXT: MOV * T0.Y, 0.0,
|
|
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
|
|
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
; IR under test: <1 x i8> load -> zext to <1 x i64> -> vector store.
%load = load <1 x i8>, ptr addrspace(1) %in
|
|
%ext = zext <1 x i8> %load to <1 x i64>
|
|
store <1 x i64> %ext, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
; TODO: On EG, why is the ASHR shift literal 31 rather than 7?
|
|
; Test kernel: loads a <1 x i8> vector from %in (addrspace 1), sign-extends it
; to <1 x i64> and stores the result to %out (addrspace 1).
; The assertion lines inside the body are the autogenerated expected llc
; output for the check prefixes GCN-NOHSA-SI, GCN-HSA, GCN-NOHSA-VI, EG and CM;
; regenerate them with utils/update_llc_test_checks.py rather than editing
; them by hand.
define amdgpu_kernel void @global_sextload_v1i8_to_v1i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
|
; GCN-NOHSA-SI-LABEL: global_sextload_v1i8_to_v1i64:
|
|
; GCN-NOHSA-SI: ; %bb.0:
|
|
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_sbyte v0, off, s[8:11], 0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
|
; GCN-NOHSA-SI-NEXT: s_endpgm
|
|
;
|
|
; GCN-HSA-LABEL: global_sextload_v1i8_to_v1i64:
|
|
; GCN-HSA: ; %bb.0:
|
|
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
|
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
|
|
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
|
|
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
|
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN-HSA-NEXT: flat_load_sbyte v0, v[0:1]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1
|
|
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
|
|
; GCN-HSA-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
|
; GCN-HSA-NEXT: s_endpgm
|
|
;
|
|
; GCN-NOHSA-VI-LABEL: global_sextload_v1i8_to_v1i64:
|
|
; GCN-NOHSA-VI: ; %bb.0:
|
|
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NOHSA-VI-NEXT: buffer_load_sbyte v0, off, s[8:11], 0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v0, 0, 16
|
|
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
|
; GCN-NOHSA-VI-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: global_sextload_v1i8_to_v1i64:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: TEX 0 @6
|
|
; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
|
|
; EG-NEXT: CF_END
|
|
; EG-NEXT: PAD
|
|
; EG-NEXT: Fetch clause starting at 6:
|
|
; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
|
|
; EG-NEXT: ALU clause starting at 8:
|
|
; EG-NEXT: MOV * T0.X, KC0[2].Z,
|
|
; EG-NEXT: ALU clause starting at 9:
|
|
; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, literal.x,
|
|
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45)
|
|
; EG-NEXT: ASHR * T0.Y, PV.X, literal.x,
|
|
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
|
|
;
|
|
; CM-LABEL: global_sextload_v1i8_to_v1i64:
|
|
; CM: ; %bb.0:
|
|
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: TEX 0 @6
|
|
; CM-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
|
|
; CM-NEXT: CF_END
|
|
; CM-NEXT: PAD
|
|
; CM-NEXT: Fetch clause starting at 6:
|
|
; CM-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
|
|
; CM-NEXT: ALU clause starting at 8:
|
|
; CM-NEXT: MOV * T0.X, KC0[2].Z,
|
|
; CM-NEXT: ALU clause starting at 9:
|
|
; CM-NEXT: BFE_INT * T0.X, T0.X, 0.0, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
|
|
; CM-NEXT: ASHR * T0.Y, PV.X, literal.y,
|
|
; CM-NEXT: 2(2.802597e-45), 31(4.344025e-44)
|
|
; IR under test: <1 x i8> load -> sext to <1 x i64> -> vector store.
%load = load <1 x i8>, ptr addrspace(1) %in
|
|
%ext = sext <1 x i8> %load to <1 x i64>
|
|
store <1 x i64> %ext, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
; Loads a <2 x i8> vector through the address space 1 pointer %in, zero-extends
; every lane to i64 and stores the resulting <2 x i64> through %out.
; The per-target assertions inside the body are autogenerated (see the NOTE at
; the top of this file); regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
define amdgpu_kernel void @global_zextload_v2i8_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GCN-NOHSA-SI-LABEL: global_zextload_v2i8_to_v2i64:
; GCN-NOHSA-SI: ; %bb.0:
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
; GCN-NOHSA-SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, 0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 8, v0
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xff, v0
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v1
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_zextload_v2i8_to_v2i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: flat_load_ushort v0, v[0:1]
; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 8, v0
; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xff, v0
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: s_endpgm
;
; GCN-NOHSA-VI-LABEL: global_zextload_v2i8_to_v2i64:
; GCN-NOHSA-VI: ; %bb.0:
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, 8
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0
; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xff, v0
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_v2i8_to_v2i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 14, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T5.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_16 T4.X, T4.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.Y, T2.X,
; EG-NEXT: MOV * T4.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 10:
; EG-NEXT: AND_INT T0.W, T4.X, literal.x,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), -65536(nan)
; EG-NEXT: OR_INT * T0.W, PS, PV.W,
; EG-NEXT: MOV * T2.X, PV.W,
; EG-NEXT: MOV T0.Y, PV.X,
; EG-NEXT: MOV * T1.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: BFE_UINT * T4.Z, PV.Y, literal.x, PV.W,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T4.X, T0.W, literal.x,
; EG-NEXT: MOV T4.Y, 0.0,
; EG-NEXT: MOV T4.W, 0.0,
; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.y,
; EG-NEXT: 255(3.573311e-43), 2(2.802597e-45)
;
; CM-LABEL: global_zextload_v2i8_to_v2i64:
; CM: ; %bb.0:
; CM-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @6
; CM-NEXT: ALU 15, @10, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4, T5.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_16 T4.X, T4.X, 0, #1
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T0.Y, T2.X,
; CM-NEXT: MOV * T4.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 10:
; CM-NEXT: AND_INT T0.Z, T4.X, literal.x,
; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), -65536(nan)
; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z,
; CM-NEXT: MOV * T2.X, PV.W,
; CM-NEXT: MOV T0.Y, PV.X,
; CM-NEXT: MOV * T1.W, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: BFE_UINT * T4.Z, PV.Y, literal.x, PV.W,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: AND_INT T4.X, T0.W, literal.x,
; CM-NEXT: MOV T4.Y, 0.0,
; CM-NEXT: MOV * T4.W, 0.0,
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; CM-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %load = load <2 x i8>, ptr addrspace(1) %in
  %ext = zext <2 x i8> %load to <2 x i64>
  store <2 x i64> %ext, ptr addrspace(1) %out
  ret void
}
|
|
|
|
; Loads a <2 x i8> vector through the address space 1 pointer %in, sign-extends
; every lane to i64 and stores the resulting <2 x i64> through %out.
; The per-target assertions inside the body are autogenerated (see the NOTE at
; the top of this file); regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
define amdgpu_kernel void @global_sextload_v2i8_to_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GCN-NOHSA-SI-LABEL: global_sextload_v2i8_to_v2i64:
; GCN-NOHSA-SI: ; %bb.0:
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
; GCN-NOHSA-SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 8, v0
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v0, 0, 8
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v2, 0, 8
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_sextload_v2i8_to_v2i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: flat_load_ushort v0, v[0:1]
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 8, v0
; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 8
; GCN-HSA-NEXT: v_bfe_i32 v2, v2, 0, 8
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: s_endpgm
;
; GCN-NOHSA-VI-LABEL: global_sextload_v2i8_to_v2i64:
; GCN-NOHSA-VI: ; %bb.0:
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 8, v0
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v0, 0, 8
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v2, 0, 8
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_v2i8_to_v2i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 15, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T5.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_16 T4.X, T4.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.Y, T2.X,
; EG-NEXT: MOV * T4.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 10:
; EG-NEXT: AND_INT T0.W, T4.X, literal.x,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), -65536(nan)
; EG-NEXT: OR_INT * T0.W, PS, PV.W,
; EG-NEXT: MOV * T2.X, PV.W,
; EG-NEXT: MOV * T0.Y, PV.X,
; EG-NEXT: BFE_INT * T4.X, T0.W, 0.0, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: ASHR T4.Y, PV.X, literal.x,
; EG-NEXT: LSHR * T0.W, T0.Y, literal.y,
; EG-NEXT: 31(4.344025e-44), 8(1.121039e-44)
; EG-NEXT: BFE_INT * T4.Z, PV.W, 0.0, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T5.X, KC0[2].Y, literal.x,
; EG-NEXT: ASHR * T4.W, PV.Z, literal.y,
; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44)
;
; CM-LABEL: global_sextload_v2i8_to_v2i64:
; CM: ; %bb.0:
; CM-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @6
; CM-NEXT: ALU 15, @10, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4, T5.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_16 T4.X, T4.X, 0, #1
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T0.Y, T2.X,
; CM-NEXT: MOV * T4.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 10:
; CM-NEXT: AND_INT T0.Z, T4.X, literal.x,
; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), -65536(nan)
; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z,
; CM-NEXT: MOV * T2.X, PV.W,
; CM-NEXT: MOV * T0.Y, PV.X,
; CM-NEXT: BFE_INT * T4.X, T0.W, 0.0, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: ASHR T4.Y, PV.X, literal.x,
; CM-NEXT: LSHR * T0.W, T0.Y, literal.y,
; CM-NEXT: 31(4.344025e-44), 8(1.121039e-44)
; CM-NEXT: BFE_INT * T4.Z, PV.W, 0.0, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: LSHR T5.X, KC0[2].Y, literal.x,
; CM-NEXT: ASHR * T4.W, PV.Z, literal.y,
; CM-NEXT: 2(2.802597e-45), 31(4.344025e-44)
  %load = load <2 x i8>, ptr addrspace(1) %in
  %ext = sext <2 x i8> %load to <2 x i64>
  store <2 x i64> %ext, ptr addrspace(1) %out
  ret void
}
|
|
|
|
; Loads a <4 x i8> vector through the address space 1 pointer %in, zero-extends
; every lane to i64 and stores the resulting <4 x i64> through %out.
; The per-target assertions inside the body are autogenerated (see the NOTE at
; the top of this file); regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
define amdgpu_kernel void @global_zextload_v4i8_to_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GCN-NOHSA-SI-LABEL: global_zextload_v4i8_to_v4i64:
; GCN-NOHSA-SI: ; %bb.0:
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, 0
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v1
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v1
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, v1
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v6, v0, 8, 8
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 24, v0
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xff, v0
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v0, v0, 16, 8
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_zextload_v4i8_to_v4i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: flat_load_dword v0, v[0:1]
; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v5, v1
; GCN-HSA-NEXT: v_mov_b32_e32 v7, v1
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: v_bfe_u32 v6, v0, 8, 8
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 24, v0
; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xff, v0
; GCN-HSA-NEXT: v_bfe_u32 v0, v0, 16, 8
; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; GCN-HSA-NEXT: s_endpgm
;
; GCN-NOHSA-VI-LABEL: global_zextload_v4i8_to_v4i64:
; GCN-NOHSA-VI: ; %bb.0:
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1
; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, v1
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, v1
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 24, v0
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v6, v0, 8, 8
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xff, v0
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v0, v0, 16, 8
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_v4i8_to_v4i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 17, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T7.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T6.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T4.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: MOV * T0.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: BFE_UINT T5.X, T4.X, literal.x, PV.W,
; EG-NEXT: LSHR * T5.Z, T4.X, literal.y,
; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
; EG-NEXT: MOV T5.Y, 0.0,
; EG-NEXT: BFE_UINT * T4.Z, T4.X, literal.x, T0.W,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T4.X, T4.X, literal.x,
; EG-NEXT: MOV T4.Y, 0.0,
; EG-NEXT: MOV T5.W, 0.0,
; EG-NEXT: MOV * T4.W, 0.0,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT: LSHR T6.X, KC0[2].Y, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: LSHR * T7.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: global_zextload_v4i8_to_v4i64:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @6
; CM-NEXT: ALU 18, @9, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5, T7.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T6, T4.X
; CM-NEXT: CF_END
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T4.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 9:
; CM-NEXT: MOV * T0.W, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: BFE_UINT * T5.Z, T4.X, literal.x, PV.W,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: AND_INT T5.X, T4.X, literal.x,
; CM-NEXT: MOV * T5.Y, 0.0,
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; CM-NEXT: BFE_UINT T6.X, T4.X, literal.x, T0.W,
; CM-NEXT: LSHR * T6.Z, T4.X, literal.y,
; CM-NEXT: 16(2.242078e-44), 24(3.363116e-44)
; CM-NEXT: MOV T6.Y, 0.0,
; CM-NEXT: MOV * T5.W, 0.0,
; CM-NEXT: MOV * T6.W, 0.0,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: LSHR * T4.X, PV.W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: LSHR * T7.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %load = load <4 x i8>, ptr addrspace(1) %in
  %ext = zext <4 x i8> %load to <4 x i64>
  store <4 x i64> %ext, ptr addrspace(1) %out
  ret void
}
|
|
|
|
; Loads a <4 x i8> vector through the address space 1 pointer %in, sign-extends
; every lane to i64 and stores the resulting <4 x i64> through %out.
; The per-target assertions inside the body are autogenerated (see the NOTE at
; the top of this file); regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
define amdgpu_kernel void @global_sextload_v4i8_to_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GCN-NOHSA-SI-LABEL: global_sextload_v4i8_to_v4i64:
; GCN-NOHSA-SI: ; %bb.0:
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v4, 24, v0
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 8, v0
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v0, 0, 8
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v2, 0, 8
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v4, 0, 8
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v3, 0, 8
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 31, v6
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_sextload_v4i8_to_v4i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: flat_load_dword v0, v[0:1]
; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 24, v0
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 8, v0
; GCN-HSA-NEXT: v_bfe_i32 v6, v4, 0, 8
; GCN-HSA-NEXT: v_bfe_i32 v4, v3, 0, 8
; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 8
; GCN-HSA-NEXT: v_bfe_i32 v2, v2, 0, 8
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v6
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; GCN-HSA-NEXT: s_endpgm
;
; GCN-NOHSA-VI-LABEL: global_sextload_v4i8_to_v4i64:
; GCN-NOHSA-VI: ; %bb.0:
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v4, 24, v0
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 8, v0
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v4, 0, 8
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v3, 0, 8
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v0, 0, 8
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v2, 0, 8
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 31, v6
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_v4i8_to_v4i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 18, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T7.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T6.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T4.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: BFE_INT T5.X, T4.X, 0.0, literal.x,
; EG-NEXT: ASHR T4.W, T4.X, literal.y,
; EG-NEXT: LSHR * T6.X, KC0[2].Y, literal.z,
; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44)
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: ASHR T5.Y, PV.X, literal.x,
; EG-NEXT: ASHR T4.Z, T4.X, literal.y,
; EG-NEXT: LSHR T0.W, T4.X, literal.z,
; EG-NEXT: LSHR * T1.W, T4.X, literal.w,
; EG-NEXT: 31(4.344025e-44), 24(3.363116e-44)
; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44)
; EG-NEXT: BFE_INT T4.X, PS, 0.0, literal.x,
; EG-NEXT: BFE_INT T5.Z, PV.W, 0.0, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44)
; EG-NEXT: LSHR T7.X, PV.W, literal.x,
; EG-NEXT: ASHR T4.Y, PV.X, literal.y,
; EG-NEXT: ASHR * T5.W, PV.Z, literal.y,
; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44)
;
; CM-LABEL: global_sextload_v4i8_to_v4i64:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @6
; CM-NEXT: ALU 18, @9, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5, T7.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4, T6.X
; CM-NEXT: CF_END
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T4.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 9:
; CM-NEXT: BFE_INT T5.X, T4.X, 0.0, literal.x,
; CM-NEXT: LSHR T0.Y, T4.X, literal.x,
; CM-NEXT: ADD_INT T0.Z, KC0[2].Y, literal.y,
; CM-NEXT: ASHR * T4.W, T4.X, literal.z,
; CM-NEXT: 8(1.121039e-44), 16(2.242078e-44)
; CM-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; CM-NEXT: LSHR T6.X, PV.Z, literal.x,
; CM-NEXT: ASHR T5.Y, PV.X, literal.y,
; CM-NEXT: ASHR T4.Z, T4.X, literal.z,
; CM-NEXT: LSHR * T0.W, T4.X, literal.w,
; CM-NEXT: 2(2.802597e-45), 31(4.344025e-44)
; CM-NEXT: 24(3.363116e-44), 16(2.242078e-44)
; CM-NEXT: BFE_INT T4.X, PV.W, 0.0, literal.x,
; CM-NEXT: BFE_INT * T5.Z, T0.Y, 0.0, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: LSHR T7.X, KC0[2].Y, literal.x,
; CM-NEXT: ASHR T4.Y, PV.X, literal.y,
; CM-NEXT: ASHR * T5.W, PV.Z, literal.y,
; CM-NEXT: 2(2.802597e-45), 31(4.344025e-44)
  %load = load <4 x i8>, ptr addrspace(1) %in
  %ext = sext <4 x i8> %load to <4 x i64>
  store <4 x i64> %ext, ptr addrspace(1) %out
  ret void
}
|
|
|
|
; Loads an <8 x i8> vector through the address space 1 pointer %in, zero-extends
; every lane to i64 and stores the resulting <8 x i64> through %out.
; The per-target assertions inside the body are autogenerated (see the NOTE at
; the top of this file); regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
define amdgpu_kernel void @global_zextload_v8i8_to_v8i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GCN-NOHSA-SI-LABEL: global_zextload_v8i8_to_v8i64:
; GCN-NOHSA-SI: ; %bb.0:
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000
; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx2 v[16:17], off, s[8:11], 0
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, 0
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v1
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v1
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, v1
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, v1
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, v1
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v1
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, v1
; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 24, v16
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 24, v17
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v10, v17, 8, 8
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v14, v16, 8, 8
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xff, v16
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xff, v17
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v4, v16, 16, 8
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v0, v17, 16, 8
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_zextload_v8i8_to_v8i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: flat_load_dwordx2 v[15:16], v[0:1]
; GCN-HSA-NEXT: s_add_u32 s2, s0, 48
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
; GCN-HSA-NEXT: v_mov_b32_e32 v4, v1
; GCN-HSA-NEXT: v_mov_b32_e32 v6, v1
; GCN-HSA-NEXT: v_mov_b32_e32 v8, v1
; GCN-HSA-NEXT: v_mov_b32_e32 v10, v1
; GCN-HSA-NEXT: v_mov_b32_e32 v12, v1
; GCN-HSA-NEXT: v_mov_b32_e32 v14, v1
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 24, v16
; GCN-HSA-NEXT: v_bfe_u32 v0, v16, 16, 8
; GCN-HSA-NEXT: flat_store_dwordx4 v[17:18], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
; GCN-HSA-NEXT: s_add_u32 s0, s0, 32
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v20, s1
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 24, v15
; GCN-HSA-NEXT: v_bfe_u32 v3, v15, 16, 8
; GCN-HSA-NEXT: v_mov_b32_e32 v19, s0
; GCN-HSA-NEXT: v_bfe_u32 v9, v16, 8, 8
; GCN-HSA-NEXT: v_bfe_u32 v13, v15, 8, 8
; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xff, v15
; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff, v16
; GCN-HSA-NEXT: flat_store_dwordx4 v[17:18], v[3:6]
; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[7:10]
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[11:14]
; GCN-HSA-NEXT: s_endpgm
;
; GCN-NOHSA-VI-LABEL: global_zextload_v8i8_to_v8i64:
; GCN-NOHSA-VI: ; %bb.0:
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[16:17], off, s[8:11], 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, v1
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, v1
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, v1
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v1
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v1
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, v1
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 24, v17
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v0, v17, 16, 8
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v6, v17, 8, 8
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 24, v16
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v14, v16, 8, 8
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xff, v16
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v8, v16, 16, 8
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xff, v17
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_v8i8_to_v8i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @8
; EG-NEXT: ALU 34, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T12.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T11.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T10.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T9.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 8:
; EG-NEXT: VTX_READ_64 T5.XY, T5.X, 0, #1
; EG-NEXT: ALU clause starting at 10:
; EG-NEXT: MOV * T5.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 11:
; EG-NEXT: MOV * T0.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: BFE_UINT T6.X, T5.Y, literal.x, PV.W,
; EG-NEXT: LSHR * T6.Z, T5.Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
; EG-NEXT: MOV T6.Y, 0.0,
; EG-NEXT: BFE_UINT * T7.Z, T5.Y, literal.x, T0.W,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T7.X, T5.Y, literal.x,
; EG-NEXT: MOV * T7.Y, 0.0,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT: BFE_UINT T8.X, T5.X, literal.x, T0.W,
; EG-NEXT: LSHR * T8.Z, T5.X, literal.y,
; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
; EG-NEXT: MOV T8.Y, 0.0,
; EG-NEXT: BFE_UINT * T5.Z, T5.X, literal.x, T0.W,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T5.X, T5.X, literal.x,
; EG-NEXT: MOV T5.Y, 0.0,
; EG-NEXT: MOV T6.W, 0.0,
; EG-NEXT: MOV * T7.W, 0.0,
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; EG-NEXT: MOV T8.W, 0.0,
; EG-NEXT: MOV * T5.W, 0.0,
; EG-NEXT: LSHR T9.X, KC0[2].Y, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: LSHR T10.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
; EG-NEXT: LSHR T11.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 48(6.726233e-44)
; EG-NEXT: LSHR * T12.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: global_zextload_v8i8_to_v8i64:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @8
; CM-NEXT: ALU 35, @11, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T6, T12.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T7, T11.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T8, T10.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5, T9.X
; CM-NEXT: CF_END
; CM-NEXT: Fetch clause starting at 8:
; CM-NEXT: VTX_READ_64 T5.XY, T5.X, 0, #1
; CM-NEXT: ALU clause starting at 10:
; CM-NEXT: MOV * T5.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 11:
; CM-NEXT: MOV * T0.W, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: BFE_UINT * T6.Z, T5.X, literal.x, PV.W,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: AND_INT T6.X, T5.X, literal.x,
; CM-NEXT: MOV * T6.Y, 0.0,
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; CM-NEXT: BFE_UINT T7.X, T5.X, literal.x, T0.W,
; CM-NEXT: LSHR * T7.Z, T5.X, literal.y,
; CM-NEXT: 16(2.242078e-44), 24(3.363116e-44)
; CM-NEXT: MOV T7.Y, 0.0,
; CM-NEXT: BFE_UINT * T8.Z, T5.Y, literal.x, T0.W,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: AND_INT T8.X, T5.Y, literal.x,
; CM-NEXT: MOV * T8.Y, 0.0,
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
; CM-NEXT: BFE_UINT T5.X, T5.Y, literal.x, T0.W,
; CM-NEXT: LSHR * T5.Z, T5.Y, literal.y,
; CM-NEXT: 16(2.242078e-44), 24(3.363116e-44)
; CM-NEXT: MOV T5.Y, 0.0,
; CM-NEXT: MOV * T6.W, 0.0,
; CM-NEXT: MOV * T7.W, 0.0,
; CM-NEXT: MOV * T8.W, 0.0,
; CM-NEXT: MOV * T5.W, 0.0,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; CM-NEXT: 48(6.726233e-44), 0(0.000000e+00)
; CM-NEXT: LSHR T9.X, PV.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 32(4.484155e-44)
; CM-NEXT: LSHR T10.X, PV.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: LSHR * T11.X, PV.W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: LSHR * T12.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %load = load <8 x i8>, ptr addrspace(1) %in
  %ext = zext <8 x i8> %load to <8 x i64>
  store <8 x i64> %ext, ptr addrspace(1) %out
  ret void
}
|
|
|
|
define amdgpu_kernel void @global_sextload_v8i8_to_v8i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
|
; GCN-NOHSA-SI-LABEL: global_sextload_v8i8_to_v8i64:
|
|
; GCN-NOHSA-SI: ; %bb.0:
|
|
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s4, v1
|
|
; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s5, v0
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v0, 0, 8
|
|
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s6, s4, 16
|
|
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s8, s5, 16
|
|
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s5, 24
|
|
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s5, 8
|
|
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s4, 8
|
|
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[4:5], 0x80000
|
|
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
|
|
; GCN-NOHSA-SI-NEXT: s_ashr_i32 s15, s4, 31
|
|
; GCN-NOHSA-SI-NEXT: s_ashr_i32 s18, s4, 24
|
|
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[14:15], 0x80000
|
|
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
|
|
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
|
|
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000
|
|
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s18
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s15
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s16
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s17
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s6
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s7
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:48
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s8
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s9
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s10
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s11
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:16
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s12
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s13
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s4
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s5
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:32
|
|
; GCN-NOHSA-SI-NEXT: s_endpgm
|
|
;
|
|
; GCN-HSA-LABEL: global_sextload_v8i8_to_v8i64:
|
|
; GCN-HSA: ; %bb.0:
|
|
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
|
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
|
|
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
|
|
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
|
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN-HSA-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0
|
|
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-HSA-NEXT: v_readfirstlane_b32 s2, v1
|
|
; GCN-HSA-NEXT: v_readfirstlane_b32 s3, v0
|
|
; GCN-HSA-NEXT: s_lshr_b32 s4, s2, 16
|
|
; GCN-HSA-NEXT: s_lshr_b32 s6, s3, 16
|
|
; GCN-HSA-NEXT: s_lshr_b32 s8, s3, 24
|
|
; GCN-HSA-NEXT: s_lshr_b32 s10, s3, 8
|
|
; GCN-HSA-NEXT: s_lshr_b32 s12, s2, 8
|
|
; GCN-HSA-NEXT: s_ashr_i32 s13, s2, 31
|
|
; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 8
|
|
; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[2:3], 0x80000
|
|
; GCN-HSA-NEXT: s_ashr_i32 s16, s2, 24
|
|
; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[12:13], 0x80000
|
|
; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
|
|
; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
|
|
; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000
|
|
; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000
|
|
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
|
|
; GCN-HSA-NEXT: s_add_u32 s4, s0, 48
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5
|
|
; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v6, s16
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v7, s13
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5
|
|
; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
|
|
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s6
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s7
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v14, s8
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s9
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN-HSA-NEXT: s_add_u32 s0, s0, 32
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15]
|
|
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s14
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s15
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
|
|
; GCN-HSA-NEXT: s_endpgm
|
|
;
|
|
; GCN-NOHSA-VI-LABEL: global_sextload_v8i8_to_v8i64:
|
|
; GCN-NOHSA-VI: ; %bb.0:
|
|
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
|
|
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v1
|
|
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v0
|
|
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s4, 16
|
|
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s5, 16
|
|
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s5, 24
|
|
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s5, 8
|
|
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s4, 8
|
|
; GCN-NOHSA-VI-NEXT: s_ashr_i32 s15, s4, 31
|
|
; GCN-NOHSA-VI-NEXT: s_ashr_i32 s18, s4, 24
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v0, 0, 8
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[16:17], s[4:5], 0x80000
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[14:15], 0x80000
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s18
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s15
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s6
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s7
|
|
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s16
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s17
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, s8
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, s9
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, s10
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, s11
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s12
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s13
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s4
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s5
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:48
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
|
|
; GCN-NOHSA-VI-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: global_sextload_v8i8_to_v8i64:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: TEX 0 @8
|
|
; EG-NEXT: ALU 39, @11, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T12.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T9.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T8.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T6.X, 1
|
|
; EG-NEXT: CF_END
|
|
; EG-NEXT: Fetch clause starting at 8:
|
|
; EG-NEXT: VTX_READ_64 T5.XY, T5.X, 0, #1
|
|
; EG-NEXT: ALU clause starting at 10:
|
|
; EG-NEXT: MOV * T5.X, KC0[2].Z,
|
|
; EG-NEXT: ALU clause starting at 11:
|
|
; EG-NEXT: LSHR * T6.X, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T7.X, T5.Y, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44)
|
|
; EG-NEXT: LSHR T8.X, PV.W, literal.x,
|
|
; EG-NEXT: ASHR T7.Y, PV.X, literal.y,
|
|
; EG-NEXT: LSHR T0.W, T5.Y, literal.z,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.w,
|
|
; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44)
|
|
; EG-NEXT: 8(1.121039e-44), 32(4.484155e-44)
|
|
; EG-NEXT: LSHR T9.X, PS, literal.x,
|
|
; EG-NEXT: BFE_INT T7.Z, PV.W, 0.0, literal.y,
|
|
; EG-NEXT: ASHR * T10.W, T5.X, literal.z,
|
|
; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
|
|
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T11.X, T5.X, 0.0, literal.x,
|
|
; EG-NEXT: ASHR T10.Z, T5.X, literal.y,
|
|
; EG-NEXT: LSHR T0.W, T5.X, literal.z,
|
|
; EG-NEXT: ASHR * T5.W, T5.Y, literal.w,
|
|
; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
|
|
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
|
|
; EG-NEXT: BFE_INT T10.X, PV.W, 0.0, literal.x,
|
|
; EG-NEXT: ASHR T11.Y, PV.X, literal.y,
|
|
; EG-NEXT: ASHR T5.Z, T5.Y, literal.z,
|
|
; EG-NEXT: LSHR T0.W, T5.X, literal.x,
|
|
; EG-NEXT: LSHR * T1.W, T5.Y, literal.w,
|
|
; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44)
|
|
; EG-NEXT: 24(3.363116e-44), 16(2.242078e-44)
|
|
; EG-NEXT: BFE_INT T5.X, PS, 0.0, literal.x,
|
|
; EG-NEXT: ASHR T10.Y, PV.X, literal.y,
|
|
; EG-NEXT: BFE_INT T11.Z, PV.W, 0.0, literal.x,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
|
|
; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44)
|
|
; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LSHR T12.X, PV.W, literal.x,
|
|
; EG-NEXT: ASHR T5.Y, PV.X, literal.y,
|
|
; EG-NEXT: ASHR T11.W, PV.Z, literal.y,
|
|
; EG-NEXT: ASHR * T7.W, T7.Z, literal.y,
|
|
; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44)
|
|
;
|
|
; CM-LABEL: global_sextload_v8i8_to_v8i64:
|
|
; CM: ; %bb.0:
|
|
; CM-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: TEX 0 @8
|
|
; CM-NEXT: ALU 39, @11, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T11, T12.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5, T9.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T7, T8.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T10, T6.X
|
|
; CM-NEXT: CF_END
|
|
; CM-NEXT: Fetch clause starting at 8:
|
|
; CM-NEXT: VTX_READ_64 T5.XY, T5.X, 0, #1
|
|
; CM-NEXT: ALU clause starting at 10:
|
|
; CM-NEXT: MOV * T5.X, KC0[2].Z,
|
|
; CM-NEXT: ALU clause starting at 11:
|
|
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; CM-NEXT: 48(6.726233e-44), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR * T6.X, PV.W, literal.x,
|
|
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT T7.X, T5.Y, 0.0, literal.x,
|
|
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; CM-NEXT: 8(1.121039e-44), 32(4.484155e-44)
|
|
; CM-NEXT: LSHR T8.X, PV.W, literal.x,
|
|
; CM-NEXT: ASHR T7.Y, PV.X, literal.y,
|
|
; CM-NEXT: LSHR T0.Z, T5.Y, literal.z,
|
|
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.w,
|
|
; CM-NEXT: 2(2.802597e-45), 31(4.344025e-44)
|
|
; CM-NEXT: 8(1.121039e-44), 16(2.242078e-44)
|
|
; CM-NEXT: LSHR T9.X, PV.W, literal.x,
|
|
; CM-NEXT: LSHR T0.Y, T5.X, literal.y,
|
|
; CM-NEXT: BFE_INT T7.Z, PV.Z, 0.0, literal.y,
|
|
; CM-NEXT: ASHR * T10.W, T5.Y, literal.z,
|
|
; CM-NEXT: 2(2.802597e-45), 8(1.121039e-44)
|
|
; CM-NEXT: 31(4.344025e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT T11.X, T5.X, 0.0, literal.x,
|
|
; CM-NEXT: LSHR T1.Y, T5.Y, literal.y,
|
|
; CM-NEXT: ASHR T10.Z, T5.Y, literal.z,
|
|
; CM-NEXT: ASHR * T5.W, T5.X, literal.w,
|
|
; CM-NEXT: 8(1.121039e-44), 16(2.242078e-44)
|
|
; CM-NEXT: 24(3.363116e-44), 31(4.344025e-44)
|
|
; CM-NEXT: BFE_INT T10.X, PV.Y, 0.0, literal.x,
|
|
; CM-NEXT: ASHR T11.Y, PV.X, literal.y,
|
|
; CM-NEXT: ASHR T5.Z, T5.X, literal.z,
|
|
; CM-NEXT: LSHR * T0.W, T5.X, literal.w,
|
|
; CM-NEXT: 8(1.121039e-44), 31(4.344025e-44)
|
|
; CM-NEXT: 24(3.363116e-44), 16(2.242078e-44)
|
|
; CM-NEXT: BFE_INT T5.X, PV.W, 0.0, literal.x,
|
|
; CM-NEXT: ASHR T10.Y, PV.X, literal.y,
|
|
; CM-NEXT: BFE_INT T11.Z, T0.Y, 0.0, literal.x,
|
|
; CM-NEXT: ASHR * T7.W, T7.Z, literal.y,
|
|
; CM-NEXT: 8(1.121039e-44), 31(4.344025e-44)
|
|
; CM-NEXT: LSHR T12.X, KC0[2].Y, literal.x,
|
|
; CM-NEXT: ASHR T5.Y, PV.X, literal.y,
|
|
; CM-NEXT: ASHR * T11.W, PV.Z, literal.y,
|
|
; CM-NEXT: 2(2.802597e-45), 31(4.344025e-44)
|
|
%load = load <8 x i8>, ptr addrspace(1) %in
|
|
%ext = sext <8 x i8> %load to <8 x i64>
|
|
store <8 x i64> %ext, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @global_zextload_v16i8_to_v16i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
|
; GCN-NOHSA-SI-LABEL: global_zextload_v16i8_to_v16i64:
|
|
; GCN-NOHSA-SI: ; %bb.0:
|
|
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, 0
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 24, v2
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v9, v0, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v13, v1, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v11, 0xff, v1
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v7, 0xff, v0
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v17, v3, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v15, 0xff, v3
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v21, v2, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v19, 0xff, v2
|
|
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v25, 24, v1
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v23, v1, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v29, 24, v0
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v27, v0, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v33, 24, v3
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v31, v3, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v3, v2, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, v4
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v32, v4
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v34, v4
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v28, v4
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v30, v4
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v24, v4
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v26, v4
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, v4
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, v4
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, v4
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, v4
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, v4
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, v4
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, v4
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, v4
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:80
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[31:34], off, s[0:3], 0 offset:112
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:16
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:48
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:64
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:96
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:32
|
|
; GCN-NOHSA-SI-NEXT: s_endpgm
|
|
;
|
|
; GCN-HSA-LABEL: global_zextload_v16i8_to_v16i64:
|
|
; GCN-HSA: ; %bb.0:
|
|
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
|
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
|
|
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
|
|
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v5, 0
|
|
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
|
|
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50
|
|
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
|
|
; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x70
|
|
; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s5
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s4
|
|
; GCN-HSA-NEXT: s_add_u32 s4, s0, 16
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v9, v5
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v11, v5
|
|
; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v12, v5
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v14, v5
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v17, v5
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v7, v5
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v18, v5
|
|
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 24, v3
|
|
; GCN-HSA-NEXT: v_bfe_u32 v8, v3, 16, 8
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[8:11]
|
|
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 24, v0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4
|
|
; GCN-HSA-NEXT: s_add_u32 s4, s0, 48
|
|
; GCN-HSA-NEXT: v_bfe_u32 v11, v0, 16, 8
|
|
; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[11:14]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v15, v5
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s5
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s4
|
|
; GCN-HSA-NEXT: s_add_u32 s4, s0, 64
|
|
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 24, v1
|
|
; GCN-HSA-NEXT: v_bfe_u32 v14, v1, 16, 8
|
|
; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[14:17]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v9, v5
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s5
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v11, v5
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s4
|
|
; GCN-HSA-NEXT: v_bfe_u32 v10, v2, 8, 8
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff, v2
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[8:11]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s3
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s2
|
|
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 24, v2
|
|
; GCN-HSA-NEXT: v_bfe_u32 v4, v2, 16, 8
|
|
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v12, v5
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v14, v5
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v8, v5
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v10, v5
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[4:7]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v16, v5
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
|
|
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v20, s3
|
|
; GCN-HSA-NEXT: s_add_u32 s0, s0, 32
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v19, s2
|
|
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
|
|
; GCN-HSA-NEXT: v_bfe_u32 v13, v3, 8, 8
|
|
; GCN-HSA-NEXT: v_bfe_u32 v9, v0, 8, 8
|
|
; GCN-HSA-NEXT: v_bfe_u32 v17, v1, 8, 8
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v15, 0xff, v1
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff, v0
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xff, v3
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[11:14]
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[7:10]
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[15:18]
|
|
; GCN-HSA-NEXT: s_endpgm
|
|
;
|
|
; GCN-NOHSA-VI-LABEL: global_zextload_v16i8_to_v16i64:
|
|
; GCN-NOHSA-VI: ; %bb.0:
|
|
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
|
|
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v29, 0
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v31, v29
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v25, v29
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v27, v29
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v21, v29
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v23, v29
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, v29
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, v29
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, v29
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, v29
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, v29
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v29
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v29
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, v29
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v30, 24, v3
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v28, v3, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v6, v1, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 24, v0
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v14, v0, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xff, v0
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v8, v0, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, 0xff, v1
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v18, v3, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, 0xff, v3
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v22, v2, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, 0xff, v2
|
|
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v26, 24, v2
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v24, v2, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 24, v1
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v0, v1, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v29
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v29
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:112
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:80
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:96
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0
|
|
; GCN-NOHSA-VI-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: global_zextload_v16i8_to_v16i64:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: TEX 0 @12
|
|
; EG-NEXT: ALU 68, @15, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T22.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T21.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T20.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T19.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T18.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T17.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T16.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T15.X, 1
|
|
; EG-NEXT: CF_END
|
|
; EG-NEXT: Fetch clause starting at 12:
|
|
; EG-NEXT: VTX_READ_128 T7.XYZW, T7.X, 0, #1
|
|
; EG-NEXT: ALU clause starting at 14:
|
|
; EG-NEXT: MOV * T7.X, KC0[2].Z,
|
|
; EG-NEXT: ALU clause starting at 15:
|
|
; EG-NEXT: MOV * T0.W, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_UINT T8.X, T7.W, literal.x, PV.W,
|
|
; EG-NEXT: LSHR * T8.Z, T7.W, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
|
|
; EG-NEXT: MOV T8.Y, 0.0,
|
|
; EG-NEXT: BFE_UINT * T9.Z, T7.W, literal.x, T0.W,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T9.X, T7.W, literal.x,
|
|
; EG-NEXT: MOV * T9.Y, 0.0,
|
|
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_UINT T10.X, T7.Z, literal.x, T0.W,
|
|
; EG-NEXT: LSHR * T10.Z, T7.Z, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
|
|
; EG-NEXT: MOV T10.Y, 0.0,
|
|
; EG-NEXT: BFE_UINT * T11.Z, T7.Z, literal.x, T0.W,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T11.X, T7.Z, literal.x,
|
|
; EG-NEXT: MOV * T11.Y, 0.0,
|
|
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_UINT T12.X, T7.Y, literal.x, T0.W,
|
|
; EG-NEXT: LSHR * T12.Z, T7.Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
|
|
; EG-NEXT: MOV T12.Y, 0.0,
|
|
; EG-NEXT: BFE_UINT * T13.Z, T7.Y, literal.x, T0.W,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T13.X, T7.Y, literal.x,
|
|
; EG-NEXT: MOV * T13.Y, 0.0,
|
|
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_UINT T14.X, T7.X, literal.x, T0.W,
|
|
; EG-NEXT: LSHR * T14.Z, T7.X, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
|
|
; EG-NEXT: MOV T14.Y, 0.0,
|
|
; EG-NEXT: BFE_UINT * T7.Z, T7.X, literal.x, T0.W,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T7.X, T7.X, literal.x,
|
|
; EG-NEXT: MOV T7.Y, 0.0,
|
|
; EG-NEXT: MOV T8.W, 0.0,
|
|
; EG-NEXT: MOV * T9.W, 0.0,
|
|
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
|
|
; EG-NEXT: MOV T10.W, 0.0,
|
|
; EG-NEXT: MOV * T11.W, 0.0,
|
|
; EG-NEXT: MOV T12.W, 0.0,
|
|
; EG-NEXT: MOV * T13.W, 0.0,
|
|
; EG-NEXT: MOV T14.W, 0.0,
|
|
; EG-NEXT: MOV * T7.W, 0.0,
|
|
; EG-NEXT: LSHR T15.X, KC0[2].Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; EG-NEXT: LSHR T16.X, PV.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
|
|
; EG-NEXT: LSHR T17.X, PV.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 2(2.802597e-45), 48(6.726233e-44)
|
|
; EG-NEXT: LSHR T18.X, PV.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 2(2.802597e-45), 64(8.968310e-44)
|
|
; EG-NEXT: LSHR T19.X, PV.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43)
|
|
; EG-NEXT: LSHR T20.X, PV.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 2(2.802597e-45), 96(1.345247e-43)
|
|
; EG-NEXT: LSHR T21.X, PV.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 2(2.802597e-45), 112(1.569454e-43)
|
|
; EG-NEXT: LSHR * T22.X, PV.W, literal.x,
|
|
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
;
|
|
; CM-LABEL: global_zextload_v16i8_to_v16i64:
|
|
; CM: ; %bb.0:
|
|
; CM-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: TEX 0 @12
|
|
; CM-NEXT: ALU 69, @15, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T8, T22.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T9, T21.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T10, T20.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T11, T19.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T12, T18.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T7, T17.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T13, T16.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T14, T15.X
|
|
; CM-NEXT: CF_END
|
|
; CM-NEXT: Fetch clause starting at 12:
|
|
; CM-NEXT: VTX_READ_128 T7.XYZW, T7.X, 0, #1
|
|
; CM-NEXT: ALU clause starting at 14:
|
|
; CM-NEXT: MOV * T7.X, KC0[2].Z,
|
|
; CM-NEXT: ALU clause starting at 15:
|
|
; CM-NEXT: MOV * T0.W, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_UINT * T8.Z, T7.X, literal.x, PV.W,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T8.X, T7.X, literal.x,
|
|
; CM-NEXT: MOV * T8.Y, 0.0,
|
|
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_UINT T9.X, T7.X, literal.x, T0.W,
|
|
; CM-NEXT: LSHR * T9.Z, T7.X, literal.y,
|
|
; CM-NEXT: 16(2.242078e-44), 24(3.363116e-44)
|
|
; CM-NEXT: MOV T9.Y, 0.0,
|
|
; CM-NEXT: BFE_UINT * T10.Z, T7.Y, literal.x, T0.W,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T10.X, T7.Y, literal.x,
|
|
; CM-NEXT: MOV * T10.Y, 0.0,
|
|
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_UINT T11.X, T7.Y, literal.x, T0.W,
|
|
; CM-NEXT: LSHR * T11.Z, T7.Y, literal.y,
|
|
; CM-NEXT: 16(2.242078e-44), 24(3.363116e-44)
|
|
; CM-NEXT: MOV T11.Y, 0.0,
|
|
; CM-NEXT: BFE_UINT * T12.Z, T7.Z, literal.x, T0.W,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T12.X, T7.Z, literal.x,
|
|
; CM-NEXT: MOV * T12.Y, 0.0,
|
|
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_UINT T7.X, T7.Z, literal.x, T0.W,
|
|
; CM-NEXT: LSHR * T7.Z, T7.Z, literal.y,
|
|
; CM-NEXT: 16(2.242078e-44), 24(3.363116e-44)
|
|
; CM-NEXT: MOV T7.Y, 0.0,
|
|
; CM-NEXT: BFE_UINT * T13.Z, T7.W, literal.x, T0.W,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T13.X, T7.W, literal.x,
|
|
; CM-NEXT: MOV * T13.Y, 0.0,
|
|
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_UINT T14.X, T7.W, literal.x, T0.W,
|
|
; CM-NEXT: LSHR * T14.Z, T7.W, literal.y,
|
|
; CM-NEXT: 16(2.242078e-44), 24(3.363116e-44)
|
|
; CM-NEXT: MOV T14.Y, 0.0,
|
|
; CM-NEXT: MOV * T8.W, 0.0,
|
|
; CM-NEXT: MOV * T9.W, 0.0,
|
|
; CM-NEXT: MOV * T10.W, 0.0,
|
|
; CM-NEXT: MOV * T11.W, 0.0,
|
|
; CM-NEXT: MOV * T12.W, 0.0,
|
|
; CM-NEXT: MOV * T7.W, 0.0,
|
|
; CM-NEXT: MOV * T13.W, 0.0,
|
|
; CM-NEXT: MOV * T14.W, 0.0,
|
|
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; CM-NEXT: 112(1.569454e-43), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR T15.X, PV.W, literal.x,
|
|
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; CM-NEXT: 2(2.802597e-45), 96(1.345247e-43)
|
|
; CM-NEXT: LSHR T16.X, PV.W, literal.x,
|
|
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; CM-NEXT: 2(2.802597e-45), 80(1.121039e-43)
|
|
; CM-NEXT: LSHR T17.X, PV.W, literal.x,
|
|
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; CM-NEXT: 2(2.802597e-45), 64(8.968310e-44)
|
|
; CM-NEXT: LSHR T18.X, PV.W, literal.x,
|
|
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; CM-NEXT: 2(2.802597e-45), 48(6.726233e-44)
|
|
; CM-NEXT: LSHR T19.X, PV.W, literal.x,
|
|
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; CM-NEXT: 2(2.802597e-45), 32(4.484155e-44)
|
|
; CM-NEXT: LSHR T20.X, PV.W, literal.x,
|
|
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; CM-NEXT: LSHR * T21.X, PV.W, literal.x,
|
|
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR * T22.X, KC0[2].Y, literal.x,
|
|
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
%load = load <16 x i8>, ptr addrspace(1) %in
|
|
%ext = zext <16 x i8> %load to <16 x i64>
|
|
store <16 x i64> %ext, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
; Sign-extending load test: reads a <16 x i8> vector from %in, sign-extends
; every element to i64, and stores the resulting <16 x i64> vector to %out.
; Both pointers are in addrspace(1).
;
; The assertions inside the function body are autogenerated by
; utils/update_llc_test_checks.py (see the NOTE at the top of this file).
; Regenerate them with that script instead of editing them by hand. There is
; one group of assertions per check prefix used by the RUN lines
; (GCN-NOHSA-SI, GCN-HSA, GCN-NOHSA-VI, EG and CM), separated by lone ';' lines.
define amdgpu_kernel void @global_sextload_v16i8_to_v16i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GCN-NOHSA-SI-LABEL: global_sextload_v16i8_to_v16i64:
; GCN-NOHSA-SI: ; %bb.0:
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000
; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s4, v2
; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s5, v3
; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s6, v0
; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s7, v1
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s12, s4, 16
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s4, 24
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s4, 8
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s6, 16
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s6, 24
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s22, s6, 8
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s5, 16
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s5, 8
; GCN-NOHSA-SI-NEXT: s_mov_b32 s26, s5
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s7, 16
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s7, 8
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s7
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x80000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[4:5], 0x80000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
; GCN-NOHSA-SI-NEXT: s_ashr_i32 s29, s7, 31
; GCN-NOHSA-SI-NEXT: s_ashr_i32 s31, s7, 24
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000
; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s5, 31
; GCN-NOHSA-SI-NEXT: s_ashr_i32 s38, s5, 24
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[30:31], 0x80000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[28:29], 0x80000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s36
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s37
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s34
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s35
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s38
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s33
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s26
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s27
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s12
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s13
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s14
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s15
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:80
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s31
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s29
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s16
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s17
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s18
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s19
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s20
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s21
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s9
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s22
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s23
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s24
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s25
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:112
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s10
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s11
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:96
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s6
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s7
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s4
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s5
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_sextload_v16i8_to_v16i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: v_readfirstlane_b32 s2, v2
; GCN-HSA-NEXT: v_readfirstlane_b32 s3, v3
; GCN-HSA-NEXT: v_readfirstlane_b32 s4, v0
; GCN-HSA-NEXT: v_readfirstlane_b32 s5, v1
; GCN-HSA-NEXT: s_lshr_b32 s6, s2, 16
; GCN-HSA-NEXT: s_lshr_b32 s8, s2, 24
; GCN-HSA-NEXT: s_lshr_b32 s10, s2, 8
; GCN-HSA-NEXT: s_lshr_b32 s18, s3, 16
; GCN-HSA-NEXT: s_lshr_b32 s20, s3, 8
; GCN-HSA-NEXT: s_mov_b32 s22, s3
; GCN-HSA-NEXT: s_ashr_i32 s7, s3, 31
; GCN-HSA-NEXT: s_ashr_i32 s9, s3, 24
; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000
; GCN-HSA-NEXT: s_lshr_b32 s12, s4, 16
; GCN-HSA-NEXT: s_lshr_b32 s14, s4, 24
; GCN-HSA-NEXT: s_lshr_b32 s16, s4, 8
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x80000
; GCN-HSA-NEXT: s_ashr_i32 s4, s5, 24
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3
; GCN-HSA-NEXT: s_lshr_b32 s2, s5, 16
; GCN-HSA-NEXT: s_ashr_i32 s3, s5, 31
; GCN-HSA-NEXT: v_mov_b32_e32 v10, s4
; GCN-HSA-NEXT: s_lshr_b32 s4, s5, 8
; GCN-HSA-NEXT: s_mov_b32 s24, s5
; GCN-HSA-NEXT: v_mov_b32_e32 v6, s9
; GCN-HSA-NEXT: v_mov_b32_e32 v7, s7
; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3
; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000
; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000
; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000
; GCN-HSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000
; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s6
; GCN-HSA-NEXT: s_add_u32 s6, s0, 0x50
; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[2:5]
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s7
; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7
; GCN-HSA-NEXT: s_add_u32 s6, s0, 64
; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s7
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s6
; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v14, s8
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s9
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[12:15]
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s12
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s13
; GCN-HSA-NEXT: v_mov_b32_e32 v14, s14
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s15
; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s18
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s19
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[4:7]
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s22
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s23
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s20
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s21
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 48
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: s_add_u32 s0, s0, 32
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s24
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s25
; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15]
; GCN-HSA-NEXT: s_endpgm
;
; GCN-NOHSA-VI-LABEL: global_sextload_v16i8_to_v16i64:
; GCN-NOHSA-VI: ; %bb.0:
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v2
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s6, v0
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s4, 16
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s4, 24
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v3
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s7, v1
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s4, 8
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s20, s6, 16
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s22, s6, 24
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s24, s6, 8
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s26, s5, 16
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s5, 8
; GCN-NOHSA-VI-NEXT: s_mov_b32 s12, s5
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s7, 16
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s28, s7, 8
; GCN-NOHSA-VI-NEXT: s_mov_b32 s30, s7
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x80000
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[36:37], s[4:5], 0x80000
; GCN-NOHSA-VI-NEXT: s_ashr_i32 s29, s7, 31
; GCN-NOHSA-VI-NEXT: s_ashr_i32 s31, s7, 24
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s14
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s15
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s16
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s17
; GCN-NOHSA-VI-NEXT: s_ashr_i32 s33, s5, 31
; GCN-NOHSA-VI-NEXT: s_ashr_i32 s38, s5, 24
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[30:31], 0x80000
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[28:29], 0x80000
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s36
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s37
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s34
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s35
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s18
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s19
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:80
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v16, s20
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, s21
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v18, s22
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, s23
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s24
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s25
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s38
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s33
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, s31
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, s29
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s26
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s27
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v12, s8
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s13
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s11
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, s9
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16
; GCN-NOHSA-VI-NEXT: s_nop 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v16, s6
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, s7
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v18, s4
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, s5
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_v16i8_to_v16i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @12
; EG-NEXT: ALU 78, @15, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T22.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T16.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T15.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T12.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T11.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T10.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T9.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T18.XYZW, T8.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 12:
; EG-NEXT: VTX_READ_128 T7.XYZW, T7.X, 0, #1
; EG-NEXT: ALU clause starting at 14:
; EG-NEXT: MOV * T7.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 15:
; EG-NEXT: LSHR T8.X, KC0[2].Y, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: LSHR T9.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
; EG-NEXT: LSHR T10.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 48(6.726233e-44)
; EG-NEXT: LSHR T11.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 64(8.968310e-44)
; EG-NEXT: LSHR * T12.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: BFE_INT * T13.X, T7.W, 0.0, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T14.X, T7.Y, 0.0, literal.x,
; EG-NEXT: ASHR T13.Y, PV.X, literal.y,
; EG-NEXT: LSHR T0.W, T7.W, literal.x,
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44)
; EG-NEXT: 80(1.121039e-43), 0(0.000000e+00)
; EG-NEXT: LSHR T15.X, PS, literal.x,
; EG-NEXT: ASHR T14.Y, PV.X, literal.y,
; EG-NEXT: BFE_INT T13.Z, PV.W, 0.0, literal.z,
; EG-NEXT: LSHR T0.W, T7.Y, literal.z,
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.w,
; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44)
; EG-NEXT: 8(1.121039e-44), 96(1.345247e-43)
; EG-NEXT: LSHR T16.X, PS, literal.x,
; EG-NEXT: BFE_INT T14.Z, PV.W, 0.0, literal.y,
; EG-NEXT: ASHR * T17.W, T7.X, literal.z,
; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T18.X, T7.X, 0.0, literal.x,
; EG-NEXT: ASHR T17.Z, T7.X, literal.y,
; EG-NEXT: LSHR T0.W, T7.X, literal.z,
; EG-NEXT: ASHR * T19.W, T7.Y, literal.w,
; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; EG-NEXT: BFE_INT T17.X, PV.W, 0.0, literal.x,
; EG-NEXT: ASHR T18.Y, PV.X, literal.y,
; EG-NEXT: ASHR T19.Z, T7.Y, literal.z,
; EG-NEXT: LSHR T0.W, T7.X, literal.x,
; EG-NEXT: LSHR * T1.W, T7.Y, literal.w,
; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44)
; EG-NEXT: 24(3.363116e-44), 16(2.242078e-44)
; EG-NEXT: BFE_INT T19.X, PS, 0.0, literal.x,
; EG-NEXT: ASHR T17.Y, PV.X, literal.y,
; EG-NEXT: BFE_INT T18.Z, PV.W, 0.0, literal.x,
; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.z,
; EG-NEXT: ASHR * T20.W, T7.Z, literal.y,
; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44)
; EG-NEXT: 112(1.569454e-43), 0(0.000000e+00)
; EG-NEXT: BFE_INT T7.X, T7.Z, 0.0, literal.x,
; EG-NEXT: ASHR T19.Y, PV.X, literal.y,
; EG-NEXT: ASHR T20.Z, T7.Z, literal.z,
; EG-NEXT: LSHR T1.W, T7.Z, literal.w,
; EG-NEXT: ASHR * T21.W, T7.W, literal.y,
; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44)
; EG-NEXT: 24(3.363116e-44), 16(2.242078e-44)
; EG-NEXT: BFE_INT T20.X, PV.W, 0.0, literal.x,
; EG-NEXT: ASHR T7.Y, PV.X, literal.y,
; EG-NEXT: ASHR T21.Z, T7.W, literal.z,
; EG-NEXT: LSHR T1.W, T7.Z, literal.x,
; EG-NEXT: LSHR * T2.W, T7.W, literal.w,
; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44)
; EG-NEXT: 24(3.363116e-44), 16(2.242078e-44)
; EG-NEXT: BFE_INT T21.X, PS, 0.0, literal.x,
; EG-NEXT: ASHR T20.Y, PV.X, literal.y,
; EG-NEXT: BFE_INT T7.Z, PV.W, 0.0, literal.x,
; EG-NEXT: ASHR T18.W, T18.Z, literal.y,
; EG-NEXT: ASHR * T14.W, T14.Z, literal.y,
; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44)
; EG-NEXT: LSHR T22.X, T0.W, literal.x,
; EG-NEXT: ASHR T21.Y, PV.X, literal.y,
; EG-NEXT: ASHR T7.W, PV.Z, literal.y,
; EG-NEXT: ASHR * T13.W, T13.Z, literal.y,
; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44)
;
; CM-LABEL: global_sextload_v16i8_to_v16i64:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @12
; CM-NEXT: ALU 79, @15, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T21, T22.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T7, T16.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T13, T15.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T20, T12.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T18, T11.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T19, T10.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T14, T9.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T17, T8.X
; CM-NEXT: CF_END
; CM-NEXT: Fetch clause starting at 12:
; CM-NEXT: VTX_READ_128 T7.XYZW, T7.X, 0, #1
; CM-NEXT: ALU clause starting at 14:
; CM-NEXT: MOV * T7.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 15:
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; CM-NEXT: 112(1.569454e-43), 0(0.000000e+00)
; CM-NEXT: LSHR T8.X, PV.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 96(1.345247e-43)
; CM-NEXT: LSHR T9.X, PV.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 80(1.121039e-43)
; CM-NEXT: LSHR T10.X, PV.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 64(8.968310e-44)
; CM-NEXT: LSHR T11.X, PV.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 48(6.726233e-44)
; CM-NEXT: LSHR T12.X, PV.W, literal.x,
; CM-NEXT: LSHR T0.Z, T7.X, literal.y,
; CM-NEXT: LSHR * T0.W, T7.X, literal.z,
; CM-NEXT: 2(2.802597e-45), 8(1.121039e-44)
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: BFE_INT T13.X, T7.Y, 0.0, literal.x,
; CM-NEXT: LSHR T0.Y, T7.Y, literal.y,
; CM-NEXT: LSHR T1.Z, T7.Z, literal.x,
; CM-NEXT: LSHR * T1.W, T7.W, literal.x,
; CM-NEXT: 8(1.121039e-44), 16(2.242078e-44)
; CM-NEXT: BFE_INT T14.X, T7.W, 0.0, literal.x,
; CM-NEXT: ASHR T13.Y, PV.X, literal.y,
; CM-NEXT: LSHR T2.Z, T7.Y, literal.x,
; CM-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.z,
; CM-NEXT: 8(1.121039e-44), 31(4.344025e-44)
; CM-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; CM-NEXT: LSHR T15.X, PV.W, literal.x,
; CM-NEXT: ASHR T14.Y, PV.X, literal.y,
; CM-NEXT: BFE_INT T13.Z, PV.Z, 0.0, literal.z,
; CM-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.w,
; CM-NEXT: 2(2.802597e-45), 31(4.344025e-44)
; CM-NEXT: 8(1.121039e-44), 16(2.242078e-44)
; CM-NEXT: LSHR T16.X, PV.W, literal.x,
; CM-NEXT: LSHR T1.Y, T7.Z, literal.y,
; CM-NEXT: BFE_INT T14.Z, T1.W, 0.0, literal.z,
; CM-NEXT: ASHR * T17.W, T7.W, literal.w, BS:VEC_120/SCL_212
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: 8(1.121039e-44), 31(4.344025e-44)
; CM-NEXT: BFE_INT T18.X, T7.Z, 0.0, literal.x,
; CM-NEXT: LSHR T2.Y, T7.W, literal.y,
; CM-NEXT: ASHR T17.Z, T7.W, literal.z,
; CM-NEXT: ASHR * T19.W, T7.Z, literal.w,
; CM-NEXT: 8(1.121039e-44), 16(2.242078e-44)
; CM-NEXT: 24(3.363116e-44), 31(4.344025e-44)
; CM-NEXT: BFE_INT T17.X, PV.Y, 0.0, literal.x,
; CM-NEXT: ASHR T18.Y, PV.X, literal.y,
; CM-NEXT: ASHR T19.Z, T7.Z, literal.z,
; CM-NEXT: ASHR * T14.W, T14.Z, literal.y, BS:VEC_120/SCL_212
; CM-NEXT: 8(1.121039e-44), 31(4.344025e-44)
; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
; CM-NEXT: BFE_INT T19.X, T1.Y, 0.0, literal.x,
; CM-NEXT: ASHR T17.Y, PV.X, literal.y,
; CM-NEXT: BFE_INT T18.Z, T1.Z, 0.0, literal.x,
; CM-NEXT: ASHR * T20.W, T7.Y, literal.y, BS:VEC_120/SCL_212
; CM-NEXT: 8(1.121039e-44), 31(4.344025e-44)
; CM-NEXT: BFE_INT T21.X, T7.X, 0.0, literal.x,
; CM-NEXT: ASHR T19.Y, PV.X, literal.y,
; CM-NEXT: ASHR T20.Z, T7.Y, literal.z,
; CM-NEXT: ASHR * T7.W, T7.X, literal.y,
; CM-NEXT: 8(1.121039e-44), 31(4.344025e-44)
; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
; CM-NEXT: BFE_INT T20.X, T0.Y, 0.0, literal.x,
; CM-NEXT: ASHR T21.Y, PV.X, literal.y,
; CM-NEXT: ASHR T7.Z, T7.X, literal.z,
; CM-NEXT: ASHR * T18.W, T18.Z, literal.y,
; CM-NEXT: 8(1.121039e-44), 31(4.344025e-44)
; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
; CM-NEXT: BFE_INT T7.X, T0.W, 0.0, literal.x,
; CM-NEXT: ASHR T20.Y, PV.X, literal.y,
; CM-NEXT: BFE_INT T21.Z, T0.Z, 0.0, literal.x,
; CM-NEXT: ASHR * T13.W, T13.Z, literal.y, BS:VEC_120/SCL_212
; CM-NEXT: 8(1.121039e-44), 31(4.344025e-44)
; CM-NEXT: LSHR T22.X, KC0[2].Y, literal.x,
; CM-NEXT: ASHR T7.Y, PV.X, literal.y,
; CM-NEXT: ASHR * T21.W, PV.Z, literal.y,
; CM-NEXT: 2(2.802597e-45), 31(4.344025e-44)
  %load = load <16 x i8>, ptr addrspace(1) %in
  %ext = sext <16 x i8> %load to <16 x i64>
  store <16 x i64> %ext, ptr addrspace(1) %out
  ret void
}
|
|
|
|
define amdgpu_kernel void @global_zextload_v32i8_to_v32i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
|
; GCN-NOHSA-SI-LABEL: global_zextload_v32i8_to_v32i64:
|
|
; GCN-NOHSA-SI: ; %bb.0:
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s14, -1
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s15, 0xe8f000
|
|
; GCN-NOHSA-SI-NEXT: s_add_u32 s12, s12, s11
|
|
; GCN-NOHSA-SI-NEXT: s_addc_u32 s13, s13, 0
|
|
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[10:13], off, s[8:11], 0
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, 0
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[17:20], off, s[8:11], 0 offset:16
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1)
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v2, v12, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v3, v11, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v6, v10, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xff, v10
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dword v4, off, s[12:15], 0 ; 4-byte Folded Spill
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v1, 0xff, v11
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v14, 0xff, v12
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, v2
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v55, v13, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v53, 0xff, v13
|
|
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v27, 24, v10
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v25, v10, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v31, 24, v11
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v29, v11, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v35, 24, v12
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v33, v12, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v39, 24, v13
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v37, v13, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(8)
|
|
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v10, 24, v20
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v43, v17, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v41, 0xff, v17
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v47, v18, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v45, 0xff, v18
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v51, v19, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v49, 0xff, v19
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v23, v20, 8, 8
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v21, 0xff, v20
|
|
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v59, 24, v17
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v57, v17, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(5)
|
|
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 24, v18
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v4, v18, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 24, v19
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v0, v19, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: v_bfe_u32 v8, v20, 16, 8
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, v9
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, v9
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v9
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v9
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, v9
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v58, v9
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v60, v9
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v38, v9
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v40, v9
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v34, v9
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v36, v9
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v30, v9
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v32, v9
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v26, v9
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v28, v9
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, v9
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v24, v9
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v50, v9
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v52, v9
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v46, v9
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v48, v9
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v42, v9
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v44, v9
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v54, v9
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v56, v9
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dword v53, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dword v54, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dword v55, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dword v56, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, v9
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, v9
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dword v14, off, s[12:15], 0 offset:48 ; 4-byte Folded Spill
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dword v15, off, s[12:15], 0 offset:52 ; 4-byte Folded Spill
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dword v16, off, s[12:15], 0 offset:56 ; 4-byte Folded Spill
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dword v17, off, s[12:15], 0 offset:60 ; 4-byte Folded Spill
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_dword v12, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_dword v13, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(3)
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_dword v14, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(2)
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_dword v15, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1) expcnt(1)
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, v14
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, v12
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, v9
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_dword v53, off, s[12:15], 0 ; 4-byte Folded Reload
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_dword v54, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_dword v55, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_dword v56, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(2)
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v54, v9
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v56, v9
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:240
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:176
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[57:60], off, s[0:3], 0 offset:144
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[37:40], off, s[0:3], 0 offset:112
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[33:36], off, s[0:3], 0 offset:80
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[29:32], off, s[0:3], 0 offset:48
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[25:28], off, s[0:3], 0 offset:16
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:224
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[49:52], off, s[0:3], 0 offset:192
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[45:48], off, s[0:3], 0 offset:160
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[41:44], off, s[0:3], 0 offset:128
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:36 ; 4-byte Folded Reload
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:40 ; 4-byte Folded Reload
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:44 ; 4-byte Folded Reload
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:48 ; 4-byte Folded Reload
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:52 ; 4-byte Folded Reload
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:56 ; 4-byte Folded Reload
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:60 ; 4-byte Folded Reload
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:32
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[53:56], off, s[0:3], 0
|
|
; GCN-NOHSA-SI-NEXT: s_endpgm
|
|
;
|
|
; GCN-HSA-LABEL: global_zextload_v32i8_to_v32i64:
|
|
; GCN-HSA: ; %bb.0:
|
|
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
|
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
|
|
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
|
|
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
|
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN-HSA-NEXT: flat_load_dwordx4 v[2:5], v[0:1]
|
|
; GCN-HSA-NEXT: s_add_u32 s2, s2, 16
|
|
; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN-HSA-NEXT: flat_load_dwordx4 v[6:9], v[0:1]
|
|
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70
|
|
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
|
|
; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x50
|
|
; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4
|
|
; GCN-HSA-NEXT: s_add_u32 s4, s0, 48
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v11, v1
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v13, v1
|
|
; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v17, v1
|
|
; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
|
|
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 24, v4
|
|
; GCN-HSA-NEXT: v_bfe_u32 v10, v4, 16, 8
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4
|
|
; GCN-HSA-NEXT: s_add_u32 s4, s0, 16
|
|
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 24, v3
|
|
; GCN-HSA-NEXT: v_bfe_u32 v10, v3, 16, 8
|
|
; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4
|
|
; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xf0
|
|
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 24, v2
|
|
; GCN-HSA-NEXT: v_bfe_u32 v10, v2, 16, 8
|
|
; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4
|
|
; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xd0
|
|
; GCN-HSA-NEXT: s_waitcnt vmcnt(3)
|
|
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 24, v9
|
|
; GCN-HSA-NEXT: v_bfe_u32 v10, v9, 16, 8
|
|
; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4
|
|
; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xb0
|
|
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 24, v8
|
|
; GCN-HSA-NEXT: v_bfe_u32 v10, v8, 16, 8
|
|
; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4
|
|
; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x90
|
|
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 24, v7
|
|
; GCN-HSA-NEXT: v_bfe_u32 v10, v7, 16, 8
|
|
; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4
|
|
; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x60
|
|
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 24, v6
|
|
; GCN-HSA-NEXT: v_bfe_u32 v10, v6, 16, 8
|
|
; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4
|
|
; GCN-HSA-NEXT: s_add_u32 s4, s0, 64
|
|
; GCN-HSA-NEXT: v_bfe_u32 v12, v5, 8, 8
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xff, v5
|
|
; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4
|
|
; GCN-HSA-NEXT: s_add_u32 s4, s0, 32
|
|
; GCN-HSA-NEXT: v_bfe_u32 v12, v4, 8, 8
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xff, v4
|
|
; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s5
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v14, s4
|
|
; GCN-HSA-NEXT: v_bfe_u32 v12, v3, 8, 8
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xff, v3
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[10:13]
|
|
; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xe0
|
|
; GCN-HSA-NEXT: v_bfe_u32 v12, v2, 8, 8
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xff, v2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[10:13]
|
|
; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN-HSA-NEXT: v_bfe_u32 v11, v9, 8, 8
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xff, v9
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v10, v1
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v12, v1
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[9:12]
|
|
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 24, v5
|
|
; GCN-HSA-NEXT: v_bfe_u32 v0, v5, 16, 8
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
|
|
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
|
|
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3
|
|
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0
|
|
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
|
|
; GCN-HSA-NEXT: v_bfe_u32 v16, v8, 8, 8
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xff, v8
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v15, v1
|
|
; GCN-HSA-NEXT: s_add_u32 s0, s0, 0x80
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[14:17]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v8, v1
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v14, v1
|
|
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN-HSA-NEXT: v_bfe_u32 v9, v7, 8, 8
|
|
; GCN-HSA-NEXT: v_bfe_u32 v13, v6, 8, 8
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xff, v6
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff, v7
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[7:10]
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[11:14]
|
|
; GCN-HSA-NEXT: s_endpgm
|
|
;
|
|
; GCN-NOHSA-VI-LABEL: global_zextload_v32i8_to_v32i64:
|
|
; GCN-NOHSA-VI: ; %bb.0:
|
|
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
|
|
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[10:13], off, s[8:11], 0
|
|
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[14:17], off, s[8:11], 0 offset:16
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v53, 0
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v49, v53
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v51, v53
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v46, v53
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v43, v53
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v40, v53
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v37, v53
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v30, v53
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v32, v53
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v34, v53
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v27, v53
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v23, v53
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v25, v53
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v55, v53
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, v53
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v21, v53
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, v53
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, v53
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, v53
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v53
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1)
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v9, v11, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v50, 24, v15
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v48, v15, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v47, 24, v16
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v45, v16, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:176
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v44, v16, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v48, v53
|
|
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v42, 0xff, v16
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[45:48], off, s[0:3], 0 offset:208
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v41, v15, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v45, v53
|
|
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v39, 0xff, v15
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[42:45], off, s[0:3], 0 offset:192
|
|
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v38, 24, v14
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v42, v53
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v36, v14, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[39:42], off, s[0:3], 0 offset:160
|
|
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 24, v10
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v39, v53
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v2, v10, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xff, v10
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v3, v10, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v7, 0xff, v11
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v20, v12, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v18, 0xff, v12
|
|
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v24, 24, v12
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v22, v12, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v28, v13, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v26, 0xff, v13
|
|
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v12, 24, v11
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v10, v11, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v31, 24, v13
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v29, v13, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v35, v14, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v33, 0xff, v14
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v16, v17, 8, 8
|
|
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v14, 0xff, v17
|
|
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v54, 24, v17
|
|
; GCN-NOHSA-VI-NEXT: v_bfe_u32 v52, v17, 16, 8
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v53
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v53
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, v53
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, v53
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:144
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[29:32], off, s[0:3], 0 offset:112
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v36, v53
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:48
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:224
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v29, v53
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, v53
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[33:36], off, s[0:3], 0 offset:128
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[26:29], off, s[0:3], 0 offset:96
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:80
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:240
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:64
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:32
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:16
|
|
; GCN-NOHSA-VI-NEXT: s_nop 0
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v53
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
|
; GCN-NOHSA-VI-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: global_zextload_v32i8_to_v32i64:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: ALU 0, @26, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: TEX 1 @22
|
|
; EG-NEXT: ALU 103, @27, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: ALU 33, @131, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T42.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T41.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T40.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T16.XYZW, T39.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T38.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T18.XYZW, T37.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T36.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T35.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T34.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T33.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T32.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T31.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T30.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T29.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T26.XYZW, T28.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T27.X, 1
|
|
; EG-NEXT: CF_END
|
|
; EG-NEXT: PAD
|
|
; EG-NEXT: Fetch clause starting at 22:
|
|
; EG-NEXT: VTX_READ_128 T12.XYZW, T11.X, 0, #1
|
|
; EG-NEXT: VTX_READ_128 T11.XYZW, T11.X, 16, #1
|
|
; EG-NEXT: ALU clause starting at 26:
|
|
; EG-NEXT: MOV * T11.X, KC0[2].Z,
|
|
; EG-NEXT: ALU clause starting at 27:
|
|
; EG-NEXT: MOV * T0.W, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_UINT T13.X, T11.W, literal.x, PV.W,
|
|
; EG-NEXT: LSHR * T13.Z, T11.W, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
|
|
; EG-NEXT: MOV T13.Y, 0.0,
|
|
; EG-NEXT: BFE_UINT * T14.Z, T11.W, literal.x, T0.W,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T14.X, T11.W, literal.x,
|
|
; EG-NEXT: MOV * T14.Y, 0.0,
|
|
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_UINT T15.X, T11.Z, literal.x, T0.W,
|
|
; EG-NEXT: LSHR * T15.Z, T11.Z, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
|
|
; EG-NEXT: MOV T15.Y, 0.0,
|
|
; EG-NEXT: BFE_UINT * T16.Z, T11.Z, literal.x, T0.W,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T16.X, T11.Z, literal.x,
|
|
; EG-NEXT: MOV * T16.Y, 0.0,
|
|
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_UINT T17.X, T11.Y, literal.x, T0.W,
|
|
; EG-NEXT: LSHR * T17.Z, T11.Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
|
|
; EG-NEXT: MOV T17.Y, 0.0,
|
|
; EG-NEXT: BFE_UINT * T18.Z, T11.Y, literal.x, T0.W,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T18.X, T11.Y, literal.x,
|
|
; EG-NEXT: MOV * T18.Y, 0.0,
|
|
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_UINT T19.X, T11.X, literal.x, T0.W,
|
|
; EG-NEXT: LSHR * T19.Z, T11.X, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
|
|
; EG-NEXT: MOV T19.Y, 0.0,
|
|
; EG-NEXT: BFE_UINT * T11.Z, T11.X, literal.x, T0.W,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T11.X, T11.X, literal.x,
|
|
; EG-NEXT: MOV * T11.Y, 0.0,
|
|
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_UINT T20.X, T12.W, literal.x, T0.W,
|
|
; EG-NEXT: LSHR * T20.Z, T12.W, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
|
|
; EG-NEXT: MOV T20.Y, 0.0,
|
|
; EG-NEXT: BFE_UINT * T21.Z, T12.W, literal.x, T0.W,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T21.X, T12.W, literal.x,
|
|
; EG-NEXT: MOV * T21.Y, 0.0,
|
|
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_UINT T22.X, T12.Z, literal.x, T0.W,
|
|
; EG-NEXT: LSHR * T22.Z, T12.Z, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
|
|
; EG-NEXT: MOV T22.Y, 0.0,
|
|
; EG-NEXT: BFE_UINT * T23.Z, T12.Z, literal.x, T0.W,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T23.X, T12.Z, literal.x,
|
|
; EG-NEXT: MOV * T23.Y, 0.0,
|
|
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_UINT T24.X, T12.Y, literal.x, T0.W,
|
|
; EG-NEXT: LSHR * T24.Z, T12.Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
|
|
; EG-NEXT: MOV T24.Y, 0.0,
|
|
; EG-NEXT: BFE_UINT * T25.Z, T12.Y, literal.x, T0.W,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T25.X, T12.Y, literal.x,
|
|
; EG-NEXT: MOV * T25.Y, 0.0,
|
|
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_UINT T26.X, T12.X, literal.x, T0.W,
|
|
; EG-NEXT: LSHR * T26.Z, T12.X, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), 24(3.363116e-44)
|
|
; EG-NEXT: MOV T26.Y, 0.0,
|
|
; EG-NEXT: BFE_UINT * T12.Z, T12.X, literal.x, T0.W,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T12.X, T12.X, literal.x,
|
|
; EG-NEXT: MOV T12.Y, 0.0,
|
|
; EG-NEXT: MOV T13.W, 0.0,
|
|
; EG-NEXT: MOV * T14.W, 0.0,
|
|
; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00)
|
|
; EG-NEXT: MOV T15.W, 0.0,
|
|
; EG-NEXT: MOV * T16.W, 0.0,
|
|
; EG-NEXT: MOV T17.W, 0.0,
|
|
; EG-NEXT: MOV * T18.W, 0.0,
|
|
; EG-NEXT: MOV T19.W, 0.0,
|
|
; EG-NEXT: MOV * T11.W, 0.0,
|
|
; EG-NEXT: MOV T20.W, 0.0,
|
|
; EG-NEXT: MOV * T21.W, 0.0,
|
|
; EG-NEXT: MOV T22.W, 0.0,
|
|
; EG-NEXT: MOV * T23.W, 0.0,
|
|
; EG-NEXT: MOV T24.W, 0.0,
|
|
; EG-NEXT: MOV * T25.W, 0.0,
|
|
; EG-NEXT: MOV T26.W, 0.0,
|
|
; EG-NEXT: MOV * T12.W, 0.0,
|
|
; EG-NEXT: LSHR T27.X, KC0[2].Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; EG-NEXT: LSHR T28.X, PV.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
|
|
; EG-NEXT: LSHR T29.X, PV.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 2(2.802597e-45), 48(6.726233e-44)
|
|
; EG-NEXT: LSHR T30.X, PV.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 2(2.802597e-45), 64(8.968310e-44)
|
|
; EG-NEXT: LSHR * T31.X, PV.W, literal.x,
|
|
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
; EG-NEXT: ALU clause starting at 131:
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 80(1.121039e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LSHR T32.X, PV.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 2(2.802597e-45), 96(1.345247e-43)
|
|
; EG-NEXT: LSHR T33.X, PV.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 2(2.802597e-45), 112(1.569454e-43)
|
|
; EG-NEXT: LSHR T34.X, PV.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 2(2.802597e-45), 128(1.793662e-43)
|
|
; EG-NEXT: LSHR T35.X, PV.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 2(2.802597e-45), 144(2.017870e-43)
|
|
; EG-NEXT: LSHR T36.X, PV.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 2(2.802597e-45), 160(2.242078e-43)
|
|
; EG-NEXT: LSHR T37.X, PV.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 2(2.802597e-45), 176(2.466285e-43)
|
|
; EG-NEXT: LSHR T38.X, PV.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 2(2.802597e-45), 192(2.690493e-43)
|
|
; EG-NEXT: LSHR T39.X, PV.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 2(2.802597e-45), 208(2.914701e-43)
|
|
; EG-NEXT: LSHR T40.X, PV.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 2(2.802597e-45), 224(3.138909e-43)
|
|
; EG-NEXT: LSHR T41.X, PV.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 2(2.802597e-45), 240(3.363116e-43)
|
|
; EG-NEXT: LSHR * T42.X, PV.W, literal.x,
|
|
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
;
|
|
; CM-LABEL: global_zextload_v32i8_to_v32i64:
|
|
; CM: ; %bb.0:
|
|
; CM-NEXT: ALU 0, @26, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: TEX 1 @22
|
|
; CM-NEXT: ALU 103, @27, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: ALU 33, @131, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T13, T42.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T14, T41.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T15, T40.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T16, T39.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T17, T38.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T11, T37.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T18, T36.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T19, T35.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T20, T34.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T21, T33.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T22, T32.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T23, T31.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T24, T30.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T12, T29.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T25, T28.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T26, T27.X
|
|
; CM-NEXT: CF_END
|
|
; CM-NEXT: PAD
|
|
; CM-NEXT: Fetch clause starting at 22:
|
|
; CM-NEXT: VTX_READ_128 T12.XYZW, T11.X, 16, #1
|
|
; CM-NEXT: VTX_READ_128 T11.XYZW, T11.X, 0, #1
|
|
; CM-NEXT: ALU clause starting at 26:
|
|
; CM-NEXT: MOV * T11.X, KC0[2].Z,
|
|
; CM-NEXT: ALU clause starting at 27:
|
|
; CM-NEXT: MOV * T0.W, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_UINT * T13.Z, T11.X, literal.x, PV.W,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T13.X, T11.X, literal.x,
|
|
; CM-NEXT: MOV * T13.Y, 0.0,
|
|
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_UINT T14.X, T11.X, literal.x, T0.W,
|
|
; CM-NEXT: LSHR * T14.Z, T11.X, literal.y,
|
|
; CM-NEXT: 16(2.242078e-44), 24(3.363116e-44)
|
|
; CM-NEXT: MOV T14.Y, 0.0,
|
|
; CM-NEXT: BFE_UINT * T15.Z, T11.Y, literal.x, T0.W,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T15.X, T11.Y, literal.x,
|
|
; CM-NEXT: MOV * T15.Y, 0.0,
|
|
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_UINT T16.X, T11.Y, literal.x, T0.W,
|
|
; CM-NEXT: LSHR * T16.Z, T11.Y, literal.y,
|
|
; CM-NEXT: 16(2.242078e-44), 24(3.363116e-44)
|
|
; CM-NEXT: MOV T16.Y, 0.0,
|
|
; CM-NEXT: BFE_UINT * T17.Z, T11.Z, literal.x, T0.W,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T17.X, T11.Z, literal.x,
|
|
; CM-NEXT: MOV * T17.Y, 0.0,
|
|
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_UINT T11.X, T11.Z, literal.x, T0.W,
|
|
; CM-NEXT: LSHR * T11.Z, T11.Z, literal.y,
|
|
; CM-NEXT: 16(2.242078e-44), 24(3.363116e-44)
|
|
; CM-NEXT: MOV T11.Y, 0.0,
|
|
; CM-NEXT: BFE_UINT * T18.Z, T11.W, literal.x, T0.W,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T18.X, T11.W, literal.x,
|
|
; CM-NEXT: MOV * T18.Y, 0.0,
|
|
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_UINT T19.X, T11.W, literal.x, T0.W,
|
|
; CM-NEXT: LSHR * T19.Z, T11.W, literal.y,
|
|
; CM-NEXT: 16(2.242078e-44), 24(3.363116e-44)
|
|
; CM-NEXT: MOV T19.Y, 0.0,
|
|
; CM-NEXT: BFE_UINT * T20.Z, T12.X, literal.x, T0.W,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T20.X, T12.X, literal.x,
|
|
; CM-NEXT: MOV * T20.Y, 0.0,
|
|
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_UINT T21.X, T12.X, literal.x, T0.W,
|
|
; CM-NEXT: LSHR * T21.Z, T12.X, literal.y,
|
|
; CM-NEXT: 16(2.242078e-44), 24(3.363116e-44)
|
|
; CM-NEXT: MOV T21.Y, 0.0,
|
|
; CM-NEXT: BFE_UINT * T22.Z, T12.Y, literal.x, T0.W,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T22.X, T12.Y, literal.x,
|
|
; CM-NEXT: MOV * T22.Y, 0.0,
|
|
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_UINT T23.X, T12.Y, literal.x, T0.W,
|
|
; CM-NEXT: LSHR * T23.Z, T12.Y, literal.y,
|
|
; CM-NEXT: 16(2.242078e-44), 24(3.363116e-44)
|
|
; CM-NEXT: MOV T23.Y, 0.0,
|
|
; CM-NEXT: BFE_UINT * T24.Z, T12.Z, literal.x, T0.W,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T24.X, T12.Z, literal.x,
|
|
; CM-NEXT: MOV * T24.Y, 0.0,
|
|
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_UINT T12.X, T12.Z, literal.x, T0.W,
|
|
; CM-NEXT: LSHR * T12.Z, T12.Z, literal.y,
|
|
; CM-NEXT: 16(2.242078e-44), 24(3.363116e-44)
|
|
; CM-NEXT: MOV T12.Y, 0.0,
|
|
; CM-NEXT: BFE_UINT * T25.Z, T12.W, literal.x, T0.W,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T25.X, T12.W, literal.x,
|
|
; CM-NEXT: MOV * T25.Y, 0.0,
|
|
; CM-NEXT: 255(3.573311e-43), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_UINT T26.X, T12.W, literal.x, T0.W,
|
|
; CM-NEXT: LSHR * T26.Z, T12.W, literal.y,
|
|
; CM-NEXT: 16(2.242078e-44), 24(3.363116e-44)
|
|
; CM-NEXT: MOV T26.Y, 0.0,
|
|
; CM-NEXT: MOV * T13.W, 0.0,
|
|
; CM-NEXT: MOV * T14.W, 0.0,
|
|
; CM-NEXT: MOV * T15.W, 0.0,
|
|
; CM-NEXT: MOV * T16.W, 0.0,
|
|
; CM-NEXT: MOV * T17.W, 0.0,
|
|
; CM-NEXT: MOV * T11.W, 0.0,
|
|
; CM-NEXT: MOV * T18.W, 0.0,
|
|
; CM-NEXT: MOV * T19.W, 0.0,
|
|
; CM-NEXT: MOV * T20.W, 0.0,
|
|
; CM-NEXT: MOV * T21.W, 0.0,
|
|
; CM-NEXT: MOV * T22.W, 0.0,
|
|
; CM-NEXT: MOV * T23.W, 0.0,
|
|
; CM-NEXT: MOV * T24.W, 0.0,
|
|
; CM-NEXT: MOV * T12.W, 0.0,
|
|
; CM-NEXT: MOV * T25.W, 0.0,
|
|
; CM-NEXT: MOV * T26.W, 0.0,
|
|
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; CM-NEXT: 240(3.363116e-43), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR T27.X, PV.W, literal.x,
|
|
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; CM-NEXT: 2(2.802597e-45), 224(3.138909e-43)
|
|
; CM-NEXT: LSHR T28.X, PV.W, literal.x,
|
|
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; CM-NEXT: 2(2.802597e-45), 208(2.914701e-43)
|
|
; CM-NEXT: LSHR T29.X, PV.W, literal.x,
|
|
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; CM-NEXT: 2(2.802597e-45), 192(2.690493e-43)
|
|
; CM-NEXT: LSHR T30.X, PV.W, literal.x,
|
|
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; CM-NEXT: 2(2.802597e-45), 176(2.466285e-43)
|
|
; CM-NEXT: ALU clause starting at 131:
|
|
; CM-NEXT: LSHR T31.X, T0.W, literal.x,
|
|
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; CM-NEXT: 2(2.802597e-45), 160(2.242078e-43)
|
|
; CM-NEXT: LSHR T32.X, PV.W, literal.x,
|
|
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; CM-NEXT: 2(2.802597e-45), 144(2.017870e-43)
|
|
; CM-NEXT: LSHR T33.X, PV.W, literal.x,
|
|
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; CM-NEXT: 2(2.802597e-45), 128(1.793662e-43)
|
|
; CM-NEXT: LSHR T34.X, PV.W, literal.x,
|
|
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; CM-NEXT: 2(2.802597e-45), 112(1.569454e-43)
|
|
; CM-NEXT: LSHR T35.X, PV.W, literal.x,
|
|
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; CM-NEXT: 2(2.802597e-45), 96(1.345247e-43)
|
|
; CM-NEXT: LSHR T36.X, PV.W, literal.x,
|
|
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; CM-NEXT: 2(2.802597e-45), 80(1.121039e-43)
|
|
; CM-NEXT: LSHR T37.X, PV.W, literal.x,
|
|
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; CM-NEXT: 2(2.802597e-45), 64(8.968310e-44)
|
|
; CM-NEXT: LSHR T38.X, PV.W, literal.x,
|
|
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; CM-NEXT: 2(2.802597e-45), 48(6.726233e-44)
|
|
; CM-NEXT: LSHR T39.X, PV.W, literal.x,
|
|
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; CM-NEXT: 2(2.802597e-45), 32(4.484155e-44)
|
|
; CM-NEXT: LSHR T40.X, PV.W, literal.x,
|
|
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; CM-NEXT: LSHR * T41.X, PV.W, literal.x,
|
|
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR * T42.X, KC0[2].Y, literal.x,
|
|
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
%load = load <32 x i8>, ptr addrspace(1) %in
|
|
%ext = zext <32 x i8> %load to <32 x i64>
|
|
store <32 x i64> %ext, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
|
; GCN-NOHSA-SI-LABEL: global_sextload_v32i8_to_v32i64:
|
|
; GCN-NOHSA-SI: ; %bb.0:
|
|
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1)
|
|
; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s18, v2
|
|
; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s19, v3
|
|
; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s22, v0
|
|
; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s23, v1
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s24, v6
|
|
; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s25, v7
|
|
; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s12, v4
|
|
; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s13, v5
|
|
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s18, 16
|
|
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s18, 24
|
|
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s34, s18, 8
|
|
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s30, s22, 16
|
|
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s22, 24
|
|
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s22, 8
|
|
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s4, s24, 16
|
|
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s6, s24, 24
|
|
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s8, s24, 8
|
|
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s10, s12, 16
|
|
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s14, s12, 24
|
|
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s16, s12, 8
|
|
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s20, s19, 16
|
|
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[40:41], s[12:13], 0x80000
|
|
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[42:43], s[18:19], 0x80000
|
|
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[44:45], s[22:23], 0x80000
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s42
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s43
|
|
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[42:43], s[24:25], 0x80000
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s44
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s45
|
|
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s18, s19, 8
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s42
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s43
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, s19
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s40
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s41
|
|
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s40, s23, 16
|
|
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x80000
|
|
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s38
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s39
|
|
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s38, s23, 8
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s36
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s37
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s22, s23
|
|
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[12:13], 0x80000
|
|
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x80000
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:208
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, s36
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, s37
|
|
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s36, s25, 16
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s34
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s35
|
|
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s24, s25, 8
|
|
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s30
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s31
|
|
; GCN-NOHSA-SI-NEXT: s_ashr_i32 s33, s25, 31
|
|
; GCN-NOHSA-SI-NEXT: s_ashr_i32 s39, s23, 31
|
|
; GCN-NOHSA-SI-NEXT: s_ashr_i32 s41, s23, 24
|
|
; GCN-NOHSA-SI-NEXT: s_ashr_i32 s5, s19, 31
|
|
; GCN-NOHSA-SI-NEXT: s_ashr_i32 s7, s19, 24
|
|
; GCN-NOHSA-SI-NEXT: s_ashr_i32 s44, s25, 24
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, s25
|
|
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000
|
|
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s28
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s29
|
|
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s28, s13, 16
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s26
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s27
|
|
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s26, s13, 8
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:144
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7
|
|
; GCN-NOHSA-SI-NEXT: s_ashr_i32 s45, s13, 31
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN-NOHSA-SI-NEXT: s_ashr_i32 s46, s13, 24
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s30, s13
|
|
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000
|
|
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[34:35], s[12:13], 0x80000
|
|
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[42:43], s[22:23], 0x80000
|
|
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[12:13], s[26:27], 0x80000
|
|
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[22:23], s[28:29], 0x80000
|
|
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
|
|
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[26:27], s[36:37], 0x80000
|
|
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[28:29], s[38:39], 0x80000
|
|
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[36:37], s[40:41], 0x80000
|
|
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000
|
|
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
|
|
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000
|
|
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
|
|
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x80000
|
|
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
|
|
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000
|
|
; GCN-NOHSA-SI-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:128
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s4
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s5
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s6
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s7
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s41
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s39
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s8
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s9
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:64
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s10
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s11
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s14
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s15
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:16
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s42
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s43
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, s16
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, s17
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s44
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s33
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s20
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s21
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s34
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s35
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, s18
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, s19
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:224
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, s46
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, s45
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s36
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s37
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[2:5], off, s[0:3], 0 offset:176
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v4, s30
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, s31
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s28
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s29
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:160
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, s26
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, s27
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s24
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s25
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(1)
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, s22
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, s23
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:48
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, s12
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, s13
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32
|
|
; GCN-NOHSA-SI-NEXT: s_endpgm
|
|
;
|
|
; GCN-HSA-LABEL: global_sextload_v32i8_to_v32i64:
|
|
; GCN-HSA: ; %bb.0:
|
|
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
|
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
|
|
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
|
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
|
|
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN-HSA-NEXT: s_add_u32 s2, s2, 16
|
|
; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
|
|
; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
|
|
; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
|
|
; GCN-HSA-NEXT: v_readfirstlane_b32 s6, v6
|
|
; GCN-HSA-NEXT: v_readfirstlane_b32 s8, v4
|
|
; GCN-HSA-NEXT: v_readfirstlane_b32 s9, v5
|
|
; GCN-HSA-NEXT: v_readfirstlane_b32 s7, v7
|
|
; GCN-HSA-NEXT: s_lshr_b32 s20, s6, 16
|
|
; GCN-HSA-NEXT: s_lshr_b32 s18, s6, 24
|
|
; GCN-HSA-NEXT: s_lshr_b32 s10, s8, 16
|
|
; GCN-HSA-NEXT: s_lshr_b32 s2, s8, 24
|
|
; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[8:9], 0x80000
|
|
; GCN-HSA-NEXT: s_lshr_b32 s16, s6, 8
|
|
; GCN-HSA-NEXT: s_lshr_b32 s4, s8, 8
|
|
; GCN-HSA-NEXT: s_lshr_b32 s12, s7, 16
|
|
; GCN-HSA-NEXT: s_lshr_b32 s14, s7, 8
|
|
; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[6:7], 0x80000
|
|
; GCN-HSA-NEXT: s_lshr_b32 s6, s9, 16
|
|
; GCN-HSA-NEXT: s_mov_b32 s28, s9
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s26
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s27
|
|
; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
|
|
; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[18:19], 0x80000
|
|
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-HSA-NEXT: v_readfirstlane_b32 s40, v2
|
|
; GCN-HSA-NEXT: v_readfirstlane_b32 s41, v3
|
|
; GCN-HSA-NEXT: s_bfe_i64 s[42:43], s[10:11], 0x80000
|
|
; GCN-HSA-NEXT: v_readfirstlane_b32 s44, v0
|
|
; GCN-HSA-NEXT: v_readfirstlane_b32 s45, v1
|
|
; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000
|
|
; GCN-HSA-NEXT: s_mov_b32 s22, s7
|
|
; GCN-HSA-NEXT: s_lshr_b32 s8, s9, 8
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s24
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s25
|
|
; GCN-HSA-NEXT: s_bfe_i64 s[30:31], s[16:17], 0x80000
|
|
; GCN-HSA-NEXT: s_bfe_i64 s[10:11], s[28:29], 0x80000
|
|
; GCN-HSA-NEXT: s_bfe_i64 s[16:17], s[6:7], 0x80000
|
|
; GCN-HSA-NEXT: s_bfe_i64 s[18:19], s[14:15], 0x80000
|
|
; GCN-HSA-NEXT: s_bfe_i64 s[24:25], s[12:13], 0x80000
|
|
; GCN-HSA-NEXT: s_bfe_i64 s[46:47], s[4:5], 0x80000
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s20
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v14, s26
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s27
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s42
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2
|
|
; GCN-HSA-NEXT: s_lshr_b32 s42, s44, 16
|
|
; GCN-HSA-NEXT: s_lshr_b32 s48, s44, 24
|
|
; GCN-HSA-NEXT: s_lshr_b32 s28, s44, 8
|
|
; GCN-HSA-NEXT: s_lshr_b32 s6, s45, 16
|
|
; GCN-HSA-NEXT: s_lshr_b32 s2, s45, 8
|
|
; GCN-HSA-NEXT: s_mov_b32 s4, s45
|
|
; GCN-HSA-NEXT: s_bfe_i64 s[26:27], s[44:45], 0x80000
|
|
; GCN-HSA-NEXT: s_lshr_b32 s44, s40, 16
|
|
; GCN-HSA-NEXT: s_lshr_b32 s50, s40, 24
|
|
; GCN-HSA-NEXT: s_lshr_b32 s52, s40, 8
|
|
; GCN-HSA-NEXT: s_lshr_b32 s20, s41, 16
|
|
; GCN-HSA-NEXT: s_lshr_b32 s12, s41, 8
|
|
; GCN-HSA-NEXT: s_mov_b32 s14, s41
|
|
; GCN-HSA-NEXT: s_ashr_i32 s33, s9, 31
|
|
; GCN-HSA-NEXT: s_ashr_i32 s37, s7, 31
|
|
; GCN-HSA-NEXT: s_ashr_i32 s38, s7, 24
|
|
; GCN-HSA-NEXT: s_ashr_i32 s34, s9, 24
|
|
; GCN-HSA-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x80000
|
|
; GCN-HSA-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s21
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v10, s30
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v11, s31
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s43
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3
|
|
; GCN-HSA-NEXT: s_ashr_i32 s30, s45, 31
|
|
; GCN-HSA-NEXT: s_ashr_i32 s31, s45, 24
|
|
; GCN-HSA-NEXT: s_ashr_i32 s35, s41, 31
|
|
; GCN-HSA-NEXT: s_ashr_i32 s36, s41, 24
|
|
; GCN-HSA-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000
|
|
; GCN-HSA-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x80000
|
|
; GCN-HSA-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x80000
|
|
; GCN-HSA-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x80000
|
|
; GCN-HSA-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
|
|
; GCN-HSA-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
|
|
; GCN-HSA-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x80000
|
|
; GCN-HSA-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x80000
|
|
; GCN-HSA-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000
|
|
; GCN-HSA-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000
|
|
; GCN-HSA-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000
|
|
; GCN-HSA-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000
|
|
; GCN-HSA-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000
|
|
; GCN-HSA-NEXT: s_add_u32 s54, s0, 0x50
|
|
; GCN-HSA-NEXT: s_addc_u32 s55, s1, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v24, s54
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v25, s55
|
|
; GCN-HSA-NEXT: s_add_u32 s54, s0, 64
|
|
; GCN-HSA-NEXT: s_addc_u32 s55, s1, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v26, s54
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v27, s55
|
|
; GCN-HSA-NEXT: s_add_u32 s54, s0, 16
|
|
; GCN-HSA-NEXT: s_addc_u32 s55, s1, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v20, s40
|
|
; GCN-HSA-NEXT: s_add_u32 s40, s0, 0xd0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v21, s41
|
|
; GCN-HSA-NEXT: s_addc_u32 s41, s1, 0
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v28, s54
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s40
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s41
|
|
; GCN-HSA-NEXT: s_add_u32 s40, s0, 0xc0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v29, s55
|
|
; GCN-HSA-NEXT: s_addc_u32 s41, s1, 0
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[0:3]
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[12:15]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s26
|
|
; GCN-HSA-NEXT: s_add_u32 s26, s0, 0x90
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s27
|
|
; GCN-HSA-NEXT: s_addc_u32 s27, s1, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v24, s26
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v25, s27
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1
|
|
; GCN-HSA-NEXT: s_add_u32 s26, s0, 0x80
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v6, s46
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v7, s47
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0
|
|
; GCN-HSA-NEXT: s_addc_u32 s27, s1, 0
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[4:7]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s44
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s24
|
|
; GCN-HSA-NEXT: s_add_u32 s24, s0, 0x70
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s45
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v18, s50
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v19, s51
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v10, s40
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s25
|
|
; GCN-HSA-NEXT: s_addc_u32 s25, s1, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v22, s52
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v23, s53
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s42
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s43
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v11, s41
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v14, s48
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s49
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v26, s26
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[16:19]
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[20:23]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v10, s18
|
|
; GCN-HSA-NEXT: s_add_u32 s18, s0, 0x60
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s28
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s29
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v27, s27
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[12:15]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v11, s19
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s24
|
|
; GCN-HSA-NEXT: s_addc_u32 s19, s1, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v14, s18
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s22
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v6, s38
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v7, s37
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s23
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s25
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s19
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[0:3]
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7]
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s16
|
|
; GCN-HSA-NEXT: s_add_u32 s16, s0, 48
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s17
|
|
; GCN-HSA-NEXT: s_addc_u32 s17, s1, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s34
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s33
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
|
|
; GCN-HSA-NEXT: s_nop 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s8
|
|
; GCN-HSA-NEXT: s_add_u32 s8, s0, 32
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s9
|
|
; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s11
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9
|
|
; GCN-HSA-NEXT: s_add_u32 s8, s0, 0xf0
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
|
|
; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s20
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s21
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s36
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s35
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9
|
|
; GCN-HSA-NEXT: s_add_u32 s8, s0, 0xe0
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
|
|
; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s15
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s12
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s13
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
|
|
; GCN-HSA-NEXT: s_nop 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xb0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6
|
|
; GCN-HSA-NEXT: s_add_u32 s0, s0, 0xa0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s31
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s30
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7
|
|
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
|
|
; GCN-HSA-NEXT: s_endpgm
|
|
;
|
|
; GCN-NOHSA-VI-LABEL: global_sextload_v32i8_to_v32i64:
|
|
; GCN-NOHSA-VI: ; %bb.0:
|
|
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
|
|
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
|
|
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1)
|
|
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v2
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s9, v7
|
|
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s11, v5
|
|
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v3
|
|
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s6, v0
|
|
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s10, v4
|
|
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s38, s4, 16
|
|
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s40, s4, 24
|
|
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s42, s4, 8
|
|
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s60, s9, 8
|
|
; GCN-NOHSA-VI-NEXT: s_ashr_i32 s39, s11, 24
|
|
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s7, v1
|
|
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s8, v6
|
|
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s48, s6, 8
|
|
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s56, s10, 16
|
|
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s58, s10, 24
|
|
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s36, s10, 8
|
|
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s62, s11, 16
|
|
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s64, s11, 8
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s66, s11
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[20:21], s[10:11], 0x80000
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[68:69], s[4:5], 0x80000
|
|
; GCN-NOHSA-VI-NEXT: s_ashr_i32 s33, s11, 31
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[10:11], s[60:61], 0x80000
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x80000
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x80000
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[60:61], s[38:39], 0x80000
|
|
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s50, s8, 16
|
|
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s52, s8, 24
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[34:35], s[6:7], 0x80000
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x80000
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s68
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s69
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s60
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s61
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s40
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s41
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s42
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s43
|
|
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s54, s8, 8
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x80000
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x80000
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:208
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[28:29], s[8:9], 0x80000
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s34
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s35
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s48
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s49
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x80000
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:128
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x80000
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s50
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s51
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s52
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s53
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x80000
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:80
|
|
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s30, s5, 16
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s28
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s29
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s54
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s55
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x80000
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64
|
|
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s24, s5, 8
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s56
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s57
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s58
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s59
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s26, s5
|
|
; GCN-NOHSA-VI-NEXT: s_ashr_i32 s71, s5, 31
|
|
; GCN-NOHSA-VI-NEXT: s_ashr_i32 s72, s5, 24
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x80000
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
|
|
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s22, s7, 16
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s20
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s21
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s36
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s37
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x80000
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x80000
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
|
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s7, 8
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s30
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s31
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s72
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s71
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s18, s7
|
|
; GCN-NOHSA-VI-NEXT: s_ashr_i32 s67, s7, 31
|
|
; GCN-NOHSA-VI-NEXT: s_ashr_i32 s70, s7, 24
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x80000
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
|
|
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s9, 16
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s26
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s27
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s24
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s25
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x80000
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x80000
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s12, s9
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s22
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s23
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s70
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s67
|
|
; GCN-NOHSA-VI-NEXT: s_ashr_i32 s63, s9, 31
|
|
; GCN-NOHSA-VI-NEXT: s_ashr_i32 s65, s9, 24
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x80000
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:176
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x80000
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s18
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s19
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s16
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s17
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160
|
|
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s44, s6, 16
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s14
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s15
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s65
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s63
|
|
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s46, s6, 24
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[8:9], s[62:63], 0x80000
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:112
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[6:7], s[66:67], 0x80000
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s12
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s13
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s11
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[4:5], s[64:65], 0x80000
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x80000
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x80000
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s44
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s8
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s9
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s39
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s33
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s45
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s46
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s47
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:144
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
|
|
; GCN-NOHSA-VI-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: global_sextload_v32i8_to_v32i64:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: ALU 0, @26, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: TEX 1 @22
|
|
; EG-NEXT: ALU 84, @27, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: ALU 71, @112, KC0[], KC1[]
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T41.XYZW, T42.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T26.XYZW, T31.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T40.XYZW, T30.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T25.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T39.XYZW, T24.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T27.XYZW, T23.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T22.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T28.XYZW, T21.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T20.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T29.XYZW, T19.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T18.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T33.XYZW, T17.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T34.XYZW, T16.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T15.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T32.XYZW, T14.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T13.X, 1
|
|
; EG-NEXT: CF_END
|
|
; EG-NEXT: PAD
|
|
; EG-NEXT: Fetch clause starting at 22:
|
|
; EG-NEXT: VTX_READ_128 T12.XYZW, T11.X, 0, #1
|
|
; EG-NEXT: VTX_READ_128 T11.XYZW, T11.X, 16, #1
|
|
; EG-NEXT: ALU clause starting at 26:
|
|
; EG-NEXT: MOV * T11.X, KC0[2].Z,
|
|
; EG-NEXT: ALU clause starting at 27:
|
|
; EG-NEXT: LSHR T13.X, KC0[2].Y, literal.x,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; EG-NEXT: LSHR T14.X, PV.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
|
|
; EG-NEXT: LSHR T15.X, PV.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 2(2.802597e-45), 48(6.726233e-44)
|
|
; EG-NEXT: LSHR T16.X, PV.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 2(2.802597e-45), 64(8.968310e-44)
|
|
; EG-NEXT: LSHR T17.X, PV.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43)
|
|
; EG-NEXT: LSHR T18.X, PV.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 2(2.802597e-45), 96(1.345247e-43)
|
|
; EG-NEXT: LSHR T19.X, PV.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 2(2.802597e-45), 112(1.569454e-43)
|
|
; EG-NEXT: LSHR T20.X, PV.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 2(2.802597e-45), 128(1.793662e-43)
|
|
; EG-NEXT: LSHR T21.X, PV.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 2(2.802597e-45), 144(2.017870e-43)
|
|
; EG-NEXT: LSHR T22.X, PV.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 2(2.802597e-45), 160(2.242078e-43)
|
|
; EG-NEXT: LSHR T23.X, PV.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 2(2.802597e-45), 176(2.466285e-43)
|
|
; EG-NEXT: LSHR T24.X, PV.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 2(2.802597e-45), 192(2.690493e-43)
|
|
; EG-NEXT: LSHR * T25.X, PV.W, literal.x,
|
|
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT * T26.X, T11.W, 0.0, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T27.X, T11.Y, 0.0, literal.x,
|
|
; EG-NEXT: ASHR T26.Y, PV.X, literal.y,
|
|
; EG-NEXT: LSHR * T0.W, T11.W, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44)
|
|
; EG-NEXT: BFE_INT T28.X, T11.X, 0.0, literal.x,
|
|
; EG-NEXT: ASHR T27.Y, PV.X, literal.y,
|
|
; EG-NEXT: BFE_INT T26.Z, PV.W, 0.0, literal.x,
|
|
; EG-NEXT: LSHR * T0.W, T11.Y, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44)
|
|
; EG-NEXT: BFE_INT T29.X, T12.W, 0.0, literal.x,
|
|
; EG-NEXT: ASHR T28.Y, PV.X, literal.y,
|
|
; EG-NEXT: BFE_INT T27.Z, PV.W, 0.0, literal.x,
|
|
; EG-NEXT: LSHR T0.W, T11.X, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
|
|
; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44)
|
|
; EG-NEXT: 208(2.914701e-43), 0(0.000000e+00)
|
|
; EG-NEXT: LSHR T30.X, PS, literal.x,
|
|
; EG-NEXT: ASHR T29.Y, PV.X, literal.y,
|
|
; EG-NEXT: BFE_INT T28.Z, PV.W, 0.0, literal.z,
|
|
; EG-NEXT: LSHR T0.W, T12.W, literal.z,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.w,
|
|
; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44)
|
|
; EG-NEXT: 8(1.121039e-44), 224(3.138909e-43)
|
|
; EG-NEXT: LSHR T31.X, PS, literal.x,
|
|
; EG-NEXT: BFE_INT T29.Z, PV.W, 0.0, literal.y,
|
|
; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.z,
|
|
; EG-NEXT: ASHR * T32.W, T12.X, literal.w,
|
|
; EG-NEXT: 2(2.802597e-45), 8(1.121039e-44)
|
|
; EG-NEXT: 240(3.363116e-43), 31(4.344025e-44)
|
|
; EG-NEXT: BFE_INT T33.X, T12.Z, 0.0, literal.x,
|
|
; EG-NEXT: LSHR T0.Y, T11.Z, literal.x, BS:VEC_120/SCL_212
|
|
; EG-NEXT: ASHR T32.Z, T12.X, literal.y,
|
|
; EG-NEXT: LSHR T1.W, T12.X, literal.z,
|
|
; EG-NEXT: ASHR * T34.W, T12.Y, literal.w,
|
|
; EG-NEXT: 8(1.121039e-44), 24(3.363116e-44)
|
|
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
|
|
; EG-NEXT: BFE_INT T32.X, PV.W, 0.0, literal.x,
|
|
; EG-NEXT: ASHR T33.Y, PV.X, literal.y,
|
|
; EG-NEXT: ASHR T34.Z, T12.Y, literal.z,
|
|
; EG-NEXT: LSHR T1.W, T12.Z, literal.x,
|
|
; EG-NEXT: LSHR * T2.W, T12.Y, literal.w,
|
|
; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44)
|
|
; EG-NEXT: 24(3.363116e-44), 16(2.242078e-44)
|
|
; EG-NEXT: BFE_INT * T34.X, PS, 0.0, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: ALU clause starting at 112:
|
|
; EG-NEXT: ASHR T32.Y, T32.X, literal.x,
|
|
; EG-NEXT: BFE_INT T33.Z, T1.W, 0.0, literal.y,
|
|
; EG-NEXT: LSHR T1.W, T11.W, literal.z, BS:VEC_120/SCL_212
|
|
; EG-NEXT: ASHR * T35.W, T12.Z, literal.x,
|
|
; EG-NEXT: 31(4.344025e-44), 8(1.121039e-44)
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T36.X, T12.X, 0.0, literal.x,
|
|
; EG-NEXT: ASHR T34.Y, T34.X, literal.y, BS:VEC_120/SCL_212
|
|
; EG-NEXT: ASHR T35.Z, T12.Z, literal.z,
|
|
; EG-NEXT: LSHR T2.W, T12.Z, literal.w,
|
|
; EG-NEXT: ASHR * T37.W, T12.W, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44)
|
|
; EG-NEXT: 24(3.363116e-44), 16(2.242078e-44)
|
|
; EG-NEXT: BFE_INT T35.X, PV.W, 0.0, literal.x,
|
|
; EG-NEXT: ASHR T36.Y, PV.X, literal.y,
|
|
; EG-NEXT: ASHR T37.Z, T12.W, literal.z,
|
|
; EG-NEXT: LSHR T2.W, T12.X, literal.x,
|
|
; EG-NEXT: LSHR * T3.W, T12.W, literal.w,
|
|
; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44)
|
|
; EG-NEXT: 24(3.363116e-44), 16(2.242078e-44)
|
|
; EG-NEXT: BFE_INT T37.X, PS, 0.0, literal.x,
|
|
; EG-NEXT: ASHR T35.Y, PV.X, literal.y,
|
|
; EG-NEXT: BFE_INT T36.Z, PV.W, 0.0, literal.x,
|
|
; EG-NEXT: LSHR T2.W, T11.Z, literal.z,
|
|
; EG-NEXT: ASHR * T12.W, T11.X, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44)
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T38.X, T12.Y, 0.0, literal.x,
|
|
; EG-NEXT: ASHR T37.Y, PV.X, literal.y,
|
|
; EG-NEXT: ASHR T12.Z, T11.X, literal.z,
|
|
; EG-NEXT: LSHR T3.W, T11.X, literal.w,
|
|
; EG-NEXT: ASHR * T39.W, T11.Y, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44)
|
|
; EG-NEXT: 24(3.363116e-44), 16(2.242078e-44)
|
|
; EG-NEXT: BFE_INT T12.X, PV.W, 0.0, literal.x,
|
|
; EG-NEXT: ASHR T38.Y, PV.X, literal.y,
|
|
; EG-NEXT: ASHR T39.Z, T11.Y, literal.z,
|
|
; EG-NEXT: LSHR T3.W, T12.Y, literal.x, BS:VEC_120/SCL_212
|
|
; EG-NEXT: LSHR * T4.W, T11.Y, literal.w,
|
|
; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44)
|
|
; EG-NEXT: 24(3.363116e-44), 16(2.242078e-44)
|
|
; EG-NEXT: BFE_INT T39.X, PS, 0.0, literal.x,
|
|
; EG-NEXT: ASHR T12.Y, PV.X, literal.y,
|
|
; EG-NEXT: BFE_INT T38.Z, PV.W, 0.0, literal.x,
|
|
; EG-NEXT: ASHR T36.W, T36.Z, literal.y,
|
|
; EG-NEXT: ASHR * T40.W, T11.Z, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44)
|
|
; EG-NEXT: BFE_INT T11.X, T11.Z, 0.0, literal.x,
|
|
; EG-NEXT: ASHR T39.Y, PV.X, literal.y,
|
|
; EG-NEXT: ASHR T40.Z, T11.Z, literal.z,
|
|
; EG-NEXT: ASHR T38.W, PV.Z, literal.y,
|
|
; EG-NEXT: ASHR * T41.W, T11.W, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44)
|
|
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T40.X, T2.W, 0.0, literal.x,
|
|
; EG-NEXT: ASHR T11.Y, PV.X, literal.y,
|
|
; EG-NEXT: ASHR T41.Z, T11.W, literal.z, BS:VEC_120/SCL_212
|
|
; EG-NEXT: ASHR T33.W, T33.Z, literal.y,
|
|
; EG-NEXT: ASHR * T29.W, T29.Z, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44)
|
|
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T41.X, T1.W, 0.0, literal.x,
|
|
; EG-NEXT: ASHR T40.Y, PV.X, literal.y,
|
|
; EG-NEXT: BFE_INT T11.Z, T0.Y, 0.0, literal.x,
|
|
; EG-NEXT: ASHR T28.W, T28.Z, literal.y,
|
|
; EG-NEXT: ASHR * T27.W, T27.Z, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), 31(4.344025e-44)
|
|
; EG-NEXT: LSHR T42.X, T0.W, literal.x,
|
|
; EG-NEXT: ASHR T41.Y, PV.X, literal.y,
|
|
; EG-NEXT: ASHR T11.W, PV.Z, literal.y,
|
|
; EG-NEXT: ASHR * T26.W, T26.Z, literal.y,
|
|
; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44)
|
|
;
|
|
; CM-LABEL: global_sextload_v32i8_to_v32i64:
|
|
; CM: ; %bb.0:
|
|
; CM-NEXT: ALU 0, @26, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: TEX 1 @22
|
|
; CM-NEXT: ALU 84, @27, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: ALU 74, @112, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T41, T42.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T11, T31.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T28, T30.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T40, T25.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T38, T24.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T39, T23.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T36, T22.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T37, T21.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T33, T20.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T12, T19.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T29, T18.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T35, T17.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T27, T16.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T34, T15.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T26, T14.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T32, T13.X
|
|
; CM-NEXT: CF_END
|
|
; CM-NEXT: PAD
|
|
; CM-NEXT: Fetch clause starting at 22:
|
|
; CM-NEXT: VTX_READ_128 T12.XYZW, T11.X, 16, #1
|
|
; CM-NEXT: VTX_READ_128 T11.XYZW, T11.X, 0, #1
|
|
; CM-NEXT: ALU clause starting at 26:
|
|
; CM-NEXT: MOV * T11.X, KC0[2].Z,
|
|
; CM-NEXT: ALU clause starting at 27:
|
|
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; CM-NEXT: 240(3.363116e-43), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR T13.X, PV.W, literal.x,
|
|
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; CM-NEXT: 2(2.802597e-45), 224(3.138909e-43)
|
|
; CM-NEXT: LSHR T14.X, PV.W, literal.x,
|
|
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; CM-NEXT: 2(2.802597e-45), 208(2.914701e-43)
|
|
; CM-NEXT: LSHR T15.X, PV.W, literal.x,
|
|
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; CM-NEXT: 2(2.802597e-45), 192(2.690493e-43)
|
|
; CM-NEXT: LSHR T16.X, PV.W, literal.x,
|
|
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; CM-NEXT: 2(2.802597e-45), 176(2.466285e-43)
|
|
; CM-NEXT: LSHR T17.X, PV.W, literal.x,
|
|
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; CM-NEXT: 2(2.802597e-45), 160(2.242078e-43)
|
|
; CM-NEXT: LSHR T18.X, PV.W, literal.x,
|
|
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; CM-NEXT: 2(2.802597e-45), 144(2.017870e-43)
|
|
; CM-NEXT: LSHR T19.X, PV.W, literal.x,
|
|
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; CM-NEXT: 2(2.802597e-45), 128(1.793662e-43)
|
|
; CM-NEXT: LSHR T20.X, PV.W, literal.x,
|
|
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; CM-NEXT: 2(2.802597e-45), 112(1.569454e-43)
|
|
; CM-NEXT: LSHR T21.X, PV.W, literal.x,
|
|
; CM-NEXT: LSHR T0.Z, T11.X, literal.y,
|
|
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
|
|
; CM-NEXT: 2(2.802597e-45), 8(1.121039e-44)
|
|
; CM-NEXT: 96(1.345247e-43), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR T22.X, PV.W, literal.x,
|
|
; CM-NEXT: LSHR T0.Y, T11.X, literal.y,
|
|
; CM-NEXT: LSHR T1.Z, T11.Y, literal.y,
|
|
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
|
|
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; CM-NEXT: 80(1.121039e-43), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR T23.X, PV.W, literal.x,
|
|
; CM-NEXT: LSHR T1.Y, T11.Z, literal.y,
|
|
; CM-NEXT: LSHR T2.Z, T11.Z, literal.z,
|
|
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.w,
|
|
; CM-NEXT: 2(2.802597e-45), 8(1.121039e-44)
|
|
; CM-NEXT: 16(2.242078e-44), 64(8.968310e-44)
|
|
; CM-NEXT: LSHR T24.X, PV.W, literal.x,
|
|
; CM-NEXT: LSHR T2.Y, T11.W, literal.y,
|
|
; CM-NEXT: LSHR T3.Z, T11.W, literal.z,
|
|
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.w,
|
|
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; CM-NEXT: 8(1.121039e-44), 48(6.726233e-44)
|
|
; CM-NEXT: LSHR T25.X, PV.W, literal.x,
|
|
; CM-NEXT: LSHR T3.Y, T12.X, literal.y,
|
|
; CM-NEXT: LSHR T4.Z, T12.Y, literal.y,
|
|
; CM-NEXT: LSHR * T0.W, T12.X, literal.z,
|
|
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT T26.X, T12.W, 0.0, literal.x,
|
|
; CM-NEXT: LSHR T4.Y, T12.Y, literal.x,
|
|
; CM-NEXT: ADD_INT T5.Z, KC0[2].Y, literal.y,
|
|
; CM-NEXT: LSHR * T1.W, T11.Y, literal.x, BS:VEC_120/SCL_212
|
|
; CM-NEXT: 8(1.121039e-44), 16(2.242078e-44)
|
|
; CM-NEXT: BFE_INT T27.X, T12.Z, 0.0, literal.x,
|
|
; CM-NEXT: ASHR T26.Y, PV.X, literal.y,
|
|
; CM-NEXT: ADD_INT T6.Z, KC0[2].Y, literal.z,
|
|
; CM-NEXT: LSHR * T2.W, T12.W, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 31(4.344025e-44)
|
|
; CM-NEXT: 32(4.484155e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT T28.X, T11.Y, 0.0, literal.x,
|
|
; CM-NEXT: ASHR T27.Y, PV.X, literal.y,
|
|
; CM-NEXT: BFE_INT T26.Z, PV.W, 0.0, literal.x,
|
|
; CM-NEXT: LSHR * T2.W, T12.Z, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 31(4.344025e-44)
|
|
; CM-NEXT: BFE_INT T29.X, T12.Y, 0.0, literal.x,
|
|
; CM-NEXT: ASHR T28.Y, PV.X, literal.y,
|
|
; CM-NEXT: BFE_INT T27.Z, PV.W, 0.0, literal.x,
|
|
; CM-NEXT: ASHR * T26.W, PV.Z, literal.y,
|
|
; CM-NEXT: 8(1.121039e-44), 31(4.344025e-44)
|
|
; CM-NEXT: LSHR T30.X, T6.Z, literal.x,
|
|
; CM-NEXT: ASHR T29.Y, PV.X, literal.y,
|
|
; CM-NEXT: BFE_INT T28.Z, T1.W, 0.0, literal.z,
|
|
; CM-NEXT: ASHR * T27.W, PV.Z, literal.y,
|
|
; CM-NEXT: 2(2.802597e-45), 31(4.344025e-44)
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR T31.X, T5.Z, literal.x,
|
|
; CM-NEXT: LSHR * T5.Y, T12.Z, literal.y, BS:VEC_120/SCL_212
|
|
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
|
|
; CM-NEXT: ALU clause starting at 112:
|
|
; CM-NEXT: BFE_INT T29.Z, T4.Y, 0.0, literal.x,
|
|
; CM-NEXT: ASHR * T32.W, T12.W, literal.y,
|
|
; CM-NEXT: 8(1.121039e-44), 31(4.344025e-44)
|
|
; CM-NEXT: BFE_INT T33.X, T12.X, 0.0, literal.x,
|
|
; CM-NEXT: LSHR T4.Y, T12.W, literal.y,
|
|
; CM-NEXT: ASHR T32.Z, T12.W, literal.z,
|
|
; CM-NEXT: ASHR * T34.W, T12.Z, literal.w,
|
|
; CM-NEXT: 8(1.121039e-44), 16(2.242078e-44)
|
|
; CM-NEXT: 24(3.363116e-44), 31(4.344025e-44)
|
|
; CM-NEXT: BFE_INT T32.X, PV.Y, 0.0, literal.x,
|
|
; CM-NEXT: ASHR T33.Y, PV.X, literal.y,
|
|
; CM-NEXT: ASHR T34.Z, T12.Z, literal.z,
|
|
; CM-NEXT: ASHR * T29.W, T29.Z, literal.y, BS:VEC_120/SCL_212
|
|
; CM-NEXT: 8(1.121039e-44), 31(4.344025e-44)
|
|
; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT T34.X, T5.Y, 0.0, literal.x,
|
|
; CM-NEXT: ASHR T32.Y, PV.X, literal.y,
|
|
; CM-NEXT: BFE_INT T33.Z, T0.W, 0.0, literal.x,
|
|
; CM-NEXT: ASHR * T35.W, T12.Y, literal.y, BS:VEC_120/SCL_212
|
|
; CM-NEXT: 8(1.121039e-44), 31(4.344025e-44)
|
|
; CM-NEXT: BFE_INT T36.X, T11.W, 0.0, literal.x,
|
|
; CM-NEXT: ASHR T34.Y, PV.X, literal.y,
|
|
; CM-NEXT: ASHR T35.Z, T12.Y, literal.z,
|
|
; CM-NEXT: ASHR * T12.W, T12.X, literal.y,
|
|
; CM-NEXT: 8(1.121039e-44), 31(4.344025e-44)
|
|
; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT T35.X, T4.Z, 0.0, literal.x,
|
|
; CM-NEXT: ASHR T36.Y, PV.X, literal.y,
|
|
; CM-NEXT: ASHR T12.Z, T12.X, literal.z,
|
|
; CM-NEXT: ASHR * T33.W, T33.Z, literal.y, BS:VEC_120/SCL_212
|
|
; CM-NEXT: 8(1.121039e-44), 31(4.344025e-44)
|
|
; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT T12.X, T3.Y, 0.0, literal.x,
|
|
; CM-NEXT: ASHR T35.Y, PV.X, literal.y,
|
|
; CM-NEXT: BFE_INT T36.Z, T3.Z, 0.0, literal.x,
|
|
; CM-NEXT: ASHR * T37.W, T11.W, literal.y,
|
|
; CM-NEXT: 8(1.121039e-44), 31(4.344025e-44)
|
|
; CM-NEXT: BFE_INT T38.X, T11.Z, 0.0, literal.x,
|
|
; CM-NEXT: ASHR T12.Y, PV.X, literal.y,
|
|
; CM-NEXT: ASHR T37.Z, T11.W, literal.z,
|
|
; CM-NEXT: ASHR * T39.W, T11.Z, literal.y,
|
|
; CM-NEXT: 8(1.121039e-44), 31(4.344025e-44)
|
|
; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT T37.X, T2.Y, 0.0, literal.x,
|
|
; CM-NEXT: ASHR T38.Y, PV.X, literal.y,
|
|
; CM-NEXT: ASHR T39.Z, T11.Z, literal.z,
|
|
; CM-NEXT: ASHR * T36.W, T36.Z, literal.y, BS:VEC_120/SCL_212
|
|
; CM-NEXT: 8(1.121039e-44), 31(4.344025e-44)
|
|
; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT T39.X, T2.Z, 0.0, literal.x,
|
|
; CM-NEXT: ASHR T37.Y, PV.X, literal.y,
|
|
; CM-NEXT: BFE_INT T38.Z, T1.Y, 0.0, literal.x,
|
|
; CM-NEXT: ASHR * T40.W, T11.Y, literal.y, BS:VEC_120/SCL_212
|
|
; CM-NEXT: 8(1.121039e-44), 31(4.344025e-44)
|
|
; CM-NEXT: BFE_INT T41.X, T11.X, 0.0, literal.x,
|
|
; CM-NEXT: ASHR T39.Y, PV.X, literal.y,
|
|
; CM-NEXT: ASHR T40.Z, T11.Y, literal.z,
|
|
; CM-NEXT: ASHR * T11.W, T11.X, literal.y,
|
|
; CM-NEXT: 8(1.121039e-44), 31(4.344025e-44)
|
|
; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT T40.X, T1.Z, 0.0, literal.x,
|
|
; CM-NEXT: ASHR T41.Y, PV.X, literal.y,
|
|
; CM-NEXT: ASHR T11.Z, T11.X, literal.z,
|
|
; CM-NEXT: ASHR * T38.W, T38.Z, literal.y, BS:VEC_120/SCL_212
|
|
; CM-NEXT: 8(1.121039e-44), 31(4.344025e-44)
|
|
; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT T11.X, T0.Y, 0.0, literal.x,
|
|
; CM-NEXT: ASHR T40.Y, PV.X, literal.y,
|
|
; CM-NEXT: BFE_INT T41.Z, T0.Z, 0.0, literal.x,
|
|
; CM-NEXT: ASHR * T28.W, T28.Z, literal.y, BS:VEC_120/SCL_212
|
|
; CM-NEXT: 8(1.121039e-44), 31(4.344025e-44)
|
|
; CM-NEXT: LSHR T42.X, KC0[2].Y, literal.x,
|
|
; CM-NEXT: ASHR T11.Y, PV.X, literal.y,
|
|
; CM-NEXT: ASHR * T41.W, PV.Z, literal.y,
|
|
; CM-NEXT: 2(2.802597e-45), 31(4.344025e-44)
|
|
%load = load <32 x i8>, ptr addrspace(1) %in
|
|
%ext = sext <32 x i8> %load to <32 x i64>
|
|
store <32 x i64> %ext, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
; XFUNC-LABEL: {{^}}global_zextload_v64i8_to_v64i64:
|
|
; define amdgpu_kernel void @global_zextload_v64i8_to_v64i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
|
; %load = load <64 x i8>, ptr addrspace(1) %in
|
|
; %ext = zext <64 x i8> %load to <64 x i64>
|
|
; store <64 x i64> %ext, ptr addrspace(1) %out
|
|
; ret void
|
|
; }
|
|
|
|
; XFUNC-LABEL: {{^}}global_sextload_v64i8_to_v64i64:
|
|
; define amdgpu_kernel void @global_sextload_v64i8_to_v64i64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
|
; %load = load <64 x i8>, ptr addrspace(1) %in
|
|
; %ext = sext <64 x i8> %load to <64 x i64>
|
|
; store <64 x i64> %ext, ptr addrspace(1) %out
|
|
; ret void
|
|
; }
|
|
|
|
; Zero-extending scalar byte load: reads an i8 through the addrspace(1)
; pointer %in, zero-extends it to i16 and stores the result through %out.
; The three GCN check prefixes expect an unsigned byte load
; (buffer_load_ubyte or flat_load_ubyte) feeding a 16-bit store
; (buffer_store_short or flat_store_short) with no separate extend
; instruction in between. The EG and CM check prefixes expect a VTX_READ_8
; whose result, together with a 65535 literal, is shifted left before a
; MEM_RAT MSKOR store.
; The assertion lines are autogenerated (see the NOTE on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py instead
; of editing them by hand.
define amdgpu_kernel void @global_zextload_i8_to_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
|
; GCN-NOHSA-SI-LABEL: global_zextload_i8_to_i16:
|
|
; GCN-NOHSA-SI: ; %bb.0:
|
|
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_short v0, off, s[4:7], 0
|
|
; GCN-NOHSA-SI-NEXT: s_endpgm
|
|
;
|
|
; GCN-HSA-LABEL: global_zextload_i8_to_i16:
|
|
; GCN-HSA: ; %bb.0:
|
|
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
|
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
|
|
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
|
|
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
|
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN-HSA-NEXT: flat_load_ubyte v2, v[0:1]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-HSA-NEXT: flat_store_short v[0:1], v2
|
|
; GCN-HSA-NEXT: s_endpgm
|
|
;
|
|
; GCN-NOHSA-VI-LABEL: global_zextload_i8_to_i16:
|
|
; GCN-NOHSA-VI: ; %bb.0:
|
|
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NOHSA-VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_short v0, off, s[4:7], 0
|
|
; GCN-NOHSA-VI-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: global_zextload_i8_to_i16:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: TEX 0 @6
|
|
; EG-NEXT: ALU 10, @9, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
|
|
; EG-NEXT: CF_END
|
|
; EG-NEXT: PAD
|
|
; EG-NEXT: Fetch clause starting at 6:
|
|
; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
|
|
; EG-NEXT: ALU clause starting at 8:
|
|
; EG-NEXT: MOV * T0.X, KC0[2].Z,
|
|
; EG-NEXT: ALU clause starting at 9:
|
|
; EG-NEXT: AND_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
|
|
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
|
|
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
|
|
; EG-NEXT: LSHL T0.X, T0.X, PV.W,
|
|
; EG-NEXT: LSHL * T0.W, literal.x, PV.W,
|
|
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
|
|
; EG-NEXT: MOV T0.Y, 0.0,
|
|
; EG-NEXT: MOV * T0.Z, 0.0,
|
|
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
;
|
|
; CM-LABEL: global_zextload_i8_to_i16:
|
|
; CM: ; %bb.0:
|
|
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: TEX 0 @6
|
|
; CM-NEXT: ALU 10, @9, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: MEM_RAT MSKOR T0.XW, T1.X
|
|
; CM-NEXT: CF_END
|
|
; CM-NEXT: PAD
|
|
; CM-NEXT: Fetch clause starting at 6:
|
|
; CM-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
|
|
; CM-NEXT: ALU clause starting at 8:
|
|
; CM-NEXT: MOV * T0.X, KC0[2].Z,
|
|
; CM-NEXT: ALU clause starting at 9:
|
|
; CM-NEXT: AND_INT * T0.W, KC0[2].Y, literal.x,
|
|
; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
|
|
; CM-NEXT: LSHL * T0.W, PV.W, literal.x,
|
|
; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
|
|
; CM-NEXT: LSHL T0.X, T0.X, PV.W,
|
|
; CM-NEXT: LSHL * T0.W, literal.x, PV.W,
|
|
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
|
|
; CM-NEXT: MOV T0.Y, 0.0,
|
|
; CM-NEXT: MOV * T0.Z, 0.0,
|
|
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
|
|
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
%a = load i8, ptr addrspace(1) %in
|
|
%ext = zext i8 %a to i16
|
|
store i16 %ext, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
; Sign-extending scalar byte load: reads an i8 through the addrspace(1)
; pointer %in, sign-extends it to i16 and stores the result through %out.
; The three GCN check prefixes expect a signed byte load
; (buffer_load_sbyte or flat_load_sbyte) feeding a 16-bit store
; (buffer_store_short or flat_store_short) with no separate extend
; instruction in between. The EG and CM check prefixes expect a VTX_READ_8
; followed by a BFE_INT with an 8 literal and an AND_INT with 65535 before
; the value is shifted left for a MEM_RAT MSKOR store.
; NOTE(review): the BFE_INT is presumably the sign extension from 8 bits and
; the AND_INT the truncation to 16 bits - confirm against the R600 ISA docs.
; The assertion lines are autogenerated (see the NOTE on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py instead
; of editing them by hand.
define amdgpu_kernel void @global_sextload_i8_to_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
|
; GCN-NOHSA-SI-LABEL: global_sextload_i8_to_i16:
|
|
; GCN-NOHSA-SI: ; %bb.0:
|
|
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_sbyte v0, off, s[8:11], 0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_short v0, off, s[4:7], 0
|
|
; GCN-NOHSA-SI-NEXT: s_endpgm
|
|
;
|
|
; GCN-HSA-LABEL: global_sextload_i8_to_i16:
|
|
; GCN-HSA: ; %bb.0:
|
|
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
|
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
|
|
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
|
|
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
|
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN-HSA-NEXT: flat_load_sbyte v2, v[0:1]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-HSA-NEXT: flat_store_short v[0:1], v2
|
|
; GCN-HSA-NEXT: s_endpgm
|
|
;
|
|
; GCN-NOHSA-VI-LABEL: global_sextload_i8_to_i16:
|
|
; GCN-NOHSA-VI: ; %bb.0:
|
|
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NOHSA-VI-NEXT: buffer_load_sbyte v0, off, s[8:11], 0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_short v0, off, s[4:7], 0
|
|
; GCN-NOHSA-VI-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: global_sextload_i8_to_i16:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: TEX 0 @6
|
|
; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
|
|
; EG-NEXT: CF_END
|
|
; EG-NEXT: PAD
|
|
; EG-NEXT: Fetch clause starting at 6:
|
|
; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
|
|
; EG-NEXT: ALU clause starting at 8:
|
|
; EG-NEXT: MOV * T0.X, KC0[2].Z,
|
|
; EG-NEXT: ALU clause starting at 9:
|
|
; EG-NEXT: BFE_INT T0.W, T0.X, 0.0, literal.x,
|
|
; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), 3(4.203895e-45)
|
|
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
|
|
; EG-NEXT: LSHL * T1.W, PS, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
|
|
; EG-NEXT: LSHL T0.X, PV.W, PS,
|
|
; EG-NEXT: LSHL * T0.W, literal.x, PS,
|
|
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
|
|
; EG-NEXT: MOV T0.Y, 0.0,
|
|
; EG-NEXT: MOV * T0.Z, 0.0,
|
|
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
;
|
|
; CM-LABEL: global_sextload_i8_to_i16:
|
|
; CM: ; %bb.0:
|
|
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: TEX 0 @6
|
|
; CM-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: MEM_RAT MSKOR T0.XW, T1.X
|
|
; CM-NEXT: CF_END
|
|
; CM-NEXT: PAD
|
|
; CM-NEXT: Fetch clause starting at 6:
|
|
; CM-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
|
|
; CM-NEXT: ALU clause starting at 8:
|
|
; CM-NEXT: MOV * T0.X, KC0[2].Z,
|
|
; CM-NEXT: ALU clause starting at 9:
|
|
; CM-NEXT: AND_INT T0.Z, KC0[2].Y, literal.x,
|
|
; CM-NEXT: BFE_INT * T0.W, T0.X, 0.0, literal.y,
|
|
; CM-NEXT: 3(4.203895e-45), 8(1.121039e-44)
|
|
; CM-NEXT: AND_INT T1.Z, PV.W, literal.x,
|
|
; CM-NEXT: LSHL * T0.W, PV.Z, literal.y,
|
|
; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
|
|
; CM-NEXT: LSHL T0.X, PV.Z, PV.W,
|
|
; CM-NEXT: LSHL * T0.W, literal.x, PV.W,
|
|
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
|
|
; CM-NEXT: MOV T0.Y, 0.0,
|
|
; CM-NEXT: MOV * T0.Z, 0.0,
|
|
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
|
|
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
%a = load i8, ptr addrspace(1) %in
|
|
%ext = sext i8 %a to i16
|
|
store i16 %ext, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
; Zero-extending single-element vector load: reads a <1 x i8> through the
; addrspace(1) pointer %in, zero-extends it to <1 x i16> and stores the
; result through %out.
; The three GCN check prefixes expect one unsigned byte load
; (buffer_load_ubyte or flat_load_ubyte) feeding one 16-bit store
; (buffer_store_short or flat_store_short); no vector-specific instructions
; appear. The EG and CM check prefixes expect a VTX_READ_8 whose result,
; together with a 65535 literal, is shifted left before a MEM_RAT MSKOR
; store.
; The assertion lines are autogenerated (see the NOTE on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py instead
; of editing them by hand.
define amdgpu_kernel void @global_zextload_v1i8_to_v1i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
|
; GCN-NOHSA-SI-LABEL: global_zextload_v1i8_to_v1i16:
|
|
; GCN-NOHSA-SI: ; %bb.0:
|
|
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_short v0, off, s[4:7], 0
|
|
; GCN-NOHSA-SI-NEXT: s_endpgm
|
|
;
|
|
; GCN-HSA-LABEL: global_zextload_v1i8_to_v1i16:
|
|
; GCN-HSA: ; %bb.0:
|
|
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
|
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
|
|
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
|
|
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
|
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN-HSA-NEXT: flat_load_ubyte v2, v[0:1]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-HSA-NEXT: flat_store_short v[0:1], v2
|
|
; GCN-HSA-NEXT: s_endpgm
|
|
;
|
|
; GCN-NOHSA-VI-LABEL: global_zextload_v1i8_to_v1i16:
|
|
; GCN-NOHSA-VI: ; %bb.0:
|
|
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NOHSA-VI-NEXT: buffer_load_ubyte v0, off, s[8:11], 0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_short v0, off, s[4:7], 0
|
|
; GCN-NOHSA-VI-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: global_zextload_v1i8_to_v1i16:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: TEX 0 @6
|
|
; EG-NEXT: ALU 10, @9, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
|
|
; EG-NEXT: CF_END
|
|
; EG-NEXT: PAD
|
|
; EG-NEXT: Fetch clause starting at 6:
|
|
; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
|
|
; EG-NEXT: ALU clause starting at 8:
|
|
; EG-NEXT: MOV * T0.X, KC0[2].Z,
|
|
; EG-NEXT: ALU clause starting at 9:
|
|
; EG-NEXT: AND_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
|
|
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
|
|
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
|
|
; EG-NEXT: LSHL T0.X, T0.X, PV.W,
|
|
; EG-NEXT: LSHL * T0.W, literal.x, PV.W,
|
|
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
|
|
; EG-NEXT: MOV T0.Y, 0.0,
|
|
; EG-NEXT: MOV * T0.Z, 0.0,
|
|
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
;
|
|
; CM-LABEL: global_zextload_v1i8_to_v1i16:
|
|
; CM: ; %bb.0:
|
|
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: TEX 0 @6
|
|
; CM-NEXT: ALU 10, @9, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: MEM_RAT MSKOR T0.XW, T1.X
|
|
; CM-NEXT: CF_END
|
|
; CM-NEXT: PAD
|
|
; CM-NEXT: Fetch clause starting at 6:
|
|
; CM-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
|
|
; CM-NEXT: ALU clause starting at 8:
|
|
; CM-NEXT: MOV * T0.X, KC0[2].Z,
|
|
; CM-NEXT: ALU clause starting at 9:
|
|
; CM-NEXT: AND_INT * T0.W, KC0[2].Y, literal.x,
|
|
; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
|
|
; CM-NEXT: LSHL * T0.W, PV.W, literal.x,
|
|
; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
|
|
; CM-NEXT: LSHL T0.X, T0.X, PV.W,
|
|
; CM-NEXT: LSHL * T0.W, literal.x, PV.W,
|
|
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
|
|
; CM-NEXT: MOV T0.Y, 0.0,
|
|
; CM-NEXT: MOV * T0.Z, 0.0,
|
|
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
|
|
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
%load = load <1 x i8>, ptr addrspace(1) %in
|
|
%ext = zext <1 x i8> %load to <1 x i16>
|
|
store <1 x i16> %ext, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
; Sign-extending single-element vector load: reads a <1 x i8> through the
; addrspace(1) pointer %in, sign-extends it to <1 x i16> and stores the
; result through %out.
; The three GCN check prefixes expect one signed byte load
; (buffer_load_sbyte or flat_load_sbyte) feeding one 16-bit store
; (buffer_store_short or flat_store_short); no vector-specific instructions
; appear. The EG and CM check prefixes expect a VTX_READ_8 followed by a
; BFE_INT with an 8 literal and an AND_INT with 65535 before the value is
; shifted left for a MEM_RAT MSKOR store.
; NOTE(review): the BFE_INT is presumably the sign extension from 8 bits and
; the AND_INT the truncation to 16 bits - confirm against the R600 ISA docs.
; The assertion lines are autogenerated (see the NOTE on the first line of
; this file); regenerate them with utils/update_llc_test_checks.py instead
; of editing them by hand.
define amdgpu_kernel void @global_sextload_v1i8_to_v1i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
|
; GCN-NOHSA-SI-LABEL: global_sextload_v1i8_to_v1i16:
|
|
; GCN-NOHSA-SI: ; %bb.0:
|
|
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_sbyte v0, off, s[8:11], 0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_short v0, off, s[4:7], 0
|
|
; GCN-NOHSA-SI-NEXT: s_endpgm
|
|
;
|
|
; GCN-HSA-LABEL: global_sextload_v1i8_to_v1i16:
|
|
; GCN-HSA: ; %bb.0:
|
|
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
|
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
|
|
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
|
|
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
|
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN-HSA-NEXT: flat_load_sbyte v2, v[0:1]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
|
|
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-HSA-NEXT: flat_store_short v[0:1], v2
|
|
; GCN-HSA-NEXT: s_endpgm
|
|
;
|
|
; GCN-NOHSA-VI-LABEL: global_sextload_v1i8_to_v1i16:
|
|
; GCN-NOHSA-VI: ; %bb.0:
|
|
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NOHSA-VI-NEXT: buffer_load_sbyte v0, off, s[8:11], 0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_short v0, off, s[4:7], 0
|
|
; GCN-NOHSA-VI-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: global_sextload_v1i8_to_v1i16:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: TEX 0 @6
|
|
; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
|
|
; EG-NEXT: CF_END
|
|
; EG-NEXT: PAD
|
|
; EG-NEXT: Fetch clause starting at 6:
|
|
; EG-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
|
|
; EG-NEXT: ALU clause starting at 8:
|
|
; EG-NEXT: MOV * T0.X, KC0[2].Z,
|
|
; EG-NEXT: ALU clause starting at 9:
|
|
; EG-NEXT: BFE_INT T0.W, T0.X, 0.0, literal.x,
|
|
; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), 3(4.203895e-45)
|
|
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
|
|
; EG-NEXT: LSHL * T1.W, PS, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
|
|
; EG-NEXT: LSHL T0.X, PV.W, PS,
|
|
; EG-NEXT: LSHL * T0.W, literal.x, PS,
|
|
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
|
|
; EG-NEXT: MOV T0.Y, 0.0,
|
|
; EG-NEXT: MOV * T0.Z, 0.0,
|
|
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
;
|
|
; CM-LABEL: global_sextload_v1i8_to_v1i16:
|
|
; CM: ; %bb.0:
|
|
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: TEX 0 @6
|
|
; CM-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: MEM_RAT MSKOR T0.XW, T1.X
|
|
; CM-NEXT: CF_END
|
|
; CM-NEXT: PAD
|
|
; CM-NEXT: Fetch clause starting at 6:
|
|
; CM-NEXT: VTX_READ_8 T0.X, T0.X, 0, #1
|
|
; CM-NEXT: ALU clause starting at 8:
|
|
; CM-NEXT: MOV * T0.X, KC0[2].Z,
|
|
; CM-NEXT: ALU clause starting at 9:
|
|
; CM-NEXT: AND_INT T0.Z, KC0[2].Y, literal.x,
|
|
; CM-NEXT: BFE_INT * T0.W, T0.X, 0.0, literal.y,
|
|
; CM-NEXT: 3(4.203895e-45), 8(1.121039e-44)
|
|
; CM-NEXT: AND_INT T1.Z, PV.W, literal.x,
|
|
; CM-NEXT: LSHL * T0.W, PV.Z, literal.y,
|
|
; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
|
|
; CM-NEXT: LSHL T0.X, PV.Z, PV.W,
|
|
; CM-NEXT: LSHL * T0.W, literal.x, PV.W,
|
|
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
|
|
; CM-NEXT: MOV T0.Y, 0.0,
|
|
; CM-NEXT: MOV * T0.Z, 0.0,
|
|
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
|
|
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
%load = load <1 x i8>, ptr addrspace(1) %in
|
|
%ext = sext <1 x i8> %load to <1 x i16>
|
|
store <1 x i16> %ext, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
; Loads a <2 x i8> vector from the global (addrspace 1) pointer %in,
; zero-extends it to <2 x i16> and stores the result to %out.
; The per-target assertions inside the body are autogenerated by
; utils/update_llc_test_checks.py; regenerate them instead of hand-editing.
define amdgpu_kernel void @global_zextload_v2i8_to_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GCN-NOHSA-SI-LABEL: global_zextload_v2i8_to_v2i16:
; GCN-NOHSA-SI: ; %bb.0:
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
; GCN-NOHSA-SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v1, 8, v0
; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v0, v0, v1
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xff00ff, v0
; GCN-NOHSA-SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_zextload_v2i8_to_v2i16:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: flat_load_ushort v2, v[0:1]
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: v_lshlrev_b32_e32 v3, 8, v2
; GCN-HSA-NEXT: v_or_b32_e32 v2, v2, v3
; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xff00ff, v2
; GCN-HSA-NEXT: flat_store_dword v[0:1], v2
; GCN-HSA-NEXT: s_endpgm
;
; GCN-NOHSA-VI-LABEL: global_zextload_v2i8_to_v2i16:
; GCN-NOHSA-VI: ; %bb.0:
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v1, 8, v0
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v1, 0xff0000, v1
; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_v2i8_to_v2i16:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 7, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.X, T6.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_16 T5.X, T5.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T5.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: LSHL * T0.W, T5.X, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
; EG-NEXT: AND_INT * T1.W, T5.X, literal.y,
; EG-NEXT: 16711680(2.341805e-38), 255(3.573311e-43)
; EG-NEXT: OR_INT T5.X, PS, PV.W,
; EG-NEXT: LSHR * T6.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: global_zextload_v2i8_to_v2i16:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @6
; CM-NEXT: ALU 7, @9, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5.X, T6.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_16 T5.X, T5.X, 0, #1
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T5.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 9:
; CM-NEXT: LSHL * T0.W, T5.X, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, PV.W, literal.x,
; CM-NEXT: AND_INT * T0.W, T5.X, literal.y,
; CM-NEXT: 16711680(2.341805e-38), 255(3.573311e-43)
; CM-NEXT: OR_INT * T5.X, PV.W, PV.Z,
; CM-NEXT: LSHR * T6.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %load = load <2 x i8>, ptr addrspace(1) %in
  %ext = zext <2 x i8> %load to <2 x i16>
  store <2 x i16> %ext, ptr addrspace(1) %out
  ret void
}
|
|
|
|
; Loads a <2 x i8> vector from the global (addrspace 1) pointer %in,
; sign-extends it to <2 x i16> and stores the result to %out.
; The per-target assertions inside the body are autogenerated by
; utils/update_llc_test_checks.py; regenerate them instead of hand-editing.
define amdgpu_kernel void @global_sextload_v2i8_to_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GCN-NOHSA-SI-LABEL: global_sextload_v2i8_to_v2i16:
; GCN-NOHSA-SI: ; %bb.0:
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
; GCN-NOHSA-SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v1, v0, 8, 8
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v0, 0, 8
; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v0, v0, v1
; GCN-NOHSA-SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_sextload_v2i8_to_v2i16:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: flat_load_ushort v2, v[0:1]
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: v_bfe_i32 v3, v2, 8, 8
; GCN-HSA-NEXT: v_bfe_i32 v2, v2, 0, 8
; GCN-HSA-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GCN-HSA-NEXT: v_or_b32_e32 v2, v2, v3
; GCN-HSA-NEXT: flat_store_dword v[0:1], v2
; GCN-HSA-NEXT: s_endpgm
;
; GCN-NOHSA-VI-LABEL: global_sextload_v2i8_to_v2i16:
; GCN-NOHSA-VI: ; %bb.0:
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0xffff
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, 8
; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_and_b32_sdwa v1, v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_sdwa v0, v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v0, v1, v0
; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_v2i8_to_v2i16:
; EG: ; %bb.0:
; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 16, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.X, T6.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_16 T5.X, T5.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.Y, T2.X,
; EG-NEXT: MOV * T5.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 10:
; EG-NEXT: AND_INT T0.W, T5.X, literal.x,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), -65536(nan)
; EG-NEXT: OR_INT * T0.W, PS, PV.W,
; EG-NEXT: MOV * T2.X, PV.W,
; EG-NEXT: MOV * T0.Y, PV.X,
; EG-NEXT: LSHR * T1.W, PV.Y, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T0.Z, T0.W, 0.0, literal.x,
; EG-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: LSHL T0.W, PV.W, literal.x,
; EG-NEXT: AND_INT * T1.W, PV.Z, literal.y,
; EG-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
; EG-NEXT: OR_INT T5.X, PS, PV.W,
; EG-NEXT: LSHR * T6.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: global_sextload_v2i8_to_v2i16:
; CM: ; %bb.0:
; CM-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @6
; CM-NEXT: ALU 16, @10, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5.X, T6.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_16 T5.X, T5.X, 0, #1
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T0.Y, T2.X,
; CM-NEXT: MOV * T5.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 10:
; CM-NEXT: AND_INT T0.Z, T5.X, literal.x,
; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), -65536(nan)
; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z,
; CM-NEXT: MOV * T2.X, PV.W,
; CM-NEXT: MOV * T0.Y, PV.X,
; CM-NEXT: LSHR * T1.W, PV.Y, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: BFE_INT T0.Z, T0.W, 0.0, literal.x,
; CM-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: LSHL T1.Z, PV.W, literal.x,
; CM-NEXT: AND_INT * T0.W, PV.Z, literal.y,
; CM-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
; CM-NEXT: OR_INT * T5.X, PV.W, PV.Z,
; CM-NEXT: LSHR * T6.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
  %load = load <2 x i8>, ptr addrspace(1) %in
  %ext = sext <2 x i8> %load to <2 x i16>
  store <2 x i16> %ext, ptr addrspace(1) %out
  ret void
}
|
|
|
|
; Loads a <4 x i8> vector from the global (addrspace 1) pointer %in,
; zero-extends it to <4 x i16> and stores the result to %out.
; The per-target assertions inside the body are autogenerated by
; utils/update_llc_test_checks.py; regenerate them instead of hand-editing.
define amdgpu_kernel void @global_zextload_v4i8_to_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GCN-NOHSA-SI-LABEL: global_zextload_v4i8_to_v4i16:
; GCN-NOHSA-SI: ; %bb.0:
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v1, 0xff00, v0
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 24, v0
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v3, 0xff, v0
; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v0, v2, v0, 16
; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v2, 8, v1
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v0, v3, v2
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_zextload_v4i8_to_v4i16:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: flat_load_dword v2, v[0:1]
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xff00, v2
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 24, v2
; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xff, v2
; GCN-HSA-NEXT: v_alignbit_b32 v2, v4, v2, 16
; GCN-HSA-NEXT: v_lshlrev_b32_e32 v4, 8, v3
; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v2
; GCN-HSA-NEXT: v_or_b32_e32 v2, v5, v4
; GCN-HSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN-HSA-NEXT: s_endpgm
;
; GCN-NOHSA-VI-LABEL: global_zextload_v4i8_to_v4i16:
; GCN-NOHSA-VI: ; %bb.0:
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 24, v0
; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v2, 8, v0
; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v1, v1, v0, 16
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, 0xff0000, v2
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v1, 0xff00ff, v1
; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_v4i8_to_v4i16:
; EG: ; %bb.0:
; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 31, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XY, T7.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_32 T7.X, T7.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.Y, T4.X,
; EG-NEXT: MOV * T7.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 10:
; EG-NEXT: AND_INT T0.W, T7.X, literal.x,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
; EG-NEXT: 255(3.573311e-43), -65536(nan)
; EG-NEXT: OR_INT * T0.W, PS, PV.W,
; EG-NEXT: MOV * T4.X, PV.W,
; EG-NEXT: MOV T0.Y, PV.X,
; EG-NEXT: LSHL * T0.W, T7.X, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
; EG-NEXT: OR_INT * T0.W, PV.W, PS,
; EG-NEXT: MOV T4.X, PV.W,
; EG-NEXT: MOV T0.Y, T5.X,
; EG-NEXT: MOV * T0.W, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: BFE_UINT T0.W, T7.X, literal.x, PV.W,
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
; EG-NEXT: 16(2.242078e-44), -65536(nan)
; EG-NEXT: OR_INT * T0.W, PS, PV.W,
; EG-NEXT: MOV * T5.X, PV.W,
; EG-NEXT: MOV T0.Y, PV.X,
; EG-NEXT: LSHR * T0.W, T7.X, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
; EG-NEXT: LSHR T7.X, KC0[2].Y, literal.x,
; EG-NEXT: OR_INT * T8.Y, PV.W, PS,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: MOV T5.X, PV.Y,
; EG-NEXT: MOV * T8.X, T4.X,
;
; CM-LABEL: global_zextload_v4i8_to_v4i16:
; CM: ; %bb.0:
; CM-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @6
; CM-NEXT: ALU 31, @10, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T8, T7.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_32 T7.X, T7.X, 0, #1
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T0.Y, T4.X,
; CM-NEXT: MOV * T7.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 10:
; CM-NEXT: AND_INT T0.Z, T7.X, literal.x,
; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y,
; CM-NEXT: 255(3.573311e-43), -65536(nan)
; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z,
; CM-NEXT: MOV * T4.X, PV.W,
; CM-NEXT: MOV T0.Y, PV.X,
; CM-NEXT: LSHL * T0.W, T7.X, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
; CM-NEXT: AND_INT * T0.W, PV.W, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T4.X, PV.W,
; CM-NEXT: MOV T0.Y, T5.X,
; CM-NEXT: MOV * T0.W, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
; CM-NEXT: BFE_UINT * T0.W, T7.X, literal.y, PV.W,
; CM-NEXT: -65536(nan), 16(2.242078e-44)
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV * T5.X, PV.W,
; CM-NEXT: MOV T0.Y, PV.X,
; CM-NEXT: LSHR * T0.W, T7.X, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
; CM-NEXT: AND_INT * T0.W, PV.W, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
; CM-NEXT: LSHR T7.X, KC0[2].Y, literal.x,
; CM-NEXT: OR_INT * T8.Y, PV.Z, PV.W,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: MOV * T5.X, PV.Y,
; CM-NEXT: MOV * T8.X, T4.X,
  %load = load <4 x i8>, ptr addrspace(1) %in
  %ext = zext <4 x i8> %load to <4 x i16>
  store <4 x i16> %ext, ptr addrspace(1) %out
  ret void
}
|
|
|
|
; Loads a <4 x i8> vector from the global (addrspace 1) pointer %in,
; sign-extends it to <4 x i16> and stores the result to %out.
; The per-target assertions inside the body are autogenerated by
; utils/update_llc_test_checks.py; regenerate them instead of hand-editing.
define amdgpu_kernel void @global_sextload_v4i8_to_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GCN-NOHSA-SI-LABEL: global_sextload_v4i8_to_v4i16:
; GCN-NOHSA-SI: ; %bb.0:
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 24, v0
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v0, 16, 8
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v3, v0, 8, 8
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v0, 0, 8
; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v1, v2, v1
; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v0, v0, v3
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_sextload_v4i8_to_v4i16:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: flat_load_dword v2, v[0:1]
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 24, v2
; GCN-HSA-NEXT: v_bfe_i32 v4, v2, 16, 8
; GCN-HSA-NEXT: v_bfe_i32 v5, v2, 8, 8
; GCN-HSA-NEXT: v_bfe_i32 v2, v2, 0, 8
; GCN-HSA-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GCN-HSA-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GCN-HSA-NEXT: v_or_b32_e32 v3, v4, v3
; GCN-HSA-NEXT: v_or_b32_e32 v2, v2, v5
; GCN-HSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN-HSA-NEXT: s_endpgm
;
; GCN-NOHSA-VI-LABEL: global_sextload_v4i8_to_v4i16:
; GCN-NOHSA-VI: ; %bb.0:
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, 0xf000
; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, -1
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s6
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s7
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s2
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s3
; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 8
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, 0xffff
; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s0
; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s1
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_sdwa v1, v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GCN-NOHSA-VI-NEXT: v_and_b32_sdwa v4, v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v0, 24, v0
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GCN-NOHSA-VI-NEXT: v_and_b32_sdwa v2, v2, sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v0, v4, v1
; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v1, v2, v5
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_v4i8_to_v4i16:
; EG: ; %bb.0:
; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 37, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XY, T7.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_32 T7.X, T7.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.Y, T4.X,
; EG-NEXT: MOV * T7.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 10:
; EG-NEXT: BFE_INT * T0.W, T7.X, 0.0, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), -65536(nan)
; EG-NEXT: OR_INT * T0.W, PS, PV.W,
; EG-NEXT: MOV * T4.X, PV.W,
; EG-NEXT: MOV T0.Y, PV.X,
; EG-NEXT: LSHR * T0.W, T7.X, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV T4.X, PV.W,
; EG-NEXT: MOV T0.Y, T5.X,
; EG-NEXT: LSHR * T0.W, T7.X, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
; EG-NEXT: 8(1.121039e-44), -65536(nan)
; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT: MOV * T5.X, PV.W,
; EG-NEXT: MOV T0.Y, PV.X,
; EG-NEXT: ASHR * T0.W, T7.X, literal.x,
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; EG-NEXT: LSHR T7.X, KC0[2].Y, literal.x,
; EG-NEXT: OR_INT * T8.Y, PV.W, PS,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: MOV T5.X, PV.Y,
; EG-NEXT: MOV * T8.X, T4.X,
;
; CM-LABEL: global_sextload_v4i8_to_v4i16:
; CM: ; %bb.0:
; CM-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @6
; CM-NEXT: ALU 37, @10, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T8, T7.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_32 T7.X, T7.X, 0, #1
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T0.Y, T4.X,
; CM-NEXT: MOV * T7.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 10:
; CM-NEXT: BFE_INT * T0.W, T7.X, 0.0, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, PV.W, literal.x,
; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), -65536(nan)
; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z,
; CM-NEXT: MOV * T4.X, PV.W,
; CM-NEXT: MOV T0.Y, PV.X,
; CM-NEXT: LSHR * T0.W, T7.X, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV T4.X, PV.W,
; CM-NEXT: MOV T0.Y, T5.X,
; CM-NEXT: LSHR * T0.W, T7.X, literal.x, BS:VEC_120/SCL_212
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
; CM-NEXT: AND_INT * T0.W, PV.W, literal.y,
; CM-NEXT: -65536(nan), 65535(9.183409e-41)
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
; CM-NEXT: MOV * T5.X, PV.W,
; CM-NEXT: MOV T0.Y, PV.X,
; CM-NEXT: ASHR * T0.W, T7.X, literal.x,
; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; CM-NEXT: LSHR T7.X, KC0[2].Y, literal.x,
; CM-NEXT: OR_INT * T8.Y, PV.Z, PV.W,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: MOV * T5.X, PV.Y,
; CM-NEXT: MOV * T8.X, T4.X,
  %load = load <4 x i8>, ptr addrspace(1) %in
  %ext = sext <4 x i8> %load to <4 x i16>
  store <4 x i16> %ext, ptr addrspace(1) %out
  ret void
}
|
|
|
|
define amdgpu_kernel void @global_zextload_v8i8_to_v8i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
|
; GCN-NOHSA-SI-LABEL: global_zextload_v8i8_to_v8i16:
|
|
; GCN-NOHSA-SI: ; %bb.0:
|
|
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xff00, v0
|
|
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 24, v0
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xff00, v1
|
|
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 24, v1
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xff, v1
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v7, 0xff, v0
|
|
; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v1, v5, v1, 16
|
|
; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
|
|
; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v0, v3, v0, 16
|
|
; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v5, 8, v2
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v3, 0xff00ff, v1
|
|
; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v2, v6, v4
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
|
|
; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v0, v7, v5
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
|
|
; GCN-NOHSA-SI-NEXT: s_endpgm
|
|
;
|
|
; GCN-HSA-LABEL: global_zextload_v8i8_to_v8i16:
|
|
; GCN-HSA: ; %bb.0:
|
|
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
|
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
|
|
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
|
|
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
|
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN-HSA-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
|
|
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xff00, v0
|
|
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 24, v0
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xff00, v1
|
|
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 24, v1
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff, v1
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xff, v0
|
|
; GCN-HSA-NEXT: v_alignbit_b32 v1, v7, v1, 16
|
|
; GCN-HSA-NEXT: v_lshlrev_b32_e32 v6, 8, v6
|
|
; GCN-HSA-NEXT: v_alignbit_b32 v0, v3, v0, 16
|
|
; GCN-HSA-NEXT: v_lshlrev_b32_e32 v7, 8, v2
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v1
|
|
; GCN-HSA-NEXT: v_or_b32_e32 v2, v8, v6
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
|
|
; GCN-HSA-NEXT: v_or_b32_e32 v0, v9, v7
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
|
|
; GCN-HSA-NEXT: s_endpgm
|
|
;
|
|
; GCN-NOHSA-VI-LABEL: global_zextload_v8i8_to_v8i16:
|
|
; GCN-NOHSA-VI: ; %bb.0:
|
|
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
|
|
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v1
|
|
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s5, s4, 24
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_u32 s6, s4, 0x80010
|
|
; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s4, 0xff
|
|
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s4, s4, 8
|
|
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 24, v0
|
|
; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v1, 8, v0
|
|
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s5, s5, 16
|
|
; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xff0000
|
|
; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v2, v2, v0, 16
|
|
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v3, 0xff0000, v1
|
|
; GCN-NOHSA-VI-NEXT: s_or_b32 s5, s6, s5
|
|
; GCN-NOHSA-VI-NEXT: s_or_b32 s4, s7, s4
|
|
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v1, 0xff00ff, v2
|
|
; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
|
; GCN-NOHSA-VI-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: global_zextload_v8i8_to_v8i16:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: TEX 0 @6
|
|
; EG-NEXT: ALU 61, @10, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T11.X, 1
|
|
; EG-NEXT: CF_END
|
|
; EG-NEXT: PAD
|
|
; EG-NEXT: Fetch clause starting at 6:
|
|
; EG-NEXT: VTX_READ_64 T11.XY, T11.X, 0, #1
|
|
; EG-NEXT: ALU clause starting at 8:
|
|
; EG-NEXT: MOV * T0.Y, T8.X,
|
|
; EG-NEXT: MOV * T11.X, KC0[2].Z,
|
|
; EG-NEXT: ALU clause starting at 10:
|
|
; EG-NEXT: AND_INT T0.W, T11.X, literal.x,
|
|
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
|
|
; EG-NEXT: 255(3.573311e-43), -65536(nan)
|
|
; EG-NEXT: OR_INT * T0.W, PS, PV.W,
|
|
; EG-NEXT: MOV * T8.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, PV.X,
|
|
; EG-NEXT: LSHL * T0.W, T11.X, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
|
|
; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
|
|
; EG-NEXT: OR_INT * T0.W, PV.W, PS,
|
|
; EG-NEXT: MOV T8.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, T9.X,
|
|
; EG-NEXT: MOV * T0.W, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_UINT T1.W, T11.X, literal.x, PV.W,
|
|
; EG-NEXT: AND_INT * T2.W, PV.Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), -65536(nan)
|
|
; EG-NEXT: OR_INT * T1.W, PS, PV.W,
|
|
; EG-NEXT: MOV * T9.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, PV.X,
|
|
; EG-NEXT: LSHR * T1.W, T11.X, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
|
|
; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
|
|
; EG-NEXT: OR_INT * T12.Y, PV.W, PS,
|
|
; EG-NEXT: MOV T9.X, PV.Y,
|
|
; EG-NEXT: MOV * T0.Y, T4.X,
|
|
; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
|
|
; EG-NEXT: AND_INT * T2.W, T11.Y, literal.y,
|
|
; EG-NEXT: -65536(nan), 255(3.573311e-43)
|
|
; EG-NEXT: OR_INT * T1.W, PV.W, PS,
|
|
; EG-NEXT: MOV * T4.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, PV.X,
|
|
; EG-NEXT: LSHL * T1.W, T11.Y, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
|
|
; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
|
|
; EG-NEXT: OR_INT * T1.W, PV.W, PS,
|
|
; EG-NEXT: MOV T4.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, T5.X,
|
|
; EG-NEXT: BFE_UINT * T0.W, T11.Y, literal.x, T0.W,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.x,
|
|
; EG-NEXT: -65536(nan), 0(0.000000e+00)
|
|
; EG-NEXT: OR_INT * T0.W, PV.W, T0.W,
|
|
; EG-NEXT: MOV * T5.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, PV.X,
|
|
; EG-NEXT: LSHR * T0.W, T11.Y, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
|
|
; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
|
|
; EG-NEXT: LSHR T11.X, KC0[2].Y, literal.x,
|
|
; EG-NEXT: OR_INT * T12.W, PV.W, PS,
|
|
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
; EG-NEXT: MOV T5.X, PV.W,
|
|
; EG-NEXT: MOV * T12.X, T8.X,
|
|
; EG-NEXT: MOV * T12.Z, T4.X,
|
|
;
|
|
; CM-LABEL: global_zextload_v8i8_to_v8i16:
|
|
; CM: ; %bb.0:
|
|
; CM-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: TEX 0 @6
|
|
; CM-NEXT: ALU 60, @10, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T12, T11.X
|
|
; CM-NEXT: CF_END
|
|
; CM-NEXT: PAD
|
|
; CM-NEXT: Fetch clause starting at 6:
|
|
; CM-NEXT: VTX_READ_64 T11.XY, T11.X, 0, #1
|
|
; CM-NEXT: ALU clause starting at 8:
|
|
; CM-NEXT: MOV * T0.Y, T8.X,
|
|
; CM-NEXT: MOV * T11.X, KC0[2].Z,
|
|
; CM-NEXT: ALU clause starting at 10:
|
|
; CM-NEXT: AND_INT T0.Z, T11.X, literal.x,
|
|
; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y,
|
|
; CM-NEXT: 255(3.573311e-43), -65536(nan)
|
|
; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z,
|
|
; CM-NEXT: MOV * T8.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, PV.X,
|
|
; CM-NEXT: LSHL * T0.W, T11.X, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: AND_INT * T0.W, PV.W, literal.y,
|
|
; CM-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
|
|
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV T8.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, T9.X,
|
|
; CM-NEXT: MOV * T0.W, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: BFE_UINT * T1.W, T11.X, literal.y, PV.W,
|
|
; CM-NEXT: -65536(nan), 16(2.242078e-44)
|
|
; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV * T9.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, PV.X,
|
|
; CM-NEXT: LSHR * T1.W, T11.X, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: AND_INT * T1.W, PV.W, literal.y,
|
|
; CM-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
|
|
; CM-NEXT: OR_INT * T12.Y, PV.Z, PV.W,
|
|
; CM-NEXT: MOV T9.X, PV.Y,
|
|
; CM-NEXT: MOV * T0.Y, T4.X,
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: AND_INT * T1.W, T11.Y, literal.y,
|
|
; CM-NEXT: -65536(nan), 255(3.573311e-43)
|
|
; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV * T4.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, PV.X,
|
|
; CM-NEXT: LSHL * T1.W, T11.Y, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: AND_INT * T1.W, PV.W, literal.y,
|
|
; CM-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
|
|
; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV T4.X, PV.W,
|
|
; CM-NEXT: MOV * T0.Y, T5.X,
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: BFE_UINT * T0.W, T11.Y, literal.y, T0.W,
|
|
; CM-NEXT: -65536(nan), 16(2.242078e-44)
|
|
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV * T5.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, PV.X,
|
|
; CM-NEXT: LSHR * T0.W, T11.Y, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: AND_INT * T0.W, PV.W, literal.y,
|
|
; CM-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
|
|
; CM-NEXT: LSHR T11.X, KC0[2].Y, literal.x,
|
|
; CM-NEXT: OR_INT * T12.W, PV.Z, PV.W,
|
|
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
; CM-NEXT: MOV * T5.X, PV.W,
|
|
; CM-NEXT: MOV T12.X, T8.X,
|
|
; CM-NEXT: MOV * T12.Z, T4.X, BS:VEC_120/SCL_212
|
|
%load = load <8 x i8>, ptr addrspace(1) %in
|
|
%ext = zext <8 x i8> %load to <8 x i16>
|
|
store <8 x i16> %ext, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
; Sign-extending vector load test: loads <8 x i8> through the addrspace(1)
; pointer %in, sign-extends every element to i16, and stores the resulting
; <8 x i16> through the addrspace(1) pointer %out.
;
; The check lines inside the function are autogenerated by
; utils/update_llc_test_checks.py (see the NOTE at the top of this file).
; There is one group of expectations for each of the five check prefixes
; used by the RUN lines (the SI, HSA, VI, EG and CM configurations).
; Regenerate them with that script instead of editing them by hand.
;
; NOTE(review): the bare `|` lines interleaved with the content below are
; neither LLVM IR nor FileCheck syntax and look like extraction artifacts;
; confirm against the upstream test before relying on this copy.
define amdgpu_kernel void @global_sextload_v8i8_to_v8i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
|
; GCN-NOHSA-SI-LABEL: global_sextload_v8i8_to_v8i16:
|
|
; GCN-NOHSA-SI: ; %bb.0:
|
|
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s0, v0
|
|
; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s1, v1
|
|
; GCN-NOHSA-SI-NEXT: s_ashr_i32 s6, s1, 24
|
|
; GCN-NOHSA-SI-NEXT: s_bfe_i32 s7, s1, 0x80010
|
|
; GCN-NOHSA-SI-NEXT: s_bfe_i32 s8, s1, 0x80008
|
|
; GCN-NOHSA-SI-NEXT: s_sext_i32_i8 s1, s1
|
|
; GCN-NOHSA-SI-NEXT: s_ashr_i32 s9, s0, 24
|
|
; GCN-NOHSA-SI-NEXT: s_bfe_i32 s10, s0, 0x80010
|
|
; GCN-NOHSA-SI-NEXT: s_bfe_i32 s11, s0, 0x80008
|
|
; GCN-NOHSA-SI-NEXT: s_sext_i32_i8 s0, s0
|
|
; GCN-NOHSA-SI-NEXT: s_lshl_b32 s6, s6, 16
|
|
; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, 0xffff
|
|
; GCN-NOHSA-SI-NEXT: s_lshl_b32 s8, s8, 16
|
|
; GCN-NOHSA-SI-NEXT: s_and_b32 s1, s1, 0xffff
|
|
; GCN-NOHSA-SI-NEXT: s_lshl_b32 s9, s9, 16
|
|
; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, 0xffff
|
|
; GCN-NOHSA-SI-NEXT: s_lshl_b32 s11, s11, 16
|
|
; GCN-NOHSA-SI-NEXT: s_and_b32 s0, s0, 0xffff
|
|
; GCN-NOHSA-SI-NEXT: s_or_b32 s6, s7, s6
|
|
; GCN-NOHSA-SI-NEXT: s_or_b32 s7, s1, s8
|
|
; GCN-NOHSA-SI-NEXT: s_or_b32 s8, s10, s9
|
|
; GCN-NOHSA-SI-NEXT: s_or_b32 s9, s0, s11
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s9
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s8
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s6
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
|
; GCN-NOHSA-SI-NEXT: s_endpgm
|
|
;
|
|
; GCN-HSA-LABEL: global_sextload_v8i8_to_v8i16:
|
|
; GCN-HSA: ; %bb.0:
|
|
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
|
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
|
|
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
|
|
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
|
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN-HSA-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
|
|
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-HSA-NEXT: v_readfirstlane_b32 s0, v0
|
|
; GCN-HSA-NEXT: v_readfirstlane_b32 s1, v1
|
|
; GCN-HSA-NEXT: s_ashr_i32 s2, s1, 24
|
|
; GCN-HSA-NEXT: s_bfe_i32 s3, s1, 0x80010
|
|
; GCN-HSA-NEXT: s_bfe_i32 s4, s1, 0x80008
|
|
; GCN-HSA-NEXT: s_sext_i32_i8 s1, s1
|
|
; GCN-HSA-NEXT: s_ashr_i32 s5, s0, 24
|
|
; GCN-HSA-NEXT: s_bfe_i32 s6, s0, 0x80010
|
|
; GCN-HSA-NEXT: s_bfe_i32 s7, s0, 0x80008
|
|
; GCN-HSA-NEXT: s_sext_i32_i8 s0, s0
|
|
; GCN-HSA-NEXT: s_lshl_b32 s2, s2, 16
|
|
; GCN-HSA-NEXT: s_and_b32 s3, s3, 0xffff
|
|
; GCN-HSA-NEXT: s_lshl_b32 s4, s4, 16
|
|
; GCN-HSA-NEXT: s_and_b32 s1, s1, 0xffff
|
|
; GCN-HSA-NEXT: s_lshl_b32 s5, s5, 16
|
|
; GCN-HSA-NEXT: s_and_b32 s6, s6, 0xffff
|
|
; GCN-HSA-NEXT: s_lshl_b32 s7, s7, 16
|
|
; GCN-HSA-NEXT: s_and_b32 s0, s0, 0xffff
|
|
; GCN-HSA-NEXT: s_or_b32 s2, s3, s2
|
|
; GCN-HSA-NEXT: s_or_b32 s1, s1, s4
|
|
; GCN-HSA-NEXT: s_or_b32 s3, s6, s5
|
|
; GCN-HSA-NEXT: s_or_b32 s0, s0, s7
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s1
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s2
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
|
|
; GCN-HSA-NEXT: s_endpgm
|
|
;
|
|
; GCN-NOHSA-VI-LABEL: global_sextload_v8i8_to_v8i16:
|
|
; GCN-NOHSA-VI: ; %bb.0:
|
|
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
|
|
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v0
|
|
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v1
|
|
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s4, 16
|
|
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s7, s5, 16
|
|
; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s8, s5
|
|
; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s11, s4
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_i32 s9, s5, 0x80000
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_i32 s10, s4, 0x80000
|
|
; GCN-NOHSA-VI-NEXT: s_ashr_i32 s5, s5, 24
|
|
; GCN-NOHSA-VI-NEXT: s_ashr_i32 s4, s4, 24
|
|
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s8, s8, 8
|
|
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s11, s11, 8
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_i32 s7, s7, 0x80000
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_i32 s6, s6, 0x80000
|
|
; GCN-NOHSA-VI-NEXT: s_and_b32 s9, 0xffff, s9
|
|
; GCN-NOHSA-VI-NEXT: s_and_b32 s10, 0xffff, s10
|
|
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s5, s5, 16
|
|
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s4, s4, 16
|
|
; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, 0xffff0000
|
|
; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, 0xffff0000
|
|
; GCN-NOHSA-VI-NEXT: s_and_b32 s7, 0xffff, s7
|
|
; GCN-NOHSA-VI-NEXT: s_and_b32 s6, 0xffff, s6
|
|
; GCN-NOHSA-VI-NEXT: s_or_b32 s8, s9, s8
|
|
; GCN-NOHSA-VI-NEXT: s_or_b32 s9, s10, s11
|
|
; GCN-NOHSA-VI-NEXT: s_or_b32 s5, s7, s5
|
|
; GCN-NOHSA-VI-NEXT: s_or_b32 s4, s6, s4
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s9
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s4
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
|
; GCN-NOHSA-VI-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: global_sextload_v8i8_to_v8i16:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: TEX 0 @6
|
|
; EG-NEXT: ALU 74, @10, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T11.X, 1
|
|
; EG-NEXT: CF_END
|
|
; EG-NEXT: PAD
|
|
; EG-NEXT: Fetch clause starting at 6:
|
|
; EG-NEXT: VTX_READ_64 T11.XY, T11.X, 0, #1
|
|
; EG-NEXT: ALU clause starting at 8:
|
|
; EG-NEXT: MOV * T0.Y, T8.X,
|
|
; EG-NEXT: MOV * T11.X, KC0[2].Z,
|
|
; EG-NEXT: ALU clause starting at 10:
|
|
; EG-NEXT: BFE_INT * T0.W, T11.X, 0.0, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
|
|
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), -65536(nan)
|
|
; EG-NEXT: OR_INT * T0.W, PS, PV.W,
|
|
; EG-NEXT: MOV * T8.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, PV.X,
|
|
; EG-NEXT: LSHR * T0.W, T11.X, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
|
|
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
|
|
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
|
|
; EG-NEXT: MOV T8.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, T9.X,
|
|
; EG-NEXT: LSHR * T0.W, T11.X, literal.x, BS:VEC_120/SCL_212
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
|
|
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), -65536(nan)
|
|
; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
|
|
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
|
|
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
|
|
; EG-NEXT: MOV * T9.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, PV.X,
|
|
; EG-NEXT: ASHR * T0.W, T11.X, literal.x,
|
|
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
|
|
; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
|
|
; EG-NEXT: OR_INT * T12.Y, PV.W, PS,
|
|
; EG-NEXT: MOV T9.X, PV.Y,
|
|
; EG-NEXT: MOV T0.Y, T4.X,
|
|
; EG-NEXT: BFE_INT * T0.W, T11.Y, 0.0, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
|
|
; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
|
|
; EG-NEXT: -65536(nan), 65535(9.183409e-41)
|
|
; EG-NEXT: OR_INT * T0.W, PV.W, PS,
|
|
; EG-NEXT: MOV * T4.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, PV.X,
|
|
; EG-NEXT: LSHR * T0.W, T11.Y, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
|
|
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
|
|
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
|
|
; EG-NEXT: MOV T4.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, T5.X,
|
|
; EG-NEXT: LSHR * T0.W, T11.Y, literal.x,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
|
|
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), -65536(nan)
|
|
; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
|
|
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
|
|
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
|
|
; EG-NEXT: MOV * T5.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, PV.X,
|
|
; EG-NEXT: ASHR * T0.W, T11.Y, literal.x,
|
|
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
|
|
; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
|
|
; EG-NEXT: LSHR T11.X, KC0[2].Y, literal.x,
|
|
; EG-NEXT: OR_INT * T12.W, PV.W, PS,
|
|
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
; EG-NEXT: MOV T5.X, PV.W,
|
|
; EG-NEXT: MOV * T12.X, T8.X,
|
|
; EG-NEXT: MOV * T12.Z, T4.X,
|
|
;
|
|
; CM-LABEL: global_sextload_v8i8_to_v8i16:
|
|
; CM: ; %bb.0:
|
|
; CM-NEXT: ALU 1, @8, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: TEX 0 @6
|
|
; CM-NEXT: ALU 74, @10, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T12, T11.X
|
|
; CM-NEXT: CF_END
|
|
; CM-NEXT: PAD
|
|
; CM-NEXT: Fetch clause starting at 6:
|
|
; CM-NEXT: VTX_READ_64 T11.XY, T11.X, 0, #1
|
|
; CM-NEXT: ALU clause starting at 8:
|
|
; CM-NEXT: MOV * T0.Y, T8.X,
|
|
; CM-NEXT: MOV * T11.X, KC0[2].Z,
|
|
; CM-NEXT: ALU clause starting at 10:
|
|
; CM-NEXT: BFE_INT * T0.W, T11.X, 0.0, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, PV.W, literal.x,
|
|
; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y,
|
|
; CM-NEXT: 65535(9.183409e-41), -65536(nan)
|
|
; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z,
|
|
; CM-NEXT: MOV * T8.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, PV.X,
|
|
; CM-NEXT: LSHR * T0.W, T11.X, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
|
|
; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
|
|
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
|
|
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV T8.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, T9.X,
|
|
; CM-NEXT: LSHR * T0.W, T11.X, literal.x, BS:VEC_120/SCL_212
|
|
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
|
|
; CM-NEXT: AND_INT * T0.W, PV.W, literal.y,
|
|
; CM-NEXT: -65536(nan), 65535(9.183409e-41)
|
|
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV * T9.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, PV.X,
|
|
; CM-NEXT: ASHR * T0.W, T11.X, literal.x,
|
|
; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
|
|
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
|
|
; CM-NEXT: OR_INT * T12.Y, PV.Z, PV.W,
|
|
; CM-NEXT: MOV T9.X, PV.Y,
|
|
; CM-NEXT: MOV T0.Y, T4.X,
|
|
; CM-NEXT: BFE_INT * T0.W, T11.Y, 0.0, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: AND_INT * T0.W, PV.W, literal.y,
|
|
; CM-NEXT: -65536(nan), 65535(9.183409e-41)
|
|
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV * T4.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, PV.X,
|
|
; CM-NEXT: LSHR * T0.W, T11.Y, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
|
|
; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
|
|
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
|
|
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV T4.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, T5.X,
|
|
; CM-NEXT: LSHR * T0.W, T11.Y, literal.x,
|
|
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
|
|
; CM-NEXT: AND_INT * T0.W, PV.W, literal.y,
|
|
; CM-NEXT: -65536(nan), 65535(9.183409e-41)
|
|
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV * T5.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, PV.X,
|
|
; CM-NEXT: ASHR * T0.W, T11.Y, literal.x,
|
|
; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
|
|
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
|
|
; CM-NEXT: LSHR T11.X, KC0[2].Y, literal.x,
|
|
; CM-NEXT: OR_INT * T12.W, PV.Z, PV.W,
|
|
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
; CM-NEXT: MOV * T5.X, PV.W,
|
|
; CM-NEXT: MOV T12.X, T8.X,
|
|
; CM-NEXT: MOV * T12.Z, T4.X, BS:VEC_120/SCL_212
|
|
%load = load <8 x i8>, ptr addrspace(1) %in
|
|
%ext = sext <8 x i8> %load to <8 x i16>
|
|
store <8 x i16> %ext, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @global_zextload_v16i8_to_v16i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
|
; GCN-NOHSA-SI-LABEL: global_zextload_v16i8_to_v16i16:
|
|
; GCN-NOHSA-SI: ; %bb.0:
|
|
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xff00, v2
|
|
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 24, v2
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xff00, v3
|
|
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 24, v3
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xff00, v0
|
|
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 24, v0
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, 0xff00, v1
|
|
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v11, 24, v1
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xff, v1
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v13, 0xff, v0
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v14, 0xff, v3
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v15, 0xff, v2
|
|
; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v1, v11, v1, 16
|
|
; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10
|
|
; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v0, v9, v0, 16
|
|
; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8
|
|
; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v7, v7, v3, 16
|
|
; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6
|
|
; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v5, v5, v2, 16
|
|
; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v3, 0xff00ff, v1
|
|
; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v2, v12, v10
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
|
|
; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v0, v13, v8
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v7, 0xff00ff, v7
|
|
; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v6, v14, v6
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v5, 0xff00ff, v5
|
|
; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v4, v15, v4
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
|
; GCN-NOHSA-SI-NEXT: s_endpgm
|
|
;
|
|
; GCN-HSA-LABEL: global_zextload_v16i8_to_v16i16:
|
|
; GCN-HSA: ; %bb.0:
|
|
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
|
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
|
|
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
|
|
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
|
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
|
|
; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
|
|
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0
|
|
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xff00, v2
|
|
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 24, v2
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xff00, v3
|
|
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 24, v3
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xff00, v0
|
|
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 24, v0
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xff00, v1
|
|
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 24, v1
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xff, v3
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v19, 0xff, v2
|
|
; GCN-HSA-NEXT: v_alignbit_b32 v7, v7, v3, 16
|
|
; GCN-HSA-NEXT: v_lshlrev_b32_e32 v6, 8, v6
|
|
; GCN-HSA-NEXT: v_alignbit_b32 v5, v5, v2, 16
|
|
; GCN-HSA-NEXT: v_lshlrev_b32_e32 v4, 8, v4
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xff, v1
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xff, v0
|
|
; GCN-HSA-NEXT: v_alignbit_b32 v1, v15, v1, 16
|
|
; GCN-HSA-NEXT: v_lshlrev_b32_e32 v14, 8, v14
|
|
; GCN-HSA-NEXT: v_alignbit_b32 v0, v13, v0, 16
|
|
; GCN-HSA-NEXT: v_lshlrev_b32_e32 v12, 8, v12
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v7
|
|
; GCN-HSA-NEXT: v_or_b32_e32 v6, v18, v6
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v5
|
|
; GCN-HSA-NEXT: v_or_b32_e32 v4, v19, v4
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v1
|
|
; GCN-HSA-NEXT: v_or_b32_e32 v2, v16, v14
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
|
|
; GCN-HSA-NEXT: v_or_b32_e32 v0, v17, v12
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
|
|
; GCN-HSA-NEXT: s_endpgm
|
|
;
|
|
; GCN-NOHSA-VI-LABEL: global_zextload_v16i8_to_v16i16:
|
|
; GCN-NOHSA-VI: ; %bb.0:
|
|
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
|
|
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v3
|
|
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v1
|
|
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s5, 24
|
|
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s4, 24
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_u32 s10, s4, 0x80010
|
|
; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s4, 0xff
|
|
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s4, s4, 8
|
|
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v4, 24, v2
|
|
; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v1, 8, v0
|
|
; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v5, 8, v2
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_u32 s7, s5, 0x80010
|
|
; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s5, 0xff
|
|
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s5, s5, 8
|
|
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s6, s6, 16
|
|
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s9, s9, 16
|
|
; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xff0000
|
|
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 24, v0
|
|
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v6, 0xff0000, v1
|
|
; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v4, v4, v2, 16
|
|
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v7, 0xff0000, v5
|
|
; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xff0000
|
|
; GCN-NOHSA-VI-NEXT: s_or_b32 s6, s7, s6
|
|
; GCN-NOHSA-VI-NEXT: s_or_b32 s7, s10, s9
|
|
; GCN-NOHSA-VI-NEXT: s_or_b32 s4, s11, s4
|
|
; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v3, v3, v0, 16
|
|
; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
|
|
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v5, 0xff00ff, v4
|
|
; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v4, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
|
|
; GCN-NOHSA-VI-NEXT: s_or_b32 s5, s8, s5
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s4
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s7
|
|
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v1, 0xff00ff, v3
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s6
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
|
; GCN-NOHSA-VI-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: global_zextload_v16i8_to_v16i16:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: TEX 0 @8
|
|
; EG-NEXT: ALU 103, @12, KC0[], KC1[]
|
|
; EG-NEXT: ALU 20, @116, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T22.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T21.X, 1
|
|
; EG-NEXT: CF_END
|
|
; EG-NEXT: PAD
|
|
; EG-NEXT: Fetch clause starting at 8:
|
|
; EG-NEXT: VTX_READ_128 T19.XYZW, T19.X, 0, #1
|
|
; EG-NEXT: ALU clause starting at 10:
|
|
; EG-NEXT: MOV * T0.Y, T16.X,
|
|
; EG-NEXT: MOV * T19.X, KC0[2].Z,
|
|
; EG-NEXT: ALU clause starting at 12:
|
|
; EG-NEXT: AND_INT T0.W, T19.X, literal.x,
|
|
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
|
|
; EG-NEXT: 255(3.573311e-43), -65536(nan)
|
|
; EG-NEXT: OR_INT * T0.W, PS, PV.W,
|
|
; EG-NEXT: MOV * T16.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, PV.X,
|
|
; EG-NEXT: LSHL * T0.W, T19.X, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
|
|
; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
|
|
; EG-NEXT: OR_INT * T0.W, PV.W, PS,
|
|
; EG-NEXT: MOV T16.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, T17.X,
|
|
; EG-NEXT: MOV * T0.W, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_UINT T1.W, T19.X, literal.x, PV.W,
|
|
; EG-NEXT: AND_INT * T2.W, PV.Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), -65536(nan)
|
|
; EG-NEXT: OR_INT * T1.W, PS, PV.W,
|
|
; EG-NEXT: MOV * T17.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, PV.X,
|
|
; EG-NEXT: LSHR * T1.W, T19.X, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
|
|
; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
|
|
; EG-NEXT: OR_INT * T20.Y, PV.W, PS,
|
|
; EG-NEXT: MOV T17.X, PV.Y,
|
|
; EG-NEXT: MOV * T0.Y, T12.X,
|
|
; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
|
|
; EG-NEXT: AND_INT * T2.W, T19.Y, literal.y,
|
|
; EG-NEXT: -65536(nan), 255(3.573311e-43)
|
|
; EG-NEXT: OR_INT * T1.W, PV.W, PS,
|
|
; EG-NEXT: MOV * T12.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, PV.X,
|
|
; EG-NEXT: LSHL * T1.W, T19.Y, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
|
|
; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
|
|
; EG-NEXT: OR_INT * T1.W, PV.W, PS,
|
|
; EG-NEXT: MOV T12.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, T13.X,
|
|
; EG-NEXT: BFE_UINT * T1.W, T19.Y, literal.x, T0.W,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x,
|
|
; EG-NEXT: -65536(nan), 0(0.000000e+00)
|
|
; EG-NEXT: OR_INT * T1.W, PV.W, T1.W,
|
|
; EG-NEXT: MOV * T13.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, PV.X,
|
|
; EG-NEXT: LSHR * T1.W, T19.Y, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
|
|
; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
|
|
; EG-NEXT: OR_INT * T20.W, PV.W, PS,
|
|
; EG-NEXT: MOV T13.X, PV.W,
|
|
; EG-NEXT: MOV * T0.Y, T8.X,
|
|
; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
|
|
; EG-NEXT: AND_INT * T2.W, T19.Z, literal.y,
|
|
; EG-NEXT: -65536(nan), 255(3.573311e-43)
|
|
; EG-NEXT: OR_INT * T1.W, PV.W, PS,
|
|
; EG-NEXT: MOV * T8.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, PV.X,
|
|
; EG-NEXT: LSHL * T1.W, T19.Z, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
|
|
; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
|
|
; EG-NEXT: OR_INT * T1.W, PV.W, PS,
|
|
; EG-NEXT: MOV T8.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, T9.X,
|
|
; EG-NEXT: BFE_UINT * T1.W, T19.Z, literal.x, T0.W,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x,
|
|
; EG-NEXT: -65536(nan), 0(0.000000e+00)
|
|
; EG-NEXT: OR_INT * T1.W, PV.W, T1.W,
|
|
; EG-NEXT: MOV * T9.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, PV.X,
|
|
; EG-NEXT: LSHR * T1.W, T19.Z, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
|
|
; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
|
|
; EG-NEXT: OR_INT * T19.Y, PV.W, PS,
|
|
; EG-NEXT: MOV T9.X, PV.Y,
|
|
; EG-NEXT: MOV * T0.Y, T4.X,
|
|
; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
|
|
; EG-NEXT: AND_INT * T2.W, T19.W, literal.y,
|
|
; EG-NEXT: -65536(nan), 255(3.573311e-43)
|
|
; EG-NEXT: OR_INT * T1.W, PV.W, PS,
|
|
; EG-NEXT: MOV * T4.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, PV.X,
|
|
; EG-NEXT: LSHL * T1.W, T19.W, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
|
|
; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
|
|
; EG-NEXT: OR_INT * T1.W, PV.W, PS,
|
|
; EG-NEXT: MOV T4.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, T5.X,
|
|
; EG-NEXT: BFE_UINT * T0.W, T19.W, literal.x, T0.W,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: ALU clause starting at 116:
|
|
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.x,
|
|
; EG-NEXT: -65536(nan), 0(0.000000e+00)
|
|
; EG-NEXT: OR_INT * T0.W, PV.W, T0.W,
|
|
; EG-NEXT: MOV * T5.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, PV.X,
|
|
; EG-NEXT: LSHR T0.W, T19.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), 16(2.242078e-44)
|
|
; EG-NEXT: LSHR T21.X, PS, literal.x,
|
|
; EG-NEXT: AND_INT T1.W, PV.Y, literal.y,
|
|
; EG-NEXT: AND_INT * T0.W, PV.W, literal.z,
|
|
; EG-NEXT: 2(2.802597e-45), 65535(9.183409e-41)
|
|
; EG-NEXT: 16711680(2.341805e-38), 0(0.000000e+00)
|
|
; EG-NEXT: LSHR T22.X, KC0[2].Y, literal.x,
|
|
; EG-NEXT: OR_INT * T19.W, PV.W, PS,
|
|
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
; EG-NEXT: MOV T5.X, PV.W,
|
|
; EG-NEXT: MOV * T20.X, T16.X,
|
|
; EG-NEXT: MOV * T20.Z, T12.X,
|
|
; EG-NEXT: MOV T19.X, T8.X,
|
|
; EG-NEXT: MOV * T19.Z, T4.X, BS:VEC_120/SCL_212
|
|
;
|
|
; CM-LABEL: global_zextload_v16i8_to_v16i16:
|
|
; CM: ; %bb.0:
|
|
; CM-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: TEX 0 @8
|
|
; CM-NEXT: ALU 101, @12, KC0[], KC1[]
|
|
; CM-NEXT: ALU 20, @114, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T19, T22.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T20, T21.X
|
|
; CM-NEXT: CF_END
|
|
; CM-NEXT: PAD
|
|
; CM-NEXT: Fetch clause starting at 8:
|
|
; CM-NEXT: VTX_READ_128 T19.XYZW, T19.X, 0, #1
|
|
; CM-NEXT: ALU clause starting at 10:
|
|
; CM-NEXT: MOV * T0.Y, T16.X,
|
|
; CM-NEXT: MOV * T19.X, KC0[2].Z,
|
|
; CM-NEXT: ALU clause starting at 12:
|
|
; CM-NEXT: AND_INT T0.Z, T19.X, literal.x,
|
|
; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y,
|
|
; CM-NEXT: 255(3.573311e-43), -65536(nan)
|
|
; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z,
|
|
; CM-NEXT: MOV * T16.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, PV.X,
|
|
; CM-NEXT: LSHL * T0.W, T19.X, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: AND_INT * T0.W, PV.W, literal.y,
|
|
; CM-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
|
|
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV T16.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, T17.X,
|
|
; CM-NEXT: MOV * T0.W, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: BFE_UINT * T1.W, T19.X, literal.y, PV.W,
|
|
; CM-NEXT: -65536(nan), 16(2.242078e-44)
|
|
; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV * T17.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, PV.X,
|
|
; CM-NEXT: LSHR * T1.W, T19.X, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: AND_INT * T1.W, PV.W, literal.y,
|
|
; CM-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
|
|
; CM-NEXT: OR_INT * T20.Y, PV.Z, PV.W,
|
|
; CM-NEXT: MOV T17.X, PV.Y,
|
|
; CM-NEXT: MOV * T0.Y, T12.X,
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: AND_INT * T1.W, T19.Y, literal.y,
|
|
; CM-NEXT: -65536(nan), 255(3.573311e-43)
|
|
; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV * T12.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, PV.X,
|
|
; CM-NEXT: LSHL * T1.W, T19.Y, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: AND_INT * T1.W, PV.W, literal.y,
|
|
; CM-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
|
|
; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV T12.X, PV.W,
|
|
; CM-NEXT: MOV * T0.Y, T13.X,
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: BFE_UINT * T1.W, T19.Y, literal.y, T0.W,
|
|
; CM-NEXT: -65536(nan), 16(2.242078e-44)
|
|
; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV * T13.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, PV.X,
|
|
; CM-NEXT: LSHR * T1.W, T19.Y, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: AND_INT * T1.W, PV.W, literal.y,
|
|
; CM-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
|
|
; CM-NEXT: OR_INT * T20.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV T13.X, PV.W,
|
|
; CM-NEXT: MOV * T0.Y, T8.X,
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: AND_INT * T1.W, T19.Z, literal.y,
|
|
; CM-NEXT: -65536(nan), 255(3.573311e-43)
|
|
; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV * T8.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, PV.X,
|
|
; CM-NEXT: LSHL * T1.W, T19.Z, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: AND_INT * T1.W, PV.W, literal.y,
|
|
; CM-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
|
|
; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV T8.X, PV.W,
|
|
; CM-NEXT: MOV * T0.Y, T9.X,
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: BFE_UINT * T1.W, T19.Z, literal.y, T0.W,
|
|
; CM-NEXT: -65536(nan), 16(2.242078e-44)
|
|
; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV * T9.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, PV.X,
|
|
; CM-NEXT: LSHR * T1.W, T19.Z, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: AND_INT * T1.W, PV.W, literal.y,
|
|
; CM-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
|
|
; CM-NEXT: OR_INT * T19.Y, PV.Z, PV.W,
|
|
; CM-NEXT: MOV T9.X, PV.Y,
|
|
; CM-NEXT: MOV * T0.Y, T4.X,
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: AND_INT * T1.W, T19.W, literal.y,
|
|
; CM-NEXT: -65536(nan), 255(3.573311e-43)
|
|
; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV * T4.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, PV.X,
|
|
; CM-NEXT: LSHL * T1.W, T19.W, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: AND_INT * T1.W, PV.W, literal.y,
|
|
; CM-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
|
|
; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV T4.X, PV.W,
|
|
; CM-NEXT: MOV * T0.Y, T5.X,
|
|
; CM-NEXT: AND_INT * T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: -65536(nan), 0(0.000000e+00)
|
|
; CM-NEXT: ALU clause starting at 114:
|
|
; CM-NEXT: BFE_UINT * T0.W, T19.W, literal.x, T0.W,
|
|
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; CM-NEXT: OR_INT * T0.W, T0.Z, PV.W,
|
|
; CM-NEXT: MOV * T5.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, PV.X,
|
|
; CM-NEXT: LSHR * T0.W, T19.W, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR T21.X, KC0[2].Y, literal.x,
|
|
; CM-NEXT: AND_INT T0.Y, PV.Y, literal.y,
|
|
; CM-NEXT: AND_INT T0.Z, PV.W, literal.z,
|
|
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.w,
|
|
; CM-NEXT: 2(2.802597e-45), 65535(9.183409e-41)
|
|
; CM-NEXT: 16711680(2.341805e-38), 16(2.242078e-44)
|
|
; CM-NEXT: LSHR T22.X, PV.W, literal.x,
|
|
; CM-NEXT: OR_INT * T19.W, PV.Y, PV.Z,
|
|
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
; CM-NEXT: MOV * T5.X, PV.W,
|
|
; CM-NEXT: MOV T20.X, T16.X,
|
|
; CM-NEXT: MOV * T20.Z, T12.X, BS:VEC_120/SCL_212
|
|
; CM-NEXT: MOV T19.X, T8.X,
|
|
; CM-NEXT: MOV * T19.Z, T4.X, BS:VEC_120/SCL_212
|
|
%load = load <16 x i8>, ptr addrspace(1) %in
|
|
%ext = zext <16 x i8> %load to <16 x i16>
|
|
store <16 x i16> %ext, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
; Sign-extending load test: loads a <16 x i8> vector from %in, sign-extends
; each element to i16, and stores the resulting <16 x i16> vector to %out.
; The per-prefix CHECK lines inside this function are autogenerated (see the
; NOTE at the top of the file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_kernel void @global_sextload_v16i8_to_v16i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
|
; GCN-NOHSA-SI-LABEL: global_sextload_v16i8_to_v16i16:
|
|
; GCN-NOHSA-SI: ; %bb.0:
|
|
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s4, v2
|
|
; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s5, v3
|
|
; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s6, v0
|
|
; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s7, v1
|
|
; GCN-NOHSA-SI-NEXT: s_ashr_i32 s8, s7, 24
|
|
; GCN-NOHSA-SI-NEXT: s_bfe_i32 s9, s7, 0x80010
|
|
; GCN-NOHSA-SI-NEXT: s_bfe_i32 s10, s7, 0x80008
|
|
; GCN-NOHSA-SI-NEXT: s_sext_i32_i8 s7, s7
|
|
; GCN-NOHSA-SI-NEXT: s_ashr_i32 s11, s6, 24
|
|
; GCN-NOHSA-SI-NEXT: s_bfe_i32 s12, s6, 0x80010
|
|
; GCN-NOHSA-SI-NEXT: s_bfe_i32 s13, s6, 0x80008
|
|
; GCN-NOHSA-SI-NEXT: s_sext_i32_i8 s6, s6
|
|
; GCN-NOHSA-SI-NEXT: s_ashr_i32 s14, s5, 24
|
|
; GCN-NOHSA-SI-NEXT: s_bfe_i32 s15, s5, 0x80010
|
|
; GCN-NOHSA-SI-NEXT: s_bfe_i32 s16, s5, 0x80008
|
|
; GCN-NOHSA-SI-NEXT: s_sext_i32_i8 s5, s5
|
|
; GCN-NOHSA-SI-NEXT: s_ashr_i32 s17, s4, 24
|
|
; GCN-NOHSA-SI-NEXT: s_bfe_i32 s18, s4, 0x80010
|
|
; GCN-NOHSA-SI-NEXT: s_bfe_i32 s19, s4, 0x80008
|
|
; GCN-NOHSA-SI-NEXT: s_sext_i32_i8 s4, s4
|
|
; GCN-NOHSA-SI-NEXT: s_lshl_b32 s8, s8, 16
|
|
; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s9, 0xffff
|
|
; GCN-NOHSA-SI-NEXT: s_lshl_b32 s10, s10, 16
|
|
; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, 0xffff
|
|
; GCN-NOHSA-SI-NEXT: s_lshl_b32 s11, s11, 16
|
|
; GCN-NOHSA-SI-NEXT: s_and_b32 s12, s12, 0xffff
|
|
; GCN-NOHSA-SI-NEXT: s_lshl_b32 s13, s13, 16
|
|
; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xffff
|
|
; GCN-NOHSA-SI-NEXT: s_lshl_b32 s14, s14, 16
|
|
; GCN-NOHSA-SI-NEXT: s_and_b32 s15, s15, 0xffff
|
|
; GCN-NOHSA-SI-NEXT: s_lshl_b32 s16, s16, 16
|
|
; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xffff
|
|
; GCN-NOHSA-SI-NEXT: s_lshl_b32 s17, s17, 16
|
|
; GCN-NOHSA-SI-NEXT: s_and_b32 s18, s18, 0xffff
|
|
; GCN-NOHSA-SI-NEXT: s_lshl_b32 s19, s19, 16
|
|
; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xffff
|
|
; GCN-NOHSA-SI-NEXT: s_or_b32 s8, s9, s8
|
|
; GCN-NOHSA-SI-NEXT: s_or_b32 s7, s7, s10
|
|
; GCN-NOHSA-SI-NEXT: s_or_b32 s9, s12, s11
|
|
; GCN-NOHSA-SI-NEXT: s_or_b32 s10, s15, s14
|
|
; GCN-NOHSA-SI-NEXT: s_or_b32 s5, s5, s16
|
|
; GCN-NOHSA-SI-NEXT: s_or_b32 s11, s18, s17
|
|
; GCN-NOHSA-SI-NEXT: s_or_b32 s4, s4, s19
|
|
; GCN-NOHSA-SI-NEXT: s_or_b32 s6, s6, s13
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s11
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s10
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s9
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s8
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
|
; GCN-NOHSA-SI-NEXT: s_endpgm
|
|
;
|
|
; GCN-HSA-LABEL: global_sextload_v16i8_to_v16i16:
|
|
; GCN-HSA: ; %bb.0:
|
|
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
|
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
|
|
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
|
|
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
|
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0
|
|
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-HSA-NEXT: v_readfirstlane_b32 s2, v2
|
|
; GCN-HSA-NEXT: v_readfirstlane_b32 s3, v3
|
|
; GCN-HSA-NEXT: v_readfirstlane_b32 s4, v0
|
|
; GCN-HSA-NEXT: v_readfirstlane_b32 s5, v1
|
|
; GCN-HSA-NEXT: s_ashr_i32 s6, s5, 24
|
|
; GCN-HSA-NEXT: s_bfe_i32 s7, s5, 0x80010
|
|
; GCN-HSA-NEXT: s_bfe_i32 s8, s5, 0x80008
|
|
; GCN-HSA-NEXT: s_sext_i32_i8 s5, s5
|
|
; GCN-HSA-NEXT: s_ashr_i32 s9, s4, 24
|
|
; GCN-HSA-NEXT: s_bfe_i32 s10, s4, 0x80010
|
|
; GCN-HSA-NEXT: s_bfe_i32 s11, s4, 0x80008
|
|
; GCN-HSA-NEXT: s_sext_i32_i8 s4, s4
|
|
; GCN-HSA-NEXT: s_ashr_i32 s12, s3, 24
|
|
; GCN-HSA-NEXT: s_bfe_i32 s13, s3, 0x80010
|
|
; GCN-HSA-NEXT: s_bfe_i32 s14, s3, 0x80008
|
|
; GCN-HSA-NEXT: s_sext_i32_i8 s3, s3
|
|
; GCN-HSA-NEXT: s_ashr_i32 s15, s2, 24
|
|
; GCN-HSA-NEXT: s_bfe_i32 s16, s2, 0x80010
|
|
; GCN-HSA-NEXT: s_bfe_i32 s17, s2, 0x80008
|
|
; GCN-HSA-NEXT: s_sext_i32_i8 s2, s2
|
|
; GCN-HSA-NEXT: s_lshl_b32 s6, s6, 16
|
|
; GCN-HSA-NEXT: s_and_b32 s7, s7, 0xffff
|
|
; GCN-HSA-NEXT: s_lshl_b32 s8, s8, 16
|
|
; GCN-HSA-NEXT: s_and_b32 s5, s5, 0xffff
|
|
; GCN-HSA-NEXT: s_lshl_b32 s9, s9, 16
|
|
; GCN-HSA-NEXT: s_and_b32 s10, s10, 0xffff
|
|
; GCN-HSA-NEXT: s_lshl_b32 s11, s11, 16
|
|
; GCN-HSA-NEXT: s_and_b32 s4, s4, 0xffff
|
|
; GCN-HSA-NEXT: s_lshl_b32 s12, s12, 16
|
|
; GCN-HSA-NEXT: s_and_b32 s13, s13, 0xffff
|
|
; GCN-HSA-NEXT: s_lshl_b32 s14, s14, 16
|
|
; GCN-HSA-NEXT: s_and_b32 s3, s3, 0xffff
|
|
; GCN-HSA-NEXT: s_lshl_b32 s15, s15, 16
|
|
; GCN-HSA-NEXT: s_and_b32 s16, s16, 0xffff
|
|
; GCN-HSA-NEXT: s_lshl_b32 s17, s17, 16
|
|
; GCN-HSA-NEXT: s_and_b32 s2, s2, 0xffff
|
|
; GCN-HSA-NEXT: s_or_b32 s6, s7, s6
|
|
; GCN-HSA-NEXT: s_or_b32 s5, s5, s8
|
|
; GCN-HSA-NEXT: s_or_b32 s7, s10, s9
|
|
; GCN-HSA-NEXT: s_or_b32 s4, s4, s11
|
|
; GCN-HSA-NEXT: s_or_b32 s8, s13, s12
|
|
; GCN-HSA-NEXT: s_or_b32 s3, s3, s14
|
|
; GCN-HSA-NEXT: s_or_b32 s9, s16, s15
|
|
; GCN-HSA-NEXT: s_or_b32 s2, s2, s17
|
|
; GCN-HSA-NEXT: s_add_u32 s0, s0, 16
|
|
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s3
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s8
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v6, s5
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v7, s6
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
|
|
; GCN-HSA-NEXT: s_endpgm
|
|
;
|
|
; GCN-NOHSA-VI-LABEL: global_sextload_v16i8_to_v16i16:
|
|
; GCN-NOHSA-VI: ; %bb.0:
|
|
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
|
|
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v0
|
|
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v1
|
|
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s6, v2
|
|
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s7, v3
|
|
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s6, 16
|
|
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s7, 16
|
|
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s4, 16
|
|
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s11, s5, 16
|
|
; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s12, s5
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_i32 s13, s5, 0x80000
|
|
; GCN-NOHSA-VI-NEXT: s_ashr_i32 s5, s5, 16
|
|
; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s16, s7
|
|
; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s18, s6
|
|
; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s14, s4
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_i32 s15, s4, 0x80000
|
|
; GCN-NOHSA-VI-NEXT: s_ashr_i32 s4, s4, 24
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_i32 s17, s7, 0x80000
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_i32 s19, s6, 0x80000
|
|
; GCN-NOHSA-VI-NEXT: s_ashr_i32 s7, s7, 24
|
|
; GCN-NOHSA-VI-NEXT: s_ashr_i32 s6, s6, 24
|
|
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s5, s5, 8
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_i32 s11, s11, 0x80000
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_i32 s10, s10, 0x80000
|
|
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s16, s16, 8
|
|
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s18, s18, 8
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_i32 s9, s9, 0x80000
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_i32 s8, s8, 0x80000
|
|
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s12, s12, 8
|
|
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s14, s14, 8
|
|
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s4, s4, 16
|
|
; GCN-NOHSA-VI-NEXT: s_and_b32 s17, 0xffff, s17
|
|
; GCN-NOHSA-VI-NEXT: s_and_b32 s19, 0xffff, s19
|
|
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s7, s7, 16
|
|
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s6, s6, 16
|
|
; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xffff0000
|
|
; GCN-NOHSA-VI-NEXT: s_and_b32 s11, 0xffff, s11
|
|
; GCN-NOHSA-VI-NEXT: s_and_b32 s10, 0xffff, s10
|
|
; GCN-NOHSA-VI-NEXT: s_and_b32 s16, s16, 0xffff0000
|
|
; GCN-NOHSA-VI-NEXT: s_and_b32 s18, s18, 0xffff0000
|
|
; GCN-NOHSA-VI-NEXT: s_and_b32 s9, 0xffff, s9
|
|
; GCN-NOHSA-VI-NEXT: s_and_b32 s8, 0xffff, s8
|
|
; GCN-NOHSA-VI-NEXT: s_and_b32 s13, 0xffff, s13
|
|
; GCN-NOHSA-VI-NEXT: s_and_b32 s15, 0xffff, s15
|
|
; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s12, 0xffff0000
|
|
; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s14, 0xffff0000
|
|
; GCN-NOHSA-VI-NEXT: s_or_b32 s5, s11, s5
|
|
; GCN-NOHSA-VI-NEXT: s_or_b32 s4, s10, s4
|
|
; GCN-NOHSA-VI-NEXT: s_or_b32 s10, s17, s16
|
|
; GCN-NOHSA-VI-NEXT: s_or_b32 s11, s19, s18
|
|
; GCN-NOHSA-VI-NEXT: s_or_b32 s7, s9, s7
|
|
; GCN-NOHSA-VI-NEXT: s_or_b32 s6, s8, s6
|
|
; GCN-NOHSA-VI-NEXT: s_or_b32 s12, s13, s12
|
|
; GCN-NOHSA-VI-NEXT: s_or_b32 s13, s15, s14
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s11
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s6
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s10
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s13
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s4
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s12
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s5
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
|
|
; GCN-NOHSA-VI-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: global_sextload_v16i8_to_v16i16:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: TEX 0 @8
|
|
; EG-NEXT: ALU 104, @12, KC0[], KC1[]
|
|
; EG-NEXT: ALU 46, @117, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T22.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T21.X, 1
|
|
; EG-NEXT: CF_END
|
|
; EG-NEXT: PAD
|
|
; EG-NEXT: Fetch clause starting at 8:
|
|
; EG-NEXT: VTX_READ_128 T19.XYZW, T19.X, 0, #1
|
|
; EG-NEXT: ALU clause starting at 10:
|
|
; EG-NEXT: MOV * T0.Y, T16.X,
|
|
; EG-NEXT: MOV * T19.X, KC0[2].Z,
|
|
; EG-NEXT: ALU clause starting at 12:
|
|
; EG-NEXT: BFE_INT * T0.W, T19.X, 0.0, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
|
|
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), -65536(nan)
|
|
; EG-NEXT: OR_INT * T0.W, PS, PV.W,
|
|
; EG-NEXT: MOV * T16.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, PV.X,
|
|
; EG-NEXT: LSHR * T0.W, T19.X, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
|
|
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
|
|
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
|
|
; EG-NEXT: MOV T16.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, T17.X,
|
|
; EG-NEXT: LSHR * T0.W, T19.X, literal.x, BS:VEC_120/SCL_212
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
|
|
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), -65536(nan)
|
|
; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
|
|
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
|
|
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
|
|
; EG-NEXT: MOV * T17.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, PV.X,
|
|
; EG-NEXT: ASHR * T0.W, T19.X, literal.x,
|
|
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
|
|
; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
|
|
; EG-NEXT: OR_INT * T20.Y, PV.W, PS,
|
|
; EG-NEXT: MOV T17.X, PV.Y,
|
|
; EG-NEXT: MOV T0.Y, T12.X,
|
|
; EG-NEXT: BFE_INT * T0.W, T19.Y, 0.0, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
|
|
; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
|
|
; EG-NEXT: -65536(nan), 65535(9.183409e-41)
|
|
; EG-NEXT: OR_INT * T0.W, PV.W, PS,
|
|
; EG-NEXT: MOV * T12.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, PV.X,
|
|
; EG-NEXT: LSHR * T0.W, T19.Y, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
|
|
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
|
|
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
|
|
; EG-NEXT: MOV T12.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, T13.X,
|
|
; EG-NEXT: LSHR * T0.W, T19.Y, literal.x,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
|
|
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), -65536(nan)
|
|
; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
|
|
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
|
|
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
|
|
; EG-NEXT: MOV * T13.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, PV.X,
|
|
; EG-NEXT: ASHR * T0.W, T19.Y, literal.x,
|
|
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
|
|
; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
|
|
; EG-NEXT: OR_INT * T20.W, PV.W, PS,
|
|
; EG-NEXT: MOV T13.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, T8.X,
|
|
; EG-NEXT: BFE_INT * T0.W, T19.Z, 0.0, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
|
|
; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
|
|
; EG-NEXT: -65536(nan), 65535(9.183409e-41)
|
|
; EG-NEXT: OR_INT * T0.W, PV.W, PS,
|
|
; EG-NEXT: MOV * T8.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, PV.X,
|
|
; EG-NEXT: LSHR * T0.W, T19.Z, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
|
|
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
|
|
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
|
|
; EG-NEXT: MOV T8.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, T9.X,
|
|
; EG-NEXT: LSHR * T0.W, T19.Z, literal.x,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
|
|
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), -65536(nan)
|
|
; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
|
|
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
|
|
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
|
|
; EG-NEXT: MOV * T9.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, PV.X,
|
|
; EG-NEXT: ASHR * T0.W, T19.Z, literal.x,
|
|
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
|
|
; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
|
|
; EG-NEXT: ALU clause starting at 117:
|
|
; EG-NEXT: OR_INT * T19.Y, T1.W, T0.W,
|
|
; EG-NEXT: MOV T9.X, PV.Y,
|
|
; EG-NEXT: MOV T0.Y, T4.X,
|
|
; EG-NEXT: BFE_INT * T0.W, T19.W, 0.0, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
|
|
; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
|
|
; EG-NEXT: -65536(nan), 65535(9.183409e-41)
|
|
; EG-NEXT: OR_INT * T0.W, PV.W, PS,
|
|
; EG-NEXT: MOV * T4.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, PV.X,
|
|
; EG-NEXT: LSHR * T0.W, T19.W, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
|
|
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
|
|
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
|
|
; EG-NEXT: MOV T4.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, T5.X,
|
|
; EG-NEXT: LSHR * T0.W, T19.W, literal.x,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
|
|
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), -65536(nan)
|
|
; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
|
|
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
|
|
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
|
|
; EG-NEXT: MOV * T5.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, PV.X,
|
|
; EG-NEXT: ASHR T0.W, T19.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 24(3.363116e-44), 16(2.242078e-44)
|
|
; EG-NEXT: LSHR T21.X, PS, literal.x,
|
|
; EG-NEXT: AND_INT T1.W, PV.Y, literal.y,
|
|
; EG-NEXT: LSHL * T0.W, PV.W, literal.z,
|
|
; EG-NEXT: 2(2.802597e-45), 65535(9.183409e-41)
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LSHR T22.X, KC0[2].Y, literal.x,
|
|
; EG-NEXT: OR_INT * T19.W, PV.W, PS,
|
|
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
; EG-NEXT: MOV T5.X, PV.W,
|
|
; EG-NEXT: MOV * T20.X, T16.X,
|
|
; EG-NEXT: MOV * T20.Z, T12.X,
|
|
; EG-NEXT: MOV T19.X, T8.X,
|
|
; EG-NEXT: MOV * T19.Z, T4.X, BS:VEC_120/SCL_212
|
|
;
|
|
; CM-LABEL: global_sextload_v16i8_to_v16i16:
|
|
; CM: ; %bb.0:
|
|
; CM-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: TEX 0 @8
|
|
; CM-NEXT: ALU 104, @12, KC0[], KC1[]
|
|
; CM-NEXT: ALU 46, @117, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T19, T22.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T20, T21.X
|
|
; CM-NEXT: CF_END
|
|
; CM-NEXT: PAD
|
|
; CM-NEXT: Fetch clause starting at 8:
|
|
; CM-NEXT: VTX_READ_128 T19.XYZW, T19.X, 0, #1
|
|
; CM-NEXT: ALU clause starting at 10:
|
|
; CM-NEXT: MOV * T0.Y, T16.X,
|
|
; CM-NEXT: MOV * T19.X, KC0[2].Z,
|
|
; CM-NEXT: ALU clause starting at 12:
|
|
; CM-NEXT: BFE_INT * T0.W, T19.X, 0.0, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, PV.W, literal.x,
|
|
; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y,
|
|
; CM-NEXT: 65535(9.183409e-41), -65536(nan)
|
|
; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z,
|
|
; CM-NEXT: MOV * T16.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, PV.X,
|
|
; CM-NEXT: LSHR * T0.W, T19.X, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
|
|
; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
|
|
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
|
|
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV T16.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, T17.X,
|
|
; CM-NEXT: LSHR * T0.W, T19.X, literal.x, BS:VEC_120/SCL_212
|
|
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
|
|
; CM-NEXT: AND_INT * T0.W, PV.W, literal.y,
|
|
; CM-NEXT: -65536(nan), 65535(9.183409e-41)
|
|
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV * T17.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, PV.X,
|
|
; CM-NEXT: ASHR * T0.W, T19.X, literal.x,
|
|
; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
|
|
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
|
|
; CM-NEXT: OR_INT * T20.Y, PV.Z, PV.W,
|
|
; CM-NEXT: MOV T17.X, PV.Y,
|
|
; CM-NEXT: MOV T0.Y, T12.X,
|
|
; CM-NEXT: BFE_INT * T0.W, T19.Y, 0.0, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: AND_INT * T0.W, PV.W, literal.y,
|
|
; CM-NEXT: -65536(nan), 65535(9.183409e-41)
|
|
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV * T12.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, PV.X,
|
|
; CM-NEXT: LSHR * T0.W, T19.Y, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
|
|
; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
|
|
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
|
|
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV T12.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, T13.X,
|
|
; CM-NEXT: LSHR * T0.W, T19.Y, literal.x,
|
|
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
|
|
; CM-NEXT: AND_INT * T0.W, PV.W, literal.y,
|
|
; CM-NEXT: -65536(nan), 65535(9.183409e-41)
|
|
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV * T13.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, PV.X,
|
|
; CM-NEXT: ASHR * T0.W, T19.Y, literal.x,
|
|
; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
|
|
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
|
|
; CM-NEXT: OR_INT * T20.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV T13.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, T8.X,
|
|
; CM-NEXT: BFE_INT * T0.W, T19.Z, 0.0, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: AND_INT * T0.W, PV.W, literal.y,
|
|
; CM-NEXT: -65536(nan), 65535(9.183409e-41)
|
|
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV * T8.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, PV.X,
|
|
; CM-NEXT: LSHR * T0.W, T19.Z, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
|
|
; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
|
|
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
|
|
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV T8.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, T9.X,
|
|
; CM-NEXT: LSHR * T0.W, T19.Z, literal.x,
|
|
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
|
|
; CM-NEXT: AND_INT * T0.W, PV.W, literal.y,
|
|
; CM-NEXT: -65536(nan), 65535(9.183409e-41)
|
|
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV * T9.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, PV.X,
|
|
; CM-NEXT: ASHR * T0.W, T19.Z, literal.x,
|
|
; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
|
|
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
|
|
; CM-NEXT: ALU clause starting at 117:
|
|
; CM-NEXT: OR_INT * T19.Y, T0.Z, T0.W,
|
|
; CM-NEXT: MOV T9.X, PV.Y,
|
|
; CM-NEXT: MOV T0.Y, T4.X,
|
|
; CM-NEXT: BFE_INT * T0.W, T19.W, 0.0, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: AND_INT * T0.W, PV.W, literal.y,
|
|
; CM-NEXT: -65536(nan), 65535(9.183409e-41)
|
|
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV * T4.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, PV.X,
|
|
; CM-NEXT: LSHR * T0.W, T19.W, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
|
|
; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
|
|
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
|
|
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV T4.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, T5.X,
|
|
; CM-NEXT: LSHR * T0.W, T19.W, literal.x,
|
|
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
|
|
; CM-NEXT: AND_INT * T0.W, PV.W, literal.y,
|
|
; CM-NEXT: -65536(nan), 65535(9.183409e-41)
|
|
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV * T5.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, PV.X,
|
|
; CM-NEXT: ASHR * T0.W, T19.W, literal.x,
|
|
; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR T21.X, KC0[2].Y, literal.x,
|
|
; CM-NEXT: AND_INT T0.Y, PV.Y, literal.y,
|
|
; CM-NEXT: LSHL T0.Z, PV.W, literal.z,
|
|
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
|
|
; CM-NEXT: 2(2.802597e-45), 65535(9.183409e-41)
|
|
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR T22.X, PV.W, literal.x,
|
|
; CM-NEXT: OR_INT * T19.W, PV.Y, PV.Z,
|
|
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
; CM-NEXT: MOV * T5.X, PV.W,
|
|
; CM-NEXT: MOV T20.X, T16.X,
|
|
; CM-NEXT: MOV * T20.Z, T12.X, BS:VEC_120/SCL_212
|
|
; CM-NEXT: MOV T19.X, T8.X,
|
|
; CM-NEXT: MOV * T19.Z, T4.X, BS:VEC_120/SCL_212
|
|
%load = load <16 x i8>, ptr addrspace(1) %in
|
|
%ext = sext <16 x i8> %load to <16 x i16>
|
|
store <16 x i16> %ext, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @global_zextload_v32i8_to_v32i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
|
; GCN-NOHSA-SI-LABEL: global_zextload_v32i8_to_v32i16:
|
|
; GCN-NOHSA-SI: ; %bb.0:
|
|
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xff00, v2
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v5, 0xff00, v3
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xff, v3
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v7, 0xff, v2
|
|
; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5
|
|
; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
|
|
; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v6, v6, v5
|
|
; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v4, v7, v4
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
|
|
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 24, v3
|
|
; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v3, v5, v3, 16
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v7, 0xff00ff, v3
|
|
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 24, v2
|
|
; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v5, 0xff00ff, v2
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v3, 0xff00, v10
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:48
|
|
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 24, v10
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v4, 24, v11
|
|
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 24, v8
|
|
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 24, v9
|
|
; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v7, v6, v9, 16
|
|
; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v5, v5, v8, 16
|
|
; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v12, v4, v11, 16
|
|
; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v13, v2, v10, 16
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xff00, v11
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xff00, v8
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xff00, v9
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v9, 0xff, v9
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xff, v8
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v11, 0xff, v11
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, 0xff, v10
|
|
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v14, 24, v1
|
|
; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v14, v14, v1, 16
|
|
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v15, 24, v0
|
|
; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v15, v15, v0, 16
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xff00, v0
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v17, 0xff00, v1
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v1, 0xff, v1
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, 0xff, v0
|
|
; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v0, 8, v6
|
|
; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v6, 8, v2
|
|
; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v2, v9, v0
|
|
; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v0, v8, v6
|
|
; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
|
|
; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
|
|
; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v6, v11, v4
|
|
; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v4, v10, v3
|
|
; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v3, 8, v17
|
|
; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v8, 8, v16
|
|
; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v10, v1, v3
|
|
; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v8, v18, v8
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v3, 0xff00ff, v7
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v1, 0xff00ff, v5
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v7, 0xff00ff, v12
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v5, 0xff00ff, v13
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v11, 0xff00ff, v14
|
|
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v9, 0xff00ff, v15
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
|
; GCN-NOHSA-SI-NEXT: s_endpgm
|
|
;
|
|
; GCN-HSA-LABEL: global_zextload_v32i8_to_v32i16:
|
|
; GCN-HSA: ; %bb.0:
|
|
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
|
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
|
|
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
|
|
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
|
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
|
|
; GCN-HSA-NEXT: s_add_u32 s2, s2, 16
|
|
; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
|
|
; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
|
|
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1
|
|
; GCN-HSA-NEXT: s_add_u32 s2, s0, 48
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0
|
|
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
|
|
; GCN-HSA-NEXT: s_add_u32 s0, s0, 32
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3
|
|
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2
|
|
; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
|
|
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 24, v7
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff00, v7
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xff, v7
|
|
; GCN-HSA-NEXT: v_alignbit_b32 v7, v9, v7, 16
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xff00, v6
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xff00ff, v7
|
|
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 24, v6
|
|
; GCN-HSA-NEXT: v_lshlrev_b32_e32 v8, 8, v8
|
|
; GCN-HSA-NEXT: v_alignbit_b32 v7, v7, v6, 16
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xff, v6
|
|
; GCN-HSA-NEXT: v_lshlrev_b32_e32 v16, 8, v16
|
|
; GCN-HSA-NEXT: v_or_b32_e32 v8, v17, v8
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v7
|
|
; GCN-HSA-NEXT: v_or_b32_e32 v6, v6, v16
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[6:9]
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xff, v5
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xff00, v4
|
|
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 24, v4
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff00, v5
|
|
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 24, v5
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xff, v4
|
|
; GCN-HSA-NEXT: v_alignbit_b32 v5, v9, v5, 16
|
|
; GCN-HSA-NEXT: v_lshlrev_b32_e32 v8, 8, v8
|
|
; GCN-HSA-NEXT: v_alignbit_b32 v9, v7, v4, 16
|
|
; GCN-HSA-NEXT: v_lshlrev_b32_e32 v4, 8, v6
|
|
; GCN-HSA-NEXT: v_or_b32_e32 v6, v12, v8
|
|
; GCN-HSA-NEXT: v_or_b32_e32 v4, v13, v4
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v5
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v9
|
|
; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xff00, v2
|
|
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 24, v2
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff00, v3
|
|
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 24, v3
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xff00, v0
|
|
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 24, v0
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xff, v3
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xff00, v1
|
|
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 24, v1
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xff, v1
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff, v0
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xff, v2
|
|
; GCN-HSA-NEXT: v_alignbit_b32 v1, v5, v1, 16
|
|
; GCN-HSA-NEXT: v_lshlrev_b32_e32 v4, 8, v4
|
|
; GCN-HSA-NEXT: v_alignbit_b32 v0, v9, v0, 16
|
|
; GCN-HSA-NEXT: v_lshlrev_b32_e32 v5, 8, v13
|
|
; GCN-HSA-NEXT: v_alignbit_b32 v9, v12, v3, 16
|
|
; GCN-HSA-NEXT: v_lshlrev_b32_e32 v8, 8, v8
|
|
; GCN-HSA-NEXT: v_alignbit_b32 v12, v19, v2, 16
|
|
; GCN-HSA-NEXT: v_lshlrev_b32_e32 v13, 8, v18
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v1
|
|
; GCN-HSA-NEXT: v_or_b32_e32 v2, v6, v4
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
|
|
; GCN-HSA-NEXT: v_or_b32_e32 v0, v7, v5
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v9
|
|
; GCN-HSA-NEXT: v_or_b32_e32 v6, v10, v8
|
|
; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v12
|
|
; GCN-HSA-NEXT: v_or_b32_e32 v4, v11, v13
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7]
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3]
|
|
; GCN-HSA-NEXT: s_endpgm
|
|
;
|
|
; GCN-NOHSA-VI-LABEL: global_zextload_v32i8_to_v32i16:
|
|
; GCN-NOHSA-VI: ; %bb.0:
|
|
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
|
|
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
|
|
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1)
|
|
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v3
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s6, v7
|
|
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s7, v5
|
|
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v1
|
|
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s7, 24
|
|
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s11, s6, 24
|
|
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s17, s4, 24
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_u32 s18, s4, 0x80010
|
|
; GCN-NOHSA-VI-NEXT: s_and_b32 s19, s4, 0xff
|
|
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s4, s4, 8
|
|
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v8, 24, v2
|
|
; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v5, 8, v4
|
|
; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v11, 8, v2
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_u32 s9, s7, 0x80010
|
|
; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s7, 0xff
|
|
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s7, s7, 8
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_u32 s12, s6, 0x80010
|
|
; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s6, 0xff
|
|
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s6, s6, 8
|
|
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s5, 24
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_u32 s15, s5, 0x80010
|
|
; GCN-NOHSA-VI-NEXT: s_and_b32 s16, s5, 0xff
|
|
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s5, s5, 8
|
|
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s8, s8, 16
|
|
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s11, s11, 16
|
|
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s17, s17, 16
|
|
; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xff0000
|
|
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 24, v0
|
|
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 24, v6
|
|
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 24, v4
|
|
; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v9, 8, v6
|
|
; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v10, 8, v0
|
|
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xff0000, v5
|
|
; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v13, v8, v2, 16
|
|
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v11, 0xff0000, v11
|
|
; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xff0000
|
|
; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xff0000
|
|
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s14, s14, 16
|
|
; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xff0000
|
|
; GCN-NOHSA-VI-NEXT: s_or_b32 s8, s9, s8
|
|
; GCN-NOHSA-VI-NEXT: s_or_b32 s9, s12, s11
|
|
; GCN-NOHSA-VI-NEXT: s_or_b32 s11, s18, s17
|
|
; GCN-NOHSA-VI-NEXT: s_or_b32 s4, s19, s4
|
|
; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v7, v7, v4, 16
|
|
; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v1, v1, v6, 16
|
|
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v9, 0xff0000, v9
|
|
; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v3, v3, v0, 16
|
|
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, 0xff0000, v10
|
|
; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v4, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
|
|
; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v12, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
|
|
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v13, 0xff00ff, v13
|
|
; GCN-NOHSA-VI-NEXT: s_or_b32 s7, s10, s7
|
|
; GCN-NOHSA-VI-NEXT: s_or_b32 s6, s13, s6
|
|
; GCN-NOHSA-VI-NEXT: s_or_b32 s10, s15, s14
|
|
; GCN-NOHSA-VI-NEXT: s_or_b32 s5, s16, s5
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, s4
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, s11
|
|
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v5, 0xff00ff, v7
|
|
; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v8, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
|
|
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v9, 0xff00ff, v1
|
|
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v1, 0xff00ff, v3
|
|
; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s10
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s6
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s9
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s7
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s8
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
|
|
; GCN-NOHSA-VI-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: global_zextload_v32i8_to_v32i16:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: ALU 1, @14, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: TEX 1 @10
|
|
; EG-NEXT: ALU 103, @16, KC0[], KC1[]
|
|
; EG-NEXT: ALU 104, @120, KC0[], KC1[]
|
|
; EG-NEXT: ALU 41, @225, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T42.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T41.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T40.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T39.X, 1
|
|
; EG-NEXT: CF_END
|
|
; EG-NEXT: Fetch clause starting at 10:
|
|
; EG-NEXT: VTX_READ_128 T37.XYZW, T35.X, 16, #1
|
|
; EG-NEXT: VTX_READ_128 T35.XYZW, T35.X, 0, #1
|
|
; EG-NEXT: ALU clause starting at 14:
|
|
; EG-NEXT: MOV * T0.Y, T16.X,
|
|
; EG-NEXT: MOV * T35.X, KC0[2].Z,
|
|
; EG-NEXT: ALU clause starting at 16:
|
|
; EG-NEXT: AND_INT T0.W, T37.X, literal.x,
|
|
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
|
|
; EG-NEXT: 255(3.573311e-43), -65536(nan)
|
|
; EG-NEXT: OR_INT * T0.W, PS, PV.W,
|
|
; EG-NEXT: MOV * T16.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, PV.X,
|
|
; EG-NEXT: LSHL * T0.W, T37.X, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
|
|
; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
|
|
; EG-NEXT: OR_INT * T0.W, PV.W, PS,
|
|
; EG-NEXT: MOV T16.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, T17.X,
|
|
; EG-NEXT: MOV * T0.W, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_UINT T1.W, T37.X, literal.x, PV.W,
|
|
; EG-NEXT: AND_INT * T2.W, PV.Y, literal.y,
|
|
; EG-NEXT: 16(2.242078e-44), -65536(nan)
|
|
; EG-NEXT: OR_INT * T1.W, PS, PV.W,
|
|
; EG-NEXT: MOV * T17.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, PV.X,
|
|
; EG-NEXT: LSHR * T1.W, T37.X, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
|
|
; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
|
|
; EG-NEXT: OR_INT * T36.Y, PV.W, PS,
|
|
; EG-NEXT: MOV T17.X, PV.Y,
|
|
; EG-NEXT: MOV * T0.Y, T12.X,
|
|
; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
|
|
; EG-NEXT: AND_INT * T2.W, T37.Y, literal.y,
|
|
; EG-NEXT: -65536(nan), 255(3.573311e-43)
|
|
; EG-NEXT: OR_INT * T1.W, PV.W, PS,
|
|
; EG-NEXT: MOV * T12.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, PV.X,
|
|
; EG-NEXT: LSHL * T1.W, T37.Y, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
|
|
; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
|
|
; EG-NEXT: OR_INT * T1.W, PV.W, PS,
|
|
; EG-NEXT: MOV T12.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, T13.X,
|
|
; EG-NEXT: BFE_UINT * T1.W, T37.Y, literal.x, T0.W,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x,
|
|
; EG-NEXT: -65536(nan), 0(0.000000e+00)
|
|
; EG-NEXT: OR_INT * T1.W, PV.W, T1.W,
|
|
; EG-NEXT: MOV * T13.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, PV.X,
|
|
; EG-NEXT: LSHR * T1.W, T37.Y, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
|
|
; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
|
|
; EG-NEXT: OR_INT * T36.W, PV.W, PS,
|
|
; EG-NEXT: MOV T13.X, PV.W,
|
|
; EG-NEXT: MOV * T0.Y, T8.X,
|
|
; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
|
|
; EG-NEXT: AND_INT * T2.W, T37.Z, literal.y,
|
|
; EG-NEXT: -65536(nan), 255(3.573311e-43)
|
|
; EG-NEXT: OR_INT * T1.W, PV.W, PS,
|
|
; EG-NEXT: MOV * T8.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, PV.X,
|
|
; EG-NEXT: LSHL * T1.W, T37.Z, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
|
|
; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
|
|
; EG-NEXT: OR_INT * T1.W, PV.W, PS,
|
|
; EG-NEXT: MOV T8.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, T9.X,
|
|
; EG-NEXT: BFE_UINT * T1.W, T37.Z, literal.x, T0.W,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x,
|
|
; EG-NEXT: -65536(nan), 0(0.000000e+00)
|
|
; EG-NEXT: OR_INT * T1.W, PV.W, T1.W,
|
|
; EG-NEXT: MOV * T9.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, PV.X,
|
|
; EG-NEXT: LSHR * T1.W, T37.Z, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
|
|
; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
|
|
; EG-NEXT: OR_INT * T37.Y, PV.W, PS,
|
|
; EG-NEXT: MOV T9.X, PV.Y,
|
|
; EG-NEXT: MOV * T0.Y, T4.X,
|
|
; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
|
|
; EG-NEXT: AND_INT * T2.W, T37.W, literal.y,
|
|
; EG-NEXT: -65536(nan), 255(3.573311e-43)
|
|
; EG-NEXT: OR_INT * T1.W, PV.W, PS,
|
|
; EG-NEXT: MOV * T4.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, PV.X,
|
|
; EG-NEXT: LSHL * T1.W, T37.W, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
|
|
; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
|
|
; EG-NEXT: OR_INT * T1.W, PV.W, PS,
|
|
; EG-NEXT: MOV T4.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, T5.X,
|
|
; EG-NEXT: BFE_UINT * T1.W, T37.W, literal.x, T0.W,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: ALU clause starting at 120:
|
|
; EG-NEXT: AND_INT * T2.W, T0.Y, literal.x,
|
|
; EG-NEXT: -65536(nan), 0(0.000000e+00)
|
|
; EG-NEXT: OR_INT * T1.W, PV.W, T1.W,
|
|
; EG-NEXT: MOV * T5.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, PV.X,
|
|
; EG-NEXT: LSHR * T1.W, T37.W, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
|
|
; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
|
|
; EG-NEXT: OR_INT * T37.W, PV.W, PS,
|
|
; EG-NEXT: MOV T5.X, PV.W,
|
|
; EG-NEXT: MOV * T0.Y, T32.X,
|
|
; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
|
|
; EG-NEXT: AND_INT * T2.W, T35.X, literal.y,
|
|
; EG-NEXT: -65536(nan), 255(3.573311e-43)
|
|
; EG-NEXT: OR_INT * T1.W, PV.W, PS,
|
|
; EG-NEXT: MOV * T32.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, PV.X,
|
|
; EG-NEXT: LSHL * T1.W, T35.X, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
|
|
; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
|
|
; EG-NEXT: OR_INT * T1.W, PV.W, PS,
|
|
; EG-NEXT: MOV T32.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, T33.X,
|
|
; EG-NEXT: BFE_UINT * T1.W, T35.X, literal.x, T0.W, BS:VEC_120/SCL_212
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x,
|
|
; EG-NEXT: -65536(nan), 0(0.000000e+00)
|
|
; EG-NEXT: OR_INT * T1.W, PV.W, T1.W,
|
|
; EG-NEXT: MOV * T33.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, PV.X,
|
|
; EG-NEXT: LSHR * T1.W, T35.X, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
|
|
; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
|
|
; EG-NEXT: OR_INT * T38.Y, PV.W, PS,
|
|
; EG-NEXT: MOV T33.X, PV.Y,
|
|
; EG-NEXT: MOV * T0.Y, T28.X,
|
|
; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
|
|
; EG-NEXT: AND_INT * T2.W, T35.Y, literal.y,
|
|
; EG-NEXT: -65536(nan), 255(3.573311e-43)
|
|
; EG-NEXT: OR_INT * T1.W, PV.W, PS,
|
|
; EG-NEXT: MOV * T28.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, PV.X,
|
|
; EG-NEXT: LSHL * T1.W, T35.Y, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
|
|
; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
|
|
; EG-NEXT: OR_INT * T1.W, PV.W, PS,
|
|
; EG-NEXT: MOV T28.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, T29.X,
|
|
; EG-NEXT: BFE_UINT * T1.W, T35.Y, literal.x, T0.W,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x,
|
|
; EG-NEXT: -65536(nan), 0(0.000000e+00)
|
|
; EG-NEXT: OR_INT * T1.W, PV.W, T1.W,
|
|
; EG-NEXT: MOV * T29.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, PV.X,
|
|
; EG-NEXT: LSHR * T1.W, T35.Y, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
|
|
; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
|
|
; EG-NEXT: OR_INT * T38.W, PV.W, PS,
|
|
; EG-NEXT: MOV T29.X, PV.W,
|
|
; EG-NEXT: MOV * T0.Y, T24.X,
|
|
; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
|
|
; EG-NEXT: AND_INT * T2.W, T35.Z, literal.y,
|
|
; EG-NEXT: -65536(nan), 255(3.573311e-43)
|
|
; EG-NEXT: OR_INT * T1.W, PV.W, PS,
|
|
; EG-NEXT: MOV * T24.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, PV.X,
|
|
; EG-NEXT: LSHL * T1.W, T35.Z, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
|
|
; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
|
|
; EG-NEXT: OR_INT * T1.W, PV.W, PS,
|
|
; EG-NEXT: MOV T24.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, T25.X,
|
|
; EG-NEXT: BFE_UINT * T1.W, T35.Z, literal.x, T0.W,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT * T2.W, PV.Y, literal.x,
|
|
; EG-NEXT: -65536(nan), 0(0.000000e+00)
|
|
; EG-NEXT: OR_INT * T1.W, PV.W, T1.W,
|
|
; EG-NEXT: MOV * T25.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, PV.X,
|
|
; EG-NEXT: LSHR * T1.W, T35.Z, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
|
|
; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
|
|
; EG-NEXT: OR_INT * T35.Y, PV.W, PS,
|
|
; EG-NEXT: MOV T25.X, PV.Y,
|
|
; EG-NEXT: MOV * T0.Y, T20.X,
|
|
; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
|
|
; EG-NEXT: AND_INT * T2.W, T35.W, literal.y,
|
|
; EG-NEXT: -65536(nan), 255(3.573311e-43)
|
|
; EG-NEXT: OR_INT * T1.W, PV.W, PS,
|
|
; EG-NEXT: MOV * T20.X, PV.W,
|
|
; EG-NEXT: ALU clause starting at 225:
|
|
; EG-NEXT: MOV T0.Y, T20.X,
|
|
; EG-NEXT: LSHL * T1.W, T35.W, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T2.W, PV.Y, literal.x,
|
|
; EG-NEXT: AND_INT * T1.W, PV.W, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
|
|
; EG-NEXT: OR_INT * T1.W, PV.W, PS,
|
|
; EG-NEXT: MOV T20.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, T21.X,
|
|
; EG-NEXT: BFE_UINT * T0.W, T35.W, literal.x, T0.W,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.x,
|
|
; EG-NEXT: -65536(nan), 0(0.000000e+00)
|
|
; EG-NEXT: OR_INT * T0.W, PV.W, T0.W,
|
|
; EG-NEXT: MOV * T21.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, PV.X,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LSHR T39.X, PV.W, literal.x,
|
|
; EG-NEXT: LSHR * T40.X, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
; EG-NEXT: LSHR T0.W, T35.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), 48(6.726233e-44)
|
|
; EG-NEXT: LSHR T41.X, PS, literal.x,
|
|
; EG-NEXT: AND_INT T0.Z, T0.Y, literal.y,
|
|
; EG-NEXT: AND_INT T0.W, PV.W, literal.z,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.w,
|
|
; EG-NEXT: 2(2.802597e-45), 65535(9.183409e-41)
|
|
; EG-NEXT: 16711680(2.341805e-38), 32(4.484155e-44)
|
|
; EG-NEXT: LSHR T42.X, PS, literal.x,
|
|
; EG-NEXT: OR_INT * T35.W, PV.Z, PV.W,
|
|
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
; EG-NEXT: MOV T21.X, PV.W,
|
|
; EG-NEXT: MOV * T36.X, T16.X,
|
|
; EG-NEXT: MOV * T36.Z, T12.X,
|
|
; EG-NEXT: MOV T37.X, T8.X,
|
|
; EG-NEXT: MOV T37.Z, T4.X, BS:VEC_120/SCL_212
|
|
; EG-NEXT: MOV * T38.X, T32.X,
|
|
; EG-NEXT: MOV * T38.Z, T28.X,
|
|
; EG-NEXT: MOV T35.X, T24.X,
|
|
; EG-NEXT: MOV * T35.Z, T20.X, BS:VEC_120/SCL_212
|
|
;
|
|
; CM-LABEL: global_zextload_v32i8_to_v32i16:
|
|
; CM: ; %bb.0:
|
|
; CM-NEXT: ALU 1, @14, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: TEX 1 @10
|
|
; CM-NEXT: ALU 101, @16, KC0[], KC1[]
|
|
; CM-NEXT: ALU 101, @118, KC0[], KC1[]
|
|
; CM-NEXT: ALU 40, @220, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T36, T42.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T38, T41.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T37, T40.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T35, T39.X
|
|
; CM-NEXT: CF_END
|
|
; CM-NEXT: Fetch clause starting at 10:
|
|
; CM-NEXT: VTX_READ_128 T37.XYZW, T35.X, 16, #1
|
|
; CM-NEXT: VTX_READ_128 T36.XYZW, T35.X, 0, #1
|
|
; CM-NEXT: ALU clause starting at 14:
|
|
; CM-NEXT: MOV * T0.Y, T16.X,
|
|
; CM-NEXT: MOV * T35.X, KC0[2].Z,
|
|
; CM-NEXT: ALU clause starting at 16:
|
|
; CM-NEXT: AND_INT T0.Z, T37.X, literal.x,
|
|
; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y,
|
|
; CM-NEXT: 255(3.573311e-43), -65536(nan)
|
|
; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z,
|
|
; CM-NEXT: MOV * T16.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, PV.X,
|
|
; CM-NEXT: LSHL * T0.W, T37.X, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: AND_INT * T0.W, PV.W, literal.y,
|
|
; CM-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
|
|
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV T16.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, T17.X,
|
|
; CM-NEXT: MOV * T0.W, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: BFE_UINT * T1.W, T37.X, literal.y, PV.W,
|
|
; CM-NEXT: -65536(nan), 16(2.242078e-44)
|
|
; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV * T17.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, PV.X,
|
|
; CM-NEXT: LSHR * T1.W, T37.X, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: AND_INT * T1.W, PV.W, literal.y,
|
|
; CM-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
|
|
; CM-NEXT: OR_INT * T35.Y, PV.Z, PV.W,
|
|
; CM-NEXT: MOV T17.X, PV.Y,
|
|
; CM-NEXT: MOV * T0.Y, T12.X,
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: AND_INT * T1.W, T37.Y, literal.y,
|
|
; CM-NEXT: -65536(nan), 255(3.573311e-43)
|
|
; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV * T12.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, PV.X,
|
|
; CM-NEXT: LSHL * T1.W, T37.Y, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: AND_INT * T1.W, PV.W, literal.y,
|
|
; CM-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
|
|
; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV T12.X, PV.W,
|
|
; CM-NEXT: MOV * T0.Y, T13.X,
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: BFE_UINT * T1.W, T37.Y, literal.y, T0.W,
|
|
; CM-NEXT: -65536(nan), 16(2.242078e-44)
|
|
; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV * T13.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, PV.X,
|
|
; CM-NEXT: LSHR * T1.W, T37.Y, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: AND_INT * T1.W, PV.W, literal.y,
|
|
; CM-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
|
|
; CM-NEXT: OR_INT * T35.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV T13.X, PV.W,
|
|
; CM-NEXT: MOV * T0.Y, T8.X,
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: AND_INT * T1.W, T37.Z, literal.y,
|
|
; CM-NEXT: -65536(nan), 255(3.573311e-43)
|
|
; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV * T8.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, PV.X,
|
|
; CM-NEXT: LSHL * T1.W, T37.Z, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: AND_INT * T1.W, PV.W, literal.y,
|
|
; CM-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
|
|
; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV T8.X, PV.W,
|
|
; CM-NEXT: MOV * T0.Y, T9.X,
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: BFE_UINT * T1.W, T37.Z, literal.y, T0.W,
|
|
; CM-NEXT: -65536(nan), 16(2.242078e-44)
|
|
; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV * T9.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, PV.X,
|
|
; CM-NEXT: LSHR * T1.W, T37.Z, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: AND_INT * T1.W, PV.W, literal.y,
|
|
; CM-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
|
|
; CM-NEXT: OR_INT * T37.Y, PV.Z, PV.W,
|
|
; CM-NEXT: MOV T9.X, PV.Y,
|
|
; CM-NEXT: MOV * T0.Y, T4.X,
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: AND_INT * T1.W, T37.W, literal.y,
|
|
; CM-NEXT: -65536(nan), 255(3.573311e-43)
|
|
; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV * T4.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, PV.X,
|
|
; CM-NEXT: LSHL * T1.W, T37.W, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: AND_INT * T1.W, PV.W, literal.y,
|
|
; CM-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
|
|
; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV T4.X, PV.W,
|
|
; CM-NEXT: MOV * T0.Y, T5.X,
|
|
; CM-NEXT: AND_INT * T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: -65536(nan), 0(0.000000e+00)
|
|
; CM-NEXT: ALU clause starting at 118:
|
|
; CM-NEXT: BFE_UINT * T1.W, T37.W, literal.x, T0.W,
|
|
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; CM-NEXT: OR_INT * T1.W, T0.Z, PV.W,
|
|
; CM-NEXT: MOV * T5.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, PV.X,
|
|
; CM-NEXT: LSHR * T1.W, T37.W, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: AND_INT * T1.W, PV.W, literal.y,
|
|
; CM-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
|
|
; CM-NEXT: OR_INT * T37.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV T5.X, PV.W,
|
|
; CM-NEXT: MOV * T0.Y, T32.X,
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: AND_INT * T1.W, T36.X, literal.y,
|
|
; CM-NEXT: -65536(nan), 255(3.573311e-43)
|
|
; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV * T32.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, PV.X,
|
|
; CM-NEXT: LSHL * T1.W, T36.X, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: AND_INT * T1.W, PV.W, literal.y,
|
|
; CM-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
|
|
; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV T32.X, PV.W,
|
|
; CM-NEXT: MOV * T0.Y, T33.X,
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: BFE_UINT * T1.W, T36.X, literal.y, T0.W,
|
|
; CM-NEXT: -65536(nan), 16(2.242078e-44)
|
|
; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV * T33.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, PV.X,
|
|
; CM-NEXT: LSHR * T1.W, T36.X, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: AND_INT * T1.W, PV.W, literal.y,
|
|
; CM-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
|
|
; CM-NEXT: OR_INT * T38.Y, PV.Z, PV.W,
|
|
; CM-NEXT: MOV T33.X, PV.Y,
|
|
; CM-NEXT: MOV * T0.Y, T28.X,
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: AND_INT * T1.W, T36.Y, literal.y,
|
|
; CM-NEXT: -65536(nan), 255(3.573311e-43)
|
|
; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV * T28.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, PV.X,
|
|
; CM-NEXT: LSHL * T1.W, T36.Y, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: AND_INT * T1.W, PV.W, literal.y,
|
|
; CM-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
|
|
; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV T28.X, PV.W,
|
|
; CM-NEXT: MOV * T0.Y, T29.X,
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: BFE_UINT * T1.W, T36.Y, literal.y, T0.W,
|
|
; CM-NEXT: -65536(nan), 16(2.242078e-44)
|
|
; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV * T29.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, PV.X,
|
|
; CM-NEXT: LSHR * T1.W, T36.Y, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: AND_INT * T1.W, PV.W, literal.y,
|
|
; CM-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
|
|
; CM-NEXT: OR_INT * T38.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV T29.X, PV.W,
|
|
; CM-NEXT: MOV * T0.Y, T24.X,
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: AND_INT * T1.W, T36.Z, literal.y,
|
|
; CM-NEXT: -65536(nan), 255(3.573311e-43)
|
|
; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV * T24.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, PV.X,
|
|
; CM-NEXT: LSHL * T1.W, T36.Z, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: AND_INT * T1.W, PV.W, literal.y,
|
|
; CM-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
|
|
; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV T24.X, PV.W,
|
|
; CM-NEXT: MOV * T0.Y, T25.X,
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: BFE_UINT * T1.W, T36.Z, literal.y, T0.W,
|
|
; CM-NEXT: -65536(nan), 16(2.242078e-44)
|
|
; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV * T25.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, PV.X,
|
|
; CM-NEXT: LSHR * T1.W, T36.Z, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: AND_INT * T1.W, PV.W, literal.y,
|
|
; CM-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
|
|
; CM-NEXT: OR_INT * T36.Y, PV.Z, PV.W,
|
|
; CM-NEXT: MOV T25.X, PV.Y,
|
|
; CM-NEXT: MOV * T0.Y, T20.X,
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: AND_INT * T1.W, T36.W, literal.y,
|
|
; CM-NEXT: -65536(nan), 255(3.573311e-43)
|
|
; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV * T20.X, PV.W,
|
|
; CM-NEXT: ALU clause starting at 220:
|
|
; CM-NEXT: MOV T0.Y, T20.X,
|
|
; CM-NEXT: LSHL * T1.W, T36.W, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: AND_INT * T1.W, PV.W, literal.y,
|
|
; CM-NEXT: 65535(9.183409e-41), 16711680(2.341805e-38)
|
|
; CM-NEXT: OR_INT * T1.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV T20.X, PV.W,
|
|
; CM-NEXT: MOV * T0.Y, T21.X,
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: BFE_UINT * T0.W, T36.W, literal.y, T0.W,
|
|
; CM-NEXT: -65536(nan), 16(2.242078e-44)
|
|
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV * T21.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, PV.X,
|
|
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; CM-NEXT: 32(4.484155e-44), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR T39.X, PV.W, literal.x,
|
|
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; CM-NEXT: 2(2.802597e-45), 48(6.726233e-44)
|
|
; CM-NEXT: LSHR T40.X, PV.W, literal.x,
|
|
; CM-NEXT: LSHR * T0.W, T36.W, literal.y,
|
|
; CM-NEXT: 2(2.802597e-45), 8(1.121039e-44)
|
|
; CM-NEXT: LSHR T41.X, KC0[2].Y, literal.x,
|
|
; CM-NEXT: AND_INT T0.Y, T0.Y, literal.y,
|
|
; CM-NEXT: AND_INT T0.Z, PV.W, literal.z,
|
|
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.w,
|
|
; CM-NEXT: 2(2.802597e-45), 65535(9.183409e-41)
|
|
; CM-NEXT: 16711680(2.341805e-38), 16(2.242078e-44)
|
|
; CM-NEXT: LSHR T42.X, PV.W, literal.x,
|
|
; CM-NEXT: OR_INT * T36.W, PV.Y, PV.Z,
|
|
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
; CM-NEXT: MOV * T21.X, PV.W,
|
|
; CM-NEXT: MOV T35.X, T16.X,
|
|
; CM-NEXT: MOV * T35.Z, T12.X, BS:VEC_120/SCL_212
|
|
; CM-NEXT: MOV T37.X, T8.X,
|
|
; CM-NEXT: MOV * T37.Z, T4.X, BS:VEC_120/SCL_212
|
|
; CM-NEXT: MOV T38.X, T32.X,
|
|
; CM-NEXT: MOV * T38.Z, T28.X, BS:VEC_120/SCL_212
|
|
; CM-NEXT: MOV T36.X, T24.X,
|
|
; CM-NEXT: MOV * T36.Z, T20.X, BS:VEC_120/SCL_212
|
|
%load = load <32 x i8>, ptr addrspace(1) %in
|
|
%ext = zext <32 x i8> %load to <32 x i16>
|
|
store <32 x i16> %ext, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @global_sextload_v32i8_to_v32i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
|
; GCN-NOHSA-SI-LABEL: global_sextload_v32i8_to_v32i16:
|
|
; GCN-NOHSA-SI: ; %bb.0:
|
|
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
|
|
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
|
|
; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1)
|
|
; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s4, v2
|
|
; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s5, v3
|
|
; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s6, v0
|
|
; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s7, v1
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s8, v6
|
|
; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s9, v7
|
|
; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s10, v4
|
|
; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s11, v5
|
|
; GCN-NOHSA-SI-NEXT: s_ashr_i32 s12, s11, 24
|
|
; GCN-NOHSA-SI-NEXT: s_bfe_i32 s13, s11, 0x80010
|
|
; GCN-NOHSA-SI-NEXT: s_bfe_i32 s14, s11, 0x80008
|
|
; GCN-NOHSA-SI-NEXT: s_sext_i32_i8 s11, s11
|
|
; GCN-NOHSA-SI-NEXT: s_ashr_i32 s15, s10, 24
|
|
; GCN-NOHSA-SI-NEXT: s_bfe_i32 s16, s10, 0x80010
|
|
; GCN-NOHSA-SI-NEXT: s_bfe_i32 s17, s10, 0x80008
|
|
; GCN-NOHSA-SI-NEXT: s_sext_i32_i8 s10, s10
|
|
; GCN-NOHSA-SI-NEXT: s_ashr_i32 s18, s9, 24
|
|
; GCN-NOHSA-SI-NEXT: s_bfe_i32 s19, s9, 0x80010
|
|
; GCN-NOHSA-SI-NEXT: s_bfe_i32 s20, s9, 0x80008
|
|
; GCN-NOHSA-SI-NEXT: s_sext_i32_i8 s9, s9
|
|
; GCN-NOHSA-SI-NEXT: s_ashr_i32 s21, s8, 24
|
|
; GCN-NOHSA-SI-NEXT: s_bfe_i32 s22, s8, 0x80010
|
|
; GCN-NOHSA-SI-NEXT: s_bfe_i32 s23, s8, 0x80008
|
|
; GCN-NOHSA-SI-NEXT: s_sext_i32_i8 s8, s8
|
|
; GCN-NOHSA-SI-NEXT: s_ashr_i32 s24, s7, 24
|
|
; GCN-NOHSA-SI-NEXT: s_bfe_i32 s25, s7, 0x80010
|
|
; GCN-NOHSA-SI-NEXT: s_bfe_i32 s26, s7, 0x80008
|
|
; GCN-NOHSA-SI-NEXT: s_sext_i32_i8 s7, s7
|
|
; GCN-NOHSA-SI-NEXT: s_ashr_i32 s27, s6, 24
|
|
; GCN-NOHSA-SI-NEXT: s_bfe_i32 s28, s6, 0x80010
|
|
; GCN-NOHSA-SI-NEXT: s_bfe_i32 s29, s6, 0x80008
|
|
; GCN-NOHSA-SI-NEXT: s_sext_i32_i8 s6, s6
|
|
; GCN-NOHSA-SI-NEXT: s_ashr_i32 s30, s5, 24
|
|
; GCN-NOHSA-SI-NEXT: s_bfe_i32 s31, s5, 0x80010
|
|
; GCN-NOHSA-SI-NEXT: s_bfe_i32 s33, s5, 0x80008
|
|
; GCN-NOHSA-SI-NEXT: s_sext_i32_i8 s5, s5
|
|
; GCN-NOHSA-SI-NEXT: s_ashr_i32 s34, s4, 24
|
|
; GCN-NOHSA-SI-NEXT: s_bfe_i32 s35, s4, 0x80010
|
|
; GCN-NOHSA-SI-NEXT: s_bfe_i32 s36, s4, 0x80008
|
|
; GCN-NOHSA-SI-NEXT: s_sext_i32_i8 s4, s4
|
|
; GCN-NOHSA-SI-NEXT: s_lshl_b32 s12, s12, 16
|
|
; GCN-NOHSA-SI-NEXT: s_and_b32 s13, s13, 0xffff
|
|
; GCN-NOHSA-SI-NEXT: s_lshl_b32 s14, s14, 16
|
|
; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s11, 0xffff
|
|
; GCN-NOHSA-SI-NEXT: s_lshl_b32 s15, s15, 16
|
|
; GCN-NOHSA-SI-NEXT: s_and_b32 s16, s16, 0xffff
|
|
; GCN-NOHSA-SI-NEXT: s_lshl_b32 s17, s17, 16
|
|
; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, 0xffff
|
|
; GCN-NOHSA-SI-NEXT: s_lshl_b32 s18, s18, 16
|
|
; GCN-NOHSA-SI-NEXT: s_and_b32 s19, s19, 0xffff
|
|
; GCN-NOHSA-SI-NEXT: s_lshl_b32 s20, s20, 16
|
|
; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s9, 0xffff
|
|
; GCN-NOHSA-SI-NEXT: s_lshl_b32 s21, s21, 16
|
|
; GCN-NOHSA-SI-NEXT: s_and_b32 s22, s22, 0xffff
|
|
; GCN-NOHSA-SI-NEXT: s_lshl_b32 s23, s23, 16
|
|
; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, 0xffff
|
|
; GCN-NOHSA-SI-NEXT: s_lshl_b32 s24, s24, 16
|
|
; GCN-NOHSA-SI-NEXT: s_and_b32 s25, s25, 0xffff
|
|
; GCN-NOHSA-SI-NEXT: s_lshl_b32 s26, s26, 16
|
|
; GCN-NOHSA-SI-NEXT: s_and_b32 s7, s7, 0xffff
|
|
; GCN-NOHSA-SI-NEXT: s_lshl_b32 s27, s27, 16
|
|
; GCN-NOHSA-SI-NEXT: s_and_b32 s28, s28, 0xffff
|
|
; GCN-NOHSA-SI-NEXT: s_lshl_b32 s29, s29, 16
|
|
; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xffff
|
|
; GCN-NOHSA-SI-NEXT: s_lshl_b32 s30, s30, 16
|
|
; GCN-NOHSA-SI-NEXT: s_and_b32 s31, s31, 0xffff
|
|
; GCN-NOHSA-SI-NEXT: s_lshl_b32 s33, s33, 16
|
|
; GCN-NOHSA-SI-NEXT: s_and_b32 s5, s5, 0xffff
|
|
; GCN-NOHSA-SI-NEXT: s_lshl_b32 s34, s34, 16
|
|
; GCN-NOHSA-SI-NEXT: s_and_b32 s35, s35, 0xffff
|
|
; GCN-NOHSA-SI-NEXT: s_lshl_b32 s36, s36, 16
|
|
; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xffff
|
|
; GCN-NOHSA-SI-NEXT: s_or_b32 s12, s13, s12
|
|
; GCN-NOHSA-SI-NEXT: s_or_b32 s11, s11, s14
|
|
; GCN-NOHSA-SI-NEXT: s_or_b32 s13, s16, s15
|
|
; GCN-NOHSA-SI-NEXT: s_or_b32 s10, s10, s17
|
|
; GCN-NOHSA-SI-NEXT: s_or_b32 s14, s19, s18
|
|
; GCN-NOHSA-SI-NEXT: s_or_b32 s9, s9, s20
|
|
; GCN-NOHSA-SI-NEXT: s_or_b32 s15, s22, s21
|
|
; GCN-NOHSA-SI-NEXT: s_or_b32 s8, s8, s23
|
|
; GCN-NOHSA-SI-NEXT: s_or_b32 s16, s25, s24
|
|
; GCN-NOHSA-SI-NEXT: s_or_b32 s7, s7, s26
|
|
; GCN-NOHSA-SI-NEXT: s_or_b32 s17, s28, s27
|
|
; GCN-NOHSA-SI-NEXT: s_or_b32 s18, s31, s30
|
|
; GCN-NOHSA-SI-NEXT: s_or_b32 s5, s5, s33
|
|
; GCN-NOHSA-SI-NEXT: s_or_b32 s19, s35, s34
|
|
; GCN-NOHSA-SI-NEXT: s_or_b32 s4, s4, s36
|
|
; GCN-NOHSA-SI-NEXT: s_or_b32 s6, s6, s29
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s4
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s19
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s18
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s6
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s17
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s16
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s15
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s9
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s14
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
|
|
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s10
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s13
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s11
|
|
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s12
|
|
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
|
; GCN-NOHSA-SI-NEXT: s_endpgm
|
|
;
|
|
; GCN-HSA-LABEL: global_sextload_v32i8_to_v32i16:
|
|
; GCN-HSA: ; %bb.0:
|
|
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
|
|
; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
|
|
; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
|
; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
|
|
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
|
; GCN-HSA-NEXT: s_add_u32 s2, s2, 16
|
|
; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
|
|
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
|
|
; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
|
|
; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
|
|
; GCN-HSA-NEXT: v_readfirstlane_b32 s2, v2
|
|
; GCN-HSA-NEXT: v_readfirstlane_b32 s3, v3
|
|
; GCN-HSA-NEXT: v_readfirstlane_b32 s4, v0
|
|
; GCN-HSA-NEXT: v_readfirstlane_b32 s5, v1
|
|
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-HSA-NEXT: v_readfirstlane_b32 s21, v5
|
|
; GCN-HSA-NEXT: s_ashr_i32 s6, s5, 24
|
|
; GCN-HSA-NEXT: s_bfe_i32 s7, s5, 0x80010
|
|
; GCN-HSA-NEXT: s_bfe_i32 s8, s5, 0x80008
|
|
; GCN-HSA-NEXT: s_sext_i32_i8 s5, s5
|
|
; GCN-HSA-NEXT: s_ashr_i32 s9, s4, 24
|
|
; GCN-HSA-NEXT: s_bfe_i32 s10, s4, 0x80010
|
|
; GCN-HSA-NEXT: s_bfe_i32 s11, s4, 0x80008
|
|
; GCN-HSA-NEXT: s_sext_i32_i8 s4, s4
|
|
; GCN-HSA-NEXT: s_ashr_i32 s12, s3, 24
|
|
; GCN-HSA-NEXT: s_bfe_i32 s13, s3, 0x80010
|
|
; GCN-HSA-NEXT: s_bfe_i32 s14, s3, 0x80008
|
|
; GCN-HSA-NEXT: s_sext_i32_i8 s3, s3
|
|
; GCN-HSA-NEXT: s_ashr_i32 s15, s2, 24
|
|
; GCN-HSA-NEXT: s_bfe_i32 s16, s2, 0x80010
|
|
; GCN-HSA-NEXT: s_ashr_i32 s22, s21, 24
|
|
; GCN-HSA-NEXT: s_bfe_i32 s23, s21, 0x80010
|
|
; GCN-HSA-NEXT: v_readfirstlane_b32 s18, v6
|
|
; GCN-HSA-NEXT: v_readfirstlane_b32 s19, v7
|
|
; GCN-HSA-NEXT: v_readfirstlane_b32 s20, v4
|
|
; GCN-HSA-NEXT: s_lshl_b32 s6, s6, 16
|
|
; GCN-HSA-NEXT: s_and_b32 s7, s7, 0xffff
|
|
; GCN-HSA-NEXT: s_lshl_b32 s8, s8, 16
|
|
; GCN-HSA-NEXT: s_and_b32 s5, s5, 0xffff
|
|
; GCN-HSA-NEXT: s_lshl_b32 s9, s9, 16
|
|
; GCN-HSA-NEXT: s_and_b32 s10, s10, 0xffff
|
|
; GCN-HSA-NEXT: s_lshl_b32 s11, s11, 16
|
|
; GCN-HSA-NEXT: s_and_b32 s4, s4, 0xffff
|
|
; GCN-HSA-NEXT: s_lshl_b32 s12, s12, 16
|
|
; GCN-HSA-NEXT: s_and_b32 s13, s13, 0xffff
|
|
; GCN-HSA-NEXT: s_lshl_b32 s14, s14, 16
|
|
; GCN-HSA-NEXT: s_and_b32 s3, s3, 0xffff
|
|
; GCN-HSA-NEXT: s_lshl_b32 s15, s15, 16
|
|
; GCN-HSA-NEXT: s_and_b32 s16, s16, 0xffff
|
|
; GCN-HSA-NEXT: s_lshl_b32 s22, s22, 16
|
|
; GCN-HSA-NEXT: s_and_b32 s23, s23, 0xffff
|
|
; GCN-HSA-NEXT: s_bfe_i32 s17, s2, 0x80008
|
|
; GCN-HSA-NEXT: s_bfe_i32 s24, s21, 0x80008
|
|
; GCN-HSA-NEXT: s_sext_i32_i8 s21, s21
|
|
; GCN-HSA-NEXT: s_ashr_i32 s25, s20, 24
|
|
; GCN-HSA-NEXT: s_or_b32 s6, s7, s6
|
|
; GCN-HSA-NEXT: s_bfe_i32 s7, s20, 0x80010
|
|
; GCN-HSA-NEXT: s_or_b32 s5, s5, s8
|
|
; GCN-HSA-NEXT: s_bfe_i32 s8, s20, 0x80008
|
|
; GCN-HSA-NEXT: s_sext_i32_i8 s20, s20
|
|
; GCN-HSA-NEXT: s_or_b32 s9, s10, s9
|
|
; GCN-HSA-NEXT: s_ashr_i32 s10, s19, 24
|
|
; GCN-HSA-NEXT: s_or_b32 s4, s4, s11
|
|
; GCN-HSA-NEXT: s_bfe_i32 s11, s19, 0x80010
|
|
; GCN-HSA-NEXT: s_or_b32 s12, s13, s12
|
|
; GCN-HSA-NEXT: s_bfe_i32 s13, s19, 0x80008
|
|
; GCN-HSA-NEXT: s_sext_i32_i8 s19, s19
|
|
; GCN-HSA-NEXT: s_or_b32 s3, s3, s14
|
|
; GCN-HSA-NEXT: s_ashr_i32 s14, s18, 24
|
|
; GCN-HSA-NEXT: s_or_b32 s15, s16, s15
|
|
; GCN-HSA-NEXT: s_bfe_i32 s16, s18, 0x80010
|
|
; GCN-HSA-NEXT: s_or_b32 s22, s23, s22
|
|
; GCN-HSA-NEXT: s_bfe_i32 s23, s18, 0x80008
|
|
; GCN-HSA-NEXT: s_sext_i32_i8 s18, s18
|
|
; GCN-HSA-NEXT: s_sext_i32_i8 s2, s2
|
|
; GCN-HSA-NEXT: s_lshl_b32 s17, s17, 16
|
|
; GCN-HSA-NEXT: s_lshl_b32 s24, s24, 16
|
|
; GCN-HSA-NEXT: s_and_b32 s21, s21, 0xffff
|
|
; GCN-HSA-NEXT: s_lshl_b32 s25, s25, 16
|
|
; GCN-HSA-NEXT: s_and_b32 s7, s7, 0xffff
|
|
; GCN-HSA-NEXT: s_lshl_b32 s8, s8, 16
|
|
; GCN-HSA-NEXT: s_and_b32 s20, s20, 0xffff
|
|
; GCN-HSA-NEXT: s_lshl_b32 s10, s10, 16
|
|
; GCN-HSA-NEXT: s_and_b32 s11, s11, 0xffff
|
|
; GCN-HSA-NEXT: s_lshl_b32 s13, s13, 16
|
|
; GCN-HSA-NEXT: s_and_b32 s19, s19, 0xffff
|
|
; GCN-HSA-NEXT: s_lshl_b32 s14, s14, 16
|
|
; GCN-HSA-NEXT: s_and_b32 s16, s16, 0xffff
|
|
; GCN-HSA-NEXT: s_lshl_b32 s23, s23, 16
|
|
; GCN-HSA-NEXT: s_and_b32 s18, s18, 0xffff
|
|
; GCN-HSA-NEXT: s_and_b32 s2, s2, 0xffff
|
|
; GCN-HSA-NEXT: s_or_b32 s21, s21, s24
|
|
; GCN-HSA-NEXT: s_or_b32 s7, s7, s25
|
|
; GCN-HSA-NEXT: s_or_b32 s8, s20, s8
|
|
; GCN-HSA-NEXT: s_or_b32 s10, s11, s10
|
|
; GCN-HSA-NEXT: s_or_b32 s11, s19, s13
|
|
; GCN-HSA-NEXT: s_or_b32 s13, s16, s14
|
|
; GCN-HSA-NEXT: s_or_b32 s14, s18, s23
|
|
; GCN-HSA-NEXT: s_or_b32 s2, s2, s17
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
|
; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s3
|
|
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s15
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s12
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
|
|
; GCN-HSA-NEXT: s_add_u32 s2, s0, 48
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s5
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s6
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
|
|
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
|
|
; GCN-HSA-NEXT: s_add_u32 s0, s0, 32
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s13
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s11
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s10
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
|
|
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s7
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s21
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s22
|
|
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
|
|
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
|
|
; GCN-HSA-NEXT: s_endpgm
|
|
;
|
|
; GCN-NOHSA-VI-LABEL: global_sextload_v32i8_to_v32i16:
|
|
; GCN-NOHSA-VI: ; %bb.0:
|
|
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
|
|
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
|
|
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
|
|
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1)
|
|
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s6, v0
|
|
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s7, v1
|
|
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s6, 16
|
|
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s15, s7, 16
|
|
; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s28, s7
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_i32 s29, s7, 0x80000
|
|
; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s30, s6
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_i32 s31, s6, 0x80000
|
|
; GCN-NOHSA-VI-NEXT: s_ashr_i32 s7, s7, 16
|
|
; GCN-NOHSA-VI-NEXT: s_ashr_i32 s6, s6, 24
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_i32 s14, s14, 0x80000
|
|
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v3
|
|
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s7, s7, 8
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_i32 s15, s15, 0x80000
|
|
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s6, s6, 16
|
|
; GCN-NOHSA-VI-NEXT: s_and_b32 s14, 0xffff, s14
|
|
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
|
|
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s8, v4
|
|
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s9, v5
|
|
; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xffff0000
|
|
; GCN-NOHSA-VI-NEXT: s_and_b32 s15, 0xffff, s15
|
|
; GCN-NOHSA-VI-NEXT: s_or_b32 s6, s14, s6
|
|
; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s14, s5
|
|
; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s20, s9
|
|
; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s22, s8
|
|
; GCN-NOHSA-VI-NEXT: s_or_b32 s7, s15, s7
|
|
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s14, s14, 8
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_i32 s15, s5, 0x80000
|
|
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v2
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_i32 s21, s9, 0x80000
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_i32 s23, s8, 0x80000
|
|
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s20, s20, 8
|
|
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s22, s22, 8
|
|
; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s14, 0xffff0000
|
|
; GCN-NOHSA-VI-NEXT: s_and_b32 s15, 0xffff, s15
|
|
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s10, v6
|
|
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s11, v7
|
|
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s12, s4, 16
|
|
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s13, s5, 16
|
|
; GCN-NOHSA-VI-NEXT: s_and_b32 s21, 0xffff, s21
|
|
; GCN-NOHSA-VI-NEXT: s_and_b32 s23, 0xffff, s23
|
|
; GCN-NOHSA-VI-NEXT: s_and_b32 s20, s20, 0xffff0000
|
|
; GCN-NOHSA-VI-NEXT: s_and_b32 s22, s22, 0xffff0000
|
|
; GCN-NOHSA-VI-NEXT: s_or_b32 s14, s15, s14
|
|
; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s15, s4
|
|
; GCN-NOHSA-VI-NEXT: s_ashr_i32 s5, s5, 16
|
|
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s10, 16
|
|
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s17, s11, 16
|
|
; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s24, s11
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_i32 s25, s11, 0x80000
|
|
; GCN-NOHSA-VI-NEXT: s_ashr_i32 s11, s11, 16
|
|
; GCN-NOHSA-VI-NEXT: s_or_b32 s20, s21, s20
|
|
; GCN-NOHSA-VI-NEXT: s_or_b32 s21, s23, s22
|
|
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s15, s15, 8
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_i32 s22, s4, 0x80000
|
|
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s5, s5, 8
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_i32 s13, s13, 0x80000
|
|
; GCN-NOHSA-VI-NEXT: s_ashr_i32 s4, s4, 24
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_i32 s12, s12, 0x80000
|
|
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s18, s8, 16
|
|
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s19, s9, 16
|
|
; GCN-NOHSA-VI-NEXT: s_sext_i32_i16 s26, s10
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_i32 s27, s10, 0x80000
|
|
; GCN-NOHSA-VI-NEXT: s_ashr_i32 s10, s10, 24
|
|
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s11, s11, 8
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_i32 s17, s17, 0x80000
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_i32 s16, s16, 0x80000
|
|
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s28, s28, 8
|
|
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s30, s30, 8
|
|
; GCN-NOHSA-VI-NEXT: s_and_b32 s15, s15, 0xffff0000
|
|
; GCN-NOHSA-VI-NEXT: s_and_b32 s22, 0xffff, s22
|
|
; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xffff0000
|
|
; GCN-NOHSA-VI-NEXT: s_and_b32 s13, 0xffff, s13
|
|
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s4, s4, 16
|
|
; GCN-NOHSA-VI-NEXT: s_and_b32 s12, 0xffff, s12
|
|
; GCN-NOHSA-VI-NEXT: s_ashr_i32 s9, s9, 24
|
|
; GCN-NOHSA-VI-NEXT: s_ashr_i32 s8, s8, 24
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_i32 s19, s19, 0x80000
|
|
; GCN-NOHSA-VI-NEXT: s_bfe_i32 s18, s18, 0x80000
|
|
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s24, s24, 8
|
|
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s26, s26, 8
|
|
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s10, s10, 16
|
|
; GCN-NOHSA-VI-NEXT: s_and_b32 s29, 0xffff, s29
|
|
; GCN-NOHSA-VI-NEXT: s_and_b32 s31, 0xffff, s31
|
|
; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, 0xffff0000
|
|
; GCN-NOHSA-VI-NEXT: s_and_b32 s17, 0xffff, s17
|
|
; GCN-NOHSA-VI-NEXT: s_and_b32 s16, 0xffff, s16
|
|
; GCN-NOHSA-VI-NEXT: s_and_b32 s28, s28, 0xffff0000
|
|
; GCN-NOHSA-VI-NEXT: s_and_b32 s30, s30, 0xffff0000
|
|
; GCN-NOHSA-VI-NEXT: s_or_b32 s15, s22, s15
|
|
; GCN-NOHSA-VI-NEXT: s_or_b32 s5, s13, s5
|
|
; GCN-NOHSA-VI-NEXT: s_or_b32 s4, s12, s4
|
|
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s9, s9, 16
|
|
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s8, s8, 16
|
|
; GCN-NOHSA-VI-NEXT: s_and_b32 s25, 0xffff, s25
|
|
; GCN-NOHSA-VI-NEXT: s_and_b32 s27, 0xffff, s27
|
|
; GCN-NOHSA-VI-NEXT: s_and_b32 s19, 0xffff, s19
|
|
; GCN-NOHSA-VI-NEXT: s_and_b32 s18, 0xffff, s18
|
|
; GCN-NOHSA-VI-NEXT: s_and_b32 s24, s24, 0xffff0000
|
|
; GCN-NOHSA-VI-NEXT: s_and_b32 s26, s26, 0xffff0000
|
|
; GCN-NOHSA-VI-NEXT: s_or_b32 s11, s17, s11
|
|
; GCN-NOHSA-VI-NEXT: s_or_b32 s10, s16, s10
|
|
; GCN-NOHSA-VI-NEXT: s_or_b32 s16, s29, s28
|
|
; GCN-NOHSA-VI-NEXT: s_or_b32 s17, s31, s30
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s15
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s4
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s14
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5
|
|
; GCN-NOHSA-VI-NEXT: s_or_b32 s9, s19, s9
|
|
; GCN-NOHSA-VI-NEXT: s_or_b32 s8, s18, s8
|
|
; GCN-NOHSA-VI-NEXT: s_or_b32 s18, s25, s24
|
|
; GCN-NOHSA-VI-NEXT: s_or_b32 s19, s27, s26
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
|
|
; GCN-NOHSA-VI-NEXT: s_nop 0
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s17
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s6
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s16
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
|
; GCN-NOHSA-VI-NEXT: s_nop 0
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s19
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s10
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s18
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s11
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
|
|
; GCN-NOHSA-VI-NEXT: s_nop 0
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s21
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s8
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s20
|
|
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s9
|
|
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
|
|
; GCN-NOHSA-VI-NEXT: s_endpgm
|
|
;
|
|
; EG-LABEL: global_sextload_v32i8_to_v32i16:
|
|
; EG: ; %bb.0:
|
|
; EG-NEXT: ALU 1, @14, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: TEX 1 @10
|
|
; EG-NEXT: ALU 104, @16, KC0[], KC1[]
|
|
; EG-NEXT: ALU 104, @121, KC0[], KC1[]
|
|
; EG-NEXT: ALU 95, @226, KC0[CB0:0-32], KC1[]
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T42.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T41.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T40.X, 0
|
|
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T39.X, 1
|
|
; EG-NEXT: CF_END
|
|
; EG-NEXT: Fetch clause starting at 10:
|
|
; EG-NEXT: VTX_READ_128 T37.XYZW, T35.X, 16, #1
|
|
; EG-NEXT: VTX_READ_128 T35.XYZW, T35.X, 0, #1
|
|
; EG-NEXT: ALU clause starting at 14:
|
|
; EG-NEXT: MOV * T0.Y, T16.X,
|
|
; EG-NEXT: MOV * T35.X, KC0[2].Z,
|
|
; EG-NEXT: ALU clause starting at 16:
|
|
; EG-NEXT: BFE_INT * T0.W, T37.X, 0.0, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
|
|
; EG-NEXT: AND_INT * T1.W, T0.Y, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), -65536(nan)
|
|
; EG-NEXT: OR_INT * T0.W, PS, PV.W,
|
|
; EG-NEXT: MOV * T16.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, PV.X,
|
|
; EG-NEXT: LSHR * T0.W, T37.X, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
|
|
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
|
|
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
|
|
; EG-NEXT: MOV T16.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, T17.X,
|
|
; EG-NEXT: LSHR * T0.W, T37.X, literal.x, BS:VEC_120/SCL_212
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
|
|
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), -65536(nan)
|
|
; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
|
|
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
|
|
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
|
|
; EG-NEXT: MOV * T17.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, PV.X,
|
|
; EG-NEXT: ASHR * T0.W, T37.X, literal.x,
|
|
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
|
|
; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
|
|
; EG-NEXT: OR_INT * T36.Y, PV.W, PS,
|
|
; EG-NEXT: MOV T17.X, PV.Y,
|
|
; EG-NEXT: MOV T0.Y, T12.X,
|
|
; EG-NEXT: BFE_INT * T0.W, T37.Y, 0.0, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
|
|
; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
|
|
; EG-NEXT: -65536(nan), 65535(9.183409e-41)
|
|
; EG-NEXT: OR_INT * T0.W, PV.W, PS,
|
|
; EG-NEXT: MOV * T12.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, PV.X,
|
|
; EG-NEXT: LSHR * T0.W, T37.Y, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
|
|
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
|
|
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
|
|
; EG-NEXT: MOV T12.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, T13.X,
|
|
; EG-NEXT: LSHR * T0.W, T37.Y, literal.x,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
|
|
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), -65536(nan)
|
|
; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
|
|
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
|
|
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
|
|
; EG-NEXT: MOV * T13.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, PV.X,
|
|
; EG-NEXT: ASHR * T0.W, T37.Y, literal.x,
|
|
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
|
|
; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
|
|
; EG-NEXT: OR_INT * T36.W, PV.W, PS,
|
|
; EG-NEXT: MOV T13.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, T8.X,
|
|
; EG-NEXT: BFE_INT * T0.W, T37.Z, 0.0, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
|
|
; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
|
|
; EG-NEXT: -65536(nan), 65535(9.183409e-41)
|
|
; EG-NEXT: OR_INT * T0.W, PV.W, PS,
|
|
; EG-NEXT: MOV * T8.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, PV.X,
|
|
; EG-NEXT: LSHR * T0.W, T37.Z, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
|
|
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
|
|
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
|
|
; EG-NEXT: MOV T8.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, T9.X,
|
|
; EG-NEXT: LSHR * T0.W, T37.Z, literal.x,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
|
|
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), -65536(nan)
|
|
; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
|
|
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
|
|
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
|
|
; EG-NEXT: MOV * T9.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, PV.X,
|
|
; EG-NEXT: ASHR * T0.W, T37.Z, literal.x,
|
|
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
|
|
; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
|
|
; EG-NEXT: ALU clause starting at 121:
|
|
; EG-NEXT: OR_INT * T37.Y, T1.W, T0.W,
|
|
; EG-NEXT: MOV T9.X, PV.Y,
|
|
; EG-NEXT: MOV T0.Y, T4.X,
|
|
; EG-NEXT: BFE_INT * T0.W, T37.W, 0.0, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
|
|
; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
|
|
; EG-NEXT: -65536(nan), 65535(9.183409e-41)
|
|
; EG-NEXT: OR_INT * T0.W, PV.W, PS,
|
|
; EG-NEXT: MOV * T4.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, PV.X,
|
|
; EG-NEXT: LSHR * T0.W, T37.W, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
|
|
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
|
|
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
|
|
; EG-NEXT: MOV T4.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, T5.X,
|
|
; EG-NEXT: LSHR * T0.W, T37.W, literal.x,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
|
|
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), -65536(nan)
|
|
; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
|
|
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
|
|
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
|
|
; EG-NEXT: MOV * T5.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, PV.X,
|
|
; EG-NEXT: ASHR * T0.W, T37.W, literal.x,
|
|
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
|
|
; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
|
|
; EG-NEXT: OR_INT * T37.W, PV.W, PS,
|
|
; EG-NEXT: MOV T5.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, T32.X,
|
|
; EG-NEXT: BFE_INT * T0.W, T35.X, 0.0, literal.x, BS:VEC_120/SCL_212
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
|
|
; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
|
|
; EG-NEXT: -65536(nan), 65535(9.183409e-41)
|
|
; EG-NEXT: OR_INT * T0.W, PV.W, PS,
|
|
; EG-NEXT: MOV * T32.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, PV.X,
|
|
; EG-NEXT: LSHR * T0.W, T35.X, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
|
|
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
|
|
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
|
|
; EG-NEXT: MOV T32.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, T33.X,
|
|
; EG-NEXT: LSHR * T0.W, T35.X, literal.x, BS:VEC_120/SCL_212
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
|
|
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), -65536(nan)
|
|
; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
|
|
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
|
|
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
|
|
; EG-NEXT: MOV * T33.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, PV.X,
|
|
; EG-NEXT: ASHR * T0.W, T35.X, literal.x,
|
|
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
|
|
; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
|
|
; EG-NEXT: OR_INT * T38.Y, PV.W, PS,
|
|
; EG-NEXT: MOV T33.X, PV.Y,
|
|
; EG-NEXT: MOV T0.Y, T28.X,
|
|
; EG-NEXT: BFE_INT * T0.W, T35.Y, 0.0, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
|
|
; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
|
|
; EG-NEXT: -65536(nan), 65535(9.183409e-41)
|
|
; EG-NEXT: OR_INT * T0.W, PV.W, PS,
|
|
; EG-NEXT: MOV * T28.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, PV.X,
|
|
; EG-NEXT: LSHR * T0.W, T35.Y, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
|
|
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
|
|
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
|
|
; EG-NEXT: MOV T28.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, T29.X,
|
|
; EG-NEXT: LSHR * T0.W, T35.Y, literal.x,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
|
|
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), -65536(nan)
|
|
; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
|
|
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
|
|
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
|
|
; EG-NEXT: MOV * T29.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, PV.X,
|
|
; EG-NEXT: ASHR * T0.W, T35.Y, literal.x,
|
|
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; EG-NEXT: ALU clause starting at 226:
|
|
; EG-NEXT: AND_INT T1.W, T0.Y, literal.x,
|
|
; EG-NEXT: LSHL * T0.W, T0.W, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
|
|
; EG-NEXT: OR_INT * T38.W, PV.W, PS,
|
|
; EG-NEXT: MOV T29.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, T24.X,
|
|
; EG-NEXT: BFE_INT * T0.W, T35.Z, 0.0, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
|
|
; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
|
|
; EG-NEXT: -65536(nan), 65535(9.183409e-41)
|
|
; EG-NEXT: OR_INT * T0.W, PV.W, PS,
|
|
; EG-NEXT: MOV * T24.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, PV.X,
|
|
; EG-NEXT: LSHR * T0.W, T35.Z, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
|
|
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
|
|
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
|
|
; EG-NEXT: MOV T24.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, T25.X,
|
|
; EG-NEXT: LSHR * T0.W, T35.Z, literal.x,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
|
|
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), -65536(nan)
|
|
; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
|
|
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
|
|
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
|
|
; EG-NEXT: MOV * T25.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, PV.X,
|
|
; EG-NEXT: ASHR * T0.W, T35.Z, literal.x,
|
|
; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
|
|
; EG-NEXT: LSHL * T0.W, PV.W, literal.y,
|
|
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
|
|
; EG-NEXT: OR_INT * T35.Y, PV.W, PS,
|
|
; EG-NEXT: MOV T25.X, PV.Y,
|
|
; EG-NEXT: MOV T0.Y, T20.X,
|
|
; EG-NEXT: BFE_INT * T0.W, T35.W, 0.0, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: AND_INT T1.W, PV.Y, literal.x,
|
|
; EG-NEXT: AND_INT * T0.W, PV.W, literal.y,
|
|
; EG-NEXT: -65536(nan), 65535(9.183409e-41)
|
|
; EG-NEXT: OR_INT * T0.W, PV.W, PS,
|
|
; EG-NEXT: MOV * T20.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, PV.X,
|
|
; EG-NEXT: LSHR * T0.W, T35.W, literal.x,
|
|
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
|
|
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), 65535(9.183409e-41)
|
|
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
|
|
; EG-NEXT: MOV T20.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, T21.X,
|
|
; EG-NEXT: LSHR * T0.W, T35.W, literal.x,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: BFE_INT T0.W, PV.W, 0.0, literal.x,
|
|
; EG-NEXT: AND_INT * T1.W, PV.Y, literal.y,
|
|
; EG-NEXT: 8(1.121039e-44), -65536(nan)
|
|
; EG-NEXT: AND_INT * T0.W, PV.W, literal.x,
|
|
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
|
|
; EG-NEXT: OR_INT * T0.W, T1.W, PV.W,
|
|
; EG-NEXT: MOV * T21.X, PV.W,
|
|
; EG-NEXT: MOV T0.Y, PV.X,
|
|
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; EG-NEXT: LSHR T39.X, PV.W, literal.x,
|
|
; EG-NEXT: LSHR * T40.X, KC0[2].Y, literal.x,
|
|
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
; EG-NEXT: ASHR T0.W, T35.W, literal.x,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
|
|
; EG-NEXT: 24(3.363116e-44), 48(6.726233e-44)
|
|
; EG-NEXT: LSHR T41.X, PS, literal.x,
|
|
; EG-NEXT: AND_INT T0.Z, T0.Y, literal.y,
|
|
; EG-NEXT: LSHL T0.W, PV.W, literal.z,
|
|
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.w,
|
|
; EG-NEXT: 2(2.802597e-45), 65535(9.183409e-41)
|
|
; EG-NEXT: 16(2.242078e-44), 32(4.484155e-44)
|
|
; EG-NEXT: LSHR T42.X, PS, literal.x,
|
|
; EG-NEXT: OR_INT * T35.W, PV.Z, PV.W,
|
|
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
; EG-NEXT: MOV T21.X, PV.W,
|
|
; EG-NEXT: MOV * T36.X, T16.X,
|
|
; EG-NEXT: MOV * T36.Z, T12.X,
|
|
; EG-NEXT: MOV T37.X, T8.X,
|
|
; EG-NEXT: MOV T37.Z, T4.X, BS:VEC_120/SCL_212
|
|
; EG-NEXT: MOV * T38.X, T32.X,
|
|
; EG-NEXT: MOV * T38.Z, T28.X,
|
|
; EG-NEXT: MOV T35.X, T24.X,
|
|
; EG-NEXT: MOV * T35.Z, T20.X, BS:VEC_120/SCL_212
|
|
;
|
|
; CM-LABEL: global_sextload_v32i8_to_v32i16:
|
|
; CM: ; %bb.0:
|
|
; CM-NEXT: ALU 1, @14, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: TEX 1 @10
|
|
; CM-NEXT: ALU 104, @16, KC0[], KC1[]
|
|
; CM-NEXT: ALU 104, @121, KC0[], KC1[]
|
|
; CM-NEXT: ALU 95, @226, KC0[CB0:0-32], KC1[]
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T35, T42.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T38, T41.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T37, T40.X
|
|
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T36, T39.X
|
|
; CM-NEXT: CF_END
|
|
; CM-NEXT: Fetch clause starting at 10:
|
|
; CM-NEXT: VTX_READ_128 T37.XYZW, T35.X, 16, #1
|
|
; CM-NEXT: VTX_READ_128 T35.XYZW, T35.X, 0, #1
|
|
; CM-NEXT: ALU clause starting at 14:
|
|
; CM-NEXT: MOV * T0.Y, T16.X,
|
|
; CM-NEXT: MOV * T35.X, KC0[2].Z,
|
|
; CM-NEXT: ALU clause starting at 16:
|
|
; CM-NEXT: BFE_INT * T0.W, T37.X, 0.0, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, PV.W, literal.x,
|
|
; CM-NEXT: AND_INT * T0.W, T0.Y, literal.y,
|
|
; CM-NEXT: 65535(9.183409e-41), -65536(nan)
|
|
; CM-NEXT: OR_INT * T0.W, PV.W, PV.Z,
|
|
; CM-NEXT: MOV * T16.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, PV.X,
|
|
; CM-NEXT: LSHR * T0.W, T37.X, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
|
|
; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
|
|
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
|
|
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV T16.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, T17.X,
|
|
; CM-NEXT: LSHR * T0.W, T37.X, literal.x, BS:VEC_120/SCL_212
|
|
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
|
|
; CM-NEXT: AND_INT * T0.W, PV.W, literal.y,
|
|
; CM-NEXT: -65536(nan), 65535(9.183409e-41)
|
|
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV * T17.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, PV.X,
|
|
; CM-NEXT: ASHR * T0.W, T37.X, literal.x,
|
|
; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
|
|
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
|
|
; CM-NEXT: OR_INT * T36.Y, PV.Z, PV.W,
|
|
; CM-NEXT: MOV T17.X, PV.Y,
|
|
; CM-NEXT: MOV T0.Y, T12.X,
|
|
; CM-NEXT: BFE_INT * T0.W, T37.Y, 0.0, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: AND_INT * T0.W, PV.W, literal.y,
|
|
; CM-NEXT: -65536(nan), 65535(9.183409e-41)
|
|
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV * T12.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, PV.X,
|
|
; CM-NEXT: LSHR * T0.W, T37.Y, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
|
|
; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
|
|
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
|
|
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV T12.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, T13.X,
|
|
; CM-NEXT: LSHR * T0.W, T37.Y, literal.x,
|
|
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
|
|
; CM-NEXT: AND_INT * T0.W, PV.W, literal.y,
|
|
; CM-NEXT: -65536(nan), 65535(9.183409e-41)
|
|
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV * T13.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, PV.X,
|
|
; CM-NEXT: ASHR * T0.W, T37.Y, literal.x,
|
|
; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
|
|
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
|
|
; CM-NEXT: OR_INT * T36.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV T13.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, T8.X,
|
|
; CM-NEXT: BFE_INT * T0.W, T37.Z, 0.0, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: AND_INT * T0.W, PV.W, literal.y,
|
|
; CM-NEXT: -65536(nan), 65535(9.183409e-41)
|
|
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV * T8.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, PV.X,
|
|
; CM-NEXT: LSHR * T0.W, T37.Z, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
|
|
; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
|
|
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
|
|
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV T8.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, T9.X,
|
|
; CM-NEXT: LSHR * T0.W, T37.Z, literal.x,
|
|
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
|
|
; CM-NEXT: AND_INT * T0.W, PV.W, literal.y,
|
|
; CM-NEXT: -65536(nan), 65535(9.183409e-41)
|
|
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV * T9.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, PV.X,
|
|
; CM-NEXT: ASHR * T0.W, T37.Z, literal.x,
|
|
; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
|
|
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
|
|
; CM-NEXT: ALU clause starting at 121:
|
|
; CM-NEXT: OR_INT * T37.Y, T0.Z, T0.W,
|
|
; CM-NEXT: MOV T9.X, PV.Y,
|
|
; CM-NEXT: MOV T0.Y, T4.X,
|
|
; CM-NEXT: BFE_INT * T0.W, T37.W, 0.0, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: AND_INT * T0.W, PV.W, literal.y,
|
|
; CM-NEXT: -65536(nan), 65535(9.183409e-41)
|
|
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV * T4.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, PV.X,
|
|
; CM-NEXT: LSHR * T0.W, T37.W, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
|
|
; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
|
|
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
|
|
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV T4.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, T5.X,
|
|
; CM-NEXT: LSHR * T0.W, T37.W, literal.x,
|
|
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
|
|
; CM-NEXT: AND_INT * T0.W, PV.W, literal.y,
|
|
; CM-NEXT: -65536(nan), 65535(9.183409e-41)
|
|
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV * T5.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, PV.X,
|
|
; CM-NEXT: ASHR * T0.W, T37.W, literal.x,
|
|
; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
|
|
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
|
|
; CM-NEXT: OR_INT * T37.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV T5.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, T32.X,
|
|
; CM-NEXT: BFE_INT * T0.W, T35.X, 0.0, literal.x, BS:VEC_120/SCL_212
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: AND_INT * T0.W, PV.W, literal.y,
|
|
; CM-NEXT: -65536(nan), 65535(9.183409e-41)
|
|
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV * T32.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, PV.X,
|
|
; CM-NEXT: LSHR * T0.W, T35.X, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
|
|
; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
|
|
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
|
|
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV T32.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, T33.X,
|
|
; CM-NEXT: LSHR * T0.W, T35.X, literal.x, BS:VEC_120/SCL_212
|
|
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
|
|
; CM-NEXT: AND_INT * T0.W, PV.W, literal.y,
|
|
; CM-NEXT: -65536(nan), 65535(9.183409e-41)
|
|
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV * T33.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, PV.X,
|
|
; CM-NEXT: ASHR * T0.W, T35.X, literal.x,
|
|
; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
|
|
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
|
|
; CM-NEXT: OR_INT * T38.Y, PV.Z, PV.W,
|
|
; CM-NEXT: MOV T33.X, PV.Y,
|
|
; CM-NEXT: MOV T0.Y, T28.X,
|
|
; CM-NEXT: BFE_INT * T0.W, T35.Y, 0.0, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: AND_INT * T0.W, PV.W, literal.y,
|
|
; CM-NEXT: -65536(nan), 65535(9.183409e-41)
|
|
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV * T28.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, PV.X,
|
|
; CM-NEXT: LSHR * T0.W, T35.Y, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
|
|
; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
|
|
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
|
|
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV T28.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, T29.X,
|
|
; CM-NEXT: LSHR * T0.W, T35.Y, literal.x,
|
|
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
|
|
; CM-NEXT: AND_INT * T0.W, PV.W, literal.y,
|
|
; CM-NEXT: -65536(nan), 65535(9.183409e-41)
|
|
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV * T29.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, PV.X,
|
|
; CM-NEXT: ASHR * T0.W, T35.Y, literal.x,
|
|
; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; CM-NEXT: ALU clause starting at 226:
|
|
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
|
|
; CM-NEXT: LSHL * T0.W, T0.W, literal.y,
|
|
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
|
|
; CM-NEXT: OR_INT * T38.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV T29.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, T24.X,
|
|
; CM-NEXT: BFE_INT * T0.W, T35.Z, 0.0, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: AND_INT * T0.W, PV.W, literal.y,
|
|
; CM-NEXT: -65536(nan), 65535(9.183409e-41)
|
|
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV * T24.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, PV.X,
|
|
; CM-NEXT: LSHR * T0.W, T35.Z, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
|
|
; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
|
|
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
|
|
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV T24.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, T25.X,
|
|
; CM-NEXT: LSHR * T0.W, T35.Z, literal.x,
|
|
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
|
|
; CM-NEXT: AND_INT * T0.W, PV.W, literal.y,
|
|
; CM-NEXT: -65536(nan), 65535(9.183409e-41)
|
|
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV * T25.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, PV.X,
|
|
; CM-NEXT: ASHR * T0.W, T35.Z, literal.x,
|
|
; CM-NEXT: 24(3.363116e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
|
|
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
|
|
; CM-NEXT: OR_INT * T35.Y, PV.Z, PV.W,
|
|
; CM-NEXT: MOV T25.X, PV.Y,
|
|
; CM-NEXT: MOV T0.Y, T20.X,
|
|
; CM-NEXT: BFE_INT * T0.W, T35.W, 0.0, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, PV.Y, literal.x,
|
|
; CM-NEXT: AND_INT * T0.W, PV.W, literal.y,
|
|
; CM-NEXT: -65536(nan), 65535(9.183409e-41)
|
|
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV * T20.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, PV.X,
|
|
; CM-NEXT: LSHR * T0.W, T35.W, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
|
|
; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
|
|
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
|
|
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV T20.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, T21.X,
|
|
; CM-NEXT: LSHR * T0.W, T35.W, literal.x,
|
|
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; CM-NEXT: BFE_INT * T0.W, PV.W, 0.0, literal.x,
|
|
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
|
|
; CM-NEXT: AND_INT T0.Z, T0.Y, literal.x,
|
|
; CM-NEXT: AND_INT * T0.W, PV.W, literal.y,
|
|
; CM-NEXT: -65536(nan), 65535(9.183409e-41)
|
|
; CM-NEXT: OR_INT * T0.W, PV.Z, PV.W,
|
|
; CM-NEXT: MOV * T21.X, PV.W,
|
|
; CM-NEXT: MOV T0.Y, PV.X,
|
|
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
|
|
; CM-NEXT: 32(4.484155e-44), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR T39.X, PV.W, literal.x,
|
|
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
|
|
; CM-NEXT: 2(2.802597e-45), 48(6.726233e-44)
|
|
; CM-NEXT: LSHR T40.X, PV.W, literal.x,
|
|
; CM-NEXT: ASHR * T0.W, T35.W, literal.y,
|
|
; CM-NEXT: 2(2.802597e-45), 24(3.363116e-44)
|
|
; CM-NEXT: LSHR T41.X, KC0[2].Y, literal.x,
|
|
; CM-NEXT: AND_INT T0.Y, T0.Y, literal.y,
|
|
; CM-NEXT: LSHL T0.Z, PV.W, literal.z,
|
|
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
|
|
; CM-NEXT: 2(2.802597e-45), 65535(9.183409e-41)
|
|
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
|
|
; CM-NEXT: LSHR T42.X, PV.W, literal.x,
|
|
; CM-NEXT: OR_INT * T35.W, PV.Y, PV.Z,
|
|
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
|
|
; CM-NEXT: MOV * T21.X, PV.W,
|
|
; CM-NEXT: MOV T36.X, T16.X,
|
|
; CM-NEXT: MOV * T36.Z, T12.X, BS:VEC_120/SCL_212
|
|
; CM-NEXT: MOV T37.X, T8.X,
|
|
; CM-NEXT: MOV * T37.Z, T4.X, BS:VEC_120/SCL_212
|
|
; CM-NEXT: MOV T38.X, T32.X,
|
|
; CM-NEXT: MOV * T38.Z, T28.X, BS:VEC_120/SCL_212
|
|
; CM-NEXT: MOV T35.X, T24.X,
|
|
; CM-NEXT: MOV * T35.Z, T20.X, BS:VEC_120/SCL_212
|
|
%load = load <32 x i8>, ptr addrspace(1) %in
|
|
%ext = sext <32 x i8> %load to <32 x i16>
|
|
store <32 x i16> %ext, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
; Disabled test: zero-extending load of <64 x i8> widened to <64 x i16>.
; The whole definition is commented out, so llc never sees it, and the XFUNC
; prefix is not one of the check prefixes passed to FileCheck by the RUN lines
; at the top of this file, so the label below is never matched.
; NOTE(review): the reason this case is disabled is not recorded here - confirm
; before re-enabling and regenerating assertions.
; XFUNC-LABEL: {{^}}global_zextload_v64i8_to_v64i16:
|
|
; define amdgpu_kernel void @global_zextload_v64i8_to_v64i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
|
; %load = load <64 x i8>, ptr addrspace(1) %in
|
|
; %ext = zext <64 x i8> %load to <64 x i16>
|
|
; store <64 x i16> %ext, ptr addrspace(1) %out
|
|
; ret void
|
|
; }
|
|
|
|
; Disabled test: sign-extending load of <64 x i8> widened to <64 x i16>.
; The whole definition is commented out, so llc never sees it, and the XFUNC
; prefix is not one of the check prefixes passed to FileCheck by the RUN lines
; at the top of this file, so the label below is never matched.
; NOTE(review): the reason this case is disabled is not recorded here - confirm
; before re-enabling and regenerating assertions.
; XFUNC-LABEL: {{^}}global_sextload_v64i8_to_v64i16:
|
|
; define amdgpu_kernel void @global_sextload_v64i8_to_v64i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
|
; %load = load <64 x i8>, ptr addrspace(1) %in
|
|
; %ext = sext <64 x i8> %load to <64 x i16>
|
|
; store <64 x i16> %ext, ptr addrspace(1) %out
|
|
; ret void
|
|
; }
|
|
|
|
; Attribute group #0: applied to the kernel definitions in this file through
; the trailing `#0` on their `define` lines; it carries only `nounwind`.
attributes #0 = { nounwind }
|