[AMDGPU] Add pattern to select scalar ops for fshr with uniform operands (#165295)
Reasoning behind the proposed change: it helps us move away from selecting
v_alignbit for fshr with uniform operands.
V_ALIGNBIT is defined in the ISA as:
D0.u32 = 32'U(({ S0.u32, S1.u32 } >> S2.u32[4 : 0]) & 0xffffffffLL)
Note: S0 carries the MSBs and S1 carries the LSBs of the value being
aligned.
I interpret that as: concat(S0, S1) >> (S2 & 0x1F), where S2[4:0] limits the
shift amount to 0-31 and the 0xffffffff mask returns the lower 32 bits.
fshr:
fshr i32 %src0, i32 %src1, i32 %src2
Where:
concat(%src0, %src1) represents the 64-bit value formed by %src0 as the
high 32 bits and %src1 as the low 32 bits.
%src2 is the shift amount.
Only the lower 32 bits are returned.
So these two are identical.
So, I can expand the V_ALIGNBIT through bit manipulation as:
Concat: S1 | (S0 << 32)
Shift: ((S1 | (S0 << 32)) >> S2)
Break the shift: (S1 >> S2) | (S0 << (32 - S2))
The proposed pattern does exactly this.
Additionally, src2 in the fshr pattern has the following constraints:
* It must be in the range 0–31.
* If the shift is ≥32, hardware semantics differ; you must handle it
with extra instructions.
The extra S_AND_B32 masks the shift amount down to its low 5 bits (0–31).
This commit is contained in:
parent
ea56ca2da3
commit
b8add3710d
@ -791,6 +791,17 @@ def : GCNPat<
|
||||
(SI_CALL_ISEL $src0, (i64 0))
|
||||
>;
|
||||
|
||||
// Funnel shift right (fshr) patterns for uniform inputs.
|
||||
// These patterns implement this using scalar instructions by constructing a 64-bit
|
||||
// value {$src0, $src1} ($src0 in the high half, $src1 in the low half), performing a single right shift, and keeping sub0 of the result.
|
||||
// Uniform fshr with a non-immediate shift amount.
// REG_SEQUENCE builds an SReg_64 pair with $src1 in sub0 and $src0 in sub1,
// $src2 is masked with 31 by S_AND_B32 before being used as the shift amount,
// S_LSHR_B64 shifts the pair right, and EXTRACT_SUBREG returns sub0 as the i32 result.
def : GCNPat<(UniformTernaryFrag<fshr> i32:$src0, i32:$src1, i32:$src2),
|
||||
(i32 (EXTRACT_SUBREG (S_LSHR_B64 (REG_SEQUENCE SReg_64, $src1, sub0, $src0, sub1), (S_AND_B32 $src2, (i32 31))), sub0))
|
||||
>;
|
||||
|
||||
// Uniform fshr whose shift amount matches ShiftAmt32Imm.
// Same REG_SEQUENCE / S_LSHR_B64 / EXTRACT_SUBREG sub0 sequence as the
// non-immediate form, but $src2 is passed to S_LSHR_B64 without an S_AND_B32 mask.
// NOTE(review): presumably ShiftAmt32Imm already restricts the immediate to
// 0-31, which would make the mask redundant - confirm against its definition.
def : GCNPat<(UniformTernaryFrag<fshr> i32:$src0, i32:$src1, (i32 ShiftAmt32Imm:$src2)),
|
||||
(i32 (EXTRACT_SUBREG (S_LSHR_B64 (REG_SEQUENCE SReg_64, $src1, sub0, $src0, sub1), $src2), sub0))
|
||||
>;
|
||||
|
||||
// Wrapper around s_swappc_b64 with extra $callee parameter to track
|
||||
// the called function after regalloc.
|
||||
def SI_CALL : SPseudoInstSI <
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -1392,20 +1392,20 @@ define inreg i32 @bitcast_v2bf16_to_i32_scalar(<2 x bfloat> inreg %a, i32 inreg
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SI-NEXT: s_cmp_lg_u32 s18, 0
|
||||
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17
|
||||
; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17
|
||||
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16
|
||||
; SI-NEXT: s_cbranch_scc0 .LBB15_4
|
||||
; SI-NEXT: ; %bb.1: ; %cmp.false
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1
|
||||
; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4
|
||||
; SI-NEXT: v_lshr_b64 v[0:1], v[2:3], 16
|
||||
; SI-NEXT: s_cbranch_execnz .LBB15_3
|
||||
; SI-NEXT: .LBB15_2: ; %cmp.true
|
||||
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
|
||||
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
|
||||
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
|
||||
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
|
||||
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
|
||||
; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
|
||||
; SI-NEXT: .LBB15_3: ; %end
|
||||
; SI-NEXT: s_setpc_b64 s[30:31]
|
||||
; SI-NEXT: .LBB15_4:
|
||||
@ -1421,24 +1421,24 @@ define inreg i32 @bitcast_v2bf16_to_i32_scalar(<2 x bfloat> inreg %a, i32 inreg
|
||||
; VI-NEXT: s_cbranch_execnz .LBB15_4
|
||||
; VI-NEXT: .LBB15_2: ; %cmp.true
|
||||
; VI-NEXT: s_lshl_b32 s4, s16, 16
|
||||
; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
|
||||
; VI-NEXT: v_add_f32_e32 v1, s4, v0
|
||||
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
|
||||
; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
|
||||
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
||||
; VI-NEXT: v_add_f32_e32 v0, s4, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
|
||||
; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000
|
||||
; VI-NEXT: v_add_f32_e32 v0, s4, v1
|
||||
; VI-NEXT: v_bfe_u32 v2, v0, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
|
||||
; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
|
||||
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
|
||||
; VI-NEXT: v_add_f32_e32 v1, s4, v1
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
||||
; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16
|
||||
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
|
||||
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
||||
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
|
||||
; VI-NEXT: s_setpc_b64 s[30:31]
|
||||
; VI-NEXT: .LBB15_3:
|
||||
; VI-NEXT: s_branch .LBB15_2
|
||||
@ -3671,20 +3671,20 @@ define inreg float @bitcast_v2bf16_to_f32_scalar(<2 x bfloat> inreg %a, i32 inre
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SI-NEXT: s_cmp_lg_u32 s18, 0
|
||||
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17
|
||||
; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17
|
||||
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16
|
||||
; SI-NEXT: s_cbranch_scc0 .LBB35_4
|
||||
; SI-NEXT: ; %bb.1: ; %cmp.false
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1
|
||||
; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4
|
||||
; SI-NEXT: v_lshr_b64 v[0:1], v[2:3], 16
|
||||
; SI-NEXT: s_cbranch_execnz .LBB35_3
|
||||
; SI-NEXT: .LBB35_2: ; %cmp.true
|
||||
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
|
||||
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
|
||||
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
|
||||
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
|
||||
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
|
||||
; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
|
||||
; SI-NEXT: .LBB35_3: ; %end
|
||||
; SI-NEXT: s_setpc_b64 s[30:31]
|
||||
; SI-NEXT: .LBB35_4:
|
||||
@ -3700,24 +3700,24 @@ define inreg float @bitcast_v2bf16_to_f32_scalar(<2 x bfloat> inreg %a, i32 inre
|
||||
; VI-NEXT: s_cbranch_execnz .LBB35_4
|
||||
; VI-NEXT: .LBB35_2: ; %cmp.true
|
||||
; VI-NEXT: s_lshl_b32 s4, s16, 16
|
||||
; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
|
||||
; VI-NEXT: v_add_f32_e32 v1, s4, v0
|
||||
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
|
||||
; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
|
||||
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
||||
; VI-NEXT: v_add_f32_e32 v0, s4, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
|
||||
; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000
|
||||
; VI-NEXT: v_add_f32_e32 v0, s4, v1
|
||||
; VI-NEXT: v_bfe_u32 v2, v0, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
|
||||
; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
|
||||
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
|
||||
; VI-NEXT: v_add_f32_e32 v1, s4, v1
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
||||
; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16
|
||||
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
|
||||
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
||||
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
|
||||
; VI-NEXT: s_setpc_b64 s[30:31]
|
||||
; VI-NEXT: .LBB35_3:
|
||||
; VI-NEXT: s_branch .LBB35_2
|
||||
@ -5581,24 +5581,25 @@ define inreg <2 x i16> @bitcast_v2bf16_to_v2i16_scalar(<2 x bfloat> inreg %a, i3
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SI-NEXT: s_cmp_lg_u32 s18, 0
|
||||
; SI-NEXT: v_mul_f32_e64 v3, 1.0, s16
|
||||
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s17
|
||||
; SI-NEXT: v_mul_f32_e64 v0, 1.0, s16
|
||||
; SI-NEXT: v_mul_f32_e64 v3, 1.0, s17
|
||||
; SI-NEXT: s_cbranch_scc0 .LBB51_4
|
||||
; SI-NEXT: ; %bb.1: ; %cmp.false
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v3
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v3
|
||||
; SI-NEXT: s_cbranch_execnz .LBB51_3
|
||||
; SI-NEXT: .LBB51_2: ; %cmp.true
|
||||
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
|
||||
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v3
|
||||
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
|
||||
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
|
||||
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
|
||||
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
|
||||
; SI-NEXT: v_lshr_b64 v[2:3], v[0:1], 16
|
||||
; SI-NEXT: .LBB51_3: ; %end
|
||||
; SI-NEXT: v_mov_b32_e32 v0, v2
|
||||
; SI-NEXT: s_setpc_b64 s[30:31]
|
||||
; SI-NEXT: .LBB51_4:
|
||||
; SI-NEXT: ; implicit-def: $vgpr0
|
||||
; SI-NEXT: ; implicit-def: $vgpr2
|
||||
; SI-NEXT: ; implicit-def: $vgpr1
|
||||
; SI-NEXT: s_branch .LBB51_2
|
||||
;
|
||||
@ -5611,24 +5612,24 @@ define inreg <2 x i16> @bitcast_v2bf16_to_v2i16_scalar(<2 x bfloat> inreg %a, i3
|
||||
; VI-NEXT: s_cbranch_execnz .LBB51_4
|
||||
; VI-NEXT: .LBB51_2: ; %cmp.true
|
||||
; VI-NEXT: s_lshl_b32 s4, s16, 16
|
||||
; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
|
||||
; VI-NEXT: v_add_f32_e32 v1, s4, v0
|
||||
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
|
||||
; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
|
||||
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
||||
; VI-NEXT: v_add_f32_e32 v0, s4, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
|
||||
; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000
|
||||
; VI-NEXT: v_add_f32_e32 v0, s4, v1
|
||||
; VI-NEXT: v_bfe_u32 v2, v0, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
|
||||
; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
|
||||
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
|
||||
; VI-NEXT: v_add_f32_e32 v1, s4, v1
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
||||
; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16
|
||||
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
|
||||
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
||||
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
|
||||
; VI-NEXT: s_setpc_b64 s[30:31]
|
||||
; VI-NEXT: .LBB51_3:
|
||||
; VI-NEXT: s_branch .LBB51_2
|
||||
@ -7278,24 +7279,24 @@ define inreg <2 x half> @bitcast_v2bf16_to_v2f16_scalar(<2 x bfloat> inreg %a, i
|
||||
; VI-NEXT: s_cbranch_execnz .LBB63_4
|
||||
; VI-NEXT: .LBB63_2: ; %cmp.true
|
||||
; VI-NEXT: s_lshl_b32 s4, s16, 16
|
||||
; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
|
||||
; VI-NEXT: v_add_f32_e32 v1, s4, v0
|
||||
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
|
||||
; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
|
||||
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
||||
; VI-NEXT: v_add_f32_e32 v0, s4, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
|
||||
; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000
|
||||
; VI-NEXT: v_add_f32_e32 v0, s4, v1
|
||||
; VI-NEXT: v_bfe_u32 v2, v0, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
|
||||
; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
|
||||
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
|
||||
; VI-NEXT: v_add_f32_e32 v1, s4, v1
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
||||
; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16
|
||||
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
|
||||
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
||||
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
|
||||
; VI-NEXT: s_setpc_b64 s[30:31]
|
||||
; VI-NEXT: .LBB63_3:
|
||||
; VI-NEXT: s_branch .LBB63_2
|
||||
@ -8720,20 +8721,20 @@ define inreg <1 x i32> @bitcast_v2bf16_to_v1i32_scalar(<2 x bfloat> inreg %a, i3
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SI-NEXT: s_cmp_lg_u32 s18, 0
|
||||
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s17
|
||||
; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17
|
||||
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s16
|
||||
; SI-NEXT: s_cbranch_scc0 .LBB73_4
|
||||
; SI-NEXT: ; %bb.1: ; %cmp.false
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1
|
||||
; SI-NEXT: v_alignbit_b32 v0, v0, v2, 16
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v4
|
||||
; SI-NEXT: v_lshr_b64 v[0:1], v[2:3], 16
|
||||
; SI-NEXT: s_cbranch_execnz .LBB73_3
|
||||
; SI-NEXT: .LBB73_2: ; %cmp.true
|
||||
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
|
||||
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
|
||||
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v2
|
||||
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
|
||||
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
|
||||
; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
|
||||
; SI-NEXT: .LBB73_3: ; %end
|
||||
; SI-NEXT: s_setpc_b64 s[30:31]
|
||||
; SI-NEXT: .LBB73_4:
|
||||
@ -8749,24 +8750,24 @@ define inreg <1 x i32> @bitcast_v2bf16_to_v1i32_scalar(<2 x bfloat> inreg %a, i3
|
||||
; VI-NEXT: s_cbranch_execnz .LBB73_4
|
||||
; VI-NEXT: .LBB73_2: ; %cmp.true
|
||||
; VI-NEXT: s_lshl_b32 s4, s16, 16
|
||||
; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
|
||||
; VI-NEXT: v_add_f32_e32 v1, s4, v0
|
||||
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
|
||||
; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
|
||||
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
||||
; VI-NEXT: v_add_f32_e32 v0, s4, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
|
||||
; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000
|
||||
; VI-NEXT: v_add_f32_e32 v0, s4, v1
|
||||
; VI-NEXT: v_bfe_u32 v2, v0, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
|
||||
; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
|
||||
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
|
||||
; VI-NEXT: v_add_f32_e32 v1, s4, v1
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
||||
; VI-NEXT: v_alignbit_b32 v0, v0, v1, 16
|
||||
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
|
||||
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
||||
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
|
||||
; VI-NEXT: s_setpc_b64 s[30:31]
|
||||
; VI-NEXT: .LBB73_3:
|
||||
; VI-NEXT: s_branch .LBB73_2
|
||||
@ -9336,30 +9337,31 @@ define inreg <4 x i8> @bitcast_v2bf16_to_v4i8_scalar(<2 x bfloat> inreg %a, i32
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SI-NEXT: s_cmp_lg_u32 s18, 0
|
||||
; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17
|
||||
; SI-NEXT: v_mul_f32_e64 v5, 1.0, s16
|
||||
; SI-NEXT: v_mul_f32_e64 v0, 1.0, s17
|
||||
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s16
|
||||
; SI-NEXT: s_cbranch_scc0 .LBB77_4
|
||||
; SI-NEXT: ; %bb.1: ; %cmp.false
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4
|
||||
; SI-NEXT: v_alignbit_b32 v0, v2, v5, 16
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v4
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
|
||||
; SI-NEXT: v_lshr_b64 v[4:5], v[1:2], 16
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v0
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4
|
||||
; SI-NEXT: s_cbranch_execnz .LBB77_3
|
||||
; SI-NEXT: .LBB77_2: ; %cmp.true
|
||||
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
|
||||
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5
|
||||
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v1
|
||||
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
|
||||
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
|
||||
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3
|
||||
; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v3
|
||||
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
|
||||
; SI-NEXT: v_lshr_b64 v[4:5], v[1:2], 16
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v3, 24, v0
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v4
|
||||
; SI-NEXT: .LBB77_3: ; %end
|
||||
; SI-NEXT: v_mov_b32_e32 v0, v4
|
||||
; SI-NEXT: v_mov_b32_e32 v1, v5
|
||||
; SI-NEXT: s_setpc_b64 s[30:31]
|
||||
; SI-NEXT: .LBB77_4:
|
||||
; SI-NEXT: ; implicit-def: $vgpr0
|
||||
; SI-NEXT: ; implicit-def: $vgpr1
|
||||
; SI-NEXT: ; implicit-def: $vgpr2
|
||||
; SI-NEXT: ; implicit-def: $vgpr4
|
||||
; SI-NEXT: ; implicit-def: $vgpr5
|
||||
; SI-NEXT: ; implicit-def: $vgpr3
|
||||
; SI-NEXT: s_branch .LBB77_2
|
||||
;
|
||||
@ -9369,9 +9371,9 @@ define inreg <4 x i8> @bitcast_v2bf16_to_v4i8_scalar(<2 x bfloat> inreg %a, i32
|
||||
; VI-NEXT: s_cmp_lg_u32 s17, 0
|
||||
; VI-NEXT: s_cbranch_scc0 .LBB77_3
|
||||
; VI-NEXT: ; %bb.1: ; %cmp.false
|
||||
; VI-NEXT: s_lshr_b32 s7, s16, 24
|
||||
; VI-NEXT: s_lshr_b32 s6, s16, 16
|
||||
; VI-NEXT: s_lshr_b32 s8, s16, 8
|
||||
; VI-NEXT: s_lshr_b32 s6, s16, 24
|
||||
; VI-NEXT: s_lshr_b32 s8, s16, 16
|
||||
; VI-NEXT: s_lshr_b32 s7, s16, 8
|
||||
; VI-NEXT: s_cbranch_execnz .LBB77_4
|
||||
; VI-NEXT: .LBB77_2: ; %cmp.true
|
||||
; VI-NEXT: s_lshl_b32 s4, s16, 16
|
||||
@ -9392,21 +9394,21 @@ define inreg <4 x i8> @bitcast_v2bf16_to_v4i8_scalar(<2 x bfloat> inreg %a, i32
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
|
||||
; VI-NEXT: v_alignbit_b32 v4, v2, v1, 16
|
||||
; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[1:2]
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v1
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v3, 24, v4
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v4
|
||||
; VI-NEXT: s_setpc_b64 s[30:31]
|
||||
; VI-NEXT: .LBB77_3:
|
||||
; VI-NEXT: ; implicit-def: $sgpr7
|
||||
; VI-NEXT: ; implicit-def: $sgpr8
|
||||
; VI-NEXT: ; implicit-def: $sgpr6
|
||||
; VI-NEXT: ; implicit-def: $sgpr7
|
||||
; VI-NEXT: s_branch .LBB77_2
|
||||
; VI-NEXT: .LBB77_4:
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s8
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s7
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s6
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s8
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s16
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s6
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s7
|
||||
; VI-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; GFX9-LABEL: bitcast_v2bf16_to_v4i8_scalar:
|
||||
|
||||
@ -290,34 +290,34 @@ define inreg <3 x half> @bitcast_v3bf16_to_v3f16_scalar(<3 x bfloat> inreg %a, i
|
||||
; VI-NEXT: s_cbranch_execnz .LBB1_4
|
||||
; VI-NEXT: .LBB1_2: ; %cmp.true
|
||||
; VI-NEXT: s_lshl_b32 s4, s17, 16
|
||||
; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
|
||||
; VI-NEXT: v_add_f32_e32 v1, s4, v0
|
||||
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
|
||||
; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000
|
||||
; VI-NEXT: v_add_f32_e32 v0, s4, v1
|
||||
; VI-NEXT: v_bfe_u32 v2, v0, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
|
||||
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
||||
; VI-NEXT: s_lshl_b32 s4, s16, 16
|
||||
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
|
||||
; VI-NEXT: v_add_f32_e32 v2, s4, v0
|
||||
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
|
||||
; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
|
||||
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
|
||||
; VI-NEXT: v_add_f32_e32 v0, s4, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
|
||||
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
|
||||
; VI-NEXT: v_add_f32_e32 v0, s4, v1
|
||||
; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
|
||||
; VI-NEXT: v_bfe_u32 v3, v0, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
|
||||
; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
|
||||
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
|
||||
; VI-NEXT: v_add_f32_e32 v1, s4, v1
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
||||
; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16
|
||||
; VI-NEXT: v_mov_b32_e32 v2, 0x7fc00000
|
||||
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; VI-NEXT: v_bfe_u32 v3, v1, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
|
||||
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
||||
; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
|
||||
; VI-NEXT: v_mov_b32_e32 v1, 0x7fc00000
|
||||
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; VI-NEXT: s_setpc_b64 s[30:31]
|
||||
; VI-NEXT: .LBB1_3:
|
||||
; VI-NEXT: s_branch .LBB1_2
|
||||
@ -964,16 +964,16 @@ define inreg <3 x i16> @bitcast_v3bf16_to_v3i16_scalar(<3 x bfloat> inreg %a, i3
|
||||
; SI-NEXT: s_cbranch_execnz .LBB5_3
|
||||
; SI-NEXT: .LBB5_2: ; %cmp.true
|
||||
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
|
||||
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
|
||||
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
|
||||
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
|
||||
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1
|
||||
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
|
||||
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v1
|
||||
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2
|
||||
; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
|
||||
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
|
||||
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
|
||||
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
|
||||
; SI-NEXT: v_lshr_b64 v[3:4], v[1:2], 16
|
||||
; SI-NEXT: v_alignbit_b32 v0, v5, v0, 16
|
||||
; SI-NEXT: .LBB5_3: ; %end
|
||||
; SI-NEXT: v_mov_b32_e32 v1, v3
|
||||
; SI-NEXT: s_setpc_b64 s[30:31]
|
||||
@ -992,34 +992,34 @@ define inreg <3 x i16> @bitcast_v3bf16_to_v3i16_scalar(<3 x bfloat> inreg %a, i3
|
||||
; VI-NEXT: s_cbranch_execnz .LBB5_4
|
||||
; VI-NEXT: .LBB5_2: ; %cmp.true
|
||||
; VI-NEXT: s_lshl_b32 s4, s17, 16
|
||||
; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
|
||||
; VI-NEXT: v_add_f32_e32 v1, s4, v0
|
||||
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
|
||||
; VI-NEXT: v_mov_b32_e32 v1, 0x40c00000
|
||||
; VI-NEXT: v_add_f32_e32 v0, s4, v1
|
||||
; VI-NEXT: v_bfe_u32 v2, v0, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
|
||||
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
||||
; VI-NEXT: s_lshl_b32 s4, s16, 16
|
||||
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
|
||||
; VI-NEXT: v_add_f32_e32 v2, s4, v0
|
||||
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
|
||||
; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
|
||||
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
|
||||
; VI-NEXT: v_add_f32_e32 v0, s4, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
|
||||
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
|
||||
; VI-NEXT: v_add_f32_e32 v0, s4, v1
|
||||
; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
|
||||
; VI-NEXT: v_bfe_u32 v3, v0, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
|
||||
; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
|
||||
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
|
||||
; VI-NEXT: v_add_f32_e32 v1, s4, v1
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
||||
; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16
|
||||
; VI-NEXT: v_mov_b32_e32 v2, 0x7fc00000
|
||||
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; VI-NEXT: v_bfe_u32 v3, v1, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
|
||||
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
||||
; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
|
||||
; VI-NEXT: v_mov_b32_e32 v1, 0x7fc00000
|
||||
; VI-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; VI-NEXT: s_setpc_b64 s[30:31]
|
||||
; VI-NEXT: .LBB5_3:
|
||||
; VI-NEXT: s_branch .LBB5_2
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -2272,30 +2272,32 @@ define inreg i64 @bitcast_v4bf16_to_i64_scalar(<4 x bfloat> inreg %a, i32 inreg
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SI-NEXT: s_cmp_lg_u32 s20, 0
|
||||
; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17
|
||||
; SI-NEXT: v_mul_f32_e64 v5, 1.0, s16
|
||||
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19
|
||||
; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18
|
||||
; SI-NEXT: v_mul_f32_e64 v7, 1.0, s17
|
||||
; SI-NEXT: v_mul_f32_e64 v4, 1.0, s16
|
||||
; SI-NEXT: v_mul_f32_e64 v6, 1.0, s19
|
||||
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18
|
||||
; SI-NEXT: s_cbranch_scc0 .LBB23_4
|
||||
; SI-NEXT: ; %bb.1: ; %cmp.false
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2
|
||||
; SI-NEXT: v_alignbit_b32 v0, v0, v5, 16
|
||||
; SI-NEXT: v_alignbit_b32 v1, v1, v3, 16
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v6
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v7
|
||||
; SI-NEXT: v_lshr_b64 v[8:9], v[2:3], 16
|
||||
; SI-NEXT: v_lshr_b64 v[0:1], v[4:5], 16
|
||||
; SI-NEXT: v_mov_b32_e32 v1, v8
|
||||
; SI-NEXT: s_cbranch_execnz .LBB23_3
|
||||
; SI-NEXT: .LBB23_2: ; %cmp.true
|
||||
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
|
||||
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5
|
||||
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
|
||||
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7
|
||||
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
|
||||
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
|
||||
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v4
|
||||
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
|
||||
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
|
||||
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
|
||||
; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
|
||||
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
|
||||
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
|
||||
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
|
||||
; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
|
||||
; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16
|
||||
; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
|
||||
; SI-NEXT: v_mov_b32_e32 v1, v2
|
||||
; SI-NEXT: .LBB23_3: ; %end
|
||||
; SI-NEXT: s_setpc_b64 s[30:31]
|
||||
; SI-NEXT: .LBB23_4:
|
||||
@ -2311,42 +2313,43 @@ define inreg i64 @bitcast_v4bf16_to_i64_scalar(<4 x bfloat> inreg %a, i32 inreg
|
||||
; VI-NEXT: s_cbranch_execnz .LBB23_4
|
||||
; VI-NEXT: .LBB23_2: ; %cmp.true
|
||||
; VI-NEXT: s_lshl_b32 s4, s17, 16
|
||||
; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
|
||||
; VI-NEXT: v_add_f32_e32 v1, s4, v0
|
||||
; VI-NEXT: v_mov_b32_e32 v4, 0x40c00000
|
||||
; VI-NEXT: v_add_f32_e32 v0, s4, v4
|
||||
; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
|
||||
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
|
||||
; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
|
||||
; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
|
||||
; VI-NEXT: v_add_f32_e32 v1, s4, v4
|
||||
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
|
||||
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
||||
; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
|
||||
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
|
||||
; VI-NEXT: v_add_f32_e32 v2, s4, v0
|
||||
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
|
||||
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
|
||||
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
; VI-NEXT: s_lshl_b32 s4, s16, 16
|
||||
; VI-NEXT: v_alignbit_b32 v1, v2, v1, 16
|
||||
; VI-NEXT: v_add_f32_e32 v2, s4, v0
|
||||
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
|
||||
; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
|
||||
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
|
||||
; VI-NEXT: v_add_f32_e32 v0, s4, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
|
||||
; VI-NEXT: v_bfe_u32 v3, v0, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
|
||||
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0
|
||||
; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1]
|
||||
; VI-NEXT: v_add_f32_e32 v0, s4, v4
|
||||
; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
|
||||
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
|
||||
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
||||
; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16
|
||||
; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
|
||||
; VI-NEXT: v_add_f32_e32 v1, s4, v4
|
||||
; VI-NEXT: v_bfe_u32 v3, v1, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
|
||||
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
||||
; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
|
||||
; VI-NEXT: v_mov_b32_e32 v1, v2
|
||||
; VI-NEXT: s_setpc_b64 s[30:31]
|
||||
; VI-NEXT: .LBB23_3:
|
||||
; VI-NEXT: s_branch .LBB23_2
|
||||
@ -5460,30 +5463,32 @@ define inreg double @bitcast_v4bf16_to_f64_scalar(<4 x bfloat> inreg %a, i32 inr
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SI-NEXT: s_cmp_lg_u32 s20, 0
|
||||
; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17
|
||||
; SI-NEXT: v_mul_f32_e64 v5, 1.0, s16
|
||||
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19
|
||||
; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18
|
||||
; SI-NEXT: v_mul_f32_e64 v7, 1.0, s17
|
||||
; SI-NEXT: v_mul_f32_e64 v4, 1.0, s16
|
||||
; SI-NEXT: v_mul_f32_e64 v6, 1.0, s19
|
||||
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18
|
||||
; SI-NEXT: s_cbranch_scc0 .LBB47_4
|
||||
; SI-NEXT: ; %bb.1: ; %cmp.false
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2
|
||||
; SI-NEXT: v_alignbit_b32 v0, v0, v5, 16
|
||||
; SI-NEXT: v_alignbit_b32 v1, v1, v3, 16
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v6
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v7
|
||||
; SI-NEXT: v_lshr_b64 v[8:9], v[2:3], 16
|
||||
; SI-NEXT: v_lshr_b64 v[0:1], v[4:5], 16
|
||||
; SI-NEXT: v_mov_b32_e32 v1, v8
|
||||
; SI-NEXT: s_cbranch_execnz .LBB47_3
|
||||
; SI-NEXT: .LBB47_2: ; %cmp.true
|
||||
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
|
||||
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5
|
||||
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
|
||||
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7
|
||||
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
|
||||
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
|
||||
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v4
|
||||
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
|
||||
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
|
||||
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
|
||||
; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
|
||||
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
|
||||
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
|
||||
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
|
||||
; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
|
||||
; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16
|
||||
; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
|
||||
; SI-NEXT: v_mov_b32_e32 v1, v2
|
||||
; SI-NEXT: .LBB47_3: ; %end
|
||||
; SI-NEXT: s_setpc_b64 s[30:31]
|
||||
; SI-NEXT: .LBB47_4:
|
||||
@ -5499,42 +5504,43 @@ define inreg double @bitcast_v4bf16_to_f64_scalar(<4 x bfloat> inreg %a, i32 inr
|
||||
; VI-NEXT: s_cbranch_execnz .LBB47_4
|
||||
; VI-NEXT: .LBB47_2: ; %cmp.true
|
||||
; VI-NEXT: s_lshl_b32 s4, s17, 16
|
||||
; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
|
||||
; VI-NEXT: v_add_f32_e32 v1, s4, v0
|
||||
; VI-NEXT: v_mov_b32_e32 v4, 0x40c00000
|
||||
; VI-NEXT: v_add_f32_e32 v0, s4, v4
|
||||
; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
|
||||
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
|
||||
; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
|
||||
; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
|
||||
; VI-NEXT: v_add_f32_e32 v1, s4, v4
|
||||
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
|
||||
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
||||
; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
|
||||
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
|
||||
; VI-NEXT: v_add_f32_e32 v2, s4, v0
|
||||
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
|
||||
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
|
||||
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
; VI-NEXT: s_lshl_b32 s4, s16, 16
|
||||
; VI-NEXT: v_alignbit_b32 v1, v2, v1, 16
|
||||
; VI-NEXT: v_add_f32_e32 v2, s4, v0
|
||||
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
|
||||
; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
|
||||
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
|
||||
; VI-NEXT: v_add_f32_e32 v0, s4, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
|
||||
; VI-NEXT: v_bfe_u32 v3, v0, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
|
||||
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0
|
||||
; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1]
|
||||
; VI-NEXT: v_add_f32_e32 v0, s4, v4
|
||||
; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
|
||||
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
|
||||
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
||||
; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16
|
||||
; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
|
||||
; VI-NEXT: v_add_f32_e32 v1, s4, v4
|
||||
; VI-NEXT: v_bfe_u32 v3, v1, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
|
||||
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
||||
; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
|
||||
; VI-NEXT: v_mov_b32_e32 v1, v2
|
||||
; VI-NEXT: s_setpc_b64 s[30:31]
|
||||
; VI-NEXT: .LBB47_3:
|
||||
; VI-NEXT: s_branch .LBB47_2
|
||||
@ -8361,30 +8367,32 @@ define inreg <2 x i32> @bitcast_v4bf16_to_v2i32_scalar(<4 x bfloat> inreg %a, i3
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SI-NEXT: s_cmp_lg_u32 s20, 0
|
||||
; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17
|
||||
; SI-NEXT: v_mul_f32_e64 v5, 1.0, s16
|
||||
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19
|
||||
; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18
|
||||
; SI-NEXT: v_mul_f32_e64 v7, 1.0, s17
|
||||
; SI-NEXT: v_mul_f32_e64 v4, 1.0, s16
|
||||
; SI-NEXT: v_mul_f32_e64 v6, 1.0, s19
|
||||
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18
|
||||
; SI-NEXT: s_cbranch_scc0 .LBB67_4
|
||||
; SI-NEXT: ; %bb.1: ; %cmp.false
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2
|
||||
; SI-NEXT: v_alignbit_b32 v0, v0, v5, 16
|
||||
; SI-NEXT: v_alignbit_b32 v1, v1, v3, 16
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v6
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v7
|
||||
; SI-NEXT: v_lshr_b64 v[8:9], v[2:3], 16
|
||||
; SI-NEXT: v_lshr_b64 v[0:1], v[4:5], 16
|
||||
; SI-NEXT: v_mov_b32_e32 v1, v8
|
||||
; SI-NEXT: s_cbranch_execnz .LBB67_3
|
||||
; SI-NEXT: .LBB67_2: ; %cmp.true
|
||||
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
|
||||
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5
|
||||
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
|
||||
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7
|
||||
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
|
||||
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
|
||||
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v4
|
||||
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
|
||||
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
|
||||
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
|
||||
; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
|
||||
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
|
||||
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
|
||||
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
|
||||
; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
|
||||
; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16
|
||||
; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
|
||||
; SI-NEXT: v_mov_b32_e32 v1, v2
|
||||
; SI-NEXT: .LBB67_3: ; %end
|
||||
; SI-NEXT: s_setpc_b64 s[30:31]
|
||||
; SI-NEXT: .LBB67_4:
|
||||
@ -8400,42 +8408,43 @@ define inreg <2 x i32> @bitcast_v4bf16_to_v2i32_scalar(<4 x bfloat> inreg %a, i3
|
||||
; VI-NEXT: s_cbranch_execnz .LBB67_4
|
||||
; VI-NEXT: .LBB67_2: ; %cmp.true
|
||||
; VI-NEXT: s_lshl_b32 s4, s17, 16
|
||||
; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
|
||||
; VI-NEXT: v_add_f32_e32 v1, s4, v0
|
||||
; VI-NEXT: v_mov_b32_e32 v4, 0x40c00000
|
||||
; VI-NEXT: v_add_f32_e32 v0, s4, v4
|
||||
; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
|
||||
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
|
||||
; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
|
||||
; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
|
||||
; VI-NEXT: v_add_f32_e32 v1, s4, v4
|
||||
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
|
||||
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
||||
; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
|
||||
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
|
||||
; VI-NEXT: v_add_f32_e32 v2, s4, v0
|
||||
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
|
||||
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
|
||||
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
; VI-NEXT: s_lshl_b32 s4, s16, 16
|
||||
; VI-NEXT: v_alignbit_b32 v1, v2, v1, 16
|
||||
; VI-NEXT: v_add_f32_e32 v2, s4, v0
|
||||
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
|
||||
; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
|
||||
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
|
||||
; VI-NEXT: v_add_f32_e32 v0, s4, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
|
||||
; VI-NEXT: v_bfe_u32 v3, v0, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
|
||||
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0
|
||||
; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1]
|
||||
; VI-NEXT: v_add_f32_e32 v0, s4, v4
|
||||
; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
|
||||
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
|
||||
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
||||
; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16
|
||||
; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
|
||||
; VI-NEXT: v_add_f32_e32 v1, s4, v4
|
||||
; VI-NEXT: v_bfe_u32 v3, v1, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
|
||||
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
||||
; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
|
||||
; VI-NEXT: v_mov_b32_e32 v1, v2
|
||||
; VI-NEXT: s_setpc_b64 s[30:31]
|
||||
; VI-NEXT: .LBB67_3:
|
||||
; VI-NEXT: s_branch .LBB67_2
|
||||
@ -10937,30 +10946,32 @@ define inreg <2 x float> @bitcast_v4bf16_to_v2f32_scalar(<4 x bfloat> inreg %a,
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SI-NEXT: s_cmp_lg_u32 s20, 0
|
||||
; SI-NEXT: v_mul_f32_e64 v4, 1.0, s17
|
||||
; SI-NEXT: v_mul_f32_e64 v5, 1.0, s16
|
||||
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s19
|
||||
; SI-NEXT: v_mul_f32_e64 v3, 1.0, s18
|
||||
; SI-NEXT: v_mul_f32_e64 v7, 1.0, s17
|
||||
; SI-NEXT: v_mul_f32_e64 v4, 1.0, s16
|
||||
; SI-NEXT: v_mul_f32_e64 v6, 1.0, s19
|
||||
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18
|
||||
; SI-NEXT: s_cbranch_scc0 .LBB83_4
|
||||
; SI-NEXT: ; %bb.1: ; %cmp.false
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v4
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2
|
||||
; SI-NEXT: v_alignbit_b32 v0, v0, v5, 16
|
||||
; SI-NEXT: v_alignbit_b32 v1, v1, v3, 16
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v6
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v7
|
||||
; SI-NEXT: v_lshr_b64 v[8:9], v[2:3], 16
|
||||
; SI-NEXT: v_lshr_b64 v[0:1], v[4:5], 16
|
||||
; SI-NEXT: v_mov_b32_e32 v1, v8
|
||||
; SI-NEXT: s_cbranch_execnz .LBB83_3
|
||||
; SI-NEXT: .LBB83_2: ; %cmp.true
|
||||
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
|
||||
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v5
|
||||
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v6
|
||||
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7
|
||||
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
|
||||
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
|
||||
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v4
|
||||
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
|
||||
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
|
||||
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
|
||||
; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
|
||||
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
|
||||
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
|
||||
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
|
||||
; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
|
||||
; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 16
|
||||
; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
|
||||
; SI-NEXT: v_mov_b32_e32 v1, v2
|
||||
; SI-NEXT: .LBB83_3: ; %end
|
||||
; SI-NEXT: s_setpc_b64 s[30:31]
|
||||
; SI-NEXT: .LBB83_4:
|
||||
@ -10976,42 +10987,43 @@ define inreg <2 x float> @bitcast_v4bf16_to_v2f32_scalar(<4 x bfloat> inreg %a,
|
||||
; VI-NEXT: s_cbranch_execnz .LBB83_4
|
||||
; VI-NEXT: .LBB83_2: ; %cmp.true
|
||||
; VI-NEXT: s_lshl_b32 s4, s17, 16
|
||||
; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
|
||||
; VI-NEXT: v_add_f32_e32 v1, s4, v0
|
||||
; VI-NEXT: v_mov_b32_e32 v4, 0x40c00000
|
||||
; VI-NEXT: v_add_f32_e32 v0, s4, v4
|
||||
; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
|
||||
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
|
||||
; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
|
||||
; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
|
||||
; VI-NEXT: v_add_f32_e32 v1, s4, v4
|
||||
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
|
||||
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
||||
; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
|
||||
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
|
||||
; VI-NEXT: v_add_f32_e32 v2, s4, v0
|
||||
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
|
||||
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
|
||||
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
; VI-NEXT: s_lshl_b32 s4, s16, 16
|
||||
; VI-NEXT: v_alignbit_b32 v1, v2, v1, 16
|
||||
; VI-NEXT: v_add_f32_e32 v2, s4, v0
|
||||
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
|
||||
; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
|
||||
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
|
||||
; VI-NEXT: v_add_f32_e32 v0, s4, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
|
||||
; VI-NEXT: v_bfe_u32 v3, v0, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
|
||||
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0
|
||||
; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1]
|
||||
; VI-NEXT: v_add_f32_e32 v0, s4, v4
|
||||
; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
|
||||
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
|
||||
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
||||
; VI-NEXT: v_alignbit_b32 v0, v0, v2, 16
|
||||
; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
|
||||
; VI-NEXT: v_add_f32_e32 v1, s4, v4
|
||||
; VI-NEXT: v_bfe_u32 v3, v1, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
|
||||
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
||||
; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
|
||||
; VI-NEXT: v_mov_b32_e32 v1, v2
|
||||
; VI-NEXT: s_setpc_b64 s[30:31]
|
||||
; VI-NEXT: .LBB83_3:
|
||||
; VI-NEXT: s_branch .LBB83_2
|
||||
@ -13151,37 +13163,38 @@ define inreg <4 x i16> @bitcast_v4bf16_to_v4i16_scalar(<4 x bfloat> inreg %a, i3
|
||||
; SI-NEXT: s_cmp_lg_u32 s20, 0
|
||||
; SI-NEXT: v_mul_f32_e64 v7, 1.0, s16
|
||||
; SI-NEXT: v_mul_f32_e64 v6, 1.0, s17
|
||||
; SI-NEXT: v_mul_f32_e64 v5, 1.0, s18
|
||||
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19
|
||||
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18
|
||||
; SI-NEXT: v_mul_f32_e64 v5, 1.0, s19
|
||||
; SI-NEXT: s_cbranch_scc0 .LBB95_4
|
||||
; SI-NEXT: ; %bb.1: ; %cmp.false
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v7
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v6
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v5
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5
|
||||
; SI-NEXT: s_cbranch_execnz .LBB95_3
|
||||
; SI-NEXT: .LBB95_2: ; %cmp.true
|
||||
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
|
||||
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
|
||||
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v2
|
||||
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5
|
||||
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
|
||||
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
|
||||
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
|
||||
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v7
|
||||
; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
|
||||
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
|
||||
; SI-NEXT: v_add_f32_e32 v6, 0x40c00000, v1
|
||||
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4
|
||||
; SI-NEXT: v_lshr_b64 v[4:5], v[1:2], 16
|
||||
; SI-NEXT: v_alignbit_b32 v0, v6, v0, 16
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6
|
||||
; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
|
||||
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
|
||||
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v1
|
||||
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v5
|
||||
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
|
||||
; SI-NEXT: v_lshr_b64 v[4:5], v[2:3], 16
|
||||
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
|
||||
; SI-NEXT: v_mov_b32_e32 v2, v4
|
||||
; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
|
||||
; SI-NEXT: .LBB95_3: ; %end
|
||||
; SI-NEXT: v_mov_b32_e32 v1, v4
|
||||
; SI-NEXT: v_mov_b32_e32 v2, v4
|
||||
; SI-NEXT: s_setpc_b64 s[30:31]
|
||||
; SI-NEXT: .LBB95_4:
|
||||
; SI-NEXT: ; implicit-def: $vgpr0
|
||||
; SI-NEXT: ; implicit-def: $vgpr1
|
||||
; SI-NEXT: ; implicit-def: $vgpr4
|
||||
; SI-NEXT: ; implicit-def: $vgpr2
|
||||
; SI-NEXT: ; implicit-def: $vgpr3
|
||||
; SI-NEXT: s_branch .LBB95_2
|
||||
;
|
||||
@ -13194,42 +13207,43 @@ define inreg <4 x i16> @bitcast_v4bf16_to_v4i16_scalar(<4 x bfloat> inreg %a, i3
|
||||
; VI-NEXT: s_cbranch_execnz .LBB95_4
|
||||
; VI-NEXT: .LBB95_2: ; %cmp.true
|
||||
; VI-NEXT: s_lshl_b32 s4, s16, 16
|
||||
; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
|
||||
; VI-NEXT: v_add_f32_e32 v1, s4, v0
|
||||
; VI-NEXT: v_mov_b32_e32 v3, 0x40c00000
|
||||
; VI-NEXT: v_add_f32_e32 v0, s4, v3
|
||||
; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
|
||||
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
|
||||
; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
|
||||
; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
|
||||
; VI-NEXT: v_add_f32_e32 v1, s4, v3
|
||||
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
|
||||
; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
|
||||
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
||||
; VI-NEXT: v_add_f32_e32 v1, s4, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
|
||||
; VI-NEXT: v_bfe_u32 v3, v1, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
|
||||
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
||||
; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
|
||||
; VI-NEXT: s_lshl_b32 s4, s17, 16
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
|
||||
; VI-NEXT: v_add_f32_e32 v1, s4, v0
|
||||
; VI-NEXT: v_bfe_u32 v4, v1, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1
|
||||
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
|
||||
; VI-NEXT: v_add_f32_e32 v2, s4, v3
|
||||
; VI-NEXT: v_bfe_u32 v4, v2, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2
|
||||
; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
|
||||
; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
|
||||
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
||||
; VI-NEXT: v_add_f32_e32 v0, s4, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc
|
||||
; VI-NEXT: v_bfe_u32 v4, v0, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v0
|
||||
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
|
||||
; VI-NEXT: v_add_f32_e32 v3, s4, v3
|
||||
; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
|
||||
; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
|
||||
; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
|
||||
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
||||
; VI-NEXT: v_alignbit_b32 v1, v0, v1, 16
|
||||
; VI-NEXT: v_alignbit_b32 v0, v3, v2, 16
|
||||
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
|
||||
; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3]
|
||||
; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
|
||||
; VI-NEXT: v_mov_b32_e32 v1, v2
|
||||
; VI-NEXT: s_setpc_b64 s[30:31]
|
||||
; VI-NEXT: .LBB95_3:
|
||||
; VI-NEXT: s_branch .LBB95_2
|
||||
@ -15062,42 +15076,43 @@ define inreg <4 x half> @bitcast_v4bf16_to_v4f16_scalar(<4 x bfloat> inreg %a, i
|
||||
; VI-NEXT: s_cbranch_execnz .LBB103_4
|
||||
; VI-NEXT: .LBB103_2: ; %cmp.true
|
||||
; VI-NEXT: s_lshl_b32 s4, s16, 16
|
||||
; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
|
||||
; VI-NEXT: v_add_f32_e32 v1, s4, v0
|
||||
; VI-NEXT: v_mov_b32_e32 v3, 0x40c00000
|
||||
; VI-NEXT: v_add_f32_e32 v0, s4, v3
|
||||
; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
|
||||
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
|
||||
; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
|
||||
; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
|
||||
; VI-NEXT: v_add_f32_e32 v1, s4, v3
|
||||
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
|
||||
; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
|
||||
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
||||
; VI-NEXT: v_add_f32_e32 v1, s4, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
|
||||
; VI-NEXT: v_bfe_u32 v3, v1, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
|
||||
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
||||
; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
|
||||
; VI-NEXT: s_lshl_b32 s4, s17, 16
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
|
||||
; VI-NEXT: v_add_f32_e32 v1, s4, v0
|
||||
; VI-NEXT: v_bfe_u32 v4, v1, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1
|
||||
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
|
||||
; VI-NEXT: v_add_f32_e32 v2, s4, v3
|
||||
; VI-NEXT: v_bfe_u32 v4, v2, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2
|
||||
; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
|
||||
; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
|
||||
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
||||
; VI-NEXT: v_add_f32_e32 v0, s4, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc
|
||||
; VI-NEXT: v_bfe_u32 v4, v0, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v0
|
||||
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
|
||||
; VI-NEXT: v_add_f32_e32 v3, s4, v3
|
||||
; VI-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
|
||||
; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
|
||||
; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
|
||||
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
||||
; VI-NEXT: v_alignbit_b32 v1, v0, v1, 16
|
||||
; VI-NEXT: v_alignbit_b32 v0, v3, v2, 16
|
||||
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
|
||||
; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3]
|
||||
; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
|
||||
; VI-NEXT: v_mov_b32_e32 v1, v2
|
||||
; VI-NEXT: s_setpc_b64 s[30:31]
|
||||
; VI-NEXT: .LBB103_3:
|
||||
; VI-NEXT: s_branch .LBB103_2
|
||||
@ -16737,52 +16752,54 @@ define inreg <8 x i8> @bitcast_v4bf16_to_v8i8_scalar(<4 x bfloat> inreg %a, i32
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SI-NEXT: s_cmp_lg_u32 s20, 0
|
||||
; SI-NEXT: v_mul_f32_e64 v11, 1.0, s17
|
||||
; SI-NEXT: v_mul_f32_e64 v15, 1.0, s17
|
||||
; SI-NEXT: v_mul_f32_e64 v12, 1.0, s16
|
||||
; SI-NEXT: v_mul_f32_e64 v0, 1.0, s19
|
||||
; SI-NEXT: v_mul_f32_e64 v8, 1.0, s18
|
||||
; SI-NEXT: v_mul_f32_e64 v14, 1.0, s19
|
||||
; SI-NEXT: v_mul_f32_e64 v5, 1.0, s18
|
||||
; SI-NEXT: s_cbranch_scc0 .LBB109_4
|
||||
; SI-NEXT: ; %bb.1: ; %cmp.false
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v11
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0
|
||||
; SI-NEXT: v_alignbit_b32 v9, v1, v12, 16
|
||||
; SI-NEXT: v_alignbit_b32 v10, v6, v8, 16
|
||||
; SI-NEXT: v_lshr_b64 v[3:4], v[9:10], 24
|
||||
; SI-NEXT: v_lshr_b64 v[4:5], v[9:10], 16
|
||||
; SI-NEXT: v_lshr_b64 v[1:2], v[9:10], 8
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v0
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v10
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v14
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v15
|
||||
; SI-NEXT: v_lshr_b64 v[10:11], v[5:6], 16
|
||||
; SI-NEXT: v_lshr_b64 v[0:1], v[12:13], 16
|
||||
; SI-NEXT: v_mov_b32_e32 v1, v10
|
||||
; SI-NEXT: v_lshr_b64 v[8:9], v[0:1], 16
|
||||
; SI-NEXT: v_lshr_b64 v[3:4], v[0:1], 24
|
||||
; SI-NEXT: v_lshr_b64 v[1:2], v[0:1], 8
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v14
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v10
|
||||
; SI-NEXT: s_cbranch_execnz .LBB109_3
|
||||
; SI-NEXT: .LBB109_2: ; %cmp.true
|
||||
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v11
|
||||
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v12
|
||||
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
|
||||
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5
|
||||
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v2
|
||||
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v14
|
||||
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v15
|
||||
; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2
|
||||
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v12
|
||||
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
|
||||
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
|
||||
; SI-NEXT: v_alignbit_b32 v9, v2, v1, 16
|
||||
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7
|
||||
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
|
||||
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0
|
||||
; SI-NEXT: v_alignbit_b32 v10, v6, v1, 16
|
||||
; SI-NEXT: v_lshr_b64 v[3:4], v[9:10], 24
|
||||
; SI-NEXT: v_lshr_b64 v[4:5], v[9:10], 16
|
||||
; SI-NEXT: v_lshr_b64 v[1:2], v[9:10], 8
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v10
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v0
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
; SI-NEXT: v_lshr_b64 v[10:11], v[5:6], 16
|
||||
; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
|
||||
; SI-NEXT: v_mov_b32_e32 v1, v10
|
||||
; SI-NEXT: v_lshr_b64 v[3:4], v[0:1], 24
|
||||
; SI-NEXT: v_lshr_b64 v[8:9], v[0:1], 16
|
||||
; SI-NEXT: v_lshr_b64 v[1:2], v[0:1], 8
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v10
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v7
|
||||
; SI-NEXT: .LBB109_3: ; %end
|
||||
; SI-NEXT: v_mov_b32_e32 v0, v9
|
||||
; SI-NEXT: v_mov_b32_e32 v2, v4
|
||||
; SI-NEXT: v_mov_b32_e32 v2, v8
|
||||
; SI-NEXT: v_mov_b32_e32 v4, v10
|
||||
; SI-NEXT: v_mov_b32_e32 v5, v9
|
||||
; SI-NEXT: s_setpc_b64 s[30:31]
|
||||
; SI-NEXT: .LBB109_4:
|
||||
; SI-NEXT: ; implicit-def: $vgpr9
|
||||
; SI-NEXT: ; implicit-def: $vgpr0
|
||||
; SI-NEXT: ; implicit-def: $vgpr1
|
||||
; SI-NEXT: ; implicit-def: $vgpr4
|
||||
; SI-NEXT: ; implicit-def: $vgpr8
|
||||
; SI-NEXT: ; implicit-def: $vgpr3
|
||||
; SI-NEXT: ; implicit-def: $vgpr5
|
||||
; SI-NEXT: ; implicit-def: $vgpr6
|
||||
; SI-NEXT: ; implicit-def: $vgpr10
|
||||
; SI-NEXT: ; implicit-def: $vgpr9
|
||||
; SI-NEXT: ; implicit-def: $vgpr7
|
||||
; SI-NEXT: s_branch .LBB109_2
|
||||
;
|
||||
@ -16793,11 +16810,11 @@ define inreg <8 x i8> @bitcast_v4bf16_to_v8i8_scalar(<4 x bfloat> inreg %a, i32
|
||||
; VI-NEXT: s_cbranch_scc0 .LBB109_3
|
||||
; VI-NEXT: ; %bb.1: ; %cmp.false
|
||||
; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24
|
||||
; VI-NEXT: s_lshr_b32 s8, s17, 24
|
||||
; VI-NEXT: s_lshr_b32 s5, s17, 16
|
||||
; VI-NEXT: s_lshr_b32 s10, s17, 8
|
||||
; VI-NEXT: s_lshr_b32 s9, s16, 16
|
||||
; VI-NEXT: s_lshr_b32 s11, s16, 8
|
||||
; VI-NEXT: s_lshr_b32 s5, s17, 24
|
||||
; VI-NEXT: s_lshr_b32 s11, s17, 16
|
||||
; VI-NEXT: s_lshr_b32 s8, s17, 8
|
||||
; VI-NEXT: s_lshr_b32 s10, s16, 16
|
||||
; VI-NEXT: s_lshr_b32 s9, s16, 8
|
||||
; VI-NEXT: s_cbranch_execnz .LBB109_4
|
||||
; VI-NEXT: .LBB109_2: ; %cmp.true
|
||||
; VI-NEXT: s_lshl_b32 s4, s17, 16
|
||||
@ -16810,58 +16827,59 @@ define inreg <8 x i8> @bitcast_v4bf16_to_v8i8_scalar(<4 x bfloat> inreg %a, i32
|
||||
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
||||
; VI-NEXT: v_add_f32_e32 v1, s4, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v3, v2, v3, vcc
|
||||
; VI-NEXT: v_cndmask_b32_e32 v5, v2, v3, vcc
|
||||
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
|
||||
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1
|
||||
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
||||
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
|
||||
; VI-NEXT: s_lshl_b32 s4, s16, 16
|
||||
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v1
|
||||
; VI-NEXT: v_add_f32_e32 v1, s4, v0
|
||||
; VI-NEXT: v_bfe_u32 v4, v1, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1
|
||||
; VI-NEXT: v_lshrrev_b64 v[1:2], 16, v[5:6]
|
||||
; VI-NEXT: s_lshl_b32 s4, s16, 16
|
||||
; VI-NEXT: v_add_f32_e32 v2, s4, v0
|
||||
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
|
||||
; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
|
||||
; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
|
||||
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
|
||||
; VI-NEXT: v_add_f32_e32 v0, s4, v0
|
||||
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
||||
; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
|
||||
; VI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
|
||||
; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
|
||||
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
|
||||
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
|
||||
; VI-NEXT: v_bfe_u32 v3, v0, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v0
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
|
||||
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, v1, v5, vcc
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
||||
; VI-NEXT: v_alignbit_b32 v2, v6, v3, 16
|
||||
; VI-NEXT: v_alignbit_b32 v1, v0, v4, 16
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v3
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v4
|
||||
; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[1:2]
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v2
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v2
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v1
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0
|
||||
; VI-NEXT: v_lshrrev_b64 v[9:10], 16, v[2:3]
|
||||
; VI-NEXT: v_mov_b32_e32 v10, v1
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v5
|
||||
; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[9:10]
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v2
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v1
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v9
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v9
|
||||
; VI-NEXT: v_mov_b32_e32 v4, v8
|
||||
; VI-NEXT: s_setpc_b64 s[30:31]
|
||||
; VI-NEXT: .LBB109_3:
|
||||
; VI-NEXT: ; implicit-def: $sgpr11
|
||||
; VI-NEXT: ; implicit-def: $sgpr9
|
||||
; VI-NEXT: ; implicit-def: $sgpr4
|
||||
; VI-NEXT: ; implicit-def: $sgpr10
|
||||
; VI-NEXT: ; implicit-def: $sgpr5
|
||||
; VI-NEXT: ; implicit-def: $sgpr4
|
||||
; VI-NEXT: ; implicit-def: $sgpr8
|
||||
; VI-NEXT: ; implicit-def: $sgpr11
|
||||
; VI-NEXT: ; implicit-def: $sgpr5
|
||||
; VI-NEXT: s_branch .LBB109_2
|
||||
; VI-NEXT: .LBB109_4:
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s11
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s9
|
||||
; VI-NEXT: v_mov_b32_e32 v5, s10
|
||||
; VI-NEXT: v_mov_b32_e32 v7, s8
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s4
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s16
|
||||
; VI-NEXT: v_mov_b32_e32 v6, s5
|
||||
; VI-NEXT: v_mov_b32_e32 v6, s11
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s10
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s9
|
||||
; VI-NEXT: v_mov_b32_e32 v7, s5
|
||||
; VI-NEXT: v_mov_b32_e32 v5, s8
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s4
|
||||
; VI-NEXT: v_mov_b32_e32 v4, s17
|
||||
; VI-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
|
||||
@ -2214,40 +2214,42 @@ define inreg <3 x i32> @bitcast_v6bf16_to_v3i32_scalar(<6 x bfloat> inreg %a, i3
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SI-NEXT: s_cmp_lg_u32 s22, 0
|
||||
; SI-NEXT: v_mul_f32_e64 v7, 1.0, s17
|
||||
; SI-NEXT: v_mul_f32_e64 v8, 1.0, s16
|
||||
; SI-NEXT: v_mul_f32_e64 v5, 1.0, s19
|
||||
; SI-NEXT: v_mul_f32_e64 v6, 1.0, s18
|
||||
; SI-NEXT: v_mul_f32_e64 v3, 1.0, s21
|
||||
; SI-NEXT: v_mul_f32_e64 v4, 1.0, s20
|
||||
; SI-NEXT: v_mul_f32_e64 v11, 1.0, s17
|
||||
; SI-NEXT: v_mul_f32_e64 v7, 1.0, s16
|
||||
; SI-NEXT: v_mul_f32_e64 v10, 1.0, s19
|
||||
; SI-NEXT: v_mul_f32_e64 v5, 1.0, s18
|
||||
; SI-NEXT: v_mul_f32_e64 v9, 1.0, s21
|
||||
; SI-NEXT: v_mul_f32_e64 v3, 1.0, s20
|
||||
; SI-NEXT: s_cbranch_scc0 .LBB11_4
|
||||
; SI-NEXT: ; %bb.1: ; %cmp.false
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v7
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3
|
||||
; SI-NEXT: v_alignbit_b32 v0, v0, v8, 16
|
||||
; SI-NEXT: v_alignbit_b32 v1, v1, v6, 16
|
||||
; SI-NEXT: v_alignbit_b32 v2, v2, v4, 16
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v11
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v9
|
||||
; SI-NEXT: v_lshr_b64 v[0:1], v[7:8], 16
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10
|
||||
; SI-NEXT: v_lshr_b64 v[12:13], v[3:4], 16
|
||||
; SI-NEXT: v_lshr_b64 v[1:2], v[5:6], 16
|
||||
; SI-NEXT: v_mov_b32_e32 v2, v12
|
||||
; SI-NEXT: s_cbranch_execnz .LBB11_3
|
||||
; SI-NEXT: .LBB11_2: ; %cmp.true
|
||||
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7
|
||||
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8
|
||||
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v11
|
||||
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v7
|
||||
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
|
||||
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5
|
||||
; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
|
||||
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
|
||||
; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v9
|
||||
; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
|
||||
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v10
|
||||
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
|
||||
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
|
||||
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v5
|
||||
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
|
||||
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
|
||||
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
|
||||
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
|
||||
; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
|
||||
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
|
||||
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
|
||||
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
|
||||
; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
|
||||
; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16
|
||||
; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
|
||||
; SI-NEXT: v_mov_b32_e32 v2, v3
|
||||
; SI-NEXT: .LBB11_3: ; %end
|
||||
; SI-NEXT: s_setpc_b64 s[30:31]
|
||||
; SI-NEXT: .LBB11_4:
|
||||
@ -2263,60 +2265,61 @@ define inreg <3 x i32> @bitcast_v6bf16_to_v3i32_scalar(<6 x bfloat> inreg %a, i3
|
||||
; VI-NEXT: s_cbranch_execnz .LBB11_4
|
||||
; VI-NEXT: .LBB11_2: ; %cmp.true
|
||||
; VI-NEXT: s_lshl_b32 s4, s18, 16
|
||||
; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
|
||||
; VI-NEXT: v_add_f32_e32 v1, s4, v0
|
||||
; VI-NEXT: v_mov_b32_e32 v5, 0x40c00000
|
||||
; VI-NEXT: v_add_f32_e32 v0, s4, v5
|
||||
; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
|
||||
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
|
||||
; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
|
||||
; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
|
||||
; VI-NEXT: v_add_f32_e32 v1, s4, v5
|
||||
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
|
||||
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
||||
; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
|
||||
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
|
||||
; VI-NEXT: v_add_f32_e32 v2, s4, v0
|
||||
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
|
||||
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
|
||||
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
; VI-NEXT: s_lshl_b32 s4, s17, 16
|
||||
; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16
|
||||
; VI-NEXT: v_add_f32_e32 v1, s4, v0
|
||||
; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1]
|
||||
; VI-NEXT: v_add_f32_e32 v0, s4, v5
|
||||
; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
|
||||
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
|
||||
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
|
||||
; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
|
||||
; VI-NEXT: v_add_f32_e32 v1, s4, v5
|
||||
; VI-NEXT: v_bfe_u32 v3, v1, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
|
||||
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
||||
; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
|
||||
; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
|
||||
; VI-NEXT: v_add_f32_e32 v3, s4, v0
|
||||
; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
|
||||
; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
|
||||
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
|
||||
; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
; VI-NEXT: s_lshl_b32 s4, s16, 16
|
||||
; VI-NEXT: v_alignbit_b32 v1, v3, v1, 16
|
||||
; VI-NEXT: v_add_f32_e32 v3, s4, v0
|
||||
; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
|
||||
; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
|
||||
; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
|
||||
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
|
||||
; VI-NEXT: v_add_f32_e32 v0, s4, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
|
||||
; VI-NEXT: v_bfe_u32 v4, v0, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v0
|
||||
; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
|
||||
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0
|
||||
; VI-NEXT: v_lshrrev_b64 v[3:4], 16, v[0:1]
|
||||
; VI-NEXT: v_add_f32_e32 v0, s4, v5
|
||||
; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
|
||||
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
|
||||
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
||||
; VI-NEXT: v_alignbit_b32 v0, v0, v3, 16
|
||||
; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc
|
||||
; VI-NEXT: v_add_f32_e32 v1, s4, v5
|
||||
; VI-NEXT: v_bfe_u32 v4, v1, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1
|
||||
; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
|
||||
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
||||
; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
|
||||
; VI-NEXT: v_mov_b32_e32 v1, v3
|
||||
; VI-NEXT: s_setpc_b64 s[30:31]
|
||||
; VI-NEXT: .LBB11_3:
|
||||
; VI-NEXT: s_branch .LBB11_2
|
||||
@ -5430,40 +5433,42 @@ define inreg <3 x float> @bitcast_v6bf16_to_v3f32_scalar(<6 x bfloat> inreg %a,
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SI-NEXT: s_cmp_lg_u32 s22, 0
|
||||
; SI-NEXT: v_mul_f32_e64 v7, 1.0, s17
|
||||
; SI-NEXT: v_mul_f32_e64 v8, 1.0, s16
|
||||
; SI-NEXT: v_mul_f32_e64 v5, 1.0, s19
|
||||
; SI-NEXT: v_mul_f32_e64 v6, 1.0, s18
|
||||
; SI-NEXT: v_mul_f32_e64 v3, 1.0, s21
|
||||
; SI-NEXT: v_mul_f32_e64 v4, 1.0, s20
|
||||
; SI-NEXT: v_mul_f32_e64 v11, 1.0, s17
|
||||
; SI-NEXT: v_mul_f32_e64 v7, 1.0, s16
|
||||
; SI-NEXT: v_mul_f32_e64 v10, 1.0, s19
|
||||
; SI-NEXT: v_mul_f32_e64 v5, 1.0, s18
|
||||
; SI-NEXT: v_mul_f32_e64 v9, 1.0, s21
|
||||
; SI-NEXT: v_mul_f32_e64 v3, 1.0, s20
|
||||
; SI-NEXT: s_cbranch_scc0 .LBB27_4
|
||||
; SI-NEXT: ; %bb.1: ; %cmp.false
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v7
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v5
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3
|
||||
; SI-NEXT: v_alignbit_b32 v0, v0, v8, 16
|
||||
; SI-NEXT: v_alignbit_b32 v1, v1, v6, 16
|
||||
; SI-NEXT: v_alignbit_b32 v2, v2, v4, 16
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v11
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v9
|
||||
; SI-NEXT: v_lshr_b64 v[0:1], v[7:8], 16
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10
|
||||
; SI-NEXT: v_lshr_b64 v[12:13], v[3:4], 16
|
||||
; SI-NEXT: v_lshr_b64 v[1:2], v[5:6], 16
|
||||
; SI-NEXT: v_mov_b32_e32 v2, v12
|
||||
; SI-NEXT: s_cbranch_execnz .LBB27_3
|
||||
; SI-NEXT: .LBB27_2: ; %cmp.true
|
||||
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v7
|
||||
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v8
|
||||
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v11
|
||||
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v7
|
||||
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
|
||||
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5
|
||||
; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
|
||||
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
|
||||
; SI-NEXT: v_and_b32_e32 v4, 0xffff0000, v9
|
||||
; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
|
||||
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v10
|
||||
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
|
||||
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v4
|
||||
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v5
|
||||
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
|
||||
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
|
||||
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
|
||||
; SI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
|
||||
; SI-NEXT: v_alignbit_b32 v1, v2, v1, 16
|
||||
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
|
||||
; SI-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
|
||||
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
|
||||
; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
|
||||
; SI-NEXT: v_lshr_b64 v[3:4], v[3:4], 16
|
||||
; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
|
||||
; SI-NEXT: v_mov_b32_e32 v2, v3
|
||||
; SI-NEXT: .LBB27_3: ; %end
|
||||
; SI-NEXT: s_setpc_b64 s[30:31]
|
||||
; SI-NEXT: .LBB27_4:
|
||||
@ -5479,60 +5484,61 @@ define inreg <3 x float> @bitcast_v6bf16_to_v3f32_scalar(<6 x bfloat> inreg %a,
|
||||
; VI-NEXT: s_cbranch_execnz .LBB27_4
|
||||
; VI-NEXT: .LBB27_2: ; %cmp.true
|
||||
; VI-NEXT: s_lshl_b32 s4, s18, 16
|
||||
; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
|
||||
; VI-NEXT: v_add_f32_e32 v1, s4, v0
|
||||
; VI-NEXT: v_mov_b32_e32 v5, 0x40c00000
|
||||
; VI-NEXT: v_add_f32_e32 v0, s4, v5
|
||||
; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
|
||||
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
|
||||
; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
|
||||
; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
|
||||
; VI-NEXT: v_add_f32_e32 v1, s4, v5
|
||||
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
|
||||
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
||||
; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
|
||||
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
|
||||
; VI-NEXT: v_add_f32_e32 v2, s4, v0
|
||||
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
|
||||
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
|
||||
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
; VI-NEXT: s_lshl_b32 s4, s17, 16
|
||||
; VI-NEXT: v_alignbit_b32 v2, v2, v1, 16
|
||||
; VI-NEXT: v_add_f32_e32 v1, s4, v0
|
||||
; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[0:1]
|
||||
; VI-NEXT: v_add_f32_e32 v0, s4, v5
|
||||
; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
|
||||
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
|
||||
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
|
||||
; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
|
||||
; VI-NEXT: v_add_f32_e32 v1, s4, v5
|
||||
; VI-NEXT: v_bfe_u32 v3, v1, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v1
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
|
||||
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
||||
; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
|
||||
; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
|
||||
; VI-NEXT: v_add_f32_e32 v3, s4, v0
|
||||
; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
|
||||
; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
|
||||
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
|
||||
; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
; VI-NEXT: s_lshl_b32 s4, s16, 16
|
||||
; VI-NEXT: v_alignbit_b32 v1, v3, v1, 16
|
||||
; VI-NEXT: v_add_f32_e32 v3, s4, v0
|
||||
; VI-NEXT: v_bfe_u32 v4, v3, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v3
|
||||
; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
|
||||
; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
|
||||
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v3
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
|
||||
; VI-NEXT: v_add_f32_e32 v0, s4, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
|
||||
; VI-NEXT: v_bfe_u32 v4, v0, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v0
|
||||
; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
|
||||
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v0
|
||||
; VI-NEXT: v_lshrrev_b64 v[3:4], 16, v[0:1]
|
||||
; VI-NEXT: v_add_f32_e32 v0, s4, v5
|
||||
; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
|
||||
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
|
||||
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v0
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
||||
; VI-NEXT: v_alignbit_b32 v0, v0, v3, 16
|
||||
; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc
|
||||
; VI-NEXT: v_add_f32_e32 v1, s4, v5
|
||||
; VI-NEXT: v_bfe_u32 v4, v1, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v1
|
||||
; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
|
||||
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
||||
; VI-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
|
||||
; VI-NEXT: v_mov_b32_e32 v1, v3
|
||||
; VI-NEXT: s_setpc_b64 s[30:31]
|
||||
; VI-NEXT: .LBB27_3:
|
||||
; VI-NEXT: s_branch .LBB27_2
|
||||
@ -8098,70 +8104,73 @@ define inreg <12 x i8> @bitcast_v6bf16_to_v12i8_scalar(<6 x bfloat> inreg %a, i3
|
||||
; SI: ; %bb.0:
|
||||
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; SI-NEXT: s_cmp_lg_u32 s22, 0
|
||||
; SI-NEXT: v_mul_f32_e64 v17, 1.0, s17
|
||||
; SI-NEXT: v_mul_f32_e64 v18, 1.0, s16
|
||||
; SI-NEXT: v_mul_f32_e64 v15, 1.0, s19
|
||||
; SI-NEXT: v_mul_f32_e64 v16, 1.0, s18
|
||||
; SI-NEXT: v_mul_f32_e64 v0, 1.0, s21
|
||||
; SI-NEXT: v_mul_f32_e64 v14, 1.0, s20
|
||||
; SI-NEXT: v_mul_f32_e64 v21, 1.0, s17
|
||||
; SI-NEXT: v_mul_f32_e64 v17, 1.0, s16
|
||||
; SI-NEXT: v_mul_f32_e64 v20, 1.0, s19
|
||||
; SI-NEXT: v_mul_f32_e64 v5, 1.0, s18
|
||||
; SI-NEXT: v_mul_f32_e64 v19, 1.0, s21
|
||||
; SI-NEXT: v_mul_f32_e64 v9, 1.0, s20
|
||||
; SI-NEXT: s_cbranch_scc0 .LBB39_4
|
||||
; SI-NEXT: ; %bb.1: ; %cmp.false
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v17
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v15
|
||||
; SI-NEXT: v_alignbit_b32 v12, v1, v18, 16
|
||||
; SI-NEXT: v_alignbit_b32 v13, v6, v16, 16
|
||||
; SI-NEXT: v_lshr_b64 v[3:4], v[12:13], 24
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0
|
||||
; SI-NEXT: v_lshr_b64 v[4:5], v[12:13], 16
|
||||
; SI-NEXT: v_alignbit_b32 v8, v10, v14, 16
|
||||
; SI-NEXT: v_lshr_b64 v[1:2], v[12:13], 8
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v15
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v0
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v13
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v8
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v20
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v21
|
||||
; SI-NEXT: v_lshr_b64 v[14:15], v[5:6], 16
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v19
|
||||
; SI-NEXT: v_lshr_b64 v[0:1], v[17:18], 16
|
||||
; SI-NEXT: v_lshr_b64 v[15:16], v[9:10], 16
|
||||
; SI-NEXT: v_mov_b32_e32 v1, v14
|
||||
; SI-NEXT: v_lshr_b64 v[3:4], v[0:1], 24
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v20
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v19
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v14
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v16, 8, v15
|
||||
; SI-NEXT: v_lshr_b64 v[12:13], v[0:1], 16
|
||||
; SI-NEXT: v_lshr_b64 v[1:2], v[0:1], 8
|
||||
; SI-NEXT: s_cbranch_execnz .LBB39_3
|
||||
; SI-NEXT: .LBB39_2: ; %cmp.true
|
||||
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v17
|
||||
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v18
|
||||
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
|
||||
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
|
||||
; SI-NEXT: v_alignbit_b32 v12, v2, v1, 16
|
||||
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v15
|
||||
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v16
|
||||
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v5
|
||||
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v2
|
||||
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v20
|
||||
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v21
|
||||
; SI-NEXT: v_add_f32_e32 v7, 0x40c00000, v2
|
||||
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9
|
||||
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v17
|
||||
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v7
|
||||
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
|
||||
; SI-NEXT: v_alignbit_b32 v13, v6, v1, 16
|
||||
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v14
|
||||
; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v2
|
||||
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v19
|
||||
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
|
||||
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v0
|
||||
; SI-NEXT: v_lshr_b64 v[3:4], v[12:13], 24
|
||||
; SI-NEXT: v_alignbit_b32 v8, v10, v1, 16
|
||||
; SI-NEXT: v_lshr_b64 v[4:5], v[12:13], 16
|
||||
; SI-NEXT: v_lshr_b64 v[1:2], v[12:13], 8
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v13
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v9, 8, v8
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
; SI-NEXT: v_lshr_b64 v[14:15], v[5:6], 16
|
||||
; SI-NEXT: v_add_f32_e32 v5, 0x40c00000, v2
|
||||
; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v5
|
||||
; SI-NEXT: v_mov_b32_e32 v1, v14
|
||||
; SI-NEXT: v_lshr_b64 v[15:16], v[9:10], 16
|
||||
; SI-NEXT: v_lshr_b64 v[3:4], v[0:1], 24
|
||||
; SI-NEXT: v_lshr_b64 v[12:13], v[0:1], 16
|
||||
; SI-NEXT: v_lshr_b64 v[1:2], v[0:1], 8
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v14
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v16, 8, v15
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v7, 24, v7
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v0
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v11, 24, v5
|
||||
; SI-NEXT: .LBB39_3: ; %end
|
||||
; SI-NEXT: v_mov_b32_e32 v0, v12
|
||||
; SI-NEXT: v_mov_b32_e32 v2, v4
|
||||
; SI-NEXT: v_mov_b32_e32 v4, v13
|
||||
; SI-NEXT: v_mov_b32_e32 v2, v12
|
||||
; SI-NEXT: v_mov_b32_e32 v4, v14
|
||||
; SI-NEXT: v_mov_b32_e32 v5, v8
|
||||
; SI-NEXT: v_mov_b32_e32 v8, v15
|
||||
; SI-NEXT: v_mov_b32_e32 v9, v16
|
||||
; SI-NEXT: s_setpc_b64 s[30:31]
|
||||
; SI-NEXT: .LBB39_4:
|
||||
; SI-NEXT: ; implicit-def: $vgpr12
|
||||
; SI-NEXT: ; implicit-def: $vgpr0
|
||||
; SI-NEXT: ; implicit-def: $vgpr1
|
||||
; SI-NEXT: ; implicit-def: $vgpr4
|
||||
; SI-NEXT: ; implicit-def: $vgpr12
|
||||
; SI-NEXT: ; implicit-def: $vgpr3
|
||||
; SI-NEXT: ; implicit-def: $vgpr5
|
||||
; SI-NEXT: ; implicit-def: $vgpr6
|
||||
; SI-NEXT: ; implicit-def: $vgpr7
|
||||
; SI-NEXT: ; implicit-def: $vgpr14
|
||||
; SI-NEXT: ; implicit-def: $vgpr8
|
||||
; SI-NEXT: ; implicit-def: $vgpr9
|
||||
; SI-NEXT: ; implicit-def: $vgpr10
|
||||
; SI-NEXT: ; implicit-def: $vgpr7
|
||||
; SI-NEXT: ; implicit-def: $vgpr15
|
||||
; SI-NEXT: ; implicit-def: $vgpr16
|
||||
; SI-NEXT: ; implicit-def: $vgpr11
|
||||
; SI-NEXT: s_branch .LBB39_2
|
||||
;
|
||||
@ -8171,110 +8180,110 @@ define inreg <12 x i8> @bitcast_v6bf16_to_v12i8_scalar(<6 x bfloat> inreg %a, i3
|
||||
; VI-NEXT: s_cmp_lg_u32 s19, 0
|
||||
; VI-NEXT: s_cbranch_scc0 .LBB39_3
|
||||
; VI-NEXT: ; %bb.1: ; %cmp.false
|
||||
; VI-NEXT: s_lshr_b32 s19, s16, 8
|
||||
; VI-NEXT: s_lshr_b32 s10, s18, 16
|
||||
; VI-NEXT: s_lshr_b32 s11, s18, 8
|
||||
; VI-NEXT: s_lshr_b32 s19, s18, 16
|
||||
; VI-NEXT: s_lshr_b32 s15, s18, 8
|
||||
; VI-NEXT: s_lshr_b32 s12, s17, 24
|
||||
; VI-NEXT: s_lshr_b32 s13, s17, 16
|
||||
; VI-NEXT: s_lshr_b32 s15, s17, 8
|
||||
; VI-NEXT: s_lshr_b32 s11, s17, 16
|
||||
; VI-NEXT: s_lshr_b32 s10, s17, 8
|
||||
; VI-NEXT: s_lshr_b32 s14, s16, 16
|
||||
; VI-NEXT: s_lshr_b32 s13, s16, 8
|
||||
; VI-NEXT: s_lshr_b64 s[6:7], s[18:19], 24
|
||||
; VI-NEXT: s_lshr_b64 s[4:5], s[16:17], 24
|
||||
; VI-NEXT: s_cbranch_execnz .LBB39_4
|
||||
; VI-NEXT: .LBB39_2: ; %cmp.true
|
||||
; VI-NEXT: s_lshl_b32 s4, s17, 16
|
||||
; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
|
||||
; VI-NEXT: v_add_f32_e32 v1, s4, v0
|
||||
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
|
||||
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
||||
; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
|
||||
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
|
||||
; VI-NEXT: v_add_f32_e32 v2, s4, v0
|
||||
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
|
||||
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
|
||||
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
|
||||
; VI-NEXT: s_lshl_b32 s4, s16, 16
|
||||
; VI-NEXT: v_alignbit_b32 v15, v2, v1, 16
|
||||
; VI-NEXT: v_add_f32_e32 v1, s4, v0
|
||||
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
|
||||
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
||||
; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
|
||||
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
|
||||
; VI-NEXT: v_add_f32_e32 v2, s4, v0
|
||||
; VI-NEXT: v_bfe_u32 v3, v2, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v2
|
||||
; VI-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
|
||||
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v2
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
|
||||
; VI-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
|
||||
; VI-NEXT: s_lshl_b32 s4, s18, 16
|
||||
; VI-NEXT: v_alignbit_b32 v14, v2, v1, 16
|
||||
; VI-NEXT: v_add_f32_e32 v1, s4, v0
|
||||
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
|
||||
; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
|
||||
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
||||
; VI-NEXT: v_add_f32_e32 v0, s4, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
|
||||
; VI-NEXT: v_bfe_u32 v2, v0, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v0
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
|
||||
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v0
|
||||
; VI-NEXT: v_mov_b32_e32 v3, 0x40c00000
|
||||
; VI-NEXT: v_add_f32_e32 v0, s4, v3
|
||||
; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
|
||||
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
|
||||
; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
||||
; VI-NEXT: v_alignbit_b32 v8, v0, v1, 16
|
||||
; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
|
||||
; VI-NEXT: v_add_f32_e32 v1, s4, v3
|
||||
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
|
||||
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
||||
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
; VI-NEXT: s_lshl_b32 s4, s16, 16
|
||||
; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[0:1]
|
||||
; VI-NEXT: v_add_f32_e32 v0, s4, v3
|
||||
; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
|
||||
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
|
||||
; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
|
||||
; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
|
||||
; VI-NEXT: v_add_f32_e32 v1, s4, v3
|
||||
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
|
||||
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
||||
; VI-NEXT: s_lshl_b32 s4, s18, 16
|
||||
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc
|
||||
; VI-NEXT: v_add_f32_e32 v2, s4, v3
|
||||
; VI-NEXT: v_bfe_u32 v5, v2, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v2
|
||||
; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
|
||||
; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
|
||||
; VI-NEXT: v_or_b32_e32 v6, 0x400000, v2
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
|
||||
; VI-NEXT: v_add_f32_e32 v3, s4, v3
|
||||
; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc
|
||||
; VI-NEXT: v_bfe_u32 v5, v3, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v3
|
||||
; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
|
||||
; VI-NEXT: v_or_b32_e32 v6, 0x400000, v3
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
|
||||
; VI-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
|
||||
; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
|
||||
; VI-NEXT: v_lshrrev_b64 v[8:9], 16, v[2:3]
|
||||
; VI-NEXT: v_mov_b32_e32 v1, v4
|
||||
; VI-NEXT: v_mov_b32_e32 v9, 0x7fc07fc0
|
||||
; VI-NEXT: v_lshrrev_b64 v[3:4], 24, v[14:15]
|
||||
; VI-NEXT: v_lshrrev_b64 v[14:15], 24, v[0:1]
|
||||
; VI-NEXT: v_lshrrev_b64 v[11:12], 24, v[8:9]
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v10, 16, v8
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v13, 8, v8
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v15
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v15
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v15
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v14
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v14
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v7, 24, v4
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v4
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v4
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v0
|
||||
; VI-NEXT: s_branch .LBB39_5
|
||||
; VI-NEXT: .LBB39_3:
|
||||
; VI-NEXT: ; implicit-def: $sgpr19
|
||||
; VI-NEXT: ; implicit-def: $sgpr13
|
||||
; VI-NEXT: ; implicit-def: $sgpr14
|
||||
; VI-NEXT: ; implicit-def: $sgpr4
|
||||
; VI-NEXT: ; implicit-def: $sgpr15
|
||||
; VI-NEXT: ; implicit-def: $sgpr13
|
||||
; VI-NEXT: ; implicit-def: $sgpr12
|
||||
; VI-NEXT: ; implicit-def: $sgpr11
|
||||
; VI-NEXT: ; implicit-def: $sgpr10
|
||||
; VI-NEXT: ; implicit-def: $sgpr11
|
||||
; VI-NEXT: ; implicit-def: $sgpr12
|
||||
; VI-NEXT: ; implicit-def: $sgpr15
|
||||
; VI-NEXT: ; implicit-def: $sgpr19
|
||||
; VI-NEXT: ; implicit-def: $sgpr6
|
||||
; VI-NEXT: s_branch .LBB39_2
|
||||
; VI-NEXT: .LBB39_4:
|
||||
; VI-NEXT: v_mov_b32_e32 v14, s16
|
||||
; VI-NEXT: v_mov_b32_e32 v15, s17
|
||||
; VI-NEXT: v_mov_b32_e32 v8, s18
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s19
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s16
|
||||
; VI-NEXT: v_mov_b32_e32 v4, s17
|
||||
; VI-NEXT: v_mov_b32_e32 v10, s19
|
||||
; VI-NEXT: v_mov_b32_e32 v13, s15
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s14
|
||||
; VI-NEXT: v_mov_b32_e32 v5, s15
|
||||
; VI-NEXT: v_mov_b32_e32 v6, s13
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s13
|
||||
; VI-NEXT: v_mov_b32_e32 v7, s12
|
||||
; VI-NEXT: v_mov_b32_e32 v13, s11
|
||||
; VI-NEXT: v_mov_b32_e32 v10, s10
|
||||
; VI-NEXT: v_mov_b32_e32 v6, s11
|
||||
; VI-NEXT: v_mov_b32_e32 v5, s10
|
||||
; VI-NEXT: v_mov_b32_e32 v11, s6
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s4
|
||||
; VI-NEXT: v_mov_b32_e32 v14, s4
|
||||
; VI-NEXT: .LBB39_5: ; %end
|
||||
; VI-NEXT: v_mov_b32_e32 v0, v14
|
||||
; VI-NEXT: v_mov_b32_e32 v4, v15
|
||||
; VI-NEXT: v_mov_b32_e32 v3, v14
|
||||
; VI-NEXT: v_mov_b32_e32 v9, v13
|
||||
; VI-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
@ -11854,33 +11863,32 @@ define inreg <6 x half> @bitcast_v6bf16_to_v6f16_scalar(<6 x bfloat> inreg %a, i
|
||||
; VI-NEXT: s_cbranch_execnz .LBB49_4
|
||||
; VI-NEXT: .LBB49_2: ; %cmp.true
|
||||
; VI-NEXT: s_lshl_b32 s4, s16, 16
|
||||
; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
|
||||
; VI-NEXT: v_add_f32_e32 v1, s4, v0
|
||||
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
|
||||
; VI-NEXT: v_mov_b32_e32 v3, 0x40c00000
|
||||
; VI-NEXT: v_add_f32_e32 v0, s4, v3
|
||||
; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
|
||||
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
|
||||
; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
|
||||
; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
|
||||
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
||||
; VI-NEXT: v_add_f32_e32 v1, s4, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v3, v2, v3, vcc
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
|
||||
; VI-NEXT: v_add_f32_e32 v1, s4, v3
|
||||
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
|
||||
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
||||
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
|
||||
; VI-NEXT: s_lshl_b32 s4, s17, 16
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v1
|
||||
; VI-NEXT: v_add_f32_e32 v1, s4, v0
|
||||
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
|
||||
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
||||
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
|
||||
; VI-NEXT: v_add_f32_e32 v2, s4, v3
|
||||
; VI-NEXT: v_bfe_u32 v4, v2, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2
|
||||
; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
|
||||
; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
|
||||
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc
|
||||
; VI-NEXT: v_add_f32_e32 v2, s4, v0
|
||||
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
|
||||
; VI-NEXT: v_add_f32_e32 v2, s4, v3
|
||||
; VI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
|
||||
; VI-NEXT: v_bfe_u32 v5, v2, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v2
|
||||
; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
|
||||
@ -11889,25 +11897,27 @@ define inreg <6 x half> @bitcast_v6bf16_to_v6f16_scalar(<6 x bfloat> inreg %a, i
|
||||
; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc
|
||||
; VI-NEXT: s_lshl_b32 s4, s18, 16
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2
|
||||
; VI-NEXT: v_add_f32_e32 v2, s4, v0
|
||||
; VI-NEXT: v_add_f32_e32 v2, s4, v3
|
||||
; VI-NEXT: v_bfe_u32 v6, v2, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v2
|
||||
; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
|
||||
; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
|
||||
; VI-NEXT: v_or_b32_e32 v7, 0x400000, v2
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
|
||||
; VI-NEXT: v_add_f32_e32 v0, s4, v0
|
||||
; VI-NEXT: v_add_f32_e32 v3, s4, v3
|
||||
; VI-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc
|
||||
; VI-NEXT: v_bfe_u32 v6, v0, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v0
|
||||
; VI-NEXT: v_bfe_u32 v6, v3, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v3
|
||||
; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
|
||||
; VI-NEXT: v_or_b32_e32 v7, 0x400000, v0
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
||||
; VI-NEXT: v_alignbit_b32 v2, v0, v2, 16
|
||||
; VI-NEXT: v_alignbit_b32 v1, v5, v1, 16
|
||||
; VI-NEXT: v_alignbit_b32 v0, v4, v3, 16
|
||||
; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
|
||||
; VI-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
|
||||
; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3]
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
; VI-NEXT: v_lshrrev_b64 v[3:4], 16, v[4:5]
|
||||
; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
|
||||
; VI-NEXT: v_mov_b32_e32 v1, v3
|
||||
; VI-NEXT: s_setpc_b64 s[30:31]
|
||||
; VI-NEXT: .LBB49_3:
|
||||
; VI-NEXT: s_branch .LBB49_2
|
||||
@ -12814,49 +12824,51 @@ define inreg <6 x i16> @bitcast_v6bf16_to_v6i16_scalar(<6 x bfloat> inreg %a, i3
|
||||
; SI-NEXT: s_cmp_lg_u32 s22, 0
|
||||
; SI-NEXT: v_mul_f32_e64 v11, 1.0, s16
|
||||
; SI-NEXT: v_mul_f32_e64 v10, 1.0, s17
|
||||
; SI-NEXT: v_mul_f32_e64 v7, 1.0, s18
|
||||
; SI-NEXT: v_mul_f32_e64 v1, 1.0, s19
|
||||
; SI-NEXT: v_mul_f32_e64 v9, 1.0, s20
|
||||
; SI-NEXT: v_mul_f32_e64 v8, 1.0, s21
|
||||
; SI-NEXT: v_mul_f32_e64 v2, 1.0, s18
|
||||
; SI-NEXT: v_mul_f32_e64 v8, 1.0, s19
|
||||
; SI-NEXT: v_mul_f32_e64 v4, 1.0, s20
|
||||
; SI-NEXT: v_mul_f32_e64 v9, 1.0, s21
|
||||
; SI-NEXT: s_cbranch_scc0 .LBB53_4
|
||||
; SI-NEXT: ; %bb.1: ; %cmp.false
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v11
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v10
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v7
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v9
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v8
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v8
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v9
|
||||
; SI-NEXT: s_cbranch_execnz .LBB53_3
|
||||
; SI-NEXT: .LBB53_2: ; %cmp.true
|
||||
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v10
|
||||
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10
|
||||
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v11
|
||||
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v2
|
||||
; SI-NEXT: v_add_f32_e32 v10, 0x40c00000, v1
|
||||
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v4
|
||||
; SI-NEXT: v_alignbit_b32 v0, v2, v0, 16
|
||||
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v9
|
||||
; SI-NEXT: v_add_f32_e32 v9, 0x40c00000, v2
|
||||
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v8
|
||||
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
|
||||
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v2
|
||||
; SI-NEXT: v_and_b32_e32 v2, 0xffff0000, v7
|
||||
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
|
||||
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
|
||||
; SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v10
|
||||
; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
|
||||
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
|
||||
; SI-NEXT: v_lshr_b64 v[6:7], v[1:2], 16
|
||||
; SI-NEXT: v_alignbit_b32 v4, v5, v9, 16
|
||||
; SI-NEXT: v_add_f32_e32 v4, 0x40c00000, v1
|
||||
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v9
|
||||
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1
|
||||
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
|
||||
; SI-NEXT: v_add_f32_e32 v2, 0x40c00000, v1
|
||||
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v8
|
||||
; SI-NEXT: v_add_f32_e32 v1, 0x40c00000, v1
|
||||
; SI-NEXT: v_lshr_b64 v[6:7], v[4:5], 16
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
|
||||
; SI-NEXT: v_lshr_b64 v[7:8], v[2:3], 16
|
||||
; SI-NEXT: v_and_b32_e32 v1, 0xffff0000, v10
|
||||
; SI-NEXT: v_mov_b32_e32 v2, v7
|
||||
; SI-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
|
||||
; SI-NEXT: .LBB53_3: ; %end
|
||||
; SI-NEXT: v_mov_b32_e32 v1, v6
|
||||
; SI-NEXT: v_mov_b32_e32 v2, v7
|
||||
; SI-NEXT: v_mov_b32_e32 v4, v6
|
||||
; SI-NEXT: s_setpc_b64 s[30:31]
|
||||
; SI-NEXT: .LBB53_4:
|
||||
; SI-NEXT: ; implicit-def: $vgpr0
|
||||
; SI-NEXT: ; implicit-def: $vgpr6
|
||||
; SI-NEXT: ; implicit-def: $vgpr2
|
||||
; SI-NEXT: ; implicit-def: $vgpr1
|
||||
; SI-NEXT: ; implicit-def: $vgpr7
|
||||
; SI-NEXT: ; implicit-def: $vgpr3
|
||||
; SI-NEXT: ; implicit-def: $vgpr4
|
||||
; SI-NEXT: ; implicit-def: $vgpr6
|
||||
; SI-NEXT: ; implicit-def: $vgpr5
|
||||
; SI-NEXT: s_branch .LBB53_2
|
||||
;
|
||||
@ -12869,33 +12881,32 @@ define inreg <6 x i16> @bitcast_v6bf16_to_v6i16_scalar(<6 x bfloat> inreg %a, i3
|
||||
; VI-NEXT: s_cbranch_execnz .LBB53_4
|
||||
; VI-NEXT: .LBB53_2: ; %cmp.true
|
||||
; VI-NEXT: s_lshl_b32 s4, s16, 16
|
||||
; VI-NEXT: v_mov_b32_e32 v0, 0x40c00000
|
||||
; VI-NEXT: v_add_f32_e32 v1, s4, v0
|
||||
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
|
||||
; VI-NEXT: v_mov_b32_e32 v3, 0x40c00000
|
||||
; VI-NEXT: v_add_f32_e32 v0, s4, v3
|
||||
; VI-NEXT: v_bfe_u32 v1, v0, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v0
|
||||
; VI-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
|
||||
; VI-NEXT: v_or_b32_e32 v2, 0x400000, v0
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
|
||||
; VI-NEXT: s_and_b32 s4, s16, 0xffff0000
|
||||
; VI-NEXT: v_or_b32_e32 v3, 0x400000, v1
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
||||
; VI-NEXT: v_add_f32_e32 v1, s4, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v3, v2, v3, vcc
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
|
||||
; VI-NEXT: v_add_f32_e32 v1, s4, v3
|
||||
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
|
||||
; VI-NEXT: v_or_b32_e32 v4, 0x400000, v1
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
||||
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
|
||||
; VI-NEXT: s_lshl_b32 s4, s17, 16
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v1
|
||||
; VI-NEXT: v_add_f32_e32 v1, s4, v0
|
||||
; VI-NEXT: v_bfe_u32 v2, v1, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v1
|
||||
; VI-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
|
||||
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v1
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
||||
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
|
||||
; VI-NEXT: v_add_f32_e32 v2, s4, v3
|
||||
; VI-NEXT: v_bfe_u32 v4, v2, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v4, vcc, v4, v2
|
||||
; VI-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
|
||||
; VI-NEXT: s_and_b32 s4, s17, 0xffff0000
|
||||
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc
|
||||
; VI-NEXT: v_add_f32_e32 v2, s4, v0
|
||||
; VI-NEXT: v_or_b32_e32 v5, 0x400000, v2
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
|
||||
; VI-NEXT: v_add_f32_e32 v2, s4, v3
|
||||
; VI-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
|
||||
; VI-NEXT: v_bfe_u32 v5, v2, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v2
|
||||
; VI-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
|
||||
@ -12904,25 +12915,27 @@ define inreg <6 x i16> @bitcast_v6bf16_to_v6i16_scalar(<6 x bfloat> inreg %a, i3
|
||||
; VI-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc
|
||||
; VI-NEXT: s_lshl_b32 s4, s18, 16
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2
|
||||
; VI-NEXT: v_add_f32_e32 v2, s4, v0
|
||||
; VI-NEXT: v_add_f32_e32 v2, s4, v3
|
||||
; VI-NEXT: v_bfe_u32 v6, v2, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v2
|
||||
; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
|
||||
; VI-NEXT: s_and_b32 s4, s18, 0xffff0000
|
||||
; VI-NEXT: v_or_b32_e32 v7, 0x400000, v2
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
|
||||
; VI-NEXT: v_add_f32_e32 v0, s4, v0
|
||||
; VI-NEXT: v_add_f32_e32 v3, s4, v3
|
||||
; VI-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc
|
||||
; VI-NEXT: v_bfe_u32 v6, v0, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v0
|
||||
; VI-NEXT: v_bfe_u32 v6, v3, 16, 1
|
||||
; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v3
|
||||
; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
|
||||
; VI-NEXT: v_or_b32_e32 v7, 0x400000, v0
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
|
||||
; VI-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
||||
; VI-NEXT: v_alignbit_b32 v2, v0, v2, 16
|
||||
; VI-NEXT: v_alignbit_b32 v1, v5, v1, 16
|
||||
; VI-NEXT: v_alignbit_b32 v0, v4, v3, 16
|
||||
; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
|
||||
; VI-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
|
||||
; VI-NEXT: v_lshrrev_b64 v[2:3], 16, v[2:3]
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
; VI-NEXT: v_lshrrev_b64 v[3:4], 16, v[4:5]
|
||||
; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
|
||||
; VI-NEXT: v_mov_b32_e32 v1, v3
|
||||
; VI-NEXT: s_setpc_b64 s[30:31]
|
||||
; VI-NEXT: .LBB53_3:
|
||||
; VI-NEXT: s_branch .LBB53_2
|
||||
|
||||
@ -27,24 +27,26 @@ define amdgpu_kernel void @any_extend_vector_inreg_v16i8_to_v4i32(ptr addrspace(
|
||||
; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:1
|
||||
; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0
|
||||
; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:3
|
||||
; GFX6-NEXT: s_lshr_b32 s8, s9, 16
|
||||
; GFX6-NEXT: s_waitcnt expcnt(0)
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s6
|
||||
; GFX6-NEXT: s_lshr_b32 s7, s9, 16
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: s_lshl_b64 s[6:7], s[4:5], 8
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s11
|
||||
; GFX6-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:9
|
||||
; GFX6-NEXT: s_lshl_b64 s[8:9], s[4:5], 8
|
||||
; GFX6-NEXT: s_waitcnt expcnt(0)
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s5
|
||||
; GFX6-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:2
|
||||
; GFX6-NEXT: v_alignbit_b32 v0, s8, v0, 16
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s11
|
||||
; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:9
|
||||
; GFX6-NEXT: s_waitcnt expcnt(0)
|
||||
; GFX6-NEXT: v_mov_b32_e32 v1, s7
|
||||
; GFX6-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:12
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s5
|
||||
; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:2
|
||||
; GFX6-NEXT: s_lshr_b64 s[4:5], s[6:7], 16
|
||||
; GFX6-NEXT: s_waitcnt expcnt(0)
|
||||
; GFX6-NEXT: v_lshrrev_b32_e32 v1, 8, v0
|
||||
; GFX6-NEXT: v_lshrrev_b32_e32 v0, 24, v0
|
||||
; GFX6-NEXT: buffer_store_byte v1, off, s[0:3], 0 offset:5
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s9
|
||||
; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:12
|
||||
; GFX6-NEXT: s_lshr_b32 s5, s4, 8
|
||||
; GFX6-NEXT: s_lshr_b32 s4, s4, 24
|
||||
; GFX6-NEXT: s_waitcnt expcnt(0)
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s5
|
||||
; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:5
|
||||
; GFX6-NEXT: s_waitcnt expcnt(0)
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX6-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:7
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
|
||||
@ -46065,18 +46065,18 @@ define <32 x bfloat> @v_select_v32bf16(i1 %cond, <32 x bfloat> %a, <32 x bfloat>
|
||||
define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat> inreg %b, i32 %c) {
|
||||
; GCN-LABEL: s_select_v3bf16:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s1
|
||||
; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s0
|
||||
; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s4
|
||||
; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s3
|
||||
; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s1
|
||||
; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s0
|
||||
; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s4
|
||||
; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s3
|
||||
; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s2
|
||||
; GCN-NEXT: v_mul_f32_e64 v6, 1.0, s5
|
||||
; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
|
||||
; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
|
||||
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
|
||||
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
|
||||
; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
|
||||
; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16
|
||||
; GCN-NEXT: v_alignbit_b32 v2, v3, v4, 16
|
||||
; GCN-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
|
||||
; GCN-NEXT: v_lshr_b64 v[2:3], v[3:4], 16
|
||||
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
|
||||
; GCN-NEXT: v_cndmask_b32_e32 v0, v6, v5, vcc
|
||||
; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
|
||||
@ -46087,13 +46087,13 @@ define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat>
|
||||
; GFX7-LABEL: s_select_v3bf16:
|
||||
; GFX7: ; %bb.0:
|
||||
; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s1
|
||||
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s0
|
||||
; GFX7-NEXT: v_alignbit_b32 v1, v1, v2, 16
|
||||
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1
|
||||
; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s0
|
||||
; GFX7-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
|
||||
; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s4
|
||||
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
|
||||
; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s3
|
||||
; GFX7-NEXT: v_alignbit_b32 v2, v2, v3, 16
|
||||
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2
|
||||
; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s3
|
||||
; GFX7-NEXT: v_lshr_b64 v[2:3], v[2:3], 16
|
||||
; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s2
|
||||
; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s5
|
||||
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
|
||||
@ -46203,22 +46203,22 @@ define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat>
|
||||
define amdgpu_ps <2 x i32> @s_select_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat> inreg %b, i32 %c) {
|
||||
; GCN-LABEL: s_select_v4bf16:
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s1
|
||||
; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s0
|
||||
; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s5
|
||||
; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s4
|
||||
; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s3
|
||||
; GCN-NEXT: v_mul_f32_e64 v6, 1.0, s2
|
||||
; GCN-NEXT: v_mul_f32_e64 v7, 1.0, s7
|
||||
; GCN-NEXT: v_mul_f32_e64 v8, 1.0, s6
|
||||
; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
|
||||
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
|
||||
; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7
|
||||
; GCN-NEXT: v_alignbit_b32 v1, v1, v2, 16
|
||||
; GCN-NEXT: v_alignbit_b32 v2, v3, v4, 16
|
||||
; GCN-NEXT: v_alignbit_b32 v3, v5, v6, 16
|
||||
; GCN-NEXT: v_alignbit_b32 v4, v7, v8, 16
|
||||
; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s1
|
||||
; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s0
|
||||
; GCN-NEXT: v_mul_f32_e64 v4, 1.0, s5
|
||||
; GCN-NEXT: v_mul_f32_e64 v3, 1.0, s4
|
||||
; GCN-NEXT: v_mul_f32_e64 v6, 1.0, s3
|
||||
; GCN-NEXT: v_mul_f32_e64 v5, 1.0, s2
|
||||
; GCN-NEXT: v_mul_f32_e64 v8, 1.0, s7
|
||||
; GCN-NEXT: v_mul_f32_e64 v7, 1.0, s6
|
||||
; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
|
||||
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
|
||||
; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6
|
||||
; GCN-NEXT: v_lshrrev_b32_e32 v8, 16, v8
|
||||
; GCN-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
|
||||
; GCN-NEXT: v_lshr_b64 v[2:3], v[3:4], 16
|
||||
; GCN-NEXT: v_lshr_b64 v[3:4], v[5:6], 16
|
||||
; GCN-NEXT: v_lshr_b64 v[4:5], v[7:8], 16
|
||||
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
|
||||
; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
|
||||
; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
|
||||
@ -46229,21 +46229,21 @@ define amdgpu_ps <2 x i32> @s_select_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat>
|
||||
; GFX7-LABEL: s_select_v4bf16:
|
||||
; GFX7: ; %bb.0:
|
||||
; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s1
|
||||
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s0
|
||||
; GFX7-NEXT: v_alignbit_b32 v1, v1, v2, 16
|
||||
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v1
|
||||
; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s0
|
||||
; GFX7-NEXT: v_lshr_b64 v[1:2], v[1:2], 16
|
||||
; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s5
|
||||
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
|
||||
; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s4
|
||||
; GFX7-NEXT: v_alignbit_b32 v2, v2, v3, 16
|
||||
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2
|
||||
; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s4
|
||||
; GFX7-NEXT: v_lshr_b64 v[2:3], v[2:3], 16
|
||||
; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s3
|
||||
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
|
||||
; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s2
|
||||
; GFX7-NEXT: v_alignbit_b32 v3, v3, v4, 16
|
||||
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v3
|
||||
; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s2
|
||||
; GFX7-NEXT: v_lshr_b64 v[3:4], v[3:4], 16
|
||||
; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s7
|
||||
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
|
||||
; GFX7-NEXT: v_mul_f32_e64 v5, 1.0, s6
|
||||
; GFX7-NEXT: v_alignbit_b32 v4, v4, v5, 16
|
||||
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v4
|
||||
; GFX7-NEXT: v_mul_f32_e64 v4, 1.0, s6
|
||||
; GFX7-NEXT: v_lshr_b64 v[4:5], v[4:5], 16
|
||||
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
|
||||
; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
|
||||
; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
|
||||
|
||||
@ -186,10 +186,12 @@ define amdgpu_kernel void @build_vector_v2i16_trunc (ptr addrspace(1) %out, i32
|
||||
; GFX6: ; %bb.0:
|
||||
; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb
|
||||
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
||||
; GFX6-NEXT: s_mov_b32 s7, 5
|
||||
; GFX6-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX6-NEXT: s_mov_b32 s2, -1
|
||||
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NEXT: v_alignbit_b32 v0, 5, s6, 16
|
||||
; GFX6-NEXT: s_lshr_b64 s[4:5], s[6:7], 16
|
||||
; GFX6-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
|
||||
; GFX6-NEXT: s_endpgm
|
||||
;
|
||||
|
||||
@ -487,13 +487,13 @@ define amdgpu_kernel void @uniform_vec_i16_HH(ptr addrspace(1) %out, i32 %a, i32
|
||||
; GCN: ; %bb.0:
|
||||
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; GCN-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GCN-NEXT: s_mov_b32 s6, -1
|
||||
; GCN-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NEXT: s_lshr_b32 s3, s3, 16
|
||||
; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], 16
|
||||
; GCN-NEXT: s_mov_b32 s6, -1
|
||||
; GCN-NEXT: s_mov_b32 s4, s0
|
||||
; GCN-NEXT: s_mov_b32 s5, s1
|
||||
; GCN-NEXT: s_lshr_b32 s0, s3, 16
|
||||
; GCN-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GCN-NEXT: v_alignbit_b32 v0, s0, v0, 16
|
||||
; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; GCN-NEXT: s_endpgm
|
||||
;
|
||||
|
||||
@ -5023,20 +5023,20 @@ define amdgpu_ps i32 @s_copysign_out_v2bf16_mag_v2f32_sign_v2bf16(<2 x float> in
|
||||
; GFX8-NEXT: s_add_i32 s6, s4, 0x7fff
|
||||
; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], s0, s0
|
||||
; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec
|
||||
; GFX8-NEXT: s_cselect_b32 s3, s3, s6
|
||||
; GFX8-NEXT: s_bfe_u32 s0, s1, 0x10010
|
||||
; GFX8-NEXT: s_add_i32 s0, s0, s1
|
||||
; GFX8-NEXT: s_or_b32 s4, s1, 0x400000
|
||||
; GFX8-NEXT: s_add_i32 s5, s0, 0x7fff
|
||||
; GFX8-NEXT: v_cmp_u_f32_e64 s[0:1], s1, s1
|
||||
; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], exec
|
||||
; GFX8-NEXT: s_cselect_b32 s0, s4, s5
|
||||
; GFX8-NEXT: s_lshr_b32 s0, s0, 16
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s3
|
||||
; GFX8-NEXT: v_alignbit_b32 v0, s0, v0, 16
|
||||
; GFX8-NEXT: s_mov_b32 s0, 0x7fff7fff
|
||||
; GFX8-NEXT: s_cselect_b32 s0, s3, s6
|
||||
; GFX8-NEXT: s_bfe_u32 s4, s1, 0x10010
|
||||
; GFX8-NEXT: s_add_i32 s4, s4, s1
|
||||
; GFX8-NEXT: s_or_b32 s3, s1, 0x400000
|
||||
; GFX8-NEXT: s_add_i32 s6, s4, 0x7fff
|
||||
; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], s1, s1
|
||||
; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec
|
||||
; GFX8-NEXT: s_cselect_b32 s1, s3, s6
|
||||
; GFX8-NEXT: s_lshr_b32 s1, s1, 16
|
||||
; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], 16
|
||||
; GFX8-NEXT: s_mov_b32 s1, 0x7fff7fff
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX8-NEXT: v_bfi_b32 v0, s0, v0, v1
|
||||
; GFX8-NEXT: v_bfi_b32 v0, s1, v0, v1
|
||||
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
|
||||
; GFX8-NEXT: ; return to shader part epilog
|
||||
;
|
||||
@ -5185,29 +5185,29 @@ define amdgpu_ps i32 @s_copysign_out_v2bf16_mag_v2f64_sign_v2bf16(<2 x double> i
|
||||
; GFX8-NEXT: s_addk_i32 s8, 0x7fff
|
||||
; GFX8-NEXT: s_bitset1_b32 s5, 22
|
||||
; GFX8-NEXT: s_and_b64 s[6:7], s[6:7], exec
|
||||
; GFX8-NEXT: s_cselect_b32 s5, s5, s8
|
||||
; GFX8-NEXT: v_readfirstlane_b32 s8, v3
|
||||
; GFX8-NEXT: s_bitcmp1_b32 s8, 0
|
||||
; GFX8-NEXT: s_cselect_b64 s[6:7], -1, 0
|
||||
; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
|
||||
; GFX8-NEXT: s_and_b64 s[6:7], s[12:13], exec
|
||||
; GFX8-NEXT: s_cselect_b32 s6, s5, s8
|
||||
; GFX8-NEXT: v_readfirstlane_b32 s5, v3
|
||||
; GFX8-NEXT: s_bitcmp1_b32 s5, 0
|
||||
; GFX8-NEXT: s_cselect_b64 s[8:9], -1, 0
|
||||
; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[8:9]
|
||||
; GFX8-NEXT: s_and_b64 s[8:9], s[12:13], exec
|
||||
; GFX8-NEXT: v_cmp_u_f64_e64 s[2:3], s[2:3], s[2:3]
|
||||
; GFX8-NEXT: s_cselect_b32 s6, 1, -1
|
||||
; GFX8-NEXT: s_add_i32 s6, s8, s6
|
||||
; GFX8-NEXT: s_cselect_b32 s7, 1, -1
|
||||
; GFX8-NEXT: s_add_i32 s7, s5, s7
|
||||
; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], exec
|
||||
; GFX8-NEXT: s_cselect_b32 s0, s8, s6
|
||||
; GFX8-NEXT: s_cselect_b32 s0, s5, s7
|
||||
; GFX8-NEXT: s_bfe_u32 s1, s0, 0x10010
|
||||
; GFX8-NEXT: s_add_i32 s1, s1, s0
|
||||
; GFX8-NEXT: s_add_i32 s6, s1, 0x7fff
|
||||
; GFX8-NEXT: s_add_i32 s5, s1, 0x7fff
|
||||
; GFX8-NEXT: s_or_b32 s7, s0, 0x400000
|
||||
; GFX8-NEXT: s_and_b64 s[0:1], s[2:3], exec
|
||||
; GFX8-NEXT: s_cselect_b32 s0, s7, s6
|
||||
; GFX8-NEXT: s_lshr_b32 s0, s0, 16
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s5
|
||||
; GFX8-NEXT: v_alignbit_b32 v0, s0, v0, 16
|
||||
; GFX8-NEXT: s_mov_b32 s0, 0x7fff7fff
|
||||
; GFX8-NEXT: s_cselect_b32 s0, s7, s5
|
||||
; GFX8-NEXT: s_lshr_b32 s7, s0, 16
|
||||
; GFX8-NEXT: s_lshr_b64 s[0:1], s[6:7], 16
|
||||
; GFX8-NEXT: s_mov_b32 s1, 0x7fff7fff
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GFX8-NEXT: v_bfi_b32 v0, s0, v0, v1
|
||||
; GFX8-NEXT: v_bfi_b32 v0, s1, v0, v1
|
||||
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
|
||||
; GFX8-NEXT: ; return to shader part epilog
|
||||
;
|
||||
@ -5421,19 +5421,19 @@ define amdgpu_ps i32 @s_copysign_out_v2bf16_mag_v2bf16_sign_v2f32(<2 x bfloat> i
|
||||
; GFX8-NEXT: s_addk_i32 s3, 0x7fff
|
||||
; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], s1, s1
|
||||
; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec
|
||||
; GFX8-NEXT: s_cselect_b32 s1, s1, s3
|
||||
; GFX8-NEXT: s_bfe_u32 s3, s2, 0x10010
|
||||
; GFX8-NEXT: s_add_i32 s3, s3, s2
|
||||
; GFX8-NEXT: s_addk_i32 s3, 0x7fff
|
||||
; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], s2, s2
|
||||
; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec
|
||||
; GFX8-NEXT: s_cselect_b32 s2, s2, s3
|
||||
; GFX8-NEXT: s_lshr_b32 s2, s2, 16
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX8-NEXT: v_alignbit_b32 v0, s2, v0, 16
|
||||
; GFX8-NEXT: s_cselect_b32 s4, s1, s3
|
||||
; GFX8-NEXT: s_bfe_u32 s1, s2, 0x10010
|
||||
; GFX8-NEXT: s_add_i32 s1, s1, s2
|
||||
; GFX8-NEXT: s_addk_i32 s1, 0x7fff
|
||||
; GFX8-NEXT: v_cmp_u_f32_e64 s[6:7], s2, s2
|
||||
; GFX8-NEXT: s_and_b64 s[6:7], s[6:7], exec
|
||||
; GFX8-NEXT: s_cselect_b32 s1, s2, s1
|
||||
; GFX8-NEXT: s_lshr_b32 s5, s1, 16
|
||||
; GFX8-NEXT: s_lshr_b64 s[2:3], s[4:5], 16
|
||||
; GFX8-NEXT: s_mov_b32 s1, 0x7fff7fff
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX8-NEXT: v_bfi_b32 v0, s1, v1, v0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX8-NEXT: v_bfi_b32 v0, s1, v0, v1
|
||||
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
|
||||
; GFX8-NEXT: ; return to shader part epilog
|
||||
;
|
||||
|
||||
@ -409,7 +409,7 @@ define amdgpu_kernel void @s_fneg_fabs_v2bf16_non_bc_src(ptr addrspace(1) %out,
|
||||
; CI-NEXT: v_add_f32_e64 v1, s2, 2.0
|
||||
; CI-NEXT: v_add_f32_e64 v0, s3, 1.0
|
||||
; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
; CI-NEXT: v_alignbit_b32 v0, v1, v0, 16
|
||||
; CI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
|
||||
; CI-NEXT: v_or_b32_e32 v2, 0x80008000, v0
|
||||
; CI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; CI-NEXT: v_mov_b32_e32 v1, s1
|
||||
@ -441,7 +441,7 @@ define amdgpu_kernel void @s_fneg_fabs_v2bf16_non_bc_src(ptr addrspace(1) %out,
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
||||
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
; VI-NEXT: v_alignbit_b32 v0, v1, v0, 16
|
||||
; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
|
||||
; VI-NEXT: v_or_b32_e32 v2, 0x80008000, v0
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
@ -709,16 +709,16 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2bf16(ptr addrspace(1) %out, <2
|
||||
; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
|
||||
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
||||
; CI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CI-NEXT: s_and_b32 s3, s2, 0x7fff0000
|
||||
; CI-NEXT: s_and_b32 s2, s2, 0x7fff
|
||||
; CI-NEXT: s_lshl_b32 s2, s2, 16
|
||||
; CI-NEXT: v_mul_f32_e64 v0, s3, -4.0
|
||||
; CI-NEXT: v_mul_f32_e64 v1, s2, -4.0
|
||||
; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
||||
; CI-NEXT: v_alignbit_b32 v2, v0, v1, 16
|
||||
; CI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; CI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; CI-NEXT: flat_store_dword v[0:1], v2
|
||||
; CI-NEXT: s_and_b32 s3, s2, 0x7fff
|
||||
; CI-NEXT: s_and_b32 s2, s2, 0x7fff0000
|
||||
; CI-NEXT: v_mul_f32_e64 v0, s2, -4.0
|
||||
; CI-NEXT: s_lshl_b32 s2, s3, 16
|
||||
; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
|
||||
; CI-NEXT: v_mul_f32_e64 v0, s2, -4.0
|
||||
; CI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
|
||||
; CI-NEXT: v_mov_b32_e32 v2, s1
|
||||
; CI-NEXT: v_mov_b32_e32 v1, s0
|
||||
; CI-NEXT: flat_store_dword v[1:2], v0
|
||||
; CI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: fold_user_fneg_fabs_v2bf16:
|
||||
@ -749,10 +749,10 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2bf16(ptr addrspace(1) %out, <2
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
|
||||
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
; VI-NEXT: v_alignbit_b32 v2, v1, v0, 16
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: flat_store_dword v[0:1], v2
|
||||
; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s0
|
||||
; VI-NEXT: flat_store_dword v[1:2], v0
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: fold_user_fneg_fabs_v2bf16:
|
||||
@ -956,17 +956,17 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2bf16(ptr addrspa
|
||||
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
||||
; CI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; CI-NEXT: v_mov_b32_e32 v2, s2
|
||||
; CI-NEXT: s_and_b32 s1, s4, 0x7fff
|
||||
; CI-NEXT: s_and_b32 s2, s4, 0x7fff0000
|
||||
; CI-NEXT: v_mul_f32_e64 v4, s2, -4.0
|
||||
; CI-NEXT: s_lshl_b32 s1, s1, 16
|
||||
; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v4
|
||||
; CI-NEXT: v_mul_f32_e64 v4, s1, -4.0
|
||||
; CI-NEXT: s_and_b32 s1, s4, 0x7fff0000
|
||||
; CI-NEXT: v_mul_f32_e64 v5, s1, -4.0
|
||||
; CI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; CI-NEXT: s_and_b32 s0, s4, 0x7fff7fff
|
||||
; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
|
||||
; CI-NEXT: v_alignbit_b32 v4, v5, v4, 16
|
||||
; CI-NEXT: v_lshr_b64 v[4:5], v[4:5], 16
|
||||
; CI-NEXT: v_mov_b32_e32 v5, s0
|
||||
; CI-NEXT: v_mov_b32_e32 v2, s2
|
||||
; CI-NEXT: v_mov_b32_e32 v3, s3
|
||||
; CI-NEXT: flat_store_dword v[0:1], v5
|
||||
; CI-NEXT: flat_store_dword v[2:3], v4
|
||||
@ -1000,10 +1000,10 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2bf16(ptr addrspa
|
||||
; VI-NEXT: v_or_b32_e32 v7, 0x400000, v5
|
||||
; VI-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
|
||||
; VI-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: s_and_b32 s0, s4, 0x7fff7fff
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
|
||||
; VI-NEXT: v_alignbit_b32 v4, v5, v4, 16
|
||||
; VI-NEXT: v_lshrrev_b64 v[4:5], 16, v[4:5]
|
||||
; VI-NEXT: v_mov_b32_e32 v5, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s2
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s3
|
||||
|
||||
@ -627,18 +627,18 @@ define amdgpu_kernel void @v_fneg_fold_v2bf16(ptr addrspace(1) %out, ptr addrspa
|
||||
; CI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CI-NEXT: v_mov_b32_e32 v0, s2
|
||||
; CI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; CI-NEXT: flat_load_dword v2, v[0:1]
|
||||
; CI-NEXT: flat_load_dword v1, v[0:1]
|
||||
; CI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; CI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; CI-NEXT: s_waitcnt vmcnt(0)
|
||||
; CI-NEXT: v_xor_b32_e32 v3, 0x8000, v2
|
||||
; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v2
|
||||
; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
|
||||
; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
|
||||
; CI-NEXT: v_mul_f32_e64 v2, -v2, v2
|
||||
; CI-NEXT: v_mul_f32_e32 v3, v3, v4
|
||||
; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
|
||||
; CI-NEXT: v_alignbit_b32 v2, v2, v3, 16
|
||||
; CI-NEXT: v_xor_b32_e32 v2, 0x8000, v1
|
||||
; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v1
|
||||
; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
|
||||
; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
|
||||
; CI-NEXT: v_mul_f32_e64 v4, -v1, v1
|
||||
; CI-NEXT: v_mul_f32_e32 v1, v2, v3
|
||||
; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v4
|
||||
; CI-NEXT: v_lshr_b64 v[2:3], v[1:2], 16
|
||||
; CI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; CI-NEXT: flat_store_dword v[0:1], v2
|
||||
; CI-NEXT: s_endpgm
|
||||
;
|
||||
@ -648,34 +648,34 @@ define amdgpu_kernel void @v_fneg_fold_v2bf16(ptr addrspace(1) %out, ptr addrspa
|
||||
; GFX8-NEXT: s_add_i32 s12, s12, s17
|
||||
; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
|
||||
; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
||||
; GFX8-NEXT: v_mov_b32_e32 v3, 0x8000
|
||||
; GFX8-NEXT: v_mov_b32_e32 v2, 0x8000
|
||||
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX8-NEXT: flat_load_dword v2, v[0:1]
|
||||
; GFX8-NEXT: flat_load_dword v1, v[0:1]
|
||||
; GFX8-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX8-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
|
||||
; GFX8-NEXT: v_xor_b32_sdwa v5, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
|
||||
; GFX8-NEXT: v_xor_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_mul_f32_e32 v3, v5, v4
|
||||
; GFX8-NEXT: v_mul_f32_e32 v2, v2, v6
|
||||
; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1
|
||||
; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1
|
||||
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3
|
||||
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2
|
||||
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
|
||||
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
|
||||
; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3
|
||||
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
|
||||
; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2
|
||||
; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
|
||||
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v1
|
||||
; GFX8-NEXT: v_xor_b32_sdwa v4, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
|
||||
; GFX8-NEXT: v_xor_b32_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; GFX8-NEXT: v_mul_f32_e32 v2, v4, v3
|
||||
; GFX8-NEXT: v_mul_f32_e32 v3, v1, v5
|
||||
; GFX8-NEXT: v_bfe_u32 v1, v2, 16, 1
|
||||
; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
|
||||
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2
|
||||
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
|
||||
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
|
||||
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
|
||||
; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
|
||||
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
|
||||
; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc
|
||||
; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
|
||||
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
|
||||
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
|
||||
; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc
|
||||
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
|
||||
; GFX8-NEXT: v_alignbit_b32 v2, v2, v3, 16
|
||||
; GFX8-NEXT: v_lshrrev_b64 v[2:3], 16, v[1:2]
|
||||
; GFX8-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX8-NEXT: flat_store_dword v[0:1], v2
|
||||
; GFX8-NEXT: s_endpgm
|
||||
;
|
||||
|
||||
@ -18,12 +18,15 @@ define amdgpu_kernel void @fshl_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z
|
||||
; SI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s1
|
||||
; SI-NEXT: s_lshr_b32 s1, s0, 1
|
||||
; SI-NEXT: v_alignbit_b32 v0, s0, v0, 1
|
||||
; SI-NEXT: s_not_b32 s0, s2
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s0
|
||||
; SI-NEXT: v_alignbit_b32 v0, s1, v0, v1
|
||||
; SI-NEXT: s_mov_b32 s8, s1
|
||||
; SI-NEXT: s_mov_b32 s9, s0
|
||||
; SI-NEXT: s_lshr_b32 s3, s0, 1
|
||||
; SI-NEXT: s_lshr_b64 s[0:1], s[8:9], 1
|
||||
; SI-NEXT: s_not_b32 s2, s2
|
||||
; SI-NEXT: s_mov_b32 s1, s3
|
||||
; SI-NEXT: s_and_b32 s2, s2, 31
|
||||
; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
@ -32,14 +35,17 @@ define amdgpu_kernel void @fshl_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z
|
||||
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
|
||||
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s1
|
||||
; VI-NEXT: s_mov_b32 s6, s1
|
||||
; VI-NEXT: s_mov_b32 s7, s0
|
||||
; VI-NEXT: s_lshr_b32 s3, s0, 1
|
||||
; VI-NEXT: s_not_b32 s2, s2
|
||||
; VI-NEXT: s_lshr_b32 s1, s0, 1
|
||||
; VI-NEXT: v_alignbit_b32 v0, s0, v0, 1
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s2
|
||||
; VI-NEXT: v_alignbit_b32 v2, s1, v0, v1
|
||||
; VI-NEXT: s_lshr_b64 s[0:1], s[6:7], 1
|
||||
; VI-NEXT: s_mov_b32 s1, s3
|
||||
; VI-NEXT: s_and_b32 s2, s2, 31
|
||||
; VI-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s4
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s5
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s0
|
||||
; VI-NEXT: flat_store_dword v[0:1], v2
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
@ -49,12 +55,15 @@ define amdgpu_kernel void @fshl_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z
|
||||
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX9-NEXT: s_mov_b32 s4, s1
|
||||
; GFX9-NEXT: s_mov_b32 s5, s0
|
||||
; GFX9-NEXT: s_lshr_b32 s3, s0, 1
|
||||
; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], 1
|
||||
; GFX9-NEXT: s_not_b32 s2, s2
|
||||
; GFX9-NEXT: s_lshr_b32 s1, s0, 1
|
||||
; GFX9-NEXT: v_alignbit_b32 v1, s0, v1, 1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX9-NEXT: v_alignbit_b32 v1, s1, v1, v2
|
||||
; GFX9-NEXT: s_mov_b32 s1, s3
|
||||
; GFX9-NEXT: s_and_b32 s2, s2, 31
|
||||
; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
@ -77,13 +86,18 @@ define amdgpu_kernel void @fshl_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z
|
||||
; GFX10-NEXT: s_clause 0x1
|
||||
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
|
||||
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: v_alignbit_b32 v0, s0, s1, 1
|
||||
; GFX10-NEXT: s_lshr_b32 s0, s0, 1
|
||||
; GFX10-NEXT: s_not_b32 s1, s2
|
||||
; GFX10-NEXT: v_alignbit_b32 v0, s0, v0, s1
|
||||
; GFX10-NEXT: global_store_dword v1, v0, s[6:7]
|
||||
; GFX10-NEXT: s_mov_b32 s4, s1
|
||||
; GFX10-NEXT: s_mov_b32 s5, s0
|
||||
; GFX10-NEXT: s_lshr_b32 s3, s0, 1
|
||||
; GFX10-NEXT: s_not_b32 s2, s2
|
||||
; GFX10-NEXT: s_lshr_b64 s[0:1], s[4:5], 1
|
||||
; GFX10-NEXT: s_mov_b32 s1, s3
|
||||
; GFX10-NEXT: s_and_b32 s2, s2, 31
|
||||
; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX10-NEXT: global_store_dword v0, v1, s[6:7]
|
||||
; GFX10-NEXT: s_endpgm
|
||||
;
|
||||
; GFX11-LABEL: fshl_i32:
|
||||
@ -91,14 +105,18 @@ define amdgpu_kernel void @fshl_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z
|
||||
; GFX11-NEXT: s_clause 0x1
|
||||
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
|
||||
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
|
||||
; GFX11-NEXT: v_mov_b32_e32 v1, 0
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: v_alignbit_b32 v0, s0, s1, 1
|
||||
; GFX11-NEXT: s_lshr_b32 s0, s0, 1
|
||||
; GFX11-NEXT: s_not_b32 s1, s2
|
||||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: v_alignbit_b32 v0, s0, v0, s1
|
||||
; GFX11-NEXT: global_store_b32 v1, v0, s[4:5]
|
||||
; GFX11-NEXT: s_mov_b32 s6, s1
|
||||
; GFX11-NEXT: s_mov_b32 s7, s0
|
||||
; GFX11-NEXT: s_lshr_b32 s3, s0, 1
|
||||
; GFX11-NEXT: s_not_b32 s2, s2
|
||||
; GFX11-NEXT: s_lshr_b64 s[0:1], s[6:7], 1
|
||||
; GFX11-NEXT: s_mov_b32 s1, s3
|
||||
; GFX11-NEXT: s_and_b32 s2, s2, 31
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
|
||||
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
|
||||
; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
|
||||
; GFX11-NEXT: s_endpgm
|
||||
entry:
|
||||
%0 = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z)
|
||||
@ -113,10 +131,12 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) {
|
||||
; SI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s3
|
||||
; SI-NEXT: s_mov_b32 s4, s0
|
||||
; SI-NEXT: s_mov_b32 s5, s1
|
||||
; SI-NEXT: v_alignbit_b32 v0, s2, v0, 25
|
||||
; SI-NEXT: s_mov_b32 s0, s3
|
||||
; SI-NEXT: s_mov_b32 s1, s2
|
||||
; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], 25
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
@ -124,10 +144,12 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) {
|
||||
; VI: ; %bb.0: ; %entry
|
||||
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s3
|
||||
; VI-NEXT: v_alignbit_b32 v2, s2, v0, 25
|
||||
; VI-NEXT: s_mov_b32 s4, s3
|
||||
; VI-NEXT: s_mov_b32 s5, s2
|
||||
; VI-NEXT: s_lshr_b64 s[2:3], s[4:5], 25
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s2
|
||||
; VI-NEXT: flat_store_dword v[0:1], v2
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
@ -136,8 +158,10 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) {
|
||||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX9-NEXT: v_alignbit_b32 v1, s2, v1, 25
|
||||
; GFX9-NEXT: s_mov_b32 s4, s3
|
||||
; GFX9-NEXT: s_mov_b32 s5, s2
|
||||
; GFX9-NEXT: s_lshr_b64 s[2:3], s[4:5], 25
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
@ -158,16 +182,22 @@ define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) {
|
||||
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: v_alignbit_b32 v1, s2, s3, 25
|
||||
; GFX10-NEXT: s_mov_b32 s4, s3
|
||||
; GFX10-NEXT: s_mov_b32 s5, s2
|
||||
; GFX10-NEXT: s_lshr_b64 s[2:3], s[4:5], 25
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
|
||||
; GFX10-NEXT: s_endpgm
|
||||
;
|
||||
; GFX11-LABEL: fshl_i32_imm:
|
||||
; GFX11: ; %bb.0: ; %entry
|
||||
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
||||
; GFX11-NEXT: v_mov_b32_e32 v0, 0
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: v_alignbit_b32 v1, s2, s3, 25
|
||||
; GFX11-NEXT: s_mov_b32 s4, s3
|
||||
; GFX11-NEXT: s_mov_b32 s5, s2
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: s_lshr_b64 s[2:3], s[4:5], 25
|
||||
; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
|
||||
; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
|
||||
; GFX11-NEXT: s_endpgm
|
||||
entry:
|
||||
@ -185,41 +215,51 @@ define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
|
||||
; SI-NEXT: s_mov_b32 s11, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s10, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s3
|
||||
; SI-NEXT: v_alignbit_b32 v0, s1, v0, 1
|
||||
; SI-NEXT: s_not_b32 s3, s5
|
||||
; SI-NEXT: s_lshr_b32 s1, s1, 1
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; SI-NEXT: v_alignbit_b32 v1, s1, v0, v1
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s2
|
||||
; SI-NEXT: s_not_b32 s1, s4
|
||||
; SI-NEXT: v_alignbit_b32 v0, s0, v0, 1
|
||||
; SI-NEXT: s_lshr_b32 s0, s0, 1
|
||||
; SI-NEXT: v_mov_b32_e32 v2, s1
|
||||
; SI-NEXT: v_alignbit_b32 v0, s0, v0, v2
|
||||
; SI-NEXT: s_mov_b32 s6, s3
|
||||
; SI-NEXT: s_mov_b32 s7, s1
|
||||
; SI-NEXT: s_lshr_b32 s12, s1, 1
|
||||
; SI-NEXT: s_lshr_b64 s[6:7], s[6:7], 1
|
||||
; SI-NEXT: s_not_b32 s1, s5
|
||||
; SI-NEXT: s_mov_b32 s7, s12
|
||||
; SI-NEXT: s_and_b32 s1, s1, 31
|
||||
; SI-NEXT: s_mov_b32 s3, s0
|
||||
; SI-NEXT: s_lshr_b64 s[6:7], s[6:7], s1
|
||||
; SI-NEXT: s_lshr_b32 s5, s0, 1
|
||||
; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], 1
|
||||
; SI-NEXT: s_not_b32 s2, s4
|
||||
; SI-NEXT: s_mov_b32 s1, s5
|
||||
; SI-NEXT: s_and_b32 s2, s2, 31
|
||||
; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s6
|
||||
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: fshl_v2i32:
|
||||
; VI: ; %bb.0: ; %entry
|
||||
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
|
||||
; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x3c
|
||||
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
|
||||
; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
||||
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x3c
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s3
|
||||
; VI-NEXT: s_not_b32 s7, s7
|
||||
; VI-NEXT: s_lshr_b32 s3, s1, 1
|
||||
; VI-NEXT: v_alignbit_b32 v0, s1, v0, 1
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s7
|
||||
; VI-NEXT: v_alignbit_b32 v1, s3, v0, v1
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s2
|
||||
; VI-NEXT: s_not_b32 s1, s6
|
||||
; VI-NEXT: v_alignbit_b32 v0, s0, v0, 1
|
||||
; VI-NEXT: s_lshr_b32 s0, s0, 1
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s1
|
||||
; VI-NEXT: v_alignbit_b32 v0, s0, v0, v2
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s4
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s5
|
||||
; VI-NEXT: s_mov_b32 s8, s3
|
||||
; VI-NEXT: s_mov_b32 s9, s1
|
||||
; VI-NEXT: s_lshr_b32 s10, s1, 1
|
||||
; VI-NEXT: s_lshr_b64 s[8:9], s[8:9], 1
|
||||
; VI-NEXT: s_not_b32 s1, s5
|
||||
; VI-NEXT: s_mov_b32 s9, s10
|
||||
; VI-NEXT: s_and_b32 s1, s1, 31
|
||||
; VI-NEXT: s_mov_b32 s3, s0
|
||||
; VI-NEXT: s_lshr_b64 s[8:9], s[8:9], s1
|
||||
; VI-NEXT: s_lshr_b32 s5, s0, 1
|
||||
; VI-NEXT: s_lshr_b64 s[0:1], s[2:3], 1
|
||||
; VI-NEXT: s_not_b32 s2, s4
|
||||
; VI-NEXT: s_mov_b32 s1, s5
|
||||
; VI-NEXT: s_and_b32 s2, s2, 31
|
||||
; VI-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s6
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s8
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s7
|
||||
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
@ -230,18 +270,23 @@ define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
|
||||
; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x3c
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s3
|
||||
; GFX9-NEXT: s_lshr_b32 s3, s1, 1
|
||||
; GFX9-NEXT: v_alignbit_b32 v0, s1, v0, 1
|
||||
; GFX9-NEXT: s_mov_b32 s4, s3
|
||||
; GFX9-NEXT: s_mov_b32 s5, s1
|
||||
; GFX9-NEXT: s_lshr_b32 s10, s1, 1
|
||||
; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], 1
|
||||
; GFX9-NEXT: s_not_b32 s1, s9
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX9-NEXT: v_alignbit_b32 v1, s3, v0, v1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX9-NEXT: s_not_b32 s1, s8
|
||||
; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, 1
|
||||
; GFX9-NEXT: s_lshr_b32 s0, s0, 1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GFX9-NEXT: v_alignbit_b32 v0, s0, v0, v3
|
||||
; GFX9-NEXT: s_mov_b32 s5, s10
|
||||
; GFX9-NEXT: s_and_b32 s1, s1, 31
|
||||
; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s1
|
||||
; GFX9-NEXT: s_mov_b32 s3, s0
|
||||
; GFX9-NEXT: s_lshr_b32 s5, s0, 1
|
||||
; GFX9-NEXT: s_lshr_b64 s[0:1], s[2:3], 1
|
||||
; GFX9-NEXT: s_not_b32 s2, s8
|
||||
; GFX9-NEXT: s_mov_b32 s1, s5
|
||||
; GFX9-NEXT: s_and_b32 s2, s2, 31
|
||||
; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
@ -271,14 +316,23 @@ define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
|
||||
; GFX10-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x24
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: v_alignbit_b32 v0, s1, s3, 1
|
||||
; GFX10-NEXT: v_alignbit_b32 v3, s0, s2, 1
|
||||
; GFX10-NEXT: s_lshr_b32 s1, s1, 1
|
||||
; GFX10-NEXT: s_not_b32 s2, s7
|
||||
; GFX10-NEXT: s_lshr_b32 s0, s0, 1
|
||||
; GFX10-NEXT: s_not_b32 s3, s6
|
||||
; GFX10-NEXT: v_alignbit_b32 v1, s1, v0, s2
|
||||
; GFX10-NEXT: v_alignbit_b32 v0, s0, v3, s3
|
||||
; GFX10-NEXT: s_mov_b32 s4, s3
|
||||
; GFX10-NEXT: s_mov_b32 s5, s1
|
||||
; GFX10-NEXT: s_mov_b32 s3, s0
|
||||
; GFX10-NEXT: s_lshr_b32 s10, s1, 1
|
||||
; GFX10-NEXT: s_not_b32 s7, s7
|
||||
; GFX10-NEXT: s_lshr_b32 s11, s0, 1
|
||||
; GFX10-NEXT: s_not_b32 s6, s6
|
||||
; GFX10-NEXT: s_lshr_b64 s[0:1], s[4:5], 1
|
||||
; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], 1
|
||||
; GFX10-NEXT: s_and_b32 s4, s7, 31
|
||||
; GFX10-NEXT: s_and_b32 s5, s6, 31
|
||||
; GFX10-NEXT: s_mov_b32 s3, s11
|
||||
; GFX10-NEXT: s_mov_b32 s1, s10
|
||||
; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s5
|
||||
; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], s4
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
|
||||
; GFX10-NEXT: s_endpgm
|
||||
;
|
||||
@ -288,16 +342,25 @@ define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i
|
||||
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
|
||||
; GFX11-NEXT: s_load_b64 s[6:7], s[4:5], 0x3c
|
||||
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
|
||||
; GFX11-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: v_alignbit_b32 v0, s1, s3, 1
|
||||
; GFX11-NEXT: v_alignbit_b32 v3, s0, s2, 1
|
||||
; GFX11-NEXT: s_lshr_b32 s1, s1, 1
|
||||
; GFX11-NEXT: s_not_b32 s2, s7
|
||||
; GFX11-NEXT: s_lshr_b32 s0, s0, 1
|
||||
; GFX11-NEXT: s_not_b32 s3, s6
|
||||
; GFX11-NEXT: v_alignbit_b32 v1, s1, v0, s2
|
||||
; GFX11-NEXT: v_alignbit_b32 v0, s0, v3, s3
|
||||
; GFX11-NEXT: s_mov_b32 s8, s3
|
||||
; GFX11-NEXT: s_mov_b32 s9, s1
|
||||
; GFX11-NEXT: s_mov_b32 s3, s0
|
||||
; GFX11-NEXT: s_lshr_b32 s10, s1, 1
|
||||
; GFX11-NEXT: s_not_b32 s7, s7
|
||||
; GFX11-NEXT: s_lshr_b32 s11, s0, 1
|
||||
; GFX11-NEXT: s_not_b32 s6, s6
|
||||
; GFX11-NEXT: s_lshr_b64 s[0:1], s[8:9], 1
|
||||
; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], 1
|
||||
; GFX11-NEXT: s_and_b32 s7, s7, 31
|
||||
; GFX11-NEXT: s_and_b32 s6, s6, 31
|
||||
; GFX11-NEXT: s_mov_b32 s3, s11
|
||||
; GFX11-NEXT: s_mov_b32 s1, s10
|
||||
; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s6
|
||||
; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s7
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s0
|
||||
; GFX11-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
|
||||
; GFX11-NEXT: s_endpgm
|
||||
entry:
|
||||
@ -314,10 +377,13 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2
|
||||
; SI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s6, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s3
|
||||
; SI-NEXT: v_mov_b32_e32 v2, s2
|
||||
; SI-NEXT: v_alignbit_b32 v1, s1, v0, 23
|
||||
; SI-NEXT: v_alignbit_b32 v0, s0, v2, 25
|
||||
; SI-NEXT: s_mov_b32 s8, s3
|
||||
; SI-NEXT: s_mov_b32 s9, s1
|
||||
; SI-NEXT: s_mov_b32 s3, s0
|
||||
; SI-NEXT: s_lshr_b64 s[8:9], s[8:9], 23
|
||||
; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], 25
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s8
|
||||
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
@ -326,11 +392,14 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2
|
||||
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c
|
||||
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s3
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s2
|
||||
; VI-NEXT: v_alignbit_b32 v1, s1, v0, 23
|
||||
; VI-NEXT: v_alignbit_b32 v0, s0, v2, 25
|
||||
; VI-NEXT: s_mov_b32 s6, s3
|
||||
; VI-NEXT: s_mov_b32 s7, s1
|
||||
; VI-NEXT: s_mov_b32 s3, s0
|
||||
; VI-NEXT: s_lshr_b64 s[0:1], s[6:7], 23
|
||||
; VI-NEXT: s_lshr_b64 s[2:3], s[2:3], 25
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s4
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s2
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s5
|
||||
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
||||
; VI-NEXT: s_endpgm
|
||||
@ -341,10 +410,13 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2
|
||||
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s3
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s2
|
||||
; GFX9-NEXT: v_alignbit_b32 v1, s1, v0, 23
|
||||
; GFX9-NEXT: v_alignbit_b32 v0, s0, v3, 25
|
||||
; GFX9-NEXT: s_mov_b32 s4, s3
|
||||
; GFX9-NEXT: s_mov_b32 s5, s1
|
||||
; GFX9-NEXT: s_mov_b32 s3, s0
|
||||
; GFX9-NEXT: s_lshr_b64 s[0:1], s[4:5], 23
|
||||
; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 25
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
@ -369,8 +441,13 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2
|
||||
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: v_alignbit_b32 v1, s1, s3, 23
|
||||
; GFX10-NEXT: v_alignbit_b32 v0, s0, s2, 25
|
||||
; GFX10-NEXT: s_mov_b32 s4, s3
|
||||
; GFX10-NEXT: s_mov_b32 s3, s0
|
||||
; GFX10-NEXT: s_mov_b32 s5, s1
|
||||
; GFX10-NEXT: s_lshr_b64 s[0:1], s[2:3], 25
|
||||
; GFX10-NEXT: s_lshr_b64 s[2:3], s[4:5], 23
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
|
||||
; GFX10-NEXT: s_endpgm
|
||||
;
|
||||
@ -379,10 +456,15 @@ define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2
|
||||
; GFX11-NEXT: s_clause 0x1
|
||||
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
|
||||
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
|
||||
; GFX11-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: v_alignbit_b32 v1, s1, s3, 23
|
||||
; GFX11-NEXT: v_alignbit_b32 v0, s0, s2, 25
|
||||
; GFX11-NEXT: s_mov_b32 s6, s3
|
||||
; GFX11-NEXT: s_mov_b32 s3, s0
|
||||
; GFX11-NEXT: s_mov_b32 s7, s1
|
||||
; GFX11-NEXT: s_lshr_b64 s[0:1], s[2:3], 25
|
||||
; GFX11-NEXT: s_lshr_b64 s[2:3], s[6:7], 23
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s2
|
||||
; GFX11-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
|
||||
; GFX11-NEXT: s_endpgm
|
||||
entry:
|
||||
@ -395,104 +477,134 @@ define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
|
||||
; SI-LABEL: fshl_v4i32:
|
||||
; SI: ; %bb.0: ; %entry
|
||||
; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd
|
||||
; SI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x15
|
||||
; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
|
||||
; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x15
|
||||
; SI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: s_not_b32 s5, s19
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s15
|
||||
; SI-NEXT: v_alignbit_b32 v0, s11, v0, 1
|
||||
; SI-NEXT: s_lshr_b32 s4, s11, 1
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s5
|
||||
; SI-NEXT: v_alignbit_b32 v3, s4, v0, v1
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s14
|
||||
; SI-NEXT: s_not_b32 s5, s18
|
||||
; SI-NEXT: v_alignbit_b32 v0, s10, v0, 1
|
||||
; SI-NEXT: s_lshr_b32 s4, s10, 1
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s5
|
||||
; SI-NEXT: v_alignbit_b32 v2, s4, v0, v1
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s13
|
||||
; SI-NEXT: s_not_b32 s5, s17
|
||||
; SI-NEXT: v_alignbit_b32 v0, s9, v0, 1
|
||||
; SI-NEXT: s_lshr_b32 s4, s9, 1
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s5
|
||||
; SI-NEXT: v_alignbit_b32 v1, s4, v0, v1
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s12
|
||||
; SI-NEXT: s_not_b32 s5, s16
|
||||
; SI-NEXT: v_alignbit_b32 v0, s8, v0, 1
|
||||
; SI-NEXT: s_lshr_b32 s4, s8, 1
|
||||
; SI-NEXT: v_mov_b32_e32 v4, s5
|
||||
; SI-NEXT: v_alignbit_b32 v0, s4, v0, v4
|
||||
; SI-NEXT: s_mov_b32 s16, s15
|
||||
; SI-NEXT: s_mov_b32 s17, s11
|
||||
; SI-NEXT: s_lshr_b32 s18, s11, 1
|
||||
; SI-NEXT: s_lshr_b64 s[16:17], s[16:17], 1
|
||||
; SI-NEXT: s_not_b32 s7, s7
|
||||
; SI-NEXT: s_mov_b32 s17, s18
|
||||
; SI-NEXT: s_and_b32 s7, s7, 31
|
||||
; SI-NEXT: s_mov_b32 s15, s10
|
||||
; SI-NEXT: s_lshr_b64 s[16:17], s[16:17], s7
|
||||
; SI-NEXT: s_lshr_b32 s7, s10, 1
|
||||
; SI-NEXT: s_lshr_b64 s[10:11], s[14:15], 1
|
||||
; SI-NEXT: s_not_b32 s6, s6
|
||||
; SI-NEXT: s_mov_b32 s11, s7
|
||||
; SI-NEXT: s_and_b32 s6, s6, 31
|
||||
; SI-NEXT: s_lshr_b64 s[6:7], s[10:11], s6
|
||||
; SI-NEXT: s_mov_b32 s10, s13
|
||||
; SI-NEXT: s_mov_b32 s11, s9
|
||||
; SI-NEXT: s_lshr_b32 s7, s9, 1
|
||||
; SI-NEXT: s_lshr_b64 s[10:11], s[10:11], 1
|
||||
; SI-NEXT: s_not_b32 s5, s5
|
||||
; SI-NEXT: s_mov_b32 s11, s7
|
||||
; SI-NEXT: s_and_b32 s5, s5, 31
|
||||
; SI-NEXT: s_mov_b32 s13, s8
|
||||
; SI-NEXT: s_lshr_b64 s[10:11], s[10:11], s5
|
||||
; SI-NEXT: s_lshr_b32 s5, s8, 1
|
||||
; SI-NEXT: s_lshr_b64 s[8:9], s[12:13], 1
|
||||
; SI-NEXT: s_not_b32 s4, s4
|
||||
; SI-NEXT: s_mov_b32 s9, s5
|
||||
; SI-NEXT: s_and_b32 s4, s4, 31
|
||||
; SI-NEXT: s_lshr_b64 s[4:5], s[8:9], s4
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s4
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s10
|
||||
; SI-NEXT: v_mov_b32_e32 v2, s6
|
||||
; SI-NEXT: v_mov_b32_e32 v3, s16
|
||||
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
; VI-LABEL: fshl_v4i32:
|
||||
; VI: ; %bb.0: ; %entry
|
||||
; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
||||
; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
||||
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
|
||||
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s15
|
||||
; VI-NEXT: s_mov_b32 s4, s15
|
||||
; VI-NEXT: s_mov_b32 s5, s11
|
||||
; VI-NEXT: s_lshr_b32 s16, s11, 1
|
||||
; VI-NEXT: s_lshr_b64 s[4:5], s[4:5], 1
|
||||
; VI-NEXT: s_not_b32 s3, s3
|
||||
; VI-NEXT: s_lshr_b32 s6, s11, 1
|
||||
; VI-NEXT: v_alignbit_b32 v0, s11, v0, 1
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s3
|
||||
; VI-NEXT: v_alignbit_b32 v3, s6, v0, v1
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s14
|
||||
; VI-NEXT: s_not_b32 s2, s2
|
||||
; VI-NEXT: v_alignbit_b32 v0, s10, v0, 1
|
||||
; VI-NEXT: s_mov_b32 s5, s16
|
||||
; VI-NEXT: s_and_b32 s3, s3, 31
|
||||
; VI-NEXT: s_mov_b32 s15, s10
|
||||
; VI-NEXT: s_lshr_b64 s[4:5], s[4:5], s3
|
||||
; VI-NEXT: s_lshr_b32 s3, s10, 1
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s2
|
||||
; VI-NEXT: v_alignbit_b32 v2, s3, v0, v1
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s13
|
||||
; VI-NEXT: s_lshr_b64 s[10:11], s[14:15], 1
|
||||
; VI-NEXT: s_not_b32 s2, s2
|
||||
; VI-NEXT: s_mov_b32 s11, s3
|
||||
; VI-NEXT: s_and_b32 s2, s2, 31
|
||||
; VI-NEXT: s_lshr_b64 s[2:3], s[10:11], s2
|
||||
; VI-NEXT: s_mov_b32 s10, s13
|
||||
; VI-NEXT: s_mov_b32 s11, s9
|
||||
; VI-NEXT: s_lshr_b32 s3, s9, 1
|
||||
; VI-NEXT: s_lshr_b64 s[10:11], s[10:11], 1
|
||||
; VI-NEXT: s_not_b32 s1, s1
|
||||
; VI-NEXT: v_alignbit_b32 v0, s9, v0, 1
|
||||
; VI-NEXT: s_lshr_b32 s2, s9, 1
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: v_alignbit_b32 v1, s2, v0, v1
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s12
|
||||
; VI-NEXT: s_not_b32 s0, s0
|
||||
; VI-NEXT: v_alignbit_b32 v0, s8, v0, 1
|
||||
; VI-NEXT: s_mov_b32 s11, s3
|
||||
; VI-NEXT: s_and_b32 s1, s1, 31
|
||||
; VI-NEXT: s_mov_b32 s13, s8
|
||||
; VI-NEXT: s_lshr_b64 s[10:11], s[10:11], s1
|
||||
; VI-NEXT: s_lshr_b32 s1, s8, 1
|
||||
; VI-NEXT: v_mov_b32_e32 v4, s0
|
||||
; VI-NEXT: v_alignbit_b32 v0, s1, v0, v4
|
||||
; VI-NEXT: v_mov_b32_e32 v4, s4
|
||||
; VI-NEXT: v_mov_b32_e32 v5, s5
|
||||
; VI-NEXT: s_lshr_b64 s[8:9], s[12:13], 1
|
||||
; VI-NEXT: s_not_b32 s0, s0
|
||||
; VI-NEXT: s_mov_b32 s9, s1
|
||||
; VI-NEXT: s_and_b32 s0, s0, 31
|
||||
; VI-NEXT: s_lshr_b64 s[0:1], s[8:9], s0
|
||||
; VI-NEXT: v_mov_b32_e32 v4, s6
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s10
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s2
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s4
|
||||
; VI-NEXT: v_mov_b32_e32 v5, s7
|
||||
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
; GFX9-LABEL: fshl_v4i32:
|
||||
; GFX9: ; %bb.0: ; %entry
|
||||
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
||||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
|
||||
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
||||
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
|
||||
; GFX9-NEXT: v_mov_b32_e32 v4, 0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: s_mov_b32 s4, s15
|
||||
; GFX9-NEXT: s_mov_b32 s5, s11
|
||||
; GFX9-NEXT: s_lshr_b32 s16, s11, 1
|
||||
; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], 1
|
||||
; GFX9-NEXT: s_not_b32 s3, s3
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s15
|
||||
; GFX9-NEXT: s_lshr_b32 s4, s11, 1
|
||||
; GFX9-NEXT: v_alignbit_b32 v0, s11, v0, 1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GFX9-NEXT: v_alignbit_b32 v3, s4, v0, v1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s14
|
||||
; GFX9-NEXT: s_not_b32 s2, s2
|
||||
; GFX9-NEXT: v_alignbit_b32 v0, s10, v0, 1
|
||||
; GFX9-NEXT: s_mov_b32 s5, s16
|
||||
; GFX9-NEXT: s_and_b32 s3, s3, 31
|
||||
; GFX9-NEXT: s_mov_b32 s15, s10
|
||||
; GFX9-NEXT: s_lshr_b64 s[4:5], s[4:5], s3
|
||||
; GFX9-NEXT: s_lshr_b32 s3, s10, 1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX9-NEXT: v_alignbit_b32 v2, s3, v0, v1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s13
|
||||
; GFX9-NEXT: s_lshr_b64 s[10:11], s[14:15], 1
|
||||
; GFX9-NEXT: s_not_b32 s2, s2
|
||||
; GFX9-NEXT: s_mov_b32 s11, s3
|
||||
; GFX9-NEXT: s_and_b32 s2, s2, 31
|
||||
; GFX9-NEXT: s_lshr_b64 s[2:3], s[10:11], s2
|
||||
; GFX9-NEXT: s_mov_b32 s10, s13
|
||||
; GFX9-NEXT: s_mov_b32 s11, s9
|
||||
; GFX9-NEXT: s_lshr_b32 s3, s9, 1
|
||||
; GFX9-NEXT: s_lshr_b64 s[10:11], s[10:11], 1
|
||||
; GFX9-NEXT: s_not_b32 s1, s1
|
||||
; GFX9-NEXT: v_alignbit_b32 v0, s9, v0, 1
|
||||
; GFX9-NEXT: s_lshr_b32 s2, s9, 1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX9-NEXT: v_alignbit_b32 v1, s2, v0, v1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s12
|
||||
; GFX9-NEXT: s_not_b32 s0, s0
|
||||
; GFX9-NEXT: v_alignbit_b32 v0, s8, v0, 1
|
||||
; GFX9-NEXT: s_mov_b32 s11, s3
|
||||
; GFX9-NEXT: s_and_b32 s1, s1, 31
|
||||
; GFX9-NEXT: s_mov_b32 s13, s8
|
||||
; GFX9-NEXT: s_lshr_b64 s[10:11], s[10:11], s1
|
||||
; GFX9-NEXT: s_lshr_b32 s1, s8, 1
|
||||
; GFX9-NEXT: v_mov_b32_e32 v5, s0
|
||||
; GFX9-NEXT: v_alignbit_b32 v0, s1, v0, v5
|
||||
; GFX9-NEXT: s_lshr_b64 s[8:9], s[12:13], 1
|
||||
; GFX9-NEXT: s_not_b32 s0, s0
|
||||
; GFX9-NEXT: s_mov_b32 s9, s1
|
||||
; GFX9-NEXT: s_and_b32 s0, s0, 31
|
||||
; GFX9-NEXT: s_lshr_b64 s[0:1], s[8:9], s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s10
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s4
|
||||
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
@ -530,22 +642,40 @@ define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
|
||||
; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
|
||||
; GFX10-NEXT: v_mov_b32_e32 v4, 0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: v_alignbit_b32 v0, s11, s15, 1
|
||||
; GFX10-NEXT: v_alignbit_b32 v1, s10, s14, 1
|
||||
; GFX10-NEXT: v_alignbit_b32 v5, s9, s13, 1
|
||||
; GFX10-NEXT: v_alignbit_b32 v6, s8, s12, 1
|
||||
; GFX10-NEXT: s_lshr_b32 s4, s11, 1
|
||||
; GFX10-NEXT: s_not_b32 s3, s3
|
||||
; GFX10-NEXT: s_lshr_b32 s5, s10, 1
|
||||
; GFX10-NEXT: s_not_b32 s2, s2
|
||||
; GFX10-NEXT: s_lshr_b32 s9, s9, 1
|
||||
; GFX10-NEXT: s_mov_b32 s4, s15
|
||||
; GFX10-NEXT: s_mov_b32 s5, s11
|
||||
; GFX10-NEXT: s_mov_b32 s15, s10
|
||||
; GFX10-NEXT: s_lshr_b32 s16, s11, 1
|
||||
; GFX10-NEXT: s_not_b32 s11, s3
|
||||
; GFX10-NEXT: s_lshr_b32 s17, s10, 1
|
||||
; GFX10-NEXT: s_not_b32 s10, s2
|
||||
; GFX10-NEXT: s_lshr_b32 s18, s9, 1
|
||||
; GFX10-NEXT: s_mov_b32 s2, s13
|
||||
; GFX10-NEXT: s_mov_b32 s3, s9
|
||||
; GFX10-NEXT: s_lshr_b32 s19, s8, 1
|
||||
; GFX10-NEXT: s_mov_b32 s13, s8
|
||||
; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], 1
|
||||
; GFX10-NEXT: s_lshr_b64 s[8:9], s[14:15], 1
|
||||
; GFX10-NEXT: s_and_b32 s11, s11, 31
|
||||
; GFX10-NEXT: s_and_b32 s10, s10, 31
|
||||
; GFX10-NEXT: s_mov_b32 s5, s16
|
||||
; GFX10-NEXT: s_mov_b32 s9, s17
|
||||
; GFX10-NEXT: s_not_b32 s1, s1
|
||||
; GFX10-NEXT: s_lshr_b32 s8, s8, 1
|
||||
; GFX10-NEXT: s_not_b32 s0, s0
|
||||
; GFX10-NEXT: v_alignbit_b32 v3, s4, v0, s3
|
||||
; GFX10-NEXT: v_alignbit_b32 v2, s5, v1, s2
|
||||
; GFX10-NEXT: v_alignbit_b32 v1, s9, v5, s1
|
||||
; GFX10-NEXT: v_alignbit_b32 v0, s8, v6, s0
|
||||
; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], 1
|
||||
; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], s11
|
||||
; GFX10-NEXT: s_lshr_b64 s[8:9], s[8:9], s10
|
||||
; GFX10-NEXT: s_lshr_b64 s[10:11], s[12:13], 1
|
||||
; GFX10-NEXT: s_mov_b32 s3, s18
|
||||
; GFX10-NEXT: s_mov_b32 s11, s19
|
||||
; GFX10-NEXT: s_and_b32 s0, s0, 31
|
||||
; GFX10-NEXT: s_and_b32 s5, s1, 31
|
||||
; GFX10-NEXT: s_lshr_b64 s[0:1], s[10:11], s0
|
||||
; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], s5
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, s8
|
||||
; GFX10-NEXT: v_mov_b32_e32 v3, s4
|
||||
; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
|
||||
; GFX10-NEXT: s_endpgm
|
||||
;
|
||||
@ -555,24 +685,41 @@ define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i
|
||||
; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34
|
||||
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x54
|
||||
; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x24
|
||||
; GFX11-NEXT: v_mov_b32_e32 v4, 0
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: v_alignbit_b32 v0, s11, s15, 1
|
||||
; GFX11-NEXT: v_alignbit_b32 v1, s10, s14, 1
|
||||
; GFX11-NEXT: v_alignbit_b32 v5, s9, s13, 1
|
||||
; GFX11-NEXT: v_alignbit_b32 v6, s8, s12, 1
|
||||
; GFX11-NEXT: s_lshr_b32 s6, s11, 1
|
||||
; GFX11-NEXT: s_not_b32 s3, s3
|
||||
; GFX11-NEXT: s_lshr_b32 s7, s10, 1
|
||||
; GFX11-NEXT: s_not_b32 s2, s2
|
||||
; GFX11-NEXT: s_lshr_b32 s9, s9, 1
|
||||
; GFX11-NEXT: s_mov_b32 s6, s15
|
||||
; GFX11-NEXT: s_mov_b32 s7, s11
|
||||
; GFX11-NEXT: s_mov_b32 s15, s10
|
||||
; GFX11-NEXT: s_lshr_b32 s16, s11, 1
|
||||
; GFX11-NEXT: s_not_b32 s11, s3
|
||||
; GFX11-NEXT: s_lshr_b32 s17, s10, 1
|
||||
; GFX11-NEXT: s_not_b32 s10, s2
|
||||
; GFX11-NEXT: s_lshr_b32 s18, s9, 1
|
||||
; GFX11-NEXT: s_mov_b32 s2, s13
|
||||
; GFX11-NEXT: s_mov_b32 s3, s9
|
||||
; GFX11-NEXT: s_lshr_b32 s19, s8, 1
|
||||
; GFX11-NEXT: s_mov_b32 s13, s8
|
||||
; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], 1
|
||||
; GFX11-NEXT: s_lshr_b64 s[8:9], s[14:15], 1
|
||||
; GFX11-NEXT: s_and_b32 s11, s11, 31
|
||||
; GFX11-NEXT: s_and_b32 s10, s10, 31
|
||||
; GFX11-NEXT: s_mov_b32 s7, s16
|
||||
; GFX11-NEXT: s_mov_b32 s9, s17
|
||||
; GFX11-NEXT: s_not_b32 s1, s1
|
||||
; GFX11-NEXT: s_lshr_b32 s8, s8, 1
|
||||
; GFX11-NEXT: s_not_b32 s0, s0
|
||||
; GFX11-NEXT: v_alignbit_b32 v3, s6, v0, s3
|
||||
; GFX11-NEXT: v_alignbit_b32 v2, s7, v1, s2
|
||||
; GFX11-NEXT: v_alignbit_b32 v1, s9, v5, s1
|
||||
; GFX11-NEXT: v_alignbit_b32 v0, s8, v6, s0
|
||||
; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], 1
|
||||
; GFX11-NEXT: s_lshr_b64 s[6:7], s[6:7], s11
|
||||
; GFX11-NEXT: s_lshr_b64 s[8:9], s[8:9], s10
|
||||
; GFX11-NEXT: s_lshr_b64 s[10:11], s[12:13], 1
|
||||
; GFX11-NEXT: s_mov_b32 s3, s18
|
||||
; GFX11-NEXT: s_mov_b32 s11, s19
|
||||
; GFX11-NEXT: s_and_b32 s0, s0, 31
|
||||
; GFX11-NEXT: s_and_b32 s7, s1, 31
|
||||
; GFX11-NEXT: s_lshr_b64 s[0:1], s[10:11], s0
|
||||
; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], s7
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s2
|
||||
; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s6
|
||||
; GFX11-NEXT: v_mov_b32_e32 v2, s8
|
||||
; GFX11-NEXT: global_store_b128 v4, v[0:3], s[4:5]
|
||||
; GFX11-NEXT: s_endpgm
|
||||
entry:
|
||||
@ -589,14 +736,20 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4
|
||||
; SI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; SI-NEXT: s_mov_b32 s2, -1
|
||||
; SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s15
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s14
|
||||
; SI-NEXT: v_alignbit_b32 v3, s11, v0, 31
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s13
|
||||
; SI-NEXT: v_alignbit_b32 v2, s10, v1, 23
|
||||
; SI-NEXT: v_alignbit_b32 v1, s9, v0, 25
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s12
|
||||
; SI-NEXT: v_alignbit_b32 v0, s8, v0, 31
|
||||
; SI-NEXT: s_mov_b32 s4, s15
|
||||
; SI-NEXT: s_mov_b32 s5, s11
|
||||
; SI-NEXT: s_mov_b32 s15, s10
|
||||
; SI-NEXT: s_mov_b32 s10, s13
|
||||
; SI-NEXT: s_mov_b32 s11, s9
|
||||
; SI-NEXT: s_mov_b32 s13, s8
|
||||
; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], 31
|
||||
; SI-NEXT: s_lshr_b64 s[6:7], s[14:15], 23
|
||||
; SI-NEXT: s_lshr_b64 s[10:11], s[10:11], 25
|
||||
; SI-NEXT: s_lshr_b64 s[8:9], s[12:13], 31
|
||||
; SI-NEXT: v_mov_b32_e32 v0, s8
|
||||
; SI-NEXT: v_mov_b32_e32 v1, s10
|
||||
; SI-NEXT: v_mov_b32_e32 v2, s6
|
||||
; SI-NEXT: v_mov_b32_e32 v3, s4
|
||||
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
||||
; SI-NEXT: s_endpgm
|
||||
;
|
||||
@ -605,15 +758,21 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4
|
||||
; VI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
|
||||
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s15
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s14
|
||||
; VI-NEXT: v_mov_b32_e32 v4, s13
|
||||
; VI-NEXT: v_alignbit_b32 v3, s11, v0, 31
|
||||
; VI-NEXT: v_alignbit_b32 v2, s10, v1, 23
|
||||
; VI-NEXT: v_alignbit_b32 v1, s9, v4, 25
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s12
|
||||
; VI-NEXT: s_mov_b32 s2, s15
|
||||
; VI-NEXT: s_mov_b32 s3, s11
|
||||
; VI-NEXT: s_mov_b32 s15, s10
|
||||
; VI-NEXT: s_mov_b32 s6, s13
|
||||
; VI-NEXT: s_mov_b32 s7, s9
|
||||
; VI-NEXT: s_mov_b32 s13, s8
|
||||
; VI-NEXT: s_lshr_b64 s[2:3], s[2:3], 31
|
||||
; VI-NEXT: s_lshr_b64 s[4:5], s[14:15], 23
|
||||
; VI-NEXT: s_lshr_b64 s[6:7], s[6:7], 25
|
||||
; VI-NEXT: s_lshr_b64 s[8:9], s[12:13], 31
|
||||
; VI-NEXT: v_mov_b32_e32 v5, s1
|
||||
; VI-NEXT: v_alignbit_b32 v0, s8, v0, 31
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s8
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s6
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s4
|
||||
; VI-NEXT: v_mov_b32_e32 v3, s2
|
||||
; VI-NEXT: v_mov_b32_e32 v4, s0
|
||||
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
|
||||
; VI-NEXT: s_endpgm
|
||||
@ -624,14 +783,20 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4
|
||||
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX9-NEXT: v_mov_b32_e32 v4, 0
|
||||
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s15
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s14
|
||||
; GFX9-NEXT: v_alignbit_b32 v3, s11, v0, 31
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s13
|
||||
; GFX9-NEXT: v_alignbit_b32 v2, s10, v1, 23
|
||||
; GFX9-NEXT: v_alignbit_b32 v1, s9, v0, 25
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s12
|
||||
; GFX9-NEXT: v_alignbit_b32 v0, s8, v0, 31
|
||||
; GFX9-NEXT: s_mov_b32 s2, s15
|
||||
; GFX9-NEXT: s_mov_b32 s3, s11
|
||||
; GFX9-NEXT: s_mov_b32 s15, s10
|
||||
; GFX9-NEXT: s_mov_b32 s6, s13
|
||||
; GFX9-NEXT: s_mov_b32 s7, s9
|
||||
; GFX9-NEXT: s_mov_b32 s13, s8
|
||||
; GFX9-NEXT: s_lshr_b64 s[2:3], s[2:3], 31
|
||||
; GFX9-NEXT: s_lshr_b64 s[4:5], s[14:15], 23
|
||||
; GFX9-NEXT: s_lshr_b64 s[6:7], s[6:7], 25
|
||||
; GFX9-NEXT: s_lshr_b64 s[8:9], s[12:13], 31
|
||||
; GFX9-NEXT: v_mov_b32_e32 v0, s8
|
||||
; GFX9-NEXT: v_mov_b32_e32 v1, s6
|
||||
; GFX9-NEXT: v_mov_b32_e32 v2, s4
|
||||
; GFX9-NEXT: v_mov_b32_e32 v3, s2
|
||||
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
@ -660,10 +825,20 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4
|
||||
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
|
||||
; GFX10-NEXT: v_mov_b32_e32 v4, 0
|
||||
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX10-NEXT: v_alignbit_b32 v3, s11, s15, 31
|
||||
; GFX10-NEXT: v_alignbit_b32 v2, s10, s14, 23
|
||||
; GFX10-NEXT: v_alignbit_b32 v1, s9, s13, 25
|
||||
; GFX10-NEXT: v_alignbit_b32 v0, s8, s12, 31
|
||||
; GFX10-NEXT: s_mov_b32 s2, s15
|
||||
; GFX10-NEXT: s_mov_b32 s3, s11
|
||||
; GFX10-NEXT: s_mov_b32 s15, s10
|
||||
; GFX10-NEXT: s_mov_b32 s4, s13
|
||||
; GFX10-NEXT: s_mov_b32 s5, s9
|
||||
; GFX10-NEXT: s_mov_b32 s13, s8
|
||||
; GFX10-NEXT: s_lshr_b64 s[2:3], s[2:3], 31
|
||||
; GFX10-NEXT: s_lshr_b64 s[6:7], s[14:15], 23
|
||||
; GFX10-NEXT: s_lshr_b64 s[8:9], s[12:13], 31
|
||||
; GFX10-NEXT: s_lshr_b64 s[4:5], s[4:5], 25
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, s8
|
||||
; GFX10-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GFX10-NEXT: v_mov_b32_e32 v2, s6
|
||||
; GFX10-NEXT: v_mov_b32_e32 v3, s2
|
||||
; GFX10-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
|
||||
; GFX10-NEXT: s_endpgm
|
||||
;
|
||||
@ -672,12 +847,21 @@ define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4
|
||||
; GFX11-NEXT: s_clause 0x1
|
||||
; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x34
|
||||
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
|
||||
; GFX11-NEXT: v_mov_b32_e32 v4, 0
|
||||
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX11-NEXT: v_alignbit_b32 v3, s11, s15, 31
|
||||
; GFX11-NEXT: v_alignbit_b32 v2, s10, s14, 23
|
||||
; GFX11-NEXT: v_alignbit_b32 v1, s9, s13, 25
|
||||
; GFX11-NEXT: v_alignbit_b32 v0, s8, s12, 31
|
||||
; GFX11-NEXT: s_mov_b32 s2, s15
|
||||
; GFX11-NEXT: s_mov_b32 s3, s11
|
||||
; GFX11-NEXT: s_mov_b32 s15, s10
|
||||
; GFX11-NEXT: s_mov_b32 s4, s13
|
||||
; GFX11-NEXT: s_mov_b32 s5, s9
|
||||
; GFX11-NEXT: s_mov_b32 s13, s8
|
||||
; GFX11-NEXT: s_lshr_b64 s[2:3], s[2:3], 31
|
||||
; GFX11-NEXT: s_lshr_b64 s[6:7], s[14:15], 23
|
||||
; GFX11-NEXT: s_lshr_b64 s[8:9], s[12:13], 31
|
||||
; GFX11-NEXT: s_lshr_b64 s[4:5], s[4:5], 25
|
||||
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
||||
; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s4
|
||||
; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v3, s2
|
||||
; GFX11-NEXT: v_mov_b32_e32 v2, s6
|
||||
; GFX11-NEXT: global_store_b128 v4, v[0:3], s[0:1]
|
||||
; GFX11-NEXT: s_endpgm
|
||||
entry:
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -238,11 +238,11 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(ptr addrspace(1) %out,
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s4
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: s_lshr_b32 s0, s2, 16
|
||||
; VI-NEXT: v_alignbit_b32 v2, s0, v2, 16
|
||||
; VI-NEXT: s_lshr_b32 s5, s2, 16
|
||||
; VI-NEXT: s_lshr_b64 s[0:1], s[4:5], 16
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s0
|
||||
; VI-NEXT: flat_store_dword v[0:1], v2
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
@ -256,11 +256,11 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(ptr addrspace(1) %out,
|
||||
; CI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
|
||||
; CI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; CI-NEXT: v_mov_b32_e32 v2, s4
|
||||
; CI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; CI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CI-NEXT: s_lshr_b32 s0, s2, 16
|
||||
; CI-NEXT: v_alignbit_b32 v2, s0, v2, 16
|
||||
; CI-NEXT: s_lshr_b32 s5, s2, 16
|
||||
; CI-NEXT: s_lshr_b64 s[0:1], s[4:5], 16
|
||||
; CI-NEXT: v_mov_b32_e32 v2, s0
|
||||
; CI-NEXT: flat_store_dword v[0:1], v2
|
||||
; CI-NEXT: s_endpgm
|
||||
;
|
||||
@ -312,16 +312,16 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspa
|
||||
; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s4
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: s_lshr_b32 s0, s4, 16
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: s_lshr_b32 s3, s4, 16
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: s_lshr_b32 s1, s2, 16
|
||||
; VI-NEXT: v_alignbit_b32 v2, s1, v2, 16
|
||||
; VI-NEXT: s_lshr_b32 s5, s2, 16
|
||||
; VI-NEXT: s_lshr_b64 s[0:1], s[4:5], 16
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s0
|
||||
; VI-NEXT: flat_store_dword v[0:1], v2
|
||||
; VI-NEXT: ;;#ASMSTART
|
||||
; VI-NEXT: ; use s0
|
||||
; VI-NEXT: ; use s3
|
||||
; VI-NEXT: ;;#ASMEND
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
@ -334,16 +334,16 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspa
|
||||
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
||||
; CI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
|
||||
; CI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; CI-NEXT: v_mov_b32_e32 v2, s4
|
||||
; CI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; CI-NEXT: s_lshr_b32 s0, s4, 16
|
||||
; CI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; CI-NEXT: s_lshr_b32 s3, s4, 16
|
||||
; CI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CI-NEXT: s_lshr_b32 s1, s2, 16
|
||||
; CI-NEXT: v_alignbit_b32 v2, s1, v2, 16
|
||||
; CI-NEXT: s_lshr_b32 s5, s2, 16
|
||||
; CI-NEXT: s_lshr_b64 s[0:1], s[4:5], 16
|
||||
; CI-NEXT: v_mov_b32_e32 v2, s0
|
||||
; CI-NEXT: flat_store_dword v[0:1], v2
|
||||
; CI-NEXT: ;;#ASMSTART
|
||||
; CI-NEXT: ; use s0
|
||||
; CI-NEXT: ; use s3
|
||||
; CI-NEXT: ;;#ASMEND
|
||||
; CI-NEXT: s_endpgm
|
||||
;
|
||||
@ -405,19 +405,19 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr ad
|
||||
; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s4
|
||||
; VI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; VI-NEXT: s_lshr_b32 s0, s4, 16
|
||||
; VI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; VI-NEXT: s_lshr_b32 s3, s4, 16
|
||||
; VI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; VI-NEXT: s_lshr_b32 s1, s2, 16
|
||||
; VI-NEXT: v_alignbit_b32 v2, s1, v2, 16
|
||||
; VI-NEXT: s_lshr_b32 s5, s2, 16
|
||||
; VI-NEXT: s_lshr_b64 s[0:1], s[4:5], 16
|
||||
; VI-NEXT: v_mov_b32_e32 v2, s0
|
||||
; VI-NEXT: flat_store_dword v[0:1], v2
|
||||
; VI-NEXT: ;;#ASMSTART
|
||||
; VI-NEXT: ; use s0
|
||||
; VI-NEXT: ; use s3
|
||||
; VI-NEXT: ;;#ASMEND
|
||||
; VI-NEXT: ;;#ASMSTART
|
||||
; VI-NEXT: ; use s1
|
||||
; VI-NEXT: ; use s5
|
||||
; VI-NEXT: ;;#ASMEND
|
||||
; VI-NEXT: s_endpgm
|
||||
;
|
||||
@ -430,19 +430,19 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr ad
|
||||
; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
||||
; CI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
|
||||
; CI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; CI-NEXT: v_mov_b32_e32 v2, s4
|
||||
; CI-NEXT: v_mov_b32_e32 v0, s0
|
||||
; CI-NEXT: s_lshr_b32 s0, s4, 16
|
||||
; CI-NEXT: v_mov_b32_e32 v1, s1
|
||||
; CI-NEXT: s_lshr_b32 s3, s4, 16
|
||||
; CI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; CI-NEXT: s_lshr_b32 s1, s2, 16
|
||||
; CI-NEXT: v_alignbit_b32 v2, s1, v2, 16
|
||||
; CI-NEXT: s_lshr_b32 s5, s2, 16
|
||||
; CI-NEXT: s_lshr_b64 s[0:1], s[4:5], 16
|
||||
; CI-NEXT: v_mov_b32_e32 v2, s0
|
||||
; CI-NEXT: flat_store_dword v[0:1], v2
|
||||
; CI-NEXT: ;;#ASMSTART
|
||||
; CI-NEXT: ; use s0
|
||||
; CI-NEXT: ; use s3
|
||||
; CI-NEXT: ;;#ASMEND
|
||||
; CI-NEXT: ;;#ASMSTART
|
||||
; CI-NEXT: ; use s1
|
||||
; CI-NEXT: ; use s5
|
||||
; CI-NEXT: ;;#ASMEND
|
||||
; CI-NEXT: s_endpgm
|
||||
;
|
||||
|
||||
@ -9775,17 +9775,17 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out
|
||||
; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NOHSA-NEXT: s_load_dword s4, s[2:3], 0x0
|
||||
; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1
|
||||
; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s5, s4, 0xff00
|
||||
; GFX6-NOHSA-NEXT: s_lshr_b32 s6, s4, 24
|
||||
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xff
|
||||
; GFX6-NOHSA-NEXT: v_alignbit_b32 v0, s6, v0, 16
|
||||
; GFX6-NOHSA-NEXT: s_lshl_b32 s5, s5, 8
|
||||
; GFX6-NOHSA-NEXT: s_or_b32 s4, s4, s5
|
||||
; GFX6-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
|
||||
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX6-NOHSA-NEXT: s_lshr_b32 s5, s4, 24
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s2, s4, 0xff00
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s6, s4, 0xff
|
||||
; GFX6-NOHSA-NEXT: s_lshl_b32 s2, s2, 8
|
||||
; GFX6-NOHSA-NEXT: s_lshr_b64 s[4:5], s[4:5], 16
|
||||
; GFX6-NOHSA-NEXT: s_or_b32 s5, s6, s2
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xff00ff
|
||||
; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1
|
||||
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s5
|
||||
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GFX6-NOHSA-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
|
||||
; GFX6-NOHSA-NEXT: s_endpgm
|
||||
;
|
||||
@ -9800,15 +9800,15 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-HSA-NEXT: s_lshr_b32 s3, s2, 24
|
||||
; GFX7-HSA-NEXT: s_and_b32 s0, s2, 0xff00
|
||||
; GFX7-HSA-NEXT: s_lshr_b32 s1, s2, 24
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX7-HSA-NEXT: s_and_b32 s2, s2, 0xff
|
||||
; GFX7-HSA-NEXT: s_lshl_b32 s0, s0, 8
|
||||
; GFX7-HSA-NEXT: v_alignbit_b32 v2, s1, v2, 16
|
||||
; GFX7-HSA-NEXT: s_or_b32 s0, s2, s0
|
||||
; GFX7-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v2
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GFX7-HSA-NEXT: s_and_b32 s4, s2, 0xff
|
||||
; GFX7-HSA-NEXT: s_lshl_b32 s5, s0, 8
|
||||
; GFX7-HSA-NEXT: s_lshr_b64 s[0:1], s[2:3], 16
|
||||
; GFX7-HSA-NEXT: s_or_b32 s1, s4, s5
|
||||
; GFX7-HSA-NEXT: s_and_b32 s0, s0, 0xff00ff
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s0
|
||||
; GFX7-HSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
||||
; GFX7-HSA-NEXT: s_endpgm
|
||||
;
|
||||
@ -9820,15 +9820,15 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s2, 24
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX8-NOHSA-NEXT: s_and_b32 s1, s2, 0xff
|
||||
; GFX8-NOHSA-NEXT: s_lshl_b32 s2, s2, 8
|
||||
; GFX8-NOHSA-NEXT: v_alignbit_b32 v2, s0, v2, 16
|
||||
; GFX8-NOHSA-NEXT: s_and_b32 s0, s2, 0xff0000
|
||||
; GFX8-NOHSA-NEXT: s_or_b32 s0, s1, s0
|
||||
; GFX8-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v2
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GFX8-NOHSA-NEXT: s_lshr_b32 s3, s2, 24
|
||||
; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s2, 8
|
||||
; GFX8-NOHSA-NEXT: s_and_b32 s4, s2, 0xff
|
||||
; GFX8-NOHSA-NEXT: s_and_b32 s5, s0, 0xff0000
|
||||
; GFX8-NOHSA-NEXT: s_lshr_b64 s[0:1], s[2:3], 16
|
||||
; GFX8-NOHSA-NEXT: s_or_b32 s1, s4, s5
|
||||
; GFX8-NOHSA-NEXT: s_and_b32 s0, s0, 0xff00ff
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s0
|
||||
; GFX8-NOHSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
||||
; GFX8-NOHSA-NEXT: s_endpgm
|
||||
;
|
||||
@ -10062,26 +10062,28 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out
|
||||
; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NOHSA-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
|
||||
; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1
|
||||
; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s6, s4, 0xff00
|
||||
; GFX6-NOHSA-NEXT: s_lshr_b32 s7, s4, 24
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s8, s5, 0xff00
|
||||
; GFX6-NOHSA-NEXT: s_lshr_b32 s9, s5, 24
|
||||
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s5
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s5, s5, 0xff
|
||||
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xff
|
||||
; GFX6-NOHSA-NEXT: v_alignbit_b32 v0, s9, v0, 16
|
||||
; GFX6-NOHSA-NEXT: s_lshr_b32 s2, s4, 24
|
||||
; GFX6-NOHSA-NEXT: s_lshr_b32 s7, s5, 24
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s8, s4, 0xff00
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s9, s5, 0xff00
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s10, s5, 0xff
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s11, s4, 0xff
|
||||
; GFX6-NOHSA-NEXT: s_mov_b32 s6, s5
|
||||
; GFX6-NOHSA-NEXT: s_lshl_b32 s9, s9, 8
|
||||
; GFX6-NOHSA-NEXT: s_lshl_b32 s8, s8, 8
|
||||
; GFX6-NOHSA-NEXT: v_alignbit_b32 v1, s7, v1, 16
|
||||
; GFX6-NOHSA-NEXT: s_lshl_b32 s6, s6, 8
|
||||
; GFX6-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0
|
||||
; GFX6-NOHSA-NEXT: s_or_b32 s5, s5, s8
|
||||
; GFX6-NOHSA-NEXT: s_or_b32 s4, s4, s6
|
||||
; GFX6-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v1
|
||||
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX6-NOHSA-NEXT: s_lshr_b64 s[6:7], s[6:7], 16
|
||||
; GFX6-NOHSA-NEXT: s_mov_b32 s5, s2
|
||||
; GFX6-NOHSA-NEXT: s_lshr_b64 s[4:5], s[4:5], 16
|
||||
; GFX6-NOHSA-NEXT: s_or_b32 s5, s10, s9
|
||||
; GFX6-NOHSA-NEXT: s_or_b32 s7, s11, s8
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s6, s6, 0xff00ff
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xff00ff
|
||||
; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1
|
||||
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s7
|
||||
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s5
|
||||
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s6
|
||||
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
||||
; GFX6-NOHSA-NEXT: s_endpgm
|
||||
;
|
||||
@ -10096,24 +10098,26 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1
|
||||
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-HSA-NEXT: s_lshr_b32 s5, s3, 24
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s3
|
||||
; GFX7-HSA-NEXT: v_alignbit_b32 v0, s5, v0, 16
|
||||
; GFX7-HSA-NEXT: s_and_b32 s0, s2, 0xff00
|
||||
; GFX7-HSA-NEXT: s_lshr_b32 s1, s2, 24
|
||||
; GFX7-HSA-NEXT: s_and_b32 s4, s3, 0xff00
|
||||
; GFX7-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX7-HSA-NEXT: s_and_b32 s3, s3, 0xff
|
||||
; GFX7-HSA-NEXT: s_lshl_b32 s4, s4, 8
|
||||
; GFX7-HSA-NEXT: v_alignbit_b32 v0, s1, v0, 16
|
||||
; GFX7-HSA-NEXT: s_and_b32 s1, s2, 0xff
|
||||
; GFX7-HSA-NEXT: s_and_b32 s5, s3, 0xff00
|
||||
; GFX7-HSA-NEXT: s_and_b32 s6, s3, 0xff
|
||||
; GFX7-HSA-NEXT: s_and_b32 s7, s2, 0xff
|
||||
; GFX7-HSA-NEXT: s_lshl_b32 s5, s5, 8
|
||||
; GFX7-HSA-NEXT: s_lshl_b32 s0, s0, 8
|
||||
; GFX7-HSA-NEXT: s_or_b32 s3, s3, s4
|
||||
; GFX7-HSA-NEXT: s_or_b32 s0, s1, s0
|
||||
; GFX7-HSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3
|
||||
; GFX7-HSA-NEXT: s_lshr_b32 s4, s2, 24
|
||||
; GFX7-HSA-NEXT: s_lshr_b32 s1, s3, 24
|
||||
; GFX7-HSA-NEXT: s_or_b32 s5, s6, s5
|
||||
; GFX7-HSA-NEXT: s_or_b32 s6, s7, s0
|
||||
; GFX7-HSA-NEXT: s_mov_b32 s0, s3
|
||||
; GFX7-HSA-NEXT: s_lshr_b64 s[0:1], s[0:1], 16
|
||||
; GFX7-HSA-NEXT: s_mov_b32 s3, s4
|
||||
; GFX7-HSA-NEXT: s_and_b32 s7, s0, 0xff00ff
|
||||
; GFX7-HSA-NEXT: s_lshr_b64 s[0:1], s[2:3], 16
|
||||
; GFX7-HSA-NEXT: s_and_b32 s0, s0, 0xff00ff
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s5
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s7
|
||||
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
|
||||
; GFX7-HSA-NEXT: s_endpgm
|
||||
;
|
||||
@ -10122,28 +10126,29 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out
|
||||
; GFX8-NOHSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
||||
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOHSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s2, 24
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX8-NOHSA-NEXT: s_lshr_b32 s1, s3, 24
|
||||
; GFX8-NOHSA-NEXT: s_bfe_u32 s4, s3, 0x80010
|
||||
; GFX8-NOHSA-NEXT: s_and_b32 s5, s3, 0xff
|
||||
; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s3, 8
|
||||
; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s0, v0, 16
|
||||
; GFX8-NOHSA-NEXT: s_and_b32 s0, s2, 0xff
|
||||
; GFX8-NOHSA-NEXT: s_lshl_b32 s2, s2, 8
|
||||
; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s1, 16
|
||||
; GFX8-NOHSA-NEXT: s_and_b32 s3, s3, 0xff0000
|
||||
; GFX8-NOHSA-NEXT: s_and_b32 s2, s2, 0xff0000
|
||||
; GFX8-NOHSA-NEXT: s_or_b32 s1, s4, s1
|
||||
; GFX8-NOHSA-NEXT: s_or_b32 s3, s5, s3
|
||||
; GFX8-NOHSA-NEXT: s_or_b32 s0, s0, s2
|
||||
; GFX8-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s3
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s1, 16
|
||||
; GFX8-NOHSA-NEXT: s_or_b32 s5, s5, s3
|
||||
; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s2, 8
|
||||
; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s2, 24
|
||||
; GFX8-NOHSA-NEXT: s_or_b32 s4, s4, s1
|
||||
; GFX8-NOHSA-NEXT: s_and_b32 s1, s2, 0xff
|
||||
; GFX8-NOHSA-NEXT: s_and_b32 s3, s3, 0xff0000
|
||||
; GFX8-NOHSA-NEXT: s_or_b32 s6, s1, s3
|
||||
; GFX8-NOHSA-NEXT: s_mov_b32 s3, s0
|
||||
; GFX8-NOHSA-NEXT: s_lshr_b64 s[0:1], s[2:3], 16
|
||||
; GFX8-NOHSA-NEXT: s_and_b32 s0, s0, 0xff00ff
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s5
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s4
|
||||
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
|
||||
; GFX8-NOHSA-NEXT: s_endpgm
|
||||
;
|
||||
@ -10500,43 +10505,48 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o
|
||||
; GFX6-NOHSA-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX6-NOHSA-NEXT: s_mov_b32 s2, -1
|
||||
; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s8, s6, 0xff00
|
||||
; GFX6-NOHSA-NEXT: s_lshr_b32 s9, s6, 24
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s10, s7, 0xff00
|
||||
; GFX6-NOHSA-NEXT: s_lshr_b32 s11, s7, 24
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s12, s4, 0xff00
|
||||
; GFX6-NOHSA-NEXT: s_lshr_b32 s13, s4, 24
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s14, s5, 0xff00
|
||||
; GFX6-NOHSA-NEXT: s_lshr_b32 s15, s5, 24
|
||||
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s5
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s5, s5, 0xff
|
||||
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xff
|
||||
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s7
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s7, s7, 0xff
|
||||
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s6
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s6, s6, 0xff
|
||||
; GFX6-NOHSA-NEXT: v_alignbit_b32 v0, s15, v0, 16
|
||||
; GFX6-NOHSA-NEXT: s_lshl_b32 s14, s14, 8
|
||||
; GFX6-NOHSA-NEXT: v_alignbit_b32 v1, s13, v1, 16
|
||||
; GFX6-NOHSA-NEXT: s_lshl_b32 s12, s12, 8
|
||||
; GFX6-NOHSA-NEXT: v_alignbit_b32 v2, s11, v2, 16
|
||||
; GFX6-NOHSA-NEXT: s_lshl_b32 s10, s10, 8
|
||||
; GFX6-NOHSA-NEXT: v_alignbit_b32 v4, s9, v3, 16
|
||||
; GFX6-NOHSA-NEXT: s_lshl_b32 s8, s8, 8
|
||||
; GFX6-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0
|
||||
; GFX6-NOHSA-NEXT: s_or_b32 s5, s5, s14
|
||||
; GFX6-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v1
|
||||
; GFX6-NOHSA-NEXT: s_or_b32 s4, s4, s12
|
||||
; GFX6-NOHSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v2
|
||||
; GFX6-NOHSA-NEXT: s_or_b32 s7, s7, s10
|
||||
; GFX6-NOHSA-NEXT: s_or_b32 s6, s6, s8
|
||||
; GFX6-NOHSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v4
|
||||
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s6
|
||||
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s7
|
||||
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
|
||||
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX6-NOHSA-NEXT: s_lshr_b32 s12, s6, 24
|
||||
; GFX6-NOHSA-NEXT: s_lshr_b32 s9, s7, 24
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s13, s6, 0xff00
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s14, s7, 0xff00
|
||||
; GFX6-NOHSA-NEXT: s_lshr_b32 s15, s4, 24
|
||||
; GFX6-NOHSA-NEXT: s_lshr_b32 s11, s5, 24
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s16, s4, 0xff00
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s17, s5, 0xff00
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s18, s5, 0xff
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s19, s4, 0xff
|
||||
; GFX6-NOHSA-NEXT: s_mov_b32 s10, s5
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s20, s7, 0xff
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s21, s6, 0xff
|
||||
; GFX6-NOHSA-NEXT: s_mov_b32 s8, s7
|
||||
; GFX6-NOHSA-NEXT: s_lshl_b32 s17, s17, 8
|
||||
; GFX6-NOHSA-NEXT: s_lshl_b32 s16, s16, 8
|
||||
; GFX6-NOHSA-NEXT: s_lshr_b64 s[10:11], s[10:11], 16
|
||||
; GFX6-NOHSA-NEXT: s_mov_b32 s5, s15
|
||||
; GFX6-NOHSA-NEXT: s_lshr_b64 s[4:5], s[4:5], 16
|
||||
; GFX6-NOHSA-NEXT: s_lshl_b32 s5, s14, 8
|
||||
; GFX6-NOHSA-NEXT: s_lshl_b32 s11, s13, 8
|
||||
; GFX6-NOHSA-NEXT: s_lshr_b64 s[8:9], s[8:9], 16
|
||||
; GFX6-NOHSA-NEXT: s_mov_b32 s7, s12
|
||||
; GFX6-NOHSA-NEXT: s_lshr_b64 s[6:7], s[6:7], 16
|
||||
; GFX6-NOHSA-NEXT: s_or_b32 s7, s18, s17
|
||||
; GFX6-NOHSA-NEXT: s_or_b32 s9, s19, s16
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s10, s10, 0xff00ff
|
||||
; GFX6-NOHSA-NEXT: s_or_b32 s5, s20, s5
|
||||
; GFX6-NOHSA-NEXT: s_or_b32 s11, s21, s11
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s8, s8, 0xff00ff
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s6, s6, 0xff00ff
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xff00ff
|
||||
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s11
|
||||
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s6
|
||||
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s5
|
||||
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s8
|
||||
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
|
||||
; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
|
||||
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s9
|
||||
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s7
|
||||
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s10
|
||||
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
||||
; GFX6-NOHSA-NEXT: s_endpgm
|
||||
;
|
||||
@ -10549,48 +10559,52 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o
|
||||
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
|
||||
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-HSA-NEXT: s_lshr_b32 s13, s5, 24
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s5
|
||||
; GFX7-HSA-NEXT: v_alignbit_b32 v0, s13, v0, 16
|
||||
; GFX7-HSA-NEXT: s_lshr_b32 s11, s4, 24
|
||||
; GFX7-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX7-HSA-NEXT: v_alignbit_b32 v0, s11, v0, 16
|
||||
; GFX7-HSA-NEXT: s_lshr_b32 s9, s7, 24
|
||||
; GFX7-HSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s7
|
||||
; GFX7-HSA-NEXT: v_alignbit_b32 v0, s9, v0, 16
|
||||
; GFX7-HSA-NEXT: s_and_b32 s2, s6, 0xff00
|
||||
; GFX7-HSA-NEXT: s_lshr_b32 s3, s6, 24
|
||||
; GFX7-HSA-NEXT: s_and_b32 s8, s7, 0xff00
|
||||
; GFX7-HSA-NEXT: s_and_b32 s10, s4, 0xff00
|
||||
; GFX7-HSA-NEXT: s_and_b32 s12, s5, 0xff00
|
||||
; GFX7-HSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v0
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6
|
||||
; GFX7-HSA-NEXT: s_and_b32 s5, s5, 0xff
|
||||
; GFX7-HSA-NEXT: s_lshl_b32 s12, s12, 8
|
||||
; GFX7-HSA-NEXT: s_and_b32 s4, s4, 0xff
|
||||
; GFX7-HSA-NEXT: s_lshl_b32 s10, s10, 8
|
||||
; GFX7-HSA-NEXT: s_and_b32 s7, s7, 0xff
|
||||
; GFX7-HSA-NEXT: s_and_b32 s13, s5, 0xff00
|
||||
; GFX7-HSA-NEXT: s_and_b32 s8, s4, 0xff00
|
||||
; GFX7-HSA-NEXT: s_and_b32 s14, s5, 0xff
|
||||
; GFX7-HSA-NEXT: s_lshl_b32 s13, s13, 8
|
||||
; GFX7-HSA-NEXT: s_lshr_b32 s12, s4, 24
|
||||
; GFX7-HSA-NEXT: s_or_b32 s13, s14, s13
|
||||
; GFX7-HSA-NEXT: s_and_b32 s14, s4, 0xff
|
||||
; GFX7-HSA-NEXT: s_lshl_b32 s8, s8, 8
|
||||
; GFX7-HSA-NEXT: v_alignbit_b32 v0, s3, v0, 16
|
||||
; GFX7-HSA-NEXT: s_and_b32 s3, s6, 0xff
|
||||
; GFX7-HSA-NEXT: s_lshr_b32 s9, s5, 24
|
||||
; GFX7-HSA-NEXT: s_or_b32 s14, s14, s8
|
||||
; GFX7-HSA-NEXT: s_mov_b32 s8, s5
|
||||
; GFX7-HSA-NEXT: s_mov_b32 s5, s12
|
||||
; GFX7-HSA-NEXT: s_and_b32 s11, s7, 0xff00
|
||||
; GFX7-HSA-NEXT: s_lshr_b64 s[8:9], s[8:9], 16
|
||||
; GFX7-HSA-NEXT: s_lshr_b64 s[4:5], s[4:5], 16
|
||||
; GFX7-HSA-NEXT: s_and_b32 s2, s6, 0xff00
|
||||
; GFX7-HSA-NEXT: s_and_b32 s5, s7, 0xff
|
||||
; GFX7-HSA-NEXT: s_lshl_b32 s9, s11, 8
|
||||
; GFX7-HSA-NEXT: s_or_b32 s5, s5, s9
|
||||
; GFX7-HSA-NEXT: s_and_b32 s9, s6, 0xff
|
||||
; GFX7-HSA-NEXT: s_lshl_b32 s2, s2, 8
|
||||
; GFX7-HSA-NEXT: s_or_b32 s5, s5, s12
|
||||
; GFX7-HSA-NEXT: s_or_b32 s4, s4, s10
|
||||
; GFX7-HSA-NEXT: s_or_b32 s7, s7, s8
|
||||
; GFX7-HSA-NEXT: s_or_b32 s2, s3, s2
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
|
||||
; GFX7-HSA-NEXT: s_lshr_b32 s10, s6, 24
|
||||
; GFX7-HSA-NEXT: s_lshr_b32 s3, s7, 24
|
||||
; GFX7-HSA-NEXT: s_or_b32 s9, s9, s2
|
||||
; GFX7-HSA-NEXT: s_mov_b32 s2, s7
|
||||
; GFX7-HSA-NEXT: s_lshr_b64 s[2:3], s[2:3], 16
|
||||
; GFX7-HSA-NEXT: s_mov_b32 s7, s10
|
||||
; GFX7-HSA-NEXT: s_and_b32 s11, s2, 0xff00ff
|
||||
; GFX7-HSA-NEXT: s_lshr_b64 s[2:3], s[6:7], 16
|
||||
; GFX7-HSA-NEXT: s_and_b32 s8, s8, 0xff00ff
|
||||
; GFX7-HSA-NEXT: s_and_b32 s4, s4, 0xff00ff
|
||||
; GFX7-HSA-NEXT: s_and_b32 s2, s2, 0xff00ff
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16
|
||||
; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s3
|
||||
; GFX7-HSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v0
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s7
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s2
|
||||
; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s9
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s5
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s11
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
|
||||
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s14
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s13
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s8
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
|
||||
; GFX7-HSA-NEXT: s_endpgm
|
||||
@ -10601,50 +10615,52 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o
|
||||
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOHSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
|
||||
; GFX8-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX8-NOHSA-NEXT: s_lshr_b32 s3, s4, 24
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s3, v0, 16
|
||||
; GFX8-NOHSA-NEXT: s_and_b32 s3, s4, 0xff
|
||||
; GFX8-NOHSA-NEXT: s_lshl_b32 s4, s4, 8
|
||||
; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s5, 24
|
||||
; GFX8-NOHSA-NEXT: s_and_b32 s4, s4, 0xff0000
|
||||
; GFX8-NOHSA-NEXT: s_lshr_b32 s3, s5, 24
|
||||
; GFX8-NOHSA-NEXT: s_bfe_u32 s9, s5, 0x80010
|
||||
; GFX8-NOHSA-NEXT: s_lshl_b32 s8, s8, 16
|
||||
; GFX8-NOHSA-NEXT: s_or_b32 s4, s3, s4
|
||||
; GFX8-NOHSA-NEXT: s_lshr_b32 s3, s7, 24
|
||||
; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s6, 24
|
||||
; GFX8-NOHSA-NEXT: s_or_b32 s8, s9, s8
|
||||
; GFX8-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
|
||||
; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s3, 16
|
||||
; GFX8-NOHSA-NEXT: s_bfe_u32 s9, s7, 0x80010
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
|
||||
; GFX8-NOHSA-NEXT: s_and_b32 s10, s5, 0xff
|
||||
; GFX8-NOHSA-NEXT: s_lshl_b32 s5, s5, 8
|
||||
; GFX8-NOHSA-NEXT: s_or_b32 s3, s9, s3
|
||||
; GFX8-NOHSA-NEXT: s_and_b32 s9, s7, 0xff
|
||||
; GFX8-NOHSA-NEXT: s_lshl_b32 s7, s7, 8
|
||||
; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s2, v0, 16
|
||||
; GFX8-NOHSA-NEXT: s_and_b32 s2, s6, 0xff
|
||||
; GFX8-NOHSA-NEXT: s_lshl_b32 s6, s6, 8
|
||||
; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s3, 16
|
||||
; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s4, 24
|
||||
; GFX8-NOHSA-NEXT: s_and_b32 s5, s5, 0xff0000
|
||||
; GFX8-NOHSA-NEXT: s_and_b32 s7, s7, 0xff0000
|
||||
; GFX8-NOHSA-NEXT: s_and_b32 s6, s6, 0xff0000
|
||||
; GFX8-NOHSA-NEXT: s_or_b32 s5, s10, s5
|
||||
; GFX8-NOHSA-NEXT: s_or_b32 s7, s9, s7
|
||||
; GFX8-NOHSA-NEXT: s_or_b32 s2, s2, s6
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX8-NOHSA-NEXT: s_or_b32 s9, s9, s3
|
||||
; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s4, 8
|
||||
; GFX8-NOHSA-NEXT: s_and_b32 s11, s4, 0xff
|
||||
; GFX8-NOHSA-NEXT: s_or_b32 s10, s10, s5
|
||||
; GFX8-NOHSA-NEXT: s_and_b32 s3, s3, 0xff0000
|
||||
; GFX8-NOHSA-NEXT: s_mov_b32 s5, s2
|
||||
; GFX8-NOHSA-NEXT: s_or_b32 s11, s11, s3
|
||||
; GFX8-NOHSA-NEXT: s_lshr_b64 s[2:3], s[4:5], 16
|
||||
; GFX8-NOHSA-NEXT: s_and_b32 s4, s2, 0xff00ff
|
||||
; GFX8-NOHSA-NEXT: s_lshr_b32 s2, s7, 24
|
||||
; GFX8-NOHSA-NEXT: s_lshl_b32 s2, s2, 16
|
||||
; GFX8-NOHSA-NEXT: s_bfe_u32 s3, s7, 0x80010
|
||||
; GFX8-NOHSA-NEXT: s_or_b32 s5, s3, s2
|
||||
; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s7, 8
|
||||
; GFX8-NOHSA-NEXT: s_and_b32 s2, s7, 0xff
|
||||
; GFX8-NOHSA-NEXT: s_and_b32 s3, s3, 0xff0000
|
||||
; GFX8-NOHSA-NEXT: s_lshr_b32 s8, s6, 24
|
||||
; GFX8-NOHSA-NEXT: s_or_b32 s12, s2, s3
|
||||
; GFX8-NOHSA-NEXT: s_lshl_b32 s3, s6, 8
|
||||
; GFX8-NOHSA-NEXT: s_and_b32 s2, s6, 0xff
|
||||
; GFX8-NOHSA-NEXT: s_and_b32 s3, s3, 0xff0000
|
||||
; GFX8-NOHSA-NEXT: s_mov_b32 s7, s8
|
||||
; GFX8-NOHSA-NEXT: s_or_b32 s13, s2, s3
|
||||
; GFX8-NOHSA-NEXT: s_lshr_b64 s[2:3], s[6:7], 16
|
||||
; GFX8-NOHSA-NEXT: s_and_b32 s2, s2, 0xff00ff
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX8-NOHSA-NEXT: s_add_u32 s2, s0, 16
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
|
||||
; GFX8-NOHSA-NEXT: s_addc_u32 s3, s1, 0
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s3
|
||||
; GFX8-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s7
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s2
|
||||
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[2:5]
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s3
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s13
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s5
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s2
|
||||
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s5
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s8
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s11
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s10
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s9
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
|
||||
; GFX8-NOHSA-NEXT: s_endpgm
|
||||
@ -11272,81 +11288,92 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o
|
||||
; GFX6-NOHSA-NEXT: s_mov_b32 s11, 0xf000
|
||||
; GFX6-NOHSA-NEXT: s_mov_b32 s10, -1
|
||||
; GFX6-NOHSA-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s12, s6, 0xff00
|
||||
; GFX6-NOHSA-NEXT: s_lshr_b32 s13, s6, 24
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s14, s7, 0xff00
|
||||
; GFX6-NOHSA-NEXT: s_lshr_b32 s15, s7, 24
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s16, s4, 0xff00
|
||||
; GFX6-NOHSA-NEXT: s_lshr_b32 s17, s4, 24
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s18, s5, 0xff00
|
||||
; GFX6-NOHSA-NEXT: s_lshr_b32 s19, s5, 24
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s20, s2, 0xff00
|
||||
; GFX6-NOHSA-NEXT: s_lshr_b32 s21, s2, 24
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s22, s3, 0xff00
|
||||
; GFX6-NOHSA-NEXT: s_lshr_b32 s23, s3, 24
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s24, s0, 0xff00
|
||||
; GFX6-NOHSA-NEXT: s_lshr_b32 s25, s0, 24
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s26, s1, 0xff00
|
||||
; GFX6-NOHSA-NEXT: s_lshr_b32 s27, s1, 24
|
||||
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s1, s1, 0xff
|
||||
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s0, s0, 0xff
|
||||
; GFX6-NOHSA-NEXT: s_lshr_b32 s20, s6, 24
|
||||
; GFX6-NOHSA-NEXT: s_lshr_b32 s13, s7, 24
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s21, s6, 0xff00
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s22, s7, 0xff00
|
||||
; GFX6-NOHSA-NEXT: s_lshr_b32 s23, s4, 24
|
||||
; GFX6-NOHSA-NEXT: s_lshr_b32 s15, s5, 24
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s24, s4, 0xff00
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s25, s5, 0xff00
|
||||
; GFX6-NOHSA-NEXT: s_lshr_b32 s26, s2, 24
|
||||
; GFX6-NOHSA-NEXT: s_lshr_b32 s17, s3, 24
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s27, s2, 0xff00
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s28, s3, 0xff00
|
||||
; GFX6-NOHSA-NEXT: s_lshr_b32 s29, s0, 24
|
||||
; GFX6-NOHSA-NEXT: s_lshr_b32 s19, s1, 24
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s30, s0, 0xff00
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s31, s1, 0xff00
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s33, s1, 0xff
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s34, s0, 0xff
|
||||
; GFX6-NOHSA-NEXT: s_mov_b32 s18, s1
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s35, s3, 0xff
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s36, s2, 0xff
|
||||
; GFX6-NOHSA-NEXT: s_mov_b32 s16, s3
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s37, s5, 0xff
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s38, s4, 0xff
|
||||
; GFX6-NOHSA-NEXT: s_mov_b32 s14, s5
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s39, s7, 0xff
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s40, s6, 0xff
|
||||
; GFX6-NOHSA-NEXT: s_mov_b32 s12, s7
|
||||
; GFX6-NOHSA-NEXT: s_lshl_b32 s31, s31, 8
|
||||
; GFX6-NOHSA-NEXT: s_lshl_b32 s30, s30, 8
|
||||
; GFX6-NOHSA-NEXT: s_lshr_b64 s[18:19], s[18:19], 16
|
||||
; GFX6-NOHSA-NEXT: s_mov_b32 s1, s29
|
||||
; GFX6-NOHSA-NEXT: s_lshr_b64 s[0:1], s[0:1], 16
|
||||
; GFX6-NOHSA-NEXT: s_lshl_b32 s1, s28, 8
|
||||
; GFX6-NOHSA-NEXT: s_lshl_b32 s19, s27, 8
|
||||
; GFX6-NOHSA-NEXT: s_lshr_b64 s[16:17], s[16:17], 16
|
||||
; GFX6-NOHSA-NEXT: s_mov_b32 s3, s26
|
||||
; GFX6-NOHSA-NEXT: s_lshr_b64 s[2:3], s[2:3], 16
|
||||
; GFX6-NOHSA-NEXT: s_lshl_b32 s3, s25, 8
|
||||
; GFX6-NOHSA-NEXT: s_lshl_b32 s17, s24, 8
|
||||
; GFX6-NOHSA-NEXT: s_lshr_b64 s[14:15], s[14:15], 16
|
||||
; GFX6-NOHSA-NEXT: s_mov_b32 s5, s23
|
||||
; GFX6-NOHSA-NEXT: s_lshr_b64 s[4:5], s[4:5], 16
|
||||
; GFX6-NOHSA-NEXT: s_lshl_b32 s5, s22, 8
|
||||
; GFX6-NOHSA-NEXT: s_lshl_b32 s15, s21, 8
|
||||
; GFX6-NOHSA-NEXT: s_lshr_b64 s[12:13], s[12:13], 16
|
||||
; GFX6-NOHSA-NEXT: s_mov_b32 s7, s20
|
||||
; GFX6-NOHSA-NEXT: s_lshr_b64 s[6:7], s[6:7], 16
|
||||
; GFX6-NOHSA-NEXT: s_or_b32 s7, s33, s31
|
||||
; GFX6-NOHSA-NEXT: s_or_b32 s13, s34, s30
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s18, s18, 0xff00ff
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s0, s0, 0xff00ff
|
||||
; GFX6-NOHSA-NEXT: s_or_b32 s1, s35, s1
|
||||
; GFX6-NOHSA-NEXT: s_or_b32 s19, s36, s19
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s16, s16, 0xff00ff
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s2, s2, 0xff00ff
|
||||
; GFX6-NOHSA-NEXT: s_or_b32 s3, s37, s3
|
||||
; GFX6-NOHSA-NEXT: s_or_b32 s17, s38, s17
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s14, s14, 0xff00ff
|
||||
; GFX6-NOHSA-NEXT: s_or_b32 s5, s39, s5
|
||||
; GFX6-NOHSA-NEXT: s_or_b32 s15, s40, s15
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s12, s12, 0xff00ff
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s6, s6, 0xff00ff
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xff00ff
|
||||
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s15
|
||||
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s6
|
||||
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s5
|
||||
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s12
|
||||
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:48
|
||||
; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
|
||||
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s17
|
||||
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s3
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s3, s3, 0xff
|
||||
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s2
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s2, s2, 0xff
|
||||
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s5
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s5, s5, 0xff
|
||||
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v5, s4
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s4, s4, 0xff
|
||||
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s7
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s7, s7, 0xff
|
||||
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v7, s6
|
||||
; GFX6-NOHSA-NEXT: s_and_b32 s6, s6, 0xff
|
||||
; GFX6-NOHSA-NEXT: v_alignbit_b32 v0, s27, v0, 16
|
||||
; GFX6-NOHSA-NEXT: s_lshl_b32 s26, s26, 8
|
||||
; GFX6-NOHSA-NEXT: v_alignbit_b32 v1, s25, v1, 16
|
||||
; GFX6-NOHSA-NEXT: s_lshl_b32 s24, s24, 8
|
||||
; GFX6-NOHSA-NEXT: v_alignbit_b32 v2, s23, v2, 16
|
||||
; GFX6-NOHSA-NEXT: s_lshl_b32 s22, s22, 8
|
||||
; GFX6-NOHSA-NEXT: v_alignbit_b32 v8, s21, v3, 16
|
||||
; GFX6-NOHSA-NEXT: s_lshl_b32 s20, s20, 8
|
||||
; GFX6-NOHSA-NEXT: v_alignbit_b32 v4, s19, v4, 16
|
||||
; GFX6-NOHSA-NEXT: s_lshl_b32 s18, s18, 8
|
||||
; GFX6-NOHSA-NEXT: v_alignbit_b32 v9, s17, v5, 16
|
||||
; GFX6-NOHSA-NEXT: s_lshl_b32 s16, s16, 8
|
||||
; GFX6-NOHSA-NEXT: v_alignbit_b32 v6, s15, v6, 16
|
||||
; GFX6-NOHSA-NEXT: s_lshl_b32 s14, s14, 8
|
||||
; GFX6-NOHSA-NEXT: v_alignbit_b32 v10, s13, v7, 16
|
||||
; GFX6-NOHSA-NEXT: s_lshl_b32 s12, s12, 8
|
||||
; GFX6-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0
|
||||
; GFX6-NOHSA-NEXT: s_or_b32 s1, s1, s26
|
||||
; GFX6-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v1
|
||||
; GFX6-NOHSA-NEXT: s_or_b32 s0, s0, s24
|
||||
; GFX6-NOHSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v2
|
||||
; GFX6-NOHSA-NEXT: s_or_b32 s3, s3, s22
|
||||
; GFX6-NOHSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v8
|
||||
; GFX6-NOHSA-NEXT: s_or_b32 s2, s2, s20
|
||||
; GFX6-NOHSA-NEXT: v_and_b32_e32 v11, 0xff00ff, v4
|
||||
; GFX6-NOHSA-NEXT: s_or_b32 s5, s5, s18
|
||||
; GFX6-NOHSA-NEXT: v_and_b32_e32 v9, 0xff00ff, v9
|
||||
; GFX6-NOHSA-NEXT: s_or_b32 s4, s4, s16
|
||||
; GFX6-NOHSA-NEXT: v_and_b32_e32 v15, 0xff00ff, v6
|
||||
; GFX6-NOHSA-NEXT: s_or_b32 s7, s7, s14
|
||||
; GFX6-NOHSA-NEXT: s_or_b32 s6, s6, s12
|
||||
; GFX6-NOHSA-NEXT: v_and_b32_e32 v13, 0xff00ff, v10
|
||||
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v12, s6
|
||||
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v14, s7
|
||||
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[12:15], off, s[8:11], 0 offset:48
|
||||
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v8, s4
|
||||
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v10, s5
|
||||
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[8:11], off, s[8:11], 0 offset:32
|
||||
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v4, s2
|
||||
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v6, s3
|
||||
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16
|
||||
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s14
|
||||
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:32
|
||||
; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
|
||||
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s19
|
||||
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s16
|
||||
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 offset:16
|
||||
; GFX6-NOHSA-NEXT: s_waitcnt expcnt(0)
|
||||
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v0, s13
|
||||
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v2, s7
|
||||
; GFX6-NOHSA-NEXT: v_mov_b32_e32 v3, s18
|
||||
; GFX6-NOHSA-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
|
||||
; GFX6-NOHSA-NEXT: s_endpgm
|
||||
;
|
||||
@ -11354,99 +11381,106 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o
|
||||
; GFX7-HSA: ; %bb.0:
|
||||
; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
|
||||
; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
|
||||
; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
|
||||
; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
|
||||
; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
|
||||
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
|
||||
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-HSA-NEXT: s_lshr_b32 s25, s1, 24
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s1
|
||||
; GFX7-HSA-NEXT: v_alignbit_b32 v0, s25, v0, 16
|
||||
; GFX7-HSA-NEXT: s_lshr_b32 s23, s0, 24
|
||||
; GFX7-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX7-HSA-NEXT: v_alignbit_b32 v0, s23, v0, 16
|
||||
; GFX7-HSA-NEXT: s_lshr_b32 s21, s3, 24
|
||||
; GFX7-HSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s3
|
||||
; GFX7-HSA-NEXT: v_alignbit_b32 v0, s21, v0, 16
|
||||
; GFX7-HSA-NEXT: s_lshr_b32 s19, s2, 24
|
||||
; GFX7-HSA-NEXT: s_and_b32 s24, s1, 0xff00
|
||||
; GFX7-HSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v0
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX7-HSA-NEXT: s_and_b32 s22, s0, 0xff00
|
||||
; GFX7-HSA-NEXT: s_and_b32 s1, s1, 0xff
|
||||
; GFX7-HSA-NEXT: s_lshl_b32 s24, s24, 8
|
||||
; GFX7-HSA-NEXT: v_alignbit_b32 v0, s19, v0, 16
|
||||
; GFX7-HSA-NEXT: s_lshr_b32 s17, s5, 24
|
||||
; GFX7-HSA-NEXT: s_and_b32 s22, s1, 0xff00
|
||||
; GFX7-HSA-NEXT: s_and_b32 s12, s0, 0xff00
|
||||
; GFX7-HSA-NEXT: s_and_b32 s23, s1, 0xff
|
||||
; GFX7-HSA-NEXT: s_lshl_b32 s22, s22, 8
|
||||
; GFX7-HSA-NEXT: s_lshr_b32 s21, s0, 24
|
||||
; GFX7-HSA-NEXT: s_or_b32 s22, s23, s22
|
||||
; GFX7-HSA-NEXT: s_and_b32 s23, s0, 0xff
|
||||
; GFX7-HSA-NEXT: s_lshl_b32 s12, s12, 8
|
||||
; GFX7-HSA-NEXT: s_lshr_b32 s13, s1, 24
|
||||
; GFX7-HSA-NEXT: s_or_b32 s23, s23, s12
|
||||
; GFX7-HSA-NEXT: s_mov_b32 s12, s1
|
||||
; GFX7-HSA-NEXT: s_mov_b32 s1, s21
|
||||
; GFX7-HSA-NEXT: s_and_b32 s20, s3, 0xff00
|
||||
; GFX7-HSA-NEXT: s_or_b32 s24, s1, s24
|
||||
; GFX7-HSA-NEXT: s_and_b32 s0, s0, 0xff
|
||||
; GFX7-HSA-NEXT: s_lshl_b32 s1, s22, 8
|
||||
; GFX7-HSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v0
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s5
|
||||
; GFX7-HSA-NEXT: s_and_b32 s18, s2, 0xff00
|
||||
; GFX7-HSA-NEXT: s_or_b32 s22, s0, s1
|
||||
; GFX7-HSA-NEXT: s_lshr_b64 s[12:13], s[12:13], 16
|
||||
; GFX7-HSA-NEXT: s_lshr_b64 s[0:1], s[0:1], 16
|
||||
; GFX7-HSA-NEXT: s_and_b32 s19, s2, 0xff00
|
||||
; GFX7-HSA-NEXT: s_and_b32 s13, s0, 0xff00ff
|
||||
; GFX7-HSA-NEXT: s_and_b32 s0, s3, 0xff
|
||||
; GFX7-HSA-NEXT: s_lshl_b32 s1, s20, 8
|
||||
; GFX7-HSA-NEXT: v_alignbit_b32 v0, s17, v0, 16
|
||||
; GFX7-HSA-NEXT: s_lshr_b32 s15, s4, 24
|
||||
; GFX7-HSA-NEXT: s_and_b32 s16, s5, 0xff00
|
||||
; GFX7-HSA-NEXT: s_or_b32 s3, s0, s1
|
||||
; GFX7-HSA-NEXT: s_or_b32 s20, s0, s1
|
||||
; GFX7-HSA-NEXT: s_and_b32 s0, s2, 0xff
|
||||
; GFX7-HSA-NEXT: s_lshl_b32 s1, s18, 8
|
||||
; GFX7-HSA-NEXT: v_and_b32_e32 v11, 0xff00ff, v0
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX7-HSA-NEXT: s_and_b32 s14, s4, 0xff00
|
||||
; GFX7-HSA-NEXT: s_or_b32 s2, s0, s1
|
||||
; GFX7-HSA-NEXT: s_lshl_b32 s1, s19, 8
|
||||
; GFX7-HSA-NEXT: s_lshr_b32 s18, s2, 24
|
||||
; GFX7-HSA-NEXT: s_or_b32 s19, s0, s1
|
||||
; GFX7-HSA-NEXT: s_lshr_b32 s1, s3, 24
|
||||
; GFX7-HSA-NEXT: s_mov_b32 s0, s3
|
||||
; GFX7-HSA-NEXT: s_lshr_b64 s[0:1], s[0:1], 16
|
||||
; GFX7-HSA-NEXT: s_mov_b32 s3, s18
|
||||
; GFX7-HSA-NEXT: s_and_b32 s17, s5, 0xff00
|
||||
; GFX7-HSA-NEXT: s_and_b32 s21, s0, 0xff00ff
|
||||
; GFX7-HSA-NEXT: s_lshr_b64 s[0:1], s[2:3], 16
|
||||
; GFX7-HSA-NEXT: s_and_b32 s10, s4, 0xff00
|
||||
; GFX7-HSA-NEXT: s_and_b32 s2, s0, 0xff00ff
|
||||
; GFX7-HSA-NEXT: s_and_b32 s0, s5, 0xff
|
||||
; GFX7-HSA-NEXT: s_lshl_b32 s1, s16, 8
|
||||
; GFX7-HSA-NEXT: v_alignbit_b32 v0, s15, v0, 16
|
||||
; GFX7-HSA-NEXT: s_and_b32 s12, s7, 0xff00
|
||||
; GFX7-HSA-NEXT: s_lshr_b32 s13, s7, 24
|
||||
; GFX7-HSA-NEXT: s_or_b32 s5, s0, s1
|
||||
; GFX7-HSA-NEXT: v_and_b32_e32 v9, 0xff00ff, v0
|
||||
; GFX7-HSA-NEXT: s_lshl_b32 s1, s17, 8
|
||||
; GFX7-HSA-NEXT: s_lshr_b32 s16, s4, 24
|
||||
; GFX7-HSA-NEXT: s_lshr_b32 s11, s5, 24
|
||||
; GFX7-HSA-NEXT: s_or_b32 s3, s0, s1
|
||||
; GFX7-HSA-NEXT: s_and_b32 s0, s4, 0xff
|
||||
; GFX7-HSA-NEXT: s_lshl_b32 s1, s14, 8
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s7
|
||||
; GFX7-HSA-NEXT: s_and_b32 s10, s6, 0xff00
|
||||
; GFX7-HSA-NEXT: s_or_b32 s4, s0, s1
|
||||
; GFX7-HSA-NEXT: v_alignbit_b32 v0, s13, v0, 16
|
||||
; GFX7-HSA-NEXT: s_lshl_b32 s1, s10, 8
|
||||
; GFX7-HSA-NEXT: s_mov_b32 s10, s5
|
||||
; GFX7-HSA-NEXT: s_or_b32 s17, s0, s1
|
||||
; GFX7-HSA-NEXT: s_lshr_b64 s[0:1], s[10:11], 16
|
||||
; GFX7-HSA-NEXT: s_mov_b32 s5, s16
|
||||
; GFX7-HSA-NEXT: s_and_b32 s15, s7, 0xff00
|
||||
; GFX7-HSA-NEXT: s_and_b32 s10, s0, 0xff00ff
|
||||
; GFX7-HSA-NEXT: s_lshr_b64 s[0:1], s[4:5], 16
|
||||
; GFX7-HSA-NEXT: s_and_b32 s14, s6, 0xff00
|
||||
; GFX7-HSA-NEXT: s_and_b32 s4, s0, 0xff00ff
|
||||
; GFX7-HSA-NEXT: s_and_b32 s0, s7, 0xff
|
||||
; GFX7-HSA-NEXT: s_lshl_b32 s1, s12, 8
|
||||
; GFX7-HSA-NEXT: s_lshr_b32 s11, s6, 24
|
||||
; GFX7-HSA-NEXT: v_and_b32_e32 v15, 0xff00ff, v0
|
||||
; GFX7-HSA-NEXT: s_or_b32 s0, s0, s1
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6
|
||||
; GFX7-HSA-NEXT: s_and_b32 s1, s6, 0xff
|
||||
; GFX7-HSA-NEXT: s_lshl_b32 s6, s10, 8
|
||||
; GFX7-HSA-NEXT: s_or_b32 s1, s1, s6
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s0
|
||||
; GFX7-HSA-NEXT: s_lshl_b32 s1, s15, 8
|
||||
; GFX7-HSA-NEXT: s_or_b32 s5, s0, s1
|
||||
; GFX7-HSA-NEXT: s_and_b32 s0, s6, 0xff
|
||||
; GFX7-HSA-NEXT: s_lshl_b32 s1, s14, 8
|
||||
; GFX7-HSA-NEXT: s_or_b32 s11, s0, s1
|
||||
; GFX7-HSA-NEXT: s_lshr_b32 s1, s7, 24
|
||||
; GFX7-HSA-NEXT: s_mov_b32 s0, s7
|
||||
; GFX7-HSA-NEXT: s_lshr_b64 s[0:1], s[0:1], 16
|
||||
; GFX7-HSA-NEXT: s_lshr_b32 s7, s6, 24
|
||||
; GFX7-HSA-NEXT: s_and_b32 s14, s0, 0xff00ff
|
||||
; GFX7-HSA-NEXT: s_lshr_b64 s[0:1], s[6:7], 16
|
||||
; GFX7-HSA-NEXT: s_and_b32 s12, s12, 0xff00ff
|
||||
; GFX7-HSA-NEXT: s_and_b32 s0, s0, 0xff00ff
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX7-HSA-NEXT: s_add_u32 s0, s8, 48
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s1
|
||||
; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v17, s1
|
||||
; GFX7-HSA-NEXT: v_alignbit_b32 v0, s11, v0, 16
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v16, s0
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GFX7-HSA-NEXT: s_add_u32 s0, s8, 32
|
||||
; GFX7-HSA-NEXT: v_and_b32_e32 v13, 0xff00ff, v0
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s11
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s5
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s14
|
||||
; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0
|
||||
; GFX7-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s4
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v13, s1
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v12, s0
|
||||
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GFX7-HSA-NEXT: s_add_u32 s0, s8, 16
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v10, s5
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s17
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s10
|
||||
; GFX7-HSA-NEXT: s_addc_u32 s1, s9, 0
|
||||
; GFX7-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v9, s1
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v6, s3
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v8, s0
|
||||
; GFX7-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s22
|
||||
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s19
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s20
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s21
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s8
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s24
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s23
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s13
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s22
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s12
|
||||
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s9
|
||||
; GFX7-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
|
||||
; GFX7-HSA-NEXT: s_endpgm
|
||||
@ -11463,90 +11497,94 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o
|
||||
; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s1, 8
|
||||
; GFX8-NOHSA-NEXT: s_lshl_b32 s14, s14, 16
|
||||
; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xff0000
|
||||
; GFX8-NOHSA-NEXT: s_lshr_b32 s13, s0, 24
|
||||
; GFX8-NOHSA-NEXT: s_or_b32 s14, s15, s14
|
||||
; GFX8-NOHSA-NEXT: s_or_b32 s15, s16, s1
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GFX8-NOHSA-NEXT: s_lshl_b32 s16, s0, 8
|
||||
; GFX8-NOHSA-NEXT: s_lshr_b32 s13, s0, 24
|
||||
; GFX8-NOHSA-NEXT: s_and_b32 s1, s0, 0xff
|
||||
; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 8
|
||||
; GFX8-NOHSA-NEXT: s_and_b32 s0, s0, 0xff0000
|
||||
; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s13, v0, 16
|
||||
; GFX8-NOHSA-NEXT: s_or_b32 s13, s1, s0
|
||||
; GFX8-NOHSA-NEXT: s_and_b32 s16, s16, 0xff0000
|
||||
; GFX8-NOHSA-NEXT: s_or_b32 s16, s1, s16
|
||||
; GFX8-NOHSA-NEXT: s_mov_b32 s1, s13
|
||||
; GFX8-NOHSA-NEXT: s_lshr_b64 s[0:1], s[0:1], 16
|
||||
; GFX8-NOHSA-NEXT: s_and_b32 s13, s0, 0xff00ff
|
||||
; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s3, 24
|
||||
; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 16
|
||||
; GFX8-NOHSA-NEXT: s_bfe_u32 s1, s3, 0x80010
|
||||
; GFX8-NOHSA-NEXT: s_or_b32 s16, s1, s0
|
||||
; GFX8-NOHSA-NEXT: s_or_b32 s17, s1, s0
|
||||
; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s3, 8
|
||||
; GFX8-NOHSA-NEXT: s_and_b32 s0, s3, 0xff
|
||||
; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xff0000
|
||||
; GFX8-NOHSA-NEXT: s_or_b32 s3, s0, s1
|
||||
; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s2, 24
|
||||
; GFX8-NOHSA-NEXT: s_or_b32 s18, s0, s1
|
||||
; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s2, 8
|
||||
; GFX8-NOHSA-NEXT: s_and_b32 s0, s2, 0xff
|
||||
; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xff0000
|
||||
; GFX8-NOHSA-NEXT: s_lshr_b32 s12, s2, 24
|
||||
; GFX8-NOHSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GFX8-NOHSA-NEXT: s_or_b32 s2, s0, s1
|
||||
; GFX8-NOHSA-NEXT: s_mov_b32 s3, s12
|
||||
; GFX8-NOHSA-NEXT: s_or_b32 s19, s0, s1
|
||||
; GFX8-NOHSA-NEXT: s_lshr_b64 s[0:1], s[2:3], 16
|
||||
; GFX8-NOHSA-NEXT: s_and_b32 s2, s0, 0xff00ff
|
||||
; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s5, 24
|
||||
; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 16
|
||||
; GFX8-NOHSA-NEXT: s_bfe_u32 s1, s5, 0x80010
|
||||
; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s12, v0, 16
|
||||
; GFX8-NOHSA-NEXT: s_or_b32 s12, s1, s0
|
||||
; GFX8-NOHSA-NEXT: s_or_b32 s3, s1, s0
|
||||
; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s5, 8
|
||||
; GFX8-NOHSA-NEXT: s_and_b32 s0, s5, 0xff
|
||||
; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xff0000
|
||||
; GFX8-NOHSA-NEXT: s_or_b32 s5, s0, s1
|
||||
; GFX8-NOHSA-NEXT: s_lshr_b32 s11, s4, 24
|
||||
; GFX8-NOHSA-NEXT: s_or_b32 s12, s0, s1
|
||||
; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s4, 8
|
||||
; GFX8-NOHSA-NEXT: s_and_b32 s0, s4, 0xff
|
||||
; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xff0000
|
||||
; GFX8-NOHSA-NEXT: s_lshr_b32 s11, s4, 24
|
||||
; GFX8-NOHSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v0
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s4
|
||||
; GFX8-NOHSA-NEXT: s_or_b32 s4, s0, s1
|
||||
; GFX8-NOHSA-NEXT: s_mov_b32 s5, s11
|
||||
; GFX8-NOHSA-NEXT: s_or_b32 s20, s0, s1
|
||||
; GFX8-NOHSA-NEXT: s_lshr_b64 s[0:1], s[4:5], 16
|
||||
; GFX8-NOHSA-NEXT: s_and_b32 s4, s0, 0xff00ff
|
||||
; GFX8-NOHSA-NEXT: s_lshr_b32 s0, s7, 24
|
||||
; GFX8-NOHSA-NEXT: s_lshl_b32 s0, s0, 16
|
||||
; GFX8-NOHSA-NEXT: s_bfe_u32 s1, s7, 0x80010
|
||||
; GFX8-NOHSA-NEXT: s_or_b32 s0, s1, s0
|
||||
; GFX8-NOHSA-NEXT: s_and_b32 s1, s7, 0xff
|
||||
; GFX8-NOHSA-NEXT: s_lshl_b32 s7, s7, 8
|
||||
; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s11, v0, 16
|
||||
; GFX8-NOHSA-NEXT: s_and_b32 s7, s7, 0xff0000
|
||||
; GFX8-NOHSA-NEXT: s_or_b32 s5, s1, s0
|
||||
; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s7, 8
|
||||
; GFX8-NOHSA-NEXT: s_and_b32 s0, s7, 0xff
|
||||
; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xff0000
|
||||
; GFX8-NOHSA-NEXT: s_lshr_b32 s10, s6, 24
|
||||
; GFX8-NOHSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v0
|
||||
; GFX8-NOHSA-NEXT: s_or_b32 s1, s1, s7
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s6
|
||||
; GFX8-NOHSA-NEXT: s_and_b32 s7, s6, 0xff
|
||||
; GFX8-NOHSA-NEXT: s_lshl_b32 s6, s6, 8
|
||||
; GFX8-NOHSA-NEXT: s_and_b32 s6, s6, 0xff0000
|
||||
; GFX8-NOHSA-NEXT: s_or_b32 s6, s7, s6
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s0
|
||||
; GFX8-NOHSA-NEXT: s_or_b32 s11, s0, s1
|
||||
; GFX8-NOHSA-NEXT: s_lshl_b32 s1, s6, 8
|
||||
; GFX8-NOHSA-NEXT: s_and_b32 s0, s6, 0xff
|
||||
; GFX8-NOHSA-NEXT: s_and_b32 s1, s1, 0xff0000
|
||||
; GFX8-NOHSA-NEXT: s_mov_b32 s7, s10
|
||||
; GFX8-NOHSA-NEXT: s_or_b32 s21, s0, s1
|
||||
; GFX8-NOHSA-NEXT: s_lshr_b64 s[0:1], s[6:7], 16
|
||||
; GFX8-NOHSA-NEXT: s_and_b32 s0, s0, 0xff00ff
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 48
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s1
|
||||
; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v11, s1
|
||||
; GFX8-NOHSA-NEXT: v_alignbit_b32 v0, s10, v0, 16
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v10, s0
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 32
|
||||
; GFX8-NOHSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v0
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s6
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s21
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s11
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s5
|
||||
; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0
|
||||
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[10:11], v[6:9]
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s4
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v9, s1
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v8, s0
|
||||
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GFX8-NOHSA-NEXT: s_add_u32 s0, s8, 16
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s5
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s12
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s20
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s12
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s3
|
||||
; GFX8-NOHSA-NEXT: s_addc_u32 s1, s9, 0
|
||||
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s2
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v7, s1
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s3
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s16
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v6, s0
|
||||
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[6:7], v[2:5]
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s13
|
||||
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s1
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s19
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s18
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s17
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GFX8-NOHSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v4, s8
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v0, s16
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v1, s13
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v2, s15
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v3, s14
|
||||
; GFX8-NOHSA-NEXT: v_mov_b32_e32 v5, s9
|
||||
|
||||
@ -9828,14 +9828,14 @@ define amdgpu_kernel void @global_zextload_v4i8_to_v4i16(ptr addrspace(1) %out,
|
||||
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
|
||||
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
|
||||
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v1, 0xff00, v0
|
||||
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 24, v0
|
||||
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 24, v0
|
||||
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xff00, v0
|
||||
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v3, 0xff, v0
|
||||
; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v0, v2, v0, 16
|
||||
; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v2, 8, v1
|
||||
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
|
||||
; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v0, v3, v2
|
||||
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
|
||||
; GCN-NOHSA-SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
|
||||
; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v1, v3, v2
|
||||
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xff00ff, v0
|
||||
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx2 v[1:2], off, s[4:7], 0
|
||||
; GCN-NOHSA-SI-NEXT: s_endpgm
|
||||
;
|
||||
; GCN-HSA-LABEL: global_zextload_v4i8_to_v4i16:
|
||||
@ -9847,18 +9847,18 @@ define amdgpu_kernel void @global_zextload_v4i8_to_v4i16(ptr addrspace(1) %out,
|
||||
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GCN-HSA-NEXT: flat_load_dword v2, v[0:1]
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GCN-HSA-NEXT: flat_load_dword v0, v[0:1]
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xff00, v2
|
||||
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 24, v2
|
||||
; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xff, v2
|
||||
; GCN-HSA-NEXT: v_alignbit_b32 v2, v4, v2, 16
|
||||
; GCN-HSA-NEXT: v_lshlrev_b32_e32 v4, 8, v3
|
||||
; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v2
|
||||
; GCN-HSA-NEXT: v_or_b32_e32 v2, v5, v4
|
||||
; GCN-HSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
||||
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 24, v0
|
||||
; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xff00, v0
|
||||
; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xff, v0
|
||||
; GCN-HSA-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
|
||||
; GCN-HSA-NEXT: v_lshlrev_b32_e32 v4, 8, v4
|
||||
; GCN-HSA-NEXT: v_or_b32_e32 v4, v5, v4
|
||||
; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v0
|
||||
; GCN-HSA-NEXT: flat_store_dwordx2 v[2:3], v[4:5]
|
||||
; GCN-HSA-NEXT: s_endpgm
|
||||
;
|
||||
; GCN-NOHSA-VI-LABEL: global_zextload_v4i8_to_v4i16:
|
||||
@ -9877,10 +9877,10 @@ define amdgpu_kernel void @global_zextload_v4i8_to_v4i16(ptr addrspace(1) %out,
|
||||
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 24, v0
|
||||
; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v2, 8, v0
|
||||
; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v1, v1, v0, 16
|
||||
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, 0xff0000, v2
|
||||
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v3, 0xff0000, v2
|
||||
; GCN-NOHSA-VI-NEXT: v_lshrrev_b64 v[1:2], 16, v[0:1]
|
||||
; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
|
||||
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v1, 0xff00ff, v1
|
||||
; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
|
||||
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; GCN-NOHSA-VI-NEXT: s_endpgm
|
||||
;
|
||||
@ -10179,33 +10179,39 @@ define amdgpu_kernel void @global_sextload_v4i8_to_v4i16(ptr addrspace(1) %out,
|
||||
define amdgpu_kernel void @global_zextload_v8i8_to_v8i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
|
||||
; GCN-NOHSA-SI-LABEL: global_zextload_v8i8_to_v8i16:
|
||||
; GCN-NOHSA-SI: ; %bb.0:
|
||||
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
||||
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
|
||||
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
|
||||
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
|
||||
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
|
||||
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
|
||||
; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1
|
||||
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2
|
||||
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3
|
||||
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
|
||||
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
|
||||
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6
|
||||
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7
|
||||
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
|
||||
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
|
||||
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
|
||||
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xff00, v0
|
||||
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 24, v0
|
||||
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xff00, v1
|
||||
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 24, v1
|
||||
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xff, v1
|
||||
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v7, 0xff, v0
|
||||
; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v1, v5, v1, 16
|
||||
; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
|
||||
; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v0, v3, v0, 16
|
||||
; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v5, 8, v2
|
||||
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v3, 0xff00ff, v1
|
||||
; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v2, v6, v4
|
||||
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
|
||||
; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v0, v7, v5
|
||||
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
|
||||
; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s0, v0
|
||||
; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s6, v1
|
||||
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s1, s0, 24
|
||||
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s7, s6, 24
|
||||
; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s0, 0xff00
|
||||
; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s6, 0xff00
|
||||
; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s6, 0xff
|
||||
; GCN-NOHSA-SI-NEXT: s_and_b32 s11, s0, 0xff
|
||||
; GCN-NOHSA-SI-NEXT: s_lshl_b32 s9, s9, 8
|
||||
; GCN-NOHSA-SI-NEXT: s_lshl_b32 s8, s8, 8
|
||||
; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[6:7], s[6:7], 16
|
||||
; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[0:1], s[0:1], 16
|
||||
; GCN-NOHSA-SI-NEXT: s_or_b32 s7, s10, s9
|
||||
; GCN-NOHSA-SI-NEXT: s_or_b32 s8, s11, s8
|
||||
; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xff00ff
|
||||
; GCN-NOHSA-SI-NEXT: s_and_b32 s9, s0, 0xff00ff
|
||||
; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
|
||||
; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
|
||||
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s8
|
||||
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s9
|
||||
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s7
|
||||
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s6
|
||||
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
||||
; GCN-NOHSA-SI-NEXT: s_endpgm
|
||||
;
|
||||
; GCN-HSA-LABEL: global_zextload_v8i8_to_v8i16:
|
||||
@ -10221,20 +10227,26 @@ define amdgpu_kernel void @global_zextload_v8i8_to_v8i16(ptr addrspace(1) %out,
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
|
||||
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xff00, v0
|
||||
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 24, v0
|
||||
; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xff00, v1
|
||||
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 24, v1
|
||||
; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff, v1
|
||||
; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xff, v0
|
||||
; GCN-HSA-NEXT: v_alignbit_b32 v1, v7, v1, 16
|
||||
; GCN-HSA-NEXT: v_lshlrev_b32_e32 v6, 8, v6
|
||||
; GCN-HSA-NEXT: v_alignbit_b32 v0, v3, v0, 16
|
||||
; GCN-HSA-NEXT: v_lshlrev_b32_e32 v7, 8, v2
|
||||
; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v1
|
||||
; GCN-HSA-NEXT: v_or_b32_e32 v2, v8, v6
|
||||
; GCN-HSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
|
||||
; GCN-HSA-NEXT: v_or_b32_e32 v0, v9, v7
|
||||
; GCN-HSA-NEXT: v_readfirstlane_b32 s0, v0
|
||||
; GCN-HSA-NEXT: v_readfirstlane_b32 s2, v1
|
||||
; GCN-HSA-NEXT: s_lshr_b32 s1, s0, 24
|
||||
; GCN-HSA-NEXT: s_lshr_b32 s3, s2, 24
|
||||
; GCN-HSA-NEXT: s_and_b32 s4, s0, 0xff00
|
||||
; GCN-HSA-NEXT: s_and_b32 s5, s2, 0xff00
|
||||
; GCN-HSA-NEXT: s_and_b32 s6, s2, 0xff
|
||||
; GCN-HSA-NEXT: s_and_b32 s7, s0, 0xff
|
||||
; GCN-HSA-NEXT: s_lshl_b32 s5, s5, 8
|
||||
; GCN-HSA-NEXT: s_lshl_b32 s4, s4, 8
|
||||
; GCN-HSA-NEXT: s_lshr_b64 s[2:3], s[2:3], 16
|
||||
; GCN-HSA-NEXT: s_lshr_b64 s[0:1], s[0:1], 16
|
||||
; GCN-HSA-NEXT: s_or_b32 s1, s6, s5
|
||||
; GCN-HSA-NEXT: s_or_b32 s3, s7, s4
|
||||
; GCN-HSA-NEXT: s_and_b32 s2, s2, 0xff00ff
|
||||
; GCN-HSA-NEXT: s_and_b32 s0, s0, 0xff00ff
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s3
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s0
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s1
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s2
|
||||
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
|
||||
; GCN-HSA-NEXT: s_endpgm
|
||||
;
|
||||
@ -10252,22 +10264,26 @@ define amdgpu_kernel void @global_zextload_v8i8_to_v8i16(ptr addrspace(1) %out,
|
||||
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
|
||||
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
|
||||
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v1
|
||||
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v0
|
||||
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s6, v1
|
||||
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s5, s4, 24
|
||||
; GCN-NOHSA-VI-NEXT: s_bfe_u32 s6, s4, 0x80010
|
||||
; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s4, 0xff
|
||||
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s4, s4, 8
|
||||
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 24, v0
|
||||
; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v1, 8, v0
|
||||
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s5, s5, 16
|
||||
; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xff0000
|
||||
; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v2, v2, v0, 16
|
||||
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v3, 0xff0000, v1
|
||||
; GCN-NOHSA-VI-NEXT: s_or_b32 s5, s6, s5
|
||||
; GCN-NOHSA-VI-NEXT: s_or_b32 s4, s7, s4
|
||||
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v1, 0xff00ff, v2
|
||||
; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
|
||||
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s4
|
||||
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s7, s6, 24
|
||||
; GCN-NOHSA-VI-NEXT: s_bfe_u32 s8, s6, 0x80010
|
||||
; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s6, 0xff
|
||||
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s6, s6, 8
|
||||
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s11, s4, 8
|
||||
; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s4, 0xff
|
||||
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s7, s7, 16
|
||||
; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xff0000
|
||||
; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s11, 0xff0000
|
||||
; GCN-NOHSA-VI-NEXT: s_lshr_b64 s[4:5], s[4:5], 16
|
||||
; GCN-NOHSA-VI-NEXT: s_or_b32 s5, s8, s7
|
||||
; GCN-NOHSA-VI-NEXT: s_or_b32 s6, s9, s6
|
||||
; GCN-NOHSA-VI-NEXT: s_or_b32 s7, s10, s11
|
||||
; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xff00ff
|
||||
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s7
|
||||
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s6
|
||||
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5
|
||||
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
||||
; GCN-NOHSA-VI-NEXT: s_endpgm
|
||||
@ -10763,35 +10779,48 @@ define amdgpu_kernel void @global_zextload_v16i8_to_v16i16(ptr addrspace(1) %out
|
||||
; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
|
||||
; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
|
||||
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xff00, v2
|
||||
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 24, v2
|
||||
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xff00, v3
|
||||
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 24, v3
|
||||
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xff00, v0
|
||||
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 24, v0
|
||||
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, 0xff00, v1
|
||||
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v11, 24, v1
|
||||
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xff, v1
|
||||
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v13, 0xff, v0
|
||||
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v14, 0xff, v3
|
||||
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v15, 0xff, v2
|
||||
; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v1, v11, v1, 16
|
||||
; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v10, 8, v10
|
||||
; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v0, v9, v0, 16
|
||||
; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v8, 8, v8
|
||||
; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v7, v7, v3, 16
|
||||
; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v6, 8, v6
|
||||
; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v5, v5, v2, 16
|
||||
; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
|
||||
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v3, 0xff00ff, v1
|
||||
; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v2, v12, v10
|
||||
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
|
||||
; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v0, v13, v8
|
||||
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v7, 0xff00ff, v7
|
||||
; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v6, v14, v6
|
||||
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v5, 0xff00ff, v5
|
||||
; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v4, v15, v4
|
||||
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
|
||||
; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s4, v2
|
||||
; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s6, v3
|
||||
; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s8, v0
|
||||
; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s10, v1
|
||||
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s5, s4, 24
|
||||
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s7, s6, 24
|
||||
; GCN-NOHSA-SI-NEXT: s_and_b32 s12, s4, 0xff00
|
||||
; GCN-NOHSA-SI-NEXT: s_and_b32 s13, s6, 0xff00
|
||||
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s9, s8, 24
|
||||
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s11, s10, 24
|
||||
; GCN-NOHSA-SI-NEXT: s_and_b32 s14, s8, 0xff00
|
||||
; GCN-NOHSA-SI-NEXT: s_and_b32 s15, s10, 0xff00
|
||||
; GCN-NOHSA-SI-NEXT: s_and_b32 s16, s10, 0xff
|
||||
; GCN-NOHSA-SI-NEXT: s_and_b32 s17, s8, 0xff
|
||||
; GCN-NOHSA-SI-NEXT: s_and_b32 s18, s6, 0xff
|
||||
; GCN-NOHSA-SI-NEXT: s_and_b32 s19, s4, 0xff
|
||||
; GCN-NOHSA-SI-NEXT: s_lshl_b32 s15, s15, 8
|
||||
; GCN-NOHSA-SI-NEXT: s_lshl_b32 s14, s14, 8
|
||||
; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[10:11], s[10:11], 16
|
||||
; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[8:9], s[8:9], 16
|
||||
; GCN-NOHSA-SI-NEXT: s_lshl_b32 s9, s13, 8
|
||||
; GCN-NOHSA-SI-NEXT: s_lshl_b32 s11, s12, 8
|
||||
; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[6:7], s[6:7], 16
|
||||
; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[4:5], s[4:5], 16
|
||||
; GCN-NOHSA-SI-NEXT: s_or_b32 s5, s16, s15
|
||||
; GCN-NOHSA-SI-NEXT: s_or_b32 s7, s17, s14
|
||||
; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, 0xff00ff
|
||||
; GCN-NOHSA-SI-NEXT: s_or_b32 s9, s18, s9
|
||||
; GCN-NOHSA-SI-NEXT: s_or_b32 s11, s19, s11
|
||||
; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xff00ff
|
||||
; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xff00ff
|
||||
; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, 0xff00ff
|
||||
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s11
|
||||
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s9
|
||||
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s6
|
||||
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
|
||||
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
|
||||
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s7
|
||||
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s8
|
||||
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5
|
||||
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s10
|
||||
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
||||
; GCN-NOHSA-SI-NEXT: s_endpgm
|
||||
;
|
||||
@ -10805,43 +10834,55 @@ define amdgpu_kernel void @global_zextload_v16i8_to_v16i16(ptr addrspace(1) %out
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
|
||||
; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
|
||||
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0
|
||||
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xff00, v2
|
||||
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 24, v2
|
||||
; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xff00, v3
|
||||
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 24, v3
|
||||
; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xff00, v0
|
||||
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 24, v0
|
||||
; GCN-HSA-NEXT: v_and_b32_e32 v14, 0xff00, v1
|
||||
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 24, v1
|
||||
; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xff, v3
|
||||
; GCN-HSA-NEXT: v_and_b32_e32 v19, 0xff, v2
|
||||
; GCN-HSA-NEXT: v_alignbit_b32 v7, v7, v3, 16
|
||||
; GCN-HSA-NEXT: v_lshlrev_b32_e32 v6, 8, v6
|
||||
; GCN-HSA-NEXT: v_alignbit_b32 v5, v5, v2, 16
|
||||
; GCN-HSA-NEXT: v_lshlrev_b32_e32 v4, 8, v4
|
||||
; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xff, v1
|
||||
; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xff, v0
|
||||
; GCN-HSA-NEXT: v_alignbit_b32 v1, v15, v1, 16
|
||||
; GCN-HSA-NEXT: v_lshlrev_b32_e32 v14, 8, v14
|
||||
; GCN-HSA-NEXT: v_alignbit_b32 v0, v13, v0, 16
|
||||
; GCN-HSA-NEXT: v_lshlrev_b32_e32 v12, 8, v12
|
||||
; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v7
|
||||
; GCN-HSA-NEXT: v_or_b32_e32 v6, v18, v6
|
||||
; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v5
|
||||
; GCN-HSA-NEXT: v_or_b32_e32 v4, v19, v4
|
||||
; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v1
|
||||
; GCN-HSA-NEXT: v_or_b32_e32 v2, v16, v14
|
||||
; GCN-HSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
|
||||
; GCN-HSA-NEXT: v_or_b32_e32 v0, v17, v12
|
||||
; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
|
||||
; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
|
||||
; GCN-HSA-NEXT: v_readfirstlane_b32 s6, v0
|
||||
; GCN-HSA-NEXT: v_readfirstlane_b32 s8, v1
|
||||
; GCN-HSA-NEXT: v_readfirstlane_b32 s2, v2
|
||||
; GCN-HSA-NEXT: v_readfirstlane_b32 s4, v3
|
||||
; GCN-HSA-NEXT: s_lshr_b32 s7, s6, 24
|
||||
; GCN-HSA-NEXT: s_lshr_b32 s9, s8, 24
|
||||
; GCN-HSA-NEXT: s_lshr_b32 s3, s2, 24
|
||||
; GCN-HSA-NEXT: s_lshr_b32 s5, s4, 24
|
||||
; GCN-HSA-NEXT: s_and_b32 s10, s2, 0xff00
|
||||
; GCN-HSA-NEXT: s_and_b32 s11, s4, 0xff00
|
||||
; GCN-HSA-NEXT: s_and_b32 s12, s6, 0xff00
|
||||
; GCN-HSA-NEXT: s_and_b32 s13, s8, 0xff00
|
||||
; GCN-HSA-NEXT: s_and_b32 s14, s8, 0xff
|
||||
; GCN-HSA-NEXT: s_and_b32 s15, s6, 0xff
|
||||
; GCN-HSA-NEXT: s_lshr_b64 s[8:9], s[8:9], 16
|
||||
; GCN-HSA-NEXT: s_lshr_b64 s[6:7], s[6:7], 16
|
||||
; GCN-HSA-NEXT: s_and_b32 s16, s4, 0xff
|
||||
; GCN-HSA-NEXT: s_and_b32 s17, s2, 0xff
|
||||
; GCN-HSA-NEXT: s_lshl_b32 s13, s13, 8
|
||||
; GCN-HSA-NEXT: s_lshl_b32 s12, s12, 8
|
||||
; GCN-HSA-NEXT: s_lshl_b32 s7, s11, 8
|
||||
; GCN-HSA-NEXT: s_lshl_b32 s9, s10, 8
|
||||
; GCN-HSA-NEXT: s_lshr_b64 s[4:5], s[4:5], 16
|
||||
; GCN-HSA-NEXT: s_lshr_b64 s[2:3], s[2:3], 16
|
||||
; GCN-HSA-NEXT: s_or_b32 s3, s14, s13
|
||||
; GCN-HSA-NEXT: s_or_b32 s5, s15, s12
|
||||
; GCN-HSA-NEXT: s_and_b32 s8, s8, 0xff00ff
|
||||
; GCN-HSA-NEXT: s_and_b32 s6, s6, 0xff00ff
|
||||
; GCN-HSA-NEXT: s_or_b32 s7, s16, s7
|
||||
; GCN-HSA-NEXT: s_or_b32 s9, s17, s9
|
||||
; GCN-HSA-NEXT: s_and_b32 s4, s4, 0xff00ff
|
||||
; GCN-HSA-NEXT: s_and_b32 s2, s2, 0xff00ff
|
||||
; GCN-HSA-NEXT: s_add_u32 s0, s0, 16
|
||||
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s9
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s2
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s4
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s5
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s6
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v6, s3
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v7, s8
|
||||
; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
|
||||
; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
|
||||
; GCN-HSA-NEXT: s_endpgm
|
||||
;
|
||||
; GCN-NOHSA-VI-LABEL: global_zextload_v16i8_to_v16i16:
|
||||
@ -10858,42 +10899,50 @@ define amdgpu_kernel void @global_zextload_v16i8_to_v16i16(ptr addrspace(1) %out
|
||||
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
|
||||
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
|
||||
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v3
|
||||
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v1
|
||||
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s6, s5, 24
|
||||
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s4, 24
|
||||
; GCN-NOHSA-VI-NEXT: s_bfe_u32 s10, s4, 0x80010
|
||||
; GCN-NOHSA-VI-NEXT: s_and_b32 s11, s4, 0xff
|
||||
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s4, s4, 8
|
||||
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v4, 24, v2
|
||||
; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v1, 8, v0
|
||||
; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v5, 8, v2
|
||||
; GCN-NOHSA-VI-NEXT: s_bfe_u32 s7, s5, 0x80010
|
||||
; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s5, 0xff
|
||||
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s5, s5, 8
|
||||
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s6, s6, 16
|
||||
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s9, s9, 16
|
||||
; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xff0000
|
||||
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 24, v0
|
||||
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v6, 0xff0000, v1
|
||||
; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v4, v4, v2, 16
|
||||
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v7, 0xff0000, v5
|
||||
; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xff0000
|
||||
; GCN-NOHSA-VI-NEXT: s_or_b32 s6, s7, s6
|
||||
; GCN-NOHSA-VI-NEXT: s_or_b32 s7, s10, s9
|
||||
; GCN-NOHSA-VI-NEXT: s_or_b32 s4, s11, s4
|
||||
; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v3, v3, v0, 16
|
||||
; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
|
||||
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v5, 0xff00ff, v4
|
||||
; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v4, v2, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
|
||||
; GCN-NOHSA-VI-NEXT: s_or_b32 s5, s8, s5
|
||||
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s4
|
||||
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s7
|
||||
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v1, 0xff00ff, v3
|
||||
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5
|
||||
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s6
|
||||
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
|
||||
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
||||
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s6, v0
|
||||
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v2
|
||||
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s8, v3
|
||||
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s9, v1
|
||||
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s7, s6, 24
|
||||
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s5, s4, 24
|
||||
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s10, s9, 24
|
||||
; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s6, 0xff
|
||||
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s14, s6, 8
|
||||
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s15, s8, 24
|
||||
; GCN-NOHSA-VI-NEXT: s_bfe_u32 s16, s8, 0x80010
|
||||
; GCN-NOHSA-VI-NEXT: s_and_b32 s17, s8, 0xff
|
||||
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s8, s8, 8
|
||||
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s19, s4, 8
|
||||
; GCN-NOHSA-VI-NEXT: s_lshr_b64 s[6:7], s[6:7], 16
|
||||
; GCN-NOHSA-VI-NEXT: s_bfe_u32 s11, s9, 0x80010
|
||||
; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s9, 0xff
|
||||
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s9, s9, 8
|
||||
; GCN-NOHSA-VI-NEXT: s_and_b32 s18, s4, 0xff
|
||||
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s10, s10, 16
|
||||
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s7, s15, 16
|
||||
; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, 0xff0000
|
||||
; GCN-NOHSA-VI-NEXT: s_and_b32 s15, s19, 0xff0000
|
||||
; GCN-NOHSA-VI-NEXT: s_lshr_b64 s[4:5], s[4:5], 16
|
||||
; GCN-NOHSA-VI-NEXT: s_and_b32 s9, s9, 0xff0000
|
||||
; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s14, 0xff0000
|
||||
; GCN-NOHSA-VI-NEXT: s_or_b32 s5, s11, s10
|
||||
; GCN-NOHSA-VI-NEXT: s_or_b32 s7, s16, s7
|
||||
; GCN-NOHSA-VI-NEXT: s_or_b32 s8, s17, s8
|
||||
; GCN-NOHSA-VI-NEXT: s_or_b32 s11, s18, s15
|
||||
; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xff00ff
|
||||
; GCN-NOHSA-VI-NEXT: s_or_b32 s9, s12, s9
|
||||
; GCN-NOHSA-VI-NEXT: s_or_b32 s10, s13, s14
|
||||
; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xff00ff
|
||||
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s11
|
||||
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s8
|
||||
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7
|
||||
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s10
|
||||
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s6
|
||||
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s9
|
||||
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s5
|
||||
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
|
||||
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
|
||||
; GCN-NOHSA-VI-NEXT: s_endpgm
|
||||
;
|
||||
; EG-LABEL: global_zextload_v16i8_to_v16i16:
|
||||
@ -11766,71 +11815,97 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i16(ptr addrspace(1) %out
|
||||
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6
|
||||
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7
|
||||
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
|
||||
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xff00, v2
|
||||
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v5, 0xff00, v3
|
||||
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xff, v3
|
||||
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v7, 0xff, v2
|
||||
; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5
|
||||
; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
|
||||
; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v6, v6, v5
|
||||
; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v4, v7, v4
|
||||
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0
|
||||
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0
|
||||
; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
|
||||
; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
|
||||
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 24, v3
|
||||
; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v3, v5, v3, 16
|
||||
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v7, 0xff00ff, v3
|
||||
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 24, v2
|
||||
; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v2, v3, v2, 16
|
||||
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v5, 0xff00ff, v2
|
||||
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1)
|
||||
; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s4, v2
|
||||
; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s6, v3
|
||||
; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s8, v0
|
||||
; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s10, v1
|
||||
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v3, 0xff00, v10
|
||||
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:48
|
||||
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 24, v10
|
||||
; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s12, v6
|
||||
; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s14, v7
|
||||
; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s16, v4
|
||||
; GCN-NOHSA-SI-NEXT: v_readfirstlane_b32 s18, v5
|
||||
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s5, s4, 24
|
||||
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s7, s6, 24
|
||||
; GCN-NOHSA-SI-NEXT: s_and_b32 s20, s4, 0xff00
|
||||
; GCN-NOHSA-SI-NEXT: s_and_b32 s21, s6, 0xff00
|
||||
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s9, s8, 24
|
||||
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s11, s10, 24
|
||||
; GCN-NOHSA-SI-NEXT: s_and_b32 s22, s8, 0xff00
|
||||
; GCN-NOHSA-SI-NEXT: s_and_b32 s23, s10, 0xff00
|
||||
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s13, s12, 24
|
||||
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s15, s14, 24
|
||||
; GCN-NOHSA-SI-NEXT: s_and_b32 s24, s12, 0xff00
|
||||
; GCN-NOHSA-SI-NEXT: s_and_b32 s25, s14, 0xff00
|
||||
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s17, s16, 24
|
||||
; GCN-NOHSA-SI-NEXT: s_lshr_b32 s19, s18, 24
|
||||
; GCN-NOHSA-SI-NEXT: s_and_b32 s26, s16, 0xff00
|
||||
; GCN-NOHSA-SI-NEXT: s_and_b32 s27, s18, 0xff00
|
||||
; GCN-NOHSA-SI-NEXT: s_and_b32 s28, s18, 0xff
|
||||
; GCN-NOHSA-SI-NEXT: s_and_b32 s29, s16, 0xff
|
||||
; GCN-NOHSA-SI-NEXT: s_and_b32 s30, s14, 0xff
|
||||
; GCN-NOHSA-SI-NEXT: s_and_b32 s31, s12, 0xff
|
||||
; GCN-NOHSA-SI-NEXT: s_and_b32 s33, s10, 0xff
|
||||
; GCN-NOHSA-SI-NEXT: s_and_b32 s34, s8, 0xff
|
||||
; GCN-NOHSA-SI-NEXT: s_and_b32 s35, s6, 0xff
|
||||
; GCN-NOHSA-SI-NEXT: s_and_b32 s36, s4, 0xff
|
||||
; GCN-NOHSA-SI-NEXT: s_lshl_b32 s27, s27, 8
|
||||
; GCN-NOHSA-SI-NEXT: s_lshl_b32 s26, s26, 8
|
||||
; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[18:19], s[18:19], 16
|
||||
; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[16:17], s[16:17], 16
|
||||
; GCN-NOHSA-SI-NEXT: s_lshl_b32 s17, s25, 8
|
||||
; GCN-NOHSA-SI-NEXT: s_lshl_b32 s19, s24, 8
|
||||
; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[14:15], s[14:15], 16
|
||||
; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[12:13], s[12:13], 16
|
||||
; GCN-NOHSA-SI-NEXT: s_lshl_b32 s13, s23, 8
|
||||
; GCN-NOHSA-SI-NEXT: s_lshl_b32 s15, s22, 8
|
||||
; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[10:11], s[10:11], 16
|
||||
; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[8:9], s[8:9], 16
|
||||
; GCN-NOHSA-SI-NEXT: s_lshl_b32 s9, s21, 8
|
||||
; GCN-NOHSA-SI-NEXT: s_lshl_b32 s11, s20, 8
|
||||
; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[6:7], s[6:7], 16
|
||||
; GCN-NOHSA-SI-NEXT: s_lshr_b64 s[4:5], s[4:5], 16
|
||||
; GCN-NOHSA-SI-NEXT: s_or_b32 s5, s28, s27
|
||||
; GCN-NOHSA-SI-NEXT: s_or_b32 s7, s29, s26
|
||||
; GCN-NOHSA-SI-NEXT: s_and_b32 s18, s18, 0xff00ff
|
||||
; GCN-NOHSA-SI-NEXT: s_and_b32 s16, s16, 0xff00ff
|
||||
; GCN-NOHSA-SI-NEXT: s_or_b32 s17, s30, s17
|
||||
; GCN-NOHSA-SI-NEXT: s_or_b32 s19, s31, s19
|
||||
; GCN-NOHSA-SI-NEXT: s_and_b32 s14, s14, 0xff00ff
|
||||
; GCN-NOHSA-SI-NEXT: s_and_b32 s12, s12, 0xff00ff
|
||||
; GCN-NOHSA-SI-NEXT: s_or_b32 s13, s33, s13
|
||||
; GCN-NOHSA-SI-NEXT: s_or_b32 s15, s34, s15
|
||||
; GCN-NOHSA-SI-NEXT: s_and_b32 s10, s10, 0xff00ff
|
||||
; GCN-NOHSA-SI-NEXT: s_or_b32 s9, s35, s9
|
||||
; GCN-NOHSA-SI-NEXT: s_or_b32 s11, s36, s11
|
||||
; GCN-NOHSA-SI-NEXT: s_and_b32 s6, s6, 0xff00ff
|
||||
; GCN-NOHSA-SI-NEXT: s_and_b32 s4, s4, 0xff00ff
|
||||
; GCN-NOHSA-SI-NEXT: s_and_b32 s8, s8, 0xff00ff
|
||||
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s11
|
||||
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s9
|
||||
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s6
|
||||
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
|
||||
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
|
||||
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v4, 24, v11
|
||||
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 24, v8
|
||||
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 24, v9
|
||||
; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v7, v6, v9, 16
|
||||
; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v5, v5, v8, 16
|
||||
; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v12, v4, v11, 16
|
||||
; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v13, v2, v10, 16
|
||||
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xff00, v11
|
||||
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, 0xff00, v8
|
||||
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, 0xff00, v9
|
||||
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v9, 0xff, v9
|
||||
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xff, v8
|
||||
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v11, 0xff, v11
|
||||
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, 0xff, v10
|
||||
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v14, 24, v1
|
||||
; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v14, v14, v1, 16
|
||||
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v15, 24, v0
|
||||
; GCN-NOHSA-SI-NEXT: v_alignbit_b32 v15, v15, v0, 16
|
||||
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xff00, v0
|
||||
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v17, 0xff00, v1
|
||||
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v1, 0xff, v1
|
||||
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, 0xff, v0
|
||||
; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v0, 8, v6
|
||||
; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v6, 8, v2
|
||||
; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v2, v9, v0
|
||||
; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v0, v8, v6
|
||||
; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v4, 8, v4
|
||||
; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
|
||||
; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v6, v11, v4
|
||||
; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v4, v10, v3
|
||||
; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v3, 8, v17
|
||||
; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v8, 8, v16
|
||||
; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v10, v1, v3
|
||||
; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v8, v18, v8
|
||||
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v3, 0xff00ff, v7
|
||||
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v1, 0xff00ff, v5
|
||||
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v7, 0xff00ff, v12
|
||||
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v5, 0xff00ff, v13
|
||||
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v11, 0xff00ff, v14
|
||||
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v9, 0xff00ff, v15
|
||||
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
|
||||
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
|
||||
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s15
|
||||
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s8
|
||||
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s13
|
||||
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s10
|
||||
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
|
||||
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
|
||||
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s19
|
||||
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s12
|
||||
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s17
|
||||
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s14
|
||||
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
|
||||
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
|
||||
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, s7
|
||||
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, s16
|
||||
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, s5
|
||||
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, s18
|
||||
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
||||
; GCN-NOHSA-SI-NEXT: s_endpgm
|
||||
;
|
||||
@ -11843,88 +11918,112 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i16(ptr addrspace(1) %out
|
||||
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1]
|
||||
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0
|
||||
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-HSA-NEXT: v_readfirstlane_b32 s4, v2
|
||||
; GCN-HSA-NEXT: v_readfirstlane_b32 s6, v3
|
||||
; GCN-HSA-NEXT: v_readfirstlane_b32 s8, v0
|
||||
; GCN-HSA-NEXT: v_readfirstlane_b32 s10, v1
|
||||
; GCN-HSA-NEXT: s_lshr_b32 s5, s4, 24
|
||||
; GCN-HSA-NEXT: s_lshr_b32 s7, s6, 24
|
||||
; GCN-HSA-NEXT: s_and_b32 s12, s4, 0xff00
|
||||
; GCN-HSA-NEXT: s_and_b32 s13, s6, 0xff00
|
||||
; GCN-HSA-NEXT: s_lshr_b32 s9, s8, 24
|
||||
; GCN-HSA-NEXT: s_lshr_b32 s11, s10, 24
|
||||
; GCN-HSA-NEXT: s_and_b32 s14, s8, 0xff00
|
||||
; GCN-HSA-NEXT: s_and_b32 s15, s10, 0xff00
|
||||
; GCN-HSA-NEXT: s_add_u32 s2, s2, 16
|
||||
; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
|
||||
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
|
||||
; GCN-HSA-NEXT: s_lshr_b64 s[2:3], s[10:11], 16
|
||||
; GCN-HSA-NEXT: s_and_b32 s17, s8, 0xff
|
||||
; GCN-HSA-NEXT: s_lshr_b64 s[8:9], s[8:9], 16
|
||||
; GCN-HSA-NEXT: s_and_b32 s3, s6, 0xff
|
||||
; GCN-HSA-NEXT: s_lshr_b64 s[6:7], s[6:7], 16
|
||||
; GCN-HSA-NEXT: s_and_b32 s16, s10, 0xff
|
||||
; GCN-HSA-NEXT: s_lshl_b32 s14, s14, 8
|
||||
; GCN-HSA-NEXT: s_lshl_b32 s9, s13, 8
|
||||
; GCN-HSA-NEXT: s_and_b32 s10, s4, 0xff
|
||||
; GCN-HSA-NEXT: s_lshl_b32 s11, s12, 8
|
||||
; GCN-HSA-NEXT: s_lshr_b64 s[4:5], s[4:5], 16
|
||||
; GCN-HSA-NEXT: s_and_b32 s8, s8, 0xff00ff
|
||||
; GCN-HSA-NEXT: s_and_b32 s6, s6, 0xff00ff
|
||||
; GCN-HSA-NEXT: s_lshl_b32 s15, s15, 8
|
||||
; GCN-HSA-NEXT: s_or_b32 s7, s17, s14
|
||||
; GCN-HSA-NEXT: s_and_b32 s2, s2, 0xff00ff
|
||||
; GCN-HSA-NEXT: s_or_b32 s3, s3, s9
|
||||
; GCN-HSA-NEXT: s_or_b32 s9, s10, s11
|
||||
; GCN-HSA-NEXT: s_and_b32 s4, s4, 0xff00ff
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v7, s6
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s8
|
||||
; GCN-HSA-NEXT: s_or_b32 s5, s16, s15
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s9
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s4
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s7
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v11, s2
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v6, s3
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v10, s5
|
||||
; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
|
||||
; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
|
||||
; GCN-HSA-NEXT: v_readfirstlane_b32 s6, v0
|
||||
; GCN-HSA-NEXT: v_readfirstlane_b32 s8, v1
|
||||
; GCN-HSA-NEXT: v_readfirstlane_b32 s2, v2
|
||||
; GCN-HSA-NEXT: v_readfirstlane_b32 s4, v3
|
||||
; GCN-HSA-NEXT: s_lshr_b32 s7, s6, 24
|
||||
; GCN-HSA-NEXT: s_lshr_b32 s9, s8, 24
|
||||
; GCN-HSA-NEXT: s_lshr_b32 s3, s2, 24
|
||||
; GCN-HSA-NEXT: s_lshr_b32 s5, s4, 24
|
||||
; GCN-HSA-NEXT: s_and_b32 s10, s2, 0xff00
|
||||
; GCN-HSA-NEXT: s_and_b32 s11, s4, 0xff00
|
||||
; GCN-HSA-NEXT: s_and_b32 s12, s6, 0xff00
|
||||
; GCN-HSA-NEXT: s_and_b32 s13, s8, 0xff00
|
||||
; GCN-HSA-NEXT: s_and_b32 s14, s8, 0xff
|
||||
; GCN-HSA-NEXT: s_and_b32 s15, s6, 0xff
|
||||
; GCN-HSA-NEXT: s_lshr_b64 s[8:9], s[8:9], 16
|
||||
; GCN-HSA-NEXT: s_lshr_b64 s[6:7], s[6:7], 16
|
||||
; GCN-HSA-NEXT: s_and_b32 s16, s4, 0xff
|
||||
; GCN-HSA-NEXT: s_and_b32 s17, s2, 0xff
|
||||
; GCN-HSA-NEXT: s_lshl_b32 s13, s13, 8
|
||||
; GCN-HSA-NEXT: s_lshl_b32 s12, s12, 8
|
||||
; GCN-HSA-NEXT: s_lshl_b32 s7, s11, 8
|
||||
; GCN-HSA-NEXT: s_lshl_b32 s9, s10, 8
|
||||
; GCN-HSA-NEXT: s_lshr_b64 s[4:5], s[4:5], 16
|
||||
; GCN-HSA-NEXT: s_lshr_b64 s[2:3], s[2:3], 16
|
||||
; GCN-HSA-NEXT: s_or_b32 s3, s14, s13
|
||||
; GCN-HSA-NEXT: s_or_b32 s5, s15, s12
|
||||
; GCN-HSA-NEXT: s_and_b32 s8, s8, 0xff00ff
|
||||
; GCN-HSA-NEXT: s_and_b32 s6, s6, 0xff00ff
|
||||
; GCN-HSA-NEXT: s_or_b32 s7, s16, s7
|
||||
; GCN-HSA-NEXT: s_or_b32 s9, s17, s9
|
||||
; GCN-HSA-NEXT: s_and_b32 s4, s4, 0xff00ff
|
||||
; GCN-HSA-NEXT: s_and_b32 s10, s2, 0xff00ff
|
||||
; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v10, s3
|
||||
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1
|
||||
; GCN-HSA-NEXT: s_add_u32 s2, s0, 48
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0
|
||||
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
|
||||
; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7]
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s9
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s10
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s4
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
|
||||
; GCN-HSA-NEXT: s_add_u32 s0, s0, 32
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3
|
||||
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
|
||||
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2
|
||||
; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
|
||||
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 24, v7
|
||||
; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff00, v7
|
||||
; GCN-HSA-NEXT: v_and_b32_e32 v17, 0xff, v7
|
||||
; GCN-HSA-NEXT: v_alignbit_b32 v7, v9, v7, 16
|
||||
; GCN-HSA-NEXT: v_and_b32_e32 v16, 0xff00, v6
|
||||
; GCN-HSA-NEXT: v_and_b32_e32 v9, 0xff00ff, v7
|
||||
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 24, v6
|
||||
; GCN-HSA-NEXT: v_lshlrev_b32_e32 v8, 8, v8
|
||||
; GCN-HSA-NEXT: v_alignbit_b32 v7, v7, v6, 16
|
||||
; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xff, v6
|
||||
; GCN-HSA-NEXT: v_lshlrev_b32_e32 v16, 8, v16
|
||||
; GCN-HSA-NEXT: v_or_b32_e32 v8, v17, v8
|
||||
; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v7
|
||||
; GCN-HSA-NEXT: v_or_b32_e32 v6, v6, v16
|
||||
; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[6:9]
|
||||
; GCN-HSA-NEXT: v_and_b32_e32 v12, 0xff, v5
|
||||
; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xff00, v4
|
||||
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 24, v4
|
||||
; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff00, v5
|
||||
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 24, v5
|
||||
; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xff, v4
|
||||
; GCN-HSA-NEXT: v_alignbit_b32 v5, v9, v5, 16
|
||||
; GCN-HSA-NEXT: v_lshlrev_b32_e32 v8, 8, v8
|
||||
; GCN-HSA-NEXT: v_alignbit_b32 v9, v7, v4, 16
|
||||
; GCN-HSA-NEXT: v_lshlrev_b32_e32 v4, 8, v6
|
||||
; GCN-HSA-NEXT: v_or_b32_e32 v6, v12, v8
|
||||
; GCN-HSA-NEXT: v_or_b32_e32 v4, v13, v4
|
||||
; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v5
|
||||
; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v9
|
||||
; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
|
||||
; GCN-HSA-NEXT: v_and_b32_e32 v18, 0xff00, v2
|
||||
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 24, v2
|
||||
; GCN-HSA-NEXT: v_and_b32_e32 v8, 0xff00, v3
|
||||
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 24, v3
|
||||
; GCN-HSA-NEXT: v_and_b32_e32 v13, 0xff00, v0
|
||||
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 24, v0
|
||||
; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
|
||||
; GCN-HSA-NEXT: v_and_b32_e32 v10, 0xff, v3
|
||||
; GCN-HSA-NEXT: v_and_b32_e32 v4, 0xff00, v1
|
||||
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 24, v1
|
||||
; GCN-HSA-NEXT: v_and_b32_e32 v6, 0xff, v1
|
||||
; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff, v0
|
||||
; GCN-HSA-NEXT: v_and_b32_e32 v11, 0xff, v2
|
||||
; GCN-HSA-NEXT: v_alignbit_b32 v1, v5, v1, 16
|
||||
; GCN-HSA-NEXT: v_lshlrev_b32_e32 v4, 8, v4
|
||||
; GCN-HSA-NEXT: v_alignbit_b32 v0, v9, v0, 16
|
||||
; GCN-HSA-NEXT: v_lshlrev_b32_e32 v5, 8, v13
|
||||
; GCN-HSA-NEXT: v_alignbit_b32 v9, v12, v3, 16
|
||||
; GCN-HSA-NEXT: v_lshlrev_b32_e32 v8, 8, v8
|
||||
; GCN-HSA-NEXT: v_alignbit_b32 v12, v19, v2, 16
|
||||
; GCN-HSA-NEXT: v_lshlrev_b32_e32 v13, 8, v18
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1
|
||||
; GCN-HSA-NEXT: v_and_b32_e32 v3, 0xff00ff, v1
|
||||
; GCN-HSA-NEXT: v_or_b32_e32 v2, v6, v4
|
||||
; GCN-HSA-NEXT: v_and_b32_e32 v1, 0xff00ff, v0
|
||||
; GCN-HSA-NEXT: v_or_b32_e32 v0, v7, v5
|
||||
; GCN-HSA-NEXT: v_and_b32_e32 v7, 0xff00ff, v9
|
||||
; GCN-HSA-NEXT: v_or_b32_e32 v6, v10, v8
|
||||
; GCN-HSA-NEXT: v_and_b32_e32 v5, 0xff00ff, v12
|
||||
; GCN-HSA-NEXT: v_or_b32_e32 v4, v11, v13
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0
|
||||
; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[4:7]
|
||||
; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3]
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s5
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s6
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v11, s8
|
||||
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
|
||||
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
|
||||
; GCN-HSA-NEXT: s_endpgm
|
||||
;
|
||||
; GCN-NOHSA-VI-LABEL: global_zextload_v32i8_to_v32i16:
|
||||
@ -11942,79 +12041,95 @@ define amdgpu_kernel void @global_zextload_v32i8_to_v32i16(ptr addrspace(1) %out
|
||||
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
|
||||
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
|
||||
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1)
|
||||
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v3
|
||||
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s6, v0
|
||||
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s6, v7
|
||||
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s7, v5
|
||||
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s5, v1
|
||||
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s8, s7, 24
|
||||
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s11, s6, 24
|
||||
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s17, s4, 24
|
||||
; GCN-NOHSA-VI-NEXT: s_bfe_u32 s18, s4, 0x80010
|
||||
; GCN-NOHSA-VI-NEXT: s_and_b32 s19, s4, 0xff
|
||||
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s4, s4, 8
|
||||
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v8, 24, v2
|
||||
; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v5, 8, v4
|
||||
; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v11, 8, v2
|
||||
; GCN-NOHSA-VI-NEXT: s_bfe_u32 s9, s7, 0x80010
|
||||
; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s7, 0xff
|
||||
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s7, s7, 8
|
||||
; GCN-NOHSA-VI-NEXT: s_bfe_u32 s12, s6, 0x80010
|
||||
; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s6, 0xff
|
||||
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s6, s6, 8
|
||||
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s14, s5, 24
|
||||
; GCN-NOHSA-VI-NEXT: s_bfe_u32 s15, s5, 0x80010
|
||||
; GCN-NOHSA-VI-NEXT: s_and_b32 s16, s5, 0xff
|
||||
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s5, s5, 8
|
||||
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s8, s8, 16
|
||||
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s11, s11, 16
|
||||
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s17, s17, 16
|
||||
; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xff0000
|
||||
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 24, v0
|
||||
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 24, v6
|
||||
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 24, v4
|
||||
; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v9, 8, v6
|
||||
; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v10, 8, v0
|
||||
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, 0xff0000, v5
|
||||
; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v13, v8, v2, 16
|
||||
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v11, 0xff0000, v11
|
||||
; GCN-NOHSA-VI-NEXT: s_and_b32 s7, s7, 0xff0000
|
||||
; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xff0000
|
||||
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s14, s14, 16
|
||||
; GCN-NOHSA-VI-NEXT: s_and_b32 s5, s5, 0xff0000
|
||||
; GCN-NOHSA-VI-NEXT: s_or_b32 s8, s9, s8
|
||||
; GCN-NOHSA-VI-NEXT: s_or_b32 s9, s12, s11
|
||||
; GCN-NOHSA-VI-NEXT: s_or_b32 s11, s18, s17
|
||||
; GCN-NOHSA-VI-NEXT: s_or_b32 s4, s19, s4
|
||||
; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v7, v7, v4, 16
|
||||
; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v1, v1, v6, 16
|
||||
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v9, 0xff0000, v9
|
||||
; GCN-NOHSA-VI-NEXT: v_alignbit_b32 v3, v3, v0, 16
|
||||
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, 0xff0000, v10
|
||||
; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v4, v4, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
|
||||
; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v12, v2, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
|
||||
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v13, 0xff00ff, v13
|
||||
; GCN-NOHSA-VI-NEXT: s_or_b32 s7, s10, s7
|
||||
; GCN-NOHSA-VI-NEXT: s_or_b32 s6, s13, s6
|
||||
; GCN-NOHSA-VI-NEXT: s_or_b32 s10, s15, s14
|
||||
; GCN-NOHSA-VI-NEXT: s_or_b32 s5, s16, s5
|
||||
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, s4
|
||||
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, s11
|
||||
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v5, 0xff00ff, v7
|
||||
; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v8, v6, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
|
||||
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v9, 0xff00ff, v1
|
||||
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v1, 0xff00ff, v3
|
||||
; GCN-NOHSA-VI-NEXT: v_or_b32_sdwa v0, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
|
||||
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s5
|
||||
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s10
|
||||
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s6
|
||||
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s9
|
||||
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s7
|
||||
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s8
|
||||
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
|
||||
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
|
||||
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s8, v6
|
||||
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s4, v2
|
||||
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s12, v3
|
||||
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s13, v1
|
||||
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s10, v4
|
||||
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s15, v5
|
||||
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s7, s6, 24
|
||||
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s9, s8, 24
|
||||
; GCN-NOHSA-VI-NEXT: v_readfirstlane_b32 s14, v7
|
||||
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s5, s4, 24
|
||||
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s11, s10, 24
|
||||
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s16, s15, 24
|
||||
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s20, s10, 8
|
||||
; GCN-NOHSA-VI-NEXT: s_and_b32 s24, s8, 0xff
|
||||
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s25, s8, 8
|
||||
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s26, s13, 24
|
||||
; GCN-NOHSA-VI-NEXT: s_and_b32 s29, s6, 0xff
|
||||
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s30, s6, 8
|
||||
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s31, s12, 24
|
||||
; GCN-NOHSA-VI-NEXT: s_bfe_u32 s33, s12, 0x80010
|
||||
; GCN-NOHSA-VI-NEXT: s_and_b32 s34, s12, 0xff
|
||||
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s12, s12, 8
|
||||
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s36, s4, 8
|
||||
; GCN-NOHSA-VI-NEXT: s_lshr_b64 s[8:9], s[8:9], 16
|
||||
; GCN-NOHSA-VI-NEXT: s_lshr_b64 s[6:7], s[6:7], 16
|
||||
; GCN-NOHSA-VI-NEXT: s_bfe_u32 s17, s15, 0x80010
|
||||
; GCN-NOHSA-VI-NEXT: s_and_b32 s18, s15, 0xff
|
||||
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s15, s15, 8
|
||||
; GCN-NOHSA-VI-NEXT: s_and_b32 s19, s10, 0xff
|
||||
; GCN-NOHSA-VI-NEXT: s_lshr_b32 s21, s14, 24
|
||||
; GCN-NOHSA-VI-NEXT: s_bfe_u32 s22, s14, 0x80010
|
||||
; GCN-NOHSA-VI-NEXT: s_and_b32 s23, s14, 0xff
|
||||
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s14, s14, 8
|
||||
; GCN-NOHSA-VI-NEXT: s_bfe_u32 s27, s13, 0x80010
|
||||
; GCN-NOHSA-VI-NEXT: s_and_b32 s28, s13, 0xff
|
||||
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s13, s13, 8
|
||||
; GCN-NOHSA-VI-NEXT: s_and_b32 s35, s4, 0xff
|
||||
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s16, s16, 16
|
||||
; GCN-NOHSA-VI-NEXT: s_and_b32 s20, s20, 0xff0000
|
||||
; GCN-NOHSA-VI-NEXT: s_lshr_b64 s[10:11], s[10:11], 16
|
||||
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s9, s26, 16
|
||||
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s7, s31, 16
|
||||
; GCN-NOHSA-VI-NEXT: s_and_b32 s12, s12, 0xff0000
|
||||
; GCN-NOHSA-VI-NEXT: s_and_b32 s26, s36, 0xff0000
|
||||
; GCN-NOHSA-VI-NEXT: s_lshr_b64 s[4:5], s[4:5], 16
|
||||
; GCN-NOHSA-VI-NEXT: s_and_b32 s15, s15, 0xff0000
|
||||
; GCN-NOHSA-VI-NEXT: s_lshl_b32 s11, s21, 16
|
||||
; GCN-NOHSA-VI-NEXT: s_and_b32 s14, s14, 0xff0000
|
||||
; GCN-NOHSA-VI-NEXT: s_and_b32 s21, s25, 0xff0000
|
||||
; GCN-NOHSA-VI-NEXT: s_and_b32 s13, s13, 0xff0000
|
||||
; GCN-NOHSA-VI-NEXT: s_and_b32 s25, s30, 0xff0000
|
||||
; GCN-NOHSA-VI-NEXT: s_or_b32 s5, s17, s16
|
||||
; GCN-NOHSA-VI-NEXT: s_or_b32 s16, s19, s20
|
||||
; GCN-NOHSA-VI-NEXT: s_or_b32 s7, s33, s7
|
||||
; GCN-NOHSA-VI-NEXT: s_or_b32 s12, s34, s12
|
||||
; GCN-NOHSA-VI-NEXT: s_or_b32 s19, s35, s26
|
||||
; GCN-NOHSA-VI-NEXT: s_and_b32 s4, s4, 0xff00ff
|
||||
; GCN-NOHSA-VI-NEXT: s_or_b32 s15, s18, s15
|
||||
; GCN-NOHSA-VI-NEXT: s_and_b32 s10, s10, 0xff00ff
|
||||
; GCN-NOHSA-VI-NEXT: s_or_b32 s11, s22, s11
|
||||
; GCN-NOHSA-VI-NEXT: s_or_b32 s14, s23, s14
|
||||
; GCN-NOHSA-VI-NEXT: s_or_b32 s17, s24, s21
|
||||
; GCN-NOHSA-VI-NEXT: s_and_b32 s8, s8, 0xff00ff
|
||||
; GCN-NOHSA-VI-NEXT: s_or_b32 s9, s27, s9
|
||||
; GCN-NOHSA-VI-NEXT: s_or_b32 s13, s28, s13
|
||||
; GCN-NOHSA-VI-NEXT: s_or_b32 s18, s29, s25
|
||||
; GCN-NOHSA-VI-NEXT: s_and_b32 s6, s6, 0xff00ff
|
||||
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s19
|
||||
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s4
|
||||
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s12
|
||||
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s7
|
||||
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, s18
|
||||
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, s6
|
||||
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, s13
|
||||
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, s9
|
||||
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, s17
|
||||
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, s8
|
||||
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v10, s14
|
||||
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
|
||||
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32
|
||||
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, s11
|
||||
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v0, s16
|
||||
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, s10
|
||||
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, s15
|
||||
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, s5
|
||||
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
|
||||
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
|
||||
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
|
||||
; GCN-NOHSA-VI-NEXT: s_endpgm
|
||||
;
|
||||
; EG-LABEL: global_zextload_v32i8_to_v32i16:
|
||||
|
||||
@ -18,7 +18,7 @@ define amdgpu_kernel void @scalar_to_vector_v2i32(ptr addrspace(1) %out, ptr add
|
||||
; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
|
||||
; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
|
||||
; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
|
||||
; SI-NEXT: s_mov_b32 s4, s0
|
||||
; SI-NEXT: s_mov_b32 s5, s1
|
||||
; SI-NEXT: v_mov_b32_e32 v1, v0
|
||||
@ -40,7 +40,7 @@ define amdgpu_kernel void @scalar_to_vector_v2i32(ptr addrspace(1) %out, ptr add
|
||||
; VI-NEXT: s_mov_b32 s5, s1
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
|
||||
; VI-NEXT: v_alignbit_b32 v0, v1, v0, 16
|
||||
; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
|
||||
; VI-NEXT: v_mov_b32_e32 v1, v0
|
||||
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
@ -86,7 +86,7 @@ define amdgpu_kernel void @scalar_to_vector_v2f32(ptr addrspace(1) %out, ptr add
|
||||
; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
|
||||
; SI-NEXT: s_waitcnt vmcnt(0)
|
||||
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
|
||||
; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
|
||||
; SI-NEXT: v_lshr_b64 v[0:1], v[0:1], 16
|
||||
; SI-NEXT: s_mov_b32 s4, s0
|
||||
; SI-NEXT: s_mov_b32 s5, s1
|
||||
; SI-NEXT: v_mov_b32_e32 v1, v0
|
||||
@ -108,7 +108,7 @@ define amdgpu_kernel void @scalar_to_vector_v2f32(ptr addrspace(1) %out, ptr add
|
||||
; VI-NEXT: s_mov_b32 s5, s1
|
||||
; VI-NEXT: s_waitcnt vmcnt(0)
|
||||
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
|
||||
; VI-NEXT: v_alignbit_b32 v0, v1, v0, 16
|
||||
; VI-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1]
|
||||
; VI-NEXT: v_mov_b32_e32 v1, v0
|
||||
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
|
||||
; VI-NEXT: s_endpgm
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user