
Similar to InstCombinerImpl::freezeOtherUses, attempt to ensure that we merge multiple frozen/unfrozen uses of an SDValue. This fixes a number of hasOneUse() problems when trying to push FREEZE nodes through the DAG. Remove SimplifyMultipleUseDemandedBits handling of FREEZE nodes as we now want to keep the common node, and not bypass for some nodes just because of DemandedElts. Fixes #149799
88 lines
4.3 KiB
LLVM
88 lines
4.3 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
|
|
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -enable-var-scope --check-prefixes=GFX11-TRUE16 %s
|
|
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -enable-var-scope --check-prefixes=GFX11-FAKE16 %s
|
|
|
|
; Kernel under test: each work-item loads <8 x i8> from %ptr offset by its
; work-item id (in bytes), reverses the element order, converts every element
; from signed i8 to half, and stores the resulting <8 x half> to %out.
;
; The expected-ISA comment lines inside this function are autogenerated (see
; the NOTE at the top of the file). Regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
define amdgpu_kernel void @v_sext_in_reg_i8_i16_shuffle_vector(ptr addrspace(1) %out, ptr addrspace(1) %ptr) #0 {
|
|
;
|
|
; Expected ISA for the RUN line that passes -mattr=+real-true16.
; GFX11-TRUE16-LABEL: v_sext_in_reg_i8_i16_shuffle_vector:
|
|
; GFX11-TRUE16: ; %bb.0:
|
|
; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
|
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX11-TRUE16-NEXT: global_load_b64 v[1:2], v0, s[2:3]
|
|
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.h
|
|
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.h
|
|
; GFX11-TRUE16-NEXT: v_bfe_i32 v7, v2, 0, 8
|
|
; GFX11-TRUE16-NEXT: v_ashrrev_i32_e32 v5, 24, v2
|
|
; GFX11-TRUE16-NEXT: v_ashrrev_i32_e32 v6, 24, v1
|
|
; GFX11-TRUE16-NEXT: v_bfe_i32 v8, v3, 0, 8
|
|
; GFX11-TRUE16-NEXT: v_ashrrev_i16 v0.l, 8, v1.l
|
|
; GFX11-TRUE16-NEXT: v_bfe_i32 v1, v1, 0, 8
|
|
; GFX11-TRUE16-NEXT: v_ashrrev_i16 v0.h, 8, v2.l
|
|
; GFX11-TRUE16-NEXT: v_bfe_i32 v3, v4, 0, 8
|
|
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.l
|
|
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v8.l
|
|
; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v0.l, v0.l
|
|
; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v0.h, v0.h
|
|
; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v2.h, v6.l
|
|
; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v4.h, v5.l
|
|
; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v1.l, v1.l
|
|
; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v1.h, v2.l
|
|
; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v2.l, v3.l
|
|
; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v4.l, v4.l
|
|
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, 0
|
|
; GFX11-TRUE16-NEXT: v_pack_b32_f16 v3, v0.l, v1.l
|
|
; GFX11-TRUE16-NEXT: v_pack_b32_f16 v1, v0.h, v1.h
|
|
; GFX11-TRUE16-NEXT: v_pack_b32_f16 v2, v2.h, v2.l
|
|
; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v4.h, v4.l
|
|
; GFX11-TRUE16-NEXT: global_store_b128 v5, v[0:3], s[0:1]
|
|
; GFX11-TRUE16-NEXT: s_endpgm
|
|
;
|
|
; Expected ISA for the RUN line that passes -mattr=-real-true16.
; GFX11-FAKE16-LABEL: v_sext_in_reg_i8_i16_shuffle_vector:
|
|
; GFX11-FAKE16: ; %bb.0:
|
|
; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
|
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
|
|
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v10, 0
|
|
; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX11-FAKE16-NEXT: global_load_b64 v[0:1], v0, s[2:3]
|
|
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1
|
|
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0
|
|
; GFX11-FAKE16-NEXT: v_ashrrev_i32_e32 v3, 24, v1
|
|
; GFX11-FAKE16-NEXT: v_ashrrev_i32_e32 v5, 24, v0
|
|
; GFX11-FAKE16-NEXT: v_ashrrev_i16 v6, 8, v1
|
|
; GFX11-FAKE16-NEXT: v_bfe_i32 v7, v0, 0, 8
|
|
; GFX11-FAKE16-NEXT: v_ashrrev_i16 v0, 8, v0
|
|
; GFX11-FAKE16-NEXT: v_bfe_i32 v1, v1, 0, 8
|
|
; GFX11-FAKE16-NEXT: v_bfe_i32 v2, v2, 0, 8
|
|
; GFX11-FAKE16-NEXT: v_bfe_i32 v4, v4, 0, 8
|
|
; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v7, v7
|
|
; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v0, v0
|
|
; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v1, v1
|
|
; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v6, v6
|
|
; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v5, v5
|
|
; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v8, v3
|
|
; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v4, v4
|
|
; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v9, v2
|
|
; GFX11-FAKE16-NEXT: v_pack_b32_f16 v3, v0, v7
|
|
; GFX11-FAKE16-NEXT: v_pack_b32_f16 v1, v6, v1
|
|
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
|
|
; GFX11-FAKE16-NEXT: v_pack_b32_f16 v2, v5, v4
|
|
; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v8, v9
|
|
; GFX11-FAKE16-NEXT: global_store_b128 v10, v[0:3], s[0:1]
|
|
; GFX11-FAKE16-NEXT: s_endpgm
|
|
; Work-item id in the x dimension; used below as the per-lane index.
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
|
; Index %ptr by %tid through a zero-length i8 array, i.e. a byte offset of %tid.
%in.gep = getelementptr <{ [0 x i8] }>, ptr addrspace(1) %ptr, i64 0, i32 0, i32 %tid
|
|
%load = load <8 x i8>, ptr addrspace(1) %in.gep
|
|
; Mask <7, 6, ..., 0> reverses the order of the eight loaded bytes.
%shuff = shufflevector <8 x i8> %load, <8 x i8> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
|
|
; Signed integer to half conversion of every element.
%cast = sitofp <8 x i8> %shuff to <8 x half>
|
|
; Write all eight half values to %out.
store <8 x half> %cast, ptr addrspace(1) %out
|
|
ret void
|
|
}
|
|
|
|
; Intrinsic called by the kernel above to obtain %tid; uses attribute group #1.
declare i32 @llvm.amdgcn.workitem.id.x() #1
|
|
|
|
; Attribute group #0 is attached to the kernel definition.
attributes #0 = { nounwind }
|
|
; Attribute group #1 is attached to the intrinsic declaration.
; NOTE(review): `readnone` looks like the older spelling of what current LLVM
; prints as `memory(none)` - confirm whether this test should be refreshed.
attributes #1 = { nounwind readnone }
|