[GlobalISel] Remove UnsafeFPMath references (#146319)

This is the GlobalISel part of removing the `UnsafeFPMath` flag from the
CodeGen pipeline.
This commit is contained in:
paperchalice 2025-07-29 12:11:52 +08:00 committed by GitHub
parent b39160ddfb
commit ce86ff105b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 5618 additions and 2758 deletions

View File

@ -5949,8 +5949,7 @@ bool CombinerHelper::canCombineFMadOrFMA(MachineInstr &MI,
const TargetOptions &Options = MF->getTarget().Options;
LLT DstType = MRI.getType(MI.getOperand(0).getReg());
if (CanReassociate &&
!(Options.UnsafeFPMath || MI.getFlag(MachineInstr::MIFlag::FmReassoc)))
if (CanReassociate && !MI.getFlag(MachineInstr::MIFlag::FmReassoc))
return false;
// Floating-point multiply-add with intermediate rounding.
@ -5962,8 +5961,7 @@ bool CombinerHelper::canCombineFMadOrFMA(MachineInstr &MI,
if (!HasFMAD && !HasFMA)
return false;
AllowFusionGlobally = Options.AllowFPOpFusion == FPOpFusion::Fast ||
Options.UnsafeFPMath || HasFMAD;
AllowFusionGlobally = Options.AllowFPOpFusion == FPOpFusion::Fast || HasFMAD;
// If the addition is not contractable, do not combine.
if (!AllowFusionGlobally && !MI.getFlag(MachineInstr::MIFlag::FmContract))
return false;

View File

@ -8004,7 +8004,7 @@ LegalizerHelper::lowerFPTRUNC_F64_TO_F16(MachineInstr &MI) {
if (MRI.getType(Src).isVector()) // TODO: Handle vectors directly.
return UnableToLegalize;
if (MIRBuilder.getMF().getTarget().Options.UnsafeFPMath) {
if (MI.getFlag(MachineInstr::FmAfn)) {
unsigned Flags = MI.getFlags();
auto Src32 = MIRBuilder.buildFPTrunc(S32, Src, Flags);
MIRBuilder.buildFPTrunc(Dst, Src32, Flags);

File diff suppressed because it is too large Load Diff

View File

@ -24,8 +24,8 @@ body: |
%ptr:_(p1) = COPY $vgpr2_vgpr3
%vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
%el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
%6:_(s32) = G_FMUL %0, %1
%7:_(s32) = G_FADD %6, %el1
%6:_(s32) = contract G_FMUL %0, %1
%7:_(s32) = contract G_FADD %6, %el1
$vgpr0 = COPY %7(s32)
...
@ -54,8 +54,8 @@ body: |
%ptr:_(p1) = COPY $vgpr2_vgpr3
%vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
%el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
%6:_(s32) = G_FMUL %0, %1
%7:_(s32) = G_FADD %el1, %6
%6:_(s32) = contract G_FMUL %0, %1
%7:_(s32) = contract G_FADD %el1, %6
$vgpr0 = COPY %7(s32)
...
@ -233,10 +233,10 @@ body: |
%7:_(s16) = G_TRUNC %6(s32)
%8:_(s32) = COPY $vgpr5
%9:_(s16) = G_TRUNC %8(s32)
%10:_(s16) = G_FMUL %7, %9
%10:_(s16) = contract G_FMUL %7, %9
%11:_(s32) = G_FPEXT %10(s16)
%12:_(s32) = G_FMA %0, %1, %11
%13:_(s32) = G_FADD %12, %el1
%13:_(s32) = contract G_FADD %12, %el1
$vgpr0 = COPY %13(s32)
...
@ -282,11 +282,11 @@ body: |
%9:_(s16) = G_TRUNC %8(s32)
%10:_(s32) = COPY $vgpr5
%11:_(s16) = G_TRUNC %10(s32)
%12:_(s16) = G_FMUL %9, %11
%13:_(s16) = G_FMUL %1, %3
%14:_(s16) = G_FADD %13, %12
%12:_(s16) = contract G_FMUL %9, %11
%13:_(s16) = contract G_FMUL %1, %3
%14:_(s16) = contract G_FADD %13, %12
%15:_(s32) = G_FPEXT %14(s16)
%16:_(s32) = G_FADD %15, %el1
%16:_(s32) = contract G_FADD %15, %el1
$vgpr0 = COPY %16(s32)
...
@ -326,10 +326,10 @@ body: |
%7:_(s16) = G_TRUNC %6(s32)
%8:_(s32) = COPY $vgpr5
%9:_(s16) = G_TRUNC %8(s32)
%10:_(s16) = G_FMUL %7, %9
%10:_(s16) = contract G_FMUL %7, %9
%11:_(s32) = G_FPEXT %10(s16)
%12:_(s32) = G_FMA %4, %5, %11
%13:_(s32) = G_FADD %el1, %12
%13:_(s32) = contract G_FADD %el1, %12
$vgpr0 = COPY %13(s32)
...
@ -375,11 +375,11 @@ body: |
%9:_(s16) = G_TRUNC %8(s32)
%10:_(s32) = COPY $vgpr5
%11:_(s16) = G_TRUNC %10(s32)
%12:_(s16) = G_FMUL %9, %11
%13:_(s16) = G_FMUL %5, %7
%14:_(s16) = G_FADD %13, %12
%12:_(s16) = contract G_FMUL %9, %11
%13:_(s16) = contract G_FMUL %5, %7
%14:_(s16) = contract G_FADD %13, %12
%15:_(s32) = G_FPEXT %14(s16)
%16:_(s32) = G_FADD %el1, %15
%16:_(s32) = contract G_FADD %el1, %15
$vgpr0 = COPY %16(s32)
...
@ -409,8 +409,8 @@ body: |
%ptr:_(p1) = COPY $vgpr0_vgpr1
%vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
%el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
%6:_(s32) = G_FMUL %0, %1
%7:_(s32) = G_FSUB %6, %el1
%6:_(s32) = contract G_FMUL %0, %1
%7:_(s32) = contract G_FSUB %6, %el1
$vgpr0 = COPY %7(s32)
...
@ -440,7 +440,7 @@ body: |
%ptr:_(p1) = COPY $vgpr2_vgpr3
%vec:_(<2 x s32>) = G_LOAD %ptr(p1) :: (load (<2 x s32>), addrspace 1)
%el0:_(s32), %el1:_(s32) = G_UNMERGE_VALUES %vec(<2 x s32>)
%6:_(s32) = G_FMUL %0, %1
%7:_(s32) = G_FSUB %el1, %6
%6:_(s32) = contract G_FMUL %0, %1
%7:_(s32) = contract G_FSUB %el1, %6
$vgpr0 = COPY %7(s32)
...

View File

@ -385,117 +385,16 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
; CHECK-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>)
; CHECK-NEXT: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV]](s64)
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 20
; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C]](s32)
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2047
; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C1]]
; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1008
; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[AND]], [[C2]]
; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C3]](s32)
; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4094
; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR1]], [[C4]]
; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 511
; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[UV3]], [[C5]]
; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND2]], [[UV2]]
; CHECK-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[OR]](s32), [[C6]]
; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP]](s1)
; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[AND1]], [[ZEXT]]
; CHECK-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 512
; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[OR1]](s32), [[C6]]
; CHECK-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[ICMP1]](s1), [[C7]], [[C6]]
; CHECK-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 31744
; CHECK-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SELECT]], [[C8]]
; CHECK-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 12
; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ADD]], [[C9]](s32)
; CHECK-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[SHL]]
; CHECK-NEXT: [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[C10]], [[ADD]]
; CHECK-NEXT: [[SMAX:%[0-9]+]]:_(s32) = G_SMAX [[SUB]], [[C6]]
; CHECK-NEXT: [[C11:%[0-9]+]]:_(s32) = G_CONSTANT i32 13
; CHECK-NEXT: [[SMIN:%[0-9]+]]:_(s32) = G_SMIN [[SMAX]], [[C11]]
; CHECK-NEXT: [[C12:%[0-9]+]]:_(s32) = G_CONSTANT i32 4096
; CHECK-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[OR1]], [[C12]]
; CHECK-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[OR4]], [[SMIN]](s32)
; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LSHR2]], [[SMIN]](s32)
; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SHL1]](s32), [[OR4]]
; CHECK-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP2]](s1)
; CHECK-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[LSHR2]], [[ZEXT1]]
; CHECK-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[ADD]](s32), [[C10]]
; CHECK-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP3]](s1), [[OR5]], [[OR3]]
; CHECK-NEXT: [[C13:%[0-9]+]]:_(s32) = G_CONSTANT i32 7
; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[SELECT1]], [[C13]]
; CHECK-NEXT: [[C14:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
; CHECK-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[SELECT1]], [[C14]](s32)
; CHECK-NEXT: [[C15:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
; CHECK-NEXT: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[AND3]](s32), [[C15]]
; CHECK-NEXT: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP4]](s1)
; CHECK-NEXT: [[C16:%[0-9]+]]:_(s32) = G_CONSTANT i32 5
; CHECK-NEXT: [[ICMP5:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[AND3]](s32), [[C16]]
; CHECK-NEXT: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP5]](s1)
; CHECK-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[ZEXT3]]
; CHECK-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[LSHR3]], [[OR6]]
; CHECK-NEXT: [[C17:%[0-9]+]]:_(s32) = G_CONSTANT i32 30
; CHECK-NEXT: [[ICMP6:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[ADD]](s32), [[C17]]
; CHECK-NEXT: [[SELECT2:%[0-9]+]]:_(s32) = G_SELECT [[ICMP6]](s1), [[C8]], [[ADD1]]
; CHECK-NEXT: [[C18:%[0-9]+]]:_(s32) = G_CONSTANT i32 1039
; CHECK-NEXT: [[ICMP7:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[ADD]](s32), [[C18]]
; CHECK-NEXT: [[SELECT3:%[0-9]+]]:_(s32) = G_SELECT [[ICMP7]](s1), [[OR2]], [[SELECT2]]
; CHECK-NEXT: [[C19:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
; CHECK-NEXT: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[UV3]], [[C19]](s32)
; CHECK-NEXT: [[C20:%[0-9]+]]:_(s32) = G_CONSTANT i32 32768
; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[LSHR4]], [[C20]]
; CHECK-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[AND4]], [[SELECT3]]
; CHECK-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64)
; CHECK-NEXT: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[UV5]], [[C]](s32)
; CHECK-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[LSHR5]], [[C1]]
; CHECK-NEXT: [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[AND5]], [[C2]]
; CHECK-NEXT: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[UV5]], [[C3]](s32)
; CHECK-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[LSHR6]], [[C4]]
; CHECK-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[UV5]], [[C5]]
; CHECK-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[AND7]], [[UV4]]
; CHECK-NEXT: [[ICMP8:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[OR8]](s32), [[C6]]
; CHECK-NEXT: [[ZEXT4:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP8]](s1)
; CHECK-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[AND6]], [[ZEXT4]]
; CHECK-NEXT: [[ICMP9:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[OR9]](s32), [[C6]]
; CHECK-NEXT: [[SELECT4:%[0-9]+]]:_(s32) = G_SELECT [[ICMP9]](s1), [[C7]], [[C6]]
; CHECK-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SELECT4]], [[C8]]
; CHECK-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ADD2]], [[C9]](s32)
; CHECK-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[OR9]], [[SHL2]]
; CHECK-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[C10]], [[ADD2]]
; CHECK-NEXT: [[SMAX1:%[0-9]+]]:_(s32) = G_SMAX [[SUB1]], [[C6]]
; CHECK-NEXT: [[SMIN1:%[0-9]+]]:_(s32) = G_SMIN [[SMAX1]], [[C11]]
; CHECK-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[OR9]], [[C12]]
; CHECK-NEXT: [[LSHR7:%[0-9]+]]:_(s32) = G_LSHR [[OR12]], [[SMIN1]](s32)
; CHECK-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LSHR7]], [[SMIN1]](s32)
; CHECK-NEXT: [[ICMP10:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SHL3]](s32), [[OR12]]
; CHECK-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP10]](s1)
; CHECK-NEXT: [[OR13:%[0-9]+]]:_(s32) = G_OR [[LSHR7]], [[ZEXT5]]
; CHECK-NEXT: [[ICMP11:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[ADD2]](s32), [[C10]]
; CHECK-NEXT: [[SELECT5:%[0-9]+]]:_(s32) = G_SELECT [[ICMP11]](s1), [[OR13]], [[OR11]]
; CHECK-NEXT: [[AND8:%[0-9]+]]:_(s32) = G_AND [[SELECT5]], [[C13]]
; CHECK-NEXT: [[LSHR8:%[0-9]+]]:_(s32) = G_LSHR [[SELECT5]], [[C14]](s32)
; CHECK-NEXT: [[ICMP12:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[AND8]](s32), [[C15]]
; CHECK-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP12]](s1)
; CHECK-NEXT: [[ICMP13:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[AND8]](s32), [[C16]]
; CHECK-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP13]](s1)
; CHECK-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[ZEXT6]], [[ZEXT7]]
; CHECK-NEXT: [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[LSHR8]], [[OR14]]
; CHECK-NEXT: [[ICMP14:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[ADD2]](s32), [[C17]]
; CHECK-NEXT: [[SELECT6:%[0-9]+]]:_(s32) = G_SELECT [[ICMP14]](s1), [[C8]], [[ADD3]]
; CHECK-NEXT: [[ICMP15:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[ADD2]](s32), [[C18]]
; CHECK-NEXT: [[SELECT7:%[0-9]+]]:_(s32) = G_SELECT [[ICMP15]](s1), [[OR10]], [[SELECT6]]
; CHECK-NEXT: [[LSHR9:%[0-9]+]]:_(s32) = G_LSHR [[UV5]], [[C19]](s32)
; CHECK-NEXT: [[AND9:%[0-9]+]]:_(s32) = G_AND [[LSHR9]], [[C20]]
; CHECK-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[AND9]], [[SELECT7]]
; CHECK-NEXT: [[C21:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
; CHECK-NEXT: [[AND10:%[0-9]+]]:_(s32) = G_AND [[OR7]], [[C21]]
; CHECK-NEXT: [[AND11:%[0-9]+]]:_(s32) = G_AND [[OR15]], [[C21]]
; CHECK-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[AND11]], [[C19]](s32)
; CHECK-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[AND10]], [[SHL4]]
; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR16]](s32)
; CHECK-NEXT: [[FPTRUNC:%[0-9]+]]:_(s32) = afn G_FPTRUNC [[UV]](s64)
; CHECK-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = afn G_FPTRUNC [[FPTRUNC]](s32)
; CHECK-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s32) = afn G_FPTRUNC [[UV1]](s64)
; CHECK-NEXT: [[FPTRUNC3:%[0-9]+]]:_(s16) = afn G_FPTRUNC [[FPTRUNC2]](s32)
; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC1]](s16)
; CHECK-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[FPTRUNC3]](s16)
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32)
; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL]]
; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
; CHECK-NEXT: $vgpr0 = COPY [[BITCAST]](<2 x s16>)
%0:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
%1:_(<2 x s16>) = afn G_FPTRUNC %0

View File

@ -1,16 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -global-isel=0 < %s | FileCheck -enable-var-scope -check-prefixes=SI-SDAG %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -global-isel=1 -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI-GISEL %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -global-isel=1 < %s | FileCheck -check-prefixes=SI-GISEL %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -global-isel=0 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=VI-SDAG %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -global-isel=1 -mattr=-flat-for-global -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=VI-GISEL %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -global-isel=1 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=VI-GISEL %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 -mattr=-flat-for-global -denormal-fp-math=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-SDAG %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-GISEL %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 -mattr=-flat-for-global -denormal-fp-math=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-GISEL %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 -mattr=-flat-for-global -denormal-fp-math=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GFX950-SDAG %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx950 -global-isel=1 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX950-GISEL %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx950 -global-isel=1 -mattr=-flat-for-global -denormal-fp-math=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GFX950-GISEL %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global,+real-true16 -denormal-fp-math=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SDAG-TRUE16 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-flat-for-global,-real-true16 -denormal-fp-math=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-SDAG-FAKE16 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global,+real-true16 -denormal-fp-math=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GISEL-TRUE16 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global,-real-true16 -denormal-fp-math=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GISEL-FAKE16 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global,+real-true16 -denormal-fp-math=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GISEL-TRUE16 %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-flat-for-global,-real-true16 -denormal-fp-math=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GFX11-GISEL-FAKE16 %s
define amdgpu_kernel void @fptrunc_f32_to_f16(
; SI-SDAG-LABEL: fptrunc_f32_to_f16:
@ -457,9 +457,49 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; SI-GISEL-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
; SI-GISEL-NEXT: s_mov_b32 s2, -1
; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; SI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-GISEL-NEXT: s_bfe_u32 s3, s5, 0xb0014
; SI-GISEL-NEXT: s_lshr_b32 s6, s5, 8
; SI-GISEL-NEXT: s_and_b32 s7, s5, 0x1ff
; SI-GISEL-NEXT: s_addk_i32 s3, 0xfc10
; SI-GISEL-NEXT: s_and_b32 s6, s6, 0xffe
; SI-GISEL-NEXT: s_or_b32 s4, s7, s4
; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; SI-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; SI-GISEL-NEXT: s_or_b32 s4, s6, s4
; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; SI-GISEL-NEXT: s_cselect_b32 s6, 1, 0
; SI-GISEL-NEXT: s_lshl_b32 s6, s6, 9
; SI-GISEL-NEXT: s_lshl_b32 s7, s3, 12
; SI-GISEL-NEXT: s_sub_i32 s8, 1, s3
; SI-GISEL-NEXT: s_or_b32 s9, s4, 0x1000
; SI-GISEL-NEXT: s_or_b32 s6, s6, 0x7c00
; SI-GISEL-NEXT: s_or_b32 s4, s4, s7
; SI-GISEL-NEXT: s_max_i32 s7, s8, 0
; SI-GISEL-NEXT: s_min_i32 s7, s7, 13
; SI-GISEL-NEXT: s_lshr_b32 s8, s9, s7
; SI-GISEL-NEXT: s_lshl_b32 s7, s8, s7
; SI-GISEL-NEXT: s_cmp_lg_u32 s7, s9
; SI-GISEL-NEXT: s_cselect_b32 s7, 1, 0
; SI-GISEL-NEXT: s_or_b32 s7, s8, s7
; SI-GISEL-NEXT: s_cmp_lt_i32 s3, 1
; SI-GISEL-NEXT: s_cselect_b32 s4, s7, s4
; SI-GISEL-NEXT: s_and_b32 s7, s4, 7
; SI-GISEL-NEXT: s_lshr_b32 s4, s4, 2
; SI-GISEL-NEXT: s_cmp_eq_u32 s7, 3
; SI-GISEL-NEXT: s_cselect_b32 s8, 1, 0
; SI-GISEL-NEXT: s_cmp_gt_i32 s7, 5
; SI-GISEL-NEXT: s_cselect_b32 s7, 1, 0
; SI-GISEL-NEXT: s_or_b32 s7, s8, s7
; SI-GISEL-NEXT: s_add_i32 s4, s4, s7
; SI-GISEL-NEXT: s_cmp_gt_i32 s3, 30
; SI-GISEL-NEXT: s_cselect_b32 s4, 0x7c00, s4
; SI-GISEL-NEXT: s_cmpk_eq_i32 s3, 0x40f
; SI-GISEL-NEXT: s_cselect_b32 s3, s6, s4
; SI-GISEL-NEXT: s_lshr_b32 s4, s5, 16
; SI-GISEL-NEXT: s_and_b32 s4, s4, 0x8000
; SI-GISEL-NEXT: s_or_b32 s4, s4, s3
; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000
; SI-GISEL-NEXT: v_mov_b32_e32 v0, s4
; SI-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
; SI-GISEL-NEXT: s_endpgm
;
@ -529,10 +569,50 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
; VI-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014
; VI-GISEL-NEXT: s_lshr_b32 s5, s3, 8
; VI-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
; VI-GISEL-NEXT: s_addk_i32 s4, 0xfc10
; VI-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
; VI-GISEL-NEXT: s_or_b32 s2, s6, s2
; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; VI-GISEL-NEXT: s_cselect_b32 s2, 1, 0
; VI-GISEL-NEXT: s_or_b32 s2, s5, s2
; VI-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; VI-GISEL-NEXT: s_sub_i32 s7, 1, s4
; VI-GISEL-NEXT: s_lshl_b32 s6, s4, 12
; VI-GISEL-NEXT: s_max_i32 s7, s7, 0
; VI-GISEL-NEXT: s_or_b32 s6, s2, s6
; VI-GISEL-NEXT: s_min_i32 s7, s7, 13
; VI-GISEL-NEXT: s_bitset1_b32 s2, 12
; VI-GISEL-NEXT: s_lshl_b32 s5, s5, 9
; VI-GISEL-NEXT: s_lshr_b32 s8, s2, s7
; VI-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00
; VI-GISEL-NEXT: s_lshl_b32 s7, s8, s7
; VI-GISEL-NEXT: s_cmp_lg_u32 s7, s2
; VI-GISEL-NEXT: s_cselect_b32 s2, 1, 0
; VI-GISEL-NEXT: s_or_b32 s2, s8, s2
; VI-GISEL-NEXT: s_cmp_lt_i32 s4, 1
; VI-GISEL-NEXT: s_cselect_b32 s2, s2, s6
; VI-GISEL-NEXT: s_and_b32 s6, s2, 7
; VI-GISEL-NEXT: s_lshr_b32 s2, s2, 2
; VI-GISEL-NEXT: s_cmp_eq_u32 s6, 3
; VI-GISEL-NEXT: s_cselect_b32 s7, 1, 0
; VI-GISEL-NEXT: s_cmp_gt_i32 s6, 5
; VI-GISEL-NEXT: s_cselect_b32 s6, 1, 0
; VI-GISEL-NEXT: s_or_b32 s6, s7, s6
; VI-GISEL-NEXT: s_add_i32 s2, s2, s6
; VI-GISEL-NEXT: s_cmp_gt_i32 s4, 30
; VI-GISEL-NEXT: s_cselect_b32 s2, 0x7c00, s2
; VI-GISEL-NEXT: s_cmpk_eq_i32 s4, 0x40f
; VI-GISEL-NEXT: s_cselect_b32 s2, s5, s2
; VI-GISEL-NEXT: s_lshr_b32 s3, s3, 16
; VI-GISEL-NEXT: s_and_b32 s3, s3, 0x8000
; VI-GISEL-NEXT: s_or_b32 s2, s3, s2
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
; VI-GISEL-NEXT: s_mov_b32 s2, -1
; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
; VI-GISEL-NEXT: s_endpgm
;
@ -602,10 +682,50 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
; GFX9-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014
; GFX9-GISEL-NEXT: s_lshr_b32 s5, s3, 8
; GFX9-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX9-GISEL-NEXT: s_addk_i32 s4, 0xfc10
; GFX9-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
; GFX9-GISEL-NEXT: s_or_b32 s2, s6, s2
; GFX9-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX9-GISEL-NEXT: s_cselect_b32 s2, 1, 0
; GFX9-GISEL-NEXT: s_or_b32 s2, s5, s2
; GFX9-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX9-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; GFX9-GISEL-NEXT: s_sub_i32 s7, 1, s4
; GFX9-GISEL-NEXT: s_lshl_b32 s6, s4, 12
; GFX9-GISEL-NEXT: s_max_i32 s7, s7, 0
; GFX9-GISEL-NEXT: s_or_b32 s6, s2, s6
; GFX9-GISEL-NEXT: s_min_i32 s7, s7, 13
; GFX9-GISEL-NEXT: s_bitset1_b32 s2, 12
; GFX9-GISEL-NEXT: s_lshl_b32 s5, s5, 9
; GFX9-GISEL-NEXT: s_lshr_b32 s8, s2, s7
; GFX9-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00
; GFX9-GISEL-NEXT: s_lshl_b32 s7, s8, s7
; GFX9-GISEL-NEXT: s_cmp_lg_u32 s7, s2
; GFX9-GISEL-NEXT: s_cselect_b32 s2, 1, 0
; GFX9-GISEL-NEXT: s_or_b32 s2, s8, s2
; GFX9-GISEL-NEXT: s_cmp_lt_i32 s4, 1
; GFX9-GISEL-NEXT: s_cselect_b32 s2, s2, s6
; GFX9-GISEL-NEXT: s_and_b32 s6, s2, 7
; GFX9-GISEL-NEXT: s_lshr_b32 s2, s2, 2
; GFX9-GISEL-NEXT: s_cmp_eq_u32 s6, 3
; GFX9-GISEL-NEXT: s_cselect_b32 s7, 1, 0
; GFX9-GISEL-NEXT: s_cmp_gt_i32 s6, 5
; GFX9-GISEL-NEXT: s_cselect_b32 s6, 1, 0
; GFX9-GISEL-NEXT: s_or_b32 s6, s7, s6
; GFX9-GISEL-NEXT: s_add_i32 s2, s2, s6
; GFX9-GISEL-NEXT: s_cmp_gt_i32 s4, 30
; GFX9-GISEL-NEXT: s_cselect_b32 s2, 0x7c00, s2
; GFX9-GISEL-NEXT: s_cmpk_eq_i32 s4, 0x40f
; GFX9-GISEL-NEXT: s_cselect_b32 s2, s5, s2
; GFX9-GISEL-NEXT: s_lshr_b32 s3, s3, 16
; GFX9-GISEL-NEXT: s_and_b32 s3, s3, 0x8000
; GFX9-GISEL-NEXT: s_or_b32 s2, s3, s2
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2
; GFX9-GISEL-NEXT: s_mov_b32 s2, -1
; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000
; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX9-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX9-GISEL-NEXT: s_endpgm
;
@ -675,8 +795,48 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
; GFX950-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX950-GISEL-NEXT: s_bfe_u32 s4, s3, 0xb0014
; GFX950-GISEL-NEXT: s_lshr_b32 s5, s3, 8
; GFX950-GISEL-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX950-GISEL-NEXT: s_addk_i32 s4, 0xfc10
; GFX950-GISEL-NEXT: s_and_b32 s5, s5, 0xffe
; GFX950-GISEL-NEXT: s_or_b32 s2, s6, s2
; GFX950-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX950-GISEL-NEXT: s_cselect_b32 s2, 1, 0
; GFX950-GISEL-NEXT: s_or_b32 s2, s5, s2
; GFX950-GISEL-NEXT: s_cmp_lg_u32 s2, 0
; GFX950-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; GFX950-GISEL-NEXT: s_sub_i32 s7, 1, s4
; GFX950-GISEL-NEXT: s_lshl_b32 s6, s4, 12
; GFX950-GISEL-NEXT: s_max_i32 s7, s7, 0
; GFX950-GISEL-NEXT: s_or_b32 s6, s2, s6
; GFX950-GISEL-NEXT: s_min_i32 s7, s7, 13
; GFX950-GISEL-NEXT: s_bitset1_b32 s2, 12
; GFX950-GISEL-NEXT: s_lshl_b32 s5, s5, 9
; GFX950-GISEL-NEXT: s_lshr_b32 s8, s2, s7
; GFX950-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00
; GFX950-GISEL-NEXT: s_lshl_b32 s7, s8, s7
; GFX950-GISEL-NEXT: s_cmp_lg_u32 s7, s2
; GFX950-GISEL-NEXT: s_cselect_b32 s2, 1, 0
; GFX950-GISEL-NEXT: s_or_b32 s2, s8, s2
; GFX950-GISEL-NEXT: s_cmp_lt_i32 s4, 1
; GFX950-GISEL-NEXT: s_cselect_b32 s2, s2, s6
; GFX950-GISEL-NEXT: s_and_b32 s6, s2, 7
; GFX950-GISEL-NEXT: s_lshr_b32 s2, s2, 2
; GFX950-GISEL-NEXT: s_cmp_eq_u32 s6, 3
; GFX950-GISEL-NEXT: s_cselect_b32 s7, 1, 0
; GFX950-GISEL-NEXT: s_cmp_gt_i32 s6, 5
; GFX950-GISEL-NEXT: s_cselect_b32 s6, 1, 0
; GFX950-GISEL-NEXT: s_or_b32 s6, s7, s6
; GFX950-GISEL-NEXT: s_add_i32 s2, s2, s6
; GFX950-GISEL-NEXT: s_cmp_gt_i32 s4, 30
; GFX950-GISEL-NEXT: s_cselect_b32 s2, 0x7c00, s2
; GFX950-GISEL-NEXT: s_cmpk_eq_i32 s4, 0x40f
; GFX950-GISEL-NEXT: s_cselect_b32 s2, s5, s2
; GFX950-GISEL-NEXT: s_lshr_b32 s3, s3, 16
; GFX950-GISEL-NEXT: s_and_b32 s3, s3, 0x8000
; GFX950-GISEL-NEXT: s_or_b32 s2, s3, s2
; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, s2
; GFX950-GISEL-NEXT: s_mov_b32 s2, -1
; GFX950-GISEL-NEXT: s_mov_b32 s3, 0xf000
; GFX950-GISEL-NEXT: buffer_store_short v0, off, s[0:3], 0
@ -822,11 +982,54 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-TRUE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-TRUE16-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s2, -1
; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX11-GISEL-TRUE16-NEXT: s_bfe_u32 s4, s3, 0xb0014
; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s5, s3, 8
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s6, s2
; GFX11-GISEL-TRUE16-NEXT: s_addk_i32 s4, 0xfc10
; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe
; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s2, 1, 0
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s5, s2
; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s5, 1, 0
; GFX11-GISEL-TRUE16-NEXT: s_sub_i32 s6, 1, s4
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s8, s2, 0x1000
; GFX11-GISEL-TRUE16-NEXT: s_max_i32 s6, s6, 0
; GFX11-GISEL-TRUE16-NEXT: s_lshl_b32 s7, s4, 12
; GFX11-GISEL-TRUE16-NEXT: s_min_i32 s6, s6, 13
; GFX11-GISEL-TRUE16-NEXT: s_lshl_b32 s5, s5, 9
; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s9, s8, s6
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s2, s7
; GFX11-GISEL-TRUE16-NEXT: s_lshl_b32 s6, s9, s6
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s5, s5, 0x7c00
; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s6, s8
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s6, 1, 0
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s6, s9, s6
; GFX11-GISEL-TRUE16-NEXT: s_cmp_lt_i32 s4, 1
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s2, s6, s2
; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s6, s2, 7
; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s2, s2, 2
; GFX11-GISEL-TRUE16-NEXT: s_cmp_eq_u32 s6, 3
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s7, 1, 0
; GFX11-GISEL-TRUE16-NEXT: s_cmp_gt_i32 s6, 5
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s6, 1, 0
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s6, s7, s6
; GFX11-GISEL-TRUE16-NEXT: s_add_i32 s2, s2, s6
; GFX11-GISEL-TRUE16-NEXT: s_cmp_gt_i32 s4, 30
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s2, 0x7c00, s2
; GFX11-GISEL-TRUE16-NEXT: s_cmpk_eq_i32 s4, 0x40f
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s2, s5, s2
; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s3, 16
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0x8000
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s3, s2
; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v0, s2
; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s2, -1
; GFX11-GISEL-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-GISEL-TRUE16-NEXT: s_endpgm
;
@ -836,11 +1039,54 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-FAKE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-FAKE16-NEXT: v_cvt_f32_f64_e32 v0, s[2:3]
; GFX11-GISEL-FAKE16-NEXT: s_mov_b32 s2, -1
; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX11-GISEL-FAKE16-NEXT: s_bfe_u32 s4, s3, 0xb0014
; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s5, s3, 8
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s6, s2
; GFX11-GISEL-FAKE16-NEXT: s_addk_i32 s4, 0xfc10
; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe
; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s2, 1, 0
; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s5, s2
; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s2, 0
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s5, 1, 0
; GFX11-GISEL-FAKE16-NEXT: s_sub_i32 s6, 1, s4
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s8, s2, 0x1000
; GFX11-GISEL-FAKE16-NEXT: s_max_i32 s6, s6, 0
; GFX11-GISEL-FAKE16-NEXT: s_lshl_b32 s7, s4, 12
; GFX11-GISEL-FAKE16-NEXT: s_min_i32 s6, s6, 13
; GFX11-GISEL-FAKE16-NEXT: s_lshl_b32 s5, s5, 9
; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s9, s8, s6
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s2, s7
; GFX11-GISEL-FAKE16-NEXT: s_lshl_b32 s6, s9, s6
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s5, s5, 0x7c00
; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s6, s8
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s6, 1, 0
; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s6, s9, s6
; GFX11-GISEL-FAKE16-NEXT: s_cmp_lt_i32 s4, 1
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s2, s6, s2
; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s6, s2, 7
; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s2, s2, 2
; GFX11-GISEL-FAKE16-NEXT: s_cmp_eq_u32 s6, 3
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s7, 1, 0
; GFX11-GISEL-FAKE16-NEXT: s_cmp_gt_i32 s6, 5
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s6, 1, 0
; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s6, s7, s6
; GFX11-GISEL-FAKE16-NEXT: s_add_i32 s2, s2, s6
; GFX11-GISEL-FAKE16-NEXT: s_cmp_gt_i32 s4, 30
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s2, 0x7c00, s2
; GFX11-GISEL-FAKE16-NEXT: s_cmpk_eq_i32 s4, 0x40f
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s2, s5, s2
; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s3, 16
; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0x8000
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s3, s2
; GFX11-GISEL-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX11-GISEL-FAKE16-NEXT: v_mov_b32_e32 v0, s2
; GFX11-GISEL-FAKE16-NEXT: s_mov_b32 s2, -1
; GFX11-GISEL-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0
; GFX11-GISEL-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,
@ -1644,13 +1890,94 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; SI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; SI-GISEL-NEXT: s_mov_b32 s2, -1
; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; SI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
; SI-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
; SI-GISEL-NEXT: s_bfe_u32 s3, s5, 0xb0014
; SI-GISEL-NEXT: s_lshr_b32 s8, s5, 8
; SI-GISEL-NEXT: s_and_b32 s9, s5, 0x1ff
; SI-GISEL-NEXT: s_addk_i32 s3, 0xfc10
; SI-GISEL-NEXT: s_and_b32 s8, s8, 0xffe
; SI-GISEL-NEXT: s_or_b32 s4, s9, s4
; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; SI-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; SI-GISEL-NEXT: s_or_b32 s4, s8, s4
; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; SI-GISEL-NEXT: s_cselect_b32 s8, 1, 0
; SI-GISEL-NEXT: s_lshl_b32 s8, s8, 9
; SI-GISEL-NEXT: s_lshl_b32 s9, s3, 12
; SI-GISEL-NEXT: s_sub_i32 s10, 1, s3
; SI-GISEL-NEXT: s_or_b32 s11, s4, 0x1000
; SI-GISEL-NEXT: s_or_b32 s8, s8, 0x7c00
; SI-GISEL-NEXT: s_or_b32 s4, s4, s9
; SI-GISEL-NEXT: s_max_i32 s9, s10, 0
; SI-GISEL-NEXT: s_min_i32 s9, s9, 13
; SI-GISEL-NEXT: s_lshr_b32 s10, s11, s9
; SI-GISEL-NEXT: s_lshl_b32 s9, s10, s9
; SI-GISEL-NEXT: s_cmp_lg_u32 s9, s11
; SI-GISEL-NEXT: s_cselect_b32 s9, 1, 0
; SI-GISEL-NEXT: s_or_b32 s9, s10, s9
; SI-GISEL-NEXT: s_cmp_lt_i32 s3, 1
; SI-GISEL-NEXT: s_cselect_b32 s4, s9, s4
; SI-GISEL-NEXT: s_and_b32 s9, s4, 7
; SI-GISEL-NEXT: s_lshr_b32 s4, s4, 2
; SI-GISEL-NEXT: s_cmp_eq_u32 s9, 3
; SI-GISEL-NEXT: s_cselect_b32 s10, 1, 0
; SI-GISEL-NEXT: s_cmp_gt_i32 s9, 5
; SI-GISEL-NEXT: s_cselect_b32 s9, 1, 0
; SI-GISEL-NEXT: s_or_b32 s9, s10, s9
; SI-GISEL-NEXT: s_add_i32 s4, s4, s9
; SI-GISEL-NEXT: s_cmp_gt_i32 s3, 30
; SI-GISEL-NEXT: s_cselect_b32 s4, 0x7c00, s4
; SI-GISEL-NEXT: s_cmpk_eq_i32 s3, 0x40f
; SI-GISEL-NEXT: s_cselect_b32 s3, s8, s4
; SI-GISEL-NEXT: s_lshr_b32 s4, s5, 16
; SI-GISEL-NEXT: s_bfe_u32 s5, s7, 0xb0014
; SI-GISEL-NEXT: s_lshr_b32 s8, s7, 8
; SI-GISEL-NEXT: s_and_b32 s9, s7, 0x1ff
; SI-GISEL-NEXT: s_and_b32 s4, s4, 0x8000
; SI-GISEL-NEXT: s_addk_i32 s5, 0xfc10
; SI-GISEL-NEXT: s_and_b32 s8, s8, 0xffe
; SI-GISEL-NEXT: s_or_b32 s6, s9, s6
; SI-GISEL-NEXT: s_or_b32 s3, s4, s3
; SI-GISEL-NEXT: s_cmp_lg_u32 s6, 0
; SI-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; SI-GISEL-NEXT: s_or_b32 s4, s8, s4
; SI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; SI-GISEL-NEXT: s_cselect_b32 s6, 1, 0
; SI-GISEL-NEXT: s_lshl_b32 s6, s6, 9
; SI-GISEL-NEXT: s_lshl_b32 s8, s5, 12
; SI-GISEL-NEXT: s_sub_i32 s9, 1, s5
; SI-GISEL-NEXT: s_or_b32 s10, s4, 0x1000
; SI-GISEL-NEXT: s_or_b32 s6, s6, 0x7c00
; SI-GISEL-NEXT: s_or_b32 s4, s4, s8
; SI-GISEL-NEXT: s_max_i32 s8, s9, 0
; SI-GISEL-NEXT: s_min_i32 s8, s8, 13
; SI-GISEL-NEXT: s_lshr_b32 s9, s10, s8
; SI-GISEL-NEXT: s_lshl_b32 s8, s9, s8
; SI-GISEL-NEXT: s_cmp_lg_u32 s8, s10
; SI-GISEL-NEXT: s_cselect_b32 s8, 1, 0
; SI-GISEL-NEXT: s_or_b32 s8, s9, s8
; SI-GISEL-NEXT: s_cmp_lt_i32 s5, 1
; SI-GISEL-NEXT: s_cselect_b32 s4, s8, s4
; SI-GISEL-NEXT: s_and_b32 s8, s4, 7
; SI-GISEL-NEXT: s_lshr_b32 s4, s4, 2
; SI-GISEL-NEXT: s_cmp_eq_u32 s8, 3
; SI-GISEL-NEXT: s_cselect_b32 s9, 1, 0
; SI-GISEL-NEXT: s_cmp_gt_i32 s8, 5
; SI-GISEL-NEXT: s_cselect_b32 s8, 1, 0
; SI-GISEL-NEXT: s_or_b32 s8, s9, s8
; SI-GISEL-NEXT: s_add_i32 s4, s4, s8
; SI-GISEL-NEXT: s_cmp_gt_i32 s5, 30
; SI-GISEL-NEXT: s_cselect_b32 s4, 0x7c00, s4
; SI-GISEL-NEXT: s_cmpk_eq_i32 s5, 0x40f
; SI-GISEL-NEXT: s_cselect_b32 s4, s6, s4
; SI-GISEL-NEXT: s_lshr_b32 s5, s7, 16
; SI-GISEL-NEXT: s_and_b32 s3, s3, 0xffff
; SI-GISEL-NEXT: s_and_b32 s5, s5, 0x8000
; SI-GISEL-NEXT: s_or_b32 s4, s5, s4
; SI-GISEL-NEXT: s_and_b32 s4, s4, 0xffff
; SI-GISEL-NEXT: s_lshl_b32 s4, s4, 16
; SI-GISEL-NEXT: s_or_b32 s4, s3, s4
; SI-GISEL-NEXT: s_mov_b32 s3, 0xf000
; SI-GISEL-NEXT: v_mov_b32_e32 v0, s4
; SI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-GISEL-NEXT: s_endpgm
;
@ -1763,14 +2090,96 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; VI-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: s_bfe_u32 s2, s5, 0xb0014
; VI-GISEL-NEXT: s_lshr_b32 s3, s5, 8
; VI-GISEL-NEXT: s_and_b32 s8, s5, 0x1ff
; VI-GISEL-NEXT: s_addk_i32 s2, 0xfc10
; VI-GISEL-NEXT: s_and_b32 s3, s3, 0xffe
; VI-GISEL-NEXT: s_or_b32 s4, s8, s4
; VI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; VI-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; VI-GISEL-NEXT: s_or_b32 s3, s3, s4
; VI-GISEL-NEXT: s_cmp_lg_u32 s3, 0
; VI-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; VI-GISEL-NEXT: s_sub_i32 s9, 1, s2
; VI-GISEL-NEXT: s_lshl_b32 s8, s2, 12
; VI-GISEL-NEXT: s_max_i32 s9, s9, 0
; VI-GISEL-NEXT: s_or_b32 s8, s3, s8
; VI-GISEL-NEXT: s_min_i32 s9, s9, 13
; VI-GISEL-NEXT: s_bitset1_b32 s3, 12
; VI-GISEL-NEXT: s_lshl_b32 s4, s4, 9
; VI-GISEL-NEXT: s_lshr_b32 s10, s3, s9
; VI-GISEL-NEXT: s_or_b32 s4, s4, 0x7c00
; VI-GISEL-NEXT: s_lshl_b32 s9, s10, s9
; VI-GISEL-NEXT: s_cmp_lg_u32 s9, s3
; VI-GISEL-NEXT: s_cselect_b32 s3, 1, 0
; VI-GISEL-NEXT: s_or_b32 s3, s10, s3
; VI-GISEL-NEXT: s_cmp_lt_i32 s2, 1
; VI-GISEL-NEXT: s_cselect_b32 s3, s3, s8
; VI-GISEL-NEXT: s_and_b32 s8, s3, 7
; VI-GISEL-NEXT: s_lshr_b32 s3, s3, 2
; VI-GISEL-NEXT: s_cmp_eq_u32 s8, 3
; VI-GISEL-NEXT: s_cselect_b32 s9, 1, 0
; VI-GISEL-NEXT: s_cmp_gt_i32 s8, 5
; VI-GISEL-NEXT: s_cselect_b32 s8, 1, 0
; VI-GISEL-NEXT: s_or_b32 s8, s9, s8
; VI-GISEL-NEXT: s_add_i32 s3, s3, s8
; VI-GISEL-NEXT: s_cmp_gt_i32 s2, 30
; VI-GISEL-NEXT: s_cselect_b32 s3, 0x7c00, s3
; VI-GISEL-NEXT: s_cmpk_eq_i32 s2, 0x40f
; VI-GISEL-NEXT: s_cselect_b32 s2, s4, s3
; VI-GISEL-NEXT: s_lshr_b32 s3, s5, 16
; VI-GISEL-NEXT: s_and_b32 s3, s3, 0x8000
; VI-GISEL-NEXT: s_or_b32 s2, s3, s2
; VI-GISEL-NEXT: s_bfe_u32 s3, s7, 0xb0014
; VI-GISEL-NEXT: s_lshr_b32 s4, s7, 8
; VI-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff
; VI-GISEL-NEXT: s_addk_i32 s3, 0xfc10
; VI-GISEL-NEXT: s_and_b32 s4, s4, 0xffe
; VI-GISEL-NEXT: s_or_b32 s5, s5, s6
; VI-GISEL-NEXT: s_cmp_lg_u32 s5, 0
; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; VI-GISEL-NEXT: s_or_b32 s4, s4, s5
; VI-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; VI-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; VI-GISEL-NEXT: s_sub_i32 s8, 1, s3
; VI-GISEL-NEXT: s_lshl_b32 s6, s3, 12
; VI-GISEL-NEXT: s_max_i32 s8, s8, 0
; VI-GISEL-NEXT: s_or_b32 s6, s4, s6
; VI-GISEL-NEXT: s_min_i32 s8, s8, 13
; VI-GISEL-NEXT: s_bitset1_b32 s4, 12
; VI-GISEL-NEXT: s_lshl_b32 s5, s5, 9
; VI-GISEL-NEXT: s_lshr_b32 s9, s4, s8
; VI-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00
; VI-GISEL-NEXT: s_lshl_b32 s8, s9, s8
; VI-GISEL-NEXT: s_cmp_lg_u32 s8, s4
; VI-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; VI-GISEL-NEXT: s_or_b32 s4, s9, s4
; VI-GISEL-NEXT: s_cmp_lt_i32 s3, 1
; VI-GISEL-NEXT: s_cselect_b32 s4, s4, s6
; VI-GISEL-NEXT: s_and_b32 s6, s4, 7
; VI-GISEL-NEXT: s_lshr_b32 s4, s4, 2
; VI-GISEL-NEXT: s_cmp_eq_u32 s6, 3
; VI-GISEL-NEXT: s_cselect_b32 s8, 1, 0
; VI-GISEL-NEXT: s_cmp_gt_i32 s6, 5
; VI-GISEL-NEXT: s_cselect_b32 s6, 1, 0
; VI-GISEL-NEXT: s_or_b32 s6, s8, s6
; VI-GISEL-NEXT: s_add_i32 s4, s4, s6
; VI-GISEL-NEXT: s_cmp_gt_i32 s3, 30
; VI-GISEL-NEXT: s_cselect_b32 s4, 0x7c00, s4
; VI-GISEL-NEXT: s_cmpk_eq_i32 s3, 0x40f
; VI-GISEL-NEXT: s_cselect_b32 s3, s5, s4
; VI-GISEL-NEXT: s_lshr_b32 s4, s7, 16
; VI-GISEL-NEXT: s_and_b32 s4, s4, 0x8000
; VI-GISEL-NEXT: s_or_b32 s3, s4, s3
; VI-GISEL-NEXT: s_and_b32 s3, s3, 0xffff
; VI-GISEL-NEXT: s_and_b32 s2, s2, 0xffff
; VI-GISEL-NEXT: s_lshl_b32 s3, s3, 16
; VI-GISEL-NEXT: s_or_b32 s2, s2, s3
; VI-GISEL-NEXT: v_mov_b32_e32 v0, s2
; VI-GISEL-NEXT: s_mov_b32 s2, -1
; VI-GISEL-NEXT: s_mov_b32 s3, 0xf000
; VI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
; VI-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; VI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-GISEL-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-GISEL-NEXT: v_or_b32_e32 v0, v0, v1
; VI-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-GISEL-NEXT: s_endpgm
;
@ -1881,14 +2290,93 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: s_bfe_u32 s2, s5, 0xb0014
; GFX9-GISEL-NEXT: s_lshr_b32 s3, s5, 8
; GFX9-GISEL-NEXT: s_and_b32 s8, s5, 0x1ff
; GFX9-GISEL-NEXT: s_addk_i32 s2, 0xfc10
; GFX9-GISEL-NEXT: s_and_b32 s3, s3, 0xffe
; GFX9-GISEL-NEXT: s_or_b32 s4, s8, s4
; GFX9-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; GFX9-GISEL-NEXT: s_or_b32 s3, s3, s4
; GFX9-GISEL-NEXT: s_cmp_lg_u32 s3, 0
; GFX9-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; GFX9-GISEL-NEXT: s_sub_i32 s9, 1, s2
; GFX9-GISEL-NEXT: s_lshl_b32 s8, s2, 12
; GFX9-GISEL-NEXT: s_max_i32 s9, s9, 0
; GFX9-GISEL-NEXT: s_or_b32 s8, s3, s8
; GFX9-GISEL-NEXT: s_min_i32 s9, s9, 13
; GFX9-GISEL-NEXT: s_bitset1_b32 s3, 12
; GFX9-GISEL-NEXT: s_lshl_b32 s4, s4, 9
; GFX9-GISEL-NEXT: s_lshr_b32 s10, s3, s9
; GFX9-GISEL-NEXT: s_or_b32 s4, s4, 0x7c00
; GFX9-GISEL-NEXT: s_lshl_b32 s9, s10, s9
; GFX9-GISEL-NEXT: s_cmp_lg_u32 s9, s3
; GFX9-GISEL-NEXT: s_cselect_b32 s3, 1, 0
; GFX9-GISEL-NEXT: s_or_b32 s3, s10, s3
; GFX9-GISEL-NEXT: s_cmp_lt_i32 s2, 1
; GFX9-GISEL-NEXT: s_cselect_b32 s3, s3, s8
; GFX9-GISEL-NEXT: s_and_b32 s8, s3, 7
; GFX9-GISEL-NEXT: s_lshr_b32 s3, s3, 2
; GFX9-GISEL-NEXT: s_cmp_eq_u32 s8, 3
; GFX9-GISEL-NEXT: s_cselect_b32 s9, 1, 0
; GFX9-GISEL-NEXT: s_cmp_gt_i32 s8, 5
; GFX9-GISEL-NEXT: s_cselect_b32 s8, 1, 0
; GFX9-GISEL-NEXT: s_or_b32 s8, s9, s8
; GFX9-GISEL-NEXT: s_add_i32 s3, s3, s8
; GFX9-GISEL-NEXT: s_cmp_gt_i32 s2, 30
; GFX9-GISEL-NEXT: s_cselect_b32 s3, 0x7c00, s3
; GFX9-GISEL-NEXT: s_cmpk_eq_i32 s2, 0x40f
; GFX9-GISEL-NEXT: s_cselect_b32 s2, s4, s3
; GFX9-GISEL-NEXT: s_lshr_b32 s3, s5, 16
; GFX9-GISEL-NEXT: s_and_b32 s3, s3, 0x8000
; GFX9-GISEL-NEXT: s_or_b32 s2, s3, s2
; GFX9-GISEL-NEXT: s_bfe_u32 s3, s7, 0xb0014
; GFX9-GISEL-NEXT: s_lshr_b32 s4, s7, 8
; GFX9-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff
; GFX9-GISEL-NEXT: s_addk_i32 s3, 0xfc10
; GFX9-GISEL-NEXT: s_and_b32 s4, s4, 0xffe
; GFX9-GISEL-NEXT: s_or_b32 s5, s5, s6
; GFX9-GISEL-NEXT: s_cmp_lg_u32 s5, 0
; GFX9-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; GFX9-GISEL-NEXT: s_or_b32 s4, s4, s5
; GFX9-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; GFX9-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; GFX9-GISEL-NEXT: s_sub_i32 s8, 1, s3
; GFX9-GISEL-NEXT: s_lshl_b32 s6, s3, 12
; GFX9-GISEL-NEXT: s_max_i32 s8, s8, 0
; GFX9-GISEL-NEXT: s_or_b32 s6, s4, s6
; GFX9-GISEL-NEXT: s_min_i32 s8, s8, 13
; GFX9-GISEL-NEXT: s_bitset1_b32 s4, 12
; GFX9-GISEL-NEXT: s_lshl_b32 s5, s5, 9
; GFX9-GISEL-NEXT: s_lshr_b32 s9, s4, s8
; GFX9-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00
; GFX9-GISEL-NEXT: s_lshl_b32 s8, s9, s8
; GFX9-GISEL-NEXT: s_cmp_lg_u32 s8, s4
; GFX9-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; GFX9-GISEL-NEXT: s_or_b32 s4, s9, s4
; GFX9-GISEL-NEXT: s_cmp_lt_i32 s3, 1
; GFX9-GISEL-NEXT: s_cselect_b32 s4, s4, s6
; GFX9-GISEL-NEXT: s_and_b32 s6, s4, 7
; GFX9-GISEL-NEXT: s_lshr_b32 s4, s4, 2
; GFX9-GISEL-NEXT: s_cmp_eq_u32 s6, 3
; GFX9-GISEL-NEXT: s_cselect_b32 s8, 1, 0
; GFX9-GISEL-NEXT: s_cmp_gt_i32 s6, 5
; GFX9-GISEL-NEXT: s_cselect_b32 s6, 1, 0
; GFX9-GISEL-NEXT: s_or_b32 s6, s8, s6
; GFX9-GISEL-NEXT: s_add_i32 s4, s4, s6
; GFX9-GISEL-NEXT: s_cmp_gt_i32 s3, 30
; GFX9-GISEL-NEXT: s_cselect_b32 s4, 0x7c00, s4
; GFX9-GISEL-NEXT: s_cmpk_eq_i32 s3, 0x40f
; GFX9-GISEL-NEXT: s_cselect_b32 s3, s5, s4
; GFX9-GISEL-NEXT: s_lshr_b32 s4, s7, 16
; GFX9-GISEL-NEXT: s_and_b32 s4, s4, 0x8000
; GFX9-GISEL-NEXT: s_or_b32 s3, s4, s3
; GFX9-GISEL-NEXT: s_pack_ll_b32_b16 s2, s2, s3
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2
; GFX9-GISEL-NEXT: s_mov_b32 s2, -1
; GFX9-GISEL-NEXT: s_mov_b32 s3, 0xf000
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
; GFX9-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX9-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-GISEL-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX9-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-GISEL-NEXT: s_endpgm
;
@ -1999,14 +2487,93 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX950-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: s_bfe_u32 s2, s5, 0xb0014
; GFX950-GISEL-NEXT: s_lshr_b32 s3, s5, 8
; GFX950-GISEL-NEXT: s_and_b32 s8, s5, 0x1ff
; GFX950-GISEL-NEXT: s_addk_i32 s2, 0xfc10
; GFX950-GISEL-NEXT: s_and_b32 s3, s3, 0xffe
; GFX950-GISEL-NEXT: s_or_b32 s4, s8, s4
; GFX950-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; GFX950-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; GFX950-GISEL-NEXT: s_or_b32 s3, s3, s4
; GFX950-GISEL-NEXT: s_cmp_lg_u32 s3, 0
; GFX950-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; GFX950-GISEL-NEXT: s_sub_i32 s9, 1, s2
; GFX950-GISEL-NEXT: s_lshl_b32 s8, s2, 12
; GFX950-GISEL-NEXT: s_max_i32 s9, s9, 0
; GFX950-GISEL-NEXT: s_or_b32 s8, s3, s8
; GFX950-GISEL-NEXT: s_min_i32 s9, s9, 13
; GFX950-GISEL-NEXT: s_bitset1_b32 s3, 12
; GFX950-GISEL-NEXT: s_lshl_b32 s4, s4, 9
; GFX950-GISEL-NEXT: s_lshr_b32 s10, s3, s9
; GFX950-GISEL-NEXT: s_or_b32 s4, s4, 0x7c00
; GFX950-GISEL-NEXT: s_lshl_b32 s9, s10, s9
; GFX950-GISEL-NEXT: s_cmp_lg_u32 s9, s3
; GFX950-GISEL-NEXT: s_cselect_b32 s3, 1, 0
; GFX950-GISEL-NEXT: s_or_b32 s3, s10, s3
; GFX950-GISEL-NEXT: s_cmp_lt_i32 s2, 1
; GFX950-GISEL-NEXT: s_cselect_b32 s3, s3, s8
; GFX950-GISEL-NEXT: s_and_b32 s8, s3, 7
; GFX950-GISEL-NEXT: s_lshr_b32 s3, s3, 2
; GFX950-GISEL-NEXT: s_cmp_eq_u32 s8, 3
; GFX950-GISEL-NEXT: s_cselect_b32 s9, 1, 0
; GFX950-GISEL-NEXT: s_cmp_gt_i32 s8, 5
; GFX950-GISEL-NEXT: s_cselect_b32 s8, 1, 0
; GFX950-GISEL-NEXT: s_or_b32 s8, s9, s8
; GFX950-GISEL-NEXT: s_add_i32 s3, s3, s8
; GFX950-GISEL-NEXT: s_cmp_gt_i32 s2, 30
; GFX950-GISEL-NEXT: s_cselect_b32 s3, 0x7c00, s3
; GFX950-GISEL-NEXT: s_cmpk_eq_i32 s2, 0x40f
; GFX950-GISEL-NEXT: s_cselect_b32 s2, s4, s3
; GFX950-GISEL-NEXT: s_lshr_b32 s3, s5, 16
; GFX950-GISEL-NEXT: s_and_b32 s3, s3, 0x8000
; GFX950-GISEL-NEXT: s_or_b32 s2, s3, s2
; GFX950-GISEL-NEXT: s_bfe_u32 s3, s7, 0xb0014
; GFX950-GISEL-NEXT: s_lshr_b32 s4, s7, 8
; GFX950-GISEL-NEXT: s_and_b32 s5, s7, 0x1ff
; GFX950-GISEL-NEXT: s_addk_i32 s3, 0xfc10
; GFX950-GISEL-NEXT: s_and_b32 s4, s4, 0xffe
; GFX950-GISEL-NEXT: s_or_b32 s5, s5, s6
; GFX950-GISEL-NEXT: s_cmp_lg_u32 s5, 0
; GFX950-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; GFX950-GISEL-NEXT: s_or_b32 s4, s4, s5
; GFX950-GISEL-NEXT: s_cmp_lg_u32 s4, 0
; GFX950-GISEL-NEXT: s_cselect_b32 s5, 1, 0
; GFX950-GISEL-NEXT: s_sub_i32 s8, 1, s3
; GFX950-GISEL-NEXT: s_lshl_b32 s6, s3, 12
; GFX950-GISEL-NEXT: s_max_i32 s8, s8, 0
; GFX950-GISEL-NEXT: s_or_b32 s6, s4, s6
; GFX950-GISEL-NEXT: s_min_i32 s8, s8, 13
; GFX950-GISEL-NEXT: s_bitset1_b32 s4, 12
; GFX950-GISEL-NEXT: s_lshl_b32 s5, s5, 9
; GFX950-GISEL-NEXT: s_lshr_b32 s9, s4, s8
; GFX950-GISEL-NEXT: s_or_b32 s5, s5, 0x7c00
; GFX950-GISEL-NEXT: s_lshl_b32 s8, s9, s8
; GFX950-GISEL-NEXT: s_cmp_lg_u32 s8, s4
; GFX950-GISEL-NEXT: s_cselect_b32 s4, 1, 0
; GFX950-GISEL-NEXT: s_or_b32 s4, s9, s4
; GFX950-GISEL-NEXT: s_cmp_lt_i32 s3, 1
; GFX950-GISEL-NEXT: s_cselect_b32 s4, s4, s6
; GFX950-GISEL-NEXT: s_and_b32 s6, s4, 7
; GFX950-GISEL-NEXT: s_lshr_b32 s4, s4, 2
; GFX950-GISEL-NEXT: s_cmp_eq_u32 s6, 3
; GFX950-GISEL-NEXT: s_cselect_b32 s8, 1, 0
; GFX950-GISEL-NEXT: s_cmp_gt_i32 s6, 5
; GFX950-GISEL-NEXT: s_cselect_b32 s6, 1, 0
; GFX950-GISEL-NEXT: s_or_b32 s6, s8, s6
; GFX950-GISEL-NEXT: s_add_i32 s4, s4, s6
; GFX950-GISEL-NEXT: s_cmp_gt_i32 s3, 30
; GFX950-GISEL-NEXT: s_cselect_b32 s4, 0x7c00, s4
; GFX950-GISEL-NEXT: s_cmpk_eq_i32 s3, 0x40f
; GFX950-GISEL-NEXT: s_cselect_b32 s3, s5, s4
; GFX950-GISEL-NEXT: s_lshr_b32 s4, s7, 16
; GFX950-GISEL-NEXT: s_and_b32 s4, s4, 0x8000
; GFX950-GISEL-NEXT: s_or_b32 s3, s4, s3
; GFX950-GISEL-NEXT: s_pack_ll_b32_b16 s2, s2, s3
; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, s2
; GFX950-GISEL-NEXT: s_mov_b32 s2, -1
; GFX950-GISEL-NEXT: s_mov_b32 s3, 0xf000
; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX950-GISEL-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
; GFX950-GISEL-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; GFX950-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX950-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX950-GISEL-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX950-GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX950-GISEL-NEXT: s_endpgm
;
@ -2247,16 +2814,101 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s2, -1
; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-TRUE16-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
; GFX11-GISEL-TRUE16-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-GISEL-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
; GFX11-GISEL-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s5, 0x1ff
; GFX11-GISEL-TRUE16-NEXT: s_bfe_u32 s2, s5, 0xb0014
; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s5, 8
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s4, s8, s4
; GFX11-GISEL-TRUE16-NEXT: s_addk_i32 s2, 0xfc10
; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0xffe
; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s4, 0
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s4, 1, 0
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s3, s4
; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s3, 0
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s4, 1, 0
; GFX11-GISEL-TRUE16-NEXT: s_sub_i32 s8, 1, s2
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s10, s3, 0x1000
; GFX11-GISEL-TRUE16-NEXT: s_max_i32 s8, s8, 0
; GFX11-GISEL-TRUE16-NEXT: s_lshl_b32 s9, s2, 12
; GFX11-GISEL-TRUE16-NEXT: s_min_i32 s8, s8, 13
; GFX11-GISEL-TRUE16-NEXT: s_lshl_b32 s4, s4, 9
; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s11, s10, s8
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s3, s9
; GFX11-GISEL-TRUE16-NEXT: s_lshl_b32 s8, s11, s8
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s4, s4, 0x7c00
; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s8, s10
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s8, 1, 0
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s8, s11, s8
; GFX11-GISEL-TRUE16-NEXT: s_cmp_lt_i32 s2, 1
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s3, s8, s3
; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s3, 7
; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s3, 2
; GFX11-GISEL-TRUE16-NEXT: s_cmp_eq_u32 s8, 3
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s9, 1, 0
; GFX11-GISEL-TRUE16-NEXT: s_cmp_gt_i32 s8, 5
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s8, 1, 0
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s8, s9, s8
; GFX11-GISEL-TRUE16-NEXT: s_add_i32 s3, s3, s8
; GFX11-GISEL-TRUE16-NEXT: s_cmp_gt_i32 s2, 30
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s3, 0x7c00, s3
; GFX11-GISEL-TRUE16-NEXT: s_cmpk_eq_i32 s2, 0x40f
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s2, s4, s3
; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s5, 16
; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s8, s7, 0x1ff
; GFX11-GISEL-TRUE16-NEXT: s_bfe_u32 s4, s7, 0xb0014
; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s5, s7, 8
; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s3, s3, 0x8000
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s6, s8, s6
; GFX11-GISEL-TRUE16-NEXT: s_addk_i32 s4, 0xfc10
; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s5, s5, 0xffe
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s2, s3, s2
; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s6, 0
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s3, 1, 0
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s5, s3
; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s3, 0
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s5, 1, 0
; GFX11-GISEL-TRUE16-NEXT: s_sub_i32 s6, 1, s4
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s9, s3, 0x1000
; GFX11-GISEL-TRUE16-NEXT: s_max_i32 s6, s6, 0
; GFX11-GISEL-TRUE16-NEXT: s_lshl_b32 s8, s4, 12
; GFX11-GISEL-TRUE16-NEXT: s_min_i32 s6, s6, 13
; GFX11-GISEL-TRUE16-NEXT: s_lshl_b32 s5, s5, 9
; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s10, s9, s6
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s3, s8
; GFX11-GISEL-TRUE16-NEXT: s_lshl_b32 s6, s10, s6
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s5, s5, 0x7c00
; GFX11-GISEL-TRUE16-NEXT: s_cmp_lg_u32 s6, s9
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s6, 1, 0
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s6, s10, s6
; GFX11-GISEL-TRUE16-NEXT: s_cmp_lt_i32 s4, 1
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s3, s6, s3
; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s6, s3, 7
; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s3, s3, 2
; GFX11-GISEL-TRUE16-NEXT: s_cmp_eq_u32 s6, 3
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s8, 1, 0
; GFX11-GISEL-TRUE16-NEXT: s_cmp_gt_i32 s6, 5
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s6, 1, 0
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s6, s8, s6
; GFX11-GISEL-TRUE16-NEXT: s_add_i32 s3, s3, s6
; GFX11-GISEL-TRUE16-NEXT: s_cmp_gt_i32 s4, 30
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s3, 0x7c00, s3
; GFX11-GISEL-TRUE16-NEXT: s_cmpk_eq_i32 s4, 0x40f
; GFX11-GISEL-TRUE16-NEXT: s_cselect_b32 s3, s5, s3
; GFX11-GISEL-TRUE16-NEXT: s_lshr_b32 s4, s7, 16
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-TRUE16-NEXT: s_and_b32 s4, s4, 0x8000
; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s3, s4, s3
; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-TRUE16-NEXT: s_pack_ll_b32_b16 s2, s2, s3
; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v0, s2
; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s2, -1
; GFX11-GISEL-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-GISEL-TRUE16-NEXT: s_endpgm
;
@ -2265,16 +2917,101 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-FAKE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
; GFX11-GISEL-FAKE16-NEXT: s_mov_b32 s2, -1
; GFX11-GISEL-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-GISEL-FAKE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-GISEL-FAKE16-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
; GFX11-GISEL-FAKE16-NEXT: v_cvt_f32_f64_e32 v1, s[6:7]
; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-GISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX11-GISEL-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-GISEL-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s5, 0x1ff
; GFX11-GISEL-FAKE16-NEXT: s_bfe_u32 s2, s5, 0xb0014
; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s5, 8
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s4, s8, s4
; GFX11-GISEL-FAKE16-NEXT: s_addk_i32 s2, 0xfc10
; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0xffe
; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s4, 0
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s4, 1, 0
; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s3, s4
; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s3, 0
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s4, 1, 0
; GFX11-GISEL-FAKE16-NEXT: s_sub_i32 s8, 1, s2
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s10, s3, 0x1000
; GFX11-GISEL-FAKE16-NEXT: s_max_i32 s8, s8, 0
; GFX11-GISEL-FAKE16-NEXT: s_lshl_b32 s9, s2, 12
; GFX11-GISEL-FAKE16-NEXT: s_min_i32 s8, s8, 13
; GFX11-GISEL-FAKE16-NEXT: s_lshl_b32 s4, s4, 9
; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s11, s10, s8
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s3, s9
; GFX11-GISEL-FAKE16-NEXT: s_lshl_b32 s8, s11, s8
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s4, s4, 0x7c00
; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s8, s10
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s8, 1, 0
; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s8, s11, s8
; GFX11-GISEL-FAKE16-NEXT: s_cmp_lt_i32 s2, 1
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s3, s8, s3
; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s3, 7
; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s3, 2
; GFX11-GISEL-FAKE16-NEXT: s_cmp_eq_u32 s8, 3
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s9, 1, 0
; GFX11-GISEL-FAKE16-NEXT: s_cmp_gt_i32 s8, 5
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s8, 1, 0
; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s8, s9, s8
; GFX11-GISEL-FAKE16-NEXT: s_add_i32 s3, s3, s8
; GFX11-GISEL-FAKE16-NEXT: s_cmp_gt_i32 s2, 30
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s3, 0x7c00, s3
; GFX11-GISEL-FAKE16-NEXT: s_cmpk_eq_i32 s2, 0x40f
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s2, s4, s3
; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s5, 16
; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s8, s7, 0x1ff
; GFX11-GISEL-FAKE16-NEXT: s_bfe_u32 s4, s7, 0xb0014
; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s5, s7, 8
; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s3, s3, 0x8000
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s6, s8, s6
; GFX11-GISEL-FAKE16-NEXT: s_addk_i32 s4, 0xfc10
; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s5, s5, 0xffe
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s2, s3, s2
; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s6, 0
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s3, 1, 0
; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s5, s3
; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s3, 0
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s5, 1, 0
; GFX11-GISEL-FAKE16-NEXT: s_sub_i32 s6, 1, s4
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s9, s3, 0x1000
; GFX11-GISEL-FAKE16-NEXT: s_max_i32 s6, s6, 0
; GFX11-GISEL-FAKE16-NEXT: s_lshl_b32 s8, s4, 12
; GFX11-GISEL-FAKE16-NEXT: s_min_i32 s6, s6, 13
; GFX11-GISEL-FAKE16-NEXT: s_lshl_b32 s5, s5, 9
; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s10, s9, s6
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s3, s8
; GFX11-GISEL-FAKE16-NEXT: s_lshl_b32 s6, s10, s6
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s5, s5, 0x7c00
; GFX11-GISEL-FAKE16-NEXT: s_cmp_lg_u32 s6, s9
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s6, 1, 0
; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s6, s10, s6
; GFX11-GISEL-FAKE16-NEXT: s_cmp_lt_i32 s4, 1
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s3, s6, s3
; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s6, s3, 7
; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s3, s3, 2
; GFX11-GISEL-FAKE16-NEXT: s_cmp_eq_u32 s6, 3
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s8, 1, 0
; GFX11-GISEL-FAKE16-NEXT: s_cmp_gt_i32 s6, 5
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s6, 1, 0
; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s6, s8, s6
; GFX11-GISEL-FAKE16-NEXT: s_add_i32 s3, s3, s6
; GFX11-GISEL-FAKE16-NEXT: s_cmp_gt_i32 s4, 30
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s3, 0x7c00, s3
; GFX11-GISEL-FAKE16-NEXT: s_cmpk_eq_i32 s4, 0x40f
; GFX11-GISEL-FAKE16-NEXT: s_cselect_b32 s3, s5, s3
; GFX11-GISEL-FAKE16-NEXT: s_lshr_b32 s4, s7, 16
; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-GISEL-FAKE16-NEXT: s_and_b32 s4, s4, 0x8000
; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s3, s4, s3
; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-GISEL-FAKE16-NEXT: s_pack_ll_b32_b16 s2, s2, s3
; GFX11-GISEL-FAKE16-NEXT: s_mov_b32 s3, 0x31016000
; GFX11-GISEL-FAKE16-NEXT: v_mov_b32_e32 v0, s2
; GFX11-GISEL-FAKE16-NEXT: s_mov_b32 s2, -1
; GFX11-GISEL-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX11-GISEL-FAKE16-NEXT: s_endpgm
ptr addrspace(1) %r,

File diff suppressed because it is too large Load Diff