AMDGPU: Use pattern to select instruction for intrinsic llvm.fptrunc.round (#105761)
Use a GCNPat instead of custom lowering to select instructions for the llvm.fptrunc.round intrinsic. The "SupportedRoundMode : TImmLeaf" predicate restricts selection to the rounding modes that are supported, and the "as_hw_round_mode : SDNodeXForm" transform translates each rounding mode into the corresponding value that the hardware recognizes.
This commit is contained in:
parent
22ba351108
commit
26b0bef192
@ -161,6 +161,7 @@ def : GINodeEquiv<G_FFLOOR, ffloor>;
|
||||
def : GINodeEquiv<G_FRINT, frint>;
|
||||
def : GINodeEquiv<G_FNEARBYINT, fnearbyint>;
|
||||
def : GINodeEquiv<G_INTRINSIC_TRUNC, ftrunc>;
|
||||
def : GINodeEquiv<G_INTRINSIC_FPTRUNC_ROUND, fptrunc_round>;
|
||||
def : GINodeEquiv<G_INTRINSIC_ROUND, fround>;
|
||||
def : GINodeEquiv<G_INTRINSIC_ROUNDEVEN, froundeven>;
|
||||
def : GINodeEquiv<G_INTRINSIC_LRINT, lrint>;
|
||||
|
||||
@ -158,6 +158,9 @@ def SDTFPUnaryOp : SDTypeProfile<1, 1, [ // fneg, fsqrt, etc
|
||||
def SDTFPRoundOp : SDTypeProfile<1, 1, [ // fpround
|
||||
SDTCisFP<0>, SDTCisFP<1>, SDTCisOpSmallerThanOp<0, 1>, SDTCisSameNumEltsAs<0, 1>
|
||||
]>;
|
||||
// fptrunc_round: one result, two operands. Index 0 (the result) and index 1
// are floating point, index 2 is an integer (matched as the rounding-mode
// immediate by the fptrunc_round pattern). Index 0 must be smaller than
// index 1 and have the same number of elements.
def SDTFPTruncRoundOp : SDTypeProfile<1, 2, [
|
||||
SDTCisFP<0>, SDTCisFP<1>, SDTCisInt<2>, SDTCisOpSmallerThanOp<0, 1>, SDTCisSameNumEltsAs<0, 1>
|
||||
]>;
|
||||
def SDTFPExtendOp : SDTypeProfile<1, 1, [ // fpextend
|
||||
SDTCisFP<0>, SDTCisFP<1>, SDTCisOpSmallerThanOp<1, 0>, SDTCisSameNumEltsAs<0, 1>
|
||||
]>;
|
||||
@ -552,6 +555,8 @@ def llround : SDNode<"ISD::LLROUND" , SDTFPToIntOp>;
|
||||
def lrint : SDNode<"ISD::LRINT" , SDTFPToIntOp>;
|
||||
def llrint : SDNode<"ISD::LLRINT" , SDTFPToIntOp>;
|
||||
|
||||
// Pattern-level node for ISD::FPTRUNC_ROUND, typed by SDTFPTruncRoundOp, so
// targets can match the operation directly in selection patterns.
def fptrunc_round : SDNode<"ISD::FPTRUNC_ROUND", SDTFPTruncRoundOp>;
|
||||
|
||||
def fpround : SDNode<"ISD::FP_ROUND" , SDTFPRoundOp>;
|
||||
def fpextend : SDNode<"ISD::FP_EXTEND" , SDTFPExtendOp>;
|
||||
def fcopysign : SDNode<"ISD::FCOPYSIGN" , SDTFPSignOp>;
|
||||
|
||||
@ -297,8 +297,6 @@ def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_UBYTE, SIsbuffer_load_ubyte>;
|
||||
def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_SSHORT, SIsbuffer_load_short>;
|
||||
def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_USHORT, SIsbuffer_load_ushort>;
|
||||
|
||||
def : GINodeEquiv<G_FPTRUNC_ROUND, SIfptrunc_round>;
|
||||
|
||||
class GISelSop2Pat <
|
||||
SDPatternOperator node,
|
||||
Instruction inst,
|
||||
@ -419,3 +417,6 @@ def gi_frameindex_to_targetframeindex : GICustomOperandRenderer<"renderFrameInde
|
||||
|
||||
def gi_fp_pow2_to_exponent : GICustomOperandRenderer<"renderFPPow2ToExponent">,
|
||||
GISDNodeXFormEquiv<FPPow2ToExponentXForm>;
|
||||
|
||||
// GlobalISel equivalent of the as_hw_round_mode SDNodeXForm: the operand is
// rendered by the custom renderer named "renderRoundMode".
def gi_as_hw_round_mode : GICustomOperandRenderer<"renderRoundMode">,
|
||||
GISDNodeXFormEquiv<as_hw_round_mode>;
|
||||
|
||||
@ -5511,7 +5511,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
|
||||
NODE_NAME_CASE(CONST_DATA_PTR)
|
||||
NODE_NAME_CASE(PC_ADD_REL_OFFSET)
|
||||
NODE_NAME_CASE(LDS)
|
||||
NODE_NAME_CASE(FPTRUNC_ROUND)
|
||||
NODE_NAME_CASE(DUMMY_CHAIN)
|
||||
case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
|
||||
NODE_NAME_CASE(LOAD_D16_HI)
|
||||
|
||||
@ -553,7 +553,6 @@ enum NodeType : unsigned {
|
||||
CONST_DATA_PTR,
|
||||
PC_ADD_REL_OFFSET,
|
||||
LDS,
|
||||
FPTRUNC_ROUND,
|
||||
|
||||
DUMMY_CHAIN,
|
||||
FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,
|
||||
|
||||
@ -5594,6 +5594,16 @@ void AMDGPUInstructionSelector::renderFPPow2ToExponent(MachineInstrBuilder &MIB,
|
||||
MIB.addImm(ExpVal);
|
||||
}
|
||||
|
||||
/// Reads the immediate held in operand \p OpIdx of \p MI, remaps it with
/// (value + 3) % 4, and appends the result to \p MIB as an immediate operand.
/// The table inside the body lists the mapping for the four rounding modes.
/// No range check is done here on the incoming immediate.
void AMDGPUInstructionSelector::renderRoundMode(MachineInstrBuilder &MIB,
|
||||
const MachineInstr &MI,
|
||||
int OpIdx) const {
|
||||
// "round.towardzero" -> TowardZero 0 -> FP_ROUND_ROUND_TO_ZERO 3
|
||||
// "round.tonearest" -> NearestTiesToEven 1 -> FP_ROUND_ROUND_TO_NEAREST 0
|
||||
// "round.upward" -> TowardPositive 2 -> FP_ROUND_ROUND_TO_INF 1
|
||||
// "round.downward" -> TowardNegative 3 -> FP_ROUND_ROUND_TO_NEGINF 2
|
||||
MIB.addImm((MI.getOperand(OpIdx).getImm() + 3) % 4);
|
||||
}
|
||||
|
||||
bool AMDGPUInstructionSelector::isInlineImmediate(const APInt &Imm) const {
|
||||
return TII.isInlineConstant(Imm);
|
||||
}
|
||||
|
||||
@ -359,6 +359,9 @@ private:
|
||||
void renderFPPow2ToExponent(MachineInstrBuilder &MIB, const MachineInstr &MI,
|
||||
int OpIdx) const;
|
||||
|
||||
void renderRoundMode(MachineInstrBuilder &MIB, const MachineInstr &MI,
|
||||
int OpIdx) const;
|
||||
|
||||
bool isInlineImmediate(const APInt &Imm) const;
|
||||
bool isInlineImmediate(const APFloat &Imm) const;
|
||||
|
||||
|
||||
@ -1137,7 +1137,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
|
||||
.lower();
|
||||
|
||||
getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
|
||||
.customFor({S16, S32})
|
||||
.legalFor({S16, S32})
|
||||
.scalarize(0)
|
||||
.lower();
|
||||
|
||||
@ -2179,8 +2179,6 @@ bool AMDGPULegalizerInfo::legalizeCustom(
|
||||
return legalizeCTLZ_CTTZ(MI, MRI, B);
|
||||
case TargetOpcode::G_CTLZ_ZERO_UNDEF:
|
||||
return legalizeCTLZ_ZERO_UNDEF(MI, MRI, B);
|
||||
case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
|
||||
return legalizeFPTruncRound(MI, B);
|
||||
case TargetOpcode::G_STACKSAVE:
|
||||
return legalizeStackSave(MI, B);
|
||||
case TargetOpcode::G_GET_FPENV:
|
||||
@ -7093,35 +7091,6 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
|
||||
return true;
|
||||
}
|
||||
|
||||
/// Custom legalization that rewrites \p MI into AMDGPU::G_FPTRUNC_ROUND.
///
/// Operand 0 of \p MI is the destination register, operand 1 the source
/// register and operand 2 the rounding-mode immediate. Returns false, leaving
/// \p MI untouched, when the source is not a 32-bit scalar or the rounding
/// mode is not one of the four handled below. Otherwise it builds the
/// replacement through \p B, erases \p MI and returns true.
bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
|
||||
MachineIRBuilder &B) const {
|
||||
MachineRegisterInfo &MRI = *B.getMRI();
|
||||
Register Src = MI.getOperand(1).getReg();
|
||||
if (MRI.getType(Src) != LLT::scalar(32))
|
||||
return false;
|
||||
|
||||
// Only support towardzero, tonearest, upward and downward.
|
||||
int RoundMode = MI.getOperand(2).getImm();
|
||||
if (RoundMode != (int)RoundingMode::TowardZero &&
|
||||
RoundMode != (int)RoundingMode::NearestTiesToEven &&
|
||||
RoundMode != (int)RoundingMode::TowardPositive &&
|
||||
RoundMode != (int)RoundingMode::TowardNegative)
|
||||
return false;
|
||||
|
||||
// Mapping applied by the (RoundMode + 3) % 4 expression below:
// "round.towardzero" -> TowardZero 0 -> FP_ROUND_ROUND_TO_ZERO 3
|
||||
// "round.tonearest" -> NearestTiesToEven 1 -> FP_ROUND_ROUND_TO_NEAREST 0
|
||||
// "round.upward" -> TowardPositive 2 -> FP_ROUND_ROUND_TO_INF 1
|
||||
// "round.downward" -> TowardNegative 3 -> FP_ROUND_ROUND_TO_NEGINF 2
|
||||
unsigned HW_Mode = (RoundMode + 3) % 4;
|
||||
B.buildInstr(AMDGPU::G_FPTRUNC_ROUND)
|
||||
.addDef(MI.getOperand(0).getReg())
|
||||
.addUse(Src)
|
||||
.addImm(HW_Mode);
|
||||
|
||||
// The original instruction has been fully replaced by the one built above.
MI.eraseFromParent();
|
||||
return true;
|
||||
}
|
||||
|
||||
bool AMDGPULegalizerInfo::legalizeStackSave(MachineInstr &MI,
|
||||
MachineIRBuilder &B) const {
|
||||
const SITargetLowering *TLI = ST.getTargetLowering();
|
||||
|
||||
@ -212,7 +212,6 @@ public:
|
||||
|
||||
bool legalizeBVHIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const;
|
||||
|
||||
bool legalizeFPTruncRound(MachineInstr &MI, MachineIRBuilder &B) const;
|
||||
bool legalizeStackSave(MachineInstr &MI, MachineIRBuilder &B) const;
|
||||
bool legalizeWaveID(MachineInstr &MI, MachineIRBuilder &B) const;
|
||||
|
||||
|
||||
@ -5255,7 +5255,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
|
||||
OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
|
||||
break;
|
||||
}
|
||||
case AMDGPU::G_FPTRUNC_ROUND:
|
||||
case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
|
||||
return getDefaultMappingVOP(MI);
|
||||
case AMDGPU::G_PREFETCH:
|
||||
OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
|
||||
|
||||
@ -598,7 +598,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
|
||||
|
||||
// F16 - VOP1 Actions.
|
||||
setOperationAction({ISD::FP_ROUND, ISD::STRICT_FP_ROUND, ISD::FCOS,
|
||||
ISD::FSIN, ISD::FROUND, ISD::FPTRUNC_ROUND},
|
||||
ISD::FSIN, ISD::FROUND},
|
||||
MVT::f16, Custom);
|
||||
|
||||
setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::f16, Promote);
|
||||
@ -5797,8 +5797,6 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
|
||||
case ISD::FP_ROUND:
|
||||
case ISD::STRICT_FP_ROUND:
|
||||
return lowerFP_ROUND(Op, DAG);
|
||||
case ISD::FPTRUNC_ROUND:
|
||||
return lowerFPTRUNC_ROUND(Op, DAG);
|
||||
case ISD::TRAP:
|
||||
return lowerTRAP(Op, DAG);
|
||||
case ISD::DEBUGTRAP:
|
||||
@ -6648,30 +6646,6 @@ SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG,
|
||||
DAG.getTargetConstant(0, DL, MVT::i32));
|
||||
}
|
||||
|
||||
/// Custom lowering of \p Op to an AMDGPUISD::FPTRUNC_ROUND node.
///
/// Operand 0 of \p Op is the value being truncated and operand 1 is the
/// constant rounding mode. Returns an empty SDValue when the source type is
/// not f32 or the rounding mode is not one of the four handled below.
/// Otherwise returns a new node with the same value-type list as \p Op, the
/// original source operand, and the remapped rounding mode as an i32 target
/// constant.
SDValue SITargetLowering::lowerFPTRUNC_ROUND(SDValue Op,
|
||||
SelectionDAG &DAG) const {
|
||||
if (Op.getOperand(0)->getValueType(0) != MVT::f32)
|
||||
return SDValue();
|
||||
|
||||
// Only support towardzero, tonearest, upward and downward.
|
||||
int RoundMode = Op.getConstantOperandVal(1);
|
||||
if (RoundMode != (int)RoundingMode::TowardZero &&
|
||||
RoundMode != (int)RoundingMode::NearestTiesToEven &&
|
||||
RoundMode != (int)RoundingMode::TowardPositive &&
|
||||
RoundMode != (int)RoundingMode::TowardNegative)
|
||||
return SDValue();
|
||||
|
||||
// Mapping applied by the (RoundMode + 3) % 4 expression below:
// "round.towardzero" -> TowardZero 0 -> FP_ROUND_ROUND_TO_ZERO 3
|
||||
// "round.tonearest" -> NearestTiesToEven 1 -> FP_ROUND_ROUND_TO_NEAREST 0
|
||||
// "round.upward" -> TowardPositive 2 -> FP_ROUND_ROUND_TO_INF 1
|
||||
// "round.downward" -> TowardNegative 3 -> FP_ROUND_ROUND_TO_NEGINF 2
|
||||
unsigned HW_Mode = (RoundMode + 3) % 4;
|
||||
SDLoc DL(Op);
|
||||
SDValue RoundFlag = DAG.getTargetConstant(HW_Mode, DL, MVT::i32);
|
||||
return DAG.getNode(AMDGPUISD::FPTRUNC_ROUND, DL, Op.getNode()->getVTList(),
|
||||
Op->getOperand(0), RoundFlag);
|
||||
}
|
||||
|
||||
SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
|
||||
assert(Op.getValueType() == MVT::f16 &&
|
||||
"Do not know how to custom lower FP_ROUND for non-f16 type");
|
||||
|
||||
@ -145,7 +145,6 @@ private:
|
||||
|
||||
/// Custom lowering for ISD::FP_ROUND for MVT::f16.
|
||||
SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue lowerFPTRUNC_ROUND(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue lowerMUL(SDValue Op, SelectionDAG &DAG) const;
|
||||
|
||||
@ -304,12 +304,6 @@ def SIdenorm_mode : SDNode<"AMDGPUISD::DENORM_MODE",
|
||||
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]
|
||||
>;
|
||||
|
||||
def SDTFPRoundModeOp : SDTypeProfile<1, 2, [
|
||||
SDTCisFP<0>, SDTCisFP<1>, SDTCisInt<2>, SDTCisOpSmallerThanOp<0, 1>, SDTCisSameNumEltsAs<0, 1>
|
||||
]>;
|
||||
|
||||
def SIfptrunc_round : SDNode<"AMDGPUISD::FPTRUNC_ROUND", SDTFPRoundModeOp>;
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// ValueType helpers
|
||||
//===----------------------------------------------------------------------===//
|
||||
@ -796,6 +790,22 @@ return CurDAG->getTargetConstant(
|
||||
N->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(N), MVT::i64);
|
||||
}]>;
|
||||
|
||||
// Transforms a target-immediate rounding mode into an i32 target constant
// holding (value + 3) % 4, following the table below. The transform itself
// does not range-check the immediate; the fptrunc_round pattern pairs it with
// the SupportedRoundMode predicate.
def as_hw_round_mode : SDNodeXForm<timm, [{
|
||||
// "round.towardzero" -> TowardZero 0 -> FP_ROUND_ROUND_TO_ZERO 3
|
||||
// "round.tonearest" -> NearestTiesToEven 1 -> FP_ROUND_ROUND_TO_NEAREST 0
|
||||
// "round.upward" -> TowardPositive 2 -> FP_ROUND_ROUND_TO_INF 1
|
||||
// "round.downward" -> TowardNegative 3 -> FP_ROUND_ROUND_TO_NEGINF 2
|
||||
return CurDAG->getTargetConstant((N->getSExtValue() + 3) % 4, SDLoc(N),
|
||||
MVT::i32);
|
||||
}]>;
|
||||
|
||||
// i32 target-immediate predicate that accepts only the four rounding modes
// TowardZero, NearestTiesToEven, TowardPositive and TowardNegative. Any other
// immediate value fails the predicate.
def SupportedRoundMode : TImmLeaf<i32, [{
|
||||
return Imm == (int)RoundingMode::TowardZero ||
|
||||
Imm == (int)RoundingMode::NearestTiesToEven ||
|
||||
Imm == (int)RoundingMode::TowardPositive ||
|
||||
Imm == (int)RoundingMode::TowardNegative;
|
||||
}]>;
|
||||
|
||||
class bitextract_imm<int bitnum> : SDNodeXForm<imm, [{
|
||||
uint64_t Imm = N->getZExtValue();
|
||||
unsigned Bit = (Imm >> }] # bitnum # [{ ) & 1;
|
||||
|
||||
@ -229,10 +229,12 @@ def S_INVERSE_BALLOT_U64 : SPseudoInstSI<
|
||||
// in the ModeRegister pass.
|
||||
let Uses = [MODE, EXEC] in {
|
||||
def FPTRUNC_ROUND_F16_F32_PSEUDO : VPseudoInstSI <(outs VGPR_32:$vdst),
|
||||
(ins VGPR_32:$src0, i32imm:$round),
|
||||
[(set f16:$vdst, (SIfptrunc_round f32:$src0, i32:$round))]>;
|
||||
(ins VGPR_32:$src0, i32imm:$round)>;
|
||||
} // End Uses = [MODE, EXEC]
|
||||
|
||||
// Select an f32 -> f16 fptrunc_round whose rounding-mode immediate satisfies
// SupportedRoundMode to FPTRUNC_ROUND_F16_F32_PSEUDO, passing the rounding
// mode through the as_hw_round_mode transform.
def : GCNPat <(f16 (fptrunc_round f32:$src0, (i32 SupportedRoundMode:$round))),
|
||||
(FPTRUNC_ROUND_F16_F32_PSEUDO $src0, (as_hw_round_mode $round))>;
|
||||
|
||||
// Invert the exec mask and overwrite the inactive lanes of dst with inactive,
|
||||
// restoring it after we're done.
|
||||
let Defs = [SCC], isConvergent = 1 in {
|
||||
@ -4055,11 +4057,6 @@ def G_SI_CALL : AMDGPUGenericInstruction {
|
||||
let isConvergent = 1;
|
||||
}
|
||||
|
||||
def G_FPTRUNC_ROUND : AMDGPUGenericInstruction {
|
||||
let OutOperandList = (outs type0:$vdst);
|
||||
let InOperandList = (ins type1:$src0, untyped_imm_0:$round);
|
||||
let hasSideEffects = 0;
|
||||
}
|
||||
|
||||
//============================================================================//
|
||||
// Dummy Instructions
|
||||
|
||||
@ -1,9 +1,8 @@
|
||||
; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1030 -o /dev/null %s 2>&1 | FileCheck %s --ignore-case --check-prefixes=SDAG-FAIL
|
||||
; RUN: not --crash llc -global-isel -mtriple=amdgcn -mcpu=gfx1030 -o /dev/null %s 2>&1 | FileCheck %s --ignore-case --check-prefix=GISEL-FAIL
|
||||
; RUN: not --crash llc -mtriple=amdgcn -mcpu=gfx1030 -o /dev/null %s 2>&1 | FileCheck %s --ignore-case --check-prefix=FAIL
|
||||
; RUN: not --crash llc -global-isel -mtriple=amdgcn -mcpu=gfx1030 -o /dev/null %s 2>&1 | FileCheck %s --ignore-case --check-prefix=FAIL
|
||||
|
||||
define amdgpu_gs void @test_fptrunc_round_f64(double %a, ptr addrspace(1) %out) {
|
||||
; SDAG-FAIL: LLVM ERROR: Cannot select
|
||||
; GISEL-FAIL: unable to legalize instruction
|
||||
; FAIL: LLVM ERROR: Cannot select
|
||||
%res = call half @llvm.fptrunc.round.f16.f64(double %a, metadata !"round.upward")
|
||||
store half %res, ptr addrspace(1) %out, align 4
|
||||
ret void
|
||||
|
||||
@ -176,8 +176,7 @@ define amdgpu_gs <2 x half> @v_fptrunc_round_v2f32_to_v2f16_upward(<2 x float> %
|
||||
; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
|
||||
; GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
|
||||
; GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
|
||||
; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
|
||||
; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1
|
||||
; GISEL-NEXT: ; return to shader part epilog
|
||||
%res = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %a, metadata !"round.upward")
|
||||
ret <2 x half> %res
|
||||
@ -197,8 +196,7 @@ define amdgpu_gs <2 x half> @v_fptrunc_round_v2f32_to_v2f16_downward(<2 x float>
|
||||
; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
|
||||
; GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
|
||||
; GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
|
||||
; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
|
||||
; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1
|
||||
; GISEL-NEXT: ; return to shader part epilog
|
||||
%res = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %a, metadata !"round.downward")
|
||||
ret <2 x half> %res
|
||||
@ -228,23 +226,18 @@ define amdgpu_gs void @v_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls(<2 x
|
||||
; GISEL: ; %bb.0:
|
||||
; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
|
||||
; GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
|
||||
; GISEL-NEXT: v_cvt_f16_f32_e32 v6, v2
|
||||
; GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
|
||||
; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
|
||||
; GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
|
||||
; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 1
|
||||
; GISEL-NEXT: v_cvt_f16_f32_e32 v6, v2
|
||||
; GISEL-NEXT: v_cvt_f16_f32_e32 v7, v3
|
||||
; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v6
|
||||
; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1
|
||||
; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
|
||||
; GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3
|
||||
; GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
|
||||
; GISEL-NEXT: v_lshl_or_b32 v1, v7, 16, v6
|
||||
; GISEL-NEXT: v_lshl_or_b32 v2, v3, 16, v2
|
||||
; GISEL-NEXT: v_cvt_f16_f32_e32 v1, v2
|
||||
; GISEL-NEXT: v_cvt_f16_f32_e32 v2, v3
|
||||
; GISEL-NEXT: v_pack_b32_f16 v3, v6, v7
|
||||
; GISEL-NEXT: v_pack_b32_f16 v1, v1, v2
|
||||
; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0
|
||||
; GISEL-NEXT: v_pk_add_f16 v0, v0, v1
|
||||
; GISEL-NEXT: v_pk_add_f16 v0, v2, v0
|
||||
; GISEL-NEXT: v_pk_add_f16 v0, v0, v3
|
||||
; GISEL-NEXT: v_pk_add_f16 v0, v1, v0
|
||||
; GISEL-NEXT: global_store_dword v[4:5], v0, off
|
||||
; GISEL-NEXT: s_endpgm
|
||||
%res1 = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %a, metadata !"round.upward")
|
||||
@ -295,31 +288,54 @@ define amdgpu_gs <2 x i32> @s_fptrunc_round_v2f32_to_v2f16_downward(<2 x float>
|
||||
}
|
||||
|
||||
define amdgpu_gs void @s_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls(<2 x float> inreg %a, <2 x float> inreg %b, ptr addrspace(1) %out) {
|
||||
; CHECK-LABEL: s_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls:
|
||||
; CHECK: ; %bb.0:
|
||||
; CHECK-NEXT: v_mov_b32_e32 v2, s0
|
||||
; CHECK-NEXT: v_mov_b32_e32 v3, s2
|
||||
; CHECK-NEXT: v_mov_b32_e32 v4, s1
|
||||
; CHECK-NEXT: v_mov_b32_e32 v5, s3
|
||||
; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
|
||||
; CHECK-NEXT: v_cvt_f16_f32_e32 v2, v2
|
||||
; CHECK-NEXT: v_cvt_f16_f32_e32 v6, v3
|
||||
; CHECK-NEXT: v_cvt_f16_f32_e32 v4, v4
|
||||
; CHECK-NEXT: v_cvt_f16_f32_e32 v7, v5
|
||||
; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
|
||||
; CHECK-NEXT: v_cvt_f16_f32_e32 v3, v3
|
||||
; CHECK-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; CHECK-NEXT: v_and_b32_e32 v6, 0xffff, v6
|
||||
; CHECK-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; CHECK-NEXT: v_lshl_or_b32 v2, v4, 16, v2
|
||||
; CHECK-NEXT: v_cvt_f16_f32_e32 v4, v5
|
||||
; CHECK-NEXT: v_lshl_or_b32 v5, v7, 16, v6
|
||||
; CHECK-NEXT: v_lshl_or_b32 v3, v4, 16, v3
|
||||
; CHECK-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0
|
||||
; CHECK-NEXT: v_pk_add_f16 v2, v2, v5
|
||||
; CHECK-NEXT: v_pk_add_f16 v2, v3, v2
|
||||
; CHECK-NEXT: global_store_dword v[0:1], v2, off
|
||||
; CHECK-NEXT: s_endpgm
|
||||
; SDAG-LABEL: s_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls:
|
||||
; SDAG: ; %bb.0:
|
||||
; SDAG-NEXT: v_mov_b32_e32 v2, s0
|
||||
; SDAG-NEXT: v_mov_b32_e32 v3, s2
|
||||
; SDAG-NEXT: v_mov_b32_e32 v4, s1
|
||||
; SDAG-NEXT: v_mov_b32_e32 v5, s3
|
||||
; SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
|
||||
; SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2
|
||||
; SDAG-NEXT: v_cvt_f16_f32_e32 v6, v3
|
||||
; SDAG-NEXT: v_cvt_f16_f32_e32 v4, v4
|
||||
; SDAG-NEXT: v_cvt_f16_f32_e32 v7, v5
|
||||
; SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
|
||||
; SDAG-NEXT: v_cvt_f16_f32_e32 v3, v3
|
||||
; SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; SDAG-NEXT: v_and_b32_e32 v6, 0xffff, v6
|
||||
; SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
|
||||
; SDAG-NEXT: v_lshl_or_b32 v2, v4, 16, v2
|
||||
; SDAG-NEXT: v_cvt_f16_f32_e32 v4, v5
|
||||
; SDAG-NEXT: v_lshl_or_b32 v5, v7, 16, v6
|
||||
; SDAG-NEXT: v_lshl_or_b32 v3, v4, 16, v3
|
||||
; SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0
|
||||
; SDAG-NEXT: v_pk_add_f16 v2, v2, v5
|
||||
; SDAG-NEXT: v_pk_add_f16 v2, v3, v2
|
||||
; SDAG-NEXT: global_store_dword v[0:1], v2, off
|
||||
; SDAG-NEXT: s_endpgm
|
||||
;
|
||||
; GISEL-LABEL: s_fptrunc_round_v2f32_to_v2f16_upward_multiple_calls:
|
||||
; GISEL: ; %bb.0:
|
||||
; GISEL-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GISEL-NEXT: v_mov_b32_e32 v3, s1
|
||||
; GISEL-NEXT: v_mov_b32_e32 v4, s2
|
||||
; GISEL-NEXT: v_mov_b32_e32 v5, s3
|
||||
; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
|
||||
; GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
|
||||
; GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3
|
||||
; GISEL-NEXT: v_cvt_f16_f32_e32 v6, v4
|
||||
; GISEL-NEXT: v_cvt_f16_f32_e32 v7, v5
|
||||
; GISEL-NEXT: v_pack_b32_f16 v2, v2, v3
|
||||
; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 2
|
||||
; GISEL-NEXT: v_cvt_f16_f32_e32 v3, v4
|
||||
; GISEL-NEXT: v_cvt_f16_f32_e32 v4, v5
|
||||
; GISEL-NEXT: v_pack_b32_f16 v5, v6, v7
|
||||
; GISEL-NEXT: v_pack_b32_f16 v3, v3, v4
|
||||
; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 0
|
||||
; GISEL-NEXT: v_pk_add_f16 v2, v2, v5
|
||||
; GISEL-NEXT: v_pk_add_f16 v2, v3, v2
|
||||
; GISEL-NEXT: global_store_dword v[0:1], v2, off
|
||||
; GISEL-NEXT: s_endpgm
|
||||
%res1 = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %a, metadata !"round.upward")
|
||||
%res2 = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %b, metadata !"round.upward")
|
||||
%res3 = call <2 x half> @llvm.fptrunc.round.v2f16.v2f32(<2 x float> %b, metadata !"round.downward")
|
||||
@ -344,8 +360,7 @@ define amdgpu_gs <3 x half> @v_fptrunc_round_v3f32_to_v3f16_upward(<3 x float> %
|
||||
; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
|
||||
; GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
|
||||
; GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
|
||||
; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
|
||||
; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1
|
||||
; GISEL-NEXT: v_cvt_f16_f32_e32 v1, v2
|
||||
; GISEL-NEXT: ; return to shader part epilog
|
||||
%res = call <3 x half> @llvm.fptrunc.round.v3f16.v3f32(<3 x float> %a, metadata !"round.upward")
|
||||
@ -367,8 +382,7 @@ define amdgpu_gs <3 x half> @v_fptrunc_round_v3f32_to_v3f16_downward(<3 x float>
|
||||
; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
|
||||
; GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
|
||||
; GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
|
||||
; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
|
||||
; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1
|
||||
; GISEL-NEXT: v_cvt_f16_f32_e32 v1, v2
|
||||
; GISEL-NEXT: ; return to shader part epilog
|
||||
%res = call <3 x half> @llvm.fptrunc.round.v3f16.v3f32(<3 x float> %a, metadata !"round.downward")
|
||||
@ -391,13 +405,11 @@ define amdgpu_gs <4 x half> @v_fptrunc_round_v4f32_to_v4f16_upward(<4 x float> %
|
||||
; GISEL: ; %bb.0:
|
||||
; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
|
||||
; GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
|
||||
; GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
|
||||
; GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
|
||||
; GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
|
||||
; GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3
|
||||
; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
|
||||
; GISEL-NEXT: v_lshl_or_b32 v1, v3, 16, v2
|
||||
; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1
|
||||
; GISEL-NEXT: v_pack_b32_f16 v1, v2, v3
|
||||
; GISEL-NEXT: ; return to shader part epilog
|
||||
%res = call <4 x half> @llvm.fptrunc.round.v4f16.v4f32(<4 x float> %a, metadata !"round.upward")
|
||||
ret <4 x half> %res
|
||||
@ -419,13 +431,11 @@ define amdgpu_gs <4 x half> @v_fptrunc_round_v4f32_to_v4f16_downward(<4 x float>
|
||||
; GISEL: ; %bb.0:
|
||||
; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
|
||||
; GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
|
||||
; GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
|
||||
; GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
|
||||
; GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
|
||||
; GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3
|
||||
; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
|
||||
; GISEL-NEXT: v_lshl_or_b32 v1, v3, 16, v2
|
||||
; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1
|
||||
; GISEL-NEXT: v_pack_b32_f16 v1, v2, v3
|
||||
; GISEL-NEXT: ; return to shader part epilog
|
||||
%res = call <4 x half> @llvm.fptrunc.round.v4f16.v4f32(<4 x float> %a, metadata !"round.downward")
|
||||
ret <4 x half> %res
|
||||
@ -453,21 +463,17 @@ define amdgpu_gs <8 x half> @v_fptrunc_round_v8f32_to_v8f16_upward(<8 x float> %
|
||||
; GISEL: ; %bb.0:
|
||||
; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 1), 1
|
||||
; GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
|
||||
; GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
|
||||
; GISEL-NEXT: v_cvt_f16_f32_e32 v4, v4
|
||||
; GISEL-NEXT: v_cvt_f16_f32_e32 v6, v6
|
||||
; GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
|
||||
; GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
|
||||
; GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3
|
||||
; GISEL-NEXT: v_cvt_f16_f32_e32 v4, v4
|
||||
; GISEL-NEXT: v_cvt_f16_f32_e32 v5, v5
|
||||
; GISEL-NEXT: v_cvt_f16_f32_e32 v6, v6
|
||||
; GISEL-NEXT: v_cvt_f16_f32_e32 v7, v7
|
||||
; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4
|
||||
; GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v6
|
||||
; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
|
||||
; GISEL-NEXT: v_lshl_or_b32 v1, v3, 16, v2
|
||||
; GISEL-NEXT: v_lshl_or_b32 v2, v5, 16, v4
|
||||
; GISEL-NEXT: v_lshl_or_b32 v3, v7, 16, v6
|
||||
; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1
|
||||
; GISEL-NEXT: v_pack_b32_f16 v1, v2, v3
|
||||
; GISEL-NEXT: v_pack_b32_f16 v2, v4, v5
|
||||
; GISEL-NEXT: v_pack_b32_f16 v3, v6, v7
|
||||
; GISEL-NEXT: ; return to shader part epilog
|
||||
%res = call <8 x half> @llvm.fptrunc.round.v8f16.v8f32(<8 x float> %a, metadata !"round.upward")
|
||||
ret <8 x half> %res
|
||||
@ -495,21 +501,17 @@ define amdgpu_gs <8 x half> @v_fptrunc_round_v8f32_to_v8f16_downward(<8 x float>
|
||||
; GISEL: ; %bb.0:
|
||||
; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 3, 1), 1
|
||||
; GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
|
||||
; GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
|
||||
; GISEL-NEXT: v_cvt_f16_f32_e32 v4, v4
|
||||
; GISEL-NEXT: v_cvt_f16_f32_e32 v6, v6
|
||||
; GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
|
||||
; GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
|
||||
; GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3
|
||||
; GISEL-NEXT: v_cvt_f16_f32_e32 v4, v4
|
||||
; GISEL-NEXT: v_cvt_f16_f32_e32 v5, v5
|
||||
; GISEL-NEXT: v_cvt_f16_f32_e32 v6, v6
|
||||
; GISEL-NEXT: v_cvt_f16_f32_e32 v7, v7
|
||||
; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
|
||||
; GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
|
||||
; GISEL-NEXT: v_and_b32_e32 v4, 0xffff, v4
|
||||
; GISEL-NEXT: v_and_b32_e32 v6, 0xffff, v6
|
||||
; GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0
|
||||
; GISEL-NEXT: v_lshl_or_b32 v1, v3, 16, v2
|
||||
; GISEL-NEXT: v_lshl_or_b32 v2, v5, 16, v4
|
||||
; GISEL-NEXT: v_lshl_or_b32 v3, v7, 16, v6
|
||||
; GISEL-NEXT: v_pack_b32_f16 v0, v0, v1
|
||||
; GISEL-NEXT: v_pack_b32_f16 v1, v2, v3
|
||||
; GISEL-NEXT: v_pack_b32_f16 v2, v4, v5
|
||||
; GISEL-NEXT: v_pack_b32_f16 v3, v6, v7
|
||||
; GISEL-NEXT: ; return to shader part epilog
|
||||
%res = call <8 x half> @llvm.fptrunc.round.v8f16.v8f32(<8 x float> %a, metadata !"round.downward")
|
||||
ret <8 x half> %res
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user