[SelectionDAG] Handle fneg/fabs/fcopysign in SimplifyDemandedBits (#139239)

This commit is contained in:
Iris Shi 2025-06-22 22:48:59 +08:00 committed by GitHub
parent 078475d6c1
commit f2eb5d416e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 268 additions and 271 deletions

View File

@ -18403,49 +18403,12 @@ SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) {
if (SDValue C = DAG.FoldConstantArithmetic(ISD::FCOPYSIGN, DL, VT, {N0, N1}))
return C;
if (ConstantFPSDNode *N1C = isConstOrConstSplatFP(N->getOperand(1))) {
const APFloat &V = N1C->getValueAPF();
// copysign(x, c1) -> fabs(x) iff ispos(c1)
// copysign(x, c1) -> fneg(fabs(x)) iff isneg(c1)
if (!V.isNegative()) {
if (!LegalOperations || TLI.isOperationLegal(ISD::FABS, VT))
return DAG.getNode(ISD::FABS, DL, VT, N0);
} else {
if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))
return DAG.getNode(ISD::FNEG, DL, VT,
DAG.getNode(ISD::FABS, SDLoc(N0), VT, N0));
}
}
// copysign(fabs(x), y) -> copysign(x, y)
// copysign(fneg(x), y) -> copysign(x, y)
// copysign(copysign(x,z), y) -> copysign(x, y)
if (N0.getOpcode() == ISD::FABS || N0.getOpcode() == ISD::FNEG ||
N0.getOpcode() == ISD::FCOPYSIGN)
return DAG.getNode(ISD::FCOPYSIGN, DL, VT, N0.getOperand(0), N1);
// copysign(x, abs(y)) -> abs(x)
if (N1.getOpcode() == ISD::FABS)
return DAG.getNode(ISD::FABS, DL, VT, N0);
// copysign(x, copysign(y,z)) -> copysign(x, z)
if (N1.getOpcode() == ISD::FCOPYSIGN)
return DAG.getNode(ISD::FCOPYSIGN, DL, VT, N0, N1.getOperand(1));
// copysign(x, fp_extend(y)) -> copysign(x, y)
// copysign(x, fp_round(y)) -> copysign(x, y)
if (CanCombineFCOPYSIGN_EXTEND_ROUND(N))
return DAG.getNode(ISD::FCOPYSIGN, DL, VT, N0, N1.getOperand(0));
// We only take the sign bit from the sign operand.
EVT SignVT = N1.getValueType();
if (SimplifyDemandedBits(N1,
APInt::getSignMask(SignVT.getScalarSizeInBits())))
return SDValue(N, 0);
// We only take the non-sign bits from the value operand
if (SimplifyDemandedBits(N0,
APInt::getSignedMaxValue(VT.getScalarSizeInBits())))
if (SimplifyDemandedBits(SDValue(N, 0)))
return SDValue(N, 0);
return SDValue();
@ -18972,6 +18935,9 @@ SDValue DAGCombiner::visitFNEG(SDNode *N) {
N0.getOperand(0));
}
if (SimplifyDemandedBits(SDValue(N, 0)))
return SDValue(N, 0);
if (SDValue Cast = foldSignChangeInBitcast(N))
return Cast;
@ -19045,14 +19011,8 @@ SDValue DAGCombiner::visitFABS(SDNode *N) {
if (SDValue C = DAG.FoldConstantArithmetic(ISD::FABS, DL, VT, {N0}))
return C;
// fold (fabs (fabs x)) -> (fabs x)
if (N0.getOpcode() == ISD::FABS)
return N->getOperand(0);
// fold (fabs (fneg x)) -> (fabs x)
// fold (fabs (fcopysign x, y)) -> (fabs x)
if (N0.getOpcode() == ISD::FNEG || N0.getOpcode() == ISD::FCOPYSIGN)
return DAG.getNode(ISD::FABS, DL, VT, N0.getOperand(0));
if (SimplifyDemandedBits(SDValue(N, 0)))
return SDValue(N, 0);
if (SDValue Cast = foldSignChangeInBitcast(N))
return Cast;

View File

@ -2967,6 +2967,77 @@ bool TargetLowering::SimplifyDemandedBits(
}
break;
}
// fabs(x): the result equals x with the sign bit forced to zero, so only the
// sign bit of the operand is affected by this node.
case ISD::FABS: {
SDValue Op0 = Op.getOperand(0);
// NOTE(review): BitWidth is declared outside this hunk; presumably the scalar
// bit width of Op -- confirm against the enclosing function.
APInt SignMask = APInt::getSignMask(BitWidth);
// If the caller does not demand the sign bit, the fabs is invisible to it and
// the operand can be used directly.
if (!DemandedBits.intersects(SignMask))
return TLO.CombineTo(Op, Op0);
// Recurse into the operand with the same demanded bits; a true result means
// something was simplified and recorded in TLO.
if (SimplifyDemandedBits(Op0, DemandedBits, DemandedElts, Known, TLO,
Depth + 1))
return true;
// Operand sign bit is known zero: fabs changes nothing.
if (Known.isNonNegative())
return TLO.CombineTo(Op, Op0);
// Operand sign bit is known one: clearing it is the same as flipping it, so
// replace with fneg, keeping this node's flags.
if (Known.isNegative())
return TLO.CombineTo(
Op, TLO.DAG.getNode(ISD::FNEG, dl, VT, Op0, Op->getFlags()));
// Sign of the operand is unknown: keep the operand's known bits but record
// that the result's sign bit is zero.
Known.Zero |= SignMask;
Known.One &= ~SignMask;
break;
}
// fcopysign(mag, sign): the result takes every non-sign bit from operand 0 and
// the sign bit from operand 1.
case ISD::FCOPYSIGN: {
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
// The sign mask is computed separately for each operand from its own scalar
// width rather than assuming both operands share one width.
unsigned BitWidth0 = Op0.getScalarValueSizeInBits();
unsigned BitWidth1 = Op1.getScalarValueSizeInBits();
APInt SignMask0 = APInt::getSignMask(BitWidth0);
APInt SignMask1 = APInt::getSignMask(BitWidth1);
// If the sign bit of the result is not demanded, the sign operand is
// irrelevant and the magnitude operand can be used directly.
if (!DemandedBits.intersects(SignMask0))
return TLO.CombineTo(Op, Op0);
// Demand only the non-sign demanded bits from the magnitude operand and only
// the sign bit from the sign operand. The second call is skipped when the
// first one already reports a simplification.
// NOTE(review): Known2 is declared outside this hunk -- confirm it is a
// KnownBits scratch value in the enclosing function.
if (SimplifyDemandedBits(Op0, ~SignMask0 & DemandedBits, DemandedElts,
Known, TLO, Depth + 1) ||
SimplifyDemandedBits(Op1, SignMask1, DemandedElts, Known2, TLO,
Depth + 1))
return true;
// Sign operand known non-negative: copysign(x, +) -> fabs(x), keeping this
// node's flags.
if (Known2.isNonNegative())
return TLO.CombineTo(
Op, TLO.DAG.getNode(ISD::FABS, dl, VT, Op0, Op->getFlags()));
// Sign operand known negative: copysign(x, -) -> fneg(fabs(x)). The inner
// fabs is created at Op0's location and no flags are passed to either node.
if (Known2.isNegative())
return TLO.CombineTo(
Op, TLO.DAG.getNode(ISD::FNEG, dl, VT,
TLO.DAG.getNode(ISD::FABS, SDLoc(Op0), VT, Op0)));
// Sign of the sign operand is unknown here, so drop any knowledge about the
// result's sign bit; the remaining bits keep what was learned from Op0.
Known.Zero &= ~SignMask0;
Known.One &= ~SignMask0;
break;
}
// fneg(x): the result equals x with the sign bit inverted.
case ISD::FNEG: {
SDValue Op0 = Op.getOperand(0);
// NOTE(review): BitWidth is declared outside this hunk; presumably the scalar
// bit width of Op -- confirm against the enclosing function.
APInt SignMask = APInt::getSignMask(BitWidth);
// If the sign bit is not demanded, the negation is invisible to the caller and
// the operand can be used directly.
if (!DemandedBits.intersects(SignMask))
return TLO.CombineTo(Op, Op0);
// Recurse into the operand with the same demanded bits; a true result means
// something was simplified and recorded in TLO.
if (SimplifyDemandedBits(Op0, DemandedBits, DemandedElts, Known, TLO,
Depth + 1))
return true;
// When the operand's sign bit is known, exactly one of Zero/One has it set;
// XOR-ing both with the mask swaps them, i.e. the known sign is inverted.
// When it is unknown, both stay clear for that bit.
if (!Known.isSignUnknown()) {
Known.Zero ^= SignMask;
Known.One ^= SignMask;
}
break;
}
default:
// We also ask the target about intrinsics (which could be specific to it).
if (Op.getOpcode() >= ISD::BUILTIN_OP_END ||

View File

@ -391,13 +391,10 @@ define float @extract_v4i32_copysign_build_vector(<4 x float> %a, <4 x float> %b
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: sub sp, sp, #16
; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
; CHECK-SD-NEXT: adrp x8, .LCPI16_0
; CHECK-SD-NEXT: mvni v1.4s, #128, lsl #24
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI16_0]
; CHECK-SD-NEXT: fabs v0.4s, v0.4s
; CHECK-SD-NEXT: mov x8, sp
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-SD-NEXT: bfi x8, x0, #2, #2
; CHECK-SD-NEXT: bif v0.16b, v2.16b, v1.16b
; CHECK-SD-NEXT: str q0, [sp]
; CHECK-SD-NEXT: ldr s0, [x8]
; CHECK-SD-NEXT: add sp, sp, #16
@ -425,10 +422,7 @@ entry:
define float @extract_v4i32_copysign_build_vector_const(<4 x float> %a, <4 x float> %b, i32 %c) {
; CHECK-SD-LABEL: extract_v4i32_copysign_build_vector_const:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: adrp x8, .LCPI17_0
; CHECK-SD-NEXT: mvni v1.4s, #128, lsl #24
; CHECK-SD-NEXT: ldr q2, [x8, :lo12:.LCPI17_0]
; CHECK-SD-NEXT: bif v0.16b, v2.16b, v1.16b
; CHECK-SD-NEXT: fabs v0.4s, v0.4s
; CHECK-SD-NEXT: mov s0, v0.s[2]
; CHECK-SD-NEXT: ret
;

View File

@ -4388,12 +4388,11 @@ define <2 x bfloat> @v_copysign_out_v2bf16_mag_v2bf16_sign_v2f32(<2 x bfloat> %m
; GFX8-LABEL: v_copysign_out_v2bf16_mag_v2bf16_sign_v2f32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v1
; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
@ -5267,13 +5266,12 @@ define amdgpu_ps i32 @s_copysign_out_v2bf16_mag_v2bf16_sign_v2f32(<2 x bfloat> i
;
; GFX8-LABEL: s_copysign_out_v2bf16_mag_v2bf16_sign_v2f32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_bfe_u32 s4, s1, 0x10010
; GFX8-NEXT: s_add_i32 s4, s4, s1
; GFX8-NEXT: s_or_b32 s3, s1, 0x400000
; GFX8-NEXT: s_add_i32 s6, s4, 0x7fff
; GFX8-NEXT: s_bfe_u32 s3, s1, 0x10010
; GFX8-NEXT: s_add_i32 s3, s3, s1
; GFX8-NEXT: s_addk_i32 s3, 0x7fff
; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], s1, s1
; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], exec
; GFX8-NEXT: s_cselect_b32 s1, s3, s6
; GFX8-NEXT: s_cselect_b32 s1, s1, s3
; GFX8-NEXT: s_bfe_u32 s3, s2, 0x10010
; GFX8-NEXT: s_add_i32 s3, s3, s2
; GFX8-NEXT: s_addk_i32 s3, 0x7fff
@ -6340,18 +6338,16 @@ define <3 x bfloat> @v_copysign_out_v3bf16_mag_v3bf16_sign_v3f32(<3 x bfloat> %m
; GFX8-LABEL: v_copysign_out_v3bf16_mag_v3bf16_sign_v3f32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v5, vcc
; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
; GFX8-NEXT: v_bfe_u32 v5, v2, 16, 1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v2
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
@ -7687,24 +7683,22 @@ define <4 x bfloat> @v_copysign_out_v4bf16_mag_v4bf16_sign_v4f32(<4 x bfloat> %m
; GFX8-LABEL: v_copysign_out_v4bf16_mag_v4bf16_sign_v4f32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_bfe_u32 v7, v4, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v4
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v4
; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v6, vcc
; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7
; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v2
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v7, v6, vcc
; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2
; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6

View File

@ -3227,40 +3227,38 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag,
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT: v_and_b32_e32 v3, 0xffe, v3
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; VI-NEXT: v_bfe_u32 v5, v1, 20, 11
; VI-NEXT: v_bfe_u32 v1, v1, 20, 11
; VI-NEXT: v_or_b32_e32 v0, v3, v0
; VI-NEXT: v_sub_u32_e32 v8, vcc, s4, v5
; VI-NEXT: v_sub_u32_e32 v5, vcc, s4, v1
; VI-NEXT: v_or_b32_e32 v3, 0x1000, v0
; VI-NEXT: v_med3_i32 v8, v8, 0, 13
; VI-NEXT: v_lshrrev_b32_e32 v9, v8, v3
; VI-NEXT: v_lshlrev_b32_e32 v8, v8, v9
; VI-NEXT: v_cmp_ne_u32_e32 vcc, v8, v3
; VI-NEXT: v_med3_i32 v5, v5, 0, 13
; VI-NEXT: v_lshrrev_b32_e32 v8, v5, v3
; VI-NEXT: v_lshlrev_b32_e32 v5, v5, v8
; VI-NEXT: v_cmp_ne_u32_e32 vcc, v5, v3
; VI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; VI-NEXT: v_add_u32_e32 v5, vcc, s5, v5
; VI-NEXT: v_lshlrev_b32_e32 v8, 12, v5
; VI-NEXT: v_or_b32_e32 v3, v9, v3
; VI-NEXT: v_or_b32_e32 v8, v0, v8
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v5
; VI-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc
; VI-NEXT: v_and_b32_e32 v8, 7, v3
; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v8
; VI-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v8
; VI-NEXT: v_add_u32_e32 v1, vcc, s5, v1
; VI-NEXT: v_lshlrev_b32_e32 v5, 12, v1
; VI-NEXT: v_or_b32_e32 v3, v8, v3
; VI-NEXT: v_or_b32_e32 v5, v0, v5
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v1
; VI-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; VI-NEXT: v_and_b32_e32 v5, 7, v3
; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v5
; VI-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; VI-NEXT: v_or_b32_e32 v8, v8, v9
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v5
; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; VI-NEXT: v_or_b32_e32 v5, v5, v8
; VI-NEXT: v_lshrrev_b32_e32 v3, 2, v3
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v8
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v5
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v5
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v1
; VI-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v5
; VI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v1
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
; VI-NEXT: v_mov_b32_e32 v3, 0x8000
; VI-NEXT: v_and_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; VI-NEXT: v_or_b32_e32 v0, v1, v0
; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v2
; VI-NEXT: s_mov_b32 s4, 0x7fff7fff
; VI-NEXT: v_bfi_b32 v0, s4, v0, v4
; VI-NEXT: s_setpc_b64 s[30:31]
@ -4050,41 +4048,38 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inr
; VI-NEXT: s_cmp_lg_u32 s0, 0
; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
; VI-NEXT: s_bfe_u32 s1, s1, 0xb0014
; VI-NEXT: v_readfirstlane_b32 s0, v0
; VI-NEXT: s_bfe_u32 s3, s1, 0xb0014
; VI-NEXT: s_sub_i32 s3, 0x3f1, s1
; VI-NEXT: s_or_b32 s0, s7, s0
; VI-NEXT: s_sub_i32 s7, 0x3f1, s3
; VI-NEXT: v_med3_i32 v0, s7, 0, 13
; VI-NEXT: v_med3_i32 v0, s3, 0, 13
; VI-NEXT: s_or_b32 s2, s0, 0x1000
; VI-NEXT: v_readfirstlane_b32 s7, v0
; VI-NEXT: s_lshr_b32 s8, s2, s7
; VI-NEXT: s_lshl_b32 s7, s8, s7
; VI-NEXT: s_cmp_lg_u32 s7, s2
; VI-NEXT: v_readfirstlane_b32 s3, v0
; VI-NEXT: s_lshr_b32 s7, s2, s3
; VI-NEXT: s_lshl_b32 s3, s7, s3
; VI-NEXT: s_cmp_lg_u32 s3, s2
; VI-NEXT: s_cselect_b32 s2, 1, 0
; VI-NEXT: s_addk_i32 s3, 0xfc10
; VI-NEXT: s_lshl_b32 s7, s3, 12
; VI-NEXT: s_or_b32 s2, s8, s2
; VI-NEXT: s_or_b32 s7, s0, s7
; VI-NEXT: s_cmp_lt_i32 s3, 1
; VI-NEXT: s_cselect_b32 s2, s2, s7
; VI-NEXT: s_and_b32 s7, s2, 7
; VI-NEXT: s_cmp_gt_i32 s7, 5
; VI-NEXT: s_cselect_b32 s8, 1, 0
; VI-NEXT: s_cmp_eq_u32 s7, 3
; VI-NEXT: s_addk_i32 s1, 0xfc10
; VI-NEXT: s_lshl_b32 s3, s1, 12
; VI-NEXT: s_or_b32 s2, s7, s2
; VI-NEXT: s_or_b32 s3, s0, s3
; VI-NEXT: s_cmp_lt_i32 s1, 1
; VI-NEXT: s_cselect_b32 s2, s2, s3
; VI-NEXT: s_and_b32 s3, s2, 7
; VI-NEXT: s_cmp_gt_i32 s3, 5
; VI-NEXT: s_cselect_b32 s7, 1, 0
; VI-NEXT: s_or_b32 s7, s7, s8
; VI-NEXT: s_cmp_eq_u32 s3, 3
; VI-NEXT: s_cselect_b32 s3, 1, 0
; VI-NEXT: s_or_b32 s3, s3, s7
; VI-NEXT: s_lshr_b32 s2, s2, 2
; VI-NEXT: s_add_i32 s2, s2, s7
; VI-NEXT: s_cmp_lt_i32 s3, 31
; VI-NEXT: s_add_i32 s2, s2, s3
; VI-NEXT: s_cmp_lt_i32 s1, 31
; VI-NEXT: s_cselect_b32 s2, s2, 0x7c00
; VI-NEXT: s_cmp_lg_u32 s0, 0
; VI-NEXT: s_cselect_b32 s0, s6, 0x7c00
; VI-NEXT: s_cmpk_eq_i32 s3, 0x40f
; VI-NEXT: s_cmpk_eq_i32 s1, 0x40f
; VI-NEXT: s_cselect_b32 s0, s0, s2
; VI-NEXT: s_lshr_b32 s1, s1, 16
; VI-NEXT: s_and_b32 s1, s1, 0x8000
; VI-NEXT: s_or_b32 s0, s1, s0
; VI-NEXT: s_and_b32 s0, s0, 0xffff
; VI-NEXT: s_and_b32 s0, s0, 0x7fff
; VI-NEXT: s_or_b32 s0, s0, s5
; VI-NEXT: s_mov_b32 s1, 0x7fff7fff
; VI-NEXT: v_mov_b32_e32 v0, s0
@ -4918,40 +4913,37 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag,
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT: v_and_b32_e32 v5, 0xffe, v5
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; VI-NEXT: v_bfe_u32 v8, v1, 20, 11
; VI-NEXT: v_bfe_u32 v1, v1, 20, 11
; VI-NEXT: v_or_b32_e32 v0, v5, v0
; VI-NEXT: v_sub_u32_e32 v11, vcc, s4, v8
; VI-NEXT: v_sub_u32_e32 v8, vcc, s4, v1
; VI-NEXT: v_or_b32_e32 v5, 0x1000, v0
; VI-NEXT: v_med3_i32 v11, v11, 0, 13
; VI-NEXT: v_lshrrev_b32_e32 v12, v11, v5
; VI-NEXT: v_lshlrev_b32_e32 v11, v11, v12
; VI-NEXT: v_cmp_ne_u32_e32 vcc, v11, v5
; VI-NEXT: v_med3_i32 v8, v8, 0, 13
; VI-NEXT: v_lshrrev_b32_e32 v11, v8, v5
; VI-NEXT: v_lshlrev_b32_e32 v8, v8, v11
; VI-NEXT: v_cmp_ne_u32_e32 vcc, v8, v5
; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; VI-NEXT: v_add_u32_e32 v8, vcc, s5, v8
; VI-NEXT: v_lshlrev_b32_e32 v11, 12, v8
; VI-NEXT: v_or_b32_e32 v5, v12, v5
; VI-NEXT: v_or_b32_e32 v11, v0, v11
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v8
; VI-NEXT: v_cndmask_b32_e32 v5, v11, v5, vcc
; VI-NEXT: v_and_b32_e32 v11, 7, v5
; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v11
; VI-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v11
; VI-NEXT: v_add_u32_e32 v1, vcc, s5, v1
; VI-NEXT: v_lshlrev_b32_e32 v8, 12, v1
; VI-NEXT: v_or_b32_e32 v5, v11, v5
; VI-NEXT: v_or_b32_e32 v8, v0, v8
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v1
; VI-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc
; VI-NEXT: v_and_b32_e32 v8, 7, v5
; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v8
; VI-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; VI-NEXT: v_or_b32_e32 v11, v11, v12
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v8
; VI-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; VI-NEXT: v_or_b32_e32 v8, v8, v11
; VI-NEXT: v_lshrrev_b32_e32 v5, 2, v5
; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v11
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v8
; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v8
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v1
; VI-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v8
; VI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v1
; VI-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
; VI-NEXT: v_mov_b32_e32 v5, 0x8000
; VI-NEXT: v_and_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_and_b32_e32 v5, 0x1ff, v3
; VI-NEXT: v_or_b32_e32 v2, v5, v2
; VI-NEXT: v_or_b32_e32 v0, v1, v0
; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v3
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; VI-NEXT: v_and_b32_e32 v1, 0xffe, v1
@ -4986,7 +4978,8 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag,
; VI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v3
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: s_mov_b32 s4, 0x7fff7fff
; VI-NEXT: v_bfi_b32 v0, s4, v0, v6
; VI-NEXT: v_bfi_b32 v1, s4, v4, v7
@ -6061,76 +6054,73 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag,
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
; VI-NEXT: v_and_b32_e32 v10, 0xffe, v10
; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; VI-NEXT: v_bfe_u32 v11, v5, 20, 11
; VI-NEXT: v_bfe_u32 v5, v5, 20, 11
; VI-NEXT: s_movk_i32 s4, 0x3f1
; VI-NEXT: v_or_b32_e32 v4, v10, v4
; VI-NEXT: v_sub_u32_e32 v12, vcc, s4, v11
; VI-NEXT: v_sub_u32_e32 v11, vcc, s4, v5
; VI-NEXT: v_or_b32_e32 v10, 0x1000, v4
; VI-NEXT: v_med3_i32 v12, v12, 0, 13
; VI-NEXT: v_lshrrev_b32_e32 v13, v12, v10
; VI-NEXT: v_lshlrev_b32_e32 v12, v12, v13
; VI-NEXT: v_cmp_ne_u32_e32 vcc, v12, v10
; VI-NEXT: v_med3_i32 v11, v11, 0, 13
; VI-NEXT: v_lshrrev_b32_e32 v12, v11, v10
; VI-NEXT: v_lshlrev_b32_e32 v11, v11, v12
; VI-NEXT: v_cmp_ne_u32_e32 vcc, v11, v10
; VI-NEXT: s_movk_i32 s5, 0xfc10
; VI-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; VI-NEXT: v_add_u32_e32 v11, vcc, s5, v11
; VI-NEXT: v_lshlrev_b32_e32 v12, 12, v11
; VI-NEXT: v_or_b32_e32 v10, v13, v10
; VI-NEXT: v_or_b32_e32 v12, v4, v12
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v11
; VI-NEXT: v_cndmask_b32_e32 v10, v12, v10, vcc
; VI-NEXT: v_and_b32_e32 v12, 7, v10
; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v12
; VI-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v12
; VI-NEXT: v_add_u32_e32 v5, vcc, s5, v5
; VI-NEXT: v_lshlrev_b32_e32 v11, 12, v5
; VI-NEXT: v_or_b32_e32 v10, v12, v10
; VI-NEXT: v_or_b32_e32 v11, v4, v11
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v5
; VI-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc
; VI-NEXT: v_and_b32_e32 v11, 7, v10
; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v11
; VI-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
; VI-NEXT: v_or_b32_e32 v12, v12, v13
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v11
; VI-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; VI-NEXT: v_or_b32_e32 v11, v11, v12
; VI-NEXT: v_lshrrev_b32_e32 v10, 2, v10
; VI-NEXT: v_add_u32_e32 v10, vcc, v10, v12
; VI-NEXT: v_mov_b32_e32 v12, 0x7c00
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v11
; VI-NEXT: v_cndmask_b32_e32 v10, v12, v10, vcc
; VI-NEXT: v_mov_b32_e32 v13, 0x7e00
; VI-NEXT: v_add_u32_e32 v10, vcc, v10, v11
; VI-NEXT: v_mov_b32_e32 v11, 0x7c00
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v5
; VI-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc
; VI-NEXT: v_mov_b32_e32 v12, 0x7e00
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
; VI-NEXT: s_movk_i32 s6, 0x40f
; VI-NEXT: v_cndmask_b32_e32 v4, v12, v13, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v11
; VI-NEXT: v_cndmask_b32_e32 v4, v11, v12, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v5
; VI-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc
; VI-NEXT: v_mov_b32_e32 v10, 0x8000
; VI-NEXT: v_and_b32_e32 v11, 0x1ff, v7
; VI-NEXT: v_and_b32_sdwa v5, v5, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v6, v11, v6
; VI-NEXT: v_or_b32_e32 v4, v5, v4
; VI-NEXT: v_and_b32_e32 v10, 0x1ff, v7
; VI-NEXT: v_or_b32_e32 v6, v10, v6
; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v7
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
; VI-NEXT: v_and_b32_e32 v5, 0xffe, v5
; VI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; VI-NEXT: v_bfe_u32 v7, v7, 20, 11
; VI-NEXT: v_or_b32_e32 v5, v5, v6
; VI-NEXT: v_sub_u32_e32 v11, vcc, s4, v7
; VI-NEXT: v_sub_u32_e32 v10, vcc, s4, v7
; VI-NEXT: v_or_b32_e32 v6, 0x1000, v5
; VI-NEXT: v_med3_i32 v11, v11, 0, 13
; VI-NEXT: v_lshrrev_b32_e32 v14, v11, v6
; VI-NEXT: v_lshlrev_b32_e32 v11, v11, v14
; VI-NEXT: v_cmp_ne_u32_e32 vcc, v11, v6
; VI-NEXT: v_med3_i32 v10, v10, 0, 13
; VI-NEXT: v_lshrrev_b32_e32 v13, v10, v6
; VI-NEXT: v_lshlrev_b32_e32 v10, v10, v13
; VI-NEXT: v_cmp_ne_u32_e32 vcc, v10, v6
; VI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; VI-NEXT: v_add_u32_e32 v7, vcc, s5, v7
; VI-NEXT: v_lshlrev_b32_e32 v11, 12, v7
; VI-NEXT: v_or_b32_e32 v6, v14, v6
; VI-NEXT: v_or_b32_e32 v11, v5, v11
; VI-NEXT: v_lshlrev_b32_e32 v10, 12, v7
; VI-NEXT: v_or_b32_e32 v6, v13, v6
; VI-NEXT: v_or_b32_e32 v10, v5, v10
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v7
; VI-NEXT: v_cndmask_b32_e32 v6, v11, v6, vcc
; VI-NEXT: v_and_b32_e32 v11, 7, v6
; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v11
; VI-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v11
; VI-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; VI-NEXT: v_or_b32_e32 v11, v11, v14
; VI-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc
; VI-NEXT: v_and_b32_e32 v10, 7, v6
; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v10
; VI-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v10
; VI-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; VI-NEXT: v_or_b32_e32 v10, v10, v13
; VI-NEXT: v_lshrrev_b32_e32 v6, 2, v6
; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v11
; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v10
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v7
; VI-NEXT: v_cndmask_b32_e32 v6, v12, v6, vcc
; VI-NEXT: v_cndmask_b32_e32 v6, v11, v6, vcc
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
; VI-NEXT: v_cndmask_b32_e32 v5, v12, v13, vcc
; VI-NEXT: v_cndmask_b32_e32 v5, v11, v12, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v7
; VI-NEXT: v_and_b32_e32 v7, 0x1ff, v1
; VI-NEXT: v_or_b32_e32 v0, v7, v0
@ -6139,39 +6129,37 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag,
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT: v_and_b32_e32 v6, 0xffe, v6
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; VI-NEXT: v_bfe_u32 v7, v1, 20, 11
; VI-NEXT: v_bfe_u32 v1, v1, 20, 11
; VI-NEXT: v_or_b32_e32 v0, v6, v0
; VI-NEXT: v_sub_u32_e32 v11, vcc, s4, v7
; VI-NEXT: v_sub_u32_e32 v7, vcc, s4, v1
; VI-NEXT: v_or_b32_e32 v6, 0x1000, v0
; VI-NEXT: v_med3_i32 v11, v11, 0, 13
; VI-NEXT: v_lshrrev_b32_e32 v14, v11, v6
; VI-NEXT: v_lshlrev_b32_e32 v11, v11, v14
; VI-NEXT: v_cmp_ne_u32_e32 vcc, v11, v6
; VI-NEXT: v_med3_i32 v7, v7, 0, 13
; VI-NEXT: v_lshrrev_b32_e32 v10, v7, v6
; VI-NEXT: v_lshlrev_b32_e32 v7, v7, v10
; VI-NEXT: v_cmp_ne_u32_e32 vcc, v7, v6
; VI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; VI-NEXT: v_add_u32_e32 v7, vcc, s5, v7
; VI-NEXT: v_lshlrev_b32_e32 v11, 12, v7
; VI-NEXT: v_or_b32_e32 v6, v14, v6
; VI-NEXT: v_or_b32_e32 v11, v0, v11
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v7
; VI-NEXT: v_cndmask_b32_e32 v6, v11, v6, vcc
; VI-NEXT: v_and_b32_e32 v11, 7, v6
; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v11
; VI-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v11
; VI-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; VI-NEXT: v_or_b32_e32 v11, v11, v14
; VI-NEXT: v_add_u32_e32 v1, vcc, s5, v1
; VI-NEXT: v_lshlrev_b32_e32 v7, 12, v1
; VI-NEXT: v_or_b32_e32 v6, v10, v6
; VI-NEXT: v_or_b32_e32 v7, v0, v7
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v1
; VI-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc
; VI-NEXT: v_and_b32_e32 v7, 7, v6
; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v7
; VI-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v7
; VI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; VI-NEXT: v_or_b32_e32 v7, v7, v10
; VI-NEXT: v_lshrrev_b32_e32 v6, 2, v6
; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v11
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v7
; VI-NEXT: v_cndmask_b32_e32 v6, v12, v6, vcc
; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v7
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v1
; VI-NEXT: v_cndmask_b32_e32 v6, v11, v6, vcc
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v7
; VI-NEXT: v_cndmask_b32_e32 v0, v11, v12, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v1
; VI-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
; VI-NEXT: v_and_b32_e32 v6, 0x1ff, v3
; VI-NEXT: v_and_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v6, v2
; VI-NEXT: v_or_b32_e32 v0, v1, v0
; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v3
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; VI-NEXT: v_and_b32_e32 v1, 0xffe, v1
@ -6200,16 +6188,18 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag,
; VI-NEXT: v_lshrrev_b32_e32 v2, 2, v2
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3
; VI-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc
; VI-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
; VI-NEXT: v_cndmask_b32_e32 v1, v12, v13, vcc
; VI-NEXT: v_cndmask_b32_e32 v1, v11, v12, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v3
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v5
; VI-NEXT: v_and_b32_e32 v2, 0x7fff, v4
; VI-NEXT: s_mov_b32 s4, 0x7fff7fff
; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v1, v2, v1
; VI-NEXT: v_bfi_b32 v0, s4, v0, v8
; VI-NEXT: v_bfi_b32 v1, s4, v1, v9
; VI-NEXT: s_setpc_b64 s[30:31]

View File

@ -452,7 +452,7 @@ define half @add_select_fabs_negk_negk_f16(i32 %c, half %x) {
; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-NEXT: v_cndmask_b32_e64 v0, -1.0, -2.0, vcc
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_add_f32_e64 v0, |v0|, v1
; CI-NEXT: v_sub_f32_e32 v0, v1, v0
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: add_select_fabs_negk_negk_f16:
@ -462,7 +462,7 @@ define half @add_select_fabs_negk_negk_f16(i32 %c, half %x) {
; VI-NEXT: v_mov_b32_e32 v3, 0xc000
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; VI-NEXT: v_add_f16_e64 v0, |v0|, v1
; VI-NEXT: v_sub_f16_e32 v0, v1, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SAFE-TRUE16-LABEL: add_select_fabs_negk_negk_f16:
@ -472,7 +472,7 @@ define half @add_select_fabs_negk_negk_f16(i32 %c, half %x) {
; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0xbc00, v2.l, vcc_lo
; GFX11-SAFE-TRUE16-NEXT: v_add_f16_e64 v0.l, |v0.l|, v1.l
; GFX11-SAFE-TRUE16-NEXT: v_sub_f16_e32 v0.l, v1.l, v0.l
; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SAFE-FAKE16-LABEL: add_select_fabs_negk_negk_f16:
@ -482,7 +482,7 @@ define half @add_select_fabs_negk_negk_f16(i32 %c, half %x) {
; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v2, vcc_lo
; GFX11-SAFE-FAKE16-NEXT: v_add_f16_e64 v0, |v0|, v1
; GFX11-SAFE-FAKE16-NEXT: v_sub_f16_e32 v0, v1, v0
; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-NSZ-TRUE16-LABEL: add_select_fabs_negk_negk_f16:
@ -492,7 +492,7 @@ define half @add_select_fabs_negk_negk_f16(i32 %c, half %x) {
; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0xbc00, v2.l, vcc_lo
; GFX11-NSZ-TRUE16-NEXT: v_add_f16_e64 v0.l, |v0.l|, v1.l
; GFX11-NSZ-TRUE16-NEXT: v_sub_f16_e32 v0.l, v1.l, v0.l
; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-NSZ-FAKE16-LABEL: add_select_fabs_negk_negk_f16:
@ -502,7 +502,7 @@ define half @add_select_fabs_negk_negk_f16(i32 %c, half %x) {
; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v2, vcc_lo
; GFX11-NSZ-FAKE16-NEXT: v_add_f16_e64 v0, |v0|, v1
; GFX11-NSZ-FAKE16-NEXT: v_sub_f16_e32 v0, v1, v0
; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq i32 %c, 0
%select = select i1 %cmp, half -2.0, half -1.0

View File

@ -132,12 +132,11 @@ define amdgpu_kernel void @add_select_fabs_negk_f32(i32 %c) #0 {
ret void
}
; FIXME: fabs should fold away
; GCN-LABEL: {{^}}add_select_fabs_negk_negk_f32:
; GCN: buffer_load_dword [[X:v[0-9]+]]
; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -1.0, -2.0, s
; GCN: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[X]]
; GCN: v_sub_f32_e32 v{{[0-9]+}}, [[X]], [[SELECT]]
define amdgpu_kernel void @add_select_fabs_negk_negk_f32(i32 %c) #0 {
%x = load volatile float, ptr addrspace(1) poison
%cmp = icmp eq i32 %c, 0

View File

@ -776,20 +776,16 @@ define <2 x half> @add_select_fabs_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x)
; CI-LABEL: add_select_fabs_negk_negk_v2f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-NEXT: v_cndmask_b32_e64 v0, -1.0, -2.0, vcc
; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
; CI-NEXT: v_cndmask_b32_e64 v1, -1.0, -2.0, vcc
; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
; CI-NEXT: v_cndmask_b32_e64 v1, -1.0, -2.0, vcc
; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0|
; CI-NEXT: v_cvt_f32_f16_e64 v1, |v1|
; CI-NEXT: v_add_f32_e32 v0, v0, v2
; CI-NEXT: v_add_f32_e32 v1, v1, v3
; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-NEXT: v_cndmask_b32_e64 v0, -1.0, -2.0, vcc
; CI-NEXT: v_sub_f32_e32 v1, v3, v1
; CI-NEXT: v_sub_f32_e32 v0, v2, v0
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: add_select_fabs_negk_negk_v2f16:
@ -801,8 +797,8 @@ define <2 x half> @add_select_fabs_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x)
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
; VI-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; VI-NEXT: v_add_f16_sdwa v1, |v1|, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT: v_add_f16_e64 v0, |v0|, v2
; VI-NEXT: v_sub_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_sub_f16_e32 v0, v2, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: s_setpc_b64 s[30:31]
;
@ -816,8 +812,7 @@ define <2 x half> @add_select_fabs_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
; GFX9-NEXT: v_pk_add_f16 v0, v0, v2
; GFX9-NEXT: v_pk_add_f16 v0, v2, v0 neg_lo:[0,1] neg_hi:[0,1]
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SAFE-TRUE16-LABEL: add_select_fabs_negk_negk_v2f16:
@ -831,9 +826,7 @@ define <2 x half> @add_select_fabs_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x)
; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, 0xbc00, v3.l, s0
; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SAFE-TRUE16-NEXT: v_pack_b32_f16 v0, v0.h, v0.l
; GFX11-SAFE-TRUE16-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SAFE-TRUE16-NEXT: v_pk_add_f16 v0, v0, v2
; GFX11-SAFE-TRUE16-NEXT: v_pk_add_f16 v0, v2, v0 neg_lo:[0,1] neg_hi:[0,1]
; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SAFE-FAKE16-LABEL: add_select_fabs_negk_negk_v2f16:
@ -846,9 +839,8 @@ define <2 x half> @add_select_fabs_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x)
; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v3, vcc_lo
; GFX11-SAFE-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SAFE-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
; GFX11-SAFE-FAKE16-NEXT: v_pk_add_f16 v0, v0, v2
; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-SAFE-FAKE16-NEXT: v_pk_add_f16 v0, v2, v0 neg_lo:[0,1] neg_hi:[0,1]
; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-NSZ-TRUE16-LABEL: add_select_fabs_negk_negk_v2f16:
@ -862,9 +854,7 @@ define <2 x half> @add_select_fabs_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x)
; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.h, 0xbc00, v3.l, s0
; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NSZ-TRUE16-NEXT: v_pack_b32_f16 v0, v0.h, v0.l
; GFX11-NSZ-TRUE16-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NSZ-TRUE16-NEXT: v_pk_add_f16 v0, v0, v2
; GFX11-NSZ-TRUE16-NEXT: v_pk_add_f16 v0, v2, v0 neg_lo:[0,1] neg_hi:[0,1]
; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-NSZ-FAKE16-LABEL: add_select_fabs_negk_negk_v2f16:
@ -877,9 +867,8 @@ define <2 x half> @add_select_fabs_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x)
; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v3, vcc_lo
; GFX11-NSZ-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NSZ-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
; GFX11-NSZ-FAKE16-NEXT: v_pk_add_f16 v0, v0, v2
; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NSZ-FAKE16-NEXT: v_pk_add_f16 v0, v2, v0 neg_lo:[0,1] neg_hi:[0,1]
; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31]
%cmp = icmp eq <2 x i32> %c, zeroinitializer
%select = select <2 x i1> %cmp, <2 x half> <half -2.0, half -2.0>, <2 x half> <half -1.0, half -1.0>

View File

@ -1515,7 +1515,7 @@ define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in
; EG-NEXT: TRUNC * T0.W, PV.W,
; EG-NEXT: MULADD_IEEE T1.W, -PV.W, T0.Y, T0.X,
; EG-NEXT: TRUNC * T0.W, PV.W,
; EG-NEXT: SETGE * T1.W, |PV.W|, |T0.Y|,
; EG-NEXT: SETGE * T1.W, |PV.W|, T0.Y,
; EG-NEXT: CNDE T1.W, PV.W, 0.0, literal.x,
; EG-NEXT: FLT_TO_UINT * T0.X, T0.W,
; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00)
@ -1658,7 +1658,7 @@ define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %i
; EG-NEXT: TRUNC * T0.W, PV.W,
; EG-NEXT: MULADD_IEEE T1.W, -PV.W, T0.Y, T0.X,
; EG-NEXT: TRUNC * T0.W, PV.W,
; EG-NEXT: SETGE * T1.W, |PV.W|, |T0.Y|,
; EG-NEXT: SETGE * T1.W, |PV.W|, T0.Y,
; EG-NEXT: CNDE T1.W, PV.W, 0.0, literal.x,
; EG-NEXT: FLT_TO_UINT * T0.X, T0.W,
; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00)
@ -1858,7 +1858,7 @@ define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i
; EG-NEXT: TRUNC * T0.W, PV.W,
; EG-NEXT: MULADD_IEEE T1.W, -PV.W, T0.X, T0.Z,
; EG-NEXT: TRUNC * T0.W, PV.W,
; EG-NEXT: SETGE * T1.W, |PV.W|, |T0.X|,
; EG-NEXT: SETGE * T1.W, |PV.W|, T0.X,
; EG-NEXT: CNDE T1.W, PV.W, 0.0, literal.x,
; EG-NEXT: FLT_TO_UINT * T0.X, T0.W,
; EG-NEXT: 1(1.401298e-45), 0(0.000000e+00)