AMDGPU: Handle sitofp and uitofp exponents in fast pow expansion

https://reviews.llvm.org/D158996
Matt Arsenault 2023-08-26 10:25:21 -04:00
parent 699685b718
commit dac8f974b5
3 changed files with 361 additions and 30 deletions
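In IR terms, the change lets the fast-math pow expansion treat a sitofp or uitofp result as an integral exponent, so calls like the one below are now expanded instead of being left as library calls. A rough before/after sketch, mirroring the FileCheck output further down (value names are illustrative):

  %y.cast = sitofp i32 %y to float
  %pow = tail call afn nnan ninf float @_Z3powff(float %x, float %y.cast)

becomes, in outline:

  %__fabs = call nnan ninf afn float @llvm.fabs.f32(float %x)
  %__log2 = call nnan ninf afn float @_Z4log2f(float %__fabs)
  %__ylogx = fmul nnan ninf afn float %__log2, %y.cast
  %__exp2 = call nnan ninf afn float @_Z4exp2f(float %__ylogx)
  ; ...plus a bit-level sign fixup that reapplies x's sign when y is odd.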

@@ -567,7 +567,8 @@ bool AMDGPULibCalls::fold_read_write_pipe(CallInst *CI, IRBuilder<> &B,
  return true;
}

static bool isKnownIntegral(const Value *V) {
static bool isKnownIntegral(const Value *V, const DataLayout &DL,
                            FastMathFlags FMF) {
  if (isa<UndefValue>(V))
    return true;
@@ -587,6 +588,24 @@ static bool isKnownIntegral(const Value *V) {
    return true;
  }

  const Instruction *I = dyn_cast<Instruction>(V);
  if (!I)
    return false;

  switch (I->getOpcode()) {
  case Instruction::SIToFP:
  case Instruction::UIToFP:
    // TODO: Could check nofpclass(inf) on incoming argument
    if (FMF.noInfs())
      return true;

    // Need to check int size cannot produce infinity, which computeKnownFPClass
    // knows how to do already.
    return isKnownNeverInfinity(I, DL);
  default:
    break;
  }

  return false;
}
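A finite sitofp/uitofp result is always integer-valued (once the source integer is large enough to be rounded during conversion, the FP spacing already exceeds 1), so integrality reduces to proving the conversion cannot overflow to infinity. A minimal sketch of the two cases, using hypothetical test functions that are not part of this patch:

  ; i32 -> float can never reach infinity (2^31 - 1 < FLT_MAX), so
  ; isKnownNeverInfinity proves integrality even without ninf:
  define float @fold_without_ninf(float %x, i32 %y.i) {
    %y = sitofp i32 %y.i to float
    %pow = tail call afn float @_Z3powff(float %x, float %y)
    ret float %pow
  }

  ; u256 -> float can round up to +inf (2^256 > FLT_MAX), so this exponent
  ; only counts as integral when the flags include ninf, as in the i256
  ; tests below:
  define float @needs_ninf(float %x, i256 %y.i) {
    %y = uitofp i256 %y.i to float
    %pow = tail call afn ninf float @_Z3powff(float %x, float %y)
    ret float %pow
  }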
@@ -1013,7 +1032,7 @@ bool AMDGPULibCalls::fold_pow(FPMathOperator *FPOp, IRBuilder<> &B,
  if (needcopysign && (FInfo.getId() == AMDGPULibFunc::EI_POW)) {
    // We cannot handle corner cases for a general pow() function; give up
    // unless y is a constant integral value. Then proceed as if it were pown.
    if (!isKnownIntegral(opr1))
    if (!isKnownIntegral(opr1, M->getDataLayout(), FPOp->getFastMathFlags()))
      return false;
  }
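For context on this guard: when x may be negative, the expansion computes exp2(y * log2(|x|)) and reapplies x's sign afterwards, which is only meaningful for an integral y. A few worked values:

  pow(-2.0, 3.0) = -(2^3) = -8.0   ; odd integral y: x's sign survives
  pow(-2.0, 4.0) =  (2^4) = 16.0   ; even integral y: result is positive
  pow(-2.0, 2.5) = NaN (real pow)  ; non-integral y: the fold must give up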

@@ -57,12 +57,95 @@ define half @test_pow_fast_f16__integral_y(half %x, i32 %y.i) {
; CHECK-LABEL: test_pow_fast_f16__integral_y:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_cvt_f32_i32_e32 v1, v1
; CHECK-NEXT: s_getpc_b64 s[16:17]
; CHECK-NEXT: s_add_u32 s16, s16, _Z3powDhDh@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s17, s17, _Z3powDhDh@rel32@hi+12
; CHECK-NEXT: v_cvt_f16_f32_e32 v1, v1
; CHECK-NEXT: s_setpc_b64 s[16:17]
; CHECK-NEXT: s_mov_b32 s16, s33
; CHECK-NEXT: s_mov_b32 s33, s32
; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[18:19]
; CHECK-NEXT: v_writelane_b32 v40, s16, 14
; CHECK-NEXT: v_writelane_b32 v40, s30, 0
; CHECK-NEXT: v_writelane_b32 v40, s31, 1
; CHECK-NEXT: v_writelane_b32 v40, s34, 2
; CHECK-NEXT: v_writelane_b32 v40, s35, 3
; CHECK-NEXT: v_writelane_b32 v40, s36, 4
; CHECK-NEXT: v_writelane_b32 v40, s37, 5
; CHECK-NEXT: v_writelane_b32 v40, s38, 6
; CHECK-NEXT: v_writelane_b32 v40, s39, 7
; CHECK-NEXT: s_addk_i32 s32, 0x800
; CHECK-NEXT: v_writelane_b32 v40, s40, 8
; CHECK-NEXT: v_writelane_b32 v40, s41, 9
; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2Dh@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2Dh@gotpcrel32@hi+12
; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT: v_mov_b32_e32 v42, v0
; CHECK-NEXT: v_cvt_f32_i32_e32 v0, v1
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT: v_writelane_b32 v40, s42, 10
; CHECK-NEXT: v_writelane_b32 v40, s43, 11
; CHECK-NEXT: v_writelane_b32 v40, s44, 12
; CHECK-NEXT: v_cvt_f16_f32_e32 v43, v0
; CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v42
; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT: v_writelane_b32 v40, s45, 13
; CHECK-NEXT: v_mov_b32_e32 v41, v31
; CHECK-NEXT: s_mov_b32 s42, s15
; CHECK-NEXT: s_mov_b32 s43, s14
; CHECK-NEXT: s_mov_b32 s44, s13
; CHECK-NEXT: s_mov_b32 s45, s12
; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11]
; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9]
; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2Dh@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2Dh@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT: v_mul_f16_e32 v0, v0, v43
; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37]
; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
; CHECK-NEXT: s_mov_b32 s12, s45
; CHECK-NEXT: s_mov_b32 s13, s44
; CHECK-NEXT: s_mov_b32 s14, s43
; CHECK-NEXT: s_mov_b32 s15, s42
; CHECK-NEXT: v_mov_b32_e32 v31, v41
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_cvt_i16_f16_e32 v1, v43
; CHECK-NEXT: v_lshlrev_b16_e32 v1, 15, v1
; CHECK-NEXT: v_and_b32_e32 v1, v1, v42
; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; CHECK-NEXT: v_or_b32_e32 v0, v1, v0
; CHECK-NEXT: v_readlane_b32 s45, v40, 13
; CHECK-NEXT: v_readlane_b32 s44, v40, 12
; CHECK-NEXT: v_readlane_b32 s43, v40, 11
; CHECK-NEXT: v_readlane_b32 s42, v40, 10
; CHECK-NEXT: v_readlane_b32 s41, v40, 9
; CHECK-NEXT: v_readlane_b32 s40, v40, 8
; CHECK-NEXT: v_readlane_b32 s39, v40, 7
; CHECK-NEXT: v_readlane_b32 s38, v40, 6
; CHECK-NEXT: v_readlane_b32 s37, v40, 5
; CHECK-NEXT: v_readlane_b32 s36, v40, 4
; CHECK-NEXT: v_readlane_b32 s35, v40, 3
; CHECK-NEXT: v_readlane_b32 s34, v40, 2
; CHECK-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-NEXT: v_readlane_b32 s30, v40, 0
; CHECK-NEXT: v_readlane_b32 s4, v40, 14
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[6:7]
; CHECK-NEXT: s_addk_i32 s32, 0xf800
; CHECK-NEXT: s_mov_b32 s33, s4
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
%y = sitofp i32 %y.i to half
%pow = tail call fast half @_Z3powDhDh(half %x, half %y)
ret half %pow
@@ -72,11 +155,93 @@ define float @test_pow_fast_f32__integral_y(float %x, i32 %y.i) {
; CHECK-LABEL: test_pow_fast_f32__integral_y:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_cvt_f32_i32_e32 v1, v1
; CHECK-NEXT: s_getpc_b64 s[16:17]
; CHECK-NEXT: s_add_u32 s16, s16, _Z3powff@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s17, s17, _Z3powff@rel32@hi+12
; CHECK-NEXT: s_setpc_b64 s[16:17]
; CHECK-NEXT: s_mov_b32 s16, s33
; CHECK-NEXT: s_mov_b32 s33, s32
; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[18:19]
; CHECK-NEXT: v_writelane_b32 v40, s16, 14
; CHECK-NEXT: v_writelane_b32 v40, s30, 0
; CHECK-NEXT: v_writelane_b32 v40, s31, 1
; CHECK-NEXT: v_writelane_b32 v40, s34, 2
; CHECK-NEXT: v_writelane_b32 v40, s35, 3
; CHECK-NEXT: v_writelane_b32 v40, s36, 4
; CHECK-NEXT: v_writelane_b32 v40, s37, 5
; CHECK-NEXT: v_writelane_b32 v40, s38, 6
; CHECK-NEXT: v_writelane_b32 v40, s39, 7
; CHECK-NEXT: s_addk_i32 s32, 0x800
; CHECK-NEXT: v_writelane_b32 v40, s40, 8
; CHECK-NEXT: v_writelane_b32 v40, s41, 9
; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2f@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2f@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT: v_writelane_b32 v40, s42, 10
; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT: v_writelane_b32 v40, s43, 11
; CHECK-NEXT: v_mov_b32_e32 v42, v0
; CHECK-NEXT: v_writelane_b32 v40, s44, 12
; CHECK-NEXT: v_and_b32_e32 v0, 0x7fffffff, v42
; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT: v_writelane_b32 v40, s45, 13
; CHECK-NEXT: v_mov_b32_e32 v41, v31
; CHECK-NEXT: s_mov_b32 s42, s15
; CHECK-NEXT: s_mov_b32 s43, s14
; CHECK-NEXT: s_mov_b32 s44, s13
; CHECK-NEXT: s_mov_b32 s45, s12
; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11]
; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9]
; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
; CHECK-NEXT: v_cvt_f32_i32_e32 v43, v1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2f@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2f@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT: v_mul_f32_e32 v0, v0, v43
; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37]
; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
; CHECK-NEXT: s_mov_b32 s12, s45
; CHECK-NEXT: s_mov_b32 s13, s44
; CHECK-NEXT: s_mov_b32 s14, s43
; CHECK-NEXT: s_mov_b32 s15, s42
; CHECK-NEXT: v_mov_b32_e32 v31, v41
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_cvt_i32_f32_e32 v1, v43
; CHECK-NEXT: v_readlane_b32 s45, v40, 13
; CHECK-NEXT: v_readlane_b32 s44, v40, 12
; CHECK-NEXT: v_readlane_b32 s43, v40, 11
; CHECK-NEXT: v_lshlrev_b32_e32 v1, 31, v1
; CHECK-NEXT: v_and_or_b32 v0, v1, v42, v0
; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; CHECK-NEXT: v_readlane_b32 s42, v40, 10
; CHECK-NEXT: v_readlane_b32 s41, v40, 9
; CHECK-NEXT: v_readlane_b32 s40, v40, 8
; CHECK-NEXT: v_readlane_b32 s39, v40, 7
; CHECK-NEXT: v_readlane_b32 s38, v40, 6
; CHECK-NEXT: v_readlane_b32 s37, v40, 5
; CHECK-NEXT: v_readlane_b32 s36, v40, 4
; CHECK-NEXT: v_readlane_b32 s35, v40, 3
; CHECK-NEXT: v_readlane_b32 s34, v40, 2
; CHECK-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-NEXT: v_readlane_b32 s30, v40, 0
; CHECK-NEXT: v_readlane_b32 s4, v40, 14
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[6:7]
; CHECK-NEXT: s_addk_i32 s32, 0xf800
; CHECK-NEXT: s_mov_b32 s33, s4
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
%y = sitofp i32 %y.i to float
%pow = tail call fast float @_Z3powff(float %x, float %y)
ret float %pow
@@ -86,11 +251,98 @@ define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) {
; CHECK-LABEL: test_pow_fast_f64__integral_y:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_cvt_f64_i32_e32 v[2:3], v2
; CHECK-NEXT: s_getpc_b64 s[16:17]
; CHECK-NEXT: s_add_u32 s16, s16, _Z3powdd@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s17, s17, _Z3powdd@rel32@hi+12
; CHECK-NEXT: s_setpc_b64 s[16:17]
; CHECK-NEXT: s_mov_b32 s16, s33
; CHECK-NEXT: s_mov_b32 s33, s32
; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[18:19]
; CHECK-NEXT: v_writelane_b32 v40, s16, 14
; CHECK-NEXT: v_writelane_b32 v40, s30, 0
; CHECK-NEXT: v_writelane_b32 v40, s31, 1
; CHECK-NEXT: v_writelane_b32 v40, s34, 2
; CHECK-NEXT: v_writelane_b32 v40, s35, 3
; CHECK-NEXT: v_writelane_b32 v40, s36, 4
; CHECK-NEXT: v_writelane_b32 v40, s37, 5
; CHECK-NEXT: v_writelane_b32 v40, s38, 6
; CHECK-NEXT: v_writelane_b32 v40, s39, 7
; CHECK-NEXT: s_addk_i32 s32, 0x800
; CHECK-NEXT: v_writelane_b32 v40, s40, 8
; CHECK-NEXT: v_writelane_b32 v40, s41, 9
; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT: v_writelane_b32 v40, s42, 10
; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT: v_writelane_b32 v40, s43, 11
; CHECK-NEXT: v_mov_b32_e32 v43, v1
; CHECK-NEXT: v_writelane_b32 v40, s44, 12
; CHECK-NEXT: v_mov_b32_e32 v42, v2
; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v43
; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT: v_writelane_b32 v40, s45, 13
; CHECK-NEXT: v_mov_b32_e32 v41, v31
; CHECK-NEXT: s_mov_b32 s42, s15
; CHECK-NEXT: s_mov_b32 s43, s14
; CHECK-NEXT: s_mov_b32 s44, s13
; CHECK-NEXT: s_mov_b32 s45, s12
; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11]
; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9]
; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
; CHECK-NEXT: v_cvt_f64_i32_e32 v[44:45], v42
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_mul_f64 v[0:1], v[0:1], v[44:45]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37]
; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
; CHECK-NEXT: s_mov_b32 s12, s45
; CHECK-NEXT: s_mov_b32 s13, s44
; CHECK-NEXT: s_mov_b32 s14, s43
; CHECK-NEXT: s_mov_b32 s15, s42
; CHECK-NEXT: v_mov_b32_e32 v31, v41
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_lshlrev_b32_e32 v2, 31, v42
; CHECK-NEXT: v_and_b32_e32 v2, v2, v43
; CHECK-NEXT: buffer_load_dword v45, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
; CHECK-NEXT: v_or_b32_e32 v1, v2, v1
; CHECK-NEXT: v_readlane_b32 s45, v40, 13
; CHECK-NEXT: v_readlane_b32 s44, v40, 12
; CHECK-NEXT: v_readlane_b32 s43, v40, 11
; CHECK-NEXT: v_readlane_b32 s42, v40, 10
; CHECK-NEXT: v_readlane_b32 s41, v40, 9
; CHECK-NEXT: v_readlane_b32 s40, v40, 8
; CHECK-NEXT: v_readlane_b32 s39, v40, 7
; CHECK-NEXT: v_readlane_b32 s38, v40, 6
; CHECK-NEXT: v_readlane_b32 s37, v40, 5
; CHECK-NEXT: v_readlane_b32 s36, v40, 4
; CHECK-NEXT: v_readlane_b32 s35, v40, 3
; CHECK-NEXT: v_readlane_b32 s34, v40, 2
; CHECK-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-NEXT: v_readlane_b32 s30, v40, 0
; CHECK-NEXT: v_readlane_b32 s4, v40, 14
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[6:7]
; CHECK-NEXT: s_addk_i32 s32, 0xf800
; CHECK-NEXT: s_mov_b32 s33, s4
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
%y = sitofp i32 %y.i to double
%pow = tail call fast double @_Z3powdd(double %x, double %y)
ret double %pow

@@ -2208,8 +2208,18 @@ define float @test_pow_afn_nnan_ninf_f32_known_integral_sitofp(float %x, i32 %y)
; CHECK-LABEL: define float @test_pow_afn_nnan_ninf_f32_known_integral_sitofp
; CHECK-SAME: (float [[X:%.*]], i32 [[Y:%.*]]) {
; CHECK-NEXT: [[Y_CAST:%.*]] = sitofp i32 [[Y]] to float
; CHECK-NEXT: [[POW:%.*]] = tail call nnan ninf afn float @_Z3powff(float [[X]], float [[Y_CAST]])
; CHECK-NEXT: ret float [[POW]]
; CHECK-NEXT: [[__FABS:%.*]] = call nnan ninf afn float @llvm.fabs.f32(float [[X]])
; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn float @_Z4log2f(float [[__FABS]])
; CHECK-NEXT: [[__YLOGX:%.*]] = fmul nnan ninf afn float [[__LOG2]], [[Y_CAST]]
; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn float @_Z4exp2f(float [[__YLOGX]])
; CHECK-NEXT: [[__YTOU:%.*]] = fptosi float [[Y_CAST]] to i32
; CHECK-NEXT: [[__YEVEN:%.*]] = shl i32 [[__YTOU]], 31
; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[X]] to i32
; CHECK-NEXT: [[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP1]]
; CHECK-NEXT: [[TMP2:%.*]] = bitcast float [[__EXP2]] to i32
; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[__POW_SIGN]], [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32 [[TMP3]] to float
; CHECK-NEXT: ret float [[TMP4]]
;
%y.cast = sitofp i32 %y to float
%pow = tail call afn nnan ninf float @_Z3powff(float %x, float %y.cast)
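The tail of the expansion reconstructs the sign with integer bit operations. In outline, using the names from the CHECK lines above (the bitcasts between float and i32 are abbreviated):

  %__ytou     = fptosi float %y.cast to i32   ; recover the integer exponent
  %__yeven    = shl i32 %__ytou, 31           ; parity bit -> sign-bit slot
  %__pow_sign = and i32 %__yeven, <x as i32>  ; keep x's sign only for odd y
  %result     = or i32 %__pow_sign, <exp2 as i32> ; exp2's output is positive,
                                                  ; so or-ing in the sign is exact

In the f16 codegen above, the shift amount is 15 rather than 31, matching half's sign-bit position.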
@@ -2280,8 +2290,18 @@ define float @test_pow_afn_nnan_ninf_f32_known_integral_uitofp(float %x, i32 %y)
; CHECK-LABEL: define float @test_pow_afn_nnan_ninf_f32_known_integral_uitofp
; CHECK-SAME: (float [[X:%.*]], i32 [[Y:%.*]]) {
; CHECK-NEXT: [[Y_CAST:%.*]] = uitofp i32 [[Y]] to float
; CHECK-NEXT: [[POW:%.*]] = tail call nnan ninf afn float @_Z3powff(float [[X]], float [[Y_CAST]])
; CHECK-NEXT: ret float [[POW]]
; CHECK-NEXT: [[__FABS:%.*]] = call nnan ninf afn float @llvm.fabs.f32(float [[X]])
; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn float @_Z4log2f(float [[__FABS]])
; CHECK-NEXT: [[__YLOGX:%.*]] = fmul nnan ninf afn float [[__LOG2]], [[Y_CAST]]
; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn float @_Z4exp2f(float [[__YLOGX]])
; CHECK-NEXT: [[__YTOU:%.*]] = fptosi float [[Y_CAST]] to i32
; CHECK-NEXT: [[__YEVEN:%.*]] = shl i32 [[__YTOU]], 31
; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[X]] to i32
; CHECK-NEXT: [[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP1]]
; CHECK-NEXT: [[TMP2:%.*]] = bitcast float [[__EXP2]] to i32
; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[__POW_SIGN]], [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32 [[TMP3]] to float
; CHECK-NEXT: ret float [[TMP4]]
;
%y.cast = uitofp i32 %y to float
%pow = tail call afn nnan ninf float @_Z3powff(float %x, float %y.cast)
@@ -2318,8 +2338,18 @@ define float @test_pow_afn_nnan_ninf_f32_known_integral_uitofp_i256(float %x, i2
; CHECK-LABEL: define float @test_pow_afn_nnan_ninf_f32_known_integral_uitofp_i256
; CHECK-SAME: (float [[X:%.*]], i256 [[Y:%.*]]) {
; CHECK-NEXT: [[Y_CAST:%.*]] = uitofp i256 [[Y]] to float
; CHECK-NEXT: [[POW:%.*]] = tail call nnan ninf afn float @_Z3powff(float [[X]], float [[Y_CAST]])
; CHECK-NEXT: ret float [[POW]]
; CHECK-NEXT: [[__FABS:%.*]] = call nnan ninf afn float @llvm.fabs.f32(float [[X]])
; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn float @_Z4log2f(float [[__FABS]])
; CHECK-NEXT: [[__YLOGX:%.*]] = fmul nnan ninf afn float [[__LOG2]], [[Y_CAST]]
; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn float @_Z4exp2f(float [[__YLOGX]])
; CHECK-NEXT: [[__YTOU:%.*]] = fptosi float [[Y_CAST]] to i32
; CHECK-NEXT: [[__YEVEN:%.*]] = shl i32 [[__YTOU]], 31
; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[X]] to i32
; CHECK-NEXT: [[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP1]]
; CHECK-NEXT: [[TMP2:%.*]] = bitcast float [[__EXP2]] to i32
; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[__POW_SIGN]], [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32 [[TMP3]] to float
; CHECK-NEXT: ret float [[TMP4]]
;
%y.cast = uitofp i256 %y to float
%pow = tail call afn nnan ninf float @_Z3powff(float %x, float %y.cast)
@@ -2330,8 +2360,18 @@ define float @test_pow_afn_nnan_ninf_f32_known_integral_sitofp_i256(float %x, i2
; CHECK-LABEL: define float @test_pow_afn_nnan_ninf_f32_known_integral_sitofp_i256
; CHECK-SAME: (float [[X:%.*]], i256 [[Y:%.*]]) {
; CHECK-NEXT: [[Y_CAST:%.*]] = sitofp i256 [[Y]] to float
; CHECK-NEXT: [[POW:%.*]] = tail call nnan ninf afn float @_Z3powff(float [[X]], float [[Y_CAST]])
; CHECK-NEXT: ret float [[POW]]
; CHECK-NEXT: [[__FABS:%.*]] = call nnan ninf afn float @llvm.fabs.f32(float [[X]])
; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn float @_Z4log2f(float [[__FABS]])
; CHECK-NEXT: [[__YLOGX:%.*]] = fmul nnan ninf afn float [[__LOG2]], [[Y_CAST]]
; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn float @_Z4exp2f(float [[__YLOGX]])
; CHECK-NEXT: [[__YTOU:%.*]] = fptosi float [[Y_CAST]] to i32
; CHECK-NEXT: [[__YEVEN:%.*]] = shl i32 [[__YTOU]], 31
; CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[X]] to i32
; CHECK-NEXT: [[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP1]]
; CHECK-NEXT: [[TMP2:%.*]] = bitcast float [[__EXP2]] to i32
; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[__POW_SIGN]], [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32 [[TMP3]] to float
; CHECK-NEXT: ret float [[TMP4]]
;
%y.cast = sitofp i256 %y to float
%pow = tail call afn nnan ninf float @_Z3powff(float %x, float %y.cast)
@@ -2342,8 +2382,18 @@ define <2 x float> @test_pow_afn_nnan_ninf_v2f32_known_integral_sitofp(<2 x floa
; CHECK-LABEL: define <2 x float> @test_pow_afn_nnan_ninf_v2f32_known_integral_sitofp
; CHECK-SAME: (<2 x float> [[X:%.*]], <2 x i32> [[Y:%.*]]) {
; CHECK-NEXT: [[Y_CAST:%.*]] = sitofp <2 x i32> [[Y]] to <2 x float>
; CHECK-NEXT: [[POW:%.*]] = tail call nnan ninf afn <2 x float> @_Z3powDv2_fS_(<2 x float> [[X]], <2 x float> [[Y_CAST]])
; CHECK-NEXT: ret <2 x float> [[POW]]
; CHECK-NEXT: [[__FABS:%.*]] = call nnan ninf afn <2 x float> @llvm.fabs.v2f32(<2 x float> [[X]])
; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn <2 x float> @_Z4log2Dv2_f(<2 x float> [[__FABS]])
; CHECK-NEXT: [[__YLOGX:%.*]] = fmul nnan ninf afn <2 x float> [[__LOG2]], [[Y_CAST]]
; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn <2 x float> @_Z4exp2Dv2_f(<2 x float> [[__YLOGX]])
; CHECK-NEXT: [[__YTOU:%.*]] = fptosi <2 x float> [[Y_CAST]] to <2 x i32>
; CHECK-NEXT: [[__YEVEN:%.*]] = shl <2 x i32> [[__YTOU]], <i32 31, i32 31>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[X]] to <2 x i32>
; CHECK-NEXT: [[__POW_SIGN:%.*]] = and <2 x i32> [[__YEVEN]], [[TMP1]]
; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[__EXP2]] to <2 x i32>
; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i32> [[__POW_SIGN]], [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <2 x float>
; CHECK-NEXT: ret <2 x float> [[TMP4]]
;
%y.cast = sitofp <2 x i32> %y to <2 x float>
%pow = tail call afn nnan ninf <2 x float> @_Z3powDv2_fS_(<2 x float> %x, <2 x float> %y.cast)
@@ -2378,8 +2428,18 @@ define <2 x float> @test_pow_afn_nnan_ninf_v2f32_known_integral_uitofp(<2 x floa
; CHECK-LABEL: define <2 x float> @test_pow_afn_nnan_ninf_v2f32_known_integral_uitofp
; CHECK-SAME: (<2 x float> [[X:%.*]], <2 x i32> [[Y:%.*]]) {
; CHECK-NEXT: [[Y_CAST:%.*]] = uitofp <2 x i32> [[Y]] to <2 x float>
; CHECK-NEXT: [[POW:%.*]] = tail call nnan ninf afn <2 x float> @_Z3powDv2_fS_(<2 x float> [[X]], <2 x float> [[Y_CAST]])
; CHECK-NEXT: ret <2 x float> [[POW]]
; CHECK-NEXT: [[__FABS:%.*]] = call nnan ninf afn <2 x float> @llvm.fabs.v2f32(<2 x float> [[X]])
; CHECK-NEXT: [[__LOG2:%.*]] = call nnan ninf afn <2 x float> @_Z4log2Dv2_f(<2 x float> [[__FABS]])
; CHECK-NEXT: [[__YLOGX:%.*]] = fmul nnan ninf afn <2 x float> [[__LOG2]], [[Y_CAST]]
; CHECK-NEXT: [[__EXP2:%.*]] = call nnan ninf afn <2 x float> @_Z4exp2Dv2_f(<2 x float> [[__YLOGX]])
; CHECK-NEXT: [[__YTOU:%.*]] = fptosi <2 x float> [[Y_CAST]] to <2 x i32>
; CHECK-NEXT: [[__YEVEN:%.*]] = shl <2 x i32> [[__YTOU]], <i32 31, i32 31>
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[X]] to <2 x i32>
; CHECK-NEXT: [[__POW_SIGN:%.*]] = and <2 x i32> [[__YEVEN]], [[TMP1]]
; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[__EXP2]] to <2 x i32>
; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i32> [[__POW_SIGN]], [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[TMP3]] to <2 x float>
; CHECK-NEXT: ret <2 x float> [[TMP4]]
;
%y.cast = uitofp <2 x i32> %y to <2 x float>
%pow = tail call afn nnan ninf <2 x float> @_Z3powDv2_fS_(<2 x float> %x, <2 x float> %y.cast)
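For reference, IR checks like these are normally generated against an opt RUN line of roughly this shape; the exact triple and flags in the real test file may differ:

  ; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-simplify-libcall %s | FileCheck %s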