; llvm-project/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll
;
; 1476 lines
; 66 KiB
; LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=amdgpu-simplifylib,instcombine -amdgpu-prelink < %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-prelink | FileCheck %s
; External library entry points exercised by the tests in this file. All are
; declared with hidden visibility and have no body here. The names are
; C++-style mangled: the trailing letters mirror the declared parameter types
; (f = float, d = double, Dh = half, i = i32).

; pow(x, y): floating-point base and exponent of the same type.
declare hidden float @_Z3powff(float, float)
declare hidden double @_Z3powdd(double, double)
declare hidden half @_Z3powDhDh(half, half)
; powr(x, y): same signatures as pow.
declare hidden float @_Z4powrff(float, float)
declare hidden double @_Z4powrdd(double, double)
declare hidden half @_Z4powrDhDh(half, half)
; pown(x, n): floating-point base with an i32 exponent.
declare hidden float @_Z4pownfi(float, i32)
declare hidden double @_Z4powndi(double, i32)
declare hidden half @_Z4pownDhi(half, i32)
; --------------------------------------------------------------------
; test pow
; --------------------------------------------------------------------
; Fast-math pow on half with an arbitrary floating-point exponent.
; The expected output is only an s_getpc_b64 / s_setpc_b64 jump to
; _Z3powDhDh, i.e. the library call is kept and emitted as a tail call.
define half @test_pow_fast_f16(half %x, half %y) {
; CHECK-LABEL: test_pow_fast_f16:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_getpc_b64 s[16:17]
; CHECK-NEXT: s_add_u32 s16, s16, _Z3powDhDh@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s17, s17, _Z3powDhDh@rel32@hi+12
; CHECK-NEXT: s_setpc_b64 s[16:17]
%pow = tail call fast half @_Z3powDhDh(half %x, half %y)
ret half %pow
}
; Fast-math pow on float with an arbitrary floating-point exponent.
; The expected output is only an s_getpc_b64 / s_setpc_b64 jump to
; _Z3powff, i.e. the library call is kept and emitted as a tail call.
define float @test_pow_fast_f32(float %x, float %y) {
; CHECK-LABEL: test_pow_fast_f32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_getpc_b64 s[16:17]
; CHECK-NEXT: s_add_u32 s16, s16, _Z3powff@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s17, s17, _Z3powff@rel32@hi+12
; CHECK-NEXT: s_setpc_b64 s[16:17]
%pow = tail call fast float @_Z3powff(float %x, float %y)
ret float %pow
}
; Fast-math pow on double with an arbitrary floating-point exponent.
; The expected output is only an s_getpc_b64 / s_setpc_b64 jump to
; _Z3powdd, i.e. the library call is kept and emitted as a tail call.
define double @test_pow_fast_f64(double %x, double %y) {
; CHECK-LABEL: test_pow_fast_f64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_getpc_b64 s[16:17]
; CHECK-NEXT: s_add_u32 s16, s16, _Z3powdd@rel32@lo+4
; CHECK-NEXT: s_addc_u32 s17, s17, _Z3powdd@rel32@hi+12
; CHECK-NEXT: s_setpc_b64 s[16:17]
%pow = tail call fast double @_Z3powdd(double %x, double %y)
ret double %pow
}
; Fast-math pow on half where the exponent is produced by converting an i32
; (sitofp) instead of being an arbitrary half value.
; The expected output no longer references _Z3powDhDh. It:
;   1. masks off the sign bit of x (v_and_b32 with 0x7fff) and calls _Z4log2Dh,
;   2. multiplies the result by y (v_mul_f16) and calls _Z4exp2Dh,
;   3. converts y back to an integer, shifts its low bit into bit 15, ANDs it
;      with the original x and ORs that into the result, so the sign bit of x
;      is carried over only when y is odd.
; Both calls are made through GOT loads (@gotpcrel32) with s_swappc_b64; the
; rest of the sequence is the save/restore of registers around those calls.
define half @test_pow_fast_f16__integral_y(half %x, i32 %y.i) {
; CHECK-LABEL: test_pow_fast_f16__integral_y:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 s16, s33
; CHECK-NEXT: s_mov_b32 s33, s32
; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[18:19]
; CHECK-NEXT: v_writelane_b32 v40, s16, 14
; CHECK-NEXT: v_writelane_b32 v40, s30, 0
; CHECK-NEXT: v_writelane_b32 v40, s31, 1
; CHECK-NEXT: v_writelane_b32 v40, s34, 2
; CHECK-NEXT: v_writelane_b32 v40, s35, 3
; CHECK-NEXT: v_writelane_b32 v40, s36, 4
; CHECK-NEXT: v_writelane_b32 v40, s37, 5
; CHECK-NEXT: v_writelane_b32 v40, s38, 6
; CHECK-NEXT: v_writelane_b32 v40, s39, 7
; CHECK-NEXT: s_addk_i32 s32, 0x800
; CHECK-NEXT: v_writelane_b32 v40, s40, 8
; CHECK-NEXT: v_writelane_b32 v40, s41, 9
; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2Dh@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2Dh@gotpcrel32@hi+12
; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT: v_mov_b32_e32 v42, v0
; CHECK-NEXT: v_cvt_f32_i32_e32 v0, v1
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT: v_writelane_b32 v40, s42, 10
; CHECK-NEXT: v_writelane_b32 v40, s43, 11
; CHECK-NEXT: v_writelane_b32 v40, s44, 12
; CHECK-NEXT: v_cvt_f16_f32_e32 v43, v0
; CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v42
; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT: v_writelane_b32 v40, s45, 13
; CHECK-NEXT: v_mov_b32_e32 v41, v31
; CHECK-NEXT: s_mov_b32 s42, s15
; CHECK-NEXT: s_mov_b32 s43, s14
; CHECK-NEXT: s_mov_b32 s44, s13
; CHECK-NEXT: s_mov_b32 s45, s12
; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11]
; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9]
; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2Dh@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2Dh@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT: v_mul_f16_e32 v0, v0, v43
; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37]
; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
; CHECK-NEXT: s_mov_b32 s12, s45
; CHECK-NEXT: s_mov_b32 s13, s44
; CHECK-NEXT: s_mov_b32 s14, s43
; CHECK-NEXT: s_mov_b32 s15, s42
; CHECK-NEXT: v_mov_b32_e32 v31, v41
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_cvt_i16_f16_e32 v1, v43
; CHECK-NEXT: v_lshlrev_b16_e32 v1, 15, v1
; CHECK-NEXT: v_and_b32_e32 v1, v1, v42
; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; CHECK-NEXT: v_or_b32_e32 v0, v1, v0
; CHECK-NEXT: v_readlane_b32 s45, v40, 13
; CHECK-NEXT: v_readlane_b32 s44, v40, 12
; CHECK-NEXT: v_readlane_b32 s43, v40, 11
; CHECK-NEXT: v_readlane_b32 s42, v40, 10
; CHECK-NEXT: v_readlane_b32 s41, v40, 9
; CHECK-NEXT: v_readlane_b32 s40, v40, 8
; CHECK-NEXT: v_readlane_b32 s39, v40, 7
; CHECK-NEXT: v_readlane_b32 s38, v40, 6
; CHECK-NEXT: v_readlane_b32 s37, v40, 5
; CHECK-NEXT: v_readlane_b32 s36, v40, 4
; CHECK-NEXT: v_readlane_b32 s35, v40, 3
; CHECK-NEXT: v_readlane_b32 s34, v40, 2
; CHECK-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-NEXT: v_readlane_b32 s30, v40, 0
; CHECK-NEXT: v_readlane_b32 s4, v40, 14
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[6:7]
; CHECK-NEXT: s_addk_i32 s32, 0xf800
; CHECK-NEXT: s_mov_b32 s33, s4
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
; The exponent comes from an integer conversion, not from a half argument.
%y = sitofp i32 %y.i to half
%pow = tail call fast half @_Z3powDhDh(half %x, half %y)
ret half %pow
}
; Fast-math pow on float where the exponent is produced by converting an i32
; (sitofp) instead of being an arbitrary float value.
; The expected output no longer references _Z3powff. It:
;   1. masks off the sign bit of x (v_and_b32 with 0x7fffffff) and calls
;      _Z4log2f,
;   2. multiplies the result by y (v_mul_f32) and calls _Z4exp2f,
;   3. converts y back to i32, shifts its low bit into bit 31 and merges
;      (that & x) | result with v_and_or_b32, so the sign bit of x is carried
;      over only when y is odd.
; Both calls are made through GOT loads (@gotpcrel32) with s_swappc_b64; the
; rest of the sequence is the save/restore of registers around those calls.
define float @test_pow_fast_f32__integral_y(float %x, i32 %y.i) {
; CHECK-LABEL: test_pow_fast_f32__integral_y:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 s16, s33
; CHECK-NEXT: s_mov_b32 s33, s32
; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[18:19]
; CHECK-NEXT: v_writelane_b32 v40, s16, 14
; CHECK-NEXT: v_writelane_b32 v40, s30, 0
; CHECK-NEXT: v_writelane_b32 v40, s31, 1
; CHECK-NEXT: v_writelane_b32 v40, s34, 2
; CHECK-NEXT: v_writelane_b32 v40, s35, 3
; CHECK-NEXT: v_writelane_b32 v40, s36, 4
; CHECK-NEXT: v_writelane_b32 v40, s37, 5
; CHECK-NEXT: v_writelane_b32 v40, s38, 6
; CHECK-NEXT: v_writelane_b32 v40, s39, 7
; CHECK-NEXT: s_addk_i32 s32, 0x800
; CHECK-NEXT: v_writelane_b32 v40, s40, 8
; CHECK-NEXT: v_writelane_b32 v40, s41, 9
; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2f@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2f@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT: v_writelane_b32 v40, s42, 10
; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT: v_writelane_b32 v40, s43, 11
; CHECK-NEXT: v_mov_b32_e32 v42, v0
; CHECK-NEXT: v_writelane_b32 v40, s44, 12
; CHECK-NEXT: v_and_b32_e32 v0, 0x7fffffff, v42
; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT: v_writelane_b32 v40, s45, 13
; CHECK-NEXT: v_mov_b32_e32 v41, v31
; CHECK-NEXT: s_mov_b32 s42, s15
; CHECK-NEXT: s_mov_b32 s43, s14
; CHECK-NEXT: s_mov_b32 s44, s13
; CHECK-NEXT: s_mov_b32 s45, s12
; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11]
; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9]
; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
; CHECK-NEXT: v_cvt_f32_i32_e32 v43, v1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2f@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2f@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT: v_mul_f32_e32 v0, v0, v43
; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37]
; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
; CHECK-NEXT: s_mov_b32 s12, s45
; CHECK-NEXT: s_mov_b32 s13, s44
; CHECK-NEXT: s_mov_b32 s14, s43
; CHECK-NEXT: s_mov_b32 s15, s42
; CHECK-NEXT: v_mov_b32_e32 v31, v41
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_cvt_i32_f32_e32 v1, v43
; CHECK-NEXT: v_readlane_b32 s45, v40, 13
; CHECK-NEXT: v_readlane_b32 s44, v40, 12
; CHECK-NEXT: v_readlane_b32 s43, v40, 11
; CHECK-NEXT: v_lshlrev_b32_e32 v1, 31, v1
; CHECK-NEXT: v_and_or_b32 v0, v1, v42, v0
; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; CHECK-NEXT: v_readlane_b32 s42, v40, 10
; CHECK-NEXT: v_readlane_b32 s41, v40, 9
; CHECK-NEXT: v_readlane_b32 s40, v40, 8
; CHECK-NEXT: v_readlane_b32 s39, v40, 7
; CHECK-NEXT: v_readlane_b32 s38, v40, 6
; CHECK-NEXT: v_readlane_b32 s37, v40, 5
; CHECK-NEXT: v_readlane_b32 s36, v40, 4
; CHECK-NEXT: v_readlane_b32 s35, v40, 3
; CHECK-NEXT: v_readlane_b32 s34, v40, 2
; CHECK-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-NEXT: v_readlane_b32 s30, v40, 0
; CHECK-NEXT: v_readlane_b32 s4, v40, 14
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[6:7]
; CHECK-NEXT: s_addk_i32 s32, 0xf800
; CHECK-NEXT: s_mov_b32 s33, s4
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
; The exponent comes from an integer conversion, not from a float argument.
%y = sitofp i32 %y.i to float
%pow = tail call fast float @_Z3powff(float %x, float %y)
ret float %pow
}
; Fast-math pow on double where the exponent is produced by converting an i32
; (sitofp) instead of being an arbitrary double value.
; The expected output no longer references _Z3powdd. It:
;   1. masks off the sign bit in the high dword of x (v_and_b32 with
;      0x7fffffff) and calls _Z4log2d,
;   2. multiplies the result by y (v_mul_f64) and calls _Z4exp2d,
;   3. shifts the low bit of the original i32 exponent into bit 31, ANDs it
;      with the high dword of x and ORs that into the high dword of the
;      result, so the sign bit of x is carried over only when y is odd.
; Both calls are made through GOT loads (@gotpcrel32) with s_swappc_b64; the
; rest of the sequence is the save/restore of registers around those calls.
define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) {
; CHECK-LABEL: test_pow_fast_f64__integral_y:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 s16, s33
; CHECK-NEXT: s_mov_b32 s33, s32
; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[18:19]
; CHECK-NEXT: v_writelane_b32 v40, s16, 14
; CHECK-NEXT: v_writelane_b32 v40, s30, 0
; CHECK-NEXT: v_writelane_b32 v40, s31, 1
; CHECK-NEXT: v_writelane_b32 v40, s34, 2
; CHECK-NEXT: v_writelane_b32 v40, s35, 3
; CHECK-NEXT: v_writelane_b32 v40, s36, 4
; CHECK-NEXT: v_writelane_b32 v40, s37, 5
; CHECK-NEXT: v_writelane_b32 v40, s38, 6
; CHECK-NEXT: v_writelane_b32 v40, s39, 7
; CHECK-NEXT: s_addk_i32 s32, 0x800
; CHECK-NEXT: v_writelane_b32 v40, s40, 8
; CHECK-NEXT: v_writelane_b32 v40, s41, 9
; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT: v_writelane_b32 v40, s42, 10
; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT: v_writelane_b32 v40, s43, 11
; CHECK-NEXT: v_mov_b32_e32 v43, v1
; CHECK-NEXT: v_writelane_b32 v40, s44, 12
; CHECK-NEXT: v_mov_b32_e32 v42, v2
; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v43
; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT: v_writelane_b32 v40, s45, 13
; CHECK-NEXT: v_mov_b32_e32 v41, v31
; CHECK-NEXT: s_mov_b32 s42, s15
; CHECK-NEXT: s_mov_b32 s43, s14
; CHECK-NEXT: s_mov_b32 s44, s13
; CHECK-NEXT: s_mov_b32 s45, s12
; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11]
; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9]
; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
; CHECK-NEXT: v_cvt_f64_i32_e32 v[44:45], v42
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_mul_f64 v[0:1], v[0:1], v[44:45]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37]
; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
; CHECK-NEXT: s_mov_b32 s12, s45
; CHECK-NEXT: s_mov_b32 s13, s44
; CHECK-NEXT: s_mov_b32 s14, s43
; CHECK-NEXT: s_mov_b32 s15, s42
; CHECK-NEXT: v_mov_b32_e32 v31, v41
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_lshlrev_b32_e32 v2, 31, v42
; CHECK-NEXT: v_and_b32_e32 v2, v2, v43
; CHECK-NEXT: buffer_load_dword v45, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
; CHECK-NEXT: v_or_b32_e32 v1, v2, v1
; CHECK-NEXT: v_readlane_b32 s45, v40, 13
; CHECK-NEXT: v_readlane_b32 s44, v40, 12
; CHECK-NEXT: v_readlane_b32 s43, v40, 11
; CHECK-NEXT: v_readlane_b32 s42, v40, 10
; CHECK-NEXT: v_readlane_b32 s41, v40, 9
; CHECK-NEXT: v_readlane_b32 s40, v40, 8
; CHECK-NEXT: v_readlane_b32 s39, v40, 7
; CHECK-NEXT: v_readlane_b32 s38, v40, 6
; CHECK-NEXT: v_readlane_b32 s37, v40, 5
; CHECK-NEXT: v_readlane_b32 s36, v40, 4
; CHECK-NEXT: v_readlane_b32 s35, v40, 3
; CHECK-NEXT: v_readlane_b32 s34, v40, 2
; CHECK-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-NEXT: v_readlane_b32 s30, v40, 0
; CHECK-NEXT: v_readlane_b32 s4, v40, 14
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[6:7]
; CHECK-NEXT: s_addk_i32 s32, 0xf800
; CHECK-NEXT: s_mov_b32 s33, s4
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
; The exponent comes from an integer conversion, not from a double argument.
%y = sitofp i32 %y.i to double
%pow = tail call fast double @_Z3powdd(double %x, double %y)
ret double %pow
}
; --------------------------------------------------------------------
; test powr
; --------------------------------------------------------------------
; Fast-math powr on half.
; The expected output no longer references _Z4powrDhDh: it calls _Z4log2Dh on
; x unchanged, multiplies the result by y (v_mul_f16) and calls _Z4exp2Dh.
; Unlike the integral-exponent pow tests, there is no sign-bit masking before
; the log2 call and no sign fix-up after the exp2 call.
; Both calls are made through GOT loads (@gotpcrel32) with s_swappc_b64; the
; rest of the sequence is the save/restore of registers around those calls.
define half @test_powr_fast_f16(half %x, half %y) {
; CHECK-LABEL: test_powr_fast_f16:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 s16, s33
; CHECK-NEXT: s_mov_b32 s33, s32
; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[18:19]
; CHECK-NEXT: v_writelane_b32 v40, s16, 14
; CHECK-NEXT: v_writelane_b32 v40, s30, 0
; CHECK-NEXT: v_writelane_b32 v40, s31, 1
; CHECK-NEXT: v_writelane_b32 v40, s34, 2
; CHECK-NEXT: v_writelane_b32 v40, s35, 3
; CHECK-NEXT: v_writelane_b32 v40, s36, 4
; CHECK-NEXT: v_writelane_b32 v40, s37, 5
; CHECK-NEXT: v_writelane_b32 v40, s38, 6
; CHECK-NEXT: v_writelane_b32 v40, s39, 7
; CHECK-NEXT: s_addk_i32 s32, 0x400
; CHECK-NEXT: v_writelane_b32 v40, s40, 8
; CHECK-NEXT: v_writelane_b32 v40, s41, 9
; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2Dh@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2Dh@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT: v_writelane_b32 v40, s42, 10
; CHECK-NEXT: v_writelane_b32 v40, s43, 11
; CHECK-NEXT: v_writelane_b32 v40, s44, 12
; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT: v_writelane_b32 v40, s45, 13
; CHECK-NEXT: v_mov_b32_e32 v41, v31
; CHECK-NEXT: s_mov_b32 s42, s15
; CHECK-NEXT: s_mov_b32 s43, s14
; CHECK-NEXT: s_mov_b32 s44, s13
; CHECK-NEXT: s_mov_b32 s45, s12
; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11]
; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9]
; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
; CHECK-NEXT: v_mov_b32_e32 v42, v1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2Dh@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2Dh@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT: v_mul_f16_e32 v0, v0, v42
; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37]
; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
; CHECK-NEXT: s_mov_b32 s12, s45
; CHECK-NEXT: s_mov_b32 s13, s44
; CHECK-NEXT: s_mov_b32 s14, s43
; CHECK-NEXT: s_mov_b32 s15, s42
; CHECK-NEXT: v_mov_b32_e32 v31, v41
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: v_readlane_b32 s45, v40, 13
; CHECK-NEXT: v_readlane_b32 s44, v40, 12
; CHECK-NEXT: v_readlane_b32 s43, v40, 11
; CHECK-NEXT: v_readlane_b32 s42, v40, 10
; CHECK-NEXT: v_readlane_b32 s41, v40, 9
; CHECK-NEXT: v_readlane_b32 s40, v40, 8
; CHECK-NEXT: v_readlane_b32 s39, v40, 7
; CHECK-NEXT: v_readlane_b32 s38, v40, 6
; CHECK-NEXT: v_readlane_b32 s37, v40, 5
; CHECK-NEXT: v_readlane_b32 s36, v40, 4
; CHECK-NEXT: v_readlane_b32 s35, v40, 3
; CHECK-NEXT: v_readlane_b32 s34, v40, 2
; CHECK-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-NEXT: v_readlane_b32 s30, v40, 0
; CHECK-NEXT: v_readlane_b32 s4, v40, 14
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[6:7]
; CHECK-NEXT: s_addk_i32 s32, 0xfc00
; CHECK-NEXT: s_mov_b32 s33, s4
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
%powr = tail call fast half @_Z4powrDhDh(half %x, half %y)
ret half %powr
}
; Fast-math powr on float.
; The expected output no longer references _Z4powrff: it calls _Z4log2f on x
; unchanged, multiplies the result by y (v_mul_f32) and calls _Z4exp2f.
; Unlike the integral-exponent pow tests, there is no sign-bit masking before
; the log2 call and no sign fix-up after the exp2 call.
; Both calls are made through GOT loads (@gotpcrel32) with s_swappc_b64; the
; rest of the sequence is the save/restore of registers around those calls.
define float @test_powr_fast_f32(float %x, float %y) {
; CHECK-LABEL: test_powr_fast_f32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 s16, s33
; CHECK-NEXT: s_mov_b32 s33, s32
; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[18:19]
; CHECK-NEXT: v_writelane_b32 v40, s16, 14
; CHECK-NEXT: v_writelane_b32 v40, s30, 0
; CHECK-NEXT: v_writelane_b32 v40, s31, 1
; CHECK-NEXT: v_writelane_b32 v40, s34, 2
; CHECK-NEXT: v_writelane_b32 v40, s35, 3
; CHECK-NEXT: v_writelane_b32 v40, s36, 4
; CHECK-NEXT: v_writelane_b32 v40, s37, 5
; CHECK-NEXT: v_writelane_b32 v40, s38, 6
; CHECK-NEXT: v_writelane_b32 v40, s39, 7
; CHECK-NEXT: s_addk_i32 s32, 0x400
; CHECK-NEXT: v_writelane_b32 v40, s40, 8
; CHECK-NEXT: v_writelane_b32 v40, s41, 9
; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2f@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2f@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT: v_writelane_b32 v40, s42, 10
; CHECK-NEXT: v_writelane_b32 v40, s43, 11
; CHECK-NEXT: v_writelane_b32 v40, s44, 12
; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT: v_writelane_b32 v40, s45, 13
; CHECK-NEXT: v_mov_b32_e32 v41, v31
; CHECK-NEXT: s_mov_b32 s42, s15
; CHECK-NEXT: s_mov_b32 s43, s14
; CHECK-NEXT: s_mov_b32 s44, s13
; CHECK-NEXT: s_mov_b32 s45, s12
; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11]
; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9]
; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
; CHECK-NEXT: v_mov_b32_e32 v42, v1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2f@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2f@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT: v_mul_f32_e32 v0, v0, v42
; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37]
; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
; CHECK-NEXT: s_mov_b32 s12, s45
; CHECK-NEXT: s_mov_b32 s13, s44
; CHECK-NEXT: s_mov_b32 s14, s43
; CHECK-NEXT: s_mov_b32 s15, s42
; CHECK-NEXT: v_mov_b32_e32 v31, v41
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: v_readlane_b32 s45, v40, 13
; CHECK-NEXT: v_readlane_b32 s44, v40, 12
; CHECK-NEXT: v_readlane_b32 s43, v40, 11
; CHECK-NEXT: v_readlane_b32 s42, v40, 10
; CHECK-NEXT: v_readlane_b32 s41, v40, 9
; CHECK-NEXT: v_readlane_b32 s40, v40, 8
; CHECK-NEXT: v_readlane_b32 s39, v40, 7
; CHECK-NEXT: v_readlane_b32 s38, v40, 6
; CHECK-NEXT: v_readlane_b32 s37, v40, 5
; CHECK-NEXT: v_readlane_b32 s36, v40, 4
; CHECK-NEXT: v_readlane_b32 s35, v40, 3
; CHECK-NEXT: v_readlane_b32 s34, v40, 2
; CHECK-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-NEXT: v_readlane_b32 s30, v40, 0
; CHECK-NEXT: v_readlane_b32 s4, v40, 14
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[6:7]
; CHECK-NEXT: s_addk_i32 s32, 0xfc00
; CHECK-NEXT: s_mov_b32 s33, s4
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
%powr = tail call fast float @_Z4powrff(float %x, float %y)
ret float %powr
}
; Fast-math powr on double.
; The expected output no longer references _Z4powrdd: it calls _Z4log2d on x
; unchanged, multiplies the result by y (v_mul_f64) and calls _Z4exp2d.
; Unlike the integral-exponent pow tests, there is no sign-bit masking before
; the log2 call and no sign fix-up after the exp2 call.
; Both calls are made through GOT loads (@gotpcrel32) with s_swappc_b64; the
; rest of the sequence is the save/restore of registers around those calls.
define double @test_powr_fast_f64(double %x, double %y) {
; CHECK-LABEL: test_powr_fast_f64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 s16, s33
; CHECK-NEXT: s_mov_b32 s33, s32
; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[18:19]
; CHECK-NEXT: v_writelane_b32 v40, s16, 14
; CHECK-NEXT: v_writelane_b32 v40, s30, 0
; CHECK-NEXT: v_writelane_b32 v40, s31, 1
; CHECK-NEXT: v_writelane_b32 v40, s34, 2
; CHECK-NEXT: v_writelane_b32 v40, s35, 3
; CHECK-NEXT: v_writelane_b32 v40, s36, 4
; CHECK-NEXT: v_writelane_b32 v40, s37, 5
; CHECK-NEXT: v_writelane_b32 v40, s38, 6
; CHECK-NEXT: v_writelane_b32 v40, s39, 7
; CHECK-NEXT: s_addk_i32 s32, 0x800
; CHECK-NEXT: v_writelane_b32 v40, s40, 8
; CHECK-NEXT: v_writelane_b32 v40, s41, 9
; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT: v_writelane_b32 v40, s42, 10
; CHECK-NEXT: v_writelane_b32 v40, s43, 11
; CHECK-NEXT: v_writelane_b32 v40, s44, 12
; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT: v_writelane_b32 v40, s45, 13
; CHECK-NEXT: v_mov_b32_e32 v43, v31
; CHECK-NEXT: s_mov_b32 s42, s15
; CHECK-NEXT: s_mov_b32 s43, s14
; CHECK-NEXT: s_mov_b32 s44, s13
; CHECK-NEXT: s_mov_b32 s45, s12
; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11]
; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9]
; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
; CHECK-NEXT: v_mov_b32_e32 v42, v3
; CHECK-NEXT: v_mov_b32_e32 v41, v2
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_mul_f64 v[0:1], v[0:1], v[41:42]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37]
; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
; CHECK-NEXT: s_mov_b32 s12, s45
; CHECK-NEXT: s_mov_b32 s13, s44
; CHECK-NEXT: s_mov_b32 s14, s43
; CHECK-NEXT: s_mov_b32 s15, s42
; CHECK-NEXT: v_mov_b32_e32 v31, v43
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; CHECK-NEXT: v_readlane_b32 s45, v40, 13
; CHECK-NEXT: v_readlane_b32 s44, v40, 12
; CHECK-NEXT: v_readlane_b32 s43, v40, 11
; CHECK-NEXT: v_readlane_b32 s42, v40, 10
; CHECK-NEXT: v_readlane_b32 s41, v40, 9
; CHECK-NEXT: v_readlane_b32 s40, v40, 8
; CHECK-NEXT: v_readlane_b32 s39, v40, 7
; CHECK-NEXT: v_readlane_b32 s38, v40, 6
; CHECK-NEXT: v_readlane_b32 s37, v40, 5
; CHECK-NEXT: v_readlane_b32 s36, v40, 4
; CHECK-NEXT: v_readlane_b32 s35, v40, 3
; CHECK-NEXT: v_readlane_b32 s34, v40, 2
; CHECK-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-NEXT: v_readlane_b32 s30, v40, 0
; CHECK-NEXT: v_readlane_b32 s4, v40, 14
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[6:7]
; CHECK-NEXT: s_addk_i32 s32, 0xf800
; CHECK-NEXT: s_mov_b32 s33, s4
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
%powr = tail call fast double @_Z4powrdd(double %x, double %y)
ret double %powr
}
; --------------------------------------------------------------------
; test pown
; --------------------------------------------------------------------
; Fast-math pown on half with an i32 exponent.
; The expected output no longer references _Z4pownDhi. It:
;   1. masks off the sign bit of x (v_and_b32 with 0x7fff) and calls _Z4log2Dh,
;   2. converts the i32 exponent to half (via f32), multiplies the log2 result
;      by it (v_mul_f16) and calls _Z4exp2Dh,
;   3. shifts the low bit of the original i32 exponent into bit 15, ANDs it
;      with the original x and ORs that into the result, so the sign bit of x
;      is carried over only when the exponent is odd.
; Both calls are made through GOT loads (@gotpcrel32) with s_swappc_b64; the
; rest of the sequence is the save/restore of registers around those calls.
define half @test_pown_fast_f16(half %x, i32 %y) {
; CHECK-LABEL: test_pown_fast_f16:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 s16, s33
; CHECK-NEXT: s_mov_b32 s33, s32
; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[18:19]
; CHECK-NEXT: v_writelane_b32 v40, s16, 14
; CHECK-NEXT: v_writelane_b32 v40, s30, 0
; CHECK-NEXT: v_writelane_b32 v40, s31, 1
; CHECK-NEXT: v_writelane_b32 v40, s34, 2
; CHECK-NEXT: v_writelane_b32 v40, s35, 3
; CHECK-NEXT: v_writelane_b32 v40, s36, 4
; CHECK-NEXT: v_writelane_b32 v40, s37, 5
; CHECK-NEXT: v_writelane_b32 v40, s38, 6
; CHECK-NEXT: v_writelane_b32 v40, s39, 7
; CHECK-NEXT: s_addk_i32 s32, 0x800
; CHECK-NEXT: v_writelane_b32 v40, s40, 8
; CHECK-NEXT: v_writelane_b32 v40, s41, 9
; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2Dh@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2Dh@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT: v_writelane_b32 v40, s42, 10
; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT: v_writelane_b32 v40, s43, 11
; CHECK-NEXT: v_mov_b32_e32 v43, v0
; CHECK-NEXT: v_writelane_b32 v40, s44, 12
; CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v43
; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT: v_writelane_b32 v40, s45, 13
; CHECK-NEXT: v_mov_b32_e32 v41, v31
; CHECK-NEXT: s_mov_b32 s42, s15
; CHECK-NEXT: s_mov_b32 s43, s14
; CHECK-NEXT: s_mov_b32 s44, s13
; CHECK-NEXT: s_mov_b32 s45, s12
; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11]
; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9]
; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
; CHECK-NEXT: v_mov_b32_e32 v42, v1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_cvt_f32_i32_e32 v1, v42
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2Dh@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2Dh@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT: v_cvt_f16_f32_e32 v1, v1
; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37]
; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
; CHECK-NEXT: v_mul_f16_e32 v0, v0, v1
; CHECK-NEXT: s_mov_b32 s12, s45
; CHECK-NEXT: s_mov_b32 s13, s44
; CHECK-NEXT: s_mov_b32 s14, s43
; CHECK-NEXT: s_mov_b32 s15, s42
; CHECK-NEXT: v_mov_b32_e32 v31, v41
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_lshlrev_b16_e32 v1, 15, v42
; CHECK-NEXT: v_and_b32_e32 v1, v1, v43
; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; CHECK-NEXT: v_or_b32_e32 v0, v1, v0
; CHECK-NEXT: v_readlane_b32 s45, v40, 13
; CHECK-NEXT: v_readlane_b32 s44, v40, 12
; CHECK-NEXT: v_readlane_b32 s43, v40, 11
; CHECK-NEXT: v_readlane_b32 s42, v40, 10
; CHECK-NEXT: v_readlane_b32 s41, v40, 9
; CHECK-NEXT: v_readlane_b32 s40, v40, 8
; CHECK-NEXT: v_readlane_b32 s39, v40, 7
; CHECK-NEXT: v_readlane_b32 s38, v40, 6
; CHECK-NEXT: v_readlane_b32 s37, v40, 5
; CHECK-NEXT: v_readlane_b32 s36, v40, 4
; CHECK-NEXT: v_readlane_b32 s35, v40, 3
; CHECK-NEXT: v_readlane_b32 s34, v40, 2
; CHECK-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-NEXT: v_readlane_b32 s30, v40, 0
; CHECK-NEXT: v_readlane_b32 s4, v40, 14
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[6:7]
; CHECK-NEXT: s_addk_i32 s32, 0xf800
; CHECK-NEXT: s_mov_b32 s33, s4
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
%call = tail call fast half @_Z4pownDhi(half %x, i32 %y)
ret half %call
}
; pown(float, i32) called with the 'fast' flag and an exponent of unknown
; parity. The expected assembly below contains no call to _Z4pownfi. Instead
; it calls _Z4log2f on x with bit 31 cleared (v_and_b32 0x7fffffff),
; multiplies the result by y converted to float, calls _Z4exp2f on the
; product, and finally ORs in (y << 31) & bits(x), so the result picks up
; x's sign bit only when the low bit of y is set.
define float @test_pown_fast_f32(float %x, i32 %y) {
; CHECK-LABEL: test_pown_fast_f32:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 s16, s33
; CHECK-NEXT: s_mov_b32 s33, s32
; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[18:19]
; CHECK-NEXT: v_writelane_b32 v40, s16, 14
; CHECK-NEXT: v_writelane_b32 v40, s30, 0
; CHECK-NEXT: v_writelane_b32 v40, s31, 1
; CHECK-NEXT: v_writelane_b32 v40, s34, 2
; CHECK-NEXT: v_writelane_b32 v40, s35, 3
; CHECK-NEXT: v_writelane_b32 v40, s36, 4
; CHECK-NEXT: v_writelane_b32 v40, s37, 5
; CHECK-NEXT: v_writelane_b32 v40, s38, 6
; CHECK-NEXT: v_writelane_b32 v40, s39, 7
; CHECK-NEXT: s_addk_i32 s32, 0x800
; CHECK-NEXT: v_writelane_b32 v40, s40, 8
; CHECK-NEXT: v_writelane_b32 v40, s41, 9
; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2f@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2f@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT: v_writelane_b32 v40, s42, 10
; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT: v_writelane_b32 v40, s43, 11
; CHECK-NEXT: v_mov_b32_e32 v43, v0
; CHECK-NEXT: v_writelane_b32 v40, s44, 12
; CHECK-NEXT: v_and_b32_e32 v0, 0x7fffffff, v43
; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT: v_writelane_b32 v40, s45, 13
; CHECK-NEXT: v_mov_b32_e32 v41, v31
; CHECK-NEXT: s_mov_b32 s42, s15
; CHECK-NEXT: s_mov_b32 s43, s14
; CHECK-NEXT: s_mov_b32 s44, s13
; CHECK-NEXT: s_mov_b32 s45, s12
; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11]
; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9]
; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
; CHECK-NEXT: v_mov_b32_e32 v42, v1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2f@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2f@gotpcrel32@hi+12
; CHECK-NEXT: v_cvt_f32_i32_e32 v1, v42
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT: v_mul_f32_e32 v0, v0, v1
; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37]
; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
; CHECK-NEXT: s_mov_b32 s12, s45
; CHECK-NEXT: s_mov_b32 s13, s44
; CHECK-NEXT: s_mov_b32 s14, s43
; CHECK-NEXT: s_mov_b32 s15, s42
; CHECK-NEXT: v_mov_b32_e32 v31, v41
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_lshlrev_b32_e32 v1, 31, v42
; CHECK-NEXT: v_and_or_b32 v0, v1, v43, v0
; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; CHECK-NEXT: v_readlane_b32 s45, v40, 13
; CHECK-NEXT: v_readlane_b32 s44, v40, 12
; CHECK-NEXT: v_readlane_b32 s43, v40, 11
; CHECK-NEXT: v_readlane_b32 s42, v40, 10
; CHECK-NEXT: v_readlane_b32 s41, v40, 9
; CHECK-NEXT: v_readlane_b32 s40, v40, 8
; CHECK-NEXT: v_readlane_b32 s39, v40, 7
; CHECK-NEXT: v_readlane_b32 s38, v40, 6
; CHECK-NEXT: v_readlane_b32 s37, v40, 5
; CHECK-NEXT: v_readlane_b32 s36, v40, 4
; CHECK-NEXT: v_readlane_b32 s35, v40, 3
; CHECK-NEXT: v_readlane_b32 s34, v40, 2
; CHECK-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-NEXT: v_readlane_b32 s30, v40, 0
; CHECK-NEXT: v_readlane_b32 s4, v40, 14
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[6:7]
; CHECK-NEXT: s_addk_i32 s32, 0xf800
; CHECK-NEXT: s_mov_b32 s33, s4
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
%call = tail call fast float @_Z4pownfi(float %x, i32 %y)
ret float %call
}
; pown(double, i32) called with the 'fast' flag and an exponent of unknown
; parity. The expected assembly below contains no call to _Z4powndi. Instead
; it calls _Z4log2d on x with bit 31 of v1 (the upper register of the v[0:1]
; pair) cleared, multiplies the result by y converted to double
; (v_cvt_f64_i32 + v_mul_f64), calls _Z4exp2d on the product, and finally
; ORs (y << 31) & original v1 into v1, so the result picks up x's sign bit
; only when the low bit of y is set.
define double @test_pown_fast_f64(double %x, i32 %y) {
; CHECK-LABEL: test_pown_fast_f64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 s16, s33
; CHECK-NEXT: s_mov_b32 s33, s32
; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[18:19]
; CHECK-NEXT: v_writelane_b32 v40, s16, 14
; CHECK-NEXT: v_writelane_b32 v40, s30, 0
; CHECK-NEXT: v_writelane_b32 v40, s31, 1
; CHECK-NEXT: v_writelane_b32 v40, s34, 2
; CHECK-NEXT: v_writelane_b32 v40, s35, 3
; CHECK-NEXT: v_writelane_b32 v40, s36, 4
; CHECK-NEXT: v_writelane_b32 v40, s37, 5
; CHECK-NEXT: v_writelane_b32 v40, s38, 6
; CHECK-NEXT: v_writelane_b32 v40, s39, 7
; CHECK-NEXT: s_addk_i32 s32, 0x800
; CHECK-NEXT: v_writelane_b32 v40, s40, 8
; CHECK-NEXT: v_writelane_b32 v40, s41, 9
; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT: v_writelane_b32 v40, s42, 10
; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT: v_writelane_b32 v40, s43, 11
; CHECK-NEXT: v_mov_b32_e32 v43, v1
; CHECK-NEXT: v_writelane_b32 v40, s44, 12
; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v43
; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT: v_writelane_b32 v40, s45, 13
; CHECK-NEXT: v_mov_b32_e32 v41, v31
; CHECK-NEXT: s_mov_b32 s42, s15
; CHECK-NEXT: s_mov_b32 s43, s14
; CHECK-NEXT: s_mov_b32 s44, s13
; CHECK-NEXT: s_mov_b32 s45, s12
; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11]
; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9]
; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
; CHECK-NEXT: v_mov_b32_e32 v42, v2
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_cvt_f64_i32_e32 v[2:3], v42
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3]
; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37]
; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
; CHECK-NEXT: s_mov_b32 s12, s45
; CHECK-NEXT: s_mov_b32 s13, s44
; CHECK-NEXT: s_mov_b32 s14, s43
; CHECK-NEXT: s_mov_b32 s15, s42
; CHECK-NEXT: v_mov_b32_e32 v31, v41
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_lshlrev_b32_e32 v2, 31, v42
; CHECK-NEXT: v_and_b32_e32 v2, v2, v43
; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; CHECK-NEXT: v_or_b32_e32 v1, v2, v1
; CHECK-NEXT: v_readlane_b32 s45, v40, 13
; CHECK-NEXT: v_readlane_b32 s44, v40, 12
; CHECK-NEXT: v_readlane_b32 s43, v40, 11
; CHECK-NEXT: v_readlane_b32 s42, v40, 10
; CHECK-NEXT: v_readlane_b32 s41, v40, 9
; CHECK-NEXT: v_readlane_b32 s40, v40, 8
; CHECK-NEXT: v_readlane_b32 s39, v40, 7
; CHECK-NEXT: v_readlane_b32 s38, v40, 6
; CHECK-NEXT: v_readlane_b32 s37, v40, 5
; CHECK-NEXT: v_readlane_b32 s36, v40, 4
; CHECK-NEXT: v_readlane_b32 s35, v40, 3
; CHECK-NEXT: v_readlane_b32 s34, v40, 2
; CHECK-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-NEXT: v_readlane_b32 s30, v40, 0
; CHECK-NEXT: v_readlane_b32 s4, v40, 14
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[6:7]
; CHECK-NEXT: s_addk_i32 s32, 0xf800
; CHECK-NEXT: s_mov_b32 s33, s4
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
%call = tail call fast double @_Z4powndi(double %x, i32 %y)
ret double %call
}
; pown(half, i32) called with the 'fast' flag where the exponent is
; %y.arg << 1, so its low bit is always zero. The expected assembly below
; calls _Z4log2Dh on x with bit 15 cleared (v_and_b32 0x7fff), multiplies by
; y converted i32 -> f32 -> f16, and calls _Z4exp2Dh on the product. No sign
; fix-up follows the second call: the register reloads start immediately
; after it, and the original x is not kept live across the calls.
define half @test_pown_fast_f16_known_even(half %x, i32 %y.arg) {
; CHECK-LABEL: test_pown_fast_f16_known_even:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 s16, s33
; CHECK-NEXT: s_mov_b32 s33, s32
; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[18:19]
; CHECK-NEXT: v_writelane_b32 v40, s16, 14
; CHECK-NEXT: v_writelane_b32 v40, s30, 0
; CHECK-NEXT: v_writelane_b32 v40, s31, 1
; CHECK-NEXT: v_writelane_b32 v40, s34, 2
; CHECK-NEXT: v_writelane_b32 v40, s35, 3
; CHECK-NEXT: v_writelane_b32 v40, s36, 4
; CHECK-NEXT: v_writelane_b32 v40, s37, 5
; CHECK-NEXT: v_writelane_b32 v40, s38, 6
; CHECK-NEXT: v_writelane_b32 v40, s39, 7
; CHECK-NEXT: s_addk_i32 s32, 0x400
; CHECK-NEXT: v_writelane_b32 v40, s40, 8
; CHECK-NEXT: v_writelane_b32 v40, s41, 9
; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2Dh@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2Dh@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT: v_writelane_b32 v40, s42, 10
; CHECK-NEXT: v_writelane_b32 v40, s43, 11
; CHECK-NEXT: v_writelane_b32 v40, s44, 12
; CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT: v_writelane_b32 v40, s45, 13
; CHECK-NEXT: v_mov_b32_e32 v41, v31
; CHECK-NEXT: s_mov_b32 s42, s15
; CHECK-NEXT: s_mov_b32 s43, s14
; CHECK-NEXT: s_mov_b32 s44, s13
; CHECK-NEXT: s_mov_b32 s45, s12
; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11]
; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9]
; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
; CHECK-NEXT: v_lshlrev_b32_e32 v42, 1, v1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_cvt_f32_i32_e32 v1, v42
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2Dh@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2Dh@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT: v_cvt_f16_f32_e32 v1, v1
; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37]
; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
; CHECK-NEXT: v_mul_f16_e32 v0, v0, v1
; CHECK-NEXT: s_mov_b32 s12, s45
; CHECK-NEXT: s_mov_b32 s13, s44
; CHECK-NEXT: s_mov_b32 s14, s43
; CHECK-NEXT: s_mov_b32 s15, s42
; CHECK-NEXT: v_mov_b32_e32 v31, v41
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: v_readlane_b32 s45, v40, 13
; CHECK-NEXT: v_readlane_b32 s44, v40, 12
; CHECK-NEXT: v_readlane_b32 s43, v40, 11
; CHECK-NEXT: v_readlane_b32 s42, v40, 10
; CHECK-NEXT: v_readlane_b32 s41, v40, 9
; CHECK-NEXT: v_readlane_b32 s40, v40, 8
; CHECK-NEXT: v_readlane_b32 s39, v40, 7
; CHECK-NEXT: v_readlane_b32 s38, v40, 6
; CHECK-NEXT: v_readlane_b32 s37, v40, 5
; CHECK-NEXT: v_readlane_b32 s36, v40, 4
; CHECK-NEXT: v_readlane_b32 s35, v40, 3
; CHECK-NEXT: v_readlane_b32 s34, v40, 2
; CHECK-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-NEXT: v_readlane_b32 s30, v40, 0
; CHECK-NEXT: v_readlane_b32 s4, v40, 14
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[6:7]
; CHECK-NEXT: s_addk_i32 s32, 0xfc00
; CHECK-NEXT: s_mov_b32 s33, s4
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
%y = shl i32 %y.arg, 1
%call = tail call fast half @_Z4pownDhi(half %x, i32 %y)
ret half %call
}
; pown(float, i32) called with the 'fast' flag where the exponent is
; %y.arg << 1, so its low bit is always zero. The expected assembly below
; calls _Z4log2f on x with bit 31 cleared (v_and_b32 0x7fffffff), multiplies
; by y converted to float, and calls _Z4exp2f on the product. No sign fix-up
; follows the second call: the register reloads start immediately after it,
; and the original x is not kept live across the calls.
define float @test_pown_fast_f32_known_even(float %x, i32 %y.arg) {
; CHECK-LABEL: test_pown_fast_f32_known_even:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 s16, s33
; CHECK-NEXT: s_mov_b32 s33, s32
; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[18:19]
; CHECK-NEXT: v_writelane_b32 v40, s16, 14
; CHECK-NEXT: v_writelane_b32 v40, s30, 0
; CHECK-NEXT: v_writelane_b32 v40, s31, 1
; CHECK-NEXT: v_writelane_b32 v40, s34, 2
; CHECK-NEXT: v_writelane_b32 v40, s35, 3
; CHECK-NEXT: v_writelane_b32 v40, s36, 4
; CHECK-NEXT: v_writelane_b32 v40, s37, 5
; CHECK-NEXT: v_writelane_b32 v40, s38, 6
; CHECK-NEXT: v_writelane_b32 v40, s39, 7
; CHECK-NEXT: s_addk_i32 s32, 0x400
; CHECK-NEXT: v_writelane_b32 v40, s40, 8
; CHECK-NEXT: v_writelane_b32 v40, s41, 9
; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2f@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2f@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT: v_writelane_b32 v40, s42, 10
; CHECK-NEXT: v_writelane_b32 v40, s43, 11
; CHECK-NEXT: v_writelane_b32 v40, s44, 12
; CHECK-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT: v_writelane_b32 v40, s45, 13
; CHECK-NEXT: v_mov_b32_e32 v41, v31
; CHECK-NEXT: s_mov_b32 s42, s15
; CHECK-NEXT: s_mov_b32 s43, s14
; CHECK-NEXT: s_mov_b32 s44, s13
; CHECK-NEXT: s_mov_b32 s45, s12
; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11]
; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9]
; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
; CHECK-NEXT: v_lshlrev_b32_e32 v42, 1, v1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2f@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2f@gotpcrel32@hi+12
; CHECK-NEXT: v_cvt_f32_i32_e32 v1, v42
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT: v_mul_f32_e32 v0, v0, v1
; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37]
; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
; CHECK-NEXT: s_mov_b32 s12, s45
; CHECK-NEXT: s_mov_b32 s13, s44
; CHECK-NEXT: s_mov_b32 s14, s43
; CHECK-NEXT: s_mov_b32 s15, s42
; CHECK-NEXT: v_mov_b32_e32 v31, v41
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: v_readlane_b32 s45, v40, 13
; CHECK-NEXT: v_readlane_b32 s44, v40, 12
; CHECK-NEXT: v_readlane_b32 s43, v40, 11
; CHECK-NEXT: v_readlane_b32 s42, v40, 10
; CHECK-NEXT: v_readlane_b32 s41, v40, 9
; CHECK-NEXT: v_readlane_b32 s40, v40, 8
; CHECK-NEXT: v_readlane_b32 s39, v40, 7
; CHECK-NEXT: v_readlane_b32 s38, v40, 6
; CHECK-NEXT: v_readlane_b32 s37, v40, 5
; CHECK-NEXT: v_readlane_b32 s36, v40, 4
; CHECK-NEXT: v_readlane_b32 s35, v40, 3
; CHECK-NEXT: v_readlane_b32 s34, v40, 2
; CHECK-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-NEXT: v_readlane_b32 s30, v40, 0
; CHECK-NEXT: v_readlane_b32 s4, v40, 14
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[6:7]
; CHECK-NEXT: s_addk_i32 s32, 0xfc00
; CHECK-NEXT: s_mov_b32 s33, s4
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
%y = shl i32 %y.arg, 1
%call = tail call fast float @_Z4pownfi(float %x, i32 %y)
ret float %call
}
; pown(double, i32) called with the 'fast' flag where the exponent is
; %y.arg << 1, so its low bit is always zero. The expected assembly below
; calls _Z4log2d on x with bit 31 of v1 (the upper register of the v[0:1]
; pair) cleared, multiplies by y converted to double, and calls _Z4exp2d on
; the product. No sign fix-up follows the second call: the register reloads
; start immediately after it, and the original x is not kept live across the
; calls.
define double @test_pown_fast_f64_known_even(double %x, i32 %y.arg) {
; CHECK-LABEL: test_pown_fast_f64_known_even:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 s16, s33
; CHECK-NEXT: s_mov_b32 s33, s32
; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[18:19]
; CHECK-NEXT: v_writelane_b32 v40, s16, 14
; CHECK-NEXT: v_writelane_b32 v40, s30, 0
; CHECK-NEXT: v_writelane_b32 v40, s31, 1
; CHECK-NEXT: v_writelane_b32 v40, s34, 2
; CHECK-NEXT: v_writelane_b32 v40, s35, 3
; CHECK-NEXT: v_writelane_b32 v40, s36, 4
; CHECK-NEXT: v_writelane_b32 v40, s37, 5
; CHECK-NEXT: v_writelane_b32 v40, s38, 6
; CHECK-NEXT: v_writelane_b32 v40, s39, 7
; CHECK-NEXT: s_addk_i32 s32, 0x400
; CHECK-NEXT: v_writelane_b32 v40, s40, 8
; CHECK-NEXT: v_writelane_b32 v40, s41, 9
; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT: v_writelane_b32 v40, s42, 10
; CHECK-NEXT: v_writelane_b32 v40, s43, 11
; CHECK-NEXT: v_writelane_b32 v40, s44, 12
; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT: v_writelane_b32 v40, s45, 13
; CHECK-NEXT: v_mov_b32_e32 v41, v31
; CHECK-NEXT: s_mov_b32 s42, s15
; CHECK-NEXT: s_mov_b32 s43, s14
; CHECK-NEXT: s_mov_b32 s44, s13
; CHECK-NEXT: s_mov_b32 s45, s12
; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11]
; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9]
; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
; CHECK-NEXT: v_lshlrev_b32_e32 v42, 1, v2
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_cvt_f64_i32_e32 v[2:3], v42
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3]
; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37]
; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
; CHECK-NEXT: s_mov_b32 s12, s45
; CHECK-NEXT: s_mov_b32 s13, s44
; CHECK-NEXT: s_mov_b32 s14, s43
; CHECK-NEXT: s_mov_b32 s15, s42
; CHECK-NEXT: v_mov_b32_e32 v31, v41
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: v_readlane_b32 s45, v40, 13
; CHECK-NEXT: v_readlane_b32 s44, v40, 12
; CHECK-NEXT: v_readlane_b32 s43, v40, 11
; CHECK-NEXT: v_readlane_b32 s42, v40, 10
; CHECK-NEXT: v_readlane_b32 s41, v40, 9
; CHECK-NEXT: v_readlane_b32 s40, v40, 8
; CHECK-NEXT: v_readlane_b32 s39, v40, 7
; CHECK-NEXT: v_readlane_b32 s38, v40, 6
; CHECK-NEXT: v_readlane_b32 s37, v40, 5
; CHECK-NEXT: v_readlane_b32 s36, v40, 4
; CHECK-NEXT: v_readlane_b32 s35, v40, 3
; CHECK-NEXT: v_readlane_b32 s34, v40, 2
; CHECK-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-NEXT: v_readlane_b32 s30, v40, 0
; CHECK-NEXT: v_readlane_b32 s4, v40, 14
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[6:7]
; CHECK-NEXT: s_addk_i32 s32, 0xfc00
; CHECK-NEXT: s_mov_b32 s33, s4
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
%y = shl i32 %y.arg, 1
%call = tail call fast double @_Z4powndi(double %x, i32 %y)
ret double %call
}
; pown(half, i32) called with the 'fast' flag where the exponent is
; %y.arg | 1, so its low bit is always one. The expected assembly below
; calls _Z4log2Dh on x with bit 15 cleared (v_and_b32 0x7fff), multiplies by
; y converted i32 -> f32 -> f16, and calls _Z4exp2Dh on the product. After
; the second call the original x is masked with 0xffff8000 and ORed into the
; result; unlike the unknown-parity case there is no shift of y by 31 to
; gate the sign.
define half @test_pown_fast_f16_known_odd(half %x, i32 %y.arg) {
; CHECK-LABEL: test_pown_fast_f16_known_odd:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 s16, s33
; CHECK-NEXT: s_mov_b32 s33, s32
; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[18:19]
; CHECK-NEXT: v_writelane_b32 v40, s16, 14
; CHECK-NEXT: v_writelane_b32 v40, s30, 0
; CHECK-NEXT: v_writelane_b32 v40, s31, 1
; CHECK-NEXT: v_writelane_b32 v40, s34, 2
; CHECK-NEXT: v_writelane_b32 v40, s35, 3
; CHECK-NEXT: v_writelane_b32 v40, s36, 4
; CHECK-NEXT: v_writelane_b32 v40, s37, 5
; CHECK-NEXT: v_writelane_b32 v40, s38, 6
; CHECK-NEXT: v_writelane_b32 v40, s39, 7
; CHECK-NEXT: s_addk_i32 s32, 0x800
; CHECK-NEXT: v_writelane_b32 v40, s40, 8
; CHECK-NEXT: v_writelane_b32 v40, s41, 9
; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2Dh@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2Dh@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT: v_writelane_b32 v40, s42, 10
; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT: v_writelane_b32 v40, s43, 11
; CHECK-NEXT: v_mov_b32_e32 v42, v0
; CHECK-NEXT: v_writelane_b32 v40, s44, 12
; CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v42
; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT: v_writelane_b32 v40, s45, 13
; CHECK-NEXT: v_mov_b32_e32 v41, v31
; CHECK-NEXT: s_mov_b32 s42, s15
; CHECK-NEXT: s_mov_b32 s43, s14
; CHECK-NEXT: s_mov_b32 s44, s13
; CHECK-NEXT: s_mov_b32 s45, s12
; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11]
; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9]
; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
; CHECK-NEXT: v_or_b32_e32 v43, 1, v1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_cvt_f32_i32_e32 v1, v43
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2Dh@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2Dh@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT: v_cvt_f16_f32_e32 v1, v1
; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37]
; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
; CHECK-NEXT: v_mul_f16_e32 v0, v0, v1
; CHECK-NEXT: s_mov_b32 s12, s45
; CHECK-NEXT: s_mov_b32 s13, s44
; CHECK-NEXT: s_mov_b32 s14, s43
; CHECK-NEXT: s_mov_b32 s15, s42
; CHECK-NEXT: v_mov_b32_e32 v31, v41
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_and_b32_e32 v1, 0xffff8000, v42
; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; CHECK-NEXT: v_or_b32_e32 v0, v1, v0
; CHECK-NEXT: v_readlane_b32 s45, v40, 13
; CHECK-NEXT: v_readlane_b32 s44, v40, 12
; CHECK-NEXT: v_readlane_b32 s43, v40, 11
; CHECK-NEXT: v_readlane_b32 s42, v40, 10
; CHECK-NEXT: v_readlane_b32 s41, v40, 9
; CHECK-NEXT: v_readlane_b32 s40, v40, 8
; CHECK-NEXT: v_readlane_b32 s39, v40, 7
; CHECK-NEXT: v_readlane_b32 s38, v40, 6
; CHECK-NEXT: v_readlane_b32 s37, v40, 5
; CHECK-NEXT: v_readlane_b32 s36, v40, 4
; CHECK-NEXT: v_readlane_b32 s35, v40, 3
; CHECK-NEXT: v_readlane_b32 s34, v40, 2
; CHECK-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-NEXT: v_readlane_b32 s30, v40, 0
; CHECK-NEXT: v_readlane_b32 s4, v40, 14
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[6:7]
; CHECK-NEXT: s_addk_i32 s32, 0xf800
; CHECK-NEXT: s_mov_b32 s33, s4
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
%y = or i32 %y.arg, 1
%call = tail call fast half @_Z4pownDhi(half %x, i32 %y)
ret half %call
}
; pown(float, i32) called with the 'fast' flag where the exponent is
; %y.arg | 1, so its low bit is always one. The expected assembly below
; calls _Z4log2f on x with bit 31 cleared (v_and_b32 0x7fffffff), multiplies
; by y converted to float, and calls _Z4exp2f on the product. After the
; second call the original x is ANDed with a scalar mask built by
; s_brev_b32 s4, 1 (presumably 0x80000000, the bit-reverse of 1) and ORed
; into the result with v_and_or_b32; unlike the unknown-parity case there is
; no shift of y by 31 to gate the sign.
define float @test_pown_fast_f32_known_odd(float %x, i32 %y.arg) {
; CHECK-LABEL: test_pown_fast_f32_known_odd:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 s16, s33
; CHECK-NEXT: s_mov_b32 s33, s32
; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[18:19]
; CHECK-NEXT: v_writelane_b32 v40, s16, 14
; CHECK-NEXT: v_writelane_b32 v40, s30, 0
; CHECK-NEXT: v_writelane_b32 v40, s31, 1
; CHECK-NEXT: v_writelane_b32 v40, s34, 2
; CHECK-NEXT: v_writelane_b32 v40, s35, 3
; CHECK-NEXT: v_writelane_b32 v40, s36, 4
; CHECK-NEXT: v_writelane_b32 v40, s37, 5
; CHECK-NEXT: v_writelane_b32 v40, s38, 6
; CHECK-NEXT: v_writelane_b32 v40, s39, 7
; CHECK-NEXT: s_addk_i32 s32, 0x800
; CHECK-NEXT: v_writelane_b32 v40, s40, 8
; CHECK-NEXT: v_writelane_b32 v40, s41, 9
; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2f@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2f@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT: v_writelane_b32 v40, s42, 10
; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT: v_writelane_b32 v40, s43, 11
; CHECK-NEXT: v_mov_b32_e32 v42, v0
; CHECK-NEXT: v_writelane_b32 v40, s44, 12
; CHECK-NEXT: v_and_b32_e32 v0, 0x7fffffff, v42
; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT: v_writelane_b32 v40, s45, 13
; CHECK-NEXT: v_mov_b32_e32 v41, v31
; CHECK-NEXT: s_mov_b32 s42, s15
; CHECK-NEXT: s_mov_b32 s43, s14
; CHECK-NEXT: s_mov_b32 s44, s13
; CHECK-NEXT: s_mov_b32 s45, s12
; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11]
; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9]
; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
; CHECK-NEXT: v_or_b32_e32 v43, 1, v1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2f@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2f@gotpcrel32@hi+12
; CHECK-NEXT: v_cvt_f32_i32_e32 v1, v43
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT: v_mul_f32_e32 v0, v0, v1
; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37]
; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
; CHECK-NEXT: s_mov_b32 s12, s45
; CHECK-NEXT: s_mov_b32 s13, s44
; CHECK-NEXT: s_mov_b32 s14, s43
; CHECK-NEXT: s_mov_b32 s15, s42
; CHECK-NEXT: v_mov_b32_e32 v31, v41
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: s_brev_b32 s4, 1
; CHECK-NEXT: v_and_or_b32 v0, v42, s4, v0
; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; CHECK-NEXT: v_readlane_b32 s45, v40, 13
; CHECK-NEXT: v_readlane_b32 s44, v40, 12
; CHECK-NEXT: v_readlane_b32 s43, v40, 11
; CHECK-NEXT: v_readlane_b32 s42, v40, 10
; CHECK-NEXT: v_readlane_b32 s41, v40, 9
; CHECK-NEXT: v_readlane_b32 s40, v40, 8
; CHECK-NEXT: v_readlane_b32 s39, v40, 7
; CHECK-NEXT: v_readlane_b32 s38, v40, 6
; CHECK-NEXT: v_readlane_b32 s37, v40, 5
; CHECK-NEXT: v_readlane_b32 s36, v40, 4
; CHECK-NEXT: v_readlane_b32 s35, v40, 3
; CHECK-NEXT: v_readlane_b32 s34, v40, 2
; CHECK-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-NEXT: v_readlane_b32 s30, v40, 0
; CHECK-NEXT: v_readlane_b32 s4, v40, 14
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[6:7]
; CHECK-NEXT: s_addk_i32 s32, 0xf800
; CHECK-NEXT: s_mov_b32 s33, s4
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
%y = or i32 %y.arg, 1
%call = tail call fast float @_Z4pownfi(float %x, i32 %y)
ret float %call
}
; pown(double, i32) called with the 'fast' flag where the exponent is
; %y.arg | 1, so its low bit is always one. The expected assembly below
; calls _Z4log2d on x with bit 31 of v1 (the upper register of the v[0:1]
; pair) cleared, multiplies by y converted to double, and calls _Z4exp2d on
; the product. After the second call the original v1 is masked with
; 0x80000000 and ORed into the result's v1; unlike the unknown-parity case
; there is no shift of y by 31 to gate the sign.
define double @test_pown_fast_f64_known_odd(double %x, i32 %y.arg) {
; CHECK-LABEL: test_pown_fast_f64_known_odd:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_mov_b32 s16, s33
; CHECK-NEXT: s_mov_b32 s33, s32
; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1
; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[18:19]
; CHECK-NEXT: v_writelane_b32 v40, s16, 14
; CHECK-NEXT: v_writelane_b32 v40, s30, 0
; CHECK-NEXT: v_writelane_b32 v40, s31, 1
; CHECK-NEXT: v_writelane_b32 v40, s34, 2
; CHECK-NEXT: v_writelane_b32 v40, s35, 3
; CHECK-NEXT: v_writelane_b32 v40, s36, 4
; CHECK-NEXT: v_writelane_b32 v40, s37, 5
; CHECK-NEXT: v_writelane_b32 v40, s38, 6
; CHECK-NEXT: v_writelane_b32 v40, s39, 7
; CHECK-NEXT: s_addk_i32 s32, 0x800
; CHECK-NEXT: v_writelane_b32 v40, s40, 8
; CHECK-NEXT: v_writelane_b32 v40, s41, 9
; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT: v_writelane_b32 v40, s42, 10
; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
; CHECK-NEXT: v_writelane_b32 v40, s43, 11
; CHECK-NEXT: v_mov_b32_e32 v42, v1
; CHECK-NEXT: v_writelane_b32 v40, s44, 12
; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v42
; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT: v_writelane_b32 v40, s45, 13
; CHECK-NEXT: v_mov_b32_e32 v41, v31
; CHECK-NEXT: s_mov_b32 s42, s15
; CHECK-NEXT: s_mov_b32 s43, s14
; CHECK-NEXT: s_mov_b32 s44, s13
; CHECK-NEXT: s_mov_b32 s45, s12
; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11]
; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9]
; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
; CHECK-NEXT: v_or_b32_e32 v43, 1, v2
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_cvt_f64_i32_e32 v[2:3], v43
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12
; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41]
; CHECK-NEXT: v_mul_f64 v[0:1], v[0:1], v[2:3]
; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37]
; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
; CHECK-NEXT: s_mov_b32 s12, s45
; CHECK-NEXT: s_mov_b32 s13, s44
; CHECK-NEXT: s_mov_b32 s14, s43
; CHECK-NEXT: s_mov_b32 s15, s42
; CHECK-NEXT: v_mov_b32_e32 v31, v41
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_and_b32_e32 v2, 0x80000000, v42
; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; CHECK-NEXT: v_or_b32_e32 v1, v2, v1
; CHECK-NEXT: v_readlane_b32 s45, v40, 13
; CHECK-NEXT: v_readlane_b32 s44, v40, 12
; CHECK-NEXT: v_readlane_b32 s43, v40, 11
; CHECK-NEXT: v_readlane_b32 s42, v40, 10
; CHECK-NEXT: v_readlane_b32 s41, v40, 9
; CHECK-NEXT: v_readlane_b32 s40, v40, 8
; CHECK-NEXT: v_readlane_b32 s39, v40, 7
; CHECK-NEXT: v_readlane_b32 s38, v40, 6
; CHECK-NEXT: v_readlane_b32 s37, v40, 5
; CHECK-NEXT: v_readlane_b32 s36, v40, 4
; CHECK-NEXT: v_readlane_b32 s35, v40, 3
; CHECK-NEXT: v_readlane_b32 s34, v40, 2
; CHECK-NEXT: v_readlane_b32 s31, v40, 1
; CHECK-NEXT: v_readlane_b32 s30, v40, 0
; CHECK-NEXT: v_readlane_b32 s4, v40, 14
; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[6:7]
; CHECK-NEXT: s_addk_i32 s32, 0xf800
; CHECK-NEXT: s_mov_b32 s33, s4
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
%y = or i32 %y.arg, 1
%call = tail call fast double @_Z4powndi(double %x, i32 %y)
ret double %call
}