llvm-project/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
Joe Nash b4b8d4e5d0
[AMDGPU] Fix VOPD checks for commuting OpX and OpY (#178772)
We need to check that OpX does not write the sources of OpY, but if we
swap OpX and OpY with respect to program order, the check was not
swapped correctly.

The checks on gfx1250 can be relaxed slightly, that is planned for a
future patch.

---------

Co-authored-by: Matt Arsenault <arsenm2@gmail.com>
2026-01-30 15:13:33 +00:00

7297 lines
311 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -enable-var-scope --check-prefixes=SI %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -enable-var-scope --check-prefixes=VI %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope --check-prefixes=GFX9 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-FAKE16 %s
; Declarations of the intrinsics referenced by the test functions.
; The scalar f16/f32/f64 copysign variants are called in the functions that
; follow; the vector variants and workitem.id.x are presumably used by tests
; further down the file - TODO confirm.
declare half @llvm.copysign.f16(half, half) #0
declare float @llvm.copysign.f32(float, float) #0
declare double @llvm.copysign.f64(double, double) #0
declare <2 x half> @llvm.copysign.v2f16(<2 x half>, <2 x half>) #0
declare <3 x half> @llvm.copysign.v3f16(<3 x half>, <3 x half>) #0
declare <4 x half> @llvm.copysign.v4f16(<4 x half>, <4 x half>) #0
declare i32 @llvm.amdgcn.workitem.id.x() #0
; f16 copysign where both the magnitude and the sign arrive as inreg
; arguments of an amdgpu_ps shader; the result is bitcast to i16 and returned.
; The SI checks expect a pure scalar and/and/or sequence. The VI, GFX9 and
; GFX11 checks expect the operands to be copied into VGPRs, combined with
; v_bfi_b32 using the 0x7fff mask, and read back with v_readfirstlane_b32.
define amdgpu_ps i16 @s_copysign_f16(half inreg %mag, half inreg %sign) {
; SI-LABEL: s_copysign_f16:
; SI: ; %bb.0:
; SI-NEXT: s_and_b32 s1, s1, 0xffff8000
; SI-NEXT: s_and_b32 s0, s0, 0x7fff
; SI-NEXT: s_or_b32 s0, s0, s1
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: s_copysign_f16:
; VI: ; %bb.0:
; VI-NEXT: s_movk_i32 s2, 0x7fff
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_bfi_b32 v0, s2, v0, v1
; VI-NEXT: v_readfirstlane_b32 s0, v0
; VI-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_copysign_f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_movk_i32 s2, 0x7fff
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_bfi_b32 v0, s2, v0, v1
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX11-TRUE16-LABEL: s_copysign_f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s1
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v1, v0
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-TRUE16-NEXT: ; return to shader part epilog
;
; GFX11-FAKE16-LABEL: s_copysign_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-FAKE16-NEXT: ; return to shader part epilog
%out = call half @llvm.copysign.f16(half %mag, half %sign)
%cast = bitcast half %out to i16
ret i16 %cast
}
; Scalar f16 copysign with a constant +0.0 sign operand.
; Every target is expected to emit a single s_and_b32 with 0x7fff, i.e. the
; sign bit of the magnitude is simply cleared.
define amdgpu_ps i16 @s_test_copysign_f16_0(half inreg %mag) {
; SI-LABEL: s_test_copysign_f16_0:
; SI: ; %bb.0:
; SI-NEXT: s_and_b32 s0, s0, 0x7fff
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: s_test_copysign_f16_0:
; VI: ; %bb.0:
; VI-NEXT: s_and_b32 s0, s0, 0x7fff
; VI-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_test_copysign_f16_0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_and_b32 s0, s0, 0x7fff
; GFX9-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_test_copysign_f16_0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_and_b32 s0, s0, 0x7fff
; GFX11-NEXT: ; return to shader part epilog
%result = call half @llvm.copysign.f16(half %mag, half 0.0)
%cast = bitcast half %result to i16
ret i16 %cast
}
; Scalar f16 copysign with a constant 1.0 sign operand.
; The expected code on every target is one s_and_b32 with 0x7fff, the same
; output as the +0.0 sign case.
define amdgpu_ps i16 @s_test_copysign_f16_1(half inreg %mag) {
; SI-LABEL: s_test_copysign_f16_1:
; SI: ; %bb.0:
; SI-NEXT: s_and_b32 s0, s0, 0x7fff
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: s_test_copysign_f16_1:
; VI: ; %bb.0:
; VI-NEXT: s_and_b32 s0, s0, 0x7fff
; VI-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_test_copysign_f16_1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_and_b32 s0, s0, 0x7fff
; GFX9-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_test_copysign_f16_1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_and_b32 s0, s0, 0x7fff
; GFX11-NEXT: ; return to shader part epilog
%result = call half @llvm.copysign.f16(half %mag, half 1.0)
%cast = bitcast half %result to i16
ret i16 %cast
}
; Scalar f16 copysign with a constant 10.0 sign operand.
; As with the other positive-constant sign tests, a single s_and_b32 with
; 0x7fff is expected on every target.
define amdgpu_ps i16 @s_test_copysign_f16_10.0(half inreg %mag) {
; SI-LABEL: s_test_copysign_f16_10.0:
; SI: ; %bb.0:
; SI-NEXT: s_and_b32 s0, s0, 0x7fff
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: s_test_copysign_f16_10.0:
; VI: ; %bb.0:
; VI-NEXT: s_and_b32 s0, s0, 0x7fff
; VI-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_test_copysign_f16_10.0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_and_b32 s0, s0, 0x7fff
; GFX9-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_test_copysign_f16_10.0:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_and_b32 s0, s0, 0x7fff
; GFX11-NEXT: ; return to shader part epilog
%result = call half @llvm.copysign.f16(half %mag, half 10.0)
%cast = bitcast half %result to i16
ret i16 %cast
}
; Scalar f16 copysign with a constant -1.0 sign operand.
; The sign bit is expected to be forced on: the SI checks use s_or_b32 with
; 0xffff8000, while the VI, GFX9 and GFX11 checks use s_bitset1_b32 on bit 15.
define amdgpu_ps i16 @s_test_copysign_f16_neg1(half inreg %mag) {
; SI-LABEL: s_test_copysign_f16_neg1:
; SI: ; %bb.0:
; SI-NEXT: s_or_b32 s0, s0, 0xffff8000
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: s_test_copysign_f16_neg1:
; VI: ; %bb.0:
; VI-NEXT: s_bitset1_b32 s0, 15
; VI-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_test_copysign_f16_neg1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_bitset1_b32 s0, 15
; GFX9-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_test_copysign_f16_neg1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_bitset1_b32 s0, 15
; GFX11-NEXT: ; return to shader part epilog
%result = call half @llvm.copysign.f16(half %mag, half -1.0)
%cast = bitcast half %result to i16
ret i16 %cast
}
; Scalar f16 copysign with a constant -10.0 sign operand.
; Expected output matches the -1.0 sign case: s_or_b32 with 0xffff8000 in the
; SI checks, s_bitset1_b32 on bit 15 in the VI, GFX9 and GFX11 checks.
define amdgpu_ps i16 @s_test_copysign_f16_neg10(half inreg %mag) {
; SI-LABEL: s_test_copysign_f16_neg10:
; SI: ; %bb.0:
; SI-NEXT: s_or_b32 s0, s0, 0xffff8000
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: s_test_copysign_f16_neg10:
; VI: ; %bb.0:
; VI-NEXT: s_bitset1_b32 s0, 15
; VI-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_test_copysign_f16_neg10:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_bitset1_b32 s0, 15
; GFX9-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_test_copysign_f16_neg10:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_bitset1_b32 s0, 15
; GFX11-NEXT: ; return to shader part epilog
%result = call half @llvm.copysign.f16(half %mag, half -10.0)
%cast = bitcast half %result to i16
ret i16 %cast
}
; Scalar f16 copysign with a constant 0.0 magnitude and a variable sign.
; Only the sign bit of the input survives, so every target is expected to
; emit a single s_and_b32 with 0x8000.
define amdgpu_ps i16 @s_test_copysign_f16_0_mag(half inreg %sign) {
; SI-LABEL: s_test_copysign_f16_0_mag:
; SI: ; %bb.0:
; SI-NEXT: s_and_b32 s0, s0, 0x8000
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: s_test_copysign_f16_0_mag:
; VI: ; %bb.0:
; VI-NEXT: s_and_b32 s0, s0, 0x8000
; VI-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_test_copysign_f16_0_mag:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_and_b32 s0, s0, 0x8000
; GFX9-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_test_copysign_f16_0_mag:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_and_b32 s0, s0, 0x8000
; GFX11-NEXT: ; return to shader part epilog
%result = call half @llvm.copysign.f16(half 0.0, half %sign)
%cast = bitcast half %result to i16
ret i16 %cast
}
; Scalar f16 copysign with a constant 1.0 magnitude and a variable sign.
; Expected code isolates the sign bit with s_and_b32 and then ORs in 0x3c00,
; the f16 bit pattern of 1.0. The SI checks use the mask 0xffff8000 where the
; other targets use 0x8000; the GFX11 checks also expect an s_delay_alu
; between the two scalar instructions.
define amdgpu_ps i16 @s_test_copysign_f16_1_mag(half inreg %sign) {
; SI-LABEL: s_test_copysign_f16_1_mag:
; SI: ; %bb.0:
; SI-NEXT: s_and_b32 s0, s0, 0xffff8000
; SI-NEXT: s_or_b32 s0, s0, 0x3c00
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: s_test_copysign_f16_1_mag:
; VI: ; %bb.0:
; VI-NEXT: s_and_b32 s0, s0, 0x8000
; VI-NEXT: s_or_b32 s0, s0, 0x3c00
; VI-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_test_copysign_f16_1_mag:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_and_b32 s0, s0, 0x8000
; GFX9-NEXT: s_or_b32 s0, s0, 0x3c00
; GFX9-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_test_copysign_f16_1_mag:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_and_b32 s0, s0, 0x8000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s0, s0, 0x3c00
; GFX11-NEXT: ; return to shader part epilog
%result = call half @llvm.copysign.f16(half 1.0, half %sign)
%cast = bitcast half %result to i16
ret i16 %cast
}
; f16 copysign with a constant 10.0 magnitude and a variable inreg sign.
; Expected code masks the sign bit and ORs in 0x4900, the f16 bit pattern of
; 10.0, then moves the scalar result into v0 for the return.
; NOTE(review): unlike the neighbouring s_* tests this function is not marked
; amdgpu_ps, so the checks contain the s_waitcnt entry and s_setpc_b64 return
; of an ordinary function, and the sign operand is read from s16 (s0 on GFX11).
; Confirm whether the missing amdgpu_ps is intentional before changing it; the
; assertions would need regenerating.
define i16 @s_test_copysign_f16_10_mag(half inreg %sign) {
; SI-LABEL: s_test_copysign_f16_10_mag:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_and_b32 s4, s16, 0xffff8000
; SI-NEXT: s_or_b32 s4, s4, 0x4900
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: s_test_copysign_f16_10_mag:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_and_b32 s4, s16, 0x8000
; VI-NEXT: s_or_b32 s4, s4, 0x4900
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: s_test_copysign_f16_10_mag:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_and_b32 s4, s16, 0x8000
; GFX9-NEXT: s_or_b32 s4, s4, 0x4900
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: s_test_copysign_f16_10_mag:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_and_b32 s0, s0, 0x8000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s0, s0, 0x4900
; GFX11-NEXT: v_mov_b32_e32 v0, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call half @llvm.copysign.f16(half 10.0, half %sign)
%cast = bitcast half %result to i16
ret i16 %cast
}
; Scalar f16 copysign with a constant -1.0 magnitude and a variable sign.
; The constant's own sign is irrelevant to the result: the checks OR in
; 0x3c00, the same constant expected by the +1.0 magnitude test.
define amdgpu_ps i16 @s_test_copysign_f16_neg1_mag(half inreg %sign) {
; SI-LABEL: s_test_copysign_f16_neg1_mag:
; SI: ; %bb.0:
; SI-NEXT: s_and_b32 s0, s0, 0xffff8000
; SI-NEXT: s_or_b32 s0, s0, 0x3c00
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: s_test_copysign_f16_neg1_mag:
; VI: ; %bb.0:
; VI-NEXT: s_and_b32 s0, s0, 0x8000
; VI-NEXT: s_or_b32 s0, s0, 0x3c00
; VI-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_test_copysign_f16_neg1_mag:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_and_b32 s0, s0, 0x8000
; GFX9-NEXT: s_or_b32 s0, s0, 0x3c00
; GFX9-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_test_copysign_f16_neg1_mag:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_and_b32 s0, s0, 0x8000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s0, s0, 0x3c00
; GFX11-NEXT: ; return to shader part epilog
%result = call half @llvm.copysign.f16(half -1.0, half %sign)
%cast = bitcast half %result to i16
ret i16 %cast
}
; Scalar f16 copysign with a constant -10.0 magnitude and a variable sign.
; The checks mask the sign bit of the input and OR in 0x4900 (the f16 bit
; pattern of +10.0), so the sign of the constant magnitude is dropped.
define amdgpu_ps i16 @s_test_copysign_f16_neg10_mag(half inreg %sign) {
; SI-LABEL: s_test_copysign_f16_neg10_mag:
; SI: ; %bb.0:
; SI-NEXT: s_and_b32 s0, s0, 0xffff8000
; SI-NEXT: s_or_b32 s0, s0, 0x4900
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: s_test_copysign_f16_neg10_mag:
; VI: ; %bb.0:
; VI-NEXT: s_and_b32 s0, s0, 0x8000
; VI-NEXT: s_or_b32 s0, s0, 0x4900
; VI-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_test_copysign_f16_neg10_mag:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_and_b32 s0, s0, 0x8000
; GFX9-NEXT: s_or_b32 s0, s0, 0x4900
; GFX9-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_test_copysign_f16_neg10_mag:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_and_b32 s0, s0, 0x8000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s0, s0, 0x4900
; GFX11-NEXT: ; return to shader part epilog
%result = call half @llvm.copysign.f16(half -10.0, half %sign)
%cast = bitcast half %result to i16
ret i16 %cast
}
; f16 copysign in an ordinary (non-shader) function with both operands passed
; as plain arguments, which the checks show arriving in v0 and v1.
; All targets are expected to use one v_bfi_b32 with the 0x7fff mask; SI, VI
; and GFX9 first materialize the mask in s4, GFX11 encodes it as a literal.
define half @v_copysign_f16(half %mag, half %sign) {
; SI-LABEL: v_copysign_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: s_movk_i32 s4, 0x7fff
; SI-NEXT: v_bfi_b32 v0, s4, v0, v1
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_copysign_f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_movk_i32 s4, 0x7fff
; VI-NEXT: v_bfi_b32 v0, s4, v0, v1
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_copysign_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call half @llvm.copysign.f16(half %mag, half %sign)
ret half %result
}
; VGPR f16 copysign with a constant +0.0 sign operand.
; Expected code clears the sign bit with a single AND of 0x7fff. The
; GFX11-TRUE16 checks use the 16-bit v_and_b16 on v0.l; every other
; configuration uses the 32-bit v_and_b32_e32.
define half @v_test_copysign_f16_0(half %mag) {
; SI-LABEL: v_test_copysign_f16_0:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_test_copysign_f16_0:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_test_copysign_f16_0:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_test_copysign_f16_0:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_test_copysign_f16_0:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%result = call half @llvm.copysign.f16(half %mag, half 0.0)
ret half %result
}
; VGPR f16 copysign with a constant 1.0 sign operand.
; The expected output is identical to the +0.0 sign test: one AND with 0x7fff
; (v_and_b16 on v0.l for GFX11-TRUE16, v_and_b32_e32 elsewhere).
define half @v_test_copysign_f16_1(half %mag) {
; SI-LABEL: v_test_copysign_f16_1:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_test_copysign_f16_1:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_test_copysign_f16_1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_test_copysign_f16_1:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_test_copysign_f16_1:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%result = call half @llvm.copysign.f16(half %mag, half 1.0)
ret half %result
}
; VGPR f16 copysign with a constant 10.0 sign operand.
; Like the other positive-constant sign tests, the checks expect a single AND
; with 0x7fff (16-bit form for GFX11-TRUE16, 32-bit form elsewhere).
define half @v_test_copysign_f16_10(half %mag) {
; SI-LABEL: v_test_copysign_f16_10:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_test_copysign_f16_10:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_test_copysign_f16_10:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_test_copysign_f16_10:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_test_copysign_f16_10:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%result = call half @llvm.copysign.f16(half %mag, half 10.0)
ret half %result
}
; VGPR f16 copysign with a constant -1.0 sign operand.
; Expected code sets the sign bit with one OR. The SI checks use the constant
; 0xffff8000, the other targets use 0x8000, and the GFX11-TRUE16 checks use
; the 16-bit v_or_b16 on v0.l.
define half @v_test_copysign_f16_neg1(half %mag) {
; SI-LABEL: v_test_copysign_f16_neg1:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_or_b32_e32 v0, 0xffff8000, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_test_copysign_f16_neg1:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_or_b32_e32 v0, 0x8000, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_test_copysign_f16_neg1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_or_b32_e32 v0, 0x8000, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_test_copysign_f16_neg1:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, 0x8000, v0.l
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_test_copysign_f16_neg1:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, 0x8000, v0
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%result = call half @llvm.copysign.f16(half %mag, half -1.0)
ret half %result
}
; VGPR f16 copysign with a constant -10.0 sign operand.
; Expected output matches the -1.0 sign test: a single OR that sets the sign
; bit (0xffff8000 in the SI checks, 0x8000 elsewhere, v_or_b16 for
; GFX11-TRUE16).
define half @v_test_copysign_f16_neg10(half %mag) {
; SI-LABEL: v_test_copysign_f16_neg10:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_or_b32_e32 v0, 0xffff8000, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_test_copysign_f16_neg10:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_or_b32_e32 v0, 0x8000, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_test_copysign_f16_neg10:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_or_b32_e32 v0, 0x8000, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_test_copysign_f16_neg10:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, 0x8000, v0.l
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_test_copysign_f16_neg10:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, 0x8000, v0
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%result = call half @llvm.copysign.f16(half %mag, half -10.0)
ret half %result
}
; f32 copysign whose magnitude is an f16 value extended to float and whose
; sign is already a float.
; Expected code converts the magnitude with v_cvt_f32_f16 and then merges the
; sign with v_bfi_b32. SI, VI and GFX9 build the mask in s4 via
; "s_brev_b32 s4, -2"; GFX11 uses the literal 0x7fffffff.
define float @v_copysign_out_f32_mag_f16_sign_f32(half %mag, float %sign) {
; SI-LABEL: v_copysign_out_f32_mag_f16_sign_f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: s_brev_b32 s4, -2
; SI-NEXT: v_bfi_b32 v0, s4, v0, v1
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_copysign_out_f32_mag_f16_sign_f32:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v0, v0
; VI-NEXT: s_brev_b32 s4, -2
; VI-NEXT: v_bfi_b32 v0, s4, v0, v1
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_out_f32_mag_f16_sign_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX9-NEXT: s_brev_b32 s4, -2
; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_copysign_out_f32_mag_f16_sign_f32:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v1
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_copysign_out_f32_mag_f16_sign_f32:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v1
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%mag.ext = fpext half %mag to float
%out = call float @llvm.copysign.f32(float %mag.ext, float %sign)
ret float %out
}
; f64 copysign whose magnitude is an f16 value extended to double and whose
; sign is already a double.
; Expected code converts f16 to f32 and then f32 to f64 into v[0:1], and
; applies v_bfi_b32 only to v1, the register that ends up holding the sign
; bit, taking the sign from v2.
define double @v_copysign_out_f64_mag_f16_sign_f64(half %mag, double %sign) {
; SI-LABEL: v_copysign_out_f64_mag_f16_sign_f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: s_brev_b32 s4, -2
; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; SI-NEXT: v_bfi_b32 v1, s4, v1, v2
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_copysign_out_f64_mag_f16_sign_f64:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v0, v0
; VI-NEXT: s_brev_b32 s4, -2
; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; VI-NEXT: v_bfi_b32 v1, s4, v1, v2
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_out_f64_mag_f16_sign_f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX9-NEXT: s_brev_b32 s4, -2
; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_copysign_out_f64_mag_f16_sign_f64:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v2
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_copysign_out_f64_mag_f16_sign_f64:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; GFX11-FAKE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v2
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%mag.ext = fpext half %mag to double
%out = call double @llvm.copysign.f64(double %mag.ext, double %sign)
ret double %out
}
; f32 copysign whose sign operand is an f16 value extended to float.
; The SI checks expect a real v_cvt_f32_f16 of the sign. The VI, GFX9 and
; GFX11-FAKE16 checks instead expect the f16 to be shifted left by 16 so its
; sign bit lands in bit 31, and the GFX11-TRUE16 checks expect v1.l to be
; copied into v1.h; all then finish with v_bfi_b32.
define float @v_copysign_out_f32_mag_f32_sign_f16(float %mag, half %sign) {
; SI-LABEL: v_copysign_out_f32_mag_f32_sign_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: s_brev_b32 s4, -2
; SI-NEXT: v_bfi_b32 v0, s4, v0, v1
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_copysign_out_f32_mag_f32_sign_f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: s_brev_b32 s4, -2
; VI-NEXT: v_bfi_b32 v0, s4, v0, v1
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_out_f32_mag_f32_sign_f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: s_brev_b32 s4, -2
; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_copysign_out_f32_mag_f32_sign_f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v1.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v1
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_copysign_out_f32_mag_f32_sign_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v1
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%sign.ext = fpext half %sign to float
%out = call float @llvm.copysign.f32(float %mag, float %sign.ext)
ret float %out
}
; f64 copysign whose sign operand is an f16 value extended to double.
; None of the checks expect a conversion to f64. The sign (in v2) is brought
; to bit 31 with v_cvt_f32_f16 on SI, a left shift by 16 on VI, GFX9 and
; GFX11-FAKE16, or a v2.l to v2.h move on GFX11-TRUE16, and v_bfi_b32 is then
; applied to v1 only.
define double @v_copysign_out_f64_mag_f64_sign_f16(double %mag, half %sign) {
; SI-LABEL: v_copysign_out_f64_mag_f64_sign_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: s_brev_b32 s4, -2
; SI-NEXT: v_bfi_b32 v1, s4, v1, v2
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_copysign_out_f64_mag_f64_sign_f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; VI-NEXT: s_brev_b32 s4, -2
; VI-NEXT: v_bfi_b32 v1, s4, v1, v2
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_out_f64_mag_f64_sign_f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX9-NEXT: s_brev_b32 s4, -2
; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_copysign_out_f64_mag_f64_sign_f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v2.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v2
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_copysign_out_f64_mag_f64_sign_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v2
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%sign.ext = fpext half %sign to double
%out = call double @llvm.copysign.f64(double %mag, double %sign.ext)
ret double %out
}
; f16 copysign whose sign operand is a float truncated to half.
; No checks expect a float-to-half conversion instruction. The SI checks mask
; bit 31 of the sign, shift it right by 16 and OR it into the masked
; magnitude; the VI, GFX9 and GFX11 checks shift the sign right by 16 and use
; v_bfi_b32 with the 0x7fff mask.
define half @v_copysign_out_f16_mag_f16_sign_f32(half %mag, float %sign) {
; SI-LABEL: v_copysign_out_f16_mag_f16_sign_f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0x80000000, v1
; SI-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_copysign_out_f16_mag_f16_sign_f32:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; VI-NEXT: s_movk_i32 s4, 0x7fff
; VI-NEXT: v_bfi_b32 v0, s4, v0, v1
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_out_f16_mag_f16_sign_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_copysign_out_f16_mag_f16_sign_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%sign.trunc = fptrunc float %sign to half
%out = call half @llvm.copysign.f16(half %mag, half %sign.trunc)
ret half %out
}
; f16 copysign whose sign operand is a double truncated to half.
; As in the float-sign variant, no conversion instruction is expected. The
; checks read the sign from v2 only, either masking and shifting it (SI) or
; shifting it right by 16 ahead of v_bfi_b32 (VI, GFX9, GFX11).
define half @v_copysign_out_f16_mag_f16_sign_f64(half %mag, double %sign) {
; SI-LABEL: v_copysign_out_f16_mag_f16_sign_f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_and_b32_e32 v1, 0x80000000, v2
; SI-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_copysign_out_f16_mag_f16_sign_f64:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; VI-NEXT: s_movk_i32 s4, 0x7fff
; VI-NEXT: v_bfi_b32 v0, s4, v0, v1
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_out_f16_mag_f16_sign_f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_copysign_out_f16_mag_f16_sign_f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%sign.trunc = fptrunc double %sign to half
%out = call half @llvm.copysign.f16(half %mag, half %sign.trunc)
ret half %out
}
; f16 copysign whose magnitude is a float truncated to half and whose sign is
; already a half.
; Here the truncation is a real conversion: every target is expected to emit
; v_cvt_f16_f32 on the magnitude and then v_bfi_b32 with the 0x7fff mask. The
; GFX11-TRUE16 checks write the conversion result to v0.l.
define half @v_copysign_out_f16_mag_f32_sign_f16(float %mag, half %sign) {
; SI-LABEL: v_copysign_out_f16_mag_f32_sign_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_movk_i32 s4, 0x7fff
; SI-NEXT: v_bfi_b32 v0, s4, v0, v1
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_copysign_out_f16_mag_f32_sign_f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-NEXT: s_movk_i32 s4, 0x7fff
; VI-NEXT: v_bfi_b32 v0, s4, v0, v1
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_out_f16_mag_f32_sign_f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_copysign_out_f16_mag_f32_sign_f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_copysign_out_f16_mag_f32_sign_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%mag.trunc = fptrunc float %mag to half
%out = call half @llvm.copysign.f16(half %mag.trunc, half %sign)
ret half %out
}
define half @v_copysign_out_f16_mag_f64_sign_f16(double %mag, half %sign) {
; SI-LABEL: v_copysign_out_f16_mag_f64_sign_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_and_b32_e32 v3, 0x1ff, v1
; SI-NEXT: v_or_b32_e32 v0, v3, v0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v1
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT: v_and_b32_e32 v3, 0xffe, v3
; SI-NEXT: v_bfe_u32 v1, v1, 20, 11
; SI-NEXT: v_or_b32_e32 v0, v3, v0
; SI-NEXT: v_sub_i32_e32 v4, vcc, 0x3f1, v1
; SI-NEXT: v_or_b32_e32 v3, 0x1000, v0
; SI-NEXT: v_med3_i32 v4, v4, 0, 13
; SI-NEXT: v_lshrrev_b32_e32 v5, v4, v3
; SI-NEXT: v_lshlrev_b32_e32 v4, v4, v5
; SI-NEXT: v_cmp_ne_u32_e32 vcc, v4, v3
; SI-NEXT: s_movk_i32 s4, 0xfc10
; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; SI-NEXT: v_add_i32_e32 v1, vcc, s4, v1
; SI-NEXT: v_lshlrev_b32_e32 v4, 12, v1
; SI-NEXT: v_or_b32_e32 v3, v5, v3
; SI-NEXT: v_or_b32_e32 v4, v0, v4
; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v1
; SI-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
; SI-NEXT: v_and_b32_e32 v4, 7, v3
; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4
; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4
; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; SI-NEXT: v_or_b32_e32 v4, v4, v5
; SI-NEXT: v_lshrrev_b32_e32 v3, 2, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, v3, v4
; SI-NEXT: v_mov_b32_e32 v4, 0x7c00
; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v1
; SI-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
; SI-NEXT: v_mov_b32_e32 v5, 0x7e00
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT: s_movk_i32 s4, 0x40f
; SI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
; SI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v1
; SI-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
; SI-NEXT: s_movk_i32 s4, 0x7fff
; SI-NEXT: v_bfi_b32 v0, s4, v0, v2
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_copysign_out_f16_mag_f64_sign_f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_and_b32_e32 v3, 0x1ff, v1
; VI-NEXT: v_or_b32_e32 v0, v3, v0
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT: v_lshrrev_b32_e32 v3, 8, v1
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; VI-NEXT: v_and_b32_e32 v3, 0xffe, v3
; VI-NEXT: v_bfe_u32 v1, v1, 20, 11
; VI-NEXT: v_or_b32_e32 v0, v3, v0
; VI-NEXT: v_sub_u32_e32 v4, vcc, 0x3f1, v1
; VI-NEXT: v_or_b32_e32 v3, 0x1000, v0
; VI-NEXT: v_med3_i32 v4, v4, 0, 13
; VI-NEXT: v_lshrrev_b32_e32 v5, v4, v3
; VI-NEXT: v_lshlrev_b32_e32 v4, v4, v5
; VI-NEXT: v_cmp_ne_u32_e32 vcc, v4, v3
; VI-NEXT: s_movk_i32 s4, 0xfc10
; VI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; VI-NEXT: v_add_u32_e32 v1, vcc, s4, v1
; VI-NEXT: v_lshlrev_b32_e32 v4, 12, v1
; VI-NEXT: v_or_b32_e32 v3, v5, v3
; VI-NEXT: v_or_b32_e32 v4, v0, v4
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v1
; VI-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
; VI-NEXT: v_and_b32_e32 v4, 7, v3
; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4
; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4
; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; VI-NEXT: v_or_b32_e32 v4, v4, v5
; VI-NEXT: v_lshrrev_b32_e32 v3, 2, v3
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v4
; VI-NEXT: v_mov_b32_e32 v4, 0x7c00
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v1
; VI-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
; VI-NEXT: v_mov_b32_e32 v5, 0x7e00
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT: s_movk_i32 s4, 0x40f
; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v1
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
; VI-NEXT: s_movk_i32 s4, 0x7fff
; VI-NEXT: v_bfi_b32 v0, s4, v0, v2
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_out_f16_mag_f64_sign_f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_movk_i32 s4, 0x1ff
; GFX9-NEXT: v_and_or_b32 v0, v1, s4, v0
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v1
; GFX9-NEXT: s_movk_i32 s4, 0xffe
; GFX9-NEXT: v_bfe_u32 v1, v1, 20, 11
; GFX9-NEXT: v_and_or_b32 v0, v3, s4, v0
; GFX9-NEXT: v_sub_u32_e32 v4, 0x3f1, v1
; GFX9-NEXT: v_or_b32_e32 v3, 0x1000, v0
; GFX9-NEXT: v_med3_i32 v4, v4, 0, 13
; GFX9-NEXT: v_lshrrev_b32_e32 v5, v4, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v4, v4, v5
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v4, v3
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX9-NEXT: v_add_u32_e32 v1, 0xfffffc10, v1
; GFX9-NEXT: v_or_b32_e32 v3, v5, v3
; GFX9-NEXT: v_lshl_or_b32 v4, v1, 12, v0
; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
; GFX9-NEXT: v_and_b32_e32 v4, 7, v3
; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 5, v4
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v4
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX9-NEXT: v_or_b32_e32 v4, v4, v5
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 2, v3
; GFX9-NEXT: v_add_u32_e32 v3, v3, v4
; GFX9-NEXT: v_mov_b32_e32 v4, 0x7c00
; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 31, v1
; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
; GFX9-NEXT: v_mov_b32_e32 v5, 0x7e00
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX9-NEXT: s_movk_i32 s4, 0x40f
; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s4, v1
; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_copysign_out_f16_mag_f64_sign_f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_and_or_b32 v0, 0x1ff, v1, v0
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 8, v1
; GFX11-TRUE16-NEXT: v_bfe_u32 v1, v1, 20, 11
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v4, 0x3f1, v1
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v1, 0xfffffc10, v1
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_and_or_b32 v0, 0xffe, v3, v0
; GFX11-TRUE16-NEXT: v_med3_i32 v3, v4, 0, 13
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, 0x1000, v0
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v4
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v3, v5
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v3, v4
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v4, v1, 12, v0
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v3
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 7, v3
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 2, v3
; GFX11-TRUE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v4
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v4
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v4, v5
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v5, 0x7e00
; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, 0x7c00, v5 :: v_dual_add_nc_u32 v3, v3, v4
; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, 0x7c00, v3, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v1
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_copysign_out_f16_mag_f64_sign_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0x1ff, v1, v0
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 8, v1
; GFX11-FAKE16-NEXT: v_bfe_u32 v1, v1, 20, 11
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v4, 0x3f1, v1
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v1, 0xfffffc10, v1
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0xffe, v3, v0
; GFX11-FAKE16-NEXT: v_med3_i32 v3, v4, 0, 13
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, 0x1000, v0
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, v3, v5
; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v3, v4
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v4, v1, 12, v0
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v3, v5, v3
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 7, v3
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 2, v3
; GFX11-FAKE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v4
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v4
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v5
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v4
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, 0x7e00
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, 0x7c00, v3, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v4, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v2
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%mag.trunc = fptrunc double %mag to half
%result = call half @llvm.copysign.f16(half %mag.trunc, half %sign)
ret half %result
}
; Scalar (inreg) variant: the double magnitude is narrowed to half with fptrunc,
; then llvm.copysign.f16 applies the sign of the half %sign argument. The result
; is bitcast to i16 and returned from an amdgpu_ps function.
; The assertion lines inside the body are autogenerated (see the NOTE at the top
; of this file); regenerate them with that script rather than editing by hand.
; In the expected output the truncation shows up as an integer bit-manipulation
; sequence. The final sign merge is either scalar and/or ops with the masks
; 0x7fff and 0xffff8000 (tahiti block) or a v_bfi_b32 with mask 0x7fff followed
; by v_readfirstlane_b32 (remaining blocks).
define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, half inreg %sign) {
; SI-LABEL: s_copysign_out_f16_mag_f64_sign_f16:
; SI: ; %bb.0:
; SI-NEXT: s_and_b32 s3, s1, 0x1ff
; SI-NEXT: s_or_b32 s0, s3, s0
; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; SI-NEXT: s_lshr_b32 s0, s1, 8
; SI-NEXT: s_bfe_u32 s1, s1, 0xb0014
; SI-NEXT: s_and_b32 s0, s0, 0xffe
; SI-NEXT: v_readfirstlane_b32 s3, v0
; SI-NEXT: s_sub_i32 s4, 0x3f1, s1
; SI-NEXT: s_or_b32 s0, s0, s3
; SI-NEXT: v_med3_i32 v0, s4, 0, 13
; SI-NEXT: s_or_b32 s3, s0, 0x1000
; SI-NEXT: v_readfirstlane_b32 s4, v0
; SI-NEXT: s_lshr_b32 s5, s3, s4
; SI-NEXT: s_lshl_b32 s4, s5, s4
; SI-NEXT: s_cmp_lg_u32 s4, s3
; SI-NEXT: s_cselect_b32 s3, 1, 0
; SI-NEXT: s_addk_i32 s1, 0xfc10
; SI-NEXT: s_lshl_b32 s4, s1, 12
; SI-NEXT: s_or_b32 s3, s5, s3
; SI-NEXT: s_or_b32 s4, s0, s4
; SI-NEXT: s_cmp_lt_i32 s1, 1
; SI-NEXT: s_cselect_b32 s3, s3, s4
; SI-NEXT: s_and_b32 s4, s3, 7
; SI-NEXT: s_cmp_gt_i32 s4, 5
; SI-NEXT: s_cselect_b32 s5, 1, 0
; SI-NEXT: s_cmp_eq_u32 s4, 3
; SI-NEXT: s_cselect_b32 s4, 1, 0
; SI-NEXT: s_or_b32 s4, s4, s5
; SI-NEXT: s_lshr_b32 s3, s3, 2
; SI-NEXT: s_add_i32 s3, s3, s4
; SI-NEXT: s_cmp_lt_i32 s1, 31
; SI-NEXT: s_cselect_b32 s3, s3, 0x7c00
; SI-NEXT: s_cmp_lg_u32 s0, 0
; SI-NEXT: s_movk_i32 s0, 0x7e00
; SI-NEXT: s_cselect_b32 s0, s0, 0x7c00
; SI-NEXT: s_cmpk_eq_i32 s1, 0x40f
; SI-NEXT: s_cselect_b32 s0, s0, s3
; SI-NEXT: s_and_b32 s0, s0, 0x7fff
; SI-NEXT: s_and_b32 s1, s2, 0xffff8000
; SI-NEXT: s_or_b32 s0, s0, s1
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: s_copysign_out_f16_mag_f64_sign_f16:
; VI: ; %bb.0:
; VI-NEXT: s_and_b32 s3, s1, 0x1ff
; VI-NEXT: s_or_b32 s0, s3, s0
; VI-NEXT: s_cselect_b64 s[4:5], -1, 0
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; VI-NEXT: s_lshr_b32 s0, s1, 8
; VI-NEXT: s_bfe_u32 s1, s1, 0xb0014
; VI-NEXT: s_and_b32 s0, s0, 0xffe
; VI-NEXT: v_readfirstlane_b32 s3, v0
; VI-NEXT: s_sub_i32 s4, 0x3f1, s1
; VI-NEXT: s_or_b32 s0, s0, s3
; VI-NEXT: v_med3_i32 v0, s4, 0, 13
; VI-NEXT: s_or_b32 s3, s0, 0x1000
; VI-NEXT: v_readfirstlane_b32 s4, v0
; VI-NEXT: s_lshr_b32 s5, s3, s4
; VI-NEXT: s_lshl_b32 s4, s5, s4
; VI-NEXT: s_cmp_lg_u32 s4, s3
; VI-NEXT: s_cselect_b32 s3, 1, 0
; VI-NEXT: s_addk_i32 s1, 0xfc10
; VI-NEXT: s_lshl_b32 s4, s1, 12
; VI-NEXT: s_or_b32 s3, s5, s3
; VI-NEXT: s_or_b32 s4, s0, s4
; VI-NEXT: s_cmp_lt_i32 s1, 1
; VI-NEXT: s_cselect_b32 s3, s3, s4
; VI-NEXT: s_and_b32 s4, s3, 7
; VI-NEXT: s_cmp_gt_i32 s4, 5
; VI-NEXT: s_cselect_b32 s5, 1, 0
; VI-NEXT: s_cmp_eq_u32 s4, 3
; VI-NEXT: s_cselect_b32 s4, 1, 0
; VI-NEXT: s_or_b32 s4, s4, s5
; VI-NEXT: s_lshr_b32 s3, s3, 2
; VI-NEXT: s_add_i32 s3, s3, s4
; VI-NEXT: s_cmp_lt_i32 s1, 31
; VI-NEXT: s_cselect_b32 s3, s3, 0x7c00
; VI-NEXT: s_cmp_lg_u32 s0, 0
; VI-NEXT: s_movk_i32 s0, 0x7e00
; VI-NEXT: s_cselect_b32 s0, s0, 0x7c00
; VI-NEXT: s_cmpk_eq_i32 s1, 0x40f
; VI-NEXT: s_cselect_b32 s0, s0, s3
; VI-NEXT: s_movk_i32 s1, 0x7fff
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_bfi_b32 v0, s1, v0, v1
; VI-NEXT: v_readfirstlane_b32 s0, v0
; VI-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_copysign_out_f16_mag_f64_sign_f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_and_b32 s3, s1, 0x1ff
; GFX9-NEXT: s_or_b32 s0, s3, s0
; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX9-NEXT: s_lshr_b32 s0, s1, 8
; GFX9-NEXT: s_bfe_u32 s1, s1, 0xb0014
; GFX9-NEXT: s_and_b32 s0, s0, 0xffe
; GFX9-NEXT: v_readfirstlane_b32 s3, v0
; GFX9-NEXT: s_sub_i32 s4, 0x3f1, s1
; GFX9-NEXT: s_or_b32 s0, s0, s3
; GFX9-NEXT: v_med3_i32 v0, s4, 0, 13
; GFX9-NEXT: s_or_b32 s3, s0, 0x1000
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: s_lshr_b32 s5, s3, s4
; GFX9-NEXT: s_lshl_b32 s4, s5, s4
; GFX9-NEXT: s_cmp_lg_u32 s4, s3
; GFX9-NEXT: s_cselect_b32 s3, 1, 0
; GFX9-NEXT: s_addk_i32 s1, 0xfc10
; GFX9-NEXT: s_lshl_b32 s4, s1, 12
; GFX9-NEXT: s_or_b32 s3, s5, s3
; GFX9-NEXT: s_or_b32 s4, s0, s4
; GFX9-NEXT: s_cmp_lt_i32 s1, 1
; GFX9-NEXT: s_cselect_b32 s3, s3, s4
; GFX9-NEXT: s_and_b32 s4, s3, 7
; GFX9-NEXT: s_cmp_gt_i32 s4, 5
; GFX9-NEXT: s_cselect_b32 s5, 1, 0
; GFX9-NEXT: s_cmp_eq_u32 s4, 3
; GFX9-NEXT: s_cselect_b32 s4, 1, 0
; GFX9-NEXT: s_or_b32 s4, s4, s5
; GFX9-NEXT: s_lshr_b32 s3, s3, 2
; GFX9-NEXT: s_add_i32 s3, s3, s4
; GFX9-NEXT: s_cmp_lt_i32 s1, 31
; GFX9-NEXT: s_cselect_b32 s3, s3, 0x7c00
; GFX9-NEXT: s_cmp_lg_u32 s0, 0
; GFX9-NEXT: s_movk_i32 s0, 0x7e00
; GFX9-NEXT: s_cselect_b32 s0, s0, 0x7c00
; GFX9-NEXT: s_cmpk_eq_i32 s1, 0x40f
; GFX9-NEXT: s_cselect_b32 s0, s0, s3
; GFX9-NEXT: s_movk_i32 s1, 0x7fff
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: v_bfi_b32 v0, s1, v0, v1
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX11-TRUE16-LABEL: s_copysign_out_f16_mag_f64_sign_f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_and_b32 s3, s1, 0x1ff
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_or_b32 s0, s3, s0
; GFX11-TRUE16-NEXT: s_cselect_b32 s0, -1, 0
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-TRUE16-NEXT: s_bfe_u32 s0, s1, 0xb0014
; GFX11-TRUE16-NEXT: s_lshr_b32 s1, s1, 8
; GFX11-TRUE16-NEXT: s_sub_i32 s3, 0x3f1, s0
; GFX11-TRUE16-NEXT: s_and_b32 s1, s1, 0xffe
; GFX11-TRUE16-NEXT: v_med3_i32 v1, s3, 0, 13
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s3, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s4, v1
; GFX11-TRUE16-NEXT: s_or_b32 s1, s1, s3
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s2
; GFX11-TRUE16-NEXT: s_or_b32 s3, s1, 0x1000
; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s3, s4
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s5, s4
; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s4, s3
; GFX11-TRUE16-NEXT: s_cselect_b32 s3, 1, 0
; GFX11-TRUE16-NEXT: s_addk_i32 s0, 0xfc10
; GFX11-TRUE16-NEXT: s_or_b32 s3, s5, s3
; GFX11-TRUE16-NEXT: s_lshl_b32 s4, s0, 12
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_or_b32 s4, s1, s4
; GFX11-TRUE16-NEXT: s_cmp_lt_i32 s0, 1
; GFX11-TRUE16-NEXT: s_cselect_b32 s3, s3, s4
; GFX11-TRUE16-NEXT: s_and_b32 s4, s3, 7
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_cmp_gt_i32 s4, 5
; GFX11-TRUE16-NEXT: s_cselect_b32 s5, 1, 0
; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s4, 3
; GFX11-TRUE16-NEXT: s_cselect_b32 s4, 1, 0
; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s3, 2
; GFX11-TRUE16-NEXT: s_or_b32 s4, s4, s5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: s_add_i32 s3, s3, s4
; GFX11-TRUE16-NEXT: s_cmp_lt_i32 s0, 31
; GFX11-TRUE16-NEXT: s_movk_i32 s4, 0x7e00
; GFX11-TRUE16-NEXT: s_cselect_b32 s3, s3, 0x7c00
; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-TRUE16-NEXT: s_cselect_b32 s1, s4, 0x7c00
; GFX11-TRUE16-NEXT: s_cmpk_eq_i32 s0, 0x40f
; GFX11-TRUE16-NEXT: s_cselect_b32 s0, s1, s3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-TRUE16-NEXT: ; return to shader part epilog
;
; GFX11-FAKE16-LABEL: s_copysign_out_f16_mag_f64_sign_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_and_b32 s3, s1, 0x1ff
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_or_b32 s0, s3, s0
; GFX11-FAKE16-NEXT: s_cselect_b32 s0, -1, 0
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-FAKE16-NEXT: s_bfe_u32 s0, s1, 0xb0014
; GFX11-FAKE16-NEXT: s_lshr_b32 s1, s1, 8
; GFX11-FAKE16-NEXT: s_sub_i32 s3, 0x3f1, s0
; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 0xffe
; GFX11-FAKE16-NEXT: v_med3_i32 v1, s3, 0, 13
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s3, v0
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s2
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s4, v1
; GFX11-FAKE16-NEXT: s_or_b32 s1, s1, s3
; GFX11-FAKE16-NEXT: s_or_b32 s3, s1, 0x1000
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_lshr_b32 s5, s3, s4
; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s5, s4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s4, s3
; GFX11-FAKE16-NEXT: s_cselect_b32 s3, 1, 0
; GFX11-FAKE16-NEXT: s_addk_i32 s0, 0xfc10
; GFX11-FAKE16-NEXT: s_or_b32 s3, s5, s3
; GFX11-FAKE16-NEXT: s_lshl_b32 s4, s0, 12
; GFX11-FAKE16-NEXT: s_or_b32 s4, s1, s4
; GFX11-FAKE16-NEXT: s_cmp_lt_i32 s0, 1
; GFX11-FAKE16-NEXT: s_cselect_b32 s3, s3, s4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_and_b32 s4, s3, 7
; GFX11-FAKE16-NEXT: s_cmp_gt_i32 s4, 5
; GFX11-FAKE16-NEXT: s_cselect_b32 s5, 1, 0
; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s4, 3
; GFX11-FAKE16-NEXT: s_cselect_b32 s4, 1, 0
; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s3, 2
; GFX11-FAKE16-NEXT: s_or_b32 s4, s4, s5
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-FAKE16-NEXT: s_add_i32 s3, s3, s4
; GFX11-FAKE16-NEXT: s_cmp_lt_i32 s0, 31
; GFX11-FAKE16-NEXT: s_movk_i32 s4, 0x7e00
; GFX11-FAKE16-NEXT: s_cselect_b32 s3, s3, 0x7c00
; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s1, 0
; GFX11-FAKE16-NEXT: s_cselect_b32 s1, s4, 0x7c00
; GFX11-FAKE16-NEXT: s_cmpk_eq_i32 s0, 0x40f
; GFX11-FAKE16-NEXT: s_cselect_b32 s0, s1, s3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-FAKE16-NEXT: ; return to shader part epilog
; Narrow the magnitude to half before taking the sign from %sign.
%mag.trunc = fptrunc double %mag to half
%result = call half @llvm.copysign.f16(half %mag.trunc, half %sign)
; Return the raw half bits as an integer.
%cast = bitcast half %result to i16
ret i16 %cast
}
; Scalar (inreg) <2 x half> copysign; the result is bitcast to i32 for return.
; The assertion lines are autogenerated; regenerate them instead of hand-editing.
; Expected output: the tahiti block handles each 16-bit half separately with
; scalar and/bfe/or/shift ops, while the other blocks use one v_bfi_b32 with the
; packed mask 0x7fff7fff and read the result back with v_readfirstlane_b32.
define amdgpu_ps i32 @s_copysign_v2f16(<2 x half> inreg %arg_mag, <2 x half> inreg %arg_sign) {
; SI-LABEL: s_copysign_v2f16:
; SI: ; %bb.0:
; SI-NEXT: s_lshr_b32 s2, s1, 16
; SI-NEXT: s_and_b32 s3, s0, 0x7fff
; SI-NEXT: s_and_b32 s2, s2, 0x8000
; SI-NEXT: s_bfe_u32 s0, s0, 0xf0010
; SI-NEXT: s_and_b32 s1, s1, 0x8000
; SI-NEXT: s_or_b32 s0, s0, s2
; SI-NEXT: s_or_b32 s1, s3, s1
; SI-NEXT: s_lshl_b32 s0, s0, 16
; SI-NEXT: s_or_b32 s0, s1, s0
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: s_copysign_v2f16:
; VI: ; %bb.0:
; VI-NEXT: s_mov_b32 s2, 0x7fff7fff
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_bfi_b32 v0, s2, v0, v1
; VI-NEXT: v_readfirstlane_b32 s0, v0
; VI-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_copysign_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s2, 0x7fff7fff
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_bfi_b32 v0, s2, v0, v1
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_copysign_v2f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v0, s1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_bfi_b32 v0, 0x7fff7fff, s0, v0
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
; Both lanes are handled by a single vector copysign call.
%out = call <2 x half> @llvm.copysign.v2f16(<2 x half> %arg_mag, <2 x half> %arg_sign)
; Reinterpret the packed halves as one 32-bit integer for the return value.
%cast = bitcast <2 x half> %out to i32
ret i32 %cast
}
; Scalar (inreg) <3 x half> copysign; the result is returned as <3 x i16>.
; The assertion lines are autogenerated; regenerate them instead of hand-editing.
; Expected output: the tahiti block uses scalar and/bfe/or/shift ops only; the
; other blocks issue two v_bfi_b32 ops with mask 0x7fff7fff (one per 32-bit
; input register pair) and read both results back with v_readfirstlane_b32.
define amdgpu_ps <3 x i16> @s_copysign_v3f16(<3 x half> inreg %arg_mag, <3 x half> inreg %arg_sign) {
; SI-LABEL: s_copysign_v3f16:
; SI: ; %bb.0:
; SI-NEXT: s_and_b32 s4, s2, 0x8000
; SI-NEXT: s_lshr_b32 s2, s2, 16
; SI-NEXT: s_and_b32 s5, s0, 0x7fff
; SI-NEXT: s_and_b32 s2, s2, 0x8000
; SI-NEXT: s_bfe_u32 s0, s0, 0xf0010
; SI-NEXT: s_or_b32 s0, s0, s2
; SI-NEXT: s_or_b32 s4, s5, s4
; SI-NEXT: s_and_b32 s3, s3, 0x8000
; SI-NEXT: s_and_b32 s1, s1, 0x7fff
; SI-NEXT: s_lshl_b32 s0, s0, 16
; SI-NEXT: s_or_b32 s1, s1, s3
; SI-NEXT: s_or_b32 s0, s4, s0
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: s_copysign_v3f16:
; VI: ; %bb.0:
; VI-NEXT: s_mov_b32 s4, 0x7fff7fff
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_bfi_b32 v0, s4, v0, v1
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_bfi_b32 v1, s4, v1, v2
; VI-NEXT: v_readfirstlane_b32 s0, v1
; VI-NEXT: v_readfirstlane_b32 s1, v0
; VI-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_copysign_v3f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v2
; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_readfirstlane_b32 s1, v0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_copysign_v3f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_bfi_b32 v0, 0x7fff7fff, s0, v0
; GFX11-NEXT: v_bfi_b32 v1, 0x7fff7fff, s1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: v_readfirstlane_b32 s1, v1
; GFX11-NEXT: ; return to shader part epilog
; Odd-width vector case: three lanes through one copysign call.
%out = call <3 x half> @llvm.copysign.v3f16(<3 x half> %arg_mag, <3 x half> %arg_sign)
; Return the lanes as same-width integers.
%cast = bitcast <3 x half> %out to <3 x i16>
ret <3 x i16> %cast
}
; Scalar (inreg) <4 x half> copysign; the result is bitcast to <2 x i32>.
; The assertion lines are autogenerated; regenerate them instead of hand-editing.
; Expected output: the tahiti block uses scalar and/bfe/or/shift ops per 16-bit
; half; the other blocks issue two v_bfi_b32 ops with mask 0x7fff7fff and read
; both results back with v_readfirstlane_b32.
define amdgpu_ps <2 x i32> @s_copysign_v4f16(<4 x half> inreg %arg_mag, <4 x half> inreg %arg_sign) {
; SI-LABEL: s_copysign_v4f16:
; SI: ; %bb.0:
; SI-NEXT: s_lshr_b32 s5, s3, 16
; SI-NEXT: s_and_b32 s6, s1, 0x7fff
; SI-NEXT: s_and_b32 s5, s5, 0x8000
; SI-NEXT: s_bfe_u32 s1, s1, 0xf0010
; SI-NEXT: s_and_b32 s3, s3, 0x8000
; SI-NEXT: s_or_b32 s1, s1, s5
; SI-NEXT: s_or_b32 s3, s6, s3
; SI-NEXT: s_lshl_b32 s1, s1, 16
; SI-NEXT: s_lshr_b32 s4, s2, 16
; SI-NEXT: s_or_b32 s1, s3, s1
; SI-NEXT: s_and_b32 s2, s2, 0x8000
; SI-NEXT: s_and_b32 s3, s0, 0x7fff
; SI-NEXT: s_or_b32 s2, s3, s2
; SI-NEXT: s_and_b32 s3, s4, 0x8000
; SI-NEXT: s_bfe_u32 s0, s0, 0xf0010
; SI-NEXT: s_or_b32 s0, s0, s3
; SI-NEXT: s_lshl_b32 s0, s0, 16
; SI-NEXT: s_or_b32 s0, s2, s0
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: s_copysign_v4f16:
; VI: ; %bb.0:
; VI-NEXT: s_mov_b32 s4, 0x7fff7fff
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_bfi_b32 v0, s4, v0, v1
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_bfi_b32 v1, s4, v1, v2
; VI-NEXT: v_readfirstlane_b32 s0, v1
; VI-NEXT: v_readfirstlane_b32 s1, v0
; VI-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_copysign_v4f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v2
; GFX9-NEXT: v_readfirstlane_b32 s0, v1
; GFX9-NEXT: v_readfirstlane_b32 s1, v0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_copysign_v4f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_bfi_b32 v0, 0x7fff7fff, s0, v0
; GFX11-NEXT: v_bfi_b32 v1, 0x7fff7fff, s1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: v_readfirstlane_b32 s1, v1
; GFX11-NEXT: ; return to shader part epilog
; Four lanes through one copysign call.
%out = call <4 x half> @llvm.copysign.v4f16(<4 x half> %arg_mag, <4 x half> %arg_sign)
; Reinterpret the four halves as two 32-bit integers for the return value.
%cast = bitcast <4 x half> %out to <2 x i32>
ret <2 x i32> %cast
}
; Scalar (inreg) <8 x half> copysign; the result is bitcast to <4 x i32>.
; The assertion lines are autogenerated; regenerate them instead of hand-editing.
; Expected output: the tahiti block uses scalar and/bfe/or/shift ops per 16-bit
; half; the other blocks issue four v_bfi_b32 ops with mask 0x7fff7fff and read
; the four results back with v_readfirstlane_b32.
; NOTE(review): no `declare` for @llvm.copysign.v8f16 is visible in the header
; of this file; presumably the IR parser declares the intrinsic implicitly.
; Confirm before relying on it.
define amdgpu_ps <4 x i32> @s_copysign_v8f16(<8 x half> inreg %arg_mag, <8 x half> inreg %arg_sign) {
; SI-LABEL: s_copysign_v8f16:
; SI: ; %bb.0:
; SI-NEXT: s_lshr_b32 s11, s7, 16
; SI-NEXT: s_and_b32 s12, s3, 0x7fff
; SI-NEXT: s_and_b32 s11, s11, 0x8000
; SI-NEXT: s_bfe_u32 s3, s3, 0xf0010
; SI-NEXT: s_and_b32 s7, s7, 0x8000
; SI-NEXT: s_or_b32 s3, s3, s11
; SI-NEXT: s_or_b32 s7, s12, s7
; SI-NEXT: s_lshl_b32 s3, s3, 16
; SI-NEXT: s_lshr_b32 s10, s6, 16
; SI-NEXT: s_or_b32 s3, s7, s3
; SI-NEXT: s_and_b32 s6, s6, 0x8000
; SI-NEXT: s_and_b32 s7, s2, 0x7fff
; SI-NEXT: s_or_b32 s6, s7, s6
; SI-NEXT: s_and_b32 s7, s10, 0x8000
; SI-NEXT: s_bfe_u32 s2, s2, 0xf0010
; SI-NEXT: s_or_b32 s2, s2, s7
; SI-NEXT: s_lshl_b32 s2, s2, 16
; SI-NEXT: s_lshr_b32 s9, s5, 16
; SI-NEXT: s_or_b32 s2, s6, s2
; SI-NEXT: s_and_b32 s5, s5, 0x8000
; SI-NEXT: s_and_b32 s6, s1, 0x7fff
; SI-NEXT: s_or_b32 s5, s6, s5
; SI-NEXT: s_and_b32 s6, s9, 0x8000
; SI-NEXT: s_bfe_u32 s1, s1, 0xf0010
; SI-NEXT: s_or_b32 s1, s1, s6
; SI-NEXT: s_lshl_b32 s1, s1, 16
; SI-NEXT: s_lshr_b32 s8, s4, 16
; SI-NEXT: s_or_b32 s1, s5, s1
; SI-NEXT: s_and_b32 s4, s4, 0x8000
; SI-NEXT: s_and_b32 s5, s0, 0x7fff
; SI-NEXT: s_or_b32 s4, s5, s4
; SI-NEXT: s_and_b32 s5, s8, 0x8000
; SI-NEXT: s_bfe_u32 s0, s0, 0xf0010
; SI-NEXT: s_or_b32 s0, s0, s5
; SI-NEXT: s_lshl_b32 s0, s0, 16
; SI-NEXT: s_or_b32 s0, s4, s0
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: s_copysign_v8f16:
; VI: ; %bb.0:
; VI-NEXT: s_mov_b32 s8, 0x7fff7fff
; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_bfi_b32 v0, s8, v0, v1
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_bfi_b32 v1, s8, v1, v2
; VI-NEXT: v_mov_b32_e32 v2, s1
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_bfi_b32 v2, s8, v2, v3
; VI-NEXT: v_mov_b32_e32 v3, s0
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_bfi_b32 v3, s8, v3, v4
; VI-NEXT: v_readfirstlane_b32 s0, v3
; VI-NEXT: v_readfirstlane_b32 s1, v2
; VI-NEXT: v_readfirstlane_b32 s2, v1
; VI-NEXT: v_readfirstlane_b32 s3, v0
; VI-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_copysign_v8f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s8, 0x7fff7fff
; GFX9-NEXT: v_mov_b32_e32 v0, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_bfi_b32 v0, s8, v0, v1
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: v_mov_b32_e32 v2, s6
; GFX9-NEXT: v_bfi_b32 v1, s8, v1, v2
; GFX9-NEXT: v_mov_b32_e32 v2, s1
; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: v_bfi_b32 v2, s8, v2, v3
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: v_bfi_b32 v3, s8, v3, v4
; GFX9-NEXT: v_readfirstlane_b32 s0, v3
; GFX9-NEXT: v_readfirstlane_b32 s1, v2
; GFX9-NEXT: v_readfirstlane_b32 s2, v1
; GFX9-NEXT: v_readfirstlane_b32 s3, v0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_copysign_v8f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_bfi_b32 v0, 0x7fff7fff, s0, v0
; GFX11-NEXT: v_bfi_b32 v1, 0x7fff7fff, s1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_bfi_b32 v2, 0x7fff7fff, s2, v2
; GFX11-NEXT: v_bfi_b32 v3, 0x7fff7fff, s3, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: v_readfirstlane_b32 s1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_readfirstlane_b32 s2, v2
; GFX11-NEXT: v_readfirstlane_b32 s3, v3
; GFX11-NEXT: ; return to shader part epilog
; Eight lanes through one copysign call.
%out = call <8 x half> @llvm.copysign.v8f16(<8 x half> %arg_mag, <8 x half> %arg_sign)
; Reinterpret the eight halves as four 32-bit integers for the return value.
%cast = bitcast <8 x half> %out to <4 x i32>
ret <4 x i32> %cast
}
define amdgpu_ps <8 x i32> @s_copysign_v16f16(<16 x half> inreg %arg_mag, <16 x half> inreg %arg_sign) {
; SI-LABEL: s_copysign_v16f16:
; SI: ; %bb.0:
; SI-NEXT: s_lshr_b32 s23, s15, 16
; SI-NEXT: s_and_b32 s24, s7, 0x7fff
; SI-NEXT: s_and_b32 s23, s23, 0x8000
; SI-NEXT: s_bfe_u32 s7, s7, 0xf0010
; SI-NEXT: s_and_b32 s15, s15, 0x8000
; SI-NEXT: s_or_b32 s7, s7, s23
; SI-NEXT: s_or_b32 s15, s24, s15
; SI-NEXT: s_lshl_b32 s7, s7, 16
; SI-NEXT: s_lshr_b32 s22, s14, 16
; SI-NEXT: s_or_b32 s7, s15, s7
; SI-NEXT: s_and_b32 s14, s14, 0x8000
; SI-NEXT: s_and_b32 s15, s6, 0x7fff
; SI-NEXT: s_or_b32 s14, s15, s14
; SI-NEXT: s_and_b32 s15, s22, 0x8000
; SI-NEXT: s_bfe_u32 s6, s6, 0xf0010
; SI-NEXT: s_or_b32 s6, s6, s15
; SI-NEXT: s_lshl_b32 s6, s6, 16
; SI-NEXT: s_lshr_b32 s21, s13, 16
; SI-NEXT: s_or_b32 s6, s14, s6
; SI-NEXT: s_and_b32 s13, s13, 0x8000
; SI-NEXT: s_and_b32 s14, s5, 0x7fff
; SI-NEXT: s_or_b32 s13, s14, s13
; SI-NEXT: s_and_b32 s14, s21, 0x8000
; SI-NEXT: s_bfe_u32 s5, s5, 0xf0010
; SI-NEXT: s_or_b32 s5, s5, s14
; SI-NEXT: s_lshl_b32 s5, s5, 16
; SI-NEXT: s_lshr_b32 s20, s12, 16
; SI-NEXT: s_or_b32 s5, s13, s5
; SI-NEXT: s_and_b32 s12, s12, 0x8000
; SI-NEXT: s_and_b32 s13, s4, 0x7fff
; SI-NEXT: s_or_b32 s12, s13, s12
; SI-NEXT: s_and_b32 s13, s20, 0x8000
; SI-NEXT: s_bfe_u32 s4, s4, 0xf0010
; SI-NEXT: s_or_b32 s4, s4, s13
; SI-NEXT: s_lshl_b32 s4, s4, 16
; SI-NEXT: s_lshr_b32 s19, s11, 16
; SI-NEXT: s_or_b32 s4, s12, s4
; SI-NEXT: s_and_b32 s11, s11, 0x8000
; SI-NEXT: s_and_b32 s12, s3, 0x7fff
; SI-NEXT: s_or_b32 s11, s12, s11
; SI-NEXT: s_and_b32 s12, s19, 0x8000
; SI-NEXT: s_bfe_u32 s3, s3, 0xf0010
; SI-NEXT: s_or_b32 s3, s3, s12
; SI-NEXT: s_lshl_b32 s3, s3, 16
; SI-NEXT: s_lshr_b32 s18, s10, 16
; SI-NEXT: s_or_b32 s3, s11, s3
; SI-NEXT: s_and_b32 s10, s10, 0x8000
; SI-NEXT: s_and_b32 s11, s2, 0x7fff
; SI-NEXT: s_or_b32 s10, s11, s10
; SI-NEXT: s_and_b32 s11, s18, 0x8000
; SI-NEXT: s_bfe_u32 s2, s2, 0xf0010
; SI-NEXT: s_or_b32 s2, s2, s11
; SI-NEXT: s_lshl_b32 s2, s2, 16
; SI-NEXT: s_lshr_b32 s17, s9, 16
; SI-NEXT: s_or_b32 s2, s10, s2
; SI-NEXT: s_and_b32 s9, s9, 0x8000
; SI-NEXT: s_and_b32 s10, s1, 0x7fff
; SI-NEXT: s_or_b32 s9, s10, s9
; SI-NEXT: s_and_b32 s10, s17, 0x8000
; SI-NEXT: s_bfe_u32 s1, s1, 0xf0010
; SI-NEXT: s_or_b32 s1, s1, s10
; SI-NEXT: s_lshl_b32 s1, s1, 16
; SI-NEXT: s_lshr_b32 s16, s8, 16
; SI-NEXT: s_or_b32 s1, s9, s1
; SI-NEXT: s_and_b32 s8, s8, 0x8000
; SI-NEXT: s_and_b32 s9, s0, 0x7fff
; SI-NEXT: s_or_b32 s8, s9, s8
; SI-NEXT: s_and_b32 s9, s16, 0x8000
; SI-NEXT: s_bfe_u32 s0, s0, 0xf0010
; SI-NEXT: s_or_b32 s0, s0, s9
; SI-NEXT: s_lshl_b32 s0, s0, 16
; SI-NEXT: s_or_b32 s0, s8, s0
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: s_copysign_v16f16:
; VI: ; %bb.0:
; VI-NEXT: s_mov_b32 s16, 0x7fff7fff
; VI-NEXT: v_mov_b32_e32 v0, s7
; VI-NEXT: v_mov_b32_e32 v1, s15
; VI-NEXT: v_bfi_b32 v0, s16, v0, v1
; VI-NEXT: v_mov_b32_e32 v1, s6
; VI-NEXT: v_mov_b32_e32 v2, s14
; VI-NEXT: v_bfi_b32 v1, s16, v1, v2
; VI-NEXT: v_mov_b32_e32 v2, s5
; VI-NEXT: v_mov_b32_e32 v3, s13
; VI-NEXT: v_bfi_b32 v2, s16, v2, v3
; VI-NEXT: v_mov_b32_e32 v3, s4
; VI-NEXT: v_mov_b32_e32 v4, s12
; VI-NEXT: v_bfi_b32 v3, s16, v3, v4
; VI-NEXT: v_mov_b32_e32 v4, s3
; VI-NEXT: v_mov_b32_e32 v5, s11
; VI-NEXT: v_bfi_b32 v4, s16, v4, v5
; VI-NEXT: v_mov_b32_e32 v5, s2
; VI-NEXT: v_mov_b32_e32 v6, s10
; VI-NEXT: v_bfi_b32 v5, s16, v5, v6
; VI-NEXT: v_mov_b32_e32 v6, s1
; VI-NEXT: v_mov_b32_e32 v7, s9
; VI-NEXT: v_bfi_b32 v6, s16, v6, v7
; VI-NEXT: v_mov_b32_e32 v7, s0
; VI-NEXT: v_mov_b32_e32 v8, s8
; VI-NEXT: v_bfi_b32 v7, s16, v7, v8
; VI-NEXT: v_readfirstlane_b32 s0, v7
; VI-NEXT: v_readfirstlane_b32 s1, v6
; VI-NEXT: v_readfirstlane_b32 s2, v5
; VI-NEXT: v_readfirstlane_b32 s3, v4
; VI-NEXT: v_readfirstlane_b32 s4, v3
; VI-NEXT: v_readfirstlane_b32 s5, v2
; VI-NEXT: v_readfirstlane_b32 s6, v1
; VI-NEXT: v_readfirstlane_b32 s7, v0
; VI-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_copysign_v16f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mov_b32 s16, 0x7fff7fff
; GFX9-NEXT: v_mov_b32_e32 v0, s7
; GFX9-NEXT: v_mov_b32_e32 v1, s15
; GFX9-NEXT: v_bfi_b32 v0, s16, v0, v1
; GFX9-NEXT: v_mov_b32_e32 v1, s6
; GFX9-NEXT: v_mov_b32_e32 v2, s14
; GFX9-NEXT: v_bfi_b32 v1, s16, v1, v2
; GFX9-NEXT: v_mov_b32_e32 v2, s5
; GFX9-NEXT: v_mov_b32_e32 v3, s13
; GFX9-NEXT: v_bfi_b32 v2, s16, v2, v3
; GFX9-NEXT: v_mov_b32_e32 v3, s4
; GFX9-NEXT: v_mov_b32_e32 v4, s12
; GFX9-NEXT: v_bfi_b32 v3, s16, v3, v4
; GFX9-NEXT: v_mov_b32_e32 v4, s3
; GFX9-NEXT: v_mov_b32_e32 v5, s11
; GFX9-NEXT: v_bfi_b32 v4, s16, v4, v5
; GFX9-NEXT: v_mov_b32_e32 v5, s2
; GFX9-NEXT: v_mov_b32_e32 v6, s10
; GFX9-NEXT: v_bfi_b32 v5, s16, v5, v6
; GFX9-NEXT: v_mov_b32_e32 v6, s1
; GFX9-NEXT: v_mov_b32_e32 v7, s9
; GFX9-NEXT: v_bfi_b32 v6, s16, v6, v7
; GFX9-NEXT: v_mov_b32_e32 v7, s0
; GFX9-NEXT: v_mov_b32_e32 v8, s8
; GFX9-NEXT: v_bfi_b32 v7, s16, v7, v8
; GFX9-NEXT: v_readfirstlane_b32 s0, v7
; GFX9-NEXT: v_readfirstlane_b32 s1, v6
; GFX9-NEXT: v_readfirstlane_b32 s2, v5
; GFX9-NEXT: v_readfirstlane_b32 s3, v4
; GFX9-NEXT: v_readfirstlane_b32 s4, v3
; GFX9-NEXT: v_readfirstlane_b32 s5, v2
; GFX9-NEXT: v_readfirstlane_b32 s6, v1
; GFX9-NEXT: v_readfirstlane_b32 s7, v0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_copysign_v16f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_dual_mov_b32 v0, s15 :: v_dual_mov_b32 v1, s14
; GFX11-NEXT: v_dual_mov_b32 v2, s13 :: v_dual_mov_b32 v3, s8
; GFX11-NEXT: v_dual_mov_b32 v4, s9 :: v_dual_mov_b32 v5, s10
; GFX11-NEXT: v_dual_mov_b32 v6, s11 :: v_dual_mov_b32 v7, s12
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-NEXT: v_bfi_b32 v0, 0x7fff7fff, s7, v0
; GFX11-NEXT: v_bfi_b32 v1, 0x7fff7fff, s6, v1
; GFX11-NEXT: v_bfi_b32 v2, 0x7fff7fff, s5, v2
; GFX11-NEXT: v_bfi_b32 v3, 0x7fff7fff, s0, v3
; GFX11-NEXT: v_bfi_b32 v4, 0x7fff7fff, s1, v4
; GFX11-NEXT: v_bfi_b32 v5, 0x7fff7fff, s2, v5
; GFX11-NEXT: v_bfi_b32 v6, 0x7fff7fff, s3, v6
; GFX11-NEXT: v_bfi_b32 v7, 0x7fff7fff, s4, v7
; GFX11-NEXT: v_readfirstlane_b32 s0, v3
; GFX11-NEXT: v_readfirstlane_b32 s1, v4
; GFX11-NEXT: v_readfirstlane_b32 s2, v5
; GFX11-NEXT: v_readfirstlane_b32 s3, v6
; GFX11-NEXT: v_readfirstlane_b32 s4, v7
; GFX11-NEXT: v_readfirstlane_b32 s5, v2
; GFX11-NEXT: v_readfirstlane_b32 s6, v1
; GFX11-NEXT: v_readfirstlane_b32 s7, v0
; GFX11-NEXT: ; return to shader part epilog
%out = call <16 x half> @llvm.copysign.v16f16(<16 x half> %arg_mag, <16 x half> %arg_sign)
%cast = bitcast <16 x half> %out to <8 x i32>
ret <8 x i32> %cast
}
; copysign on <2 x half> with ordinary (non-inreg) arguments.
; Expected code: tahiti handles the low and high 16-bit lanes separately
; (and / bfe / or / shift), while tonga, gfx900 and gfx1100 use a single
; v_bfi_b32 with the packed mask 0x7fff7fff.
define <2 x half> @v_copysign_v2f16(<2 x half> %mag, <2 x half> %sign) {
; SI-LABEL: v_copysign_v2f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; SI-NEXT: v_and_b32_e32 v3, 0x7fff, v0
; SI-NEXT: v_and_b32_e32 v2, 0x8000, v2
; SI-NEXT: v_bfe_u32 v0, v0, 16, 15
; SI-NEXT: v_and_b32_e32 v1, 0x8000, v1
; SI-NEXT: v_or_b32_e32 v0, v0, v2
; SI-NEXT: v_or_b32_e32 v1, v3, v1
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_copysign_v2f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_mov_b32 s4, 0x7fff7fff
; VI-NEXT: v_bfi_b32 v0, s4, v0, v1
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff
; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_copysign_v2f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call <2 x half> @llvm.copysign.v2f16(<2 x half> %mag, <2 x half> %sign)
ret <2 x half> %result
}
; copysign on <3 x half> with ordinary (non-inreg) arguments.
; Expected code: tonga, gfx900 and gfx1100 emit two v_bfi_b32 with the packed
; mask 0x7fff7fff (v0 with v2, v1 with v3). Tahiti combines v1 with v3 using
; 16-bit masks only, and processes both 16-bit lanes of the v0/v2 pair.
define <3 x half> @v_copysign_v3f16(<3 x half> %mag, <3 x half> %sign) {
; SI-LABEL: v_copysign_v3f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_and_b32_e32 v3, 0x8000, v3
; SI-NEXT: v_and_b32_e32 v1, 0x7fff, v1
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_and_b32_e32 v3, 0x8000, v2
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; SI-NEXT: v_and_b32_e32 v4, 0x7fff, v0
; SI-NEXT: v_and_b32_e32 v2, 0x8000, v2
; SI-NEXT: v_bfe_u32 v0, v0, 16, 15
; SI-NEXT: v_or_b32_e32 v0, v0, v2
; SI-NEXT: v_or_b32_e32 v3, v4, v3
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_or_b32_e32 v0, v3, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_copysign_v3f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_mov_b32 s4, 0x7fff7fff
; VI-NEXT: v_bfi_b32 v0, s4, v0, v2
; VI-NEXT: v_bfi_b32 v1, s4, v1, v3
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_v3f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff
; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v2
; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_copysign_v3f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v2
; GFX11-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call <3 x half> @llvm.copysign.v3f16(<3 x half> %mag, <3 x half> %sign)
ret <3 x half> %result
}
; copysign on <4 x half> with ordinary (non-inreg) arguments.
; Expected code: tonga, gfx900 and gfx1100 emit two v_bfi_b32 with the packed
; mask 0x7fff7fff (v0 with v2, v1 with v3); tahiti processes each 16-bit lane
; with and / bfe / or / shift and then re-packs the two result dwords.
define <4 x half> @v_copysign_v4f16(<4 x half> %mag, <4 x half> %sign) {
; SI-LABEL: v_copysign_v4f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v2
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3
; SI-NEXT: v_and_b32_e32 v3, 0x8000, v3
; SI-NEXT: v_and_b32_e32 v6, 0x7fff, v1
; SI-NEXT: v_or_b32_e32 v3, v6, v3
; SI-NEXT: v_and_b32_e32 v6, 0x7fff, v0
; SI-NEXT: v_and_b32_e32 v5, 0x8000, v5
; SI-NEXT: v_bfe_u32 v1, v1, 16, 15
; SI-NEXT: v_and_b32_e32 v4, 0x8000, v4
; SI-NEXT: v_bfe_u32 v0, v0, 16, 15
; SI-NEXT: v_and_b32_e32 v2, 0x8000, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v5
; SI-NEXT: v_or_b32_e32 v0, v0, v4
; SI-NEXT: v_or_b32_e32 v2, v6, v2
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_or_b32_e32 v0, v2, v0
; SI-NEXT: v_or_b32_e32 v1, v3, v1
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_copysign_v4f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_mov_b32 s4, 0x7fff7fff
; VI-NEXT: v_bfi_b32 v0, s4, v0, v2
; VI-NEXT: v_bfi_b32 v1, s4, v1, v3
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_v4f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff
; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v2
; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_copysign_v4f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v2
; GFX11-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x half> @llvm.copysign.v4f16(<4 x half> %mag, <4 x half> %sign)
ret <4 x half> %result
}
; copysign on <8 x half> with ordinary (non-inreg) arguments.
; Expected code: tonga, gfx900 and gfx1100 emit four v_bfi_b32 with the packed
; mask 0x7fff7fff, pairing v0..v3 with v4..v7; tahiti processes each 16-bit
; lane separately and re-packs the four result dwords.
define <8 x half> @v_copysign_v8f16(<8 x half> %mag, <8 x half> %sign) {
; SI-LABEL: v_copysign_v8f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v7
; SI-NEXT: v_and_b32_e32 v7, 0x8000, v7
; SI-NEXT: v_and_b32_e32 v12, 0x7fff, v3
; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v6
; SI-NEXT: v_or_b32_e32 v7, v12, v7
; SI-NEXT: v_and_b32_e32 v6, 0x8000, v6
; SI-NEXT: v_and_b32_e32 v12, 0x7fff, v2
; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v4
; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v5
; SI-NEXT: v_or_b32_e32 v6, v12, v6
; SI-NEXT: v_and_b32_e32 v5, 0x8000, v5
; SI-NEXT: v_and_b32_e32 v12, 0x7fff, v1
; SI-NEXT: v_or_b32_e32 v5, v12, v5
; SI-NEXT: v_and_b32_e32 v12, 0x7fff, v0
; SI-NEXT: v_and_b32_e32 v11, 0x8000, v11
; SI-NEXT: v_bfe_u32 v3, v3, 16, 15
; SI-NEXT: v_and_b32_e32 v10, 0x8000, v10
; SI-NEXT: v_bfe_u32 v2, v2, 16, 15
; SI-NEXT: v_and_b32_e32 v9, 0x8000, v9
; SI-NEXT: v_bfe_u32 v1, v1, 16, 15
; SI-NEXT: v_and_b32_e32 v8, 0x8000, v8
; SI-NEXT: v_bfe_u32 v0, v0, 16, 15
; SI-NEXT: v_and_b32_e32 v4, 0x8000, v4
; SI-NEXT: v_or_b32_e32 v3, v3, v11
; SI-NEXT: v_or_b32_e32 v2, v2, v10
; SI-NEXT: v_or_b32_e32 v1, v1, v9
; SI-NEXT: v_or_b32_e32 v0, v0, v8
; SI-NEXT: v_or_b32_e32 v4, v12, v4
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v0, v4, v0
; SI-NEXT: v_or_b32_e32 v1, v5, v1
; SI-NEXT: v_or_b32_e32 v2, v6, v2
; SI-NEXT: v_or_b32_e32 v3, v7, v3
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_copysign_v8f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_mov_b32 s4, 0x7fff7fff
; VI-NEXT: v_bfi_b32 v0, s4, v0, v4
; VI-NEXT: v_bfi_b32 v1, s4, v1, v5
; VI-NEXT: v_bfi_b32 v2, s4, v2, v6
; VI-NEXT: v_bfi_b32 v3, s4, v3, v7
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_v8f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff
; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v4
; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v5
; GFX9-NEXT: v_bfi_b32 v2, s4, v2, v6
; GFX9-NEXT: v_bfi_b32 v3, s4, v3, v7
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_copysign_v8f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v4
; GFX11-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v5
; GFX11-NEXT: v_bfi_b32 v2, 0x7fff7fff, v2, v6
; GFX11-NEXT: v_bfi_b32 v3, 0x7fff7fff, v3, v7
; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call <8 x half> @llvm.copysign.v8f16(<8 x half> %mag, <8 x half> %sign)
ret <8 x half> %result
}
; copysign on <16 x half> with ordinary (non-inreg) arguments.
; Expected code: tonga, gfx900 and gfx1100 emit eight v_bfi_b32 with the packed
; mask 0x7fff7fff, pairing v0..v7 with v8..v15; tahiti processes each 16-bit
; lane separately, using v16..v24 as temporaries, and re-packs eight dwords.
define <16 x half> @v_copysign_v16f16(<16 x half> %mag, <16 x half> %sign) {
; SI-LABEL: v_copysign_v16f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_and_b32_e32 v16, 0x8000, v15
; SI-NEXT: v_and_b32_e32 v17, 0x7fff, v7
; SI-NEXT: v_or_b32_e32 v16, v17, v16
; SI-NEXT: v_and_b32_e32 v17, 0x8000, v14
; SI-NEXT: v_and_b32_e32 v18, 0x7fff, v6
; SI-NEXT: v_or_b32_e32 v17, v18, v17
; SI-NEXT: v_and_b32_e32 v18, 0x8000, v13
; SI-NEXT: v_and_b32_e32 v19, 0x7fff, v5
; SI-NEXT: v_or_b32_e32 v18, v19, v18
; SI-NEXT: v_and_b32_e32 v19, 0x8000, v12
; SI-NEXT: v_and_b32_e32 v20, 0x7fff, v4
; SI-NEXT: v_or_b32_e32 v19, v20, v19
; SI-NEXT: v_and_b32_e32 v20, 0x8000, v11
; SI-NEXT: v_and_b32_e32 v21, 0x7fff, v3
; SI-NEXT: v_or_b32_e32 v20, v21, v20
; SI-NEXT: v_and_b32_e32 v21, 0x8000, v10
; SI-NEXT: v_and_b32_e32 v22, 0x7fff, v2
; SI-NEXT: v_or_b32_e32 v21, v22, v21
; SI-NEXT: v_and_b32_e32 v22, 0x8000, v9
; SI-NEXT: v_and_b32_e32 v23, 0x7fff, v1
; SI-NEXT: v_or_b32_e32 v22, v23, v22
; SI-NEXT: v_and_b32_e32 v23, 0x8000, v8
; SI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
; SI-NEXT: v_lshrrev_b32_e32 v14, 16, v14
; SI-NEXT: v_lshrrev_b32_e32 v13, 16, v13
; SI-NEXT: v_lshrrev_b32_e32 v12, 16, v12
; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v11
; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v10
; SI-NEXT: v_lshrrev_b32_e32 v9, 16, v9
; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; SI-NEXT: v_and_b32_e32 v24, 0x7fff, v0
; SI-NEXT: v_and_b32_e32 v15, 0x8000, v15
; SI-NEXT: v_bfe_u32 v7, v7, 16, 15
; SI-NEXT: v_and_b32_e32 v14, 0x8000, v14
; SI-NEXT: v_bfe_u32 v6, v6, 16, 15
; SI-NEXT: v_and_b32_e32 v13, 0x8000, v13
; SI-NEXT: v_bfe_u32 v5, v5, 16, 15
; SI-NEXT: v_and_b32_e32 v12, 0x8000, v12
; SI-NEXT: v_bfe_u32 v4, v4, 16, 15
; SI-NEXT: v_and_b32_e32 v11, 0x8000, v11
; SI-NEXT: v_bfe_u32 v3, v3, 16, 15
; SI-NEXT: v_and_b32_e32 v10, 0x8000, v10
; SI-NEXT: v_bfe_u32 v2, v2, 16, 15
; SI-NEXT: v_and_b32_e32 v9, 0x8000, v9
; SI-NEXT: v_bfe_u32 v1, v1, 16, 15
; SI-NEXT: v_and_b32_e32 v8, 0x8000, v8
; SI-NEXT: v_bfe_u32 v0, v0, 16, 15
; SI-NEXT: v_or_b32_e32 v7, v7, v15
; SI-NEXT: v_or_b32_e32 v6, v6, v14
; SI-NEXT: v_or_b32_e32 v5, v5, v13
; SI-NEXT: v_or_b32_e32 v4, v4, v12
; SI-NEXT: v_or_b32_e32 v3, v3, v11
; SI-NEXT: v_or_b32_e32 v2, v2, v10
; SI-NEXT: v_or_b32_e32 v1, v1, v9
; SI-NEXT: v_or_b32_e32 v0, v0, v8
; SI-NEXT: v_or_b32_e32 v23, v24, v23
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; SI-NEXT: v_or_b32_e32 v0, v23, v0
; SI-NEXT: v_or_b32_e32 v1, v22, v1
; SI-NEXT: v_or_b32_e32 v2, v21, v2
; SI-NEXT: v_or_b32_e32 v3, v20, v3
; SI-NEXT: v_or_b32_e32 v4, v19, v4
; SI-NEXT: v_or_b32_e32 v5, v18, v5
; SI-NEXT: v_or_b32_e32 v6, v17, v6
; SI-NEXT: v_or_b32_e32 v7, v16, v7
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_copysign_v16f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_mov_b32 s4, 0x7fff7fff
; VI-NEXT: v_bfi_b32 v0, s4, v0, v8
; VI-NEXT: v_bfi_b32 v1, s4, v1, v9
; VI-NEXT: v_bfi_b32 v2, s4, v2, v10
; VI-NEXT: v_bfi_b32 v3, s4, v3, v11
; VI-NEXT: v_bfi_b32 v4, s4, v4, v12
; VI-NEXT: v_bfi_b32 v5, s4, v5, v13
; VI-NEXT: v_bfi_b32 v6, s4, v6, v14
; VI-NEXT: v_bfi_b32 v7, s4, v7, v15
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_v16f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff
; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v8
; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v9
; GFX9-NEXT: v_bfi_b32 v2, s4, v2, v10
; GFX9-NEXT: v_bfi_b32 v3, s4, v3, v11
; GFX9-NEXT: v_bfi_b32 v4, s4, v4, v12
; GFX9-NEXT: v_bfi_b32 v5, s4, v5, v13
; GFX9-NEXT: v_bfi_b32 v6, s4, v6, v14
; GFX9-NEXT: v_bfi_b32 v7, s4, v7, v15
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_copysign_v16f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v8
; GFX11-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v9
; GFX11-NEXT: v_bfi_b32 v2, 0x7fff7fff, v2, v10
; GFX11-NEXT: v_bfi_b32 v3, 0x7fff7fff, v3, v11
; GFX11-NEXT: v_bfi_b32 v4, 0x7fff7fff, v4, v12
; GFX11-NEXT: v_bfi_b32 v5, 0x7fff7fff, v5, v13
; GFX11-NEXT: v_bfi_b32 v6, 0x7fff7fff, v6, v14
; GFX11-NEXT: v_bfi_b32 v7, 0x7fff7fff, v7, v15
; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call <16 x half> @llvm.copysign.v16f16(<16 x half> %mag, <16 x half> %sign)
ret <16 x half> %result
}
; copysign on <32 x half> with ordinary (non-inreg) arguments.
; NOTE(review): the function name says "v32f32", but every operand is
; <32 x half> and the intrinsic called is llvm.copysign.v32f16; the name looks
; like a typo for "v32f16". It is kept because the label checks depend on it.
; Expected code: the last sign dword is not in a register on entry; every
; target loads it from the stack slot addressed by s32 and waits on vmcnt
; before using it. Tonga, gfx900 and gfx1100 emit sixteen v_bfi_b32 with the
; packed mask 0x7fff7fff. Tahiti processes each 16-bit lane separately and
; additionally spills and reloads v40..v47 and v56..v59 around the sequence.
define <32 x half> @v_copysign_v32f32(<32 x half> %mag, <32 x half> %sign) {
; SI-LABEL: v_copysign_v32f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; SI-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; SI-NEXT: s_waitcnt expcnt(1)
; SI-NEXT: v_and_b32_e32 v58, 0x8000, v17
; SI-NEXT: v_lshrrev_b32_e32 v17, 16, v17
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v59, 0x7fff, v1
; SI-NEXT: v_and_b32_e32 v17, 0x8000, v17
; SI-NEXT: v_bfe_u32 v1, v1, 16, 15
; SI-NEXT: v_and_b32_e32 v38, 0x8000, v27
; SI-NEXT: v_and_b32_e32 v39, 0x7fff, v11
; SI-NEXT: v_and_b32_e32 v48, 0x8000, v26
; SI-NEXT: v_and_b32_e32 v49, 0x7fff, v10
; SI-NEXT: v_and_b32_e32 v50, 0x8000, v25
; SI-NEXT: v_and_b32_e32 v51, 0x7fff, v9
; SI-NEXT: v_and_b32_e32 v40, 0x8000, v22
; SI-NEXT: v_and_b32_e32 v41, 0x7fff, v6
; SI-NEXT: v_or_b32_e32 v1, v1, v17
; SI-NEXT: v_and_b32_e32 v52, 0x8000, v24
; SI-NEXT: v_and_b32_e32 v53, 0x7fff, v8
; SI-NEXT: v_and_b32_e32 v54, 0x8000, v23
; SI-NEXT: v_and_b32_e32 v55, 0x7fff, v7
; SI-NEXT: v_and_b32_e32 v42, 0x8000, v21
; SI-NEXT: v_and_b32_e32 v43, 0x7fff, v5
; SI-NEXT: v_and_b32_e32 v44, 0x8000, v20
; SI-NEXT: v_and_b32_e32 v45, 0x7fff, v4
; SI-NEXT: v_and_b32_e32 v46, 0x8000, v19
; SI-NEXT: v_and_b32_e32 v47, 0x7fff, v3
; SI-NEXT: v_and_b32_e32 v56, 0x8000, v18
; SI-NEXT: v_and_b32_e32 v57, 0x7fff, v2
; SI-NEXT: v_or_b32_e32 v38, v39, v38
; SI-NEXT: v_or_b32_e32 v39, v49, v48
; SI-NEXT: v_or_b32_e32 v48, v51, v50
; SI-NEXT: v_or_b32_e32 v51, v41, v40
; SI-NEXT: v_or_b32_e32 v40, v59, v58
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v49, v53, v52
; SI-NEXT: v_or_b32_e32 v50, v55, v54
; SI-NEXT: v_or_b32_e32 v52, v43, v42
; SI-NEXT: v_or_b32_e32 v53, v45, v44
; SI-NEXT: v_or_b32_e32 v54, v47, v46
; SI-NEXT: v_or_b32_e32 v55, v57, v56
; SI-NEXT: v_or_b32_e32 v1, v40, v1
; SI-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
; SI-NEXT: buffer_load_dword v31, off, s[0:3], s32
; SI-NEXT: v_and_b32_e32 v32, 0x8000, v30
; SI-NEXT: v_and_b32_e32 v33, 0x7fff, v14
; SI-NEXT: v_and_b32_e32 v34, 0x8000, v29
; SI-NEXT: v_and_b32_e32 v35, 0x7fff, v13
; SI-NEXT: v_or_b32_e32 v32, v33, v32
; SI-NEXT: v_and_b32_e32 v33, 0x8000, v16
; SI-NEXT: v_or_b32_e32 v34, v35, v34
; SI-NEXT: v_and_b32_e32 v35, 0x7fff, v0
; SI-NEXT: v_and_b32_e32 v36, 0x8000, v28
; SI-NEXT: v_and_b32_e32 v37, 0x7fff, v12
; SI-NEXT: v_or_b32_e32 v33, v35, v33
; SI-NEXT: v_lshrrev_b32_e32 v30, 16, v30
; SI-NEXT: v_lshrrev_b32_e32 v29, 16, v29
; SI-NEXT: v_lshrrev_b32_e32 v28, 16, v28
; SI-NEXT: v_lshrrev_b32_e32 v27, 16, v27
; SI-NEXT: v_lshrrev_b32_e32 v26, 16, v26
; SI-NEXT: v_lshrrev_b32_e32 v25, 16, v25
; SI-NEXT: v_lshrrev_b32_e32 v24, 16, v24
; SI-NEXT: v_lshrrev_b32_e32 v23, 16, v23
; SI-NEXT: v_lshrrev_b32_e32 v22, 16, v22
; SI-NEXT: v_lshrrev_b32_e32 v21, 16, v21
; SI-NEXT: v_lshrrev_b32_e32 v20, 16, v20
; SI-NEXT: v_lshrrev_b32_e32 v19, 16, v19
; SI-NEXT: v_lshrrev_b32_e32 v18, 16, v18
; SI-NEXT: v_lshrrev_b32_e32 v16, 16, v16
; SI-NEXT: v_or_b32_e32 v36, v37, v36
; SI-NEXT: v_and_b32_e32 v37, 0x7fff, v15
; SI-NEXT: v_bfe_u32 v15, v15, 16, 15
; SI-NEXT: v_and_b32_e32 v30, 0x8000, v30
; SI-NEXT: v_bfe_u32 v14, v14, 16, 15
; SI-NEXT: v_and_b32_e32 v29, 0x8000, v29
; SI-NEXT: v_bfe_u32 v13, v13, 16, 15
; SI-NEXT: v_and_b32_e32 v28, 0x8000, v28
; SI-NEXT: v_bfe_u32 v12, v12, 16, 15
; SI-NEXT: v_and_b32_e32 v27, 0x8000, v27
; SI-NEXT: v_bfe_u32 v11, v11, 16, 15
; SI-NEXT: v_and_b32_e32 v26, 0x8000, v26
; SI-NEXT: v_bfe_u32 v10, v10, 16, 15
; SI-NEXT: v_and_b32_e32 v25, 0x8000, v25
; SI-NEXT: v_bfe_u32 v9, v9, 16, 15
; SI-NEXT: v_and_b32_e32 v24, 0x8000, v24
; SI-NEXT: v_bfe_u32 v8, v8, 16, 15
; SI-NEXT: v_and_b32_e32 v23, 0x8000, v23
; SI-NEXT: v_bfe_u32 v7, v7, 16, 15
; SI-NEXT: v_and_b32_e32 v22, 0x8000, v22
; SI-NEXT: v_bfe_u32 v6, v6, 16, 15
; SI-NEXT: v_and_b32_e32 v21, 0x8000, v21
; SI-NEXT: v_bfe_u32 v5, v5, 16, 15
; SI-NEXT: v_and_b32_e32 v20, 0x8000, v20
; SI-NEXT: v_bfe_u32 v4, v4, 16, 15
; SI-NEXT: v_and_b32_e32 v19, 0x8000, v19
; SI-NEXT: v_bfe_u32 v3, v3, 16, 15
; SI-NEXT: v_and_b32_e32 v18, 0x8000, v18
; SI-NEXT: v_bfe_u32 v2, v2, 16, 15
; SI-NEXT: v_and_b32_e32 v16, 0x8000, v16
; SI-NEXT: v_bfe_u32 v0, v0, 16, 15
; SI-NEXT: v_or_b32_e32 v14, v14, v30
; SI-NEXT: v_or_b32_e32 v13, v13, v29
; SI-NEXT: v_or_b32_e32 v12, v12, v28
; SI-NEXT: v_or_b32_e32 v11, v11, v27
; SI-NEXT: v_or_b32_e32 v10, v10, v26
; SI-NEXT: v_or_b32_e32 v9, v9, v25
; SI-NEXT: v_or_b32_e32 v8, v8, v24
; SI-NEXT: v_or_b32_e32 v7, v7, v23
; SI-NEXT: v_or_b32_e32 v6, v6, v22
; SI-NEXT: v_or_b32_e32 v5, v5, v21
; SI-NEXT: v_or_b32_e32 v4, v4, v20
; SI-NEXT: v_or_b32_e32 v3, v3, v19
; SI-NEXT: v_or_b32_e32 v2, v2, v18
; SI-NEXT: v_or_b32_e32 v0, v0, v16
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; SI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; SI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; SI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v35, 0x8000, v31
; SI-NEXT: v_lshrrev_b32_e32 v31, 16, v31
; SI-NEXT: v_and_b32_e32 v31, 0x8000, v31
; SI-NEXT: v_or_b32_e32 v15, v15, v31
; SI-NEXT: v_or_b32_e32 v35, v37, v35
; SI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; SI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
; SI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; SI-NEXT: v_or_b32_e32 v0, v33, v0
; SI-NEXT: v_or_b32_e32 v2, v55, v2
; SI-NEXT: v_or_b32_e32 v3, v54, v3
; SI-NEXT: v_or_b32_e32 v4, v53, v4
; SI-NEXT: v_or_b32_e32 v5, v52, v5
; SI-NEXT: v_or_b32_e32 v6, v51, v6
; SI-NEXT: v_or_b32_e32 v7, v50, v7
; SI-NEXT: v_or_b32_e32 v8, v49, v8
; SI-NEXT: v_or_b32_e32 v9, v48, v9
; SI-NEXT: v_or_b32_e32 v10, v39, v10
; SI-NEXT: v_or_b32_e32 v11, v38, v11
; SI-NEXT: v_or_b32_e32 v12, v36, v12
; SI-NEXT: v_or_b32_e32 v13, v34, v13
; SI-NEXT: v_or_b32_e32 v14, v32, v14
; SI-NEXT: v_or_b32_e32 v15, v35, v15
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_copysign_v32f32:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_mov_b32 s4, 0x7fff7fff
; VI-NEXT: v_bfi_b32 v0, s4, v0, v16
; VI-NEXT: buffer_load_dword v16, off, s[0:3], s32
; VI-NEXT: v_bfi_b32 v1, s4, v1, v17
; VI-NEXT: v_bfi_b32 v2, s4, v2, v18
; VI-NEXT: v_bfi_b32 v3, s4, v3, v19
; VI-NEXT: v_bfi_b32 v4, s4, v4, v20
; VI-NEXT: v_bfi_b32 v5, s4, v5, v21
; VI-NEXT: v_bfi_b32 v6, s4, v6, v22
; VI-NEXT: v_bfi_b32 v7, s4, v7, v23
; VI-NEXT: v_bfi_b32 v8, s4, v8, v24
; VI-NEXT: v_bfi_b32 v9, s4, v9, v25
; VI-NEXT: v_bfi_b32 v10, s4, v10, v26
; VI-NEXT: v_bfi_b32 v11, s4, v11, v27
; VI-NEXT: v_bfi_b32 v12, s4, v12, v28
; VI-NEXT: v_bfi_b32 v13, s4, v13, v29
; VI-NEXT: v_bfi_b32 v14, s4, v14, v30
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_bfi_b32 v15, s4, v15, v16
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_v32f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff
; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v16
; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32
; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v17
; GFX9-NEXT: v_bfi_b32 v2, s4, v2, v18
; GFX9-NEXT: v_bfi_b32 v3, s4, v3, v19
; GFX9-NEXT: v_bfi_b32 v4, s4, v4, v20
; GFX9-NEXT: v_bfi_b32 v5, s4, v5, v21
; GFX9-NEXT: v_bfi_b32 v6, s4, v6, v22
; GFX9-NEXT: v_bfi_b32 v7, s4, v7, v23
; GFX9-NEXT: v_bfi_b32 v8, s4, v8, v24
; GFX9-NEXT: v_bfi_b32 v9, s4, v9, v25
; GFX9-NEXT: v_bfi_b32 v10, s4, v10, v26
; GFX9-NEXT: v_bfi_b32 v11, s4, v11, v27
; GFX9-NEXT: v_bfi_b32 v12, s4, v12, v28
; GFX9-NEXT: v_bfi_b32 v13, s4, v13, v29
; GFX9-NEXT: v_bfi_b32 v14, s4, v14, v30
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_bfi_b32 v15, s4, v15, v16
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_copysign_v32f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: scratch_load_b32 v31, off, s32
; GFX11-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v16
; GFX11-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v17
; GFX11-NEXT: v_bfi_b32 v2, 0x7fff7fff, v2, v18
; GFX11-NEXT: v_bfi_b32 v3, 0x7fff7fff, v3, v19
; GFX11-NEXT: v_bfi_b32 v4, 0x7fff7fff, v4, v20
; GFX11-NEXT: v_bfi_b32 v5, 0x7fff7fff, v5, v21
; GFX11-NEXT: v_bfi_b32 v6, 0x7fff7fff, v6, v22
; GFX11-NEXT: v_bfi_b32 v7, 0x7fff7fff, v7, v23
; GFX11-NEXT: v_bfi_b32 v8, 0x7fff7fff, v8, v24
; GFX11-NEXT: v_bfi_b32 v9, 0x7fff7fff, v9, v25
; GFX11-NEXT: v_bfi_b32 v10, 0x7fff7fff, v10, v26
; GFX11-NEXT: v_bfi_b32 v11, 0x7fff7fff, v11, v27
; GFX11-NEXT: v_bfi_b32 v12, 0x7fff7fff, v12, v28
; GFX11-NEXT: v_bfi_b32 v13, 0x7fff7fff, v13, v29
; GFX11-NEXT: v_bfi_b32 v14, 0x7fff7fff, v14, v30
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_bfi_b32 v15, 0x7fff7fff, v15, v31
; GFX11-NEXT: s_setpc_b64 s[30:31]
%result = call <32 x half> @llvm.copysign.v32f16(<32 x half> %mag, <32 x half> %sign)
ret <32 x half> %result
}
; Mixed-width copysign with inreg arguments: the half magnitude is extended to
; float, then combined with a float sign; the float result is returned as i32.
; Expected code on every target: v_cvt_f32_f16 on the magnitude (s0), one
; v_bfi_b32 against the sign (s1), and a v_readfirstlane_b32 into s0. The
; pre-gfx1100 targets set up the bfi mask with "s_brev_b32 s0, -2"; gfx1100
; uses the literal 0x7fffffff.
define amdgpu_ps i32 @s_copysign_out_f32_mag_f16_sign_f32(half inreg %mag, float inreg %sign) {
; SI-LABEL: s_copysign_out_f32_mag_f16_sign_f32:
; SI: ; %bb.0:
; SI-NEXT: v_cvt_f32_f16_e32 v0, s0
; SI-NEXT: s_brev_b32 s0, -2
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_bfi_b32 v0, s0, v0, v1
; SI-NEXT: v_readfirstlane_b32 s0, v0
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: s_copysign_out_f32_mag_f16_sign_f32:
; VI: ; %bb.0:
; VI-NEXT: v_cvt_f32_f16_e32 v0, s0
; VI-NEXT: s_brev_b32 s0, -2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_bfi_b32 v0, s0, v0, v1
; VI-NEXT: v_readfirstlane_b32 s0, v0
; VI-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_copysign_out_f32_mag_f16_sign_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_cvt_f32_f16_e32 v0, s0
; GFX9-NEXT: s_brev_b32 s0, -2
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_bfi_b32 v0, s0, v0, v1
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_copysign_out_f32_mag_f16_sign_f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, s1
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%mag.ext = fpext half %mag to float
%out = call float @llvm.copysign.f32(float %mag.ext, float %sign)
%cast = bitcast float %out to i32
ret i32 %cast
}
; Mixed-width copysign with inreg arguments: the half magnitude is extended to
; double, then combined with a double sign; the result is returned as
; <2 x i32>.
; Expected code on every target: the magnitude (s0) is converted half -> float
; -> double into v[0:1], and only the high dword v1 goes through v_bfi_b32,
; paired with s2. Both dwords are then read back into s0 and s1.
define amdgpu_ps <2 x i32> @s_copysign_out_f64_mag_f16_sign_f64(half inreg %mag, double inreg %sign) {
; SI-LABEL: s_copysign_out_f64_mag_f16_sign_f64:
; SI: ; %bb.0:
; SI-NEXT: v_cvt_f32_f16_e32 v0, s0
; SI-NEXT: s_brev_b32 s0, -2
; SI-NEXT: v_mov_b32_e32 v2, s2
; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; SI-NEXT: v_bfi_b32 v1, s0, v1, v2
; SI-NEXT: v_readfirstlane_b32 s0, v0
; SI-NEXT: v_readfirstlane_b32 s1, v1
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: s_copysign_out_f64_mag_f16_sign_f64:
; VI: ; %bb.0:
; VI-NEXT: v_cvt_f32_f16_e32 v0, s0
; VI-NEXT: s_brev_b32 s0, -2
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; VI-NEXT: v_bfi_b32 v1, s0, v1, v2
; VI-NEXT: v_readfirstlane_b32 s0, v0
; VI-NEXT: v_readfirstlane_b32 s1, v1
; VI-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_copysign_out_f64_mag_f16_sign_f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_cvt_f32_f16_e32 v0, s0
; GFX9-NEXT: s_brev_b32 s0, -2
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; GFX9-NEXT: v_bfi_b32 v1, s0, v1, v2
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: v_readfirstlane_b32 s1, v1
; GFX9-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_copysign_out_f64_mag_f16_sign_f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: v_readfirstlane_b32 s1, v1
; GFX11-NEXT: ; return to shader part epilog
%mag.ext = fpext half %mag to double
%out = call double @llvm.copysign.f64(double %mag.ext, double %sign)
%cast = bitcast double %out to <2 x i32>
ret <2 x i32> %cast
}
; Mixed-width copysign with inreg arguments: a float magnitude takes its sign
; from a half that is extended to float in the IR; the result is returned as
; i32.
; Expected code: only tahiti performs a real conversion of the sign
; (v_cvt_f32_f16). Tonga, gfx900 and the fake16 gfx1100 run shift s1 left by 16
; instead, and the true16 gfx1100 run moves s1 into v0.h. Every target then
; issues one v_bfi_b32 and reads the result back into s0.
define amdgpu_ps i32 @s_copysign_out_f32_mag_f32_sign_f16(float inreg %mag, half inreg %sign) {
; SI-LABEL: s_copysign_out_f32_mag_f32_sign_f16:
; SI: ; %bb.0:
; SI-NEXT: v_cvt_f32_f16_e32 v0, s1
; SI-NEXT: s_brev_b32 s1, -2
; SI-NEXT: v_mov_b32_e32 v1, s0
; SI-NEXT: v_bfi_b32 v0, s1, v1, v0
; SI-NEXT: v_readfirstlane_b32 s0, v0
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: s_copysign_out_f32_mag_f32_sign_f16:
; VI: ; %bb.0:
; VI-NEXT: v_lshlrev_b32_e64 v0, 16, s1
; VI-NEXT: s_brev_b32 s1, -2
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_bfi_b32 v0, s1, v1, v0
; VI-NEXT: v_readfirstlane_b32 s0, v0
; VI-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_copysign_out_f32_mag_f32_sign_f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e64 v0, 16, s1
; GFX9-NEXT: s_brev_b32 s1, -2
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_bfi_b32 v0, s1, v1, v0
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX11-TRUE16-LABEL: s_copysign_out_f32_mag_f32_sign_f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, s1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, s0, v0
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-TRUE16-NEXT: ; return to shader part epilog
;
; GFX11-FAKE16-LABEL: s_copysign_out_f32_mag_f32_sign_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v0, 16, s1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fffffff, s0, v0
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-FAKE16-NEXT: ; return to shader part epilog
%sign.ext = fpext half %sign to float
%out = call float @llvm.copysign.f32(float %mag, float %sign.ext)
%cast = bitcast float %out to i32
ret i32 %cast
}
; Mixed-width copysign with inreg arguments: a double magnitude takes its sign
; from a half that is extended to double in the IR; the result is returned as
; <2 x i32>.
; Expected code: no target converts the sign to double. Tahiti converts it to
; float; tonga, gfx900 and the fake16 gfx1100 run shift s2 left by 16; the
; true16 gfx1100 run moves s2 into v0.h. One v_bfi_b32 combines that with s1,
; and only s1 is written back; no checked instruction writes s0.
define amdgpu_ps <2 x i32> @s_copysign_out_f64_mag_f64_sign_f16(double inreg %mag, half inreg %sign) {
; SI-LABEL: s_copysign_out_f64_mag_f64_sign_f16:
; SI: ; %bb.0:
; SI-NEXT: v_cvt_f32_f16_e32 v0, s2
; SI-NEXT: s_brev_b32 s2, -2
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_bfi_b32 v0, s2, v1, v0
; SI-NEXT: v_readfirstlane_b32 s1, v0
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: s_copysign_out_f64_mag_f64_sign_f16:
; VI: ; %bb.0:
; VI-NEXT: v_lshlrev_b32_e64 v0, 16, s2
; VI-NEXT: s_brev_b32 s2, -2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_bfi_b32 v0, s2, v1, v0
; VI-NEXT: v_readfirstlane_b32 s1, v0
; VI-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_copysign_out_f64_mag_f64_sign_f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e64 v0, 16, s2
; GFX9-NEXT: s_brev_b32 s2, -2
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_bfi_b32 v0, s2, v1, v0
; GFX9-NEXT: v_readfirstlane_b32 s1, v0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX11-TRUE16-LABEL: s_copysign_out_f64_mag_f64_sign_f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, s2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, s1, v0
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s1, v0
; GFX11-TRUE16-NEXT: ; return to shader part epilog
;
; GFX11-FAKE16-LABEL: s_copysign_out_f64_mag_f64_sign_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v0, 16, s2
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fffffff, s1, v0
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s1, v0
; GFX11-FAKE16-NEXT: ; return to shader part epilog
%sign.ext = fpext half %sign to double
%out = call double @llvm.copysign.f64(double %mag, double %sign.ext)
%cast = bitcast double %out to <2 x i32>
ret <2 x i32> %cast
}
; Scalar (inreg) copysign with an f16 magnitude whose sign comes from an f32
; value that is fptrunc'ed to half. The half result is returned bitcast to i16.
; None of the expected sequences below contains a float-to-half conversion
; instruction; the sign operand (s1) is only masked or shifted right by 16
; before being combined with the magnitude.
define amdgpu_ps i16 @s_copysign_out_f16_mag_f16_sign_f32(half inreg %mag, float inreg %sign) {
; SI-LABEL: s_copysign_out_f16_mag_f16_sign_f32:
; SI: ; %bb.0:
; SI-NEXT: s_and_b32 s1, s1, 0x80000000
; SI-NEXT: s_and_b32 s0, s0, 0x7fff
; SI-NEXT: s_lshr_b32 s1, s1, 16
; SI-NEXT: s_or_b32 s0, s0, s1
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: s_copysign_out_f16_mag_f16_sign_f32:
; VI: ; %bb.0:
; VI-NEXT: v_lshrrev_b32_e64 v0, 16, s1
; VI-NEXT: s_movk_i32 s1, 0x7fff
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_bfi_b32 v0, s1, v1, v0
; VI-NEXT: v_readfirstlane_b32 s0, v0
; VI-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_copysign_out_f16_mag_f16_sign_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshrrev_b32_e64 v0, 16, s1
; GFX9-NEXT: s_movk_i32 s1, 0x7fff
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_bfi_b32 v0, s1, v1, v0
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX11-TRUE16-LABEL: s_copysign_out_f16_mag_f16_sign_f32:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e64 v1, 16, s1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-TRUE16-NEXT: ; return to shader part epilog
;
; GFX11-FAKE16-LABEL: s_copysign_out_f16_mag_f16_sign_f32:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e64 v0, 16, s1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-FAKE16-NEXT: ; return to shader part epilog
; Narrow the float sign operand so both copysign operands are f16.
%sign.trunc = fptrunc float %sign to half
%out = call half @llvm.copysign.f16(half %mag, half %sign.trunc)
; Return the raw bit pattern of the half as a 16-bit integer.
%cast = bitcast half %out to i16
ret i16 %cast
}
; Scalar (inreg) copysign with an f16 magnitude whose sign comes from an f64
; value that is fptrunc'ed to half. The half result is returned bitcast to i16.
; None of the expected sequences below contains a double-to-half conversion;
; they read only s2 from the sign operand - presumably the high dword of
; %sign, which carries the f64 sign bit; confirm against the calling
; convention.
define amdgpu_ps i16 @s_copysign_out_f16_mag_f16_sign_f64(half inreg %mag, double inreg %sign) {
; SI-LABEL: s_copysign_out_f16_mag_f16_sign_f64:
; SI: ; %bb.0:
; SI-NEXT: s_and_b32 s1, s2, 0x80000000
; SI-NEXT: s_and_b32 s0, s0, 0x7fff
; SI-NEXT: s_lshr_b32 s1, s1, 16
; SI-NEXT: s_or_b32 s0, s0, s1
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: s_copysign_out_f16_mag_f16_sign_f64:
; VI: ; %bb.0:
; VI-NEXT: v_lshrrev_b32_e64 v0, 16, s2
; VI-NEXT: s_movk_i32 s1, 0x7fff
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_bfi_b32 v0, s1, v1, v0
; VI-NEXT: v_readfirstlane_b32 s0, v0
; VI-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_copysign_out_f16_mag_f16_sign_f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshrrev_b32_e64 v0, 16, s2
; GFX9-NEXT: s_movk_i32 s1, 0x7fff
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_bfi_b32 v0, s1, v1, v0
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX11-TRUE16-LABEL: s_copysign_out_f16_mag_f16_sign_f64:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e64 v1, 16, s2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-TRUE16-NEXT: ; return to shader part epilog
;
; GFX11-FAKE16-LABEL: s_copysign_out_f16_mag_f16_sign_f64:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e64 v0, 16, s2
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, s0, v0
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-FAKE16-NEXT: ; return to shader part epilog
; Narrow the double sign operand so both copysign operands are f16.
%sign.trunc = fptrunc double %sign to half
%out = call half @llvm.copysign.f16(half %mag, half %sign.trunc)
; Return the raw bit pattern of the half as a 16-bit integer.
%cast = bitcast half %out to i16
ret i16 %cast
}
; Scalar (inreg) copysign where the magnitude is an f32 that is fptrunc'ed to
; half and the sign is already f16. The half result is returned bitcast to i16.
; Unlike the sign-conversion cases, every expected sequence below keeps the
; magnitude conversion (v_cvt_f16_f32 on s0) before merging in the sign bit.
define amdgpu_ps i16 @s_copysign_out_f16_mag_f32_sign_f16(float inreg %mag, half inreg %sign) {
; SI-LABEL: s_copysign_out_f16_mag_f32_sign_f16:
; SI: ; %bb.0:
; SI-NEXT: v_cvt_f16_f32_e32 v0, s0
; SI-NEXT: s_and_b32 s0, s1, 0xffff8000
; SI-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; SI-NEXT: v_or_b32_e32 v0, s0, v0
; SI-NEXT: v_readfirstlane_b32 s0, v0
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: s_copysign_out_f16_mag_f32_sign_f16:
; VI: ; %bb.0:
; VI-NEXT: v_cvt_f16_f32_e32 v0, s0
; VI-NEXT: s_movk_i32 s0, 0x7fff
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_bfi_b32 v0, s0, v0, v1
; VI-NEXT: v_readfirstlane_b32 s0, v0
; VI-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_copysign_out_f16_mag_f32_sign_f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_cvt_f16_f32_e32 v0, s0
; GFX9-NEXT: s_movk_i32 s0, 0x7fff
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_bfi_b32 v0, s0, v0, v1
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX11-TRUE16-LABEL: s_copysign_out_f16_mag_f32_sign_f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, s0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v1
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-TRUE16-NEXT: ; return to shader part epilog
;
; GFX11-FAKE16-LABEL: s_copysign_out_f16_mag_f32_sign_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, s0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, s1
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-FAKE16-NEXT: ; return to shader part epilog
; Narrow the float magnitude so both copysign operands are f16.
%mag.trunc = fptrunc float %mag to half
%out = call half @llvm.copysign.f16(half %mag.trunc, half %sign)
; Return the raw bit pattern of the half as a 16-bit integer.
%cast = bitcast half %out to i16
ret i16 %cast
}
; Vector copysign producing <2 x float>, where the magnitude is a <2 x half>
; that is fpext'ed to <2 x float> and the sign is already <2 x float>.
; The expected sequences below convert each half lane with v_cvt_f32_f16 and
; then issue one v_bfi_b32 per lane with a 0x7fffffff mask (materialised by
; s_brev_b32 -2 where an SGPR is used).
define <2 x float> @v_copysign_out_v2f32_mag_v2f16_sign_v2f32(<2 x half> %mag, <2 x float> %sign) {
; SI-LABEL: v_copysign_out_v2f32_mag_v2f16_sign_v2f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: s_brev_b32 s4, -2
; SI-NEXT: v_bfi_b32 v0, s4, v0, v1
; SI-NEXT: v_bfi_b32 v1, s4, v3, v2
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_copysign_out_v2f32_mag_v2f16_sign_v2f32:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v3, v0
; VI-NEXT: v_cvt_f32_f16_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: s_brev_b32 s4, -2
; VI-NEXT: v_bfi_b32 v0, s4, v3, v1
; VI-NEXT: v_bfi_b32 v1, s4, v4, v2
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_out_v2f32_mag_v2f16_sign_v2f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX9-NEXT: v_cvt_f32_f16_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-NEXT: s_brev_b32 s4, -2
; GFX9-NEXT: v_bfi_b32 v0, s4, v3, v1
; GFX9-NEXT: v_bfi_b32 v1, s4, v4, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_copysign_out_v2f32_mag_v2f16_sign_v2f32:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v4, v0.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, v3, v1
; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v4, v2
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_copysign_out_v2f32_mag_v2f16_sign_v2f32:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v3, v2
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
; Widen the half magnitude vector so both copysign operands are <2 x float>.
%mag.ext = fpext <2 x half> %mag to <2 x float>
%out = call <2 x float> @llvm.copysign.v2f32(<2 x float> %mag.ext, <2 x float> %sign)
ret <2 x float> %out
}
; Vector copysign producing <2 x float>, where the magnitude is already
; <2 x float> and the sign is a <2 x half> that is fpext'ed to <2 x float>.
; Only the first expected sequence below converts the sign lanes with
; v_cvt_f32_f16; the others merely reposition the half bits (16-bit shifts or
; a v_mov_b16 into the high half) before the per-lane v_bfi_b32.
define <2 x float> @v_copysign_out_v2f32_mag_v2f32_sign_v2f16(<2 x float> %mag, <2 x half> %sign) {
; SI-LABEL: v_copysign_out_v2f32_mag_v2f32_sign_v2f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: s_brev_b32 s4, -2
; SI-NEXT: v_bfi_b32 v0, s4, v0, v2
; SI-NEXT: v_bfi_b32 v1, s4, v1, v3
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_copysign_out_v2f32_mag_v2f32_sign_v2f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; VI-NEXT: s_brev_b32 s4, -2
; VI-NEXT: v_bfi_b32 v0, s4, v0, v2
; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; VI-NEXT: v_bfi_b32 v1, s4, v1, v2
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_out_v2f32_mag_v2f32_sign_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX9-NEXT: s_brev_b32 s4, -2
; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_copysign_out_v2f32_mag_v2f32_sign_v2f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l
; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v3
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_copysign_out_v2f32_mag_v2f32_sign_v2f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v2
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v3
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
; Widen the half sign vector so both copysign operands are <2 x float>.
%sign.ext = fpext <2 x half> %sign to <2 x float>
%out = call <2 x float> @llvm.copysign.v2f32(<2 x float> %mag, <2 x float> %sign.ext)
ret <2 x float> %out
}
; Vector copysign producing <2 x double>, where the magnitude is already
; <2 x double> and the sign is a <2 x half> that is fpext'ed to <2 x double>.
; Every expected sequence below writes only v1 and v3 with v_bfi_b32 -
; presumably the high dwords of the two f64 lanes, which carry the sign bits;
; confirm against the calling convention. v0 and v2 are never written.
define <2 x double> @v_copysign_out_v2f64_mag_v2f64_sign_v2f16(<2 x double> %mag, <2 x half> %sign) {
; SI-LABEL: v_copysign_out_v2f64_mag_v2f64_sign_v2f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4
; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SI-NEXT: s_brev_b32 s4, -2
; SI-NEXT: v_bfi_b32 v1, s4, v1, v4
; SI-NEXT: v_bfi_b32 v3, s4, v3, v5
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_copysign_out_v2f64_mag_v2f64_sign_v2f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v4
; VI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; VI-NEXT: s_brev_b32 s4, -2
; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; VI-NEXT: v_bfi_b32 v1, s4, v1, v5
; VI-NEXT: v_bfi_b32 v3, s4, v3, v4
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_out_v2f64_mag_v2f64_sign_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX9-NEXT: s_brev_b32 s4, -2
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v5
; GFX9-NEXT: v_bfi_b32 v3, s4, v3, v4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_copysign_out_v2f64_mag_v2f64_sign_v2f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v4.l
; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v4
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v5
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_copysign_out_v2f64_mag_v2f64_sign_v2f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v4
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX11-FAKE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v5
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
; Widen the half sign vector so both copysign operands are <2 x double>.
%sign.ext = fpext <2 x half> %sign to <2 x double>
%out = call <2 x double> @llvm.copysign.v2f64(<2 x double> %mag, <2 x double> %sign.ext)
ret <2 x double> %out
}
; Vector copysign producing <2 x half>, where the magnitude is a <2 x float>
; that is fptrunc'ed to <2 x half> and the sign is already <2 x half>.
; Every expected sequence below converts both magnitude lanes with
; v_cvt_f16_f32. All but the first then pack the two halves into one dword and
; apply a single v_bfi_b32 with a 0x7fff7fff mask; the first masks and ORs
; each lane separately before packing.
define <2 x half> @v_copysign_out_v2f16_mag_v2f32_sign_v2f16(<2 x float> %mag, <2 x half> %sign) {
; SI-LABEL: v_copysign_out_v2f16_mag_v2f32_sign_v2f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; SI-NEXT: v_and_b32_e32 v2, 0x8000, v2
; SI-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; SI-NEXT: v_or_b32_e32 v0, v0, v2
; SI-NEXT: v_and_b32_e32 v2, 0x8000, v3
; SI-NEXT: v_and_b32_e32 v1, 0x7fff, v1
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_copysign_out_v2f16_mag_v2f32_sign_v2f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-NEXT: s_mov_b32 s4, 0x7fff7fff
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: v_bfi_b32 v0, s4, v0, v2
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_out_v2f16_mag_v2f32_sign_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff
; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_copysign_out_v2f16_mag_v2f32_sign_v2f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.h, v1
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v1, v2
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_copysign_out_v2f16_mag_v2f32_sign_v2f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v2
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
; Narrow the float magnitude vector so both copysign operands are <2 x half>.
%mag.trunc = fptrunc <2 x float> %mag to <2 x half>
%out = call <2 x half> @llvm.copysign.v2f16(<2 x half> %mag.trunc, <2 x half> %sign)
ret <2 x half> %out
}
define <2 x half> @v_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> %mag, <2 x half> %sign) {
; SI-LABEL: v_copysign_out_v2f16_mag_v2f64_sign_v2f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_and_b32_e32 v7, 0x1ff, v3
; SI-NEXT: v_or_b32_e32 v2, v7, v2
; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v3
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; SI-NEXT: v_and_b32_e32 v6, 0xffe, v6
; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; SI-NEXT: v_bfe_u32 v3, v3, 20, 11
; SI-NEXT: s_movk_i32 s4, 0x3f1
; SI-NEXT: v_or_b32_e32 v2, v6, v2
; SI-NEXT: v_sub_i32_e32 v7, vcc, s4, v3
; SI-NEXT: v_or_b32_e32 v6, 0x1000, v2
; SI-NEXT: v_med3_i32 v7, v7, 0, 13
; SI-NEXT: v_lshrrev_b32_e32 v8, v7, v6
; SI-NEXT: v_lshlrev_b32_e32 v7, v7, v8
; SI-NEXT: v_cmp_ne_u32_e32 vcc, v7, v6
; SI-NEXT: s_movk_i32 s5, 0xfc10
; SI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; SI-NEXT: v_add_i32_e32 v3, vcc, s5, v3
; SI-NEXT: v_lshlrev_b32_e32 v7, 12, v3
; SI-NEXT: v_or_b32_e32 v6, v8, v6
; SI-NEXT: v_or_b32_e32 v7, v2, v7
; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3
; SI-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc
; SI-NEXT: v_and_b32_e32 v7, 7, v6
; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v7
; SI-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v7
; SI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; SI-NEXT: v_or_b32_e32 v7, v7, v8
; SI-NEXT: v_lshrrev_b32_e32 v6, 2, v6
; SI-NEXT: v_add_i32_e32 v6, vcc, v6, v7
; SI-NEXT: v_mov_b32_e32 v7, 0x7c00
; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3
; SI-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc
; SI-NEXT: v_mov_b32_e32 v8, 0x7e00
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; SI-NEXT: s_movk_i32 s6, 0x40f
; SI-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v3
; SI-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
; SI-NEXT: v_and_b32_e32 v6, 0x1ff, v1
; SI-NEXT: v_or_b32_e32 v0, v6, v0
; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v1
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT: v_and_b32_e32 v3, 0xffe, v3
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT: v_bfe_u32 v1, v1, 20, 11
; SI-NEXT: v_or_b32_e32 v0, v3, v0
; SI-NEXT: v_sub_i32_e32 v6, vcc, s4, v1
; SI-NEXT: v_or_b32_e32 v3, 0x1000, v0
; SI-NEXT: v_med3_i32 v6, v6, 0, 13
; SI-NEXT: v_lshrrev_b32_e32 v9, v6, v3
; SI-NEXT: v_lshlrev_b32_e32 v6, v6, v9
; SI-NEXT: v_cmp_ne_u32_e32 vcc, v6, v3
; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; SI-NEXT: v_add_i32_e32 v1, vcc, s5, v1
; SI-NEXT: v_lshlrev_b32_e32 v6, 12, v1
; SI-NEXT: v_or_b32_e32 v3, v9, v3
; SI-NEXT: v_or_b32_e32 v6, v0, v6
; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v1
; SI-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc
; SI-NEXT: v_and_b32_e32 v6, 7, v3
; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v6
; SI-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v6
; SI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; SI-NEXT: v_or_b32_e32 v6, v6, v9
; SI-NEXT: v_lshrrev_b32_e32 v3, 2, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, v3, v6
; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v1
; SI-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc
; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v1
; SI-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4
; SI-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; SI-NEXT: v_and_b32_e32 v1, 0x8000, v4
; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_and_b32_e32 v1, 0x7fff, v2
; SI-NEXT: v_and_b32_e32 v2, 0x8000, v5
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_copysign_out_v2f16_mag_v2f64_sign_v2f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_and_b32_e32 v6, 0x1ff, v3
; VI-NEXT: v_or_b32_e32 v2, v6, v2
; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v3
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; VI-NEXT: v_and_b32_e32 v5, 0xffe, v5
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; VI-NEXT: v_bfe_u32 v3, v3, 20, 11
; VI-NEXT: s_movk_i32 s4, 0x3f1
; VI-NEXT: v_or_b32_e32 v2, v5, v2
; VI-NEXT: v_sub_u32_e32 v6, vcc, s4, v3
; VI-NEXT: v_or_b32_e32 v5, 0x1000, v2
; VI-NEXT: v_med3_i32 v6, v6, 0, 13
; VI-NEXT: v_lshrrev_b32_e32 v7, v6, v5
; VI-NEXT: v_lshlrev_b32_e32 v6, v6, v7
; VI-NEXT: v_cmp_ne_u32_e32 vcc, v6, v5
; VI-NEXT: s_movk_i32 s5, 0xfc10
; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; VI-NEXT: v_add_u32_e32 v3, vcc, s5, v3
; VI-NEXT: v_lshlrev_b32_e32 v6, 12, v3
; VI-NEXT: v_or_b32_e32 v5, v7, v5
; VI-NEXT: v_or_b32_e32 v6, v2, v6
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3
; VI-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
; VI-NEXT: v_and_b32_e32 v6, 7, v5
; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v6
; VI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v6
; VI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; VI-NEXT: v_or_b32_e32 v6, v6, v7
; VI-NEXT: v_lshrrev_b32_e32 v5, 2, v5
; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v6
; VI-NEXT: v_mov_b32_e32 v6, 0x7c00
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3
; VI-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
; VI-NEXT: v_mov_b32_e32 v7, 0x7e00
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; VI-NEXT: s_movk_i32 s6, 0x40f
; VI-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v3
; VI-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
; VI-NEXT: v_and_b32_e32 v5, 0x1ff, v1
; VI-NEXT: v_or_b32_e32 v0, v5, v0
; VI-NEXT: v_lshrrev_b32_e32 v3, 8, v1
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT: v_and_b32_e32 v3, 0xffe, v3
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; VI-NEXT: v_bfe_u32 v1, v1, 20, 11
; VI-NEXT: v_or_b32_e32 v0, v3, v0
; VI-NEXT: v_sub_u32_e32 v5, vcc, s4, v1
; VI-NEXT: v_or_b32_e32 v3, 0x1000, v0
; VI-NEXT: v_med3_i32 v5, v5, 0, 13
; VI-NEXT: v_lshrrev_b32_e32 v8, v5, v3
; VI-NEXT: v_lshlrev_b32_e32 v5, v5, v8
; VI-NEXT: v_cmp_ne_u32_e32 vcc, v5, v3
; VI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; VI-NEXT: v_add_u32_e32 v1, vcc, s5, v1
; VI-NEXT: v_lshlrev_b32_e32 v5, 12, v1
; VI-NEXT: v_or_b32_e32 v3, v8, v3
; VI-NEXT: v_or_b32_e32 v5, v0, v5
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v1
; VI-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; VI-NEXT: v_and_b32_e32 v5, 7, v3
; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v5
; VI-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v5
; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; VI-NEXT: v_or_b32_e32 v5, v5, v8
; VI-NEXT: v_lshrrev_b32_e32 v3, 2, v3
; VI-NEXT: v_add_u32_e32 v3, vcc, v3, v5
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v1
; VI-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v1
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; VI-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v2
; VI-NEXT: s_mov_b32 s4, 0x7fff7fff
; VI-NEXT: v_bfi_b32 v0, s4, v0, v4
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_out_v2f16_mag_v2f64_sign_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_movk_i32 s4, 0x1ff
; GFX9-NEXT: v_and_or_b32 v0, v1, s4, v0
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1
; GFX9-NEXT: s_movk_i32 s5, 0xffe
; GFX9-NEXT: v_bfe_u32 v6, v1, 20, 11
; GFX9-NEXT: v_and_or_b32 v0, v5, s5, v0
; GFX9-NEXT: v_sub_u32_e32 v7, 0x3f1, v6
; GFX9-NEXT: v_or_b32_e32 v5, 0x1000, v0
; GFX9-NEXT: v_med3_i32 v7, v7, 0, 13
; GFX9-NEXT: v_lshrrev_b32_e32 v8, v7, v5
; GFX9-NEXT: v_lshlrev_b32_e32 v7, v7, v8
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v7, v5
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GFX9-NEXT: v_add_u32_e32 v6, 0xfffffc10, v6
; GFX9-NEXT: v_or_b32_e32 v5, v8, v5
; GFX9-NEXT: v_lshl_or_b32 v7, v6, 12, v0
; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 1, v6
; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
; GFX9-NEXT: v_and_b32_e32 v7, 7, v5
; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 5, v7
; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v7
; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; GFX9-NEXT: v_or_b32_e32 v7, v7, v8
; GFX9-NEXT: v_lshrrev_b32_e32 v5, 2, v5
; GFX9-NEXT: v_add_u32_e32 v5, v5, v7
; GFX9-NEXT: v_mov_b32_e32 v7, 0x7c00
; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 31, v6
; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
; GFX9-NEXT: v_mov_b32_e32 v8, 0x7e00
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX9-NEXT: s_movk_i32 s6, 0x40f
; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s6, v6
; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX9-NEXT: s_mov_b32 s7, 0x8000
; GFX9-NEXT: v_and_or_b32 v0, v1, s7, v0
; GFX9-NEXT: v_and_or_b32 v1, v3, s4, v2
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v3
; GFX9-NEXT: v_bfe_u32 v5, v3, 20, 11
; GFX9-NEXT: v_and_or_b32 v1, v2, s5, v1
; GFX9-NEXT: v_sub_u32_e32 v6, 0x3f1, v5
; GFX9-NEXT: v_or_b32_e32 v2, 0x1000, v1
; GFX9-NEXT: v_med3_i32 v6, v6, 0, 13
; GFX9-NEXT: v_lshrrev_b32_e32 v9, v6, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v6, v6, v9
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v6, v2
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT: v_add_u32_e32 v5, 0xfffffc10, v5
; GFX9-NEXT: v_or_b32_e32 v2, v9, v2
; GFX9-NEXT: v_lshl_or_b32 v6, v5, 12, v1
; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 1, v5
; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
; GFX9-NEXT: v_and_b32_e32 v6, 7, v2
; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 5, v6
; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v6
; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GFX9-NEXT: v_or_b32_e32 v6, v6, v9
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 2, v2
; GFX9-NEXT: v_add_u32_e32 v2, v2, v6
; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 31, v5
; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s6, v5
; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v3
; GFX9-NEXT: v_and_or_b32 v1, v2, s7, v1
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff
; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_copysign_out_v2f16_mag_v2f64_sign_v2f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0x1ff, v3, v2
; GFX11-TRUE16-NEXT: v_and_or_b32 v0, 0x1ff, v1, v0
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v3
; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v3, 20, 11
; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v1, 20, 11
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 8, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v9, 0x3f1, v6
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v6, 0xfffffc10, v6
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffe, v5, v2
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v5, 0x3f1, v7
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_and_or_b32 v0, 0xffe, v8, v0
; GFX11-TRUE16-NEXT: v_med3_i32 v8, v9, 0, 13
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x1000, v2
; GFX11-TRUE16-NEXT: v_med3_i32 v5, v5, 0, 13
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, 0x1000, v0
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, v8, v9
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, v5, v10
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, v8, v11
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v5, v12
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v8, v9
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v9, v6, 12, v2
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v5, v10
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v8
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v6
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, 0xfffffc10, v7
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v12, v5
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v7, 12, v0
; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v7
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v5, v10, v5 :: v_dual_mov_b32 v10, 0x7e00
; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 7, v8
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 2, v8
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 7, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 2, v5
; GFX11-TRUE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v9
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v9
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v11
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v12
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v11
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v3.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, 0
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v3, v8, v9
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v13
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v6
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, 0x7c00, v3, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v0, 0x7c00, v10 :: v_dual_add_nc_u32 v5, v5, v11
; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v7
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, 0x7c00, v5, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v6
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v7
; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0x8000, v12, v2
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc_lo
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v1.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_and_or_b32 v0, 0x8000, v12, v0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v4
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_copysign_out_v2f16_mag_v2f64_sign_v2f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0x1ff, v1, v0
; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0x1ff, v3, v2
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v1
; GFX11-FAKE16-NEXT: v_bfe_u32 v6, v1, 20, 11
; GFX11-FAKE16-NEXT: v_bfe_u32 v7, v3, 20, 11
; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 8, v3
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v13, 0x7e00
; GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v9, 0x3f1, v6
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v6, 0xfffffc10, v6
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0xffe, v5, v0
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v5, 0x3f1, v7
; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffe, v8, v2
; GFX11-FAKE16-NEXT: v_med3_i32 v8, v9, 0, 13
; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, 0x1000, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-FAKE16-NEXT: v_med3_i32 v5, v5, 0, 13
; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, 0x1000, v2
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, v8, v9
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, v5, v10
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, v8, v11
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v5, v12
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v8, v9
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v9, v6, 12, v0
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v5, v10
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v8, v11, v8
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v6
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v7, 0xfffffc10, v7
; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v12, v5
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc_lo
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v7, 12, v2
; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v7
; GFX11-FAKE16-NEXT: v_and_b32_e32 v9, 7, v8
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 2, v8
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v10, v5, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v9
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 7, v5
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 2, v5
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v9
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v10
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v9, v11
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v10
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, v8, v9
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v10, v10, v12
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v13, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v13, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v6
; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v8, 0x7c00, v8 :: v_dual_add_nc_u32 v5, v5, v10
; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v7
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, 0x7c00, v5, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v6
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v7
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0x8000, v1, v0
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc_lo
; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0x8000, v3, v2
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v4
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%mag.trunc = fptrunc <2 x double> %mag to <2 x half>
%result = call <2 x half> @llvm.copysign.v2f16(<2 x half> %mag.trunc, <2 x half> %sign)
ret <2 x half> %result
}
; Vector copysign whose sign operand is wider than the result: %sign is
; <2 x float> and is fptrunc'd to <2 x half> before llvm.copysign.v2f16, while
; %mag is already <2 x half>.
; Expected code: every target converts both floats with v_cvt_f16_f32. tahiti
; then merges bit 15 of each converted value into the matching 15-bit magnitude
; lane with and/or; the other targets combine the two converted halves in one
; register and select with a single v_bfi_b32 under mask 0x7fff7fff.
define <2 x half> @v_copysign_out_v2f16_mag_v2f16_sign_v2f32(<2 x half> %mag, <2 x float> %sign) {
; SI-LABEL: v_copysign_out_v2f16_mag_v2f16_sign_v2f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_and_b32_e32 v3, 0x7fff, v0
; SI-NEXT: v_bfe_u32 v0, v0, 16, 15
; SI-NEXT: v_and_b32_e32 v2, 0x8000, v2
; SI-NEXT: v_and_b32_e32 v1, 0x8000, v1
; SI-NEXT: v_or_b32_e32 v0, v0, v2
; SI-NEXT: v_or_b32_e32 v1, v3, v1
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_copysign_out_v2f16_mag_v2f16_sign_v2f32:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_cvt_f16_f32_sdwa v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v1, v1
; VI-NEXT: s_mov_b32 s4, 0x7fff7fff
; VI-NEXT: v_or_b32_e32 v1, v1, v2
; VI-NEXT: v_bfi_b32 v0, s4, v0, v1
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_out_v2f16_mag_v2f16_sign_v2f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff
; GFX9-NEXT: v_pack_b32_f16 v1, v1, v2
; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_copysign_out_v2f16_mag_v2f16_sign_v2f32:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v2.h, v2
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v2.l, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v2
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_copysign_out_v2f16_mag_v2f16_sign_v2f32:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_pack_b32_f16 v1, v1, v2
; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v1
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
 %sign.trunc = fptrunc <2 x float> %sign to <2 x half>
 %out = call <2 x half> @llvm.copysign.v2f16(<2 x half> %mag, <2 x half> %sign.trunc)
 ret <2 x half> %out
}
; Vector copysign with a <2 x double> sign operand that is fptrunc'd to
; <2 x half> before llvm.copysign.v2f16; %mag is already <2 x half>.
; Expected code: no target emits an f64->f16 conversion, and only v2 and v4 are
; read from the sign operand (presumably the high dwords of the two doubles -
; confirm against the calling convention).
; tahiti masks bit 31 of v2/v4 (0x80000000) and shifts it down by 16 per lane.
; The other targets instead place the low 16 bits of v2 and of v4 side by side
; and apply v_bfi_b32 with mask 0x7fff7fff, which selects bit 15 of each.
; REVIEW: the two strategies read different bits of v2/v4 (31 vs. 15); verify
; that the non-tahiti output is the intended lowering before relying on it.
define <2 x half> @v_copysign_out_v2f16_mag_v2f16_sign_v2f64(<2 x half> %mag, <2 x double> %sign) {
; SI-LABEL: v_copysign_out_v2f16_mag_v2f16_sign_v2f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_and_b32_e32 v2, 0x80000000, v2
; SI-NEXT: v_and_b32_e32 v1, 0x7fff, v0
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_and_b32_e32 v2, 0x80000000, v4
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; SI-NEXT: v_bfe_u32 v0, v0, 16, 15
; SI-NEXT: v_or_b32_e32 v0, v0, v2
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_copysign_out_v2f16_mag_v2f16_sign_v2f64:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v4
; VI-NEXT: v_and_b32_e32 v2, 0x8000, v2
; VI-NEXT: v_or_b32_e32 v1, v2, v1
; VI-NEXT: s_mov_b32 s4, 0x7fff7fff
; VI-NEXT: v_bfi_b32 v0, s4, v0, v1
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_out_v2f16_mag_v2f16_sign_v2f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v1, v4, v2, s4
; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff
; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_copysign_out_v2f16_mag_v2f16_sign_v2f64:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v4.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v2
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_copysign_out_v2f16_mag_v2f16_sign_v2f64:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_perm_b32 v1, v4, v2, 0x5040100
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v1
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
 %sign.trunc = fptrunc <2 x double> %sign to <2 x half>
 %out = call <2 x half> @llvm.copysign.v2f16(<2 x half> %mag, <2 x half> %sign.trunc)
 ret <2 x half> %out
}
; Scalar (amdgpu_ps, inreg arguments) copysign producing <2 x float>: %mag is
; <2 x half> and is fpext'd to <2 x float>, %sign is already <2 x float>. The
; result is bitcast to <2 x i32> for the return.
; Expected code: s_lshr_b32 extracts the upper half of s0, both halves are
; converted with v_cvt_f32_f16, and each lane is combined with its sign source
; (s1, s2) by v_bfi_b32 under mask 0x7fffffff. The mask is materialised with
; s_brev_b32 -2 on tahiti/tonga/gfx900 and used as a literal on gfx1100. The
; two results are moved back to s0/s1 with v_readfirstlane_b32.
; Both gfx1100 configurations share a single check prefix for this function.
define amdgpu_ps <2 x i32> @s_copysign_out_v2f32_mag_v2f16_sign_v2f32(<2 x half> inreg %mag, <2 x float> inreg %sign) {
; SI-LABEL: s_copysign_out_v2f32_mag_v2f16_sign_v2f32:
; SI: ; %bb.0:
; SI-NEXT: s_lshr_b32 s3, s0, 16
; SI-NEXT: v_cvt_f32_f16_e32 v0, s0
; SI-NEXT: v_cvt_f32_f16_e32 v1, s3
; SI-NEXT: s_brev_b32 s0, -2
; SI-NEXT: v_mov_b32_e32 v2, s1
; SI-NEXT: v_bfi_b32 v0, s0, v0, v2
; SI-NEXT: v_mov_b32_e32 v2, s2
; SI-NEXT: v_bfi_b32 v1, s0, v1, v2
; SI-NEXT: v_readfirstlane_b32 s0, v0
; SI-NEXT: v_readfirstlane_b32 s1, v1
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: s_copysign_out_v2f32_mag_v2f16_sign_v2f32:
; VI: ; %bb.0:
; VI-NEXT: s_lshr_b32 s3, s0, 16
; VI-NEXT: v_cvt_f32_f16_e32 v0, s0
; VI-NEXT: v_cvt_f32_f16_e32 v1, s3
; VI-NEXT: s_brev_b32 s0, -2
; VI-NEXT: v_mov_b32_e32 v2, s1
; VI-NEXT: v_bfi_b32 v0, s0, v0, v2
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_bfi_b32 v1, s0, v1, v2
; VI-NEXT: v_readfirstlane_b32 s0, v0
; VI-NEXT: v_readfirstlane_b32 s1, v1
; VI-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_copysign_out_v2f32_mag_v2f16_sign_v2f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_lshr_b32 s3, s0, 16
; GFX9-NEXT: v_cvt_f32_f16_e32 v0, s0
; GFX9-NEXT: v_cvt_f32_f16_e32 v1, s3
; GFX9-NEXT: s_brev_b32 s0, -2
; GFX9-NEXT: v_mov_b32_e32 v2, s1
; GFX9-NEXT: v_bfi_b32 v0, s0, v0, v2
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_bfi_b32 v1, s0, v1, v2
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: v_readfirstlane_b32 s1, v1
; GFX9-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_copysign_out_v2f32_mag_v2f16_sign_v2f32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_lshr_b32 s3, s0, 16
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s0
; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, s1
; GFX11-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, s2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: v_readfirstlane_b32 s1, v1
; GFX11-NEXT: ; return to shader part epilog
 %mag.ext = fpext <2 x half> %mag to <2 x float>
 %out = call <2 x float> @llvm.copysign.v2f32(<2 x float> %mag.ext, <2 x float> %sign)
 %cast = bitcast <2 x float> %out to <2 x i32>
 ret <2 x i32> %cast
}
; Scalar (amdgpu_ps, inreg arguments) copysign producing <2 x float> where the
; sign operand is narrower than the result: %sign is <2 x half> and is fpext'd
; to <2 x float>, %mag is already <2 x float>. The result is bitcast to
; <2 x i32> for the return.
; Expected code: tahiti converts both sign halves with v_cvt_f32_f16. The other
; targets emit no conversion; they move each 16-bit sign half into the upper 16
; bits of a VGPR (v_lshlrev_b32 by 16, or v_mov_b16 into the .h half with real
; true16) so that its bit 15 ends up in bit 31. Every target then applies
; v_bfi_b32 with mask 0x7fffffff against the magnitude lanes (s0, s1) and reads
; the results back into s0/s1 with v_readfirstlane_b32.
define amdgpu_ps <2 x i32> @s_copysign_out_v2f32_mag_v2f32_sign_v2f16(<2 x float> inreg %mag, <2 x half> inreg %sign) {
; SI-LABEL: s_copysign_out_v2f32_mag_v2f32_sign_v2f16:
; SI: ; %bb.0:
; SI-NEXT: s_lshr_b32 s3, s2, 16
; SI-NEXT: v_cvt_f32_f16_e32 v0, s2
; SI-NEXT: v_cvt_f32_f16_e32 v1, s3
; SI-NEXT: s_brev_b32 s2, -2
; SI-NEXT: v_mov_b32_e32 v2, s0
; SI-NEXT: v_bfi_b32 v0, s2, v2, v0
; SI-NEXT: v_mov_b32_e32 v2, s1
; SI-NEXT: v_bfi_b32 v1, s2, v2, v1
; SI-NEXT: v_readfirstlane_b32 s0, v0
; SI-NEXT: v_readfirstlane_b32 s1, v1
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: s_copysign_out_v2f32_mag_v2f32_sign_v2f16:
; VI: ; %bb.0:
; VI-NEXT: s_lshr_b32 s3, s2, 16
; VI-NEXT: v_lshlrev_b32_e64 v0, 16, s2
; VI-NEXT: s_brev_b32 s2, -2
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_bfi_b32 v0, s2, v1, v0
; VI-NEXT: v_lshlrev_b32_e64 v1, 16, s3
; VI-NEXT: v_mov_b32_e32 v2, s1
; VI-NEXT: v_bfi_b32 v1, s2, v2, v1
; VI-NEXT: v_readfirstlane_b32 s0, v0
; VI-NEXT: v_readfirstlane_b32 s1, v1
; VI-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_copysign_out_v2f32_mag_v2f32_sign_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_lshr_b32 s3, s2, 16
; GFX9-NEXT: v_lshlrev_b32_e64 v0, 16, s2
; GFX9-NEXT: s_brev_b32 s2, -2
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_bfi_b32 v0, s2, v1, v0
; GFX9-NEXT: v_lshlrev_b32_e64 v1, 16, s3
; GFX9-NEXT: v_mov_b32_e32 v2, s1
; GFX9-NEXT: v_bfi_b32 v1, s2, v2, v1
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: v_readfirstlane_b32 s1, v1
; GFX9-NEXT: ; return to shader part epilog
;
; GFX11-TRUE16-LABEL: s_copysign_out_v2f32_mag_v2f32_sign_v2f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s2, 16
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, s2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, s3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, s0, v0
; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, s1, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s1, v1
; GFX11-TRUE16-NEXT: ; return to shader part epilog
;
; GFX11-FAKE16-LABEL: s_copysign_out_v2f32_mag_v2f32_sign_v2f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_lshr_b32 s3, s2, 16
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v0, 16, s2
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v1, 16, s3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fffffff, s0, v0
; GFX11-FAKE16-NEXT: v_bfi_b32 v1, 0x7fffffff, s1, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s1, v1
; GFX11-FAKE16-NEXT: ; return to shader part epilog
 %sign.ext = fpext <2 x half> %sign to <2 x float>
 %out = call <2 x float> @llvm.copysign.v2f32(<2 x float> %mag, <2 x float> %sign.ext)
 %cast = bitcast <2 x float> %out to <2 x i32>
 ret <2 x i32> %cast
}
; Scalar (amdgpu_ps, inreg arguments) copysign producing <2 x double>: %sign is
; <2 x half> and is fpext'd to <2 x double>, %mag is already <2 x double>. The
; result is bitcast to <4 x i32> for the return.
; Expected code: only s1 and s3 are rewritten (v_bfi_b32 with mask 0x7fffffff
; followed by v_readfirstlane_b32); s0 and s2 are never written. Presumably
; s1/s3 are the high dwords of the two doubles - confirm against the calling
; convention.
; tahiti converts both sign halves with v_cvt_f32_f16 before the v_bfi_b32. The
; other targets emit no conversion and instead move each 16-bit sign half into
; the upper 16 bits of a VGPR (v_lshlrev_b32 by 16, or v_mov_b16 into the .h
; half with real true16).
define amdgpu_ps <4 x i32> @s_copysign_out_v2f64_mag_v2f64_sign_v2f16(<2 x double> inreg %mag, <2 x half> inreg %sign) {
; SI-LABEL: s_copysign_out_v2f64_mag_v2f64_sign_v2f16:
; SI: ; %bb.0:
; SI-NEXT: s_lshr_b32 s5, s4, 16
; SI-NEXT: v_cvt_f32_f16_e32 v0, s4
; SI-NEXT: v_cvt_f32_f16_e32 v1, s5
; SI-NEXT: s_brev_b32 s4, -2
; SI-NEXT: v_mov_b32_e32 v2, s1
; SI-NEXT: v_bfi_b32 v0, s4, v2, v0
; SI-NEXT: v_mov_b32_e32 v2, s3
; SI-NEXT: v_bfi_b32 v1, s4, v2, v1
; SI-NEXT: v_readfirstlane_b32 s1, v0
; SI-NEXT: v_readfirstlane_b32 s3, v1
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: s_copysign_out_v2f64_mag_v2f64_sign_v2f16:
; VI: ; %bb.0:
; VI-NEXT: v_lshlrev_b32_e64 v0, 16, s4
; VI-NEXT: s_brev_b32 s5, -2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_lshr_b32 s1, s4, 16
; VI-NEXT: v_bfi_b32 v0, s5, v1, v0
; VI-NEXT: v_lshlrev_b32_e64 v1, 16, s1
; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: v_bfi_b32 v1, s5, v2, v1
; VI-NEXT: v_readfirstlane_b32 s1, v0
; VI-NEXT: v_readfirstlane_b32 s3, v1
; VI-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_copysign_out_v2f64_mag_v2f64_sign_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b32_e64 v0, 16, s4
; GFX9-NEXT: s_brev_b32 s5, -2
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_lshr_b32 s1, s4, 16
; GFX9-NEXT: v_bfi_b32 v0, s5, v1, v0
; GFX9-NEXT: v_lshlrev_b32_e64 v1, 16, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s3
; GFX9-NEXT: v_bfi_b32 v1, s5, v2, v1
; GFX9-NEXT: v_readfirstlane_b32 s1, v0
; GFX9-NEXT: v_readfirstlane_b32 s3, v1
; GFX9-NEXT: ; return to shader part epilog
;
; GFX11-TRUE16-LABEL: s_copysign_out_v2f64_mag_v2f64_sign_v2f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_lshr_b32 s5, s4, 16
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, s4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, s5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, s1, v0
; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, s3, v1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s1, v0
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s3, v1
; GFX11-TRUE16-NEXT: ; return to shader part epilog
;
; GFX11-FAKE16-LABEL: s_copysign_out_v2f64_mag_v2f64_sign_v2f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v0, 16, s4
; GFX11-FAKE16-NEXT: s_lshr_b32 s4, s4, 16
; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e64 v1, 16, s4
; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fffffff, s1, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_bfi_b32 v1, 0x7fffffff, s3, v1
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s1, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s3, v1
; GFX11-FAKE16-NEXT: ; return to shader part epilog
 %sign.ext = fpext <2 x half> %sign to <2 x double>
 %out = call <2 x double> @llvm.copysign.v2f64(<2 x double> %mag, <2 x double> %sign.ext)
 %cast = bitcast <2 x double> %out to <4 x i32>
 ret <4 x i32> %cast
}
; Scalar (amdgpu_ps, inreg arguments) copysign producing <2 x half>: %mag is
; <2 x float> and is fptrunc'd to <2 x half>, %sign is already <2 x half>. The
; result is bitcast to i32 for the return.
; Expected code: every target converts both floats (s0, s1) with
; v_cvt_f16_f32. tahiti masks each converted lane with 0x7fff, ORs in bit 15 of
; the matching sign half (taken from s2 and s2 >> 16), then packs the lanes
; with a shift and OR. The other targets first pack the two converted halves
; (sdwa + v_or_b32 on tonga, v_pack_b32_f16 on gfx900 and fake-true16 gfx1100,
; direct .l/.h writes with real true16) and then apply a single v_bfi_b32 with
; mask 0x7fff7fff against s2. The value is returned in s0 through
; v_readfirstlane_b32.
define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f32_sign_v2f16(<2 x float> inreg %mag, <2 x half> inreg %sign) {
; SI-LABEL: s_copysign_out_v2f16_mag_v2f32_sign_v2f16:
; SI: ; %bb.0:
; SI-NEXT: v_cvt_f16_f32_e32 v1, s1
; SI-NEXT: v_cvt_f16_f32_e32 v0, s0
; SI-NEXT: s_lshr_b32 s0, s2, 16
; SI-NEXT: s_and_b32 s0, s0, 0x8000
; SI-NEXT: v_and_b32_e32 v1, 0x7fff, v1
; SI-NEXT: s_and_b32 s1, s2, 0x8000
; SI-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; SI-NEXT: v_or_b32_e32 v1, s0, v1
; SI-NEXT: v_or_b32_e32 v0, s1, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_readfirstlane_b32 s0, v0
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: s_copysign_out_v2f16_mag_v2f32_sign_v2f16:
; VI: ; %bb.0:
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_cvt_f16_f32_e32 v1, s0
; VI-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: s_mov_b32 s0, 0x7fff7fff
; VI-NEXT: v_or_b32_e32 v0, v1, v0
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_bfi_b32 v0, s0, v0, v1
; VI-NEXT: v_readfirstlane_b32 s0, v0
; VI-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_copysign_out_v2f16_mag_v2f32_sign_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_cvt_f16_f32_e32 v0, s1
; GFX9-NEXT: v_cvt_f16_f32_e32 v1, s0
; GFX9-NEXT: s_mov_b32 s0, 0x7fff7fff
; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0
; GFX9-NEXT: v_mov_b32_e32 v1, s2
; GFX9-NEXT: v_bfi_b32 v0, s0, v0, v1
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX11-TRUE16-LABEL: s_copysign_out_v2f16_mag_v2f32_sign_v2f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, s1
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, s2
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-TRUE16-NEXT: ; return to shader part epilog
;
; GFX11-FAKE16-LABEL: s_copysign_out_v2f16_mag_v2f32_sign_v2f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, s1
; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, s0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v1, v0
; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, s2
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-FAKE16-NEXT: ; return to shader part epilog
 %mag.trunc = fptrunc <2 x float> %mag to <2 x half>
 %out = call <2 x half> @llvm.copysign.v2f16(<2 x half> %mag.trunc, <2 x half> %sign)
 %cast = bitcast <2 x half> %out to i32
 ret i32 %cast
}
define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f64_sign_v2f16(<2 x double> inreg %mag, <2 x half> inreg %sign) {
; SI-LABEL: s_copysign_out_v2f16_mag_v2f64_sign_v2f16:
; SI: ; %bb.0:
; SI-NEXT: s_lshr_b32 s6, s3, 8
; SI-NEXT: s_lshr_b32 s5, s4, 16
; SI-NEXT: s_and_b32 s8, s6, 0xffe
; SI-NEXT: s_and_b32 s6, s3, 0x1ff
; SI-NEXT: s_or_b32 s2, s6, s2
; SI-NEXT: s_cselect_b64 s[6:7], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7]
; SI-NEXT: s_bfe_u32 s3, s3, 0xb0014
; SI-NEXT: v_readfirstlane_b32 s2, v0
; SI-NEXT: s_sub_i32 s7, 0x3f1, s3
; SI-NEXT: s_or_b32 s2, s8, s2
; SI-NEXT: v_med3_i32 v0, s7, 0, 13
; SI-NEXT: s_or_b32 s6, s2, 0x1000
; SI-NEXT: v_readfirstlane_b32 s7, v0
; SI-NEXT: s_lshr_b32 s8, s6, s7
; SI-NEXT: s_lshl_b32 s7, s8, s7
; SI-NEXT: s_cmp_lg_u32 s7, s6
; SI-NEXT: s_cselect_b32 s6, 1, 0
; SI-NEXT: s_addk_i32 s3, 0xfc10
; SI-NEXT: s_lshl_b32 s7, s3, 12
; SI-NEXT: s_or_b32 s6, s8, s6
; SI-NEXT: s_or_b32 s7, s2, s7
; SI-NEXT: s_cmp_lt_i32 s3, 1
; SI-NEXT: s_cselect_b32 s6, s6, s7
; SI-NEXT: s_and_b32 s7, s6, 7
; SI-NEXT: s_cmp_gt_i32 s7, 5
; SI-NEXT: s_cselect_b32 s8, 1, 0
; SI-NEXT: s_cmp_eq_u32 s7, 3
; SI-NEXT: s_cselect_b32 s7, 1, 0
; SI-NEXT: s_or_b32 s7, s7, s8
; SI-NEXT: s_lshr_b32 s6, s6, 2
; SI-NEXT: s_add_i32 s6, s6, s7
; SI-NEXT: s_cmp_lt_i32 s3, 31
; SI-NEXT: s_cselect_b32 s6, s6, 0x7c00
; SI-NEXT: s_cmp_lg_u32 s2, 0
; SI-NEXT: s_movk_i32 s7, 0x7e00
; SI-NEXT: s_cselect_b32 s2, s7, 0x7c00
; SI-NEXT: s_cmpk_eq_i32 s3, 0x40f
; SI-NEXT: s_cselect_b32 s6, s2, s6
; SI-NEXT: s_lshr_b32 s2, s1, 8
; SI-NEXT: s_and_b32 s8, s2, 0xffe
; SI-NEXT: s_and_b32 s2, s1, 0x1ff
; SI-NEXT: s_or_b32 s0, s2, s0
; SI-NEXT: s_cselect_b64 s[2:3], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
; SI-NEXT: s_bfe_u32 s1, s1, 0xb0014
; SI-NEXT: v_readfirstlane_b32 s0, v0
; SI-NEXT: s_sub_i32 s3, 0x3f1, s1
; SI-NEXT: s_or_b32 s0, s8, s0
; SI-NEXT: v_med3_i32 v0, s3, 0, 13
; SI-NEXT: s_or_b32 s2, s0, 0x1000
; SI-NEXT: v_readfirstlane_b32 s3, v0
; SI-NEXT: s_lshr_b32 s8, s2, s3
; SI-NEXT: s_lshl_b32 s3, s8, s3
; SI-NEXT: s_cmp_lg_u32 s3, s2
; SI-NEXT: s_cselect_b32 s2, 1, 0
; SI-NEXT: s_addk_i32 s1, 0xfc10
; SI-NEXT: s_lshl_b32 s3, s1, 12
; SI-NEXT: s_or_b32 s2, s8, s2
; SI-NEXT: s_or_b32 s3, s0, s3
; SI-NEXT: s_cmp_lt_i32 s1, 1
; SI-NEXT: s_cselect_b32 s2, s2, s3
; SI-NEXT: s_and_b32 s3, s2, 7
; SI-NEXT: s_cmp_gt_i32 s3, 5
; SI-NEXT: s_cselect_b32 s8, 1, 0
; SI-NEXT: s_cmp_eq_u32 s3, 3
; SI-NEXT: s_cselect_b32 s3, 1, 0
; SI-NEXT: s_or_b32 s3, s3, s8
; SI-NEXT: s_lshr_b32 s2, s2, 2
; SI-NEXT: s_add_i32 s2, s2, s3
; SI-NEXT: s_cmp_lt_i32 s1, 31
; SI-NEXT: s_cselect_b32 s2, s2, 0x7c00
; SI-NEXT: s_cmp_lg_u32 s0, 0
; SI-NEXT: s_cselect_b32 s0, s7, 0x7c00
; SI-NEXT: s_cmpk_eq_i32 s1, 0x40f
; SI-NEXT: s_cselect_b32 s0, s0, s2
; SI-NEXT: s_and_b32 s0, s0, 0x7fff
; SI-NEXT: s_and_b32 s1, s4, 0x8000
; SI-NEXT: s_or_b32 s0, s0, s1
; SI-NEXT: s_and_b32 s1, s6, 0x7fff
; SI-NEXT: s_and_b32 s2, s5, 0x8000
; SI-NEXT: s_or_b32 s1, s1, s2
; SI-NEXT: s_lshl_b32 s1, s1, 16
; SI-NEXT: s_or_b32 s0, s0, s1
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: s_copysign_out_v2f16_mag_v2f64_sign_v2f16:
; VI: ; %bb.0:
; VI-NEXT: s_lshr_b32 s5, s3, 8
; VI-NEXT: s_and_b32 s5, s5, 0xffe
; VI-NEXT: s_and_b32 s6, s3, 0x1ff
; VI-NEXT: s_or_b32 s2, s6, s2
; VI-NEXT: s_cselect_b64 s[6:7], -1, 0
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7]
; VI-NEXT: s_bfe_u32 s3, s3, 0xb0014
; VI-NEXT: v_readfirstlane_b32 s2, v0
; VI-NEXT: s_sub_i32 s6, 0x3f1, s3
; VI-NEXT: s_or_b32 s2, s5, s2
; VI-NEXT: v_med3_i32 v0, s6, 0, 13
; VI-NEXT: s_or_b32 s5, s2, 0x1000
; VI-NEXT: v_readfirstlane_b32 s6, v0
; VI-NEXT: s_lshr_b32 s7, s5, s6
; VI-NEXT: s_lshl_b32 s6, s7, s6
; VI-NEXT: s_cmp_lg_u32 s6, s5
; VI-NEXT: s_cselect_b32 s5, 1, 0
; VI-NEXT: s_addk_i32 s3, 0xfc10
; VI-NEXT: s_lshl_b32 s6, s3, 12
; VI-NEXT: s_or_b32 s5, s7, s5
; VI-NEXT: s_or_b32 s6, s2, s6
; VI-NEXT: s_cmp_lt_i32 s3, 1
; VI-NEXT: s_cselect_b32 s5, s5, s6
; VI-NEXT: s_and_b32 s6, s5, 7
; VI-NEXT: s_cmp_gt_i32 s6, 5
; VI-NEXT: s_cselect_b32 s7, 1, 0
; VI-NEXT: s_cmp_eq_u32 s6, 3
; VI-NEXT: s_cselect_b32 s6, 1, 0
; VI-NEXT: s_or_b32 s6, s6, s7
; VI-NEXT: s_lshr_b32 s5, s5, 2
; VI-NEXT: s_add_i32 s5, s5, s6
; VI-NEXT: s_cmp_lt_i32 s3, 31
; VI-NEXT: s_cselect_b32 s5, s5, 0x7c00
; VI-NEXT: s_cmp_lg_u32 s2, 0
; VI-NEXT: s_movk_i32 s6, 0x7e00
; VI-NEXT: s_cselect_b32 s2, s6, 0x7c00
; VI-NEXT: s_cmpk_eq_i32 s3, 0x40f
; VI-NEXT: s_cselect_b32 s2, s2, s5
; VI-NEXT: s_lshl_b32 s5, s2, 16
; VI-NEXT: s_lshr_b32 s2, s1, 8
; VI-NEXT: s_and_b32 s7, s2, 0xffe
; VI-NEXT: s_and_b32 s2, s1, 0x1ff
; VI-NEXT: s_or_b32 s0, s2, s0
; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
; VI-NEXT: s_bfe_u32 s1, s1, 0xb0014
; VI-NEXT: v_readfirstlane_b32 s0, v0
; VI-NEXT: s_sub_i32 s3, 0x3f1, s1
; VI-NEXT: s_or_b32 s0, s7, s0
; VI-NEXT: v_med3_i32 v0, s3, 0, 13
; VI-NEXT: s_or_b32 s2, s0, 0x1000
; VI-NEXT: v_readfirstlane_b32 s3, v0
; VI-NEXT: s_lshr_b32 s7, s2, s3
; VI-NEXT: s_lshl_b32 s3, s7, s3
; VI-NEXT: s_cmp_lg_u32 s3, s2
; VI-NEXT: s_cselect_b32 s2, 1, 0
; VI-NEXT: s_addk_i32 s1, 0xfc10
; VI-NEXT: s_lshl_b32 s3, s1, 12
; VI-NEXT: s_or_b32 s2, s7, s2
; VI-NEXT: s_or_b32 s3, s0, s3
; VI-NEXT: s_cmp_lt_i32 s1, 1
; VI-NEXT: s_cselect_b32 s2, s2, s3
; VI-NEXT: s_and_b32 s3, s2, 7
; VI-NEXT: s_cmp_gt_i32 s3, 5
; VI-NEXT: s_cselect_b32 s7, 1, 0
; VI-NEXT: s_cmp_eq_u32 s3, 3
; VI-NEXT: s_cselect_b32 s3, 1, 0
; VI-NEXT: s_or_b32 s3, s3, s7
; VI-NEXT: s_lshr_b32 s2, s2, 2
; VI-NEXT: s_add_i32 s2, s2, s3
; VI-NEXT: s_cmp_lt_i32 s1, 31
; VI-NEXT: s_cselect_b32 s2, s2, 0x7c00
; VI-NEXT: s_cmp_lg_u32 s0, 0
; VI-NEXT: s_cselect_b32 s0, s6, 0x7c00
; VI-NEXT: s_cmpk_eq_i32 s1, 0x40f
; VI-NEXT: s_cselect_b32 s0, s0, s2
; VI-NEXT: s_and_b32 s0, s0, 0x7fff
; VI-NEXT: s_or_b32 s0, s0, s5
; VI-NEXT: s_mov_b32 s1, 0x7fff7fff
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s4
; VI-NEXT: v_bfi_b32 v0, s1, v0, v1
; VI-NEXT: v_readfirstlane_b32 s0, v0
; VI-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_copysign_out_v2f16_mag_v2f64_sign_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_lshr_b32 s5, s3, 8
; GFX9-NEXT: s_and_b32 s5, s5, 0xffe
; GFX9-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX9-NEXT: s_or_b32 s2, s6, s2
; GFX9-NEXT: s_cselect_b64 s[6:7], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7]
; GFX9-NEXT: s_bfe_u32 s6, s3, 0xb0014
; GFX9-NEXT: v_readfirstlane_b32 s2, v0
; GFX9-NEXT: s_sub_i32 s7, 0x3f1, s6
; GFX9-NEXT: s_or_b32 s2, s5, s2
; GFX9-NEXT: v_med3_i32 v0, s7, 0, 13
; GFX9-NEXT: s_or_b32 s5, s2, 0x1000
; GFX9-NEXT: v_readfirstlane_b32 s7, v0
; GFX9-NEXT: s_lshr_b32 s8, s5, s7
; GFX9-NEXT: s_lshl_b32 s7, s8, s7
; GFX9-NEXT: s_cmp_lg_u32 s7, s5
; GFX9-NEXT: s_cselect_b32 s5, 1, 0
; GFX9-NEXT: s_addk_i32 s6, 0xfc10
; GFX9-NEXT: s_lshl_b32 s7, s6, 12
; GFX9-NEXT: s_or_b32 s5, s8, s5
; GFX9-NEXT: s_or_b32 s7, s2, s7
; GFX9-NEXT: s_cmp_lt_i32 s6, 1
; GFX9-NEXT: s_cselect_b32 s5, s5, s7
; GFX9-NEXT: s_and_b32 s7, s5, 7
; GFX9-NEXT: s_cmp_gt_i32 s7, 5
; GFX9-NEXT: s_cselect_b32 s8, 1, 0
; GFX9-NEXT: s_cmp_eq_u32 s7, 3
; GFX9-NEXT: s_cselect_b32 s7, 1, 0
; GFX9-NEXT: s_or_b32 s7, s7, s8
; GFX9-NEXT: s_lshr_b32 s5, s5, 2
; GFX9-NEXT: s_add_i32 s5, s5, s7
; GFX9-NEXT: s_cmp_lt_i32 s6, 31
; GFX9-NEXT: s_cselect_b32 s5, s5, 0x7c00
; GFX9-NEXT: s_cmp_lg_u32 s2, 0
; GFX9-NEXT: s_movk_i32 s7, 0x7e00
; GFX9-NEXT: s_cselect_b32 s2, s7, 0x7c00
; GFX9-NEXT: s_cmpk_eq_i32 s6, 0x40f
; GFX9-NEXT: s_cselect_b32 s2, s2, s5
; GFX9-NEXT: s_lshr_b32 s3, s3, 16
; GFX9-NEXT: s_and_b32 s3, s3, 0x8000
; GFX9-NEXT: s_or_b32 s5, s3, s2
; GFX9-NEXT: s_lshr_b32 s2, s1, 8
; GFX9-NEXT: s_and_b32 s6, s2, 0xffe
; GFX9-NEXT: s_and_b32 s2, s1, 0x1ff
; GFX9-NEXT: s_or_b32 s0, s2, s0
; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: s_bfe_u32 s3, s1, 0xb0014
; GFX9-NEXT: s_or_b32 s0, s6, s0
; GFX9-NEXT: s_sub_i32 s6, 0x3f1, s3
; GFX9-NEXT: v_med3_i32 v0, s6, 0, 13
; GFX9-NEXT: s_or_b32 s2, s0, 0x1000
; GFX9-NEXT: v_readfirstlane_b32 s6, v0
; GFX9-NEXT: s_lshr_b32 s8, s2, s6
; GFX9-NEXT: s_lshl_b32 s6, s8, s6
; GFX9-NEXT: s_cmp_lg_u32 s6, s2
; GFX9-NEXT: s_cselect_b32 s2, 1, 0
; GFX9-NEXT: s_addk_i32 s3, 0xfc10
; GFX9-NEXT: s_lshl_b32 s6, s3, 12
; GFX9-NEXT: s_or_b32 s2, s8, s2
; GFX9-NEXT: s_or_b32 s6, s0, s6
; GFX9-NEXT: s_cmp_lt_i32 s3, 1
; GFX9-NEXT: s_cselect_b32 s2, s2, s6
; GFX9-NEXT: s_and_b32 s6, s2, 7
; GFX9-NEXT: s_cmp_gt_i32 s6, 5
; GFX9-NEXT: s_cselect_b32 s8, 1, 0
; GFX9-NEXT: s_cmp_eq_u32 s6, 3
; GFX9-NEXT: s_cselect_b32 s6, 1, 0
; GFX9-NEXT: s_or_b32 s6, s6, s8
; GFX9-NEXT: s_lshr_b32 s2, s2, 2
; GFX9-NEXT: s_add_i32 s2, s2, s6
; GFX9-NEXT: s_cmp_lt_i32 s3, 31
; GFX9-NEXT: s_cselect_b32 s2, s2, 0x7c00
; GFX9-NEXT: s_cmp_lg_u32 s0, 0
; GFX9-NEXT: s_cselect_b32 s0, s7, 0x7c00
; GFX9-NEXT: s_cmpk_eq_i32 s3, 0x40f
; GFX9-NEXT: s_cselect_b32 s0, s0, s2
; GFX9-NEXT: s_lshr_b32 s1, s1, 16
; GFX9-NEXT: s_and_b32 s1, s1, 0x8000
; GFX9-NEXT: s_or_b32 s0, s1, s0
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s5
; GFX9-NEXT: s_mov_b32 s1, 0x7fff7fff
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: v_bfi_b32 v0, s1, v0, v1
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_copysign_out_v2f16_mag_v2f64_sign_v2f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_lshr_b32 s5, s3, 8
; GFX11-NEXT: s_and_b32 s6, s3, 0x1ff
; GFX11-NEXT: s_and_b32 s5, s5, 0xffe
; GFX11-NEXT: s_or_b32 s2, s6, s2
; GFX11-NEXT: s_cselect_b32 s2, -1, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2
; GFX11-NEXT: s_bfe_u32 s2, s3, 0xb0014
; GFX11-NEXT: s_sub_i32 s6, 0x3f1, s2
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_med3_i32 v1, s6, 0, 13
; GFX11-NEXT: v_readfirstlane_b32 s6, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_readfirstlane_b32 s7, v1
; GFX11-NEXT: s_or_b32 s5, s5, s6
; GFX11-NEXT: s_or_b32 s6, s5, 0x1000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_lshr_b32 s8, s6, s7
; GFX11-NEXT: s_lshl_b32 s7, s8, s7
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_cmp_lg_u32 s7, s6
; GFX11-NEXT: s_cselect_b32 s6, 1, 0
; GFX11-NEXT: s_addk_i32 s2, 0xfc10
; GFX11-NEXT: s_or_b32 s6, s8, s6
; GFX11-NEXT: s_lshl_b32 s7, s2, 12
; GFX11-NEXT: s_or_b32 s7, s5, s7
; GFX11-NEXT: s_cmp_lt_i32 s2, 1
; GFX11-NEXT: s_cselect_b32 s6, s6, s7
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s7, s6, 7
; GFX11-NEXT: s_cmp_gt_i32 s7, 5
; GFX11-NEXT: s_cselect_b32 s8, 1, 0
; GFX11-NEXT: s_cmp_eq_u32 s7, 3
; GFX11-NEXT: s_cselect_b32 s7, 1, 0
; GFX11-NEXT: s_lshr_b32 s6, s6, 2
; GFX11-NEXT: s_or_b32 s7, s7, s8
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_add_i32 s6, s6, s7
; GFX11-NEXT: s_cmp_lt_i32 s2, 31
; GFX11-NEXT: s_movk_i32 s7, 0x7e00
; GFX11-NEXT: s_cselect_b32 s6, s6, 0x7c00
; GFX11-NEXT: s_cmp_lg_u32 s5, 0
; GFX11-NEXT: s_cselect_b32 s5, s7, 0x7c00
; GFX11-NEXT: s_cmpk_eq_i32 s2, 0x40f
; GFX11-NEXT: s_cselect_b32 s2, s5, s6
; GFX11-NEXT: s_lshr_b32 s3, s3, 16
; GFX11-NEXT: s_lshr_b32 s5, s1, 8
; GFX11-NEXT: s_and_b32 s3, s3, 0x8000
; GFX11-NEXT: s_and_b32 s6, s1, 0x1ff
; GFX11-NEXT: s_and_b32 s5, s5, 0xffe
; GFX11-NEXT: s_or_b32 s2, s3, s2
; GFX11-NEXT: s_or_b32 s0, s6, s0
; GFX11-NEXT: s_cselect_b32 s0, -1, 0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-NEXT: s_bfe_u32 s0, s1, 0xb0014
; GFX11-NEXT: s_sub_i32 s3, 0x3f1, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_med3_i32 v1, s3, 0, 13
; GFX11-NEXT: v_readfirstlane_b32 s3, v0
; GFX11-NEXT: v_mov_b32_e32 v0, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_readfirstlane_b32 s6, v1
; GFX11-NEXT: s_or_b32 s3, s5, s3
; GFX11-NEXT: s_or_b32 s5, s3, 0x1000
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_lshr_b32 s8, s5, s6
; GFX11-NEXT: s_lshl_b32 s6, s8, s6
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_cmp_lg_u32 s6, s5
; GFX11-NEXT: s_cselect_b32 s5, 1, 0
; GFX11-NEXT: s_addk_i32 s0, 0xfc10
; GFX11-NEXT: s_or_b32 s5, s8, s5
; GFX11-NEXT: s_lshl_b32 s6, s0, 12
; GFX11-NEXT: s_or_b32 s6, s3, s6
; GFX11-NEXT: s_cmp_lt_i32 s0, 1
; GFX11-NEXT: s_cselect_b32 s5, s5, s6
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s6, s5, 7
; GFX11-NEXT: s_cmp_gt_i32 s6, 5
; GFX11-NEXT: s_cselect_b32 s8, 1, 0
; GFX11-NEXT: s_cmp_eq_u32 s6, 3
; GFX11-NEXT: s_cselect_b32 s6, 1, 0
; GFX11-NEXT: s_lshr_b32 s5, s5, 2
; GFX11-NEXT: s_or_b32 s6, s6, s8
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_add_i32 s5, s5, s6
; GFX11-NEXT: s_cmp_lt_i32 s0, 31
; GFX11-NEXT: s_cselect_b32 s5, s5, 0x7c00
; GFX11-NEXT: s_cmp_lg_u32 s3, 0
; GFX11-NEXT: s_cselect_b32 s3, s7, 0x7c00
; GFX11-NEXT: s_cmpk_eq_i32 s0, 0x40f
; GFX11-NEXT: s_cselect_b32 s0, s3, s5
; GFX11-NEXT: s_lshr_b32 s1, s1, 16
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s1, s1, 0x8000
; GFX11-NEXT: s_or_b32 s0, s1, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s2
; GFX11-NEXT: v_bfi_b32 v0, 0x7fff7fff, s0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%mag.trunc = fptrunc <2 x double> %mag to <2 x half>
%result = call <2 x half> @llvm.copysign.v2f16(<2 x half> %mag.trunc, <2 x half> %sign)
%cast = bitcast <2 x half> %result to i32
ret i32 %cast
}
; Scalar (amdgpu_ps, inreg arguments) copysign on <2 x half> where the sign
; operand arrives as <2 x float> and is narrowed with fptrunc first. The packed
; result is bitcast to i32 and returned.
; In every expected sequence both sign elements are converted with
; v_cvt_f16_f32. The tahiti sequence then merges each half separately with
; and/or instructions; the other targets merge with a single v_bfi_b32 using
; the 0x7fff7fff mask.
define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f16_sign_v2f32(<2 x half> inreg %mag, <2 x float> inreg %sign) {
; SI-LABEL: s_copysign_out_v2f16_mag_v2f16_sign_v2f32:
; SI: ; %bb.0:
; SI-NEXT: v_cvt_f16_f32_e32 v1, s2
; SI-NEXT: v_cvt_f16_f32_e32 v0, s1
; SI-NEXT: s_and_b32 s1, s0, 0x7fff
; SI-NEXT: s_bfe_u32 s0, s0, 0xf0010
; SI-NEXT: v_and_b32_e32 v1, 0x8000, v1
; SI-NEXT: v_and_b32_e32 v0, 0x8000, v0
; SI-NEXT: v_or_b32_e32 v1, s0, v1
; SI-NEXT: v_or_b32_e32 v0, s1, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_readfirstlane_b32 s0, v0
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: s_copysign_out_v2f16_mag_v2f16_sign_v2f32:
; VI: ; %bb.0:
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_cvt_f16_f32_e32 v1, s1
; VI-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: s_mov_b32 s1, 0x7fff7fff
; VI-NEXT: v_or_b32_e32 v0, v1, v0
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: v_bfi_b32 v0, s1, v1, v0
; VI-NEXT: v_readfirstlane_b32 s0, v0
; VI-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_copysign_out_v2f16_mag_v2f16_sign_v2f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_cvt_f16_f32_e32 v0, s2
; GFX9-NEXT: v_cvt_f16_f32_e32 v1, s1
; GFX9-NEXT: s_mov_b32 s1, 0x7fff7fff
; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_bfi_b32 v0, s1, v1, v0
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX11-TRUE16-LABEL: s_copysign_out_v2f16_mag_v2f16_sign_v2f32:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, s2
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, s1
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, s0, v0
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-TRUE16-NEXT: ; return to shader part epilog
;
; GFX11-FAKE16-LABEL: s_copysign_out_v2f16_mag_v2f16_sign_v2f32:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, s2
; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, s1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v1, v0
; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, s0, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-FAKE16-NEXT: ; return to shader part epilog
  %sign.trunc = fptrunc <2 x float> %sign to <2 x half>
  %out = call <2 x half> @llvm.copysign.v2f16(<2 x half> %mag, <2 x half> %sign.trunc)
  %cast = bitcast <2 x half> %out to i32
  ret i32 %cast
}
; Scalar (amdgpu_ps, inreg arguments) copysign on <2 x half> where the sign
; operand arrives as <2 x double> and is narrowed with fptrunc first. The packed
; result is bitcast to i32 and returned.
; No floating-point conversion instruction appears in any expected sequence:
; the sign source is built directly from s2 and s4 (presumably the high dwords
; of the two doubles; confirm against the argument lowering).
; TODO(review) - the tahiti sequence selects bit 31 of s2/s4 (mask 0x80000000,
; then shift right by 16), whereas the tonga sequence keeps bit 15 of s2 (mask
; 0x8000) and shifts s4 left by 16, and the gfx900/gfx1100 sequences pack the
; low halves of s2/s4 with s_pack_ll_b32_b16. These appear to read different
; bits of the same registers; confirm the newer-target output is intended.
define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f16_sign_v2f64(<2 x half> inreg %mag, <2 x double> inreg %sign) {
; SI-LABEL: s_copysign_out_v2f16_mag_v2f16_sign_v2f64:
; SI: ; %bb.0:
; SI-NEXT: s_and_b32 s2, s2, 0x80000000
; SI-NEXT: s_and_b32 s1, s0, 0x7fff
; SI-NEXT: s_lshr_b32 s2, s2, 16
; SI-NEXT: s_or_b32 s1, s1, s2
; SI-NEXT: s_and_b32 s2, s4, 0x80000000
; SI-NEXT: s_lshr_b32 s2, s2, 16
; SI-NEXT: s_bfe_u32 s0, s0, 0xf0010
; SI-NEXT: s_or_b32 s0, s0, s2
; SI-NEXT: s_lshl_b32 s0, s0, 16
; SI-NEXT: s_or_b32 s0, s1, s0
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: s_copysign_out_v2f16_mag_v2f16_sign_v2f64:
; VI: ; %bb.0:
; VI-NEXT: s_lshl_b32 s1, s4, 16
; VI-NEXT: s_and_b32 s2, s2, 0x8000
; VI-NEXT: s_or_b32 s1, s2, s1
; VI-NEXT: s_mov_b32 s2, 0x7fff7fff
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_bfi_b32 v0, s2, v0, v1
; VI-NEXT: v_readfirstlane_b32 s0, v0
; VI-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_copysign_out_v2f16_mag_v2f16_sign_v2f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_pack_ll_b32_b16 s1, s2, s4
; GFX9-NEXT: s_mov_b32 s2, 0x7fff7fff
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_bfi_b32 v0, s2, v0, v1
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_copysign_out_v2f16_mag_v2f16_sign_v2f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_pack_ll_b32_b16 s1, s2, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mov_b32_e32 v0, s1
; GFX11-NEXT: v_bfi_b32 v0, 0x7fff7fff, s0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
  %sign.trunc = fptrunc <2 x double> %sign to <2 x half>
  %out = call <2 x half> @llvm.copysign.v2f16(<2 x half> %mag, <2 x half> %sign.trunc)
  %cast = bitcast <2 x half> %out to i32
  ret i32 %cast
}
; Vector-register copysign producing <3 x float>: the magnitude is a
; <3 x half> widened with fpext, the sign is already <3 x float>.
; Every expected sequence converts the three magnitude elements with
; v_cvt_f32_f16 and then issues one v_bfi_b32 per element against the matching
; sign register (v2, v3, v4). The mask operand is s4 (set by s_brev_b32 s4, -2)
; on tahiti/tonga/gfx900 and the literal 0x7fffffff on gfx1100.
define <3 x float> @v_copysign_out_v3f32_mag_v3f16_sign_v3f32(<3 x half> %mag, <3 x float> %sign) {
; SI-LABEL: v_copysign_out_v3f32_mag_v3f16_sign_v3f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SI-NEXT: s_brev_b32 s4, -2
; SI-NEXT: v_bfi_b32 v0, s4, v0, v2
; SI-NEXT: v_bfi_b32 v2, s4, v1, v4
; SI-NEXT: v_bfi_b32 v1, s4, v5, v3
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_copysign_out_v3f32_mag_v3f16_sign_v3f32:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v5, v0
; VI-NEXT: v_cvt_f32_f16_e32 v1, v1
; VI-NEXT: v_cvt_f32_f16_sdwa v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: s_brev_b32 s4, -2
; VI-NEXT: v_bfi_b32 v0, s4, v5, v2
; VI-NEXT: v_bfi_b32 v2, s4, v1, v4
; VI-NEXT: v_bfi_b32 v1, s4, v6, v3
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_out_v3f32_mag_v3f16_sign_v3f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v0
; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX9-NEXT: v_cvt_f32_f16_sdwa v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-NEXT: s_brev_b32 s4, -2
; GFX9-NEXT: v_bfi_b32 v0, s4, v5, v2
; GFX9-NEXT: v_bfi_b32 v2, s4, v1, v4
; GFX9-NEXT: v_bfi_b32 v1, s4, v6, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_copysign_out_v3f32_mag_v3f16_sign_v3f32:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v5, v0.l
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v6, v0.h
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v7, v1.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, v5, v2
; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v6, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0x7fffffff, v7, v4
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_copysign_out_v3f32_mag_v3f16_sign_v3f32:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v6, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v5, v5
; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v2
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-FAKE16-NEXT: v_bfi_b32 v2, 0x7fffffff, v6, v4
; GFX11-FAKE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v5, v3
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
  %mag.ext = fpext <3 x half> %mag to <3 x float>
  %out = call <3 x float> @llvm.copysign.v3f32(<3 x float> %mag.ext, <3 x float> %sign)
  ret <3 x float> %out
}
; Vector-register copysign producing <3 x float>: the magnitude is already
; <3 x float>, the sign is a <3 x half> widened with fpext.
; Only the tahiti sequence actually converts the sign elements
; (v_cvt_f32_f16). The tonga, gfx900 and gfx1100 fake16 sequences instead move
; each 16-bit sign element into the upper half of a dword with 16-bit shifts,
; and the gfx1100 true16 sequence does the same with v_mov_b16 into .h halves.
; All sequences finish with one v_bfi_b32 per element.
define <3 x float> @v_copysign_out_v3f32_mag_v3f32_sign_v3f16(<3 x float> %mag, <3 x half> %sign) {
; SI-LABEL: v_copysign_out_v3f32_mag_v3f32_sign_v3f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SI-NEXT: s_brev_b32 s4, -2
; SI-NEXT: v_bfi_b32 v0, s4, v0, v3
; SI-NEXT: v_bfi_b32 v2, s4, v2, v4
; SI-NEXT: v_bfi_b32 v1, s4, v1, v5
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_copysign_out_v3f32_mag_v3f32_sign_v3f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v3
; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; VI-NEXT: s_brev_b32 s4, -2
; VI-NEXT: v_bfi_b32 v0, s4, v0, v3
; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v4
; VI-NEXT: v_bfi_b32 v2, s4, v2, v3
; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v5
; VI-NEXT: v_bfi_b32 v1, s4, v1, v3
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_out_v3f32_mag_v3f32_sign_v3f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX9-NEXT: s_brev_b32 s4, -2
; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v4
; GFX9-NEXT: v_bfi_b32 v2, s4, v2, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v5
; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_copysign_out_v3f32_mag_v3f32_sign_v3f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v3.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v4.l
; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v4
; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0x7fffffff, v2, v5
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_copysign_out_v3f32_mag_v3f32_sign_v3f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v3
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-FAKE16-NEXT: v_bfi_b32 v2, 0x7fffffff, v2, v4
; GFX11-FAKE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v5
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
  %sign.ext = fpext <3 x half> %sign to <3 x float>
  %out = call <3 x float> @llvm.copysign.v3f32(<3 x float> %mag, <3 x float> %sign.ext)
  ret <3 x float> %out
}
; Vector-register copysign producing <3 x double>: the magnitude is already
; <3 x double>, the sign is a <3 x half> widened with fpext.
; In every expected sequence the only v_bfi_b32 destinations are v1, v3 and v5
; (presumably the high dwords of the three doubles; confirm against the
; argument lowering); v0, v2 and v4 are never written.
; As in the <3 x float> variant, only the tahiti sequence converts the sign
; with v_cvt_f32_f16; the other targets reposition the 16-bit sign elements
; with shifts or v_mov_b16.
define <3 x double> @v_copysign_out_v3f64_mag_v3f64_sign_v3f16(<3 x double> %mag, <3 x half> %sign) {
; SI-LABEL: v_copysign_out_v3f64_mag_v3f64_sign_v3f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v8, v6
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
; SI-NEXT: s_brev_b32 s4, -2
; SI-NEXT: v_bfi_b32 v1, s4, v1, v8
; SI-NEXT: v_bfi_b32 v5, s4, v5, v7
; SI-NEXT: v_bfi_b32 v3, s4, v3, v6
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_copysign_out_v3f64_mag_v3f64_sign_v3f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v6
; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; VI-NEXT: s_brev_b32 s4, -2
; VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; VI-NEXT: v_bfi_b32 v1, s4, v1, v8
; VI-NEXT: v_bfi_b32 v5, s4, v5, v7
; VI-NEXT: v_bfi_b32 v3, s4, v3, v6
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_out_v3f64_mag_v3f64_sign_v3f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v6
; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GFX9-NEXT: s_brev_b32 s4, -2
; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v8
; GFX9-NEXT: v_bfi_b32 v5, s4, v5, v7
; GFX9-NEXT: v_bfi_b32 v3, s4, v3, v6
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_copysign_out_v3f64_mag_v3f64_sign_v3f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v6.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v7.l
; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v6
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v7
; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v8
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_copysign_out_v3f64_mag_v3f64_sign_v3f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v6
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; GFX11-FAKE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v6
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-FAKE16-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v7
; GFX11-FAKE16-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v8
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
  %sign.ext = fpext <3 x half> %sign to <3 x double>
  %out = call <3 x double> @llvm.copysign.v3f64(<3 x double> %mag, <3 x double> %sign.ext)
  ret <3 x double> %out
}
; Vector-register copysign producing <3 x half>: the magnitude is a
; <3 x float> narrowed with fptrunc, the sign is already <3 x half>.
; Every expected sequence converts the three magnitude elements with
; v_cvt_f16_f32. The tahiti sequence then merges each 16-bit element
; separately using 0x7fff / 0x8000 masks and or; the other targets pack the
; first two converted elements into one dword and issue two v_bfi_b32
; instructions with the 0x7fff7fff mask, one per result register.
define <3 x half> @v_copysign_out_v3f16_mag_v3f32_sign_v3f16(<3 x float> %mag, <3 x half> %sign) {
; SI-LABEL: v_copysign_out_v3f16_mag_v3f32_sign_v3f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v5, v1
; SI-NEXT: v_and_b32_e32 v1, 0x8000, v4
; SI-NEXT: v_and_b32_e32 v2, 0x7fff, v2
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: v_and_b32_e32 v2, 0x8000, v3
; SI-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; SI-NEXT: v_or_b32_e32 v0, v0, v2
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3
; SI-NEXT: v_and_b32_e32 v2, 0x8000, v2
; SI-NEXT: v_and_b32_e32 v3, 0x7fff, v5
; SI-NEXT: v_or_b32_e32 v2, v3, v2
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v0, v0, v2
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_copysign_out_v3f16_mag_v3f32_sign_v3f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-NEXT: v_cvt_f16_f32_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v2, v2
; VI-NEXT: s_mov_b32 s4, 0x7fff7fff
; VI-NEXT: v_or_b32_e32 v0, v0, v5
; VI-NEXT: v_bfi_b32 v1, s4, v2, v4
; VI-NEXT: v_bfi_b32 v0, s4, v0, v3
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_out_v3f16_mag_v3f32_sign_v3f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cvt_f16_f32_e32 v5, v1
; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v2
; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff
; GFX9-NEXT: v_pack_b32_f16 v0, v0, v5
; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v4
; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_copysign_out_v3f16_mag_v3f32_sign_v3f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.h, v1
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, v0
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v2.l, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v1, v3
; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff7fff, v2, v4
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_copysign_out_v3f16_mag_v3f32_sign_v3f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v2
; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v4
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
  %mag.trunc = fptrunc <3 x float> %mag to <3 x half>
  %out = call <3 x half> @llvm.copysign.v3f16(<3 x half> %mag.trunc, <3 x half> %sign)
  ret <3 x half> %out
}
define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag, <3 x half> %sign) {
; SI-LABEL: v_copysign_out_v3f16_mag_v3f64_sign_v3f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_and_b32_e32 v9, 0x1ff, v3
; SI-NEXT: v_or_b32_e32 v2, v9, v2
; SI-NEXT: v_lshrrev_b32_e32 v8, 8, v3
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; SI-NEXT: v_and_b32_e32 v8, 0xffe, v8
; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; SI-NEXT: v_bfe_u32 v3, v3, 20, 11
; SI-NEXT: s_movk_i32 s4, 0x3f1
; SI-NEXT: v_or_b32_e32 v2, v8, v2
; SI-NEXT: v_sub_i32_e32 v9, vcc, s4, v3
; SI-NEXT: v_or_b32_e32 v8, 0x1000, v2
; SI-NEXT: v_med3_i32 v9, v9, 0, 13
; SI-NEXT: v_lshrrev_b32_e32 v10, v9, v8
; SI-NEXT: v_lshlrev_b32_e32 v9, v9, v10
; SI-NEXT: v_cmp_ne_u32_e32 vcc, v9, v8
; SI-NEXT: s_movk_i32 s5, 0xfc10
; SI-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; SI-NEXT: v_add_i32_e32 v3, vcc, s5, v3
; SI-NEXT: v_lshlrev_b32_e32 v9, 12, v3
; SI-NEXT: v_or_b32_e32 v8, v10, v8
; SI-NEXT: v_or_b32_e32 v9, v2, v9
; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3
; SI-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc
; SI-NEXT: v_and_b32_e32 v9, 7, v8
; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v9
; SI-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v9
; SI-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; SI-NEXT: v_or_b32_e32 v9, v9, v10
; SI-NEXT: v_lshrrev_b32_e32 v8, 2, v8
; SI-NEXT: v_add_i32_e32 v8, vcc, v8, v9
; SI-NEXT: v_mov_b32_e32 v9, 0x7c00
; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3
; SI-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc
; SI-NEXT: v_mov_b32_e32 v10, 0x7e00
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; SI-NEXT: s_movk_i32 s6, 0x40f
; SI-NEXT: v_cndmask_b32_e32 v2, v9, v10, vcc
; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v3
; SI-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc
; SI-NEXT: v_and_b32_e32 v8, 0x1ff, v1
; SI-NEXT: v_or_b32_e32 v0, v8, v0
; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v1
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT: v_and_b32_e32 v3, 0xffe, v3
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT: v_bfe_u32 v1, v1, 20, 11
; SI-NEXT: v_or_b32_e32 v0, v3, v0
; SI-NEXT: v_sub_i32_e32 v8, vcc, s4, v1
; SI-NEXT: v_or_b32_e32 v3, 0x1000, v0
; SI-NEXT: v_med3_i32 v8, v8, 0, 13
; SI-NEXT: v_lshrrev_b32_e32 v11, v8, v3
; SI-NEXT: v_lshlrev_b32_e32 v8, v8, v11
; SI-NEXT: v_cmp_ne_u32_e32 vcc, v8, v3
; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; SI-NEXT: v_add_i32_e32 v1, vcc, s5, v1
; SI-NEXT: v_lshlrev_b32_e32 v8, 12, v1
; SI-NEXT: v_or_b32_e32 v3, v11, v3
; SI-NEXT: v_or_b32_e32 v8, v0, v8
; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v1
; SI-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc
; SI-NEXT: v_and_b32_e32 v8, 7, v3
; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v8
; SI-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v8
; SI-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; SI-NEXT: v_or_b32_e32 v8, v8, v11
; SI-NEXT: v_lshrrev_b32_e32 v3, 2, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, v3, v8
; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v1
; SI-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc
; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v1
; SI-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
; SI-NEXT: v_and_b32_e32 v3, 0x1ff, v5
; SI-NEXT: v_or_b32_e32 v3, v3, v4
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v5
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
; SI-NEXT: v_and_b32_e32 v1, 0xffe, v1
; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; SI-NEXT: v_bfe_u32 v4, v5, 20, 11
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_sub_i32_e32 v5, vcc, s4, v4
; SI-NEXT: v_or_b32_e32 v3, 0x1000, v1
; SI-NEXT: v_med3_i32 v5, v5, 0, 13
; SI-NEXT: v_lshrrev_b32_e32 v8, v5, v3
; SI-NEXT: v_lshlrev_b32_e32 v5, v5, v8
; SI-NEXT: v_cmp_ne_u32_e32 vcc, v5, v3
; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; SI-NEXT: v_add_i32_e32 v4, vcc, s5, v4
; SI-NEXT: v_lshlrev_b32_e32 v5, 12, v4
; SI-NEXT: v_or_b32_e32 v3, v8, v3
; SI-NEXT: v_or_b32_e32 v5, v1, v5
; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v4
; SI-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; SI-NEXT: v_and_b32_e32 v5, 7, v3
; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v5
; SI-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v5
; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; SI-NEXT: v_or_b32_e32 v5, v5, v8
; SI-NEXT: v_lshrrev_b32_e32 v3, 2, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, v3, v5
; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v4
; SI-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
; SI-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc
; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v4
; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; SI-NEXT: v_and_b32_e32 v1, 0x7fff, v1
; SI-NEXT: v_and_b32_e32 v3, 0x8000, v7
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; SI-NEXT: v_and_b32_e32 v3, 0x8000, v6
; SI-NEXT: v_or_b32_e32 v0, v0, v3
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v6
; SI-NEXT: v_and_b32_e32 v2, 0x7fff, v2
; SI-NEXT: v_and_b32_e32 v3, 0x8000, v3
; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v0, v0, v2
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_copysign_out_v3f16_mag_v3f64_sign_v3f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_and_b32_e32 v9, 0x1ff, v5
; VI-NEXT: v_or_b32_e32 v4, v9, v4
; VI-NEXT: v_lshrrev_b32_e32 v8, 8, v5
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
; VI-NEXT: v_and_b32_e32 v8, 0xffe, v8
; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; VI-NEXT: v_bfe_u32 v5, v5, 20, 11
; VI-NEXT: s_movk_i32 s4, 0x3f1
; VI-NEXT: v_or_b32_e32 v4, v8, v4
; VI-NEXT: v_sub_u32_e32 v9, vcc, s4, v5
; VI-NEXT: v_or_b32_e32 v8, 0x1000, v4
; VI-NEXT: v_med3_i32 v9, v9, 0, 13
; VI-NEXT: v_lshrrev_b32_e32 v10, v9, v8
; VI-NEXT: v_lshlrev_b32_e32 v9, v9, v10
; VI-NEXT: v_cmp_ne_u32_e32 vcc, v9, v8
; VI-NEXT: s_movk_i32 s5, 0xfc10
; VI-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; VI-NEXT: v_add_u32_e32 v5, vcc, s5, v5
; VI-NEXT: v_lshlrev_b32_e32 v9, 12, v5
; VI-NEXT: v_or_b32_e32 v8, v10, v8
; VI-NEXT: v_or_b32_e32 v9, v4, v9
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v5
; VI-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc
; VI-NEXT: v_and_b32_e32 v9, 7, v8
; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v9
; VI-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v9
; VI-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; VI-NEXT: v_or_b32_e32 v9, v9, v10
; VI-NEXT: v_lshrrev_b32_e32 v8, 2, v8
; VI-NEXT: v_add_u32_e32 v8, vcc, v8, v9
; VI-NEXT: v_mov_b32_e32 v9, 0x7c00
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v5
; VI-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc
; VI-NEXT: v_mov_b32_e32 v10, 0x7e00
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
; VI-NEXT: s_movk_i32 s6, 0x40f
; VI-NEXT: v_cndmask_b32_e32 v4, v9, v10, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v5
; VI-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
; VI-NEXT: v_and_b32_e32 v8, 0x1ff, v1
; VI-NEXT: v_or_b32_e32 v0, v8, v0
; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v1
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT: v_and_b32_e32 v5, 0xffe, v5
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; VI-NEXT: v_bfe_u32 v1, v1, 20, 11
; VI-NEXT: v_or_b32_e32 v0, v5, v0
; VI-NEXT: v_sub_u32_e32 v8, vcc, s4, v1
; VI-NEXT: v_or_b32_e32 v5, 0x1000, v0
; VI-NEXT: v_med3_i32 v8, v8, 0, 13
; VI-NEXT: v_lshrrev_b32_e32 v11, v8, v5
; VI-NEXT: v_lshlrev_b32_e32 v8, v8, v11
; VI-NEXT: v_cmp_ne_u32_e32 vcc, v8, v5
; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; VI-NEXT: v_add_u32_e32 v1, vcc, s5, v1
; VI-NEXT: v_lshlrev_b32_e32 v8, 12, v1
; VI-NEXT: v_or_b32_e32 v5, v11, v5
; VI-NEXT: v_or_b32_e32 v8, v0, v8
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v1
; VI-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc
; VI-NEXT: v_and_b32_e32 v8, 7, v5
; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v8
; VI-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v8
; VI-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; VI-NEXT: v_or_b32_e32 v8, v8, v11
; VI-NEXT: v_lshrrev_b32_e32 v5, 2, v5
; VI-NEXT: v_add_u32_e32 v5, vcc, v5, v8
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v1
; VI-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v1
; VI-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
; VI-NEXT: v_and_b32_e32 v5, 0x1ff, v3
; VI-NEXT: v_or_b32_e32 v2, v5, v2
; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v3
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; VI-NEXT: v_and_b32_e32 v1, 0xffe, v1
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; VI-NEXT: v_bfe_u32 v3, v3, 20, 11
; VI-NEXT: v_or_b32_e32 v1, v1, v2
; VI-NEXT: v_sub_u32_e32 v5, vcc, s4, v3
; VI-NEXT: v_or_b32_e32 v2, 0x1000, v1
; VI-NEXT: v_med3_i32 v5, v5, 0, 13
; VI-NEXT: v_lshrrev_b32_e32 v8, v5, v2
; VI-NEXT: v_lshlrev_b32_e32 v5, v5, v8
; VI-NEXT: v_cmp_ne_u32_e32 vcc, v5, v2
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; VI-NEXT: v_add_u32_e32 v3, vcc, s5, v3
; VI-NEXT: v_lshlrev_b32_e32 v5, 12, v3
; VI-NEXT: v_or_b32_e32 v2, v8, v2
; VI-NEXT: v_or_b32_e32 v5, v1, v5
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3
; VI-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
; VI-NEXT: v_and_b32_e32 v5, 7, v2
; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v5
; VI-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v5
; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; VI-NEXT: v_or_b32_e32 v5, v5, v8
; VI-NEXT: v_lshrrev_b32_e32 v2, 2, v2
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v5
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3
; VI-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
; VI-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v3
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: s_mov_b32 s4, 0x7fff7fff
; VI-NEXT: v_bfi_b32 v0, s4, v0, v6
; VI-NEXT: v_bfi_b32 v1, s4, v4, v7
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_out_v3f16_mag_v3f64_sign_v3f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_movk_i32 s4, 0x1ff
; GFX9-NEXT: v_and_or_b32 v4, v5, s4, v4
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v5
; GFX9-NEXT: s_movk_i32 s5, 0xffe
; GFX9-NEXT: v_bfe_u32 v5, v5, 20, 11
; GFX9-NEXT: v_and_or_b32 v4, v8, s5, v4
; GFX9-NEXT: v_sub_u32_e32 v9, 0x3f1, v5
; GFX9-NEXT: v_or_b32_e32 v8, 0x1000, v4
; GFX9-NEXT: v_med3_i32 v9, v9, 0, 13
; GFX9-NEXT: v_lshrrev_b32_e32 v10, v9, v8
; GFX9-NEXT: v_lshlrev_b32_e32 v9, v9, v10
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v9, v8
; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX9-NEXT: v_add_u32_e32 v5, 0xfffffc10, v5
; GFX9-NEXT: v_or_b32_e32 v8, v10, v8
; GFX9-NEXT: v_lshl_or_b32 v9, v5, 12, v4
; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 1, v5
; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc
; GFX9-NEXT: v_and_b32_e32 v9, 7, v8
; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 5, v9
; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v9
; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GFX9-NEXT: v_or_b32_e32 v9, v9, v10
; GFX9-NEXT: v_lshrrev_b32_e32 v8, 2, v8
; GFX9-NEXT: v_add_u32_e32 v8, v8, v9
; GFX9-NEXT: v_mov_b32_e32 v9, 0x7c00
; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 31, v5
; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc
; GFX9-NEXT: v_mov_b32_e32 v10, 0x7e00
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
; GFX9-NEXT: s_movk_i32 s6, 0x40f
; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v10, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s6, v5
; GFX9-NEXT: v_and_or_b32 v0, v1, s4, v0
; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1
; GFX9-NEXT: v_bfe_u32 v8, v1, 20, 11
; GFX9-NEXT: v_and_or_b32 v0, v5, s5, v0
; GFX9-NEXT: v_sub_u32_e32 v11, 0x3f1, v8
; GFX9-NEXT: v_or_b32_e32 v5, 0x1000, v0
; GFX9-NEXT: v_med3_i32 v11, v11, 0, 13
; GFX9-NEXT: v_lshrrev_b32_e32 v12, v11, v5
; GFX9-NEXT: v_lshlrev_b32_e32 v11, v11, v12
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v11, v5
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GFX9-NEXT: v_add_u32_e32 v8, 0xfffffc10, v8
; GFX9-NEXT: v_or_b32_e32 v5, v12, v5
; GFX9-NEXT: v_lshl_or_b32 v11, v8, 12, v0
; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 1, v8
; GFX9-NEXT: v_cndmask_b32_e32 v5, v11, v5, vcc
; GFX9-NEXT: v_and_b32_e32 v11, 7, v5
; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 5, v11
; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v11
; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX9-NEXT: v_or_b32_e32 v11, v11, v12
; GFX9-NEXT: v_lshrrev_b32_e32 v5, 2, v5
; GFX9-NEXT: v_add_u32_e32 v5, v5, v11
; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 31, v8
; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v10, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s6, v8
; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX9-NEXT: s_mov_b32 s7, 0x8000
; GFX9-NEXT: v_and_or_b32 v0, v1, s7, v0
; GFX9-NEXT: v_and_or_b32 v1, v3, s4, v2
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v3
; GFX9-NEXT: v_bfe_u32 v5, v3, 20, 11
; GFX9-NEXT: v_and_or_b32 v1, v2, s5, v1
; GFX9-NEXT: v_sub_u32_e32 v8, 0x3f1, v5
; GFX9-NEXT: v_or_b32_e32 v2, 0x1000, v1
; GFX9-NEXT: v_med3_i32 v8, v8, 0, 13
; GFX9-NEXT: v_lshrrev_b32_e32 v11, v8, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v8, v8, v11
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v8, v2
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT: v_add_u32_e32 v5, 0xfffffc10, v5
; GFX9-NEXT: v_or_b32_e32 v2, v11, v2
; GFX9-NEXT: v_lshl_or_b32 v8, v5, 12, v1
; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 1, v5
; GFX9-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc
; GFX9-NEXT: v_and_b32_e32 v8, 7, v2
; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 5, v8
; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v8
; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GFX9-NEXT: v_or_b32_e32 v8, v8, v11
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 2, v2
; GFX9-NEXT: v_add_u32_e32 v2, v2, v8
; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 31, v5
; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s6, v5
; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v3
; GFX9-NEXT: v_and_or_b32 v1, v2, s7, v1
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff
; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v6
; GFX9-NEXT: v_bfi_b32 v1, s4, v4, v7
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_copysign_out_v3f16_mag_v3f64_sign_v3f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_and_or_b32 v4, 0x1ff, v5, v4
; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0x1ff, v3, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 8, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 8, v3
; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v3, 20, 11
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v5, 20, 11
; GFX11-TRUE16-NEXT: v_and_or_b32 v0, 0x1ff, v1, v0
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 8, v1
; GFX11-TRUE16-NEXT: v_bfe_u32 v14, v1, 20, 11
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v11, 0x3f1, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_and_or_b32 v4, 0xffe, v8, v4
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v8, 0x3f1, v10
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX11-TRUE16-NEXT: v_med3_i32 v11, v11, 0, 13
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0xfffffc10, v10
; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffe, v9, v2
; GFX11-TRUE16-NEXT: v_med3_i32 v8, v8, 0, 13
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, 0x1000, v4
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x1000, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, v11, v9
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_and_or_b32 v0, 0xffe, v13, v0
; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v13, 0x3f1, v14
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, v8, v12
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0xfffffc10, v14
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, v11, v16
; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, 0x1000, v0
; GFX11-TRUE16-NEXT: v_med3_i32 v13, v13, 0, 13
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, v8, v15
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v8, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, v13, v17
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v11, v9
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v10, 12, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, v13, v12
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v15, v8
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v10
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 0xfffffc10, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v16, v9
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, v11, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v13, v17
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v11, v5, 12, v4
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v17, 0x7e00
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 7, v8
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 2, v8
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v11, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v15
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v12, v13
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v12, v14, 12, v0
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v15
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v14
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v15, v13
; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v11, v12, v11 :: v_dual_and_b32 v12, 7, v9
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 2, v9
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v8, v8, v13
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 7, v11
; GFX11-TRUE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 2, v11
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v15
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v15
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v12
; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v15, v13
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v10
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v11, v11, v13
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v12, v16
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v8, 0x7c00, v8, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v2, 0x7c00, v17 :: v_dual_add_nc_u32 v9, v9, v12
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v10
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v14
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v3.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, 0
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, 0x7c00, v11, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0x8000, v8, v2
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v1.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v17, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v14
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_and_or_b32 v0, 0x8000, v8, v0
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, 0x7c00, v9, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, 0x7c00, v17, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v6
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v7
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_copysign_out_v3f16_mag_v3f64_sign_v3f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_and_or_b32 v4, 0x1ff, v5, v4
; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0x1ff, v1, v0
; GFX11-FAKE16-NEXT: v_bfe_u32 v8, v5, 20, 11
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v5
; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0x1ff, v3, v2
; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v1, 20, 11
; GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v9, 0x3f1, v8
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 8, v3
; GFX11-FAKE16-NEXT: v_bfe_u32 v13, v3, 20, 11
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX11-FAKE16-NEXT: v_med3_i32 v9, v9, 0, 13
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-FAKE16-NEXT: v_and_or_b32 v4, 0xffe, v5, v4
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v1
; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, 0x1000, v4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0xffe, v5, v0
; GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v5, 0x3f1, v10
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, v9, v11
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, 0x1000, v0
; GFX11-FAKE16-NEXT: v_med3_i32 v5, v5, 0, 13
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffe, v12, v2
; GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v12, 0x3f1, v13
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, v9, v15
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, 0xfffffc10, v13
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, v5, v14
; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, 0x1000, v2
; GFX11-FAKE16-NEXT: v_med3_i32 v12, v12, 0, 13
; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v9, v11
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v5, v16
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, v12, v17
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc_lo
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v5, v14
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v12, v12, v11
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v9, v15, v9
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v8, 0xfffffc10, v8
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v12, v17
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v14, v8, 12, v4
; GFX11-FAKE16-NEXT: v_or_b32_e32 v5, v16, v5
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0xfffffc10, v10
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v8
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v10, 12, v0
; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v12
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v14, v9, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v10
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v13, 12, v2
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v15, v5, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v13
; GFX11-FAKE16-NEXT: v_and_b32_e32 v14, 7, v9
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 2, v9
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc_lo
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v14
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v14
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc_lo
; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v14, v12
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, v9, v12
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, 0x7e00 :: v_dual_and_b32 v15, 7, v5
; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 7, v11
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 2, v5
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 2, v11
; GFX11-FAKE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v15
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v15
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v16
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v14, v15, v17
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v16
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, v5, v14
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v10
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v16, v18
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, 0x7c00, v5, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v0, 0x7c00, v12 :: v_dual_add_nc_u32 v11, v11, v15
; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v13
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, 0x7c00, v11, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v12, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v10
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v13
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0x8000, v1, v0
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v8
; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0x8000, v3, v2
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, 0x7c00, v9, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v12, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v8
; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v6
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc_lo
; GFX11-FAKE16-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v7
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%mag.trunc = fptrunc <3 x double> %mag to <3 x half>
%result = call <3 x half> @llvm.copysign.v3f16(<3 x half> %mag.trunc, <3 x half> %sign)
ret <3 x half> %result
}
; copysign on <3 x half> where the sign operand is produced by an fptrunc from
; <3 x float>. In the checked output every target narrows the three sign
; elements with v_cvt_f16_f32 before combining them with the magnitude. SI
; combines with separate and/or instructions using the 0x7fff and 0x8000
; masks; the other targets use v_bfi_b32 with a 0x7fff7fff mask.
define <3 x half> @v_copysign_out_v3f16_mag_v3f16_sign_v3f32(<3 x half> %mag, <3 x float> %sign) {
; SI-LABEL: v_copysign_out_v3f16_mag_v3f16_sign_v3f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_and_b32_e32 v1, 0x7fff, v1
; SI-NEXT: v_and_b32_e32 v4, 0x8000, v4
; SI-NEXT: v_or_b32_e32 v1, v1, v4
; SI-NEXT: v_and_b32_e32 v4, 0x7fff, v0
; SI-NEXT: v_and_b32_e32 v3, 0x8000, v3
; SI-NEXT: v_bfe_u32 v0, v0, 16, 15
; SI-NEXT: v_and_b32_e32 v2, 0x8000, v2
; SI-NEXT: v_or_b32_e32 v0, v0, v3
; SI-NEXT: v_or_b32_e32 v2, v4, v2
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_or_b32_e32 v0, v2, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_copysign_out_v3f16_mag_v3f16_sign_v3f32:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_cvt_f16_f32_e32 v2, v2
; VI-NEXT: v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v4, v4
; VI-NEXT: s_mov_b32 s4, 0x7fff7fff
; VI-NEXT: v_or_b32_e32 v2, v2, v3
; VI-NEXT: v_bfi_b32 v1, s4, v1, v4
; VI-NEXT: v_bfi_b32 v0, s4, v0, v2
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_out_v3f16_mag_v3f16_sign_v3f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX9-NEXT: v_cvt_f16_f32_e32 v4, v4
; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff
; GFX9-NEXT: v_pack_b32_f16 v2, v2, v3
; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v4
; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_copysign_out_v3f16_mag_v3f16_sign_v3f32:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v3.h, v3
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v3.l, v2
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v2.l, v4
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v3
; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v2
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_copysign_out_v3f16_mag_v3f16_sign_v3f32:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_pack_b32_f16 v2, v2, v3
; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v3, v4
; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v2
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v3
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
; Narrow the sign operand first so that both copysign operands are <3 x half>.
%sign.trunc = fptrunc <3 x float> %sign to <3 x half>
%out = call <3 x half> @llvm.copysign.v3f16(<3 x half> %mag, <3 x half> %sign.trunc)
ret <3 x half> %out
}
; copysign on <3 x half> where the sign operand is produced by an fptrunc from
; <3 x double>. No floating-point conversion instruction appears in the
; checked output for any target. Of the sign operand only v3, v5 and v7 are
; read (presumably the high dword of each double; the register assignment is
; not visible in this test). Each is shifted right by 16, after masking with
; 0x80000000 on SI, and then merged with the magnitude.
define <3 x half> @v_copysign_out_v3f16_mag_v3f16_sign_v3f64(<3 x half> %mag, <3 x double> %sign) {
; SI-LABEL: v_copysign_out_v3f16_mag_v3f16_sign_v3f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_and_b32_e32 v2, 0x80000000, v7
; SI-NEXT: v_and_b32_e32 v1, 0x7fff, v1
; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; SI-NEXT: v_and_b32_e32 v3, 0x80000000, v3
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_and_b32_e32 v2, 0x7fff, v0
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: v_and_b32_e32 v3, 0x80000000, v5
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; SI-NEXT: v_bfe_u32 v0, v0, 16, 15
; SI-NEXT: v_or_b32_e32 v0, v0, v3
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_or_b32_e32 v0, v2, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_copysign_out_v3f16_mag_v3f16_sign_v3f64:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v7
; VI-NEXT: s_movk_i32 s4, 0x7fff
; VI-NEXT: v_bfi_b32 v1, s4, v1, v2
; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v3
; VI-NEXT: v_bfi_b32 v2, s4, v0, v2
; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v5
; VI-NEXT: v_bfi_b32 v0, s4, v0, v3
; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; VI-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_out_v3f16_mag_v3f16_sign_v3f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v7
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v2
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v3
; GFX9-NEXT: v_bfi_b32 v2, s4, v0, v2
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v5
; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v3
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v0, v2, s4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_copysign_out_v3f16_mag_v3f16_sign_v3f64:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v5
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0x7fff, v2, v4
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v7
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff, v1, v4
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_copysign_out_v3f16_mag_v3f16_sign_v3f64:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v3
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v5
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff, v0, v2
; GFX11-FAKE16-NEXT: v_bfi_b32 v2, 0x7fff, v3, v4
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v7
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100
; GFX11-FAKE16-NEXT: v_bfi_b32 v1, 0x7fff, v1, v3
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
; Narrow the sign operand first so that both copysign operands are <3 x half>.
%sign.trunc = fptrunc <3 x double> %sign to <3 x half>
%out = call <3 x half> @llvm.copysign.v3f16(<3 x half> %mag, <3 x half> %sign.trunc)
ret <3 x half> %out
}
; copysign on <4 x float> where the magnitude is produced by an fpext from
; <4 x half>. The checked output widens each magnitude element with
; v_cvt_f32_f16 and then issues one v_bfi_b32 per element. On GFX11 the
; v_bfi_b32 mask is the literal 0x7fffffff; SI, VI and GFX9 pass s4 in that
; position after setting it with s_brev_b32 s4, -2.
define <4 x float> @v_copysign_out_v4f32_mag_v4f16_sign_v4f32(<4 x half> %mag, <4 x float> %sign) {
; SI-LABEL: v_copysign_out_v4f32_mag_v4f16_sign_v4f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1
; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
; SI-NEXT: s_brev_b32 s4, -2
; SI-NEXT: v_bfi_b32 v0, s4, v0, v2
; SI-NEXT: v_bfi_b32 v2, s4, v1, v4
; SI-NEXT: v_bfi_b32 v1, s4, v6, v3
; SI-NEXT: v_bfi_b32 v3, s4, v7, v5
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_copysign_out_v4f32_mag_v4f16_sign_v4f32:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_cvt_f32_f16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: v_cvt_f32_f16_e32 v7, v0
; VI-NEXT: v_cvt_f32_f16_e32 v1, v1
; VI-NEXT: v_cvt_f32_f16_sdwa v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-NEXT: s_brev_b32 s4, -2
; VI-NEXT: v_bfi_b32 v0, s4, v7, v2
; VI-NEXT: v_bfi_b32 v2, s4, v1, v4
; VI-NEXT: v_bfi_b32 v1, s4, v8, v3
; VI-NEXT: v_bfi_b32 v3, s4, v6, v5
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_out_v4f32_mag_v4f16_sign_v4f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_f16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-NEXT: v_cvt_f32_f16_e32 v7, v0
; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX9-NEXT: v_cvt_f32_f16_sdwa v8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-NEXT: s_brev_b32 s4, -2
; GFX9-NEXT: v_bfi_b32 v0, s4, v7, v2
; GFX9-NEXT: v_bfi_b32 v2, s4, v1, v4
; GFX9-NEXT: v_bfi_b32 v1, s4, v8, v3
; GFX9-NEXT: v_bfi_b32 v3, s4, v6, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_copysign_out_v4f32_mag_v4f16_sign_v4f32:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v6, v0.l
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v7, v0.h
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v8, v1.l
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v9, v1.h
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, v6, v2
; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v7, v3
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0x7fffffff, v8, v4
; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0x7fffffff, v9, v5
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_copysign_out_v4f32_mag_v4f16_sign_v4f32:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v1
; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v6, v6
; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v7, v7
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v2
; GFX11-FAKE16-NEXT: v_bfi_b32 v2, 0x7fffffff, v1, v4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-FAKE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v6, v3
; GFX11-FAKE16-NEXT: v_bfi_b32 v3, 0x7fffffff, v7, v5
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
; Widen the magnitude first so that both copysign operands are <4 x float>.
%mag.ext = fpext <4 x half> %mag to <4 x float>
%out = call <4 x float> @llvm.copysign.v4f32(<4 x float> %mag.ext, <4 x float> %sign)
ret <4 x float> %out
}
; copysign on <4 x float> where the sign operand is produced by an fpext from
; <4 x half>. In the checked output only SI converts the sign elements with
; v_cvt_f32_f16. VI, GFX9 and the GFX11 fake16 run instead use 16-bit shifts
; so that each half ends up shifted left by 16 before the v_bfi_b32. The GFX11
; true16 run copies v4.l and v5.l into v6.h and v7.h with v_mov_b16 and passes
; v4 and v5 unmodified to the other two v_bfi_b32 instructions.
define <4 x float> @v_copysign_out_v4f32_mag_v4f32_sign_v4f16(<4 x float> %mag, <4 x half> %sign) {
; SI-LABEL: v_copysign_out_v4f32_mag_v4f32_sign_v4f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5
; SI-NEXT: v_cvt_f32_f16_e32 v7, v7
; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
; SI-NEXT: s_brev_b32 s4, -2
; SI-NEXT: v_bfi_b32 v0, s4, v0, v4
; SI-NEXT: v_bfi_b32 v2, s4, v2, v5
; SI-NEXT: v_bfi_b32 v1, s4, v1, v6
; SI-NEXT: v_bfi_b32 v3, s4, v3, v7
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_copysign_out_v4f32_mag_v4f32_sign_v4f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v4
; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; VI-NEXT: s_brev_b32 s4, -2
; VI-NEXT: v_bfi_b32 v0, s4, v0, v4
; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v5
; VI-NEXT: v_bfi_b32 v2, s4, v2, v4
; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v7
; VI-NEXT: v_bfi_b32 v1, s4, v1, v4
; VI-NEXT: v_lshlrev_b32_e32 v4, 16, v6
; VI-NEXT: v_bfi_b32 v3, s4, v3, v4
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_out_v4f32_mag_v4f32_sign_v4f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v4
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX9-NEXT: s_brev_b32 s4, -2
; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v4
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v5
; GFX9-NEXT: v_bfi_b32 v2, s4, v2, v4
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v7
; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v4
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v6
; GFX9-NEXT: v_bfi_b32 v3, s4, v3, v4
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_copysign_out_v4f32_mag_v4f32_sign_v4f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v4.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v5.l
; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v4
; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v5
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v6
; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0x7fffffff, v2, v7
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_copysign_out_v4f32_mag_v4f32_sign_v4f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v4
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v5
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fffffff, v0, v4
; GFX11-FAKE16-NEXT: v_bfi_b32 v2, 0x7fffffff, v2, v5
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-FAKE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v6
; GFX11-FAKE16-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v7
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
; Widen the sign operand first so that both copysign operands are <4 x float>.
%sign.ext = fpext <4 x half> %sign to <4 x float>
%out = call <4 x float> @llvm.copysign.v4f32(<4 x float> %mag, <4 x float> %sign.ext)
ret <4 x float> %out
}
; copysign on <4 x double> where the sign operand is produced by an fpext from
; <4 x half>. The checked v_bfi_b32 instructions write only v1, v3, v5 and v7
; (presumably the high dword of each double; the register assignment is not
; visible in this test). SI converts the sign elements with v_cvt_f32_f16.
; VI, GFX9 and the GFX11 fake16 run use 16-bit shifts so that each half ends
; up shifted left by 16. The GFX11 true16 run copies v8.l and v9.l into v10.h
; and v11.h with v_mov_b16 and passes v8 and v9 unmodified for the other two
; elements.
define <4 x double> @v_copysign_out_v4f64_mag_v4f64_sign_v4f16(<4 x double> %mag, <4 x half> %sign) {
; SI-LABEL: v_copysign_out_v4f64_mag_v4f64_sign_v4f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v9
; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v8
; SI-NEXT: v_cvt_f32_f16_e32 v8, v8
; SI-NEXT: v_cvt_f32_f16_e32 v9, v9
; SI-NEXT: v_cvt_f32_f16_e32 v11, v11
; SI-NEXT: v_cvt_f32_f16_e32 v10, v10
; SI-NEXT: s_brev_b32 s4, -2
; SI-NEXT: v_bfi_b32 v1, s4, v1, v8
; SI-NEXT: v_bfi_b32 v5, s4, v5, v9
; SI-NEXT: v_bfi_b32 v3, s4, v3, v11
; SI-NEXT: v_bfi_b32 v7, s4, v7, v10
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_copysign_out_v4f64_mag_v4f64_sign_v4f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v8
; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; VI-NEXT: s_brev_b32 s4, -2
; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; VI-NEXT: v_bfi_b32 v3, s4, v3, v8
; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v9
; VI-NEXT: v_bfi_b32 v1, s4, v1, v10
; VI-NEXT: v_lshlrev_b32_e32 v10, 16, v9
; VI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; VI-NEXT: v_bfi_b32 v5, s4, v5, v10
; VI-NEXT: v_bfi_b32 v7, s4, v7, v8
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_out_v4f64_mag_v4f64_sign_v4f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v8
; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v8
; GFX9-NEXT: s_brev_b32 s4, -2
; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; GFX9-NEXT: v_bfi_b32 v3, s4, v3, v8
; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v9
; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v10
; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v9
; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; GFX9-NEXT: v_bfi_b32 v5, s4, v5, v10
; GFX9-NEXT: v_bfi_b32 v7, s4, v7, v8
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_copysign_out_v4f64_mag_v4f64_sign_v4f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v8.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l
; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v8
; GFX11-TRUE16-NEXT: v_bfi_b32 v7, 0x7fffffff, v7, v9
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v10
; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v11
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_copysign_out_v4f64_mag_v4f64_sign_v4f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v8
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v9
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-FAKE16-NEXT: v_bfi_b32 v1, 0x7fffffff, v1, v8
; GFX11-FAKE16-NEXT: v_bfi_b32 v5, 0x7fffffff, v5, v9
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-FAKE16-NEXT: v_bfi_b32 v3, 0x7fffffff, v3, v10
; GFX11-FAKE16-NEXT: v_bfi_b32 v7, 0x7fffffff, v7, v11
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
; Widen the sign operand first so that both copysign operands are <4 x double>.
%sign.ext = fpext <4 x half> %sign to <4 x double>
%out = call <4 x double> @llvm.copysign.v4f64(<4 x double> %mag, <4 x double> %sign.ext)
ret <4 x double> %out
}
; copysign producing <4 x half>, where the magnitude operand arrives as
; <4 x float> and is narrowed with fptrunc before the intrinsic call, while the
; sign operand is already <4 x half>.
;
; In the expected output below, the SI sequence converts each element and then
; combines magnitude and sign per element with 0x7fff / 0x8000 masks, whereas
; the VI, GFX9 and GFX11 sequences pack two halves per register and apply one
; v_bfi_b32 with the 0x7fff7fff mask to each packed register.
define <4 x half> @v_copysign_out_v4f16_mag_v4f32_sign_v4f16(<4 x float> %mag, <4 x half> %sign) {
; SI-LABEL: v_copysign_out_v4f16_mag_v4f32_sign_v4f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4
; SI-NEXT: v_lshrrev_b32_e32 v7, 16, v5
; SI-NEXT: v_and_b32_e32 v5, 0x8000, v5
; SI-NEXT: v_and_b32_e32 v2, 0x7fff, v2
; SI-NEXT: v_and_b32_e32 v4, 0x8000, v4
; SI-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; SI-NEXT: v_or_b32_e32 v2, v2, v5
; SI-NEXT: v_and_b32_e32 v5, 0x8000, v7
; SI-NEXT: v_and_b32_e32 v3, 0x7fff, v3
; SI-NEXT: v_or_b32_e32 v0, v0, v4
; SI-NEXT: v_and_b32_e32 v4, 0x8000, v6
; SI-NEXT: v_and_b32_e32 v1, 0x7fff, v1
; SI-NEXT: v_or_b32_e32 v3, v3, v5
; SI-NEXT: v_or_b32_e32 v1, v1, v4
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_or_b32_e32 v1, v2, v3
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_copysign_out_v4f16_mag_v4f32_sign_v4f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v2, v2
; VI-NEXT: v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: s_mov_b32 s4, 0x7fff7fff
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: v_bfi_b32 v0, s4, v0, v4
; VI-NEXT: v_or_b32_e32 v1, v2, v3
; VI-NEXT: v_bfi_b32 v1, s4, v1, v5
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_out_v4f16_mag_v4f32_sign_v4f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff
; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v4
; GFX9-NEXT: v_pack_b32_f16 v1, v2, v3
; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_copysign_out_v4f16_mag_v4f32_sign_v4f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v3.h, v3
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.h, v1
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, v0
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v3.l, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v1, v4
; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff7fff, v3, v5
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_copysign_out_v4f16_mag_v4f32_sign_v4f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-FAKE16-NEXT: v_pack_b32_f16 v1, v2, v3
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v4
; GFX11-FAKE16-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v5
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
  ; Narrow the f32 magnitude to f16 first, so the copysign itself operates on
  ; <4 x half> operands.
  %mag.trunc = fptrunc <4 x float> %mag to <4 x half>
  %out = call <4 x half> @llvm.copysign.v4f16(<4 x half> %mag.trunc, <4 x half> %sign)
  ret <4 x half> %out
}
define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag, <4 x half> %sign) {
; SI-LABEL: v_copysign_out_v4f16_mag_v4f64_sign_v4f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_and_b32_e32 v13, 0x1ff, v3
; SI-NEXT: v_or_b32_e32 v2, v13, v2
; SI-NEXT: v_lshrrev_b32_e32 v12, 8, v3
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; SI-NEXT: v_and_b32_e32 v12, 0xffe, v12
; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; SI-NEXT: v_bfe_u32 v3, v3, 20, 11
; SI-NEXT: s_movk_i32 s4, 0x3f1
; SI-NEXT: v_or_b32_e32 v2, v12, v2
; SI-NEXT: v_sub_i32_e32 v13, vcc, s4, v3
; SI-NEXT: v_or_b32_e32 v12, 0x1000, v2
; SI-NEXT: v_med3_i32 v13, v13, 0, 13
; SI-NEXT: v_lshrrev_b32_e32 v14, v13, v12
; SI-NEXT: v_lshlrev_b32_e32 v13, v13, v14
; SI-NEXT: v_cmp_ne_u32_e32 vcc, v13, v12
; SI-NEXT: s_movk_i32 s5, 0xfc10
; SI-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
; SI-NEXT: v_add_i32_e32 v3, vcc, s5, v3
; SI-NEXT: v_lshlrev_b32_e32 v13, 12, v3
; SI-NEXT: v_or_b32_e32 v12, v14, v12
; SI-NEXT: v_or_b32_e32 v13, v2, v13
; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3
; SI-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc
; SI-NEXT: v_and_b32_e32 v13, 7, v12
; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v13
; SI-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v13
; SI-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; SI-NEXT: v_or_b32_e32 v13, v13, v14
; SI-NEXT: v_lshrrev_b32_e32 v12, 2, v12
; SI-NEXT: v_add_i32_e32 v12, vcc, v12, v13
; SI-NEXT: v_mov_b32_e32 v13, 0x7c00
; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3
; SI-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc
; SI-NEXT: v_mov_b32_e32 v14, 0x7e00
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; SI-NEXT: s_movk_i32 s6, 0x40f
; SI-NEXT: v_cndmask_b32_e32 v2, v13, v14, vcc
; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v3
; SI-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc
; SI-NEXT: v_and_b32_e32 v12, 0x1ff, v1
; SI-NEXT: v_or_b32_e32 v0, v12, v0
; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v1
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT: v_and_b32_e32 v3, 0xffe, v3
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT: v_bfe_u32 v1, v1, 20, 11
; SI-NEXT: v_or_b32_e32 v0, v3, v0
; SI-NEXT: v_sub_i32_e32 v12, vcc, s4, v1
; SI-NEXT: v_or_b32_e32 v3, 0x1000, v0
; SI-NEXT: v_med3_i32 v12, v12, 0, 13
; SI-NEXT: v_lshrrev_b32_e32 v15, v12, v3
; SI-NEXT: v_lshlrev_b32_e32 v12, v12, v15
; SI-NEXT: v_cmp_ne_u32_e32 vcc, v12, v3
; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; SI-NEXT: v_add_i32_e32 v1, vcc, s5, v1
; SI-NEXT: v_lshlrev_b32_e32 v12, 12, v1
; SI-NEXT: v_or_b32_e32 v3, v15, v3
; SI-NEXT: v_or_b32_e32 v12, v0, v12
; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v1
; SI-NEXT: v_cndmask_b32_e32 v3, v12, v3, vcc
; SI-NEXT: v_and_b32_e32 v12, 7, v3
; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v12
; SI-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v12
; SI-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
; SI-NEXT: v_or_b32_e32 v12, v12, v15
; SI-NEXT: v_lshrrev_b32_e32 v3, 2, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, v3, v12
; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v1
; SI-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT: v_cndmask_b32_e32 v0, v13, v14, vcc
; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v1
; SI-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
; SI-NEXT: v_and_b32_e32 v3, 0x1ff, v7
; SI-NEXT: v_or_b32_e32 v3, v3, v6
; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v7
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
; SI-NEXT: v_and_b32_e32 v1, 0xffe, v1
; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; SI-NEXT: v_bfe_u32 v6, v7, 20, 11
; SI-NEXT: v_or_b32_e32 v1, v1, v3
; SI-NEXT: v_sub_i32_e32 v7, vcc, s4, v6
; SI-NEXT: v_or_b32_e32 v3, 0x1000, v1
; SI-NEXT: v_med3_i32 v7, v7, 0, 13
; SI-NEXT: v_lshrrev_b32_e32 v12, v7, v3
; SI-NEXT: v_lshlrev_b32_e32 v7, v7, v12
; SI-NEXT: v_cmp_ne_u32_e32 vcc, v7, v3
; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; SI-NEXT: v_add_i32_e32 v6, vcc, s5, v6
; SI-NEXT: v_lshlrev_b32_e32 v7, 12, v6
; SI-NEXT: v_or_b32_e32 v3, v12, v3
; SI-NEXT: v_or_b32_e32 v7, v1, v7
; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v6
; SI-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
; SI-NEXT: v_and_b32_e32 v7, 7, v3
; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v7
; SI-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v7
; SI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; SI-NEXT: v_or_b32_e32 v7, v7, v12
; SI-NEXT: v_lshrrev_b32_e32 v3, 2, v3
; SI-NEXT: v_add_i32_e32 v3, vcc, v3, v7
; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v6
; SI-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
; SI-NEXT: v_cndmask_b32_e32 v1, v13, v14, vcc
; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v6
; SI-NEXT: v_and_b32_e32 v6, 0x1ff, v5
; SI-NEXT: v_or_b32_e32 v4, v6, v4
; SI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; SI-NEXT: v_lshrrev_b32_e32 v3, 8, v5
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
; SI-NEXT: v_and_b32_e32 v3, 0xffe, v3
; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; SI-NEXT: v_bfe_u32 v5, v5, 20, 11
; SI-NEXT: v_or_b32_e32 v3, v3, v4
; SI-NEXT: v_sub_i32_e32 v6, vcc, s4, v5
; SI-NEXT: v_or_b32_e32 v4, 0x1000, v3
; SI-NEXT: v_med3_i32 v6, v6, 0, 13
; SI-NEXT: v_lshrrev_b32_e32 v7, v6, v4
; SI-NEXT: v_lshlrev_b32_e32 v6, v6, v7
; SI-NEXT: v_cmp_ne_u32_e32 vcc, v6, v4
; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; SI-NEXT: v_add_i32_e32 v5, vcc, s5, v5
; SI-NEXT: v_lshlrev_b32_e32 v6, 12, v5
; SI-NEXT: v_or_b32_e32 v4, v7, v4
; SI-NEXT: v_or_b32_e32 v6, v3, v6
; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v5
; SI-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; SI-NEXT: v_and_b32_e32 v6, 7, v4
; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v6
; SI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v6
; SI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; SI-NEXT: v_or_b32_e32 v6, v6, v7
; SI-NEXT: v_lshrrev_b32_e32 v4, 2, v4
; SI-NEXT: v_add_i32_e32 v4, vcc, v4, v6
; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v5
; SI-NEXT: v_cndmask_b32_e32 v4, v13, v4, vcc
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
; SI-NEXT: v_cndmask_b32_e32 v3, v13, v14, vcc
; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v5
; SI-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v9
; SI-NEXT: v_and_b32_e32 v3, 0x7fff, v3
; SI-NEXT: v_and_b32_e32 v4, 0x8000, v9
; SI-NEXT: v_or_b32_e32 v3, v3, v4
; SI-NEXT: v_and_b32_e32 v1, 0x7fff, v1
; SI-NEXT: v_and_b32_e32 v4, 0x8000, v11
; SI-NEXT: v_lshrrev_b32_e32 v10, 16, v8
; SI-NEXT: v_or_b32_e32 v1, v1, v4
; SI-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; SI-NEXT: v_and_b32_e32 v4, 0x8000, v8
; SI-NEXT: v_or_b32_e32 v0, v0, v4
; SI-NEXT: v_and_b32_e32 v2, 0x7fff, v2
; SI-NEXT: v_and_b32_e32 v4, 0x8000, v10
; SI-NEXT: v_or_b32_e32 v2, v2, v4
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v0, v0, v2
; SI-NEXT: v_or_b32_e32 v1, v3, v1
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_copysign_out_v4f16_mag_v4f64_sign_v4f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_and_b32_e32 v11, 0x1ff, v5
; VI-NEXT: v_or_b32_e32 v4, v11, v4
; VI-NEXT: v_lshrrev_b32_e32 v10, 8, v5
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
; VI-NEXT: v_and_b32_e32 v10, 0xffe, v10
; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; VI-NEXT: v_bfe_u32 v5, v5, 20, 11
; VI-NEXT: s_movk_i32 s4, 0x3f1
; VI-NEXT: v_or_b32_e32 v4, v10, v4
; VI-NEXT: v_sub_u32_e32 v11, vcc, s4, v5
; VI-NEXT: v_or_b32_e32 v10, 0x1000, v4
; VI-NEXT: v_med3_i32 v11, v11, 0, 13
; VI-NEXT: v_lshrrev_b32_e32 v12, v11, v10
; VI-NEXT: v_lshlrev_b32_e32 v11, v11, v12
; VI-NEXT: v_cmp_ne_u32_e32 vcc, v11, v10
; VI-NEXT: s_movk_i32 s5, 0xfc10
; VI-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; VI-NEXT: v_add_u32_e32 v5, vcc, s5, v5
; VI-NEXT: v_lshlrev_b32_e32 v11, 12, v5
; VI-NEXT: v_or_b32_e32 v10, v12, v10
; VI-NEXT: v_or_b32_e32 v11, v4, v11
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v5
; VI-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc
; VI-NEXT: v_and_b32_e32 v11, 7, v10
; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v11
; VI-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v11
; VI-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; VI-NEXT: v_or_b32_e32 v11, v11, v12
; VI-NEXT: v_lshrrev_b32_e32 v10, 2, v10
; VI-NEXT: v_add_u32_e32 v10, vcc, v10, v11
; VI-NEXT: v_mov_b32_e32 v11, 0x7c00
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v5
; VI-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc
; VI-NEXT: v_mov_b32_e32 v12, 0x7e00
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
; VI-NEXT: s_movk_i32 s6, 0x40f
; VI-NEXT: v_cndmask_b32_e32 v4, v11, v12, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v5
; VI-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc
; VI-NEXT: v_and_b32_e32 v10, 0x1ff, v7
; VI-NEXT: v_or_b32_e32 v6, v10, v6
; VI-NEXT: v_lshrrev_b32_e32 v5, 8, v7
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
; VI-NEXT: v_and_b32_e32 v5, 0xffe, v5
; VI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; VI-NEXT: v_bfe_u32 v7, v7, 20, 11
; VI-NEXT: v_or_b32_e32 v5, v5, v6
; VI-NEXT: v_sub_u32_e32 v10, vcc, s4, v7
; VI-NEXT: v_or_b32_e32 v6, 0x1000, v5
; VI-NEXT: v_med3_i32 v10, v10, 0, 13
; VI-NEXT: v_lshrrev_b32_e32 v13, v10, v6
; VI-NEXT: v_lshlrev_b32_e32 v10, v10, v13
; VI-NEXT: v_cmp_ne_u32_e32 vcc, v10, v6
; VI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; VI-NEXT: v_add_u32_e32 v7, vcc, s5, v7
; VI-NEXT: v_lshlrev_b32_e32 v10, 12, v7
; VI-NEXT: v_or_b32_e32 v6, v13, v6
; VI-NEXT: v_or_b32_e32 v10, v5, v10
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v7
; VI-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc
; VI-NEXT: v_and_b32_e32 v10, 7, v6
; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v10
; VI-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v10
; VI-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; VI-NEXT: v_or_b32_e32 v10, v10, v13
; VI-NEXT: v_lshrrev_b32_e32 v6, 2, v6
; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v10
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v7
; VI-NEXT: v_cndmask_b32_e32 v6, v11, v6, vcc
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
; VI-NEXT: v_cndmask_b32_e32 v5, v11, v12, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v7
; VI-NEXT: v_and_b32_e32 v7, 0x1ff, v1
; VI-NEXT: v_or_b32_e32 v0, v7, v0
; VI-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
; VI-NEXT: v_lshrrev_b32_e32 v6, 8, v1
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT: v_and_b32_e32 v6, 0xffe, v6
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; VI-NEXT: v_bfe_u32 v1, v1, 20, 11
; VI-NEXT: v_or_b32_e32 v0, v6, v0
; VI-NEXT: v_sub_u32_e32 v7, vcc, s4, v1
; VI-NEXT: v_or_b32_e32 v6, 0x1000, v0
; VI-NEXT: v_med3_i32 v7, v7, 0, 13
; VI-NEXT: v_lshrrev_b32_e32 v10, v7, v6
; VI-NEXT: v_lshlrev_b32_e32 v7, v7, v10
; VI-NEXT: v_cmp_ne_u32_e32 vcc, v7, v6
; VI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; VI-NEXT: v_add_u32_e32 v1, vcc, s5, v1
; VI-NEXT: v_lshlrev_b32_e32 v7, 12, v1
; VI-NEXT: v_or_b32_e32 v6, v10, v6
; VI-NEXT: v_or_b32_e32 v7, v0, v7
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v1
; VI-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc
; VI-NEXT: v_and_b32_e32 v7, 7, v6
; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v7
; VI-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v7
; VI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; VI-NEXT: v_or_b32_e32 v7, v7, v10
; VI-NEXT: v_lshrrev_b32_e32 v6, 2, v6
; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v7
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v1
; VI-NEXT: v_cndmask_b32_e32 v6, v11, v6, vcc
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, v11, v12, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v1
; VI-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
; VI-NEXT: v_and_b32_e32 v6, 0x1ff, v3
; VI-NEXT: v_or_b32_e32 v2, v6, v2
; VI-NEXT: v_lshrrev_b32_e32 v1, 8, v3
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; VI-NEXT: v_and_b32_e32 v1, 0xffe, v1
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; VI-NEXT: v_bfe_u32 v3, v3, 20, 11
; VI-NEXT: v_or_b32_e32 v1, v1, v2
; VI-NEXT: v_sub_u32_e32 v6, vcc, s4, v3
; VI-NEXT: v_or_b32_e32 v2, 0x1000, v1
; VI-NEXT: v_med3_i32 v6, v6, 0, 13
; VI-NEXT: v_lshrrev_b32_e32 v7, v6, v2
; VI-NEXT: v_lshlrev_b32_e32 v6, v6, v7
; VI-NEXT: v_cmp_ne_u32_e32 vcc, v6, v2
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; VI-NEXT: v_add_u32_e32 v3, vcc, s5, v3
; VI-NEXT: v_lshlrev_b32_e32 v6, 12, v3
; VI-NEXT: v_or_b32_e32 v2, v7, v2
; VI-NEXT: v_or_b32_e32 v6, v1, v6
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3
; VI-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
; VI-NEXT: v_and_b32_e32 v6, 7, v2
; VI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v6
; VI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v6
; VI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; VI-NEXT: v_or_b32_e32 v6, v6, v7
; VI-NEXT: v_lshrrev_b32_e32 v2, 2, v2
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3
; VI-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
; VI-NEXT: v_cndmask_b32_e32 v1, v11, v12, vcc
; VI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v3
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v5
; VI-NEXT: v_and_b32_e32 v2, 0x7fff, v4
; VI-NEXT: s_mov_b32 s4, 0x7fff7fff
; VI-NEXT: v_or_b32_e32 v1, v2, v1
; VI-NEXT: v_bfi_b32 v0, s4, v0, v8
; VI-NEXT: v_bfi_b32 v1, s4, v1, v9
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_out_v4f16_mag_v4f64_sign_v4f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_movk_i32 s4, 0x1ff
; GFX9-NEXT: v_and_or_b32 v4, v5, s4, v4
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v10, 8, v5
; GFX9-NEXT: s_movk_i32 s5, 0xffe
; GFX9-NEXT: v_bfe_u32 v11, v5, 20, 11
; GFX9-NEXT: v_and_or_b32 v4, v10, s5, v4
; GFX9-NEXT: v_sub_u32_e32 v12, 0x3f1, v11
; GFX9-NEXT: v_or_b32_e32 v10, 0x1000, v4
; GFX9-NEXT: v_med3_i32 v12, v12, 0, 13
; GFX9-NEXT: v_lshrrev_b32_e32 v13, v12, v10
; GFX9-NEXT: v_lshlrev_b32_e32 v12, v12, v13
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v12, v10
; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GFX9-NEXT: v_add_u32_e32 v11, 0xfffffc10, v11
; GFX9-NEXT: v_or_b32_e32 v10, v13, v10
; GFX9-NEXT: v_lshl_or_b32 v12, v11, 12, v4
; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 1, v11
; GFX9-NEXT: v_cndmask_b32_e32 v10, v12, v10, vcc
; GFX9-NEXT: v_and_b32_e32 v12, 7, v10
; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 5, v12
; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v12
; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
; GFX9-NEXT: v_or_b32_e32 v12, v12, v13
; GFX9-NEXT: v_lshrrev_b32_e32 v10, 2, v10
; GFX9-NEXT: v_add_u32_e32 v10, v10, v12
; GFX9-NEXT: v_mov_b32_e32 v12, 0x7c00
; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 31, v11
; GFX9-NEXT: v_cndmask_b32_e32 v10, v12, v10, vcc
; GFX9-NEXT: v_mov_b32_e32 v13, 0x7e00
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
; GFX9-NEXT: s_movk_i32 s6, 0x40f
; GFX9-NEXT: v_cndmask_b32_e32 v4, v12, v13, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s6, v11
; GFX9-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX9-NEXT: s_mov_b32 s7, 0x8000
; GFX9-NEXT: v_and_or_b32 v4, v5, s7, v4
; GFX9-NEXT: v_and_or_b32 v5, v7, s4, v6
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v7
; GFX9-NEXT: v_bfe_u32 v10, v7, 20, 11
; GFX9-NEXT: v_and_or_b32 v5, v6, s5, v5
; GFX9-NEXT: v_sub_u32_e32 v11, 0x3f1, v10
; GFX9-NEXT: v_or_b32_e32 v6, 0x1000, v5
; GFX9-NEXT: v_med3_i32 v11, v11, 0, 13
; GFX9-NEXT: v_lshrrev_b32_e32 v14, v11, v6
; GFX9-NEXT: v_lshlrev_b32_e32 v11, v11, v14
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v11, v6
; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GFX9-NEXT: v_add_u32_e32 v10, 0xfffffc10, v10
; GFX9-NEXT: v_or_b32_e32 v6, v14, v6
; GFX9-NEXT: v_lshl_or_b32 v11, v10, 12, v5
; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 1, v10
; GFX9-NEXT: v_cndmask_b32_e32 v6, v11, v6, vcc
; GFX9-NEXT: v_and_b32_e32 v11, 7, v6
; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 5, v11
; GFX9-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v11
; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX9-NEXT: v_or_b32_e32 v11, v11, v14
; GFX9-NEXT: v_lshrrev_b32_e32 v6, 2, v6
; GFX9-NEXT: v_add_u32_e32 v6, v6, v11
; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 31, v10
; GFX9-NEXT: v_cndmask_b32_e32 v6, v12, v6, vcc
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
; GFX9-NEXT: v_cndmask_b32_e32 v5, v12, v13, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s6, v10
; GFX9-NEXT: v_and_or_b32 v0, v1, s4, v0
; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v7
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_and_or_b32 v5, v6, s7, v5
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v6, 8, v1
; GFX9-NEXT: v_bfe_u32 v7, v1, 20, 11
; GFX9-NEXT: v_and_or_b32 v0, v6, s5, v0
; GFX9-NEXT: v_sub_u32_e32 v10, 0x3f1, v7
; GFX9-NEXT: v_or_b32_e32 v6, 0x1000, v0
; GFX9-NEXT: v_med3_i32 v10, v10, 0, 13
; GFX9-NEXT: v_lshrrev_b32_e32 v11, v10, v6
; GFX9-NEXT: v_lshlrev_b32_e32 v10, v10, v11
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v10, v6
; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; GFX9-NEXT: v_add_u32_e32 v7, 0xfffffc10, v7
; GFX9-NEXT: v_or_b32_e32 v6, v11, v6
; GFX9-NEXT: v_lshl_or_b32 v10, v7, 12, v0
; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 1, v7
; GFX9-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc
; GFX9-NEXT: v_and_b32_e32 v10, 7, v6
; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 5, v10
; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v10
; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GFX9-NEXT: v_or_b32_e32 v10, v10, v11
; GFX9-NEXT: v_lshrrev_b32_e32 v6, 2, v6
; GFX9-NEXT: v_add_u32_e32 v6, v6, v10
; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 31, v7
; GFX9-NEXT: v_cndmask_b32_e32 v6, v12, v6, vcc
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s6, v7
; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_and_or_b32 v0, v1, s7, v0
; GFX9-NEXT: v_and_or_b32 v1, v3, s4, v2
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v3
; GFX9-NEXT: v_bfe_u32 v6, v3, 20, 11
; GFX9-NEXT: v_and_or_b32 v1, v2, s5, v1
; GFX9-NEXT: v_sub_u32_e32 v7, 0x3f1, v6
; GFX9-NEXT: v_or_b32_e32 v2, 0x1000, v1
; GFX9-NEXT: v_med3_i32 v7, v7, 0, 13
; GFX9-NEXT: v_lshrrev_b32_e32 v10, v7, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v7, v7, v10
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, v7, v2
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT: v_add_u32_e32 v6, 0xfffffc10, v6
; GFX9-NEXT: v_or_b32_e32 v2, v10, v2
; GFX9-NEXT: v_lshl_or_b32 v7, v6, 12, v1
; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 1, v6
; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
; GFX9-NEXT: v_and_b32_e32 v7, 7, v2
; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 5, v7
; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v7
; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
; GFX9-NEXT: v_or_b32_e32 v7, v7, v10
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 2, v2
; GFX9-NEXT: v_add_u32_e32 v2, v2, v7
; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 31, v6
; GFX9-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v12, v13, vcc
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, s6, v6
; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v3
; GFX9-NEXT: v_and_or_b32 v1, v2, s7, v1
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4
; GFX9-NEXT: s_mov_b32 s5, 0x7fff7fff
; GFX9-NEXT: v_perm_b32 v1, v5, v4, s4
; GFX9-NEXT: v_bfi_b32 v0, s5, v0, v8
; GFX9-NEXT: v_bfi_b32 v1, s5, v1, v9
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_copysign_out_v4f16_mag_v4f64_sign_v4f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_and_or_b32 v6, 0x1ff, v7, v6
; GFX11-TRUE16-NEXT: v_bfe_u32 v10, v7, 20, 11
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 8, v7
; GFX11-TRUE16-NEXT: v_and_or_b32 v4, 0x1ff, v5, v4
; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0x1ff, v3, v2
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v12, 0x3f1, v10
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v5
; GFX11-TRUE16-NEXT: v_and_or_b32 v0, 0x1ff, v1, v0
; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v5, 20, 11
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v3
; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v3, 20, 11
; GFX11-TRUE16-NEXT: v_bfe_u32 v20, v1, 20, 11
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v7.h
; GFX11-TRUE16-NEXT: v_and_or_b32 v6, 0xffe, v11, v6
; GFX11-TRUE16-NEXT: v_med3_i32 v11, v12, 0, 13
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, 0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, 0x1000, v6
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, v11, v12
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, v11, v14
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v11, v12
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 8, v1
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v11
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v10, 0xfffffc10, v10
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v14, 0x3f1, v17
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v21, v10, 12, v6
; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v10
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_and_or_b32 v4, 0xffe, v16, v4
; GFX11-TRUE16-NEXT: v_med3_i32 v14, v14, 0, 13
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v21, v11, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v13, 0x7e00
; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x1000, v4
; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v21, 0x3f1, v19
; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 7, v11
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 2, v11
; GFX11-TRUE16-NEXT: v_med3_i32 v21, v21, 0, 13
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffe, v18, v2
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, v14, v22
; GFX11-TRUE16-NEXT: v_and_or_b32 v0, 0xffe, v12, v0
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v16
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v16, v23
; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v6, 0x7c00, v13 :: v_dual_add_nc_u32 v11, v11, v16
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v12, v14, v18
; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, 0x1000, v2
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v12, v22
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, v21, v14
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v10
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v18, v12
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, 0x7c00, v11, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v10
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v17, 0xfffffc10, v17
; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v10, 0x3f1, v20
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v6, v11, v6, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v16, v17, 12, v4
; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v17
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, v21, v22
; GFX11-TRUE16-NEXT: v_med3_i32 v10, v10, 0, 13
; GFX11-TRUE16-NEXT: v_and_or_b32 v6, 0x8000, v15, v6
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v5.h
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v12, v16, v12, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v11, v14
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v14, 0xfffffc10, v19
; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x1000, v0
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc_lo
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v19, v14, 12, v2
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, v10, v16
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v22, v11
; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 7, v12
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, v10, v7
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 2, v12
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v18
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v14
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v19, v11, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v18
; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 7, v11
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v10, v16
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 2, v11
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v18, v21
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v19
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v10
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v16, 0xfffffc10, v20
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v19
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_lshl_or_b32 v10, v16, 12, v0
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v16
; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v7, v10, v7 :: v_dual_add_nc_u32 v10, v12, v18
; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v17
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v19, v20
; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 7, v7
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 2, v7
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v10, 0x7c00, v10, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v4, 0x7c00, v13 :: v_dual_add_nc_u32 v11, v11, v12
; GFX11-TRUE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v18
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v18
; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v17
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v14
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v18, v12
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v5, 0x7c00, v11, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v7, v7, v10
; GFX11-TRUE16-NEXT: v_and_or_b32 v4, 0x8000, v15, v4
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v3.h
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v6.l
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v13, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v14
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v16
; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0x8000, v15, v2
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, 0x7c00, v7, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v1.h
; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff7fff, v4, v9
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v13, vcc_lo
; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v16
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo
; GFX11-TRUE16-NEXT: v_and_or_b32 v0, 0x8000, v15, v0
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v8
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_copysign_out_v4f16_mag_v4f64_sign_v4f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_and_or_b32 v4, 0x1ff, v5, v4
; GFX11-FAKE16-NEXT: v_bfe_u32 v10, v5, 20, 11
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 8, v5
; GFX11-FAKE16-NEXT: v_and_or_b32 v6, 0x1ff, v7, v6
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 8, v7
; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
; GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v12, 0x3f1, v10
; GFX11-FAKE16-NEXT: v_bfe_u32 v15, v7, 20, 11
; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0x1ff, v1, v0
; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0x1ff, v3, v2
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
; GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v17, 0x3f1, v15
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX11-FAKE16-NEXT: v_and_or_b32 v4, 0xffe, v11, v4
; GFX11-FAKE16-NEXT: v_med3_i32 v11, v12, 0, 13
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc_lo
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, 0x1000, v4
; GFX11-FAKE16-NEXT: v_and_or_b32 v6, 0xffe, v13, v6
; GFX11-FAKE16-NEXT: v_med3_i32 v13, v17, 0, 13
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, v11, v12
; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, 0x1000, v6
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v11, v11, v14
; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v11, v12
; GFX11-FAKE16-NEXT: v_bfe_u32 v12, v1, 20, 11
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc_lo
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v14, v11
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, 0xfffffc10, v10
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v14, 0xfffffc10, v15
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, v13, v17
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v10, 12, v4
; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v10
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v13, v13, v15
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v18, v11, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v18, 0x3f1, v12
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 7, v11
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 2, v11
; GFX11-FAKE16-NEXT: v_med3_i32 v18, v18, 0, 13
; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0xffe, v16, v0
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v13, v17
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v16, v14, 12, v6
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v19
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v15, v13
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v19
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v14
; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v19, v17
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v16, v13, vcc_lo
; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, 0x1000, v0
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 8, v3
; GFX11-FAKE16-NEXT: v_bfe_u32 v19, v3, 20, 11
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v11, v11, v15
; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v10
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, v18, v16
; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0xffe, v17, v2
; GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v17, 0x3f1, v19
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v19, 0xfffffc10, v19
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v11, 0x7c00, v11 :: v_dual_lshlrev_b32 v18, v18, v20
; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 7, v13
; GFX11-FAKE16-NEXT: v_or_b32_e32 v21, 0x1000, v2
; GFX11-FAKE16-NEXT: v_med3_i32 v17, v17, 0, 13
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 2, v13
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX11-FAKE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v15
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, v17, v21
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v15
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v18, v16
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v18, 0x7e00 :: v_dual_lshlrev_b32 v17, v17, v23
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v15, v15, v22
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, v17, v21
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, v13, v15
; GFX11-FAKE16-NEXT: v_or_b32_e32 v16, v20, v16
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc_lo
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v17, v23, v17
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v12, 0xfffffc10, v12
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v12, 12, v0
; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v12
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v20, v16, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v4
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v19, 12, v2
; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v4, 0x7c00, v18 :: v_dual_and_b32 v15, 7, v16
; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v19
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v17, v20, v17, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v10
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v11, v4, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v14
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v11, 7, v17
; GFX11-FAKE16-NEXT: v_and_or_b32 v4, 0x8000, v5, v4
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, 0x7c00, v13, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v15
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v15
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v11
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v13, v15, v13
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v11
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 2, v16
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 2, v17
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v6
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v13, v15, v13
; GFX11-FAKE16-NEXT: v_or_b32_e32 v11, v11, v20
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, 0x7c00, v18, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v14
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v12
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v16, v11
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, 0x7c00, v13, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v18, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v19
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, 0x7c00, v10, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v18, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v12
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v11, v0, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v19
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_and_or_b32 v0, 0x8000, v1, v0
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc_lo
; GFX11-FAKE16-NEXT: v_and_or_b32 v1, 0x8000, v3, v2
; GFX11-FAKE16-NEXT: v_and_or_b32 v2, 0x8000, v7, v6
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v1, v2, v4, 0x5040100
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v8
; GFX11-FAKE16-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v9
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%mag.trunc = fptrunc <4 x double> %mag to <4 x half>
%result = call <4 x half> @llvm.copysign.v4f16(<4 x half> %mag.trunc, <4 x half> %sign)
ret <4 x half> %result
}
; copysign on <4 x half> where the magnitude is already half precision and the
; sign operand arrives as <4 x float>. The sign vector is narrowed with fptrunc
; before the intrinsic call. Every expected sequence below converts all four
; sign floats with v_cvt_f16_f32 before combining them with the magnitude.
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define <4 x half> @v_copysign_out_v4f16_mag_v4f16_sign_v4f32(<4 x half> %mag, <4 x float> %sign) {
; SI-LABEL: v_copysign_out_v4f16_mag_v4f16_sign_v4f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_cvt_f16_f32_e32 v5, v5
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_and_b32_e32 v6, 0x7fff, v1
; SI-NEXT: v_and_b32_e32 v4, 0x8000, v4
; SI-NEXT: v_or_b32_e32 v4, v6, v4
; SI-NEXT: v_and_b32_e32 v6, 0x7fff, v0
; SI-NEXT: v_and_b32_e32 v5, 0x8000, v5
; SI-NEXT: v_bfe_u32 v1, v1, 16, 15
; SI-NEXT: v_and_b32_e32 v3, 0x8000, v3
; SI-NEXT: v_bfe_u32 v0, v0, 16, 15
; SI-NEXT: v_and_b32_e32 v2, 0x8000, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v5
; SI-NEXT: v_or_b32_e32 v0, v0, v3
; SI-NEXT: v_or_b32_e32 v2, v6, v2
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_or_b32_e32 v0, v2, v0
; SI-NEXT: v_or_b32_e32 v1, v4, v1
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_copysign_out_v4f16_mag_v4f16_sign_v4f32:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_cvt_f16_f32_e32 v2, v2
; VI-NEXT: v_cvt_f16_f32_sdwa v3, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v4, v4
; VI-NEXT: v_cvt_f16_f32_sdwa v5, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: s_mov_b32 s4, 0x7fff7fff
; VI-NEXT: v_or_b32_e32 v2, v2, v3
; VI-NEXT: v_bfi_b32 v0, s4, v0, v2
; VI-NEXT: v_or_b32_e32 v2, v4, v5
; VI-NEXT: v_bfi_b32 v1, s4, v1, v2
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_out_v4f16_mag_v4f16_sign_v4f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX9-NEXT: v_cvt_f16_f32_e32 v5, v5
; GFX9-NEXT: v_cvt_f16_f32_e32 v4, v4
; GFX9-NEXT: s_mov_b32 s4, 0x7fff7fff
; GFX9-NEXT: v_pack_b32_f16 v2, v2, v3
; GFX9-NEXT: v_bfi_b32 v0, s4, v0, v2
; GFX9-NEXT: v_pack_b32_f16 v2, v4, v5
; GFX9-NEXT: v_bfi_b32 v1, s4, v1, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_copysign_out_v4f16_mag_v4f16_sign_v4f32:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v5.h, v5
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v3.h, v3
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v3.l, v2
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v5.l, v4
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v3
; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v5
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_copysign_out_v4f16_mag_v4f16_sign_v4f32:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v5, v5
; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v4, v4
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_pack_b32_f16 v2, v2, v3
; GFX11-FAKE16-NEXT: v_pack_b32_f16 v3, v4, v5
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v2
; GFX11-FAKE16-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v3
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%sign.trunc = fptrunc <4 x float> %sign to <4 x half> ; narrow the sign operand to the magnitude's type
%out = call <4 x half> @llvm.copysign.v4f16(<4 x half> %mag, <4 x half> %sign.trunc)
ret <4 x half> %out
}
; copysign on <4 x half> where the sign operand arrives as <4 x double> and is
; narrowed with fptrunc before the intrinsic call. None of the expected
; sequences below contains a conversion instruction: the sign input is read
; only from v3, v5, v7 and v9 (presumably the high dwords of the four doubles,
; given the argument order -- confirm against the calling convention).
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define <4 x half> @v_copysign_out_v4f16_mag_v4f16_sign_v4f64(<4 x half> %mag, <4 x double> %sign) {
; SI-LABEL: v_copysign_out_v4f16_mag_v4f16_sign_v4f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_and_b32_e32 v4, 0x80000000, v7
; SI-NEXT: v_and_b32_e32 v2, 0x7fff, v1
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; SI-NEXT: v_and_b32_e32 v3, 0x80000000, v3
; SI-NEXT: v_or_b32_e32 v2, v2, v4
; SI-NEXT: v_and_b32_e32 v4, 0x7fff, v0
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; SI-NEXT: v_or_b32_e32 v3, v4, v3
; SI-NEXT: v_and_b32_e32 v4, 0x80000000, v9
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; SI-NEXT: v_bfe_u32 v1, v1, 16, 15
; SI-NEXT: v_or_b32_e32 v1, v1, v4
; SI-NEXT: v_and_b32_e32 v4, 0x80000000, v5
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; SI-NEXT: v_bfe_u32 v0, v0, 16, 15
; SI-NEXT: v_or_b32_e32 v0, v0, v4
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_or_b32_e32 v0, v3, v0
; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_copysign_out_v4f16_mag_v4f16_sign_v4f64:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v5
; VI-NEXT: v_and_b32_e32 v3, 0x8000, v3
; VI-NEXT: v_or_b32_e32 v2, v3, v2
; VI-NEXT: s_mov_b32 s4, 0x7fff7fff
; VI-NEXT: v_bfi_b32 v0, s4, v0, v2
; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v9
; VI-NEXT: v_and_b32_e32 v3, 0x8000, v7
; VI-NEXT: v_or_b32_e32 v2, v3, v2
; VI-NEXT: v_bfi_b32 v1, s4, v1, v2
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_out_v4f16_mag_v4f16_sign_v4f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v2, v5, v3, s4
; GFX9-NEXT: s_mov_b32 s5, 0x7fff7fff
; GFX9-NEXT: v_bfi_b32 v0, s5, v0, v2
; GFX9-NEXT: v_perm_b32 v2, v9, v7, s4
; GFX9-NEXT: v_bfi_b32 v1, s5, v1, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_copysign_out_v4f16_mag_v4f16_sign_v4f64:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v5.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v9.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v3
; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v7
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_copysign_out_v4f16_mag_v4f16_sign_v4f64:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_perm_b32 v2, v5, v3, 0x5040100
; GFX11-FAKE16-NEXT: v_perm_b32 v3, v9, v7, 0x5040100
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_bfi_b32 v0, 0x7fff7fff, v0, v2
; GFX11-FAKE16-NEXT: v_bfi_b32 v1, 0x7fff7fff, v1, v3
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%sign.trunc = fptrunc <4 x double> %sign to <4 x half> ; narrow the sign operand to the magnitude's type
%out = call <4 x half> @llvm.copysign.v4f16(<4 x half> %mag, <4 x half> %sign.trunc)
ret <4 x half> %out
}
; Scalar (inreg, amdgpu_ps) copysign with a constant 0.0 magnitude and a half
; sign operand. The half result is bitcast to i16 and zero-extended to i32 for
; the return. Every expected sequence below is a single s_and_b32 of the input
; with 0x8000.
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_ps i32 @s_copysign_f16_0_f16(half inreg %sign) {
; SI-LABEL: s_copysign_f16_0_f16:
; SI: ; %bb.0:
; SI-NEXT: s_and_b32 s0, s0, 0x8000
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: s_copysign_f16_0_f16:
; VI: ; %bb.0:
; VI-NEXT: s_and_b32 s0, s0, 0x8000
; VI-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_copysign_f16_0_f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_and_b32 s0, s0, 0x8000
; GFX9-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_copysign_f16_0_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_and_b32 s0, s0, 0x8000
; GFX11-NEXT: ; return to shader part epilog
%op = call half @llvm.copysign.f16(half 0.0, half %sign)
%cast = bitcast half %op to i16 ; reinterpret the half result as raw bits
%zext = zext i16 %cast to i32 ; widen to the i32 return type with zeroed upper bits
ret i32 %zext
}
; Vector-register copysign with a constant 0.0 magnitude and a half sign
; operand, returning half directly. Every expected sequence below is a single
; AND of the input with 0x8000 (a 16-bit v_and_b16 in the gfx1100 real-true16
; run, v_and_b32 elsewhere).
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define half @v_copysign_f16_0_f16(half %sign) {
; SI-LABEL: v_copysign_f16_0_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_and_b32_e32 v0, 0x8000, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_copysign_f16_0_f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_and_b32_e32 v0, 0x8000, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_f16_0_f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v0, 0x8000, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_copysign_f16_0_f16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0x8000, v0.l
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_copysign_f16_0_f16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x8000, v0
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%op = call half @llvm.copysign.f16(half 0.0, half %sign)
ret half %op
}
; Scalar (inreg, amdgpu_ps) copysign with a constant 0.0 magnitude where the
; sign operand is a float narrowed to half with fptrunc. The half result is
; bitcast to i16 and zero-extended to i32 for the return. Every expected
; sequence below moves through vector registers: v_cvt_f16_f32 of s0, an AND
; with 0x8000, then v_readfirstlane_b32 back into s0.
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_ps i32 @s_copysign_f16_0_f32(float inreg %sign) {
; SI-LABEL: s_copysign_f16_0_f32:
; SI: ; %bb.0:
; SI-NEXT: v_cvt_f16_f32_e32 v0, s0
; SI-NEXT: v_and_b32_e32 v0, 0x8000, v0
; SI-NEXT: v_readfirstlane_b32 s0, v0
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: s_copysign_f16_0_f32:
; VI: ; %bb.0:
; VI-NEXT: v_cvt_f16_f32_e32 v0, s0
; VI-NEXT: v_and_b32_e32 v0, 0x8000, v0
; VI-NEXT: v_readfirstlane_b32 s0, v0
; VI-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_copysign_f16_0_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_cvt_f16_f32_e32 v0, s0
; GFX9-NEXT: v_and_b32_e32 v0, 0x8000, v0
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX11-TRUE16-LABEL: s_copysign_f16_0_f32:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x8000, v0
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-TRUE16-NEXT: ; return to shader part epilog
;
; GFX11-FAKE16-LABEL: s_copysign_f16_0_f32:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, s0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x8000, v0
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-FAKE16-NEXT: ; return to shader part epilog
%sign.trunc = fptrunc float %sign to half ; narrow the sign operand to half
%op = call half @llvm.copysign.f16(half 0.0, half %sign.trunc)
%cast = bitcast half %op to i16 ; reinterpret the half result as raw bits
%zext = zext i16 %cast to i32 ; widen to the i32 return type with zeroed upper bits
ret i32 %zext
}
; Vector-register copysign with a constant 0.0 magnitude where the sign operand
; is a float narrowed to half with fptrunc. The expected sequences differ by
; target: the tahiti run keeps the v_cvt_f16_f32 and then masks with 0x8000,
; while the tonga, gfx900 and gfx1100 runs contain no conversion and instead
; apply the 0x8000 mask to the upper 16 bits of the input register
; (src1_sel:WORD_1, v0.h, or a 16-bit right shift first).
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define half @v_copysign_f16_0_f32(float %sign) {
; SI-LABEL: v_copysign_f16_0_f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_and_b32_e32 v0, 0x8000, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_copysign_f16_0_f32:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, 0x8000
; VI-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_f16_0_f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, 0x8000
; GFX9-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_copysign_f16_0_f32:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0x8000, v0.h
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_copysign_f16_0_f32:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x8000, v0
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%sign.trunc = fptrunc float %sign to half ; narrow the sign operand to half
%op = call half @llvm.copysign.f16(half 0.0, half %sign.trunc)
ret half %op
}
; Scalar (inreg, amdgpu_ps) copysign with a constant 0.0 magnitude where the
; sign operand is a double narrowed to half with fptrunc. The half result is
; bitcast to i16 and zero-extended to i32 for the return. Every expected
; sequence below still contains a long integer instruction sequence
; (presumably the expanded f64-to-f16 conversion -- confirm against the
; lowering code) before the final OR with s1 >> 16 and the AND with 0x8000.
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define amdgpu_ps i32 @s_copysign_f16_0_f64(double inreg %sign) {
; SI-LABEL: s_copysign_f16_0_f64:
; SI: ; %bb.0:
; SI-NEXT: s_and_b32 s2, s1, 0x1ff
; SI-NEXT: s_or_b32 s0, s2, s0
; SI-NEXT: s_cselect_b64 s[2:3], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
; SI-NEXT: s_lshr_b32 s0, s1, 8
; SI-NEXT: s_bfe_u32 s3, s1, 0xb0014
; SI-NEXT: s_and_b32 s0, s0, 0xffe
; SI-NEXT: v_readfirstlane_b32 s2, v0
; SI-NEXT: s_sub_i32 s4, 0x3f1, s3
; SI-NEXT: s_or_b32 s0, s0, s2
; SI-NEXT: v_med3_i32 v0, s4, 0, 13
; SI-NEXT: s_or_b32 s2, s0, 0x1000
; SI-NEXT: v_readfirstlane_b32 s4, v0
; SI-NEXT: s_lshr_b32 s5, s2, s4
; SI-NEXT: s_lshl_b32 s4, s5, s4
; SI-NEXT: s_cmp_lg_u32 s4, s2
; SI-NEXT: s_cselect_b32 s2, 1, 0
; SI-NEXT: s_add_i32 s4, s3, 0xfffffc10
; SI-NEXT: s_lshl_b32 s3, s4, 12
; SI-NEXT: s_or_b32 s2, s5, s2
; SI-NEXT: s_or_b32 s0, s0, s3
; SI-NEXT: s_cmp_lt_i32 s4, 1
; SI-NEXT: s_cselect_b32 s0, s2, s0
; SI-NEXT: s_and_b32 s2, s0, 7
; SI-NEXT: s_cmp_gt_i32 s2, 5
; SI-NEXT: s_cselect_b32 s3, 1, 0
; SI-NEXT: s_cmp_eq_u32 s2, 3
; SI-NEXT: s_cselect_b32 s2, 1, 0
; SI-NEXT: s_or_b32 s2, s2, s3
; SI-NEXT: s_lshr_b32 s0, s0, 2
; SI-NEXT: s_add_i32 s0, s0, s2
; SI-NEXT: s_cmp_lt_i32 s4, 31
; SI-NEXT: s_cselect_b64 s[2:3], -1, 0
; SI-NEXT: s_cmpk_lg_i32 s4, 0x40f
; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
; SI-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3]
; SI-NEXT: s_and_b64 s[2:3], s[2:3], exec
; SI-NEXT: s_cselect_b32 s0, s0, 0x7c00
; SI-NEXT: s_lshr_b32 s1, s1, 16
; SI-NEXT: s_or_b32 s0, s1, s0
; SI-NEXT: s_and_b32 s0, s0, 0x8000
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: s_copysign_f16_0_f64:
; VI: ; %bb.0:
; VI-NEXT: s_and_b32 s2, s1, 0x1ff
; VI-NEXT: s_or_b32 s0, s2, s0
; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
; VI-NEXT: s_lshr_b32 s0, s1, 8
; VI-NEXT: s_bfe_u32 s3, s1, 0xb0014
; VI-NEXT: s_and_b32 s0, s0, 0xffe
; VI-NEXT: v_readfirstlane_b32 s2, v0
; VI-NEXT: s_sub_i32 s4, 0x3f1, s3
; VI-NEXT: s_or_b32 s0, s0, s2
; VI-NEXT: v_med3_i32 v0, s4, 0, 13
; VI-NEXT: s_or_b32 s2, s0, 0x1000
; VI-NEXT: v_readfirstlane_b32 s4, v0
; VI-NEXT: s_lshr_b32 s5, s2, s4
; VI-NEXT: s_lshl_b32 s4, s5, s4
; VI-NEXT: s_cmp_lg_u32 s4, s2
; VI-NEXT: s_cselect_b32 s2, 1, 0
; VI-NEXT: s_add_i32 s4, s3, 0xfffffc10
; VI-NEXT: s_lshl_b32 s3, s4, 12
; VI-NEXT: s_or_b32 s2, s5, s2
; VI-NEXT: s_or_b32 s0, s0, s3
; VI-NEXT: s_cmp_lt_i32 s4, 1
; VI-NEXT: s_cselect_b32 s0, s2, s0
; VI-NEXT: s_and_b32 s2, s0, 7
; VI-NEXT: s_cmp_gt_i32 s2, 5
; VI-NEXT: s_cselect_b32 s3, 1, 0
; VI-NEXT: s_cmp_eq_u32 s2, 3
; VI-NEXT: s_cselect_b32 s2, 1, 0
; VI-NEXT: s_or_b32 s2, s2, s3
; VI-NEXT: s_lshr_b32 s0, s0, 2
; VI-NEXT: s_add_i32 s0, s0, s2
; VI-NEXT: s_cmp_lt_i32 s4, 31
; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
; VI-NEXT: s_cmpk_lg_i32 s4, 0x40f
; VI-NEXT: s_cselect_b64 s[4:5], -1, 0
; VI-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3]
; VI-NEXT: s_and_b64 s[2:3], s[2:3], exec
; VI-NEXT: s_cselect_b32 s0, s0, 0x7c00
; VI-NEXT: s_lshr_b32 s1, s1, 16
; VI-NEXT: s_or_b32 s0, s1, s0
; VI-NEXT: s_and_b32 s0, s0, 0x8000
; VI-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_copysign_f16_0_f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_and_b32 s2, s1, 0x1ff
; GFX9-NEXT: s_or_b32 s0, s2, s0
; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
; GFX9-NEXT: s_lshr_b32 s0, s1, 8
; GFX9-NEXT: s_bfe_u32 s3, s1, 0xb0014
; GFX9-NEXT: s_and_b32 s0, s0, 0xffe
; GFX9-NEXT: v_readfirstlane_b32 s2, v0
; GFX9-NEXT: s_sub_i32 s4, 0x3f1, s3
; GFX9-NEXT: s_or_b32 s0, s0, s2
; GFX9-NEXT: v_med3_i32 v0, s4, 0, 13
; GFX9-NEXT: s_or_b32 s2, s0, 0x1000
; GFX9-NEXT: v_readfirstlane_b32 s4, v0
; GFX9-NEXT: s_lshr_b32 s5, s2, s4
; GFX9-NEXT: s_lshl_b32 s4, s5, s4
; GFX9-NEXT: s_cmp_lg_u32 s4, s2
; GFX9-NEXT: s_cselect_b32 s2, 1, 0
; GFX9-NEXT: s_add_i32 s4, s3, 0xfffffc10
; GFX9-NEXT: s_lshl_b32 s3, s4, 12
; GFX9-NEXT: s_or_b32 s2, s5, s2
; GFX9-NEXT: s_or_b32 s0, s0, s3
; GFX9-NEXT: s_cmp_lt_i32 s4, 1
; GFX9-NEXT: s_cselect_b32 s0, s2, s0
; GFX9-NEXT: s_and_b32 s2, s0, 7
; GFX9-NEXT: s_cmp_gt_i32 s2, 5
; GFX9-NEXT: s_cselect_b32 s3, 1, 0
; GFX9-NEXT: s_cmp_eq_u32 s2, 3
; GFX9-NEXT: s_cselect_b32 s2, 1, 0
; GFX9-NEXT: s_or_b32 s2, s2, s3
; GFX9-NEXT: s_lshr_b32 s0, s0, 2
; GFX9-NEXT: s_add_i32 s0, s0, s2
; GFX9-NEXT: s_cmp_lt_i32 s4, 31
; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0
; GFX9-NEXT: s_cmpk_lg_i32 s4, 0x40f
; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX9-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3]
; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec
; GFX9-NEXT: s_cselect_b32 s0, s0, 0x7c00
; GFX9-NEXT: s_lshr_b32 s1, s1, 16
; GFX9-NEXT: s_or_b32 s0, s1, s0
; GFX9-NEXT: s_and_b32 s0, s0, 0x8000
; GFX9-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_copysign_f16_0_f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_and_b32 s2, s1, 0x1ff
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s0, s2, s0
; GFX11-NEXT: s_cselect_b32 s0, -1, 0
; GFX11-NEXT: s_lshr_b32 s3, s1, 8
; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-NEXT: s_bfe_u32 s0, s1, 0xb0014
; GFX11-NEXT: s_and_b32 s3, s3, 0xffe
; GFX11-NEXT: s_sub_i32 s2, 0x3f1, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_med3_i32 v1, s2, 0, 13
; GFX11-NEXT: v_readfirstlane_b32 s2, v0
; GFX11-NEXT: v_readfirstlane_b32 s4, v1
; GFX11-NEXT: s_or_b32 s2, s3, s2
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s3, s2, 0x1000
; GFX11-NEXT: s_lshr_b32 s5, s3, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_lshl_b32 s4, s5, s4
; GFX11-NEXT: s_cmp_lg_u32 s4, s3
; GFX11-NEXT: s_cselect_b32 s3, 1, 0
; GFX11-NEXT: s_addk_i32 s0, 0xfc10
; GFX11-NEXT: s_or_b32 s3, s5, s3
; GFX11-NEXT: s_lshl_b32 s4, s0, 12
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_or_b32 s2, s2, s4
; GFX11-NEXT: s_cmp_lt_i32 s0, 1
; GFX11-NEXT: s_cselect_b32 s2, s3, s2
; GFX11-NEXT: s_and_b32 s3, s2, 7
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_cmp_gt_i32 s3, 5
; GFX11-NEXT: s_cselect_b32 s4, 1, 0
; GFX11-NEXT: s_cmp_eq_u32 s3, 3
; GFX11-NEXT: s_cselect_b32 s3, 1, 0
; GFX11-NEXT: s_lshr_b32 s2, s2, 2
; GFX11-NEXT: s_or_b32 s3, s3, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_add_i32 s2, s2, s3
; GFX11-NEXT: s_cmp_lt_i32 s0, 31
; GFX11-NEXT: s_cselect_b32 s3, -1, 0
; GFX11-NEXT: s_cmpk_lg_i32 s0, 0x40f
; GFX11-NEXT: s_cselect_b32 s0, -1, 0
; GFX11-NEXT: s_and_b32 s0, s0, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, s0, exec_lo
; GFX11-NEXT: s_cselect_b32 s0, s2, 0x7c00
; GFX11-NEXT: s_lshr_b32 s1, s1, 16
; GFX11-NEXT: s_or_b32 s0, s1, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, s0, 0x8000
; GFX11-NEXT: ; return to shader part epilog
%sign.trunc = fptrunc double %sign to half ; narrow the sign operand to half
%op = call half @llvm.copysign.f16(half 0.0, half %sign.trunc)
%cast = bitcast half %op to i16 ; reinterpret the half result as raw bits
%zext = zext i16 %cast to i32 ; widen to the i32 return type with zeroed upper bits
ret i32 %zext
}
; Vector-register copysign with a constant 0.0 magnitude where the sign operand
; is a double narrowed to half with fptrunc. The expected sequences differ by
; target: the tonga, gfx900 and gfx1100 runs reduce to a 0x8000 mask applied to
; the upper 16 bits of v1, while the tahiti run keeps a long integer sequence
; (presumably the expanded f64-to-f16 conversion -- confirm against the
; lowering code) before its final mask.
; NOTE(review): the final mask recorded for the tahiti run is 0x7fff8000, not
; 0x8000. This function returns half without a zero-extension, so the upper
; bits of the mask are presumably not constrained; treat the constant as
; recorded compiler output and do not hand-edit it.
; The assertion lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
define half @v_copysign_f16_0_f64(double %sign) {
; SI-LABEL: v_copysign_f16_0_f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_and_b32_e32 v2, 0x1ff, v1
; SI-NEXT: v_or_b32_e32 v0, v2, v0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT: v_lshrrev_b32_e32 v2, 8, v1
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT: v_and_b32_e32 v2, 0xffe, v2
; SI-NEXT: v_bfe_u32 v3, v1, 20, 11
; SI-NEXT: v_or_b32_e32 v0, v2, v0
; SI-NEXT: v_sub_i32_e32 v4, vcc, 0x3f1, v3
; SI-NEXT: v_or_b32_e32 v2, 0x1000, v0
; SI-NEXT: v_med3_i32 v4, v4, 0, 13
; SI-NEXT: v_lshrrev_b32_e32 v5, v4, v2
; SI-NEXT: v_lshlrev_b32_e32 v4, v4, v5
; SI-NEXT: v_cmp_ne_u32_e32 vcc, v4, v2
; SI-NEXT: s_movk_i32 s4, 0xfc10
; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; SI-NEXT: v_add_i32_e32 v3, vcc, s4, v3
; SI-NEXT: v_lshlrev_b32_e32 v4, 12, v3
; SI-NEXT: v_or_b32_e32 v2, v5, v2
; SI-NEXT: v_or_b32_e32 v0, v0, v4
; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; SI-NEXT: v_and_b32_e32 v2, 7, v0
; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v2
; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2
; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; SI-NEXT: v_or_b32_e32 v2, v2, v4
; SI-NEXT: v_lshrrev_b32_e32 v0, 2, v0
; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; SI-NEXT: s_movk_i32 s4, 0x40f
; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3
; SI-NEXT: v_cmp_ne_u32_e64 s[4:5], s4, v3
; SI-NEXT: v_mov_b32_e32 v2, 0x7c00
; SI-NEXT: s_and_b64 vcc, s[4:5], vcc
; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: v_and_b32_e32 v0, 0x7fff8000, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_copysign_f16_0_f64:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, 0x8000
; VI-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_f16_0_f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, 0x8000
; GFX9-NEXT: v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_copysign_f16_0_f64:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0x8000, v1.h
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_copysign_f16_0_f64:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x8000, v0
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%sign.trunc = fptrunc double %sign to half ; narrow the sign operand to half
%op = call half @llvm.copysign.f16(half 0.0, half %sign.trunc)
ret half %op
}
; Uniform (inreg, amdgpu_ps) copysign.v2f16 with an all-zero magnitude and a
; <2 x half> sign operand; the result is returned bitcast to i32.
; The SI, GFX9 and GFX11 checks expect a single s_and_b32 with 0x80008000,
; whereas the VI checks expect a round trip through a VGPR: v_bfi_b32 with the
; 0x7fff7fff mask followed by v_readfirstlane_b32.
define amdgpu_ps i32 @s_copysign_v2f16_0_v2f16(<2 x half> inreg %sign) {
; SI-LABEL: s_copysign_v2f16_0_v2f16:
; SI: ; %bb.0:
; SI-NEXT: s_and_b32 s0, s0, 0x80008000
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: s_copysign_v2f16_0_v2f16:
; VI: ; %bb.0:
; VI-NEXT: s_mov_b32 s1, 0x7fff7fff
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_bfi_b32 v0, s1, 0, v0
; VI-NEXT: v_readfirstlane_b32 s0, v0
; VI-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_copysign_v2f16_0_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_and_b32 s0, 0x80008000, s0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_copysign_v2f16_0_v2f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_and_b32 s0, 0x80008000, s0
; GFX11-NEXT: ; return to shader part epilog
%op = call <2 x half> @llvm.copysign.v2f16(<2 x half> zeroinitializer, <2 x half> %sign)
%cast = bitcast <2 x half> %op to i32
ret i32 %cast
}
; copysign.v2f16 with an all-zero magnitude and a <2 x half> sign operand,
; using the default calling convention (no inreg on the argument).
; The SI, GFX9 and GFX11 checks expect a single v_and_b32 with 0x80008000;
; the VI checks expect v_bfi_b32 with a 0x7fff7fff mask held in s4 instead.
define <2 x half> @v_copysign_v2f16_0_v2f16(<2 x half> %sign) {
; SI-LABEL: v_copysign_v2f16_0_v2f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_and_b32_e32 v0, 0x80008000, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_copysign_v2f16_0_v2f16:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: s_mov_b32 s4, 0x7fff7fff
; VI-NEXT: v_bfi_b32 v0, s4, 0, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_v2f16_0_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v0, 0x80008000, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_copysign_v2f16_0_v2f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v0, 0x80008000, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call <2 x half> @llvm.copysign.v2f16(<2 x half> zeroinitializer, <2 x half> %sign)
ret <2 x half> %op
}
; Uniform (inreg, amdgpu_ps) copysign.v2f16 with an all-zero magnitude whose
; sign operand is a <2 x float> argument truncated to <2 x half>; the result is
; returned bitcast to i32.
; Every target's checks still expect a v_cvt_f16_f32 for each element before
; the sign bits are isolated, and a v_readfirstlane_b32 to move the result back
; into s0. The SI checks mask each element with 0x8000 separately and then
; combine them; the other targets combine first and mask once (0x80008000
; and-mask, or v_bfi_b32 with 0x7fff7fff in the VI checks).
define amdgpu_ps i32 @s_copysign_v2f16_0_v2f32(<2 x float> inreg %sign) {
; SI-LABEL: s_copysign_v2f16_0_v2f32:
; SI: ; %bb.0:
; SI-NEXT: v_cvt_f16_f32_e32 v1, s1
; SI-NEXT: v_cvt_f16_f32_e32 v0, s0
; SI-NEXT: v_and_b32_e32 v1, 0x8000, v1
; SI-NEXT: v_and_b32_e32 v0, 0x8000, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: v_readfirstlane_b32 s0, v0
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: s_copysign_v2f16_0_v2f32:
; VI: ; %bb.0:
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_cvt_f16_f32_e32 v1, s0
; VI-NEXT: v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: s_mov_b32 s0, 0x7fff7fff
; VI-NEXT: v_or_b32_e32 v0, v1, v0
; VI-NEXT: v_bfi_b32 v0, s0, 0, v0
; VI-NEXT: v_readfirstlane_b32 s0, v0
; VI-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_copysign_v2f16_0_v2f32:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_cvt_f16_f32_e32 v0, s1
; GFX9-NEXT: v_cvt_f16_f32_e32 v1, s0
; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0
; GFX9-NEXT: v_and_b32_e32 v0, 0x80008000, v0
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX11-TRUE16-LABEL: s_copysign_v2f16_0_v2f32:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, s1
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, s0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x80008000, v0
; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-TRUE16-NEXT: ; return to shader part epilog
;
; GFX11-FAKE16-LABEL: s_copysign_v2f16_0_v2f32:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, s1
; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, s0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v1, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x80008000, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-FAKE16-NEXT: ; return to shader part epilog
%sign.trunc = fptrunc <2 x float> %sign to <2 x half>
%op = call <2 x half> @llvm.copysign.v2f16(<2 x half> zeroinitializer, <2 x half> %sign.trunc)
%cast = bitcast <2 x half> %op to i32
ret i32 %cast
}
; copysign.v2f16 with an all-zero magnitude whose sign operand is a
; <2 x float> argument truncated to <2 x half>, using the default calling
; convention. Every target's checks still expect a v_cvt_f16_f32 for each
; element before the sign bits are isolated.
; NOTE(review): despite the "bf32" in the function name, the argument type is
; <2 x float> and no bfloat type appears in the body; the uniform counterpart
; in this file is named s_copysign_v2f16_0_v2f32, so the "b" looks like a
; naming slip - confirm before renaming, since the LABEL checks depend on it.
define <2 x half> @v_copysign_v2f16_0_v2bf32(<2 x float> %sign) {
; SI-LABEL: v_copysign_v2f16_0_v2bf32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_and_b32_e32 v1, 0x8000, v1
; SI-NEXT: v_and_b32_e32 v0, 0x8000, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_copysign_v2f16_0_v2bf32:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_cvt_f16_f32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-NEXT: s_mov_b32 s4, 0x7fff7fff
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: v_bfi_b32 v0, s4, 0, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_v2f16_0_v2bf32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX9-NEXT: v_and_b32_e32 v0, 0x80008000, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_copysign_v2f16_0_v2bf32:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.h, v1
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v1.l, v0
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x80008000, v1
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_copysign_v2f16_0_v2bf32:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x80008000, v0
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%sign.trunc = fptrunc <2 x float> %sign to <2 x half>
%op = call <2 x half> @llvm.copysign.v2f16(<2 x half> zeroinitializer, <2 x half> %sign.trunc)
ret <2 x half> %op
}
; Uniform (inreg, amdgpu_ps) copysign.v2f16 with an all-zero magnitude whose
; sign operand is a <2 x double> argument truncated to <2 x half>; the result
; is returned bitcast to i32. None of the checks expect a real f64-to-f16
; conversion; they only move bits out of s1 and s3.
; NOTE(review): the SI checks keep bit 31 of s1 and of s3 (0x80000000 masks),
; but the VI, GFX9 and GFX11 checks combine the low 16 bits of s1 and s3 and
; keep bit 15 of each (0x8000 / 0x80008000 masks, or the equivalent v_bfi_b32).
; If s1 and s3 hold the high dwords of the two doubles (not visible here -
; confirm), bit 15 is a mantissa bit rather than the sign bit, which would make
; this a codegen bug captured by the autogenerated checks rather than the
; intended result.
define amdgpu_ps i32 @s_copysign_v2f16_0_v2f64(<2 x double> inreg %sign) {
; SI-LABEL: s_copysign_v2f16_0_v2f64:
; SI: ; %bb.0:
; SI-NEXT: s_and_b32 s0, s1, 0x80000000
; SI-NEXT: s_and_b32 s1, s3, 0x80000000
; SI-NEXT: s_lshr_b32 s1, s1, 16
; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], 16
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: s_copysign_v2f16_0_v2f64:
; VI: ; %bb.0:
; VI-NEXT: s_lshl_b32 s0, s3, 16
; VI-NEXT: s_and_b32 s1, s1, 0x8000
; VI-NEXT: s_or_b32 s0, s1, s0
; VI-NEXT: s_mov_b32 s1, 0x7fff7fff
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_bfi_b32 v0, s1, 0, v0
; VI-NEXT: v_readfirstlane_b32 s0, v0
; VI-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_copysign_v2f16_0_v2f64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s1, s3
; GFX9-NEXT: s_and_b32 s0, 0x80008000, s0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_copysign_v2f16_0_v2f64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_pack_ll_b32_b16 s0, s1, s3
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_b32 s0, 0x80008000, s0
; GFX11-NEXT: ; return to shader part epilog
%sign.trunc = fptrunc <2 x double> %sign to <2 x half>
%op = call <2 x half> @llvm.copysign.v2f16(<2 x half> zeroinitializer, <2 x half> %sign.trunc)
%cast = bitcast <2 x half> %op to i32
ret i32 %cast
}
; copysign.v2f16 with an all-zero magnitude whose sign operand is a
; <2 x double> argument truncated to <2 x half>, using the default calling
; convention. None of the checks expect a real f64-to-f16 conversion; they only
; move bits out of v1 and v3.
; NOTE(review): the SI checks keep bit 31 of v1 and of v3 (0x80000000 masks),
; but the VI, GFX9 and GFX11 checks combine the low 16 bits of v1 and v3
; (shift/or, v_perm_b32 with selector 0x5040100, or v_mov_b16 from v3.l) and
; keep bit 15 of each. If v1 and v3 hold the high dwords of the two doubles
; (not visible here - confirm), bit 15 is a mantissa bit rather than the sign
; bit, which would make this a codegen bug captured by the autogenerated
; checks.
; NOTE(review): despite the "bf64" in the function name, the argument type is
; <2 x double>; the uniform counterpart in this file is named
; s_copysign_v2f16_0_v2f64, so the "b" looks like a naming slip - confirm
; before renaming, since the LABEL checks depend on it.
define <2 x half> @v_copysign_v2f16_0_v2bf64(<2 x double> %sign) {
; SI-LABEL: v_copysign_v2f16_0_v2bf64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_and_b32_e32 v0, 0x80000000, v1
; SI-NEXT: v_and_b32_e32 v1, 0x80000000, v3
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_alignbit_b32 v0, v1, v0, 16
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_copysign_v2f16_0_v2bf64:
; VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v3
; VI-NEXT: v_and_b32_e32 v1, 0x8000, v1
; VI-NEXT: v_or_b32_e32 v0, v1, v0
; VI-NEXT: s_mov_b32 s4, 0x7fff7fff
; VI-NEXT: v_bfi_b32 v0, s4, 0, v0
; VI-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_copysign_v2f16_0_v2bf64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s4, 0x5040100
; GFX9-NEXT: v_perm_b32 v0, v3, v1, s4
; GFX9-NEXT: v_and_b32_e32 v0, 0x80008000, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_copysign_v2f16_0_v2bf64:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x80008000, v1
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_copysign_v2f16_0_v2bf64:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_perm_b32 v0, v3, v1, 0x5040100
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x80008000, v0
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
%sign.trunc = fptrunc <2 x double> %sign to <2 x half>
%op = call <2 x half> @llvm.copysign.v2f16(<2 x half> zeroinitializer, <2 x half> %sign.trunc)
ret <2 x half> %op
}
attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }