Use the default, which freely coalesces anything it can. This mostly shows improvements, with a handful of regressions. The main concern would be if introducing wider registers is more likely to push the register usage up to the next occupancy tier.
3343 lines
138 KiB
LLVM
3343 lines
138 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GCN,GFX7 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck -check-prefixes=GCN,GFX8 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16, -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-TRUE16 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16, -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-FAKE16 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1250 %s
|
|
; i16 multiply of two inreg arguments in an amdgpu_ps function.
; The checks expect a single scalar s_mul_i32 on s0/s1 for every target; the
; gfx1250 run is additionally expected to emit an s_setreg_imm32_b32 first.
define amdgpu_ps i16 @s_mul_i16(i16 inreg %num, i16 inreg %den) {
; GCN-LABEL: s_mul_i16:
; GCN: ; %bb.0:
; GCN-NEXT: s_mul_i32 s0, s0, s1
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_mul_i16:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s1
; GFX10PLUS-NEXT: ; return to shader part epilog
;
; GFX12-LABEL: s_mul_i16:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_mul_i32 s0, s0, s1
; GFX12-NEXT: ; return to shader part epilog
;
; GFX1250-LABEL: s_mul_i16:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_mul_i32 s0, s0, s1
; GFX1250-NEXT: ; return to shader part epilog
  %result = mul i16 %num, %den
  ret i16 %result
}
|
|
|
|
; i16 multiply in a function with no explicit calling convention (operands
; appear in v0/v1 in the checks).
; The gfx700 run is expected to mask both inputs with 0xffff and multiply with
; v_mul_u32_u24; all later targets are expected to use v_mul_lo_u16, operating
; on v0.l/v1.l when real-true16 is enabled.
define i16 @v_mul_i16(i16 %num, i16 %den) {
; GFX7-LABEL: v_mul_i16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX7-NEXT: v_mul_u32_u24_e32 v0, v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u16_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u16_e32 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_i16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mul_lo_u16 v0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_mul_i16:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_mul_i16:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_mul_i16:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mul_lo_u16 v0, v0, v1
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_mul_i16:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_mul_lo_u16 v0, v0, v1
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
  %result = mul i16 %num, %den
  ret i16 %result
}
|
|
|
|
; i16 multiply of two inreg zeroext arguments with a zeroext return, in an
; amdgpu_ps function.
; Every target is expected to emit s_mul_i32 followed by an s_and_b32 with
; 0xffff; the gfx1200 and gfx1250 runs also expect an s_delay_alu between
; the two instructions.
define amdgpu_ps zeroext i16 @s_mul_i16_zeroext(i16 inreg zeroext %num, i16 inreg zeroext %den) {
; GFX7-LABEL: s_mul_i16_zeroext:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_mul_i32 s0, s0, s1
; GFX7-NEXT: s_and_b32 s0, s0, 0xffff
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i16_zeroext:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_mul_i32 s0, s0, s1
; GFX8-NEXT: s_and_b32 s0, 0xffff, s0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i16_zeroext:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mul_i32 s0, s0, s1
; GFX9-NEXT: s_and_b32 s0, 0xffff, s0
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_mul_i16_zeroext:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s1
; GFX10PLUS-NEXT: s_and_b32 s0, 0xffff, s0
; GFX10PLUS-NEXT: ; return to shader part epilog
;
; GFX12-LABEL: s_mul_i16_zeroext:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_mul_i32 s0, s0, s1
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_b32 s0, 0xffff, s0
; GFX12-NEXT: ; return to shader part epilog
;
; GFX1250-LABEL: s_mul_i16_zeroext:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_mul_i32 s0, s0, s1
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: s_and_b32 s0, 0xffff, s0
; GFX1250-NEXT: ; return to shader part epilog
  %result = mul i16 %num, %den
  ret i16 %result
}
|
|
|
|
; i16 multiply with zeroext arguments and a zeroext return, no explicit
; calling convention.
; Expected sequences per the checks below:
;   gfx700                  - v_mul_u32_u24, then v_and_b32 with 0xffff
;   gfx801 / gfx900         - v_mul_lo_u16 only, with no separate mask
;   gfx1100 with true16     - v_mul_lo_u16 on v0.l, then v_mov_b16 v0.h, 0
;   all remaining targets   - v_mul_lo_u16, then v_and_b32 with 0xffff
define zeroext i16 @v_mul_i16_zeroext(i16 zeroext %num, i16 zeroext %den) {
; GFX7-LABEL: v_mul_i16_zeroext:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_u32_u24_e32 v0, v0, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i16_zeroext:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u16_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i16_zeroext:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u16_e32 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_i16_zeroext:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mul_lo_u16 v0, v0, v1
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_mul_i16_zeroext:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_mul_i16_zeroext:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_mul_i16_zeroext:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mul_lo_u16 v0, v0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_mul_i16_zeroext:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_mul_lo_u16 v0, v0, v1
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
  %result = mul i16 %num, %den
  ret i16 %result
}
|
|
|
|
; i16 multiply of two inreg signext arguments with a signext return, in an
; amdgpu_ps function.
; Every target is expected to emit s_mul_i32 followed by s_sext_i32_i16 to
; re-extend the 16-bit product.
define amdgpu_ps signext i16 @s_mul_i16_signext(i16 inreg signext %num, i16 inreg signext %den) {
; GCN-LABEL: s_mul_i16_signext:
; GCN: ; %bb.0:
; GCN-NEXT: s_mul_i32 s0, s0, s1
; GCN-NEXT: s_sext_i32_i16 s0, s0
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_mul_i16_signext:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s1
; GFX10PLUS-NEXT: s_sext_i32_i16 s0, s0
; GFX10PLUS-NEXT: ; return to shader part epilog
;
; GFX12-LABEL: s_mul_i16_signext:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_mul_i32 s0, s0, s1
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_sext_i32_i16 s0, s0
; GFX12-NEXT: ; return to shader part epilog
;
; GFX1250-LABEL: s_mul_i16_signext:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_mul_i32 s0, s0, s1
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX1250-NEXT: s_sext_i32_i16 s0, s0
; GFX1250-NEXT: ; return to shader part epilog
  %result = mul i16 %num, %den
  ret i16 %result
}
|
|
|
|
; i16 multiply with signext arguments and a signext return, no explicit
; calling convention.
; The gfx700 run is expected to mask both inputs with 0xffff and multiply with
; v_mul_u32_u24; later targets are expected to use v_mul_lo_u16. In every
; case the product is then sign-extended from 16 bits with v_bfe_i32.
define signext i16 @v_mul_i16_signext(i16 signext %num, i16 signext %den) {
; GFX7-LABEL: v_mul_i16_signext:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX7-NEXT: v_mul_u32_u24_e32 v0, v0, v1
; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_mul_i16_signext:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mul_lo_u16_e32 v0, v0, v1
; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_mul_i16_signext:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_mul_lo_u16_e32 v0, v0, v1
; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_i16_signext:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mul_lo_u16 v0, v0, v1
; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: v_mul_i16_signext:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l
; GFX11-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-FAKE16-LABEL: v_mul_i16_signext:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1
; GFX11-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_mul_i16_signext:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mul_lo_u16 v0, v0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_mul_i16_signext:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_mul_lo_u16 v0, v0, v1
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
  %result = mul i16 %num, %den
  ret i16 %result
}
|
|
|
|
; i32 multiply of two inreg arguments in an amdgpu_ps function.
; Every target is expected to emit a single s_mul_i32 on s0/s1; the gfx1250
; run is additionally expected to emit an s_setreg_imm32_b32 first.
define amdgpu_ps i32 @s_mul_i32(i32 inreg %num, i32 inreg %den) {
; GCN-LABEL: s_mul_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_mul_i32 s0, s0, s1
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_mul_i32:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s1
; GFX10PLUS-NEXT: ; return to shader part epilog
;
; GFX12-LABEL: s_mul_i32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_mul_i32 s0, s0, s1
; GFX12-NEXT: ; return to shader part epilog
;
; GFX1250-LABEL: s_mul_i32:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_mul_i32 s0, s0, s1
; GFX1250-NEXT: ; return to shader part epilog
  %result = mul i32 %num, %den
  ret i32 %result
}
|
|
|
|
; i32 multiply in a function with no explicit calling convention.
; Every target is expected to emit a single v_mul_lo_u32 on v0/v1 between the
; entry wait instructions and the return.
define i32 @v_mul_i32(i32 %num, i32 %den) {
; GCN-LABEL: v_mul_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_lo_u32 v0, v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_mul_i32:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT: v_mul_lo_u32 v0, v0, v1
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_mul_i32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mul_lo_u32 v0, v0, v1
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_mul_i32:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_mul_lo_u32 v0, v0, v1
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
  %result = mul i32 %num, %den
  ret i32 %result
}
|
|
|
|
; <2 x i32> multiply of two inreg vector arguments in an amdgpu_ps function.
; Every target is expected to emit one s_mul_i32 per element (s0*s2 and
; s1*s3); the gfx1250 run is additionally expected to emit an
; s_setreg_imm32_b32 first.
define amdgpu_ps <2 x i32> @s_mul_v2i32(<2 x i32> inreg %num, <2 x i32> inreg %den) {
; GCN-LABEL: s_mul_v2i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_mul_i32 s0, s0, s2
; GCN-NEXT: s_mul_i32 s1, s1, s3
; GCN-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_mul_v2i32:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s2
; GFX10PLUS-NEXT: s_mul_i32 s1, s1, s3
; GFX10PLUS-NEXT: ; return to shader part epilog
;
; GFX12-LABEL: s_mul_v2i32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_mul_i32 s0, s0, s2
; GFX12-NEXT: s_mul_i32 s1, s1, s3
; GFX12-NEXT: ; return to shader part epilog
;
; GFX1250-LABEL: s_mul_v2i32:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_mul_i32 s0, s0, s2
; GFX1250-NEXT: s_mul_i32 s1, s1, s3
; GFX1250-NEXT: ; return to shader part epilog
  %result = mul <2 x i32> %num, %den
  ret <2 x i32> %result
}
|
|
|
|
; <2 x i32> multiply in a function with no explicit calling convention.
; Every target is expected to emit one v_mul_lo_u32 per element (v0*v2 and
; v1*v3).
define <2 x i32> @v_mul_v2i32(<2 x i32> %num, <2 x i32> %den) {
; GCN-LABEL: v_mul_v2i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_lo_u32 v0, v0, v2
; GCN-NEXT: v_mul_lo_u32 v1, v1, v3
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_mul_v2i32:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT: v_mul_lo_u32 v0, v0, v2
; GFX10PLUS-NEXT: v_mul_lo_u32 v1, v1, v3
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_mul_v2i32:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mul_lo_u32 v0, v0, v2
; GFX12-NEXT: v_mul_lo_u32 v1, v1, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_mul_v2i32:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_mul_lo_u32 v0, v0, v2
; GFX1250-NEXT: v_mul_lo_u32 v1, v1, v3
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
  %result = mul <2 x i32> %num, %den
  ret <2 x i32> %result
}
|
|
|
|
; i33 multiply of two inreg arguments in an amdgpu_cs function; the checks
; show it handled as a two-register (s[0:1] x s[2:3]) multiply.
; Expected sequences per the checks below:
;   gfx700 / gfx801   - high half of the low product via v_mul_hi_u32, read
;                       back with v_readfirstlane_b32; the rest in s_mul_i32
;                       and s_add_u32
;   gfx900            - fully scalar, using s_mul_hi_u32
;   gfx1010 / gfx1100 - fully scalar, using s_mul_hi_u32 and s_add_i32
;   gfx1200 / gfx1250 - a single s_mul_u64
define amdgpu_cs i33 @s_mul_i33(i33 inreg %num, i33 inreg %den) {
; GFX7-LABEL: s_mul_i33:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX7-NEXT: s_mul_i32 s4, s0, s2
; GFX7-NEXT: s_mul_i32 s0, s0, s3
; GFX7-NEXT: s_mul_i32 s1, s1, s2
; GFX7-NEXT: v_readfirstlane_b32 s5, v0
; GFX7-NEXT: s_add_u32 s0, s0, s5
; GFX7-NEXT: s_add_u32 s1, s1, s0
; GFX7-NEXT: s_mov_b32 s0, s4
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i33:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX8-NEXT: s_mul_i32 s4, s0, s2
; GFX8-NEXT: s_mul_i32 s0, s0, s3
; GFX8-NEXT: s_mul_i32 s1, s1, s2
; GFX8-NEXT: v_readfirstlane_b32 s5, v0
; GFX8-NEXT: s_add_u32 s0, s0, s5
; GFX8-NEXT: s_add_u32 s1, s1, s0
; GFX8-NEXT: s_mov_b32 s0, s4
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i33:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mul_i32 s4, s0, s2
; GFX9-NEXT: s_mul_hi_u32 s5, s0, s2
; GFX9-NEXT: s_mul_i32 s0, s0, s3
; GFX9-NEXT: s_add_u32 s0, s0, s5
; GFX9-NEXT: s_mul_i32 s1, s1, s2
; GFX9-NEXT: s_add_u32 s1, s1, s0
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_mul_i33:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_mul_hi_u32 s4, s0, s2
; GFX10PLUS-NEXT: s_mul_i32 s3, s0, s3
; GFX10PLUS-NEXT: s_mul_i32 s1, s1, s2
; GFX10PLUS-NEXT: s_add_i32 s3, s4, s3
; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s2
; GFX10PLUS-NEXT: s_add_i32 s1, s3, s1
; GFX10PLUS-NEXT: ; return to shader part epilog
;
; GFX12-LABEL: s_mul_i33:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_mul_u64 s[0:1], s[0:1], s[2:3]
; GFX12-NEXT: ; return to shader part epilog
;
; GFX1250-LABEL: s_mul_i33:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_mul_u64 s[0:1], s[0:1], s[2:3]
; GFX1250-NEXT: ; return to shader part epilog
  %result = mul i33 %num, %den
  ret i33 %result
}
|
|
|
|
; i64 multiply of two inreg arguments (s[0:1] x s[2:3]) in an amdgpu_ps
; function.
; Expected sequences per the checks below:
;   gfx700 / gfx801   - high half of the low product via v_mul_hi_u32, read
;                       back with v_readfirstlane_b32; the rest in s_mul_i32
;                       and s_add_u32
;   gfx900            - fully scalar, using s_mul_hi_u32
;   gfx1010 / gfx1100 - fully scalar, using s_mul_hi_u32 and s_add_i32
;   gfx1200 / gfx1250 - a single s_mul_u64
define amdgpu_ps i64 @s_mul_i64(i64 inreg %num, i64 inreg %den) {
; GFX7-LABEL: s_mul_i64:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX7-NEXT: s_mul_i32 s4, s0, s2
; GFX7-NEXT: s_mul_i32 s0, s0, s3
; GFX7-NEXT: s_mul_i32 s1, s1, s2
; GFX7-NEXT: v_readfirstlane_b32 s5, v0
; GFX7-NEXT: s_add_u32 s0, s0, s5
; GFX7-NEXT: s_add_u32 s1, s1, s0
; GFX7-NEXT: s_mov_b32 s0, s4
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX8-NEXT: s_mul_i32 s4, s0, s2
; GFX8-NEXT: s_mul_i32 s0, s0, s3
; GFX8-NEXT: s_mul_i32 s1, s1, s2
; GFX8-NEXT: v_readfirstlane_b32 s5, v0
; GFX8-NEXT: s_add_u32 s0, s0, s5
; GFX8-NEXT: s_add_u32 s1, s1, s0
; GFX8-NEXT: s_mov_b32 s0, s4
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mul_i32 s4, s0, s2
; GFX9-NEXT: s_mul_hi_u32 s5, s0, s2
; GFX9-NEXT: s_mul_i32 s0, s0, s3
; GFX9-NEXT: s_add_u32 s0, s0, s5
; GFX9-NEXT: s_mul_i32 s1, s1, s2
; GFX9-NEXT: s_add_u32 s1, s1, s0
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_mul_i64:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_mul_hi_u32 s4, s0, s2
; GFX10PLUS-NEXT: s_mul_i32 s3, s0, s3
; GFX10PLUS-NEXT: s_mul_i32 s1, s1, s2
; GFX10PLUS-NEXT: s_add_i32 s3, s4, s3
; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s2
; GFX10PLUS-NEXT: s_add_i32 s1, s3, s1
; GFX10PLUS-NEXT: ; return to shader part epilog
;
; GFX12-LABEL: s_mul_i64:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_mul_u64 s[0:1], s[0:1], s[2:3]
; GFX12-NEXT: ; return to shader part epilog
;
; GFX1250-LABEL: s_mul_i64:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_mul_u64 s[0:1], s[0:1], s[2:3]
; GFX1250-NEXT: ; return to shader part epilog
  %result = mul i64 %num, %den
  ret i64 %result
}
|
|
|
|
; i64 multiply in a function with no explicit calling convention (operands in
; v[0:1] and v[2:3] per the checks).
; Expected sequences per the checks below:
;   gfx700 / gfx801 / gfx900 / gfx1010 / gfx1100
;             - register copies followed by three chained v_mad_u64_u32
;   gfx1200   - v_mul_hi_u32 and v_mul_lo_u32 plus two v_mad_co_u64_u32
;   gfx1250   - a single v_mul_u64
define i64 @v_mul_i64(i64 %num, i64 %den) {
; GCN-LABEL: v_mul_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v4, v0
; GCN-NEXT: v_mov_b32_e32 v5, v2
; GCN-NEXT: v_mov_b32_e32 v6, v1
; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v5, 0
; GCN-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v4, v3, v[1:2]
; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v5, v[7:8]
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_i64:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v4, v0
; GFX10-NEXT: v_mov_b32_e32 v5, v1
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v4, v2, 0
; GFX10-NEXT: v_mad_u64_u32 v[3:4], s4, v4, v3, v[1:2]
; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v5, v2, v[3:4]
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_mul_i64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v2
; GFX11-NEXT: v_mov_b32_e32 v6, v1
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v5, 0
; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v4, v3, v[1:2]
; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v6, v5, v[7:8]
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_mul_i64:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mul_hi_u32 v4, v0, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], null, v0, v3, v[4:5]
; GFX12-NEXT: v_mul_lo_u32 v0, v0, v2
; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v1, v2, v[3:4]
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_mul_i64:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_mul_u64_e32 v[0:1], v[0:1], v[2:3]
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
  %result = mul i64 %num, %den
  ret i64 %result
}
|
|
|
|
; i96 multiply of two inreg arguments in an amdgpu_ps function; the product is
; bitcast to <3 x i32> for the return.
; Expected sequences per the checks below:
;   gfx700 / gfx801 - partial-product high halves via v_mul_hi_u32, read back
;                     with v_readfirstlane_b32, combined with s_mul_i32 and
;                     s_add_u32 / s_addc_u32
;   gfx900 onwards  - fully scalar, using s_mul_i32 and s_mul_hi_u32 with
;                     carry-propagating adds (spelled s_add_co_u32 and
;                     s_add_co_ci_u32 in the gfx1200 and gfx1250 runs)
define amdgpu_ps <3 x i32> @s_mul_i96(i96 inreg %num, i96 inreg %den) {
; GFX7-LABEL: s_mul_i96:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mov_b32_e32 v0, s3
; GFX7-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX7-NEXT: v_mov_b32_e32 v1, s4
; GFX7-NEXT: v_mul_hi_u32 v1, s0, v1
; GFX7-NEXT: s_mul_i32 s5, s0, s5
; GFX7-NEXT: v_readfirstlane_b32 s7, v0
; GFX7-NEXT: s_mul_i32 s8, s1, s4
; GFX7-NEXT: v_mov_b32_e32 v0, s1
; GFX7-NEXT: s_add_u32 s5, s8, s5
; GFX7-NEXT: s_mul_i32 s2, s2, s3
; GFX7-NEXT: v_mul_hi_u32 v0, v0, s3
; GFX7-NEXT: s_mul_i32 s6, s0, s3
; GFX7-NEXT: s_add_u32 s2, s2, s5
; GFX7-NEXT: s_mul_i32 s0, s0, s4
; GFX7-NEXT: v_readfirstlane_b32 s4, v1
; GFX7-NEXT: s_add_u32 s0, s0, s7
; GFX7-NEXT: s_addc_u32 s2, s4, s2
; GFX7-NEXT: s_mul_i32 s1, s1, s3
; GFX7-NEXT: v_readfirstlane_b32 s3, v0
; GFX7-NEXT: s_add_u32 s1, s1, s0
; GFX7-NEXT: s_addc_u32 s2, s3, s2
; GFX7-NEXT: s_mov_b32 s0, s6
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i96:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_mov_b32_e32 v0, s3
; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX8-NEXT: v_mov_b32_e32 v1, s4
; GFX8-NEXT: v_mul_hi_u32 v1, s0, v1
; GFX8-NEXT: s_mul_i32 s5, s0, s5
; GFX8-NEXT: v_readfirstlane_b32 s7, v0
; GFX8-NEXT: s_mul_i32 s8, s1, s4
; GFX8-NEXT: v_mov_b32_e32 v0, s1
; GFX8-NEXT: s_add_u32 s5, s8, s5
; GFX8-NEXT: s_mul_i32 s2, s2, s3
; GFX8-NEXT: v_mul_hi_u32 v0, v0, s3
; GFX8-NEXT: s_mul_i32 s6, s0, s3
; GFX8-NEXT: s_add_u32 s2, s2, s5
; GFX8-NEXT: s_mul_i32 s0, s0, s4
; GFX8-NEXT: v_readfirstlane_b32 s4, v1
; GFX8-NEXT: s_add_u32 s0, s0, s7
; GFX8-NEXT: s_addc_u32 s2, s4, s2
; GFX8-NEXT: s_mul_i32 s1, s1, s3
; GFX8-NEXT: v_readfirstlane_b32 s3, v0
; GFX8-NEXT: s_add_u32 s1, s1, s0
; GFX8-NEXT: s_addc_u32 s2, s3, s2
; GFX8-NEXT: s_mov_b32 s0, s6
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i96:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mul_i32 s5, s0, s5
; GFX9-NEXT: s_mul_i32 s8, s1, s4
; GFX9-NEXT: s_add_u32 s5, s8, s5
; GFX9-NEXT: s_mul_i32 s2, s2, s3
; GFX9-NEXT: s_mul_hi_u32 s7, s0, s3
; GFX9-NEXT: s_add_u32 s2, s2, s5
; GFX9-NEXT: s_mul_i32 s5, s0, s4
; GFX9-NEXT: s_mul_i32 s6, s0, s3
; GFX9-NEXT: s_mul_hi_u32 s0, s0, s4
; GFX9-NEXT: s_add_u32 s4, s5, s7
; GFX9-NEXT: s_addc_u32 s0, s0, s2
; GFX9-NEXT: s_mul_i32 s2, s1, s3
; GFX9-NEXT: s_mul_hi_u32 s3, s1, s3
; GFX9-NEXT: s_add_u32 s1, s2, s4
; GFX9-NEXT: s_addc_u32 s2, s3, s0
; GFX9-NEXT: s_mov_b32 s0, s6
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_mul_i96:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_mul_i32 s6, s0, s5
; GFX10PLUS-NEXT: s_mul_i32 s7, s1, s4
; GFX10PLUS-NEXT: s_mul_i32 s2, s2, s3
; GFX10PLUS-NEXT: s_add_i32 s6, s6, s7
; GFX10PLUS-NEXT: s_mul_hi_u32 s7, s0, s3
; GFX10PLUS-NEXT: s_add_i32 s6, s6, s2
; GFX10PLUS-NEXT: s_mul_i32 s2, s0, s4
; GFX10PLUS-NEXT: s_mul_i32 s5, s0, s3
; GFX10PLUS-NEXT: s_mul_hi_u32 s0, s0, s4
; GFX10PLUS-NEXT: s_add_u32 s2, s2, s7
; GFX10PLUS-NEXT: s_mul_i32 s4, s1, s3
; GFX10PLUS-NEXT: s_addc_u32 s0, s0, s6
; GFX10PLUS-NEXT: s_mul_hi_u32 s3, s1, s3
; GFX10PLUS-NEXT: s_add_u32 s1, s4, s2
; GFX10PLUS-NEXT: s_addc_u32 s2, s3, s0
; GFX10PLUS-NEXT: s_mov_b32 s0, s5
; GFX10PLUS-NEXT: ; return to shader part epilog
;
; GFX12-LABEL: s_mul_i96:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_mul_i32 s6, s0, s5
; GFX12-NEXT: s_mul_i32 s7, s1, s4
; GFX12-NEXT: s_mul_i32 s2, s2, s3
; GFX12-NEXT: s_add_co_i32 s6, s6, s7
; GFX12-NEXT: s_mul_hi_u32 s7, s0, s3
; GFX12-NEXT: s_add_co_i32 s6, s6, s2
; GFX12-NEXT: s_mul_i32 s2, s0, s4
; GFX12-NEXT: s_mul_i32 s5, s0, s3
; GFX12-NEXT: s_mul_hi_u32 s0, s0, s4
; GFX12-NEXT: s_add_co_u32 s2, s2, s7
; GFX12-NEXT: s_mul_i32 s4, s1, s3
; GFX12-NEXT: s_add_co_ci_u32 s0, s0, s6
; GFX12-NEXT: s_mul_hi_u32 s3, s1, s3
; GFX12-NEXT: s_add_co_u32 s1, s4, s2
; GFX12-NEXT: s_add_co_ci_u32 s2, s3, s0
; GFX12-NEXT: s_mov_b32 s0, s5
; GFX12-NEXT: ; return to shader part epilog
;
; GFX1250-LABEL: s_mul_i96:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
; GFX1250-NEXT: s_mul_i32 s6, s0, s5
; GFX1250-NEXT: s_mul_i32 s7, s1, s4
; GFX1250-NEXT: s_mul_i32 s2, s2, s3
; GFX1250-NEXT: s_add_co_i32 s6, s6, s7
; GFX1250-NEXT: s_mul_hi_u32 s7, s0, s3
; GFX1250-NEXT: s_add_co_i32 s6, s6, s2
; GFX1250-NEXT: s_mul_i32 s2, s0, s4
; GFX1250-NEXT: s_mul_i32 s5, s0, s3
; GFX1250-NEXT: s_mul_hi_u32 s0, s0, s4
; GFX1250-NEXT: s_add_co_u32 s2, s2, s7
; GFX1250-NEXT: s_mul_i32 s4, s1, s3
; GFX1250-NEXT: s_add_co_ci_u32 s0, s0, s6
; GFX1250-NEXT: s_mul_hi_u32 s3, s1, s3
; GFX1250-NEXT: s_add_co_u32 s1, s4, s2
; GFX1250-NEXT: s_add_co_ci_u32 s2, s3, s0
; GFX1250-NEXT: s_mov_b32 s0, s5
; GFX1250-NEXT: ; return to shader part epilog
  %result = mul i96 %num, %den
  %cast = bitcast i96 %result to <3 x i32>
  ret <3 x i32> %cast
}
|
|
|
|
; i96 multiply in a function with no explicit calling convention (operands in
; v[0:2] and v[3:5] per the checks).
; Expected sequences per the checks below:
;   gfx700 / gfx801 / gfx900 - register copies and a chain of v_mad_u64_u32
;   gfx1010 / gfx1100        - one v_mul_lo_u32 plus a chain of v_mad_u64_u32
;   gfx1200                  - same shape, with v_mad_co_u64_u32
;   gfx1250                  - v_mul_lo_u32, v_mad_u32 and v_mad_nc_u64_u32,
;                              with the upper result dwords copied into v1/v2
;                              at the end
define i96 @v_mul_i96(i96 %num, i96 %den) {
; GCN-LABEL: v_mul_i96:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v6, v0
; GCN-NEXT: v_mov_b32_e32 v7, v1
; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v5, 0
; GCN-NEXT: v_mov_b32_e32 v8, v2
; GCN-NEXT: v_mov_b32_e32 v9, v3
; GCN-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v7, v4, v[0:1]
; GCN-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, 0
; GCN-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v9, v[10:11]
; GCN-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v4, v[1:2]
; GCN-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v9, v[10:11]
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_mul_i96:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v6, v0
; GFX10-NEXT: v_mov_b32_e32 v7, v1
; GFX10-NEXT: v_mov_b32_e32 v8, v3
; GFX10-NEXT: v_mul_lo_u32 v0, v6, v5
; GFX10-NEXT: v_mad_u64_u32 v[9:10], s4, v7, v4, v[0:1]
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v6, v8, 0
; GFX10-NEXT: v_mad_u64_u32 v[2:3], s4, v2, v8, v[9:10]
; GFX10-NEXT: v_mad_u64_u32 v[9:10], s4, v6, v4, v[1:2]
; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v7, v8, v[9:10]
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_mul_i96:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, v1
; GFX11-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_mov_b32 v9, v3
; GFX11-NEXT: v_mul_lo_u32 v0, v6, v5
; GFX11-NEXT: v_mad_u64_u32 v[10:11], null, v7, v4, v[0:1]
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v6, v9, 0
; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v8, v9, v[10:11]
; GFX11-NEXT: v_mad_u64_u32 v[10:11], null, v6, v4, v[1:2]
; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v7, v9, v[10:11]
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_mul_i96:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, v1
; GFX12-NEXT: v_mov_b32_e32 v8, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_mul_lo_u32 v0, v6, v5
; GFX12-NEXT: v_mad_co_u64_u32 v[9:10], null, v7, v4, v[0:1]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v6, v8, 0
; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, v2, v8, v[9:10]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_mad_co_u64_u32 v[9:10], null, v6, v4, v[1:2]
; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v7, v8, v[9:10]
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: v_mul_i96:
; GFX1250: ; %bb.0:
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, v1
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_mul_lo_u32 v0, v7, v4
; GFX1250-NEXT: v_mad_u32 v5, v6, v5, v0
; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v6, v3, 0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1250-NEXT: v_mad_u32 v9, v2, v3, v5
; GFX1250-NEXT: v_mov_b32_e32 v8, v1
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_mad_nc_u64_u32 v[10:11], v6, v4, v[8:9]
; GFX1250-NEXT: v_mad_nc_u64_u32 v[4:5], v7, v3, v[10:11]
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
  %result = mul i96 %num, %den
  ret i96 %result
}
|
|
|
|
; 128-bit multiply in an amdgpu_ps function whose two i128 operands are both
; marked inreg; the product is bitcast to a 4 x i32 vector for the return value.
; The expected code below builds the result from 32-bit partial products
; (s_mul_i32 for low halves, a mul_hi_u32 for high halves) joined by
; carry-propagating scalar adds. The GFX7 and GFX8 expectations obtain the high
; halves with v_mul_hi_u32 followed by v_readfirstlane_b32, whereas the GFX9 and
; newer expectations use s_mul_hi_u32 directly.
define amdgpu_ps <4 x i32> @s_mul_i128(i128 inreg %num, i128 inreg %den) {
|
|
; GFX7-LABEL: s_mul_i128:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX7-NEXT: v_mul_hi_u32 v0, s0, v0
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX7-NEXT: v_mul_hi_u32 v2, s1, v1
|
|
; GFX7-NEXT: s_mul_i32 s10, s0, s6
|
|
; GFX7-NEXT: v_readfirstlane_b32 s9, v0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s6
|
|
; GFX7-NEXT: v_mul_hi_u32 v0, s0, v0
|
|
; GFX7-NEXT: v_readfirstlane_b32 s13, v2
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: v_mul_hi_u32 v2, v2, s4
|
|
; GFX7-NEXT: s_mul_i32 s12, s1, s5
|
|
; GFX7-NEXT: v_readfirstlane_b32 s11, v0
|
|
; GFX7-NEXT: s_add_u32 s10, s12, s10
|
|
; GFX7-NEXT: v_mul_hi_u32 v1, s0, v1
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s1
|
|
; GFX7-NEXT: s_addc_u32 s11, s13, s11
|
|
; GFX7-NEXT: s_mul_i32 s12, s2, s4
|
|
; GFX7-NEXT: v_readfirstlane_b32 s13, v2
|
|
; GFX7-NEXT: s_add_u32 s10, s12, s10
|
|
; GFX7-NEXT: v_mul_hi_u32 v0, v0, s4
|
|
; GFX7-NEXT: s_addc_u32 s11, s13, s11
|
|
; GFX7-NEXT: s_mul_i32 s12, s0, s5
|
|
; GFX7-NEXT: v_readfirstlane_b32 s13, v1
|
|
; GFX7-NEXT: s_add_u32 s9, s12, s9
|
|
; GFX7-NEXT: s_addc_u32 s10, s13, s10
|
|
; GFX7-NEXT: s_mul_i32 s13, s1, s4
|
|
; GFX7-NEXT: s_cselect_b32 s12, 1, 0
|
|
; GFX7-NEXT: v_readfirstlane_b32 s14, v0
|
|
; GFX7-NEXT: s_add_u32 s9, s13, s9
|
|
; GFX7-NEXT: s_mul_i32 s8, s0, s4
|
|
; GFX7-NEXT: s_addc_u32 s10, s14, s10
|
|
; GFX7-NEXT: s_mul_i32 s0, s0, s7
|
|
; GFX7-NEXT: s_addc_u32 s0, s11, s0
|
|
; GFX7-NEXT: s_mul_i32 s1, s1, s6
|
|
; GFX7-NEXT: s_cmp_lg_u32 s12, 0
|
|
; GFX7-NEXT: s_addc_u32 s0, s0, s1
|
|
; GFX7-NEXT: s_mul_i32 s2, s2, s5
|
|
; GFX7-NEXT: s_add_u32 s0, s2, s0
|
|
; GFX7-NEXT: s_mul_i32 s3, s3, s4
|
|
; GFX7-NEXT: s_add_u32 s3, s3, s0
|
|
; GFX7-NEXT: s_mov_b32 s0, s8
|
|
; GFX7-NEXT: s_mov_b32 s1, s9
|
|
; GFX7-NEXT: s_mov_b32 s2, s10
|
|
; GFX7-NEXT: ; return to shader part epilog
|
|
;
|
|
; GFX8-LABEL: s_mul_i128:
|
|
; GFX8: ; %bb.0:
|
|
; GFX8-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0
|
|
; GFX8-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX8-NEXT: v_mul_hi_u32 v2, s1, v1
|
|
; GFX8-NEXT: s_mul_i32 s10, s0, s6
|
|
; GFX8-NEXT: v_readfirstlane_b32 s9, v0
|
|
; GFX8-NEXT: v_mov_b32_e32 v0, s6
|
|
; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0
|
|
; GFX8-NEXT: v_readfirstlane_b32 s13, v2
|
|
; GFX8-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX8-NEXT: v_mul_hi_u32 v2, v2, s4
|
|
; GFX8-NEXT: s_mul_i32 s12, s1, s5
|
|
; GFX8-NEXT: v_readfirstlane_b32 s11, v0
|
|
; GFX8-NEXT: s_add_u32 s10, s12, s10
|
|
; GFX8-NEXT: v_mul_hi_u32 v1, s0, v1
|
|
; GFX8-NEXT: v_mov_b32_e32 v0, s1
|
|
; GFX8-NEXT: s_addc_u32 s11, s13, s11
|
|
; GFX8-NEXT: s_mul_i32 s12, s2, s4
|
|
; GFX8-NEXT: v_readfirstlane_b32 s13, v2
|
|
; GFX8-NEXT: s_add_u32 s10, s12, s10
|
|
; GFX8-NEXT: v_mul_hi_u32 v0, v0, s4
|
|
; GFX8-NEXT: s_addc_u32 s11, s13, s11
|
|
; GFX8-NEXT: s_mul_i32 s12, s0, s5
|
|
; GFX8-NEXT: v_readfirstlane_b32 s13, v1
|
|
; GFX8-NEXT: s_add_u32 s9, s12, s9
|
|
; GFX8-NEXT: s_addc_u32 s10, s13, s10
|
|
; GFX8-NEXT: s_mul_i32 s13, s1, s4
|
|
; GFX8-NEXT: s_cselect_b32 s12, 1, 0
|
|
; GFX8-NEXT: v_readfirstlane_b32 s14, v0
|
|
; GFX8-NEXT: s_add_u32 s9, s13, s9
|
|
; GFX8-NEXT: s_mul_i32 s8, s0, s4
|
|
; GFX8-NEXT: s_addc_u32 s10, s14, s10
|
|
; GFX8-NEXT: s_mul_i32 s0, s0, s7
|
|
; GFX8-NEXT: s_addc_u32 s0, s11, s0
|
|
; GFX8-NEXT: s_mul_i32 s1, s1, s6
|
|
; GFX8-NEXT: s_cmp_lg_u32 s12, 0
|
|
; GFX8-NEXT: s_addc_u32 s0, s0, s1
|
|
; GFX8-NEXT: s_mul_i32 s2, s2, s5
|
|
; GFX8-NEXT: s_add_u32 s0, s2, s0
|
|
; GFX8-NEXT: s_mul_i32 s3, s3, s4
|
|
; GFX8-NEXT: s_add_u32 s3, s3, s0
|
|
; GFX8-NEXT: s_mov_b32 s0, s8
|
|
; GFX8-NEXT: s_mov_b32 s1, s9
|
|
; GFX8-NEXT: s_mov_b32 s2, s10
|
|
; GFX8-NEXT: ; return to shader part epilog
|
|
;
|
|
; GFX9-LABEL: s_mul_i128:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_mul_i32 s10, s0, s6
|
|
; GFX9-NEXT: s_mul_i32 s12, s1, s5
|
|
; GFX9-NEXT: s_mul_hi_u32 s11, s0, s6
|
|
; GFX9-NEXT: s_mul_hi_u32 s13, s1, s5
|
|
; GFX9-NEXT: s_add_u32 s10, s12, s10
|
|
; GFX9-NEXT: s_addc_u32 s11, s13, s11
|
|
; GFX9-NEXT: s_mul_i32 s12, s2, s4
|
|
; GFX9-NEXT: s_mul_hi_u32 s13, s2, s4
|
|
; GFX9-NEXT: s_add_u32 s10, s12, s10
|
|
; GFX9-NEXT: s_mul_hi_u32 s9, s0, s4
|
|
; GFX9-NEXT: s_addc_u32 s11, s13, s11
|
|
; GFX9-NEXT: s_mul_i32 s12, s0, s5
|
|
; GFX9-NEXT: s_mul_hi_u32 s13, s0, s5
|
|
; GFX9-NEXT: s_add_u32 s9, s12, s9
|
|
; GFX9-NEXT: s_addc_u32 s10, s13, s10
|
|
; GFX9-NEXT: s_mul_i32 s13, s1, s4
|
|
; GFX9-NEXT: s_cselect_b32 s12, 1, 0
|
|
; GFX9-NEXT: s_mul_hi_u32 s14, s1, s4
|
|
; GFX9-NEXT: s_add_u32 s9, s13, s9
|
|
; GFX9-NEXT: s_mul_i32 s8, s0, s4
|
|
; GFX9-NEXT: s_addc_u32 s10, s14, s10
|
|
; GFX9-NEXT: s_mul_i32 s0, s0, s7
|
|
; GFX9-NEXT: s_addc_u32 s0, s11, s0
|
|
; GFX9-NEXT: s_mul_i32 s1, s1, s6
|
|
; GFX9-NEXT: s_cmp_lg_u32 s12, 0
|
|
; GFX9-NEXT: s_addc_u32 s0, s0, s1
|
|
; GFX9-NEXT: s_mul_i32 s2, s2, s5
|
|
; GFX9-NEXT: s_add_u32 s0, s2, s0
|
|
; GFX9-NEXT: s_mul_i32 s3, s3, s4
|
|
; GFX9-NEXT: s_add_u32 s3, s3, s0
|
|
; GFX9-NEXT: s_mov_b32 s0, s8
|
|
; GFX9-NEXT: s_mov_b32 s1, s9
|
|
; GFX9-NEXT: s_mov_b32 s2, s10
|
|
; GFX9-NEXT: ; return to shader part epilog
|
|
;
|
|
; GFX10PLUS-LABEL: s_mul_i128:
|
|
; GFX10PLUS: ; %bb.0:
|
|
; GFX10PLUS-NEXT: s_mul_i32 s9, s0, s6
|
|
; GFX10PLUS-NEXT: s_mul_i32 s11, s1, s5
|
|
; GFX10PLUS-NEXT: s_mul_hi_u32 s10, s0, s6
|
|
; GFX10PLUS-NEXT: s_mul_hi_u32 s12, s1, s5
|
|
; GFX10PLUS-NEXT: s_add_u32 s9, s11, s9
|
|
; GFX10PLUS-NEXT: s_mul_i32 s11, s2, s4
|
|
; GFX10PLUS-NEXT: s_addc_u32 s10, s12, s10
|
|
; GFX10PLUS-NEXT: s_mul_hi_u32 s12, s2, s4
|
|
; GFX10PLUS-NEXT: s_mul_hi_u32 s8, s0, s4
|
|
; GFX10PLUS-NEXT: s_add_u32 s9, s11, s9
|
|
; GFX10PLUS-NEXT: s_mul_i32 s11, s0, s5
|
|
; GFX10PLUS-NEXT: s_addc_u32 s10, s12, s10
|
|
; GFX10PLUS-NEXT: s_mul_hi_u32 s12, s0, s5
|
|
; GFX10PLUS-NEXT: s_add_u32 s8, s11, s8
|
|
; GFX10PLUS-NEXT: s_addc_u32 s9, s12, s9
|
|
; GFX10PLUS-NEXT: s_mul_i32 s12, s1, s4
|
|
; GFX10PLUS-NEXT: s_mul_hi_u32 s13, s1, s4
|
|
; GFX10PLUS-NEXT: s_cselect_b32 s11, 1, 0
|
|
; GFX10PLUS-NEXT: s_add_u32 s8, s12, s8
|
|
; GFX10PLUS-NEXT: s_mul_i32 s12, s0, s7
|
|
; GFX10PLUS-NEXT: s_addc_u32 s7, s13, s9
|
|
; GFX10PLUS-NEXT: s_addc_u32 s9, s10, s12
|
|
; GFX10PLUS-NEXT: s_mul_i32 s1, s1, s6
|
|
; GFX10PLUS-NEXT: s_cmp_lg_u32 s11, 0
|
|
; GFX10PLUS-NEXT: s_mul_i32 s2, s2, s5
|
|
; GFX10PLUS-NEXT: s_addc_u32 s1, s9, s1
|
|
; GFX10PLUS-NEXT: s_mul_i32 s3, s3, s4
|
|
; GFX10PLUS-NEXT: s_add_i32 s1, s1, s2
|
|
; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s4
|
|
; GFX10PLUS-NEXT: s_add_i32 s3, s1, s3
|
|
; GFX10PLUS-NEXT: s_mov_b32 s1, s8
|
|
; GFX10PLUS-NEXT: s_mov_b32 s2, s7
|
|
; GFX10PLUS-NEXT: ; return to shader part epilog
|
|
;
|
|
; GFX12-LABEL: s_mul_i128:
|
|
; GFX12: ; %bb.0:
|
|
; GFX12-NEXT: s_mul_i32 s9, s0, s6
|
|
; GFX12-NEXT: s_mul_i32 s11, s1, s5
|
|
; GFX12-NEXT: s_mul_hi_u32 s10, s0, s6
|
|
; GFX12-NEXT: s_mul_hi_u32 s12, s1, s5
|
|
; GFX12-NEXT: s_add_co_u32 s9, s11, s9
|
|
; GFX12-NEXT: s_mul_i32 s11, s2, s4
|
|
; GFX12-NEXT: s_add_co_ci_u32 s10, s12, s10
|
|
; GFX12-NEXT: s_mul_hi_u32 s12, s2, s4
|
|
; GFX12-NEXT: s_mul_hi_u32 s8, s0, s4
|
|
; GFX12-NEXT: s_add_co_u32 s9, s11, s9
|
|
; GFX12-NEXT: s_mul_i32 s11, s0, s5
|
|
; GFX12-NEXT: s_add_co_ci_u32 s10, s12, s10
|
|
; GFX12-NEXT: s_mul_hi_u32 s12, s0, s5
|
|
; GFX12-NEXT: s_add_co_u32 s8, s11, s8
|
|
; GFX12-NEXT: s_add_co_ci_u32 s9, s12, s9
|
|
; GFX12-NEXT: s_mul_i32 s12, s1, s4
|
|
; GFX12-NEXT: s_mul_hi_u32 s13, s1, s4
|
|
; GFX12-NEXT: s_cselect_b32 s11, 1, 0
|
|
; GFX12-NEXT: s_add_co_u32 s8, s12, s8
|
|
; GFX12-NEXT: s_mul_i32 s12, s0, s7
|
|
; GFX12-NEXT: s_add_co_ci_u32 s7, s13, s9
|
|
; GFX12-NEXT: s_add_co_ci_u32 s9, s10, s12
|
|
; GFX12-NEXT: s_mul_i32 s1, s1, s6
|
|
; GFX12-NEXT: s_cmp_lg_u32 s11, 0
|
|
; GFX12-NEXT: s_mul_i32 s2, s2, s5
|
|
; GFX12-NEXT: s_add_co_ci_u32 s1, s9, s1
|
|
; GFX12-NEXT: s_mul_i32 s3, s3, s4
|
|
; GFX12-NEXT: s_add_co_i32 s1, s1, s2
|
|
; GFX12-NEXT: s_mul_i32 s0, s0, s4
|
|
; GFX12-NEXT: s_add_co_i32 s3, s1, s3
|
|
; GFX12-NEXT: s_mov_b32 s1, s8
|
|
; GFX12-NEXT: s_mov_b32 s2, s7
|
|
; GFX12-NEXT: ; return to shader part epilog
|
|
;
|
|
; GFX1250-LABEL: s_mul_i128:
|
|
; GFX1250: ; %bb.0:
|
|
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
|
|
; GFX1250-NEXT: s_mul_i32 s9, s0, s6
|
|
; GFX1250-NEXT: s_mul_i32 s11, s1, s5
|
|
; GFX1250-NEXT: s_mul_hi_u32 s10, s0, s6
|
|
; GFX1250-NEXT: s_mul_hi_u32 s12, s1, s5
|
|
; GFX1250-NEXT: s_add_co_u32 s9, s11, s9
|
|
; GFX1250-NEXT: s_mul_i32 s11, s2, s4
|
|
; GFX1250-NEXT: s_add_co_ci_u32 s10, s12, s10
|
|
; GFX1250-NEXT: s_mul_hi_u32 s12, s2, s4
|
|
; GFX1250-NEXT: s_mul_hi_u32 s8, s0, s4
|
|
; GFX1250-NEXT: s_add_co_u32 s9, s11, s9
|
|
; GFX1250-NEXT: s_mul_i32 s11, s0, s5
|
|
; GFX1250-NEXT: s_add_co_ci_u32 s10, s12, s10
|
|
; GFX1250-NEXT: s_mul_hi_u32 s12, s0, s5
|
|
; GFX1250-NEXT: s_add_co_u32 s8, s11, s8
|
|
; GFX1250-NEXT: s_add_co_ci_u32 s9, s12, s9
|
|
; GFX1250-NEXT: s_mul_i32 s12, s1, s4
|
|
; GFX1250-NEXT: s_mul_hi_u32 s13, s1, s4
|
|
; GFX1250-NEXT: s_cselect_b32 s11, 1, 0
|
|
; GFX1250-NEXT: s_add_co_u32 s8, s12, s8
|
|
; GFX1250-NEXT: s_mul_i32 s12, s0, s7
|
|
; GFX1250-NEXT: s_add_co_ci_u32 s7, s13, s9
|
|
; GFX1250-NEXT: s_add_co_ci_u32 s9, s10, s12
|
|
; GFX1250-NEXT: s_mul_i32 s1, s1, s6
|
|
; GFX1250-NEXT: s_cmp_lg_u32 s11, 0
|
|
; GFX1250-NEXT: s_mul_i32 s2, s2, s5
|
|
; GFX1250-NEXT: s_add_co_ci_u32 s1, s9, s1
|
|
; GFX1250-NEXT: s_mul_i32 s3, s3, s4
|
|
; GFX1250-NEXT: s_add_co_i32 s1, s1, s2
|
|
; GFX1250-NEXT: s_mul_i32 s0, s0, s4
|
|
; GFX1250-NEXT: s_add_co_i32 s3, s1, s3
|
|
; GFX1250-NEXT: s_mov_b32 s1, s8
|
|
; GFX1250-NEXT: s_mov_b32 s2, s7
|
|
; GFX1250-NEXT: ; return to shader part epilog
|
|
%result = mul i128 %num, %den
|
|
; Re-type the i128 product as four 32-bit lanes to match the declared return type.
%cast = bitcast i128 %result to <4 x i32>
|
|
ret <4 x i32> %cast
|
|
}
|
|
|
|
; 128-bit multiply in a function with no explicit calling convention and no
; inreg operands; the i128 product is returned directly.
; The expected code below reads the operands from v0-v7 and forms the product
; from 64-bit multiply-add instructions (v_mad_u64_u32, v_mad_co_u64_u32 or
; v_mad_nc_u64_u32 depending on the target), plus v_mul_lo_u32 and carry-in adds
; for the terms accumulated into the highest result register (v3).
define i128 @v_mul_i128(i128 %num, i128 %den) {
|
|
; GFX7-LABEL: v_mul_i128:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_mov_b32_e32 v8, v0
|
|
; GFX7-NEXT: v_mov_b32_e32 v9, v1
|
|
; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0
|
|
; GFX7-NEXT: v_mov_b32_e32 v10, v2
|
|
; GFX7-NEXT: v_mov_b32_e32 v12, v4
|
|
; GFX7-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v9, v5, v[0:1]
|
|
; GFX7-NEXT: v_mov_b32_e32 v11, v3
|
|
; GFX7-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v12, 0
|
|
; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v12, v[13:14]
|
|
; GFX7-NEXT: v_mul_lo_u32 v4, v9, v6
|
|
; GFX7-NEXT: v_mul_lo_u32 v6, v8, v7
|
|
; GFX7-NEXT: v_mad_u64_u32 v[13:14], vcc, v8, v5, v[1:2]
|
|
; GFX7-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[13:14]
|
|
; GFX7-NEXT: v_addc_u32_e64 v3, s[4:5], v3, v6, s[4:5]
|
|
; GFX7-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc
|
|
; GFX7-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v5, v[3:4]
|
|
; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v12, v[6:7]
|
|
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX8-LABEL: v_mul_i128:
|
|
; GFX8: ; %bb.0:
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_mov_b32_e32 v8, v0
|
|
; GFX8-NEXT: v_mov_b32_e32 v9, v1
|
|
; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0
|
|
; GFX8-NEXT: v_mov_b32_e32 v10, v2
|
|
; GFX8-NEXT: v_mov_b32_e32 v12, v4
|
|
; GFX8-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v9, v5, v[0:1]
|
|
; GFX8-NEXT: v_mov_b32_e32 v11, v3
|
|
; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v12, 0
|
|
; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v12, v[13:14]
|
|
; GFX8-NEXT: v_mul_lo_u32 v4, v9, v6
|
|
; GFX8-NEXT: v_mul_lo_u32 v6, v8, v7
|
|
; GFX8-NEXT: v_mad_u64_u32 v[13:14], vcc, v8, v5, v[1:2]
|
|
; GFX8-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[13:14]
|
|
; GFX8-NEXT: v_addc_u32_e64 v3, s[4:5], v3, v6, s[4:5]
|
|
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc
|
|
; GFX8-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v5, v[3:4]
|
|
; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v12, v[6:7]
|
|
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: v_mul_i128:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_mov_b32_e32 v8, v0
|
|
; GFX9-NEXT: v_mov_b32_e32 v9, v1
|
|
; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v6, 0
|
|
; GFX9-NEXT: v_mov_b32_e32 v10, v2
|
|
; GFX9-NEXT: v_mov_b32_e32 v12, v4
|
|
; GFX9-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v9, v5, v[0:1]
|
|
; GFX9-NEXT: v_mov_b32_e32 v11, v3
|
|
; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v8, v12, 0
|
|
; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v12, v[13:14]
|
|
; GFX9-NEXT: v_mul_lo_u32 v4, v9, v6
|
|
; GFX9-NEXT: v_mul_lo_u32 v6, v8, v7
|
|
; GFX9-NEXT: v_mad_u64_u32 v[13:14], vcc, v8, v5, v[1:2]
|
|
; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[13:14]
|
|
; GFX9-NEXT: v_addc_co_u32_e64 v3, s[4:5], v3, v6, s[4:5]
|
|
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v4, vcc
|
|
; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v10, v5, v[3:4]
|
|
; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v12, v[6:7]
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX10-LABEL: v_mul_i128:
|
|
; GFX10: ; %bb.0:
|
|
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX10-NEXT: v_mov_b32_e32 v8, v0
|
|
; GFX10-NEXT: v_mov_b32_e32 v9, v1
|
|
; GFX10-NEXT: v_mov_b32_e32 v10, v2
|
|
; GFX10-NEXT: v_mov_b32_e32 v11, v3
|
|
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v8, v6, 0
|
|
; GFX10-NEXT: v_mul_lo_u32 v7, v8, v7
|
|
; GFX10-NEXT: v_mul_lo_u32 v6, v9, v6
|
|
; GFX10-NEXT: v_mad_u64_u32 v[12:13], s4, v9, v5, v[0:1]
|
|
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v8, v4, 0
|
|
; GFX10-NEXT: v_mad_u64_u32 v[2:3], s4, v10, v4, v[12:13]
|
|
; GFX10-NEXT: v_mad_u64_u32 v[12:13], vcc_lo, v8, v5, v[1:2]
|
|
; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v9, v4, v[12:13]
|
|
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s4, v3, v7, s4
|
|
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v6, vcc_lo
|
|
; GFX10-NEXT: v_mad_u64_u32 v[5:6], s4, v10, v5, v[3:4]
|
|
; GFX10-NEXT: v_mad_u64_u32 v[3:4], s4, v11, v4, v[5:6]
|
|
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX11-LABEL: v_mul_i128:
|
|
; GFX11: ; %bb.0:
|
|
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX11-NEXT: v_dual_mov_b32 v8, v0 :: v_dual_mov_b32 v9, v1
|
|
; GFX11-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v4
|
|
; GFX11-NEXT: v_mov_b32_e32 v12, v3
|
|
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v8, v6, 0
|
|
; GFX11-NEXT: v_mul_lo_u32 v4, v9, v6
|
|
; GFX11-NEXT: v_mul_lo_u32 v6, v8, v7
|
|
; GFX11-NEXT: v_mad_u64_u32 v[13:14], null, v9, v5, v[0:1]
|
|
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v8, v11, 0
|
|
; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v10, v11, v[13:14]
|
|
; GFX11-NEXT: v_mad_u64_u32 v[13:14], vcc_lo, v8, v5, v[1:2]
|
|
; GFX11-NEXT: v_mad_u64_u32 v[1:2], s0, v9, v11, v[13:14]
|
|
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v6, s0
|
|
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v4, vcc_lo
|
|
; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v10, v5, v[3:4]
|
|
; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v12, v11, v[6:7]
|
|
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX12-LABEL: v_mul_i128:
|
|
; GFX12: ; %bb.0:
|
|
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX12-NEXT: s_wait_expcnt 0x0
|
|
; GFX12-NEXT: s_wait_samplecnt 0x0
|
|
; GFX12-NEXT: s_wait_bvhcnt 0x0
|
|
; GFX12-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-NEXT: v_dual_mov_b32 v8, v0 :: v_dual_mov_b32 v9, v1
|
|
; GFX12-NEXT: v_dual_mov_b32 v10, v2 :: v_dual_mov_b32 v11, v3
|
|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
|
|
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v8, v6, 0
|
|
; GFX12-NEXT: v_mul_lo_u32 v7, v8, v7
|
|
; GFX12-NEXT: v_mul_lo_u32 v6, v9, v6
|
|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
|
; GFX12-NEXT: v_mad_co_u64_u32 v[12:13], null, v9, v5, v[0:1]
|
|
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v8, v4, 0
|
|
; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, v10, v4, v[12:13]
|
|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
; GFX12-NEXT: v_mad_co_u64_u32 v[12:13], vcc_lo, v8, v5, v[1:2]
|
|
; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s0, v9, v4, v[12:13]
|
|
; GFX12-NEXT: s_wait_alu 0xf1ff
|
|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
|
; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v7, s0
|
|
; GFX12-NEXT: s_wait_alu 0xfffd
|
|
; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, v3, v6, vcc_lo
|
|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], null, v10, v5, v[3:4]
|
|
; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], null, v11, v4, v[5:6]
|
|
; GFX12-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX1250-LABEL: v_mul_i128:
|
|
; GFX1250: ; %bb.0:
|
|
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-NEXT: v_dual_mov_b32 v8, v0 :: v_dual_mov_b32 v9, v1
|
|
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v8, v6, 0
|
|
; GFX1250-NEXT: v_mad_nc_u64_u32 v[10:11], v9, v5, v[0:1]
|
|
; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v8, v4, 0
|
|
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
|
; GFX1250-NEXT: v_mad_nc_u64_u32 v[12:13], v2, v4, v[10:11]
|
|
; GFX1250-NEXT: v_mov_b32_e32 v10, v1
|
|
; GFX1250-NEXT: v_mul_lo_u32 v1, v9, v6
|
|
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
; GFX1250-NEXT: v_mov_b32_e32 v11, v12
|
|
; GFX1250-NEXT: v_mad_co_u64_u32 v[14:15], vcc_lo, v8, v5, v[10:11]
|
|
; GFX1250-NEXT: v_mul_lo_u32 v8, v8, v7
|
|
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
; GFX1250-NEXT: v_mad_co_u64_u32 v[6:7], s0, v9, v4, v[14:15]
|
|
; GFX1250-NEXT: v_add_co_ci_u32_e64 v8, null, v13, v8, s0
|
|
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v8, v1, vcc_lo
|
|
; GFX1250-NEXT: v_mad_u32 v1, v2, v5, v1
|
|
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
|
|
; GFX1250-NEXT: v_mov_b32_e32 v2, v7
|
|
; GFX1250-NEXT: v_mad_u32 v3, v3, v4, v1
|
|
; GFX1250-NEXT: v_mov_b32_e32 v1, v6
|
|
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
|
|
%result = mul i128 %num, %den
|
|
ret i128 %result
|
|
}
|
|
|
|
define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
|
|
; GFX7-LABEL: s_mul_i256:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_mov_b32 s16, s0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s8
|
|
; GFX7-NEXT: v_mul_hi_u32 v0, s16, v0
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s9
|
|
; GFX7-NEXT: v_mul_hi_u32 v2, s1, v1
|
|
; GFX7-NEXT: v_mul_hi_u32 v1, s16, v1
|
|
; GFX7-NEXT: v_readfirstlane_b32 s17, v0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s10
|
|
; GFX7-NEXT: v_mul_hi_u32 v0, s16, v0
|
|
; GFX7-NEXT: v_readfirstlane_b32 s21, v2
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX7-NEXT: v_mul_hi_u32 v3, v2, s8
|
|
; GFX7-NEXT: s_mul_i32 s18, s16, s10
|
|
; GFX7-NEXT: s_mul_i32 s20, s1, s9
|
|
; GFX7-NEXT: v_readfirstlane_b32 s19, v0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s1
|
|
; GFX7-NEXT: s_add_u32 s18, s20, s18
|
|
; GFX7-NEXT: s_addc_u32 s19, s21, s19
|
|
; GFX7-NEXT: s_mul_i32 s21, s2, s8
|
|
; GFX7-NEXT: v_readfirstlane_b32 s23, v1
|
|
; GFX7-NEXT: v_mul_hi_u32 v1, v0, s8
|
|
; GFX7-NEXT: s_cselect_b32 s20, 1, 0
|
|
; GFX7-NEXT: v_readfirstlane_b32 s22, v3
|
|
; GFX7-NEXT: s_add_u32 s18, s21, s18
|
|
; GFX7-NEXT: s_addc_u32 s19, s22, s19
|
|
; GFX7-NEXT: s_mul_i32 s22, s16, s9
|
|
; GFX7-NEXT: s_cselect_b32 s21, 1, 0
|
|
; GFX7-NEXT: s_add_u32 s17, s22, s17
|
|
; GFX7-NEXT: s_addc_u32 s22, s23, s18
|
|
; GFX7-NEXT: v_readfirstlane_b32 s23, v1
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s12
|
|
; GFX7-NEXT: v_mul_hi_u32 v3, s16, v1
|
|
; GFX7-NEXT: s_mul_i32 s18, s1, s8
|
|
; GFX7-NEXT: s_cselect_b32 s25, 1, 0
|
|
; GFX7-NEXT: s_add_u32 s18, s18, s17
|
|
; GFX7-NEXT: s_addc_u32 s17, s23, s22
|
|
; GFX7-NEXT: v_mov_b32_e32 v4, s11
|
|
; GFX7-NEXT: v_readfirstlane_b32 s23, v3
|
|
; GFX7-NEXT: v_mul_hi_u32 v3, v2, s10
|
|
; GFX7-NEXT: v_mul_hi_u32 v5, s1, v4
|
|
; GFX7-NEXT: s_mul_i32 s22, s16, s12
|
|
; GFX7-NEXT: s_mul_i32 s24, s1, s11
|
|
; GFX7-NEXT: v_readfirstlane_b32 s28, v3
|
|
; GFX7-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX7-NEXT: v_readfirstlane_b32 s27, v5
|
|
; GFX7-NEXT: v_mul_hi_u32 v5, v3, s9
|
|
; GFX7-NEXT: s_cselect_b32 s26, 1, 0
|
|
; GFX7-NEXT: s_add_u32 s24, s24, s22
|
|
; GFX7-NEXT: s_addc_u32 s23, s27, s23
|
|
; GFX7-NEXT: v_readfirstlane_b32 s29, v5
|
|
; GFX7-NEXT: v_mov_b32_e32 v5, s4
|
|
; GFX7-NEXT: v_mul_hi_u32 v6, v5, s8
|
|
; GFX7-NEXT: s_mul_i32 s27, s2, s10
|
|
; GFX7-NEXT: s_cselect_b32 s22, 1, 0
|
|
; GFX7-NEXT: s_add_u32 s24, s27, s24
|
|
; GFX7-NEXT: v_mul_hi_u32 v0, v0, s10
|
|
; GFX7-NEXT: s_addc_u32 s27, s28, s23
|
|
; GFX7-NEXT: s_mul_i32 s28, s3, s9
|
|
; GFX7-NEXT: s_cselect_b32 s23, 1, 0
|
|
; GFX7-NEXT: s_add_u32 s28, s28, s24
|
|
; GFX7-NEXT: v_readfirstlane_b32 s30, v6
|
|
; GFX7-NEXT: v_mul_hi_u32 v6, s16, v4
|
|
; GFX7-NEXT: s_addc_u32 s27, s29, s27
|
|
; GFX7-NEXT: s_mul_i32 s29, s4, s8
|
|
; GFX7-NEXT: s_cselect_b32 s24, 1, 0
|
|
; GFX7-NEXT: s_add_u32 s28, s29, s28
|
|
; GFX7-NEXT: v_readfirstlane_b32 s33, v0
|
|
; GFX7-NEXT: v_mul_hi_u32 v0, v2, s9
|
|
; GFX7-NEXT: s_addc_u32 s27, s30, s27
|
|
; GFX7-NEXT: s_mul_i32 s30, s16, s11
|
|
; GFX7-NEXT: s_cselect_b32 s29, 1, 0
|
|
; GFX7-NEXT: v_readfirstlane_b32 s31, v6
|
|
; GFX7-NEXT: s_add_u32 s19, s30, s19
|
|
; GFX7-NEXT: s_addc_u32 s28, s31, s28
|
|
; GFX7-NEXT: s_mul_i32 s31, s1, s10
|
|
; GFX7-NEXT: s_cselect_b32 s30, 1, 0
|
|
; GFX7-NEXT: s_add_u32 s19, s31, s19
|
|
; GFX7-NEXT: v_readfirstlane_b32 s34, v0
|
|
; GFX7-NEXT: v_mul_hi_u32 v0, v3, s8
|
|
; GFX7-NEXT: s_addc_u32 s28, s33, s28
|
|
; GFX7-NEXT: s_mul_i32 s33, s2, s9
|
|
; GFX7-NEXT: s_cselect_b32 s31, 1, 0
|
|
; GFX7-NEXT: s_add_u32 s19, s33, s19
|
|
; GFX7-NEXT: s_addc_u32 s28, s34, s28
|
|
; GFX7-NEXT: s_mul_i32 s34, s3, s8
|
|
; GFX7-NEXT: s_cselect_b32 s33, 1, 0
|
|
; GFX7-NEXT: v_readfirstlane_b32 s35, v0
|
|
; GFX7-NEXT: s_add_u32 s19, s34, s19
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s14
|
|
; GFX7-NEXT: s_addc_u32 s28, s35, s28
|
|
; GFX7-NEXT: v_mul_hi_u32 v0, s16, v0
|
|
; GFX7-NEXT: s_cselect_b32 s34, 1, 0
|
|
; GFX7-NEXT: s_cmp_lg_u32 s26, 0
|
|
; GFX7-NEXT: s_addc_u32 s19, s25, s19
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, s13
|
|
; GFX7-NEXT: s_cselect_b32 s25, 1, 0
|
|
; GFX7-NEXT: s_cmp_lg_u32 s21, 0
|
|
; GFX7-NEXT: v_mul_hi_u32 v6, s1, v2
|
|
; GFX7-NEXT: s_addc_u32 s20, s20, 0
|
|
; GFX7-NEXT: v_readfirstlane_b32 s26, v0
|
|
; GFX7-NEXT: v_mul_hi_u32 v0, s2, v1
|
|
; GFX7-NEXT: s_cmp_lg_u32 s25, 0
|
|
; GFX7-NEXT: s_addc_u32 s20, s20, s28
|
|
; GFX7-NEXT: s_mul_i32 s25, s16, s14
|
|
; GFX7-NEXT: s_mul_i32 s28, s1, s13
|
|
; GFX7-NEXT: s_cselect_b32 s21, 1, 0
|
|
; GFX7-NEXT: v_readfirstlane_b32 s35, v6
|
|
; GFX7-NEXT: s_add_u32 s25, s28, s25
|
|
; GFX7-NEXT: s_addc_u32 s26, s35, s26
|
|
; GFX7-NEXT: v_readfirstlane_b32 s35, v0
|
|
; GFX7-NEXT: v_mul_hi_u32 v0, v3, s11
|
|
; GFX7-NEXT: s_mul_i32 s28, s2, s12
|
|
; GFX7-NEXT: s_add_u32 s25, s28, s25
|
|
; GFX7-NEXT: s_addc_u32 s26, s35, s26
|
|
; GFX7-NEXT: v_readfirstlane_b32 s35, v0
|
|
; GFX7-NEXT: v_mul_hi_u32 v0, v5, s10
|
|
; GFX7-NEXT: s_mul_i32 s28, s3, s11
|
|
; GFX7-NEXT: s_add_u32 s25, s28, s25
|
|
; GFX7-NEXT: s_addc_u32 s26, s35, s26
|
|
; GFX7-NEXT: v_readfirstlane_b32 s35, v0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s5
|
|
; GFX7-NEXT: v_mul_hi_u32 v6, v0, s9
|
|
; GFX7-NEXT: s_mul_i32 s28, s4, s10
|
|
; GFX7-NEXT: s_add_u32 s25, s28, s25
|
|
; GFX7-NEXT: v_mul_hi_u32 v1, s1, v1
|
|
; GFX7-NEXT: s_addc_u32 s26, s35, s26
|
|
; GFX7-NEXT: v_readfirstlane_b32 s35, v6
|
|
; GFX7-NEXT: v_mov_b32_e32 v6, s6
|
|
; GFX7-NEXT: v_mul_hi_u32 v6, v6, s8
|
|
; GFX7-NEXT: s_mul_i32 s28, s5, s9
|
|
; GFX7-NEXT: s_add_u32 s25, s28, s25
|
|
; GFX7-NEXT: v_mul_hi_u32 v2, s16, v2
|
|
; GFX7-NEXT: v_readfirstlane_b32 s36, v1
|
|
; GFX7-NEXT: v_mul_hi_u32 v1, s2, v4
|
|
; GFX7-NEXT: s_addc_u32 s26, s35, s26
|
|
; GFX7-NEXT: s_mul_i32 s28, s6, s8
|
|
; GFX7-NEXT: v_readfirstlane_b32 s35, v6
|
|
; GFX7-NEXT: s_add_u32 s25, s28, s25
|
|
; GFX7-NEXT: s_addc_u32 s26, s35, s26
|
|
; GFX7-NEXT: s_mul_i32 s28, s16, s13
|
|
; GFX7-NEXT: v_readfirstlane_b32 s35, v2
|
|
; GFX7-NEXT: s_add_u32 s27, s28, s27
|
|
; GFX7-NEXT: v_readfirstlane_b32 s37, v1
|
|
; GFX7-NEXT: v_mul_hi_u32 v1, v3, s10
|
|
; GFX7-NEXT: s_addc_u32 s25, s35, s25
|
|
; GFX7-NEXT: s_mul_i32 s35, s1, s12
|
|
; GFX7-NEXT: s_cselect_b32 s28, 1, 0
|
|
; GFX7-NEXT: s_add_u32 s27, s35, s27
|
|
; GFX7-NEXT: s_addc_u32 s25, s36, s25
|
|
; GFX7-NEXT: s_mul_i32 s36, s2, s11
|
|
; GFX7-NEXT: s_cselect_b32 s35, 1, 0
|
|
; GFX7-NEXT: s_add_u32 s27, s36, s27
|
|
; GFX7-NEXT: v_readfirstlane_b32 s38, v1
|
|
; GFX7-NEXT: v_mul_hi_u32 v1, v5, s9
|
|
; GFX7-NEXT: s_addc_u32 s25, s37, s25
|
|
; GFX7-NEXT: s_mul_i32 s37, s3, s10
|
|
; GFX7-NEXT: s_cselect_b32 s36, 1, 0
|
|
; GFX7-NEXT: s_add_u32 s27, s37, s27
|
|
; GFX7-NEXT: v_mul_hi_u32 v0, v0, s8
|
|
; GFX7-NEXT: s_addc_u32 s25, s38, s25
|
|
; GFX7-NEXT: s_mul_i32 s38, s4, s9
|
|
; GFX7-NEXT: s_cselect_b32 s37, 1, 0
|
|
; GFX7-NEXT: v_readfirstlane_b32 s39, v1
|
|
; GFX7-NEXT: s_add_u32 s27, s38, s27
|
|
; GFX7-NEXT: s_addc_u32 s25, s39, s25
|
|
; GFX7-NEXT: s_mul_i32 s39, s5, s8
|
|
; GFX7-NEXT: s_cselect_b32 s38, 1, 0
|
|
; GFX7-NEXT: v_readfirstlane_b32 s40, v0
|
|
; GFX7-NEXT: s_add_u32 s27, s39, s27
|
|
; GFX7-NEXT: s_addc_u32 s25, s40, s25
|
|
; GFX7-NEXT: s_cselect_b32 s39, 1, 0
|
|
; GFX7-NEXT: s_cmp_lg_u32 s31, 0
|
|
; GFX7-NEXT: s_addc_u32 s30, s30, 0
|
|
; GFX7-NEXT: s_cmp_lg_u32 s33, 0
|
|
; GFX7-NEXT: s_addc_u32 s30, s30, 0
|
|
; GFX7-NEXT: s_cmp_lg_u32 s34, 0
|
|
; GFX7-NEXT: s_addc_u32 s30, s30, 0
|
|
; GFX7-NEXT: s_cmp_lg_u32 s21, 0
|
|
; GFX7-NEXT: s_addc_u32 s21, s30, s27
|
|
; GFX7-NEXT: s_cselect_b32 s27, 1, 0
|
|
; GFX7-NEXT: s_cmp_lg_u32 s23, 0
|
|
; GFX7-NEXT: s_addc_u32 s22, s22, 0
|
|
; GFX7-NEXT: s_cmp_lg_u32 s24, 0
|
|
; GFX7-NEXT: s_addc_u32 s22, s22, 0
|
|
; GFX7-NEXT: s_cmp_lg_u32 s29, 0
|
|
; GFX7-NEXT: s_addc_u32 s22, s22, 0
|
|
; GFX7-NEXT: s_cmp_lg_u32 s27, 0
|
|
; GFX7-NEXT: s_addc_u32 s22, s22, s25
|
|
; GFX7-NEXT: s_mul_i32 s16, s16, s15
|
|
; GFX7-NEXT: s_addc_u32 s15, s26, s16
|
|
; GFX7-NEXT: s_mul_i32 s1, s1, s14
|
|
; GFX7-NEXT: s_cmp_lg_u32 s39, 0
|
|
; GFX7-NEXT: s_addc_u32 s1, s15, s1
|
|
; GFX7-NEXT: s_mul_i32 s2, s2, s13
|
|
; GFX7-NEXT: s_cmp_lg_u32 s38, 0
|
|
; GFX7-NEXT: s_addc_u32 s1, s1, s2
|
|
; GFX7-NEXT: s_mul_i32 s3, s3, s12
|
|
; GFX7-NEXT: s_cmp_lg_u32 s37, 0
|
|
; GFX7-NEXT: s_addc_u32 s1, s1, s3
|
|
; GFX7-NEXT: s_mul_i32 s4, s4, s11
|
|
; GFX7-NEXT: s_cmp_lg_u32 s36, 0
|
|
; GFX7-NEXT: s_addc_u32 s1, s1, s4
|
|
; GFX7-NEXT: s_mul_i32 s5, s5, s10
|
|
; GFX7-NEXT: s_cmp_lg_u32 s35, 0
|
|
; GFX7-NEXT: s_addc_u32 s1, s1, s5
|
|
; GFX7-NEXT: s_mul_i32 s6, s6, s9
|
|
; GFX7-NEXT: s_cmp_lg_u32 s28, 0
|
|
; GFX7-NEXT: s_addc_u32 s1, s1, s6
|
|
; GFX7-NEXT: s_mul_i32 s7, s7, s8
|
|
; GFX7-NEXT: s_mul_i32 s0, s0, s8
|
|
; GFX7-NEXT: s_add_u32 s7, s7, s1
|
|
; GFX7-NEXT: s_mov_b32 s1, s18
|
|
; GFX7-NEXT: s_mov_b32 s2, s17
|
|
; GFX7-NEXT: s_mov_b32 s3, s19
|
|
; GFX7-NEXT: s_mov_b32 s4, s20
|
|
; GFX7-NEXT: s_mov_b32 s5, s21
|
|
; GFX7-NEXT: s_mov_b32 s6, s22
|
|
; GFX7-NEXT: ; return to shader part epilog
|
|
;
|
|
; GFX8-LABEL: s_mul_i256:
|
|
; GFX8: ; %bb.0:
|
|
; GFX8-NEXT: s_mov_b32 s16, s0
|
|
; GFX8-NEXT: v_mov_b32_e32 v0, s8
|
|
; GFX8-NEXT: v_mul_hi_u32 v0, s16, v0
|
|
; GFX8-NEXT: v_mov_b32_e32 v1, s9
|
|
; GFX8-NEXT: v_mul_hi_u32 v2, s1, v1
|
|
; GFX8-NEXT: v_mul_hi_u32 v1, s16, v1
|
|
; GFX8-NEXT: v_readfirstlane_b32 s17, v0
|
|
; GFX8-NEXT: v_mov_b32_e32 v0, s10
|
|
; GFX8-NEXT: v_mul_hi_u32 v0, s16, v0
|
|
; GFX8-NEXT: v_readfirstlane_b32 s21, v2
|
|
; GFX8-NEXT: v_mov_b32_e32 v2, s2
|
|
; GFX8-NEXT: v_mul_hi_u32 v3, v2, s8
|
|
; GFX8-NEXT: s_mul_i32 s18, s16, s10
|
|
; GFX8-NEXT: s_mul_i32 s20, s1, s9
|
|
; GFX8-NEXT: v_readfirstlane_b32 s19, v0
|
|
; GFX8-NEXT: v_mov_b32_e32 v0, s1
|
|
; GFX8-NEXT: s_add_u32 s18, s20, s18
|
|
; GFX8-NEXT: s_addc_u32 s19, s21, s19
|
|
; GFX8-NEXT: s_mul_i32 s21, s2, s8
|
|
; GFX8-NEXT: v_readfirstlane_b32 s23, v1
|
|
; GFX8-NEXT: v_mul_hi_u32 v1, v0, s8
|
|
; GFX8-NEXT: s_cselect_b32 s20, 1, 0
|
|
; GFX8-NEXT: v_readfirstlane_b32 s22, v3
|
|
; GFX8-NEXT: s_add_u32 s18, s21, s18
|
|
; GFX8-NEXT: s_addc_u32 s19, s22, s19
|
|
; GFX8-NEXT: s_mul_i32 s22, s16, s9
|
|
; GFX8-NEXT: s_cselect_b32 s21, 1, 0
|
|
; GFX8-NEXT: s_add_u32 s17, s22, s17
|
|
; GFX8-NEXT: s_addc_u32 s22, s23, s18
|
|
; GFX8-NEXT: v_readfirstlane_b32 s23, v1
|
|
; GFX8-NEXT: v_mov_b32_e32 v1, s12
|
|
; GFX8-NEXT: v_mul_hi_u32 v3, s16, v1
|
|
; GFX8-NEXT: s_mul_i32 s18, s1, s8
|
|
; GFX8-NEXT: s_cselect_b32 s25, 1, 0
|
|
; GFX8-NEXT: s_add_u32 s18, s18, s17
|
|
; GFX8-NEXT: s_addc_u32 s17, s23, s22
|
|
; GFX8-NEXT: v_mov_b32_e32 v4, s11
|
|
; GFX8-NEXT: v_readfirstlane_b32 s23, v3
|
|
; GFX8-NEXT: v_mul_hi_u32 v3, v2, s10
|
|
; GFX8-NEXT: v_mul_hi_u32 v5, s1, v4
|
|
; GFX8-NEXT: s_mul_i32 s22, s16, s12
|
|
; GFX8-NEXT: s_mul_i32 s24, s1, s11
|
|
; GFX8-NEXT: v_readfirstlane_b32 s28, v3
|
|
; GFX8-NEXT: v_mov_b32_e32 v3, s3
|
|
; GFX8-NEXT: v_readfirstlane_b32 s27, v5
|
|
; GFX8-NEXT: v_mul_hi_u32 v5, v3, s9
|
|
; GFX8-NEXT: s_cselect_b32 s26, 1, 0
|
|
; GFX8-NEXT: s_add_u32 s24, s24, s22
|
|
; GFX8-NEXT: s_addc_u32 s23, s27, s23
|
|
; GFX8-NEXT: v_readfirstlane_b32 s29, v5
|
|
; GFX8-NEXT: v_mov_b32_e32 v5, s4
|
|
; GFX8-NEXT: v_mul_hi_u32 v6, v5, s8
|
|
; GFX8-NEXT: s_mul_i32 s27, s2, s10
|
|
; GFX8-NEXT: s_cselect_b32 s22, 1, 0
|
|
; GFX8-NEXT: s_add_u32 s24, s27, s24
|
|
; GFX8-NEXT: v_mul_hi_u32 v0, v0, s10
|
|
; GFX8-NEXT: s_addc_u32 s27, s28, s23
|
|
; GFX8-NEXT: s_mul_i32 s28, s3, s9
|
|
; GFX8-NEXT: s_cselect_b32 s23, 1, 0
|
|
; GFX8-NEXT: s_add_u32 s28, s28, s24
|
|
; GFX8-NEXT: v_readfirstlane_b32 s30, v6
|
|
; GFX8-NEXT: v_mul_hi_u32 v6, s16, v4
|
|
; GFX8-NEXT: s_addc_u32 s27, s29, s27
|
|
; GFX8-NEXT: s_mul_i32 s29, s4, s8
|
|
; GFX8-NEXT: s_cselect_b32 s24, 1, 0
|
|
; GFX8-NEXT: s_add_u32 s28, s29, s28
|
|
; GFX8-NEXT: v_readfirstlane_b32 s33, v0
|
|
; GFX8-NEXT: v_mul_hi_u32 v0, v2, s9
|
|
; GFX8-NEXT: s_addc_u32 s27, s30, s27
|
|
; GFX8-NEXT: s_mul_i32 s30, s16, s11
|
|
; GFX8-NEXT: s_cselect_b32 s29, 1, 0
|
|
; GFX8-NEXT: v_readfirstlane_b32 s31, v6
|
|
; GFX8-NEXT: s_add_u32 s19, s30, s19
|
|
; GFX8-NEXT: s_addc_u32 s28, s31, s28
|
|
; GFX8-NEXT: s_mul_i32 s31, s1, s10
|
|
; GFX8-NEXT: s_cselect_b32 s30, 1, 0
|
|
; GFX8-NEXT: s_add_u32 s19, s31, s19
|
|
; GFX8-NEXT: v_readfirstlane_b32 s34, v0
|
|
; GFX8-NEXT: v_mul_hi_u32 v0, v3, s8
|
|
; GFX8-NEXT: s_addc_u32 s28, s33, s28
|
|
; GFX8-NEXT: s_mul_i32 s33, s2, s9
|
|
; GFX8-NEXT: s_cselect_b32 s31, 1, 0
|
|
; GFX8-NEXT: s_add_u32 s19, s33, s19
|
|
; GFX8-NEXT: s_addc_u32 s28, s34, s28
|
|
; GFX8-NEXT: s_mul_i32 s34, s3, s8
|
|
; GFX8-NEXT: s_cselect_b32 s33, 1, 0
|
|
; GFX8-NEXT: v_readfirstlane_b32 s35, v0
|
|
; GFX8-NEXT: s_add_u32 s19, s34, s19
|
|
; GFX8-NEXT: v_mov_b32_e32 v0, s14
|
|
; GFX8-NEXT: s_addc_u32 s28, s35, s28
|
|
; GFX8-NEXT: v_mul_hi_u32 v0, s16, v0
|
|
; GFX8-NEXT: s_cselect_b32 s34, 1, 0
|
|
; GFX8-NEXT: s_cmp_lg_u32 s26, 0
|
|
; GFX8-NEXT: s_addc_u32 s19, s25, s19
|
|
; GFX8-NEXT: v_mov_b32_e32 v2, s13
|
|
; GFX8-NEXT: s_cselect_b32 s25, 1, 0
|
|
; GFX8-NEXT: s_cmp_lg_u32 s21, 0
|
|
; GFX8-NEXT: v_mul_hi_u32 v6, s1, v2
|
|
; GFX8-NEXT: s_addc_u32 s20, s20, 0
|
|
; GFX8-NEXT: v_readfirstlane_b32 s26, v0
|
|
; GFX8-NEXT: v_mul_hi_u32 v0, s2, v1
|
|
; GFX8-NEXT: s_cmp_lg_u32 s25, 0
|
|
; GFX8-NEXT: s_addc_u32 s20, s20, s28
|
|
; GFX8-NEXT: s_mul_i32 s25, s16, s14
|
|
; GFX8-NEXT: s_mul_i32 s28, s1, s13
|
|
; GFX8-NEXT: s_cselect_b32 s21, 1, 0
|
|
; GFX8-NEXT: v_readfirstlane_b32 s35, v6
|
|
; GFX8-NEXT: s_add_u32 s25, s28, s25
|
|
; GFX8-NEXT: s_addc_u32 s26, s35, s26
|
|
; GFX8-NEXT: v_readfirstlane_b32 s35, v0
|
|
; GFX8-NEXT: v_mul_hi_u32 v0, v3, s11
|
|
; GFX8-NEXT: s_mul_i32 s28, s2, s12
|
|
; GFX8-NEXT: s_add_u32 s25, s28, s25
|
|
; GFX8-NEXT: s_addc_u32 s26, s35, s26
|
|
; GFX8-NEXT: v_readfirstlane_b32 s35, v0
|
|
; GFX8-NEXT: v_mul_hi_u32 v0, v5, s10
|
|
; GFX8-NEXT: s_mul_i32 s28, s3, s11
|
|
; GFX8-NEXT: s_add_u32 s25, s28, s25
|
|
; GFX8-NEXT: s_addc_u32 s26, s35, s26
|
|
; GFX8-NEXT: v_readfirstlane_b32 s35, v0
|
|
; GFX8-NEXT: v_mov_b32_e32 v0, s5
|
|
; GFX8-NEXT: v_mul_hi_u32 v6, v0, s9
|
|
; GFX8-NEXT: s_mul_i32 s28, s4, s10
|
|
; GFX8-NEXT: s_add_u32 s25, s28, s25
|
|
; GFX8-NEXT: v_mul_hi_u32 v1, s1, v1
|
|
; GFX8-NEXT: s_addc_u32 s26, s35, s26
|
|
; GFX8-NEXT: v_readfirstlane_b32 s35, v6
|
|
; GFX8-NEXT: v_mov_b32_e32 v6, s6
|
|
; GFX8-NEXT: v_mul_hi_u32 v6, v6, s8
|
|
; GFX8-NEXT: s_mul_i32 s28, s5, s9
|
|
; GFX8-NEXT: s_add_u32 s25, s28, s25
|
|
; GFX8-NEXT: v_mul_hi_u32 v2, s16, v2
|
|
; GFX8-NEXT: v_readfirstlane_b32 s36, v1
|
|
; GFX8-NEXT: v_mul_hi_u32 v1, s2, v4
|
|
; GFX8-NEXT: s_addc_u32 s26, s35, s26
|
|
; GFX8-NEXT: s_mul_i32 s28, s6, s8
|
|
; GFX8-NEXT: v_readfirstlane_b32 s35, v6
|
|
; GFX8-NEXT: s_add_u32 s25, s28, s25
|
|
; GFX8-NEXT: s_addc_u32 s26, s35, s26
|
|
; GFX8-NEXT: s_mul_i32 s28, s16, s13
|
|
; GFX8-NEXT: v_readfirstlane_b32 s35, v2
|
|
; GFX8-NEXT: s_add_u32 s27, s28, s27
|
|
; GFX8-NEXT: v_readfirstlane_b32 s37, v1
|
|
; GFX8-NEXT: v_mul_hi_u32 v1, v3, s10
|
|
; GFX8-NEXT: s_addc_u32 s25, s35, s25
|
|
; GFX8-NEXT: s_mul_i32 s35, s1, s12
|
|
; GFX8-NEXT: s_cselect_b32 s28, 1, 0
|
|
; GFX8-NEXT: s_add_u32 s27, s35, s27
|
|
; GFX8-NEXT: s_addc_u32 s25, s36, s25
|
|
; GFX8-NEXT: s_mul_i32 s36, s2, s11
|
|
; GFX8-NEXT: s_cselect_b32 s35, 1, 0
|
|
; GFX8-NEXT: s_add_u32 s27, s36, s27
|
|
; GFX8-NEXT: v_readfirstlane_b32 s38, v1
|
|
; GFX8-NEXT: v_mul_hi_u32 v1, v5, s9
|
|
; GFX8-NEXT: s_addc_u32 s25, s37, s25
|
|
; GFX8-NEXT: s_mul_i32 s37, s3, s10
|
|
; GFX8-NEXT: s_cselect_b32 s36, 1, 0
|
|
; GFX8-NEXT: s_add_u32 s27, s37, s27
|
|
; GFX8-NEXT: v_mul_hi_u32 v0, v0, s8
|
|
; GFX8-NEXT: s_addc_u32 s25, s38, s25
|
|
; GFX8-NEXT: s_mul_i32 s38, s4, s9
|
|
; GFX8-NEXT: s_cselect_b32 s37, 1, 0
|
|
; GFX8-NEXT: v_readfirstlane_b32 s39, v1
|
|
; GFX8-NEXT: s_add_u32 s27, s38, s27
|
|
; GFX8-NEXT: s_addc_u32 s25, s39, s25
|
|
; GFX8-NEXT: s_mul_i32 s39, s5, s8
|
|
; GFX8-NEXT: s_cselect_b32 s38, 1, 0
|
|
; GFX8-NEXT: v_readfirstlane_b32 s40, v0
|
|
; GFX8-NEXT: s_add_u32 s27, s39, s27
|
|
; GFX8-NEXT: s_addc_u32 s25, s40, s25
|
|
; GFX8-NEXT: s_cselect_b32 s39, 1, 0
|
|
; GFX8-NEXT: s_cmp_lg_u32 s31, 0
|
|
; GFX8-NEXT: s_addc_u32 s30, s30, 0
|
|
; GFX8-NEXT: s_cmp_lg_u32 s33, 0
|
|
; GFX8-NEXT: s_addc_u32 s30, s30, 0
|
|
; GFX8-NEXT: s_cmp_lg_u32 s34, 0
|
|
; GFX8-NEXT: s_addc_u32 s30, s30, 0
|
|
; GFX8-NEXT: s_cmp_lg_u32 s21, 0
|
|
; GFX8-NEXT: s_addc_u32 s21, s30, s27
|
|
; GFX8-NEXT: s_cselect_b32 s27, 1, 0
|
|
; GFX8-NEXT: s_cmp_lg_u32 s23, 0
|
|
; GFX8-NEXT: s_addc_u32 s22, s22, 0
|
|
; GFX8-NEXT: s_cmp_lg_u32 s24, 0
|
|
; GFX8-NEXT: s_addc_u32 s22, s22, 0
|
|
; GFX8-NEXT: s_cmp_lg_u32 s29, 0
|
|
; GFX8-NEXT: s_addc_u32 s22, s22, 0
|
|
; GFX8-NEXT: s_cmp_lg_u32 s27, 0
|
|
; GFX8-NEXT: s_addc_u32 s22, s22, s25
|
|
; GFX8-NEXT: s_mul_i32 s16, s16, s15
|
|
; GFX8-NEXT: s_addc_u32 s15, s26, s16
|
|
; GFX8-NEXT: s_mul_i32 s1, s1, s14
|
|
; GFX8-NEXT: s_cmp_lg_u32 s39, 0
|
|
; GFX8-NEXT: s_addc_u32 s1, s15, s1
|
|
; GFX8-NEXT: s_mul_i32 s2, s2, s13
|
|
; GFX8-NEXT: s_cmp_lg_u32 s38, 0
|
|
; GFX8-NEXT: s_addc_u32 s1, s1, s2
|
|
; GFX8-NEXT: s_mul_i32 s3, s3, s12
|
|
; GFX8-NEXT: s_cmp_lg_u32 s37, 0
|
|
; GFX8-NEXT: s_addc_u32 s1, s1, s3
|
|
; GFX8-NEXT: s_mul_i32 s4, s4, s11
|
|
; GFX8-NEXT: s_cmp_lg_u32 s36, 0
|
|
; GFX8-NEXT: s_addc_u32 s1, s1, s4
|
|
; GFX8-NEXT: s_mul_i32 s5, s5, s10
|
|
; GFX8-NEXT: s_cmp_lg_u32 s35, 0
|
|
; GFX8-NEXT: s_addc_u32 s1, s1, s5
|
|
; GFX8-NEXT: s_mul_i32 s6, s6, s9
|
|
; GFX8-NEXT: s_cmp_lg_u32 s28, 0
|
|
; GFX8-NEXT: s_addc_u32 s1, s1, s6
|
|
; GFX8-NEXT: s_mul_i32 s7, s7, s8
|
|
; GFX8-NEXT: s_mul_i32 s0, s0, s8
|
|
; GFX8-NEXT: s_add_u32 s7, s7, s1
|
|
; GFX8-NEXT: s_mov_b32 s1, s18
|
|
; GFX8-NEXT: s_mov_b32 s2, s17
|
|
; GFX8-NEXT: s_mov_b32 s3, s19
|
|
; GFX8-NEXT: s_mov_b32 s4, s20
|
|
; GFX8-NEXT: s_mov_b32 s5, s21
|
|
; GFX8-NEXT: s_mov_b32 s6, s22
|
|
; GFX8-NEXT: ; return to shader part epilog
|
|
;
|
|
; GFX9-LABEL: s_mul_i256:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_mov_b32 s16, s0
|
|
; GFX9-NEXT: s_mul_i32 s18, s16, s10
|
|
; GFX9-NEXT: s_mul_i32 s20, s1, s9
|
|
; GFX9-NEXT: s_mul_hi_u32 s19, s16, s10
|
|
; GFX9-NEXT: s_mul_hi_u32 s21, s1, s9
|
|
; GFX9-NEXT: s_add_u32 s18, s20, s18
|
|
; GFX9-NEXT: s_addc_u32 s19, s21, s19
|
|
; GFX9-NEXT: s_mul_i32 s21, s2, s8
|
|
; GFX9-NEXT: s_cselect_b32 s20, 1, 0
|
|
; GFX9-NEXT: s_mul_hi_u32 s22, s2, s8
|
|
; GFX9-NEXT: s_add_u32 s18, s21, s18
|
|
; GFX9-NEXT: s_mul_hi_u32 s17, s16, s8
|
|
; GFX9-NEXT: s_addc_u32 s19, s22, s19
|
|
; GFX9-NEXT: s_mul_i32 s22, s16, s9
|
|
; GFX9-NEXT: s_cselect_b32 s21, 1, 0
|
|
; GFX9-NEXT: s_mul_hi_u32 s23, s16, s9
|
|
; GFX9-NEXT: s_add_u32 s17, s22, s17
|
|
; GFX9-NEXT: s_addc_u32 s18, s23, s18
|
|
; GFX9-NEXT: s_mul_i32 s23, s1, s8
|
|
; GFX9-NEXT: s_cselect_b32 s22, 1, 0
|
|
; GFX9-NEXT: s_mul_hi_u32 s24, s1, s8
|
|
; GFX9-NEXT: s_add_u32 s17, s23, s17
|
|
; GFX9-NEXT: s_addc_u32 s18, s24, s18
|
|
; GFX9-NEXT: s_mul_i32 s24, s16, s12
|
|
; GFX9-NEXT: s_mul_i32 s26, s1, s11
|
|
; GFX9-NEXT: s_cselect_b32 s23, 1, 0
|
|
; GFX9-NEXT: s_mul_hi_u32 s25, s16, s12
|
|
; GFX9-NEXT: s_mul_hi_u32 s27, s1, s11
|
|
; GFX9-NEXT: s_add_u32 s24, s26, s24
|
|
; GFX9-NEXT: s_addc_u32 s25, s27, s25
|
|
; GFX9-NEXT: s_mul_i32 s27, s2, s10
|
|
; GFX9-NEXT: s_cselect_b32 s26, 1, 0
|
|
; GFX9-NEXT: s_mul_hi_u32 s28, s2, s10
|
|
; GFX9-NEXT: s_add_u32 s24, s27, s24
|
|
; GFX9-NEXT: s_addc_u32 s25, s28, s25
|
|
; GFX9-NEXT: s_mul_i32 s28, s3, s9
|
|
; GFX9-NEXT: s_cselect_b32 s27, 1, 0
|
|
; GFX9-NEXT: s_mul_hi_u32 s29, s3, s9
|
|
; GFX9-NEXT: s_add_u32 s24, s28, s24
|
|
; GFX9-NEXT: s_addc_u32 s25, s29, s25
|
|
; GFX9-NEXT: s_mul_i32 s29, s4, s8
|
|
; GFX9-NEXT: s_cselect_b32 s28, 1, 0
|
|
; GFX9-NEXT: s_mul_hi_u32 s30, s4, s8
|
|
; GFX9-NEXT: s_add_u32 s24, s29, s24
|
|
; GFX9-NEXT: s_addc_u32 s25, s30, s25
|
|
; GFX9-NEXT: s_mul_i32 s30, s16, s11
|
|
; GFX9-NEXT: s_cselect_b32 s29, 1, 0
|
|
; GFX9-NEXT: s_mul_hi_u32 s31, s16, s11
|
|
; GFX9-NEXT: s_add_u32 s19, s30, s19
|
|
; GFX9-NEXT: s_addc_u32 s24, s31, s24
|
|
; GFX9-NEXT: s_mul_i32 s31, s1, s10
|
|
; GFX9-NEXT: s_cselect_b32 s30, 1, 0
|
|
; GFX9-NEXT: s_mul_hi_u32 s33, s1, s10
|
|
; GFX9-NEXT: s_add_u32 s19, s31, s19
|
|
; GFX9-NEXT: s_addc_u32 s24, s33, s24
|
|
; GFX9-NEXT: s_mul_i32 s33, s2, s9
|
|
; GFX9-NEXT: s_cselect_b32 s31, 1, 0
|
|
; GFX9-NEXT: s_mul_hi_u32 s34, s2, s9
|
|
; GFX9-NEXT: s_add_u32 s19, s33, s19
|
|
; GFX9-NEXT: s_addc_u32 s24, s34, s24
|
|
; GFX9-NEXT: s_mul_i32 s34, s3, s8
|
|
; GFX9-NEXT: s_cselect_b32 s33, 1, 0
|
|
; GFX9-NEXT: s_mul_hi_u32 s35, s3, s8
|
|
; GFX9-NEXT: s_add_u32 s19, s34, s19
|
|
; GFX9-NEXT: s_addc_u32 s24, s35, s24
|
|
; GFX9-NEXT: s_cselect_b32 s34, 1, 0
|
|
; GFX9-NEXT: s_cmp_lg_u32 s23, 0
|
|
; GFX9-NEXT: s_addc_u32 s19, s22, s19
|
|
; GFX9-NEXT: s_cselect_b32 s22, 1, 0
|
|
; GFX9-NEXT: s_cmp_lg_u32 s21, 0
|
|
; GFX9-NEXT: s_addc_u32 s20, s20, 0
|
|
; GFX9-NEXT: s_cmp_lg_u32 s22, 0
|
|
; GFX9-NEXT: s_addc_u32 s20, s20, s24
|
|
; GFX9-NEXT: s_mul_i32 s22, s16, s14
|
|
; GFX9-NEXT: s_mul_i32 s24, s1, s13
|
|
; GFX9-NEXT: s_cselect_b32 s21, 1, 0
|
|
; GFX9-NEXT: s_mul_hi_u32 s23, s16, s14
|
|
; GFX9-NEXT: s_mul_hi_u32 s35, s1, s13
|
|
; GFX9-NEXT: s_add_u32 s22, s24, s22
|
|
; GFX9-NEXT: s_addc_u32 s23, s35, s23
|
|
; GFX9-NEXT: s_mul_i32 s24, s2, s12
|
|
; GFX9-NEXT: s_mul_hi_u32 s35, s2, s12
|
|
; GFX9-NEXT: s_add_u32 s22, s24, s22
|
|
; GFX9-NEXT: s_addc_u32 s23, s35, s23
|
|
; GFX9-NEXT: s_mul_i32 s24, s3, s11
|
|
; GFX9-NEXT: s_mul_hi_u32 s35, s3, s11
|
|
; GFX9-NEXT: s_add_u32 s22, s24, s22
|
|
; GFX9-NEXT: s_addc_u32 s23, s35, s23
|
|
; GFX9-NEXT: s_mul_i32 s24, s4, s10
|
|
; GFX9-NEXT: s_mul_hi_u32 s35, s4, s10
|
|
; GFX9-NEXT: s_add_u32 s22, s24, s22
|
|
; GFX9-NEXT: s_addc_u32 s23, s35, s23
|
|
; GFX9-NEXT: s_mul_i32 s24, s5, s9
|
|
; GFX9-NEXT: s_mul_hi_u32 s35, s5, s9
|
|
; GFX9-NEXT: s_add_u32 s22, s24, s22
|
|
; GFX9-NEXT: s_addc_u32 s23, s35, s23
|
|
; GFX9-NEXT: s_mul_i32 s24, s6, s8
|
|
; GFX9-NEXT: s_mul_hi_u32 s35, s6, s8
|
|
; GFX9-NEXT: s_add_u32 s22, s24, s22
|
|
; GFX9-NEXT: s_addc_u32 s23, s35, s23
|
|
; GFX9-NEXT: s_mul_i32 s24, s16, s13
|
|
; GFX9-NEXT: s_mul_hi_u32 s35, s16, s13
|
|
; GFX9-NEXT: s_add_u32 s24, s24, s25
|
|
; GFX9-NEXT: s_addc_u32 s22, s35, s22
|
|
; GFX9-NEXT: s_mul_i32 s35, s1, s12
|
|
; GFX9-NEXT: s_cselect_b32 s25, 1, 0
|
|
; GFX9-NEXT: s_mul_hi_u32 s36, s1, s12
|
|
; GFX9-NEXT: s_add_u32 s24, s35, s24
|
|
; GFX9-NEXT: s_addc_u32 s22, s36, s22
|
|
; GFX9-NEXT: s_mul_i32 s36, s2, s11
|
|
; GFX9-NEXT: s_cselect_b32 s35, 1, 0
|
|
; GFX9-NEXT: s_mul_hi_u32 s37, s2, s11
|
|
; GFX9-NEXT: s_add_u32 s24, s36, s24
|
|
; GFX9-NEXT: s_addc_u32 s22, s37, s22
|
|
; GFX9-NEXT: s_mul_i32 s37, s3, s10
|
|
; GFX9-NEXT: s_cselect_b32 s36, 1, 0
|
|
; GFX9-NEXT: s_mul_hi_u32 s38, s3, s10
|
|
; GFX9-NEXT: s_add_u32 s24, s37, s24
|
|
; GFX9-NEXT: s_addc_u32 s22, s38, s22
|
|
; GFX9-NEXT: s_mul_i32 s38, s4, s9
|
|
; GFX9-NEXT: s_cselect_b32 s37, 1, 0
|
|
; GFX9-NEXT: s_mul_hi_u32 s39, s4, s9
|
|
; GFX9-NEXT: s_add_u32 s24, s38, s24
|
|
; GFX9-NEXT: s_addc_u32 s22, s39, s22
|
|
; GFX9-NEXT: s_mul_i32 s39, s5, s8
|
|
; GFX9-NEXT: s_cselect_b32 s38, 1, 0
|
|
; GFX9-NEXT: s_mul_hi_u32 s40, s5, s8
|
|
; GFX9-NEXT: s_add_u32 s24, s39, s24
|
|
; GFX9-NEXT: s_addc_u32 s22, s40, s22
|
|
; GFX9-NEXT: s_cselect_b32 s39, 1, 0
|
|
; GFX9-NEXT: s_cmp_lg_u32 s31, 0
|
|
; GFX9-NEXT: s_addc_u32 s30, s30, 0
|
|
; GFX9-NEXT: s_cmp_lg_u32 s33, 0
|
|
; GFX9-NEXT: s_addc_u32 s30, s30, 0
|
|
; GFX9-NEXT: s_cmp_lg_u32 s34, 0
|
|
; GFX9-NEXT: s_addc_u32 s30, s30, 0
|
|
; GFX9-NEXT: s_cmp_lg_u32 s21, 0
|
|
; GFX9-NEXT: s_addc_u32 s21, s30, s24
|
|
; GFX9-NEXT: s_cselect_b32 s24, 1, 0
|
|
; GFX9-NEXT: s_cmp_lg_u32 s27, 0
|
|
; GFX9-NEXT: s_addc_u32 s26, s26, 0
|
|
; GFX9-NEXT: s_cmp_lg_u32 s28, 0
|
|
; GFX9-NEXT: s_addc_u32 s26, s26, 0
|
|
; GFX9-NEXT: s_cmp_lg_u32 s29, 0
|
|
; GFX9-NEXT: s_addc_u32 s26, s26, 0
|
|
; GFX9-NEXT: s_cmp_lg_u32 s24, 0
|
|
; GFX9-NEXT: s_addc_u32 s22, s26, s22
|
|
; GFX9-NEXT: s_mul_i32 s16, s16, s15
|
|
; GFX9-NEXT: s_addc_u32 s15, s23, s16
|
|
; GFX9-NEXT: s_mul_i32 s1, s1, s14
|
|
; GFX9-NEXT: s_cmp_lg_u32 s39, 0
|
|
; GFX9-NEXT: s_addc_u32 s1, s15, s1
|
|
; GFX9-NEXT: s_mul_i32 s2, s2, s13
|
|
; GFX9-NEXT: s_cmp_lg_u32 s38, 0
|
|
; GFX9-NEXT: s_addc_u32 s1, s1, s2
|
|
; GFX9-NEXT: s_mul_i32 s3, s3, s12
|
|
; GFX9-NEXT: s_cmp_lg_u32 s37, 0
|
|
; GFX9-NEXT: s_addc_u32 s1, s1, s3
|
|
; GFX9-NEXT: s_mul_i32 s4, s4, s11
|
|
; GFX9-NEXT: s_cmp_lg_u32 s36, 0
|
|
; GFX9-NEXT: s_addc_u32 s1, s1, s4
|
|
; GFX9-NEXT: s_mul_i32 s5, s5, s10
|
|
; GFX9-NEXT: s_cmp_lg_u32 s35, 0
|
|
; GFX9-NEXT: s_addc_u32 s1, s1, s5
|
|
; GFX9-NEXT: s_mul_i32 s6, s6, s9
|
|
; GFX9-NEXT: s_cmp_lg_u32 s25, 0
|
|
; GFX9-NEXT: s_addc_u32 s1, s1, s6
|
|
; GFX9-NEXT: s_mul_i32 s7, s7, s8
|
|
; GFX9-NEXT: s_mul_i32 s0, s0, s8
|
|
; GFX9-NEXT: s_add_u32 s7, s7, s1
|
|
; GFX9-NEXT: s_mov_b32 s1, s17
|
|
; GFX9-NEXT: s_mov_b32 s2, s18
|
|
; GFX9-NEXT: s_mov_b32 s3, s19
|
|
; GFX9-NEXT: s_mov_b32 s4, s20
|
|
; GFX9-NEXT: s_mov_b32 s5, s21
|
|
; GFX9-NEXT: s_mov_b32 s6, s22
|
|
; GFX9-NEXT: ; return to shader part epilog
|
|
;
|
|
; GFX10PLUS-LABEL: s_mul_i256:
|
|
; GFX10PLUS: ; %bb.0:
|
|
; GFX10PLUS-NEXT: s_mul_i32 s17, s0, s10
|
|
; GFX10PLUS-NEXT: s_mul_i32 s19, s1, s9
|
|
; GFX10PLUS-NEXT: s_mul_hi_u32 s18, s0, s10
|
|
; GFX10PLUS-NEXT: s_mul_hi_u32 s20, s1, s9
|
|
; GFX10PLUS-NEXT: s_add_u32 s17, s19, s17
|
|
; GFX10PLUS-NEXT: s_addc_u32 s18, s20, s18
|
|
; GFX10PLUS-NEXT: s_mul_i32 s20, s2, s8
|
|
; GFX10PLUS-NEXT: s_mul_hi_u32 s21, s2, s8
|
|
; GFX10PLUS-NEXT: s_cselect_b32 s19, 1, 0
|
|
; GFX10PLUS-NEXT: s_add_u32 s17, s20, s17
|
|
; GFX10PLUS-NEXT: s_mul_hi_u32 s16, s0, s8
|
|
; GFX10PLUS-NEXT: s_addc_u32 s18, s21, s18
|
|
; GFX10PLUS-NEXT: s_mul_i32 s21, s0, s9
|
|
; GFX10PLUS-NEXT: s_mul_hi_u32 s22, s0, s9
|
|
; GFX10PLUS-NEXT: s_cselect_b32 s20, 1, 0
|
|
; GFX10PLUS-NEXT: s_add_u32 s16, s21, s16
|
|
; GFX10PLUS-NEXT: s_addc_u32 s17, s22, s17
|
|
; GFX10PLUS-NEXT: s_mul_i32 s22, s1, s8
|
|
; GFX10PLUS-NEXT: s_mul_hi_u32 s23, s1, s8
|
|
; GFX10PLUS-NEXT: s_cselect_b32 s21, 1, 0
|
|
; GFX10PLUS-NEXT: s_add_u32 s16, s22, s16
|
|
; GFX10PLUS-NEXT: s_addc_u32 s17, s23, s17
|
|
; GFX10PLUS-NEXT: s_mul_i32 s23, s0, s12
|
|
; GFX10PLUS-NEXT: s_mul_i32 s25, s1, s11
|
|
; GFX10PLUS-NEXT: s_mul_hi_u32 s24, s0, s12
|
|
; GFX10PLUS-NEXT: s_mul_hi_u32 s26, s1, s11
|
|
; GFX10PLUS-NEXT: s_cselect_b32 s22, 1, 0
|
|
; GFX10PLUS-NEXT: s_add_u32 s23, s25, s23
|
|
; GFX10PLUS-NEXT: s_addc_u32 s24, s26, s24
|
|
; GFX10PLUS-NEXT: s_mul_i32 s26, s2, s10
|
|
; GFX10PLUS-NEXT: s_mul_hi_u32 s27, s2, s10
|
|
; GFX10PLUS-NEXT: s_cselect_b32 s25, 1, 0
|
|
; GFX10PLUS-NEXT: s_add_u32 s23, s26, s23
|
|
; GFX10PLUS-NEXT: s_addc_u32 s24, s27, s24
|
|
; GFX10PLUS-NEXT: s_mul_i32 s27, s3, s9
|
|
; GFX10PLUS-NEXT: s_mul_hi_u32 s28, s3, s9
|
|
; GFX10PLUS-NEXT: s_cselect_b32 s26, 1, 0
|
|
; GFX10PLUS-NEXT: s_add_u32 s23, s27, s23
|
|
; GFX10PLUS-NEXT: s_addc_u32 s24, s28, s24
|
|
; GFX10PLUS-NEXT: s_mul_i32 s28, s4, s8
|
|
; GFX10PLUS-NEXT: s_mul_hi_u32 s29, s4, s8
|
|
; GFX10PLUS-NEXT: s_cselect_b32 s27, 1, 0
|
|
; GFX10PLUS-NEXT: s_add_u32 s23, s28, s23
|
|
; GFX10PLUS-NEXT: s_addc_u32 s24, s29, s24
|
|
; GFX10PLUS-NEXT: s_mul_i32 s29, s0, s11
|
|
; GFX10PLUS-NEXT: s_mul_hi_u32 s30, s0, s11
|
|
; GFX10PLUS-NEXT: s_cselect_b32 s28, 1, 0
|
|
; GFX10PLUS-NEXT: s_add_u32 s18, s29, s18
|
|
; GFX10PLUS-NEXT: s_addc_u32 s23, s30, s23
|
|
; GFX10PLUS-NEXT: s_mul_i32 s30, s1, s10
|
|
; GFX10PLUS-NEXT: s_mul_hi_u32 s31, s1, s10
|
|
; GFX10PLUS-NEXT: s_cselect_b32 s29, 1, 0
|
|
; GFX10PLUS-NEXT: s_add_u32 s18, s30, s18
|
|
; GFX10PLUS-NEXT: s_addc_u32 s23, s31, s23
|
|
; GFX10PLUS-NEXT: s_mul_i32 s31, s2, s9
|
|
; GFX10PLUS-NEXT: s_mul_hi_u32 s33, s2, s9
|
|
; GFX10PLUS-NEXT: s_cselect_b32 s30, 1, 0
|
|
; GFX10PLUS-NEXT: s_add_u32 s18, s31, s18
|
|
; GFX10PLUS-NEXT: s_addc_u32 s23, s33, s23
|
|
; GFX10PLUS-NEXT: s_mul_i32 s33, s3, s8
|
|
; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s3, s8
|
|
; GFX10PLUS-NEXT: s_cselect_b32 s31, 1, 0
|
|
; GFX10PLUS-NEXT: s_add_u32 s18, s33, s18
|
|
; GFX10PLUS-NEXT: s_addc_u32 s23, s34, s23
|
|
; GFX10PLUS-NEXT: s_cselect_b32 s33, 1, 0
|
|
; GFX10PLUS-NEXT: s_cmp_lg_u32 s22, 0
|
|
; GFX10PLUS-NEXT: s_mul_hi_u32 s22, s0, s14
|
|
; GFX10PLUS-NEXT: s_addc_u32 s18, s21, s18
|
|
; GFX10PLUS-NEXT: s_cselect_b32 s21, 1, 0
|
|
; GFX10PLUS-NEXT: s_cmp_lg_u32 s20, 0
|
|
; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s1, s13
|
|
; GFX10PLUS-NEXT: s_addc_u32 s19, s19, 0
|
|
; GFX10PLUS-NEXT: s_cmp_lg_u32 s21, 0
|
|
; GFX10PLUS-NEXT: s_mul_i32 s21, s0, s14
|
|
; GFX10PLUS-NEXT: s_addc_u32 s19, s19, s23
|
|
; GFX10PLUS-NEXT: s_mul_i32 s23, s1, s13
|
|
; GFX10PLUS-NEXT: s_cselect_b32 s20, 1, 0
|
|
; GFX10PLUS-NEXT: s_add_u32 s21, s23, s21
|
|
; GFX10PLUS-NEXT: s_mul_i32 s23, s2, s12
|
|
; GFX10PLUS-NEXT: s_addc_u32 s22, s34, s22
|
|
; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s2, s12
|
|
; GFX10PLUS-NEXT: s_add_u32 s21, s23, s21
|
|
; GFX10PLUS-NEXT: s_mul_i32 s23, s3, s11
|
|
; GFX10PLUS-NEXT: s_addc_u32 s22, s34, s22
|
|
; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s3, s11
|
|
; GFX10PLUS-NEXT: s_add_u32 s21, s23, s21
|
|
; GFX10PLUS-NEXT: s_mul_i32 s23, s4, s10
|
|
; GFX10PLUS-NEXT: s_addc_u32 s22, s34, s22
|
|
; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s4, s10
|
|
; GFX10PLUS-NEXT: s_add_u32 s21, s23, s21
|
|
; GFX10PLUS-NEXT: s_mul_i32 s23, s5, s9
|
|
; GFX10PLUS-NEXT: s_addc_u32 s22, s34, s22
|
|
; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s5, s9
|
|
; GFX10PLUS-NEXT: s_add_u32 s21, s23, s21
|
|
; GFX10PLUS-NEXT: s_mul_i32 s23, s6, s8
|
|
; GFX10PLUS-NEXT: s_addc_u32 s22, s34, s22
|
|
; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s6, s8
|
|
; GFX10PLUS-NEXT: s_add_u32 s21, s23, s21
|
|
; GFX10PLUS-NEXT: s_mul_i32 s23, s0, s13
|
|
; GFX10PLUS-NEXT: s_addc_u32 s22, s34, s22
|
|
; GFX10PLUS-NEXT: s_mul_hi_u32 s34, s0, s13
|
|
; GFX10PLUS-NEXT: s_add_u32 s23, s23, s24
|
|
; GFX10PLUS-NEXT: s_addc_u32 s21, s34, s21
|
|
; GFX10PLUS-NEXT: s_mul_i32 s34, s1, s12
|
|
; GFX10PLUS-NEXT: s_mul_hi_u32 s35, s1, s12
|
|
; GFX10PLUS-NEXT: s_cselect_b32 s24, 1, 0
|
|
; GFX10PLUS-NEXT: s_add_u32 s23, s34, s23
|
|
; GFX10PLUS-NEXT: s_addc_u32 s21, s35, s21
|
|
; GFX10PLUS-NEXT: s_mul_i32 s35, s2, s11
|
|
; GFX10PLUS-NEXT: s_mul_hi_u32 s36, s2, s11
|
|
; GFX10PLUS-NEXT: s_cselect_b32 s34, 1, 0
|
|
; GFX10PLUS-NEXT: s_add_u32 s23, s35, s23
|
|
; GFX10PLUS-NEXT: s_addc_u32 s21, s36, s21
|
|
; GFX10PLUS-NEXT: s_mul_i32 s36, s3, s10
|
|
; GFX10PLUS-NEXT: s_mul_hi_u32 s37, s3, s10
|
|
; GFX10PLUS-NEXT: s_cselect_b32 s35, 1, 0
|
|
; GFX10PLUS-NEXT: s_add_u32 s23, s36, s23
|
|
; GFX10PLUS-NEXT: s_addc_u32 s21, s37, s21
|
|
; GFX10PLUS-NEXT: s_mul_i32 s37, s4, s9
|
|
; GFX10PLUS-NEXT: s_mul_hi_u32 s38, s4, s9
|
|
; GFX10PLUS-NEXT: s_cselect_b32 s36, 1, 0
|
|
; GFX10PLUS-NEXT: s_add_u32 s23, s37, s23
|
|
; GFX10PLUS-NEXT: s_addc_u32 s21, s38, s21
|
|
; GFX10PLUS-NEXT: s_mul_i32 s38, s5, s8
|
|
; GFX10PLUS-NEXT: s_mul_hi_u32 s39, s5, s8
|
|
; GFX10PLUS-NEXT: s_cselect_b32 s37, 1, 0
|
|
; GFX10PLUS-NEXT: s_add_u32 s23, s38, s23
|
|
; GFX10PLUS-NEXT: s_addc_u32 s21, s39, s21
|
|
; GFX10PLUS-NEXT: s_cselect_b32 s38, 1, 0
|
|
; GFX10PLUS-NEXT: s_cmp_lg_u32 s30, 0
|
|
; GFX10PLUS-NEXT: s_mul_i32 s1, s1, s14
|
|
; GFX10PLUS-NEXT: s_addc_u32 s29, s29, 0
|
|
; GFX10PLUS-NEXT: s_cmp_lg_u32 s31, 0
|
|
; GFX10PLUS-NEXT: s_mul_i32 s2, s2, s13
|
|
; GFX10PLUS-NEXT: s_addc_u32 s29, s29, 0
|
|
; GFX10PLUS-NEXT: s_cmp_lg_u32 s33, 0
|
|
; GFX10PLUS-NEXT: s_mul_i32 s3, s3, s12
|
|
; GFX10PLUS-NEXT: s_addc_u32 s29, s29, 0
|
|
; GFX10PLUS-NEXT: s_cmp_lg_u32 s20, 0
|
|
; GFX10PLUS-NEXT: s_mul_i32 s4, s4, s11
|
|
; GFX10PLUS-NEXT: s_addc_u32 s20, s29, s23
|
|
; GFX10PLUS-NEXT: s_cselect_b32 s23, 1, 0
|
|
; GFX10PLUS-NEXT: s_cmp_lg_u32 s26, 0
|
|
; GFX10PLUS-NEXT: s_mul_i32 s26, s0, s15
|
|
; GFX10PLUS-NEXT: s_addc_u32 s25, s25, 0
|
|
; GFX10PLUS-NEXT: s_cmp_lg_u32 s27, 0
|
|
; GFX10PLUS-NEXT: s_mul_i32 s5, s5, s10
|
|
; GFX10PLUS-NEXT: s_addc_u32 s25, s25, 0
|
|
; GFX10PLUS-NEXT: s_cmp_lg_u32 s28, 0
|
|
; GFX10PLUS-NEXT: s_mul_i32 s6, s6, s9
|
|
; GFX10PLUS-NEXT: s_addc_u32 s25, s25, 0
|
|
; GFX10PLUS-NEXT: s_cmp_lg_u32 s23, 0
|
|
; GFX10PLUS-NEXT: s_mul_i32 s7, s7, s8
|
|
; GFX10PLUS-NEXT: s_addc_u32 s15, s25, s21
|
|
; GFX10PLUS-NEXT: s_addc_u32 s21, s22, s26
|
|
; GFX10PLUS-NEXT: s_cmp_lg_u32 s38, 0
|
|
; GFX10PLUS-NEXT: s_mul_i32 s0, s0, s8
|
|
; GFX10PLUS-NEXT: s_addc_u32 s1, s21, s1
|
|
; GFX10PLUS-NEXT: s_cmp_lg_u32 s37, 0
|
|
; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s2
|
|
; GFX10PLUS-NEXT: s_cmp_lg_u32 s36, 0
|
|
; GFX10PLUS-NEXT: s_mov_b32 s2, s17
|
|
; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s3
|
|
; GFX10PLUS-NEXT: s_cmp_lg_u32 s35, 0
|
|
; GFX10PLUS-NEXT: s_mov_b32 s3, s18
|
|
; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s4
|
|
; GFX10PLUS-NEXT: s_cmp_lg_u32 s34, 0
|
|
; GFX10PLUS-NEXT: s_mov_b32 s4, s19
|
|
; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s5
|
|
; GFX10PLUS-NEXT: s_cmp_lg_u32 s24, 0
|
|
; GFX10PLUS-NEXT: s_mov_b32 s5, s20
|
|
; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s6
|
|
; GFX10PLUS-NEXT: s_mov_b32 s6, s15
|
|
; GFX10PLUS-NEXT: s_add_i32 s7, s1, s7
|
|
; GFX10PLUS-NEXT: s_mov_b32 s1, s16
|
|
; GFX10PLUS-NEXT: ; return to shader part epilog
|
|
;
|
|
; GFX12-LABEL: s_mul_i256:
|
|
; GFX12: ; %bb.0:
|
|
; GFX12-NEXT: s_mul_i32 s17, s0, s10
|
|
; GFX12-NEXT: s_mul_i32 s19, s1, s9
|
|
; GFX12-NEXT: s_mul_hi_u32 s18, s0, s10
|
|
; GFX12-NEXT: s_mul_hi_u32 s20, s1, s9
|
|
; GFX12-NEXT: s_add_co_u32 s17, s19, s17
|
|
; GFX12-NEXT: s_add_co_ci_u32 s18, s20, s18
|
|
; GFX12-NEXT: s_mul_i32 s20, s2, s8
|
|
; GFX12-NEXT: s_mul_hi_u32 s21, s2, s8
|
|
; GFX12-NEXT: s_cselect_b32 s19, 1, 0
|
|
; GFX12-NEXT: s_add_co_u32 s17, s20, s17
|
|
; GFX12-NEXT: s_mul_hi_u32 s16, s0, s8
|
|
; GFX12-NEXT: s_add_co_ci_u32 s18, s21, s18
|
|
; GFX12-NEXT: s_mul_i32 s21, s0, s9
|
|
; GFX12-NEXT: s_mul_hi_u32 s22, s0, s9
|
|
; GFX12-NEXT: s_cselect_b32 s20, 1, 0
|
|
; GFX12-NEXT: s_add_co_u32 s16, s21, s16
|
|
; GFX12-NEXT: s_add_co_ci_u32 s17, s22, s17
|
|
; GFX12-NEXT: s_mul_i32 s22, s1, s8
|
|
; GFX12-NEXT: s_mul_hi_u32 s23, s1, s8
|
|
; GFX12-NEXT: s_cselect_b32 s21, 1, 0
|
|
; GFX12-NEXT: s_add_co_u32 s16, s22, s16
|
|
; GFX12-NEXT: s_add_co_ci_u32 s17, s23, s17
|
|
; GFX12-NEXT: s_mul_i32 s23, s0, s12
|
|
; GFX12-NEXT: s_mul_i32 s25, s1, s11
|
|
; GFX12-NEXT: s_mul_hi_u32 s24, s0, s12
|
|
; GFX12-NEXT: s_mul_hi_u32 s26, s1, s11
|
|
; GFX12-NEXT: s_cselect_b32 s22, 1, 0
|
|
; GFX12-NEXT: s_add_co_u32 s23, s25, s23
|
|
; GFX12-NEXT: s_add_co_ci_u32 s24, s26, s24
|
|
; GFX12-NEXT: s_mul_i32 s26, s2, s10
|
|
; GFX12-NEXT: s_mul_hi_u32 s27, s2, s10
|
|
; GFX12-NEXT: s_cselect_b32 s25, 1, 0
|
|
; GFX12-NEXT: s_add_co_u32 s23, s26, s23
|
|
; GFX12-NEXT: s_add_co_ci_u32 s24, s27, s24
|
|
; GFX12-NEXT: s_mul_i32 s27, s3, s9
|
|
; GFX12-NEXT: s_mul_hi_u32 s28, s3, s9
|
|
; GFX12-NEXT: s_cselect_b32 s26, 1, 0
|
|
; GFX12-NEXT: s_add_co_u32 s23, s27, s23
|
|
; GFX12-NEXT: s_add_co_ci_u32 s24, s28, s24
|
|
; GFX12-NEXT: s_mul_i32 s28, s4, s8
|
|
; GFX12-NEXT: s_mul_hi_u32 s29, s4, s8
|
|
; GFX12-NEXT: s_cselect_b32 s27, 1, 0
|
|
; GFX12-NEXT: s_add_co_u32 s23, s28, s23
|
|
; GFX12-NEXT: s_add_co_ci_u32 s24, s29, s24
|
|
; GFX12-NEXT: s_mul_i32 s29, s0, s11
|
|
; GFX12-NEXT: s_mul_hi_u32 s30, s0, s11
|
|
; GFX12-NEXT: s_cselect_b32 s28, 1, 0
|
|
; GFX12-NEXT: s_add_co_u32 s18, s29, s18
|
|
; GFX12-NEXT: s_add_co_ci_u32 s23, s30, s23
|
|
; GFX12-NEXT: s_mul_i32 s30, s1, s10
|
|
; GFX12-NEXT: s_mul_hi_u32 s31, s1, s10
|
|
; GFX12-NEXT: s_cselect_b32 s29, 1, 0
|
|
; GFX12-NEXT: s_add_co_u32 s18, s30, s18
|
|
; GFX12-NEXT: s_add_co_ci_u32 s23, s31, s23
|
|
; GFX12-NEXT: s_mul_i32 s31, s2, s9
|
|
; GFX12-NEXT: s_mul_hi_u32 s33, s2, s9
|
|
; GFX12-NEXT: s_cselect_b32 s30, 1, 0
|
|
; GFX12-NEXT: s_add_co_u32 s18, s31, s18
|
|
; GFX12-NEXT: s_add_co_ci_u32 s23, s33, s23
|
|
; GFX12-NEXT: s_mul_i32 s33, s3, s8
|
|
; GFX12-NEXT: s_mul_hi_u32 s34, s3, s8
|
|
; GFX12-NEXT: s_cselect_b32 s31, 1, 0
|
|
; GFX12-NEXT: s_add_co_u32 s18, s33, s18
|
|
; GFX12-NEXT: s_add_co_ci_u32 s23, s34, s23
|
|
; GFX12-NEXT: s_cselect_b32 s33, 1, 0
|
|
; GFX12-NEXT: s_cmp_lg_u32 s22, 0
|
|
; GFX12-NEXT: s_mul_hi_u32 s22, s0, s14
|
|
; GFX12-NEXT: s_add_co_ci_u32 s18, s21, s18
|
|
; GFX12-NEXT: s_cselect_b32 s21, 1, 0
|
|
; GFX12-NEXT: s_cmp_lg_u32 s20, 0
|
|
; GFX12-NEXT: s_mul_hi_u32 s34, s1, s13
|
|
; GFX12-NEXT: s_add_co_ci_u32 s19, s19, 0
|
|
; GFX12-NEXT: s_cmp_lg_u32 s21, 0
|
|
; GFX12-NEXT: s_mul_i32 s21, s0, s14
|
|
; GFX12-NEXT: s_add_co_ci_u32 s19, s19, s23
|
|
; GFX12-NEXT: s_mul_i32 s23, s1, s13
|
|
; GFX12-NEXT: s_cselect_b32 s20, 1, 0
|
|
; GFX12-NEXT: s_add_co_u32 s21, s23, s21
|
|
; GFX12-NEXT: s_mul_i32 s23, s2, s12
|
|
; GFX12-NEXT: s_add_co_ci_u32 s22, s34, s22
|
|
; GFX12-NEXT: s_mul_hi_u32 s34, s2, s12
|
|
; GFX12-NEXT: s_add_co_u32 s21, s23, s21
|
|
; GFX12-NEXT: s_mul_i32 s23, s3, s11
|
|
; GFX12-NEXT: s_add_co_ci_u32 s22, s34, s22
|
|
; GFX12-NEXT: s_mul_hi_u32 s34, s3, s11
|
|
; GFX12-NEXT: s_add_co_u32 s21, s23, s21
|
|
; GFX12-NEXT: s_mul_i32 s23, s4, s10
|
|
; GFX12-NEXT: s_add_co_ci_u32 s22, s34, s22
|
|
; GFX12-NEXT: s_mul_hi_u32 s34, s4, s10
|
|
; GFX12-NEXT: s_add_co_u32 s21, s23, s21
|
|
; GFX12-NEXT: s_mul_i32 s23, s5, s9
|
|
; GFX12-NEXT: s_add_co_ci_u32 s22, s34, s22
|
|
; GFX12-NEXT: s_mul_hi_u32 s34, s5, s9
|
|
; GFX12-NEXT: s_add_co_u32 s21, s23, s21
|
|
; GFX12-NEXT: s_mul_i32 s23, s6, s8
|
|
; GFX12-NEXT: s_add_co_ci_u32 s22, s34, s22
|
|
; GFX12-NEXT: s_mul_hi_u32 s34, s6, s8
|
|
; GFX12-NEXT: s_add_co_u32 s21, s23, s21
|
|
; GFX12-NEXT: s_mul_i32 s23, s0, s13
|
|
; GFX12-NEXT: s_add_co_ci_u32 s22, s34, s22
|
|
; GFX12-NEXT: s_mul_hi_u32 s34, s0, s13
|
|
; GFX12-NEXT: s_add_co_u32 s23, s23, s24
|
|
; GFX12-NEXT: s_add_co_ci_u32 s21, s34, s21
|
|
; GFX12-NEXT: s_mul_i32 s34, s1, s12
|
|
; GFX12-NEXT: s_mul_hi_u32 s35, s1, s12
|
|
; GFX12-NEXT: s_cselect_b32 s24, 1, 0
|
|
; GFX12-NEXT: s_add_co_u32 s23, s34, s23
|
|
; GFX12-NEXT: s_add_co_ci_u32 s21, s35, s21
|
|
; GFX12-NEXT: s_mul_i32 s35, s2, s11
|
|
; GFX12-NEXT: s_mul_hi_u32 s36, s2, s11
|
|
; GFX12-NEXT: s_cselect_b32 s34, 1, 0
|
|
; GFX12-NEXT: s_add_co_u32 s23, s35, s23
|
|
; GFX12-NEXT: s_add_co_ci_u32 s21, s36, s21
|
|
; GFX12-NEXT: s_mul_i32 s36, s3, s10
|
|
; GFX12-NEXT: s_mul_hi_u32 s37, s3, s10
|
|
; GFX12-NEXT: s_cselect_b32 s35, 1, 0
|
|
; GFX12-NEXT: s_add_co_u32 s23, s36, s23
|
|
; GFX12-NEXT: s_add_co_ci_u32 s21, s37, s21
|
|
; GFX12-NEXT: s_mul_i32 s37, s4, s9
|
|
; GFX12-NEXT: s_mul_hi_u32 s38, s4, s9
|
|
; GFX12-NEXT: s_cselect_b32 s36, 1, 0
|
|
; GFX12-NEXT: s_add_co_u32 s23, s37, s23
|
|
; GFX12-NEXT: s_add_co_ci_u32 s21, s38, s21
|
|
; GFX12-NEXT: s_mul_i32 s38, s5, s8
|
|
; GFX12-NEXT: s_mul_hi_u32 s39, s5, s8
|
|
; GFX12-NEXT: s_cselect_b32 s37, 1, 0
|
|
; GFX12-NEXT: s_add_co_u32 s23, s38, s23
|
|
; GFX12-NEXT: s_add_co_ci_u32 s21, s39, s21
|
|
; GFX12-NEXT: s_cselect_b32 s38, 1, 0
|
|
; GFX12-NEXT: s_cmp_lg_u32 s30, 0
|
|
; GFX12-NEXT: s_mul_i32 s1, s1, s14
|
|
; GFX12-NEXT: s_add_co_ci_u32 s29, s29, 0
|
|
; GFX12-NEXT: s_cmp_lg_u32 s31, 0
|
|
; GFX12-NEXT: s_mul_i32 s2, s2, s13
|
|
; GFX12-NEXT: s_add_co_ci_u32 s29, s29, 0
|
|
; GFX12-NEXT: s_cmp_lg_u32 s33, 0
|
|
; GFX12-NEXT: s_mul_i32 s3, s3, s12
|
|
; GFX12-NEXT: s_add_co_ci_u32 s29, s29, 0
|
|
; GFX12-NEXT: s_cmp_lg_u32 s20, 0
|
|
; GFX12-NEXT: s_mul_i32 s4, s4, s11
|
|
; GFX12-NEXT: s_add_co_ci_u32 s20, s29, s23
|
|
; GFX12-NEXT: s_cselect_b32 s23, 1, 0
|
|
; GFX12-NEXT: s_cmp_lg_u32 s26, 0
|
|
; GFX12-NEXT: s_mul_i32 s26, s0, s15
|
|
; GFX12-NEXT: s_add_co_ci_u32 s25, s25, 0
|
|
; GFX12-NEXT: s_cmp_lg_u32 s27, 0
|
|
; GFX12-NEXT: s_mul_i32 s5, s5, s10
|
|
; GFX12-NEXT: s_add_co_ci_u32 s25, s25, 0
|
|
; GFX12-NEXT: s_cmp_lg_u32 s28, 0
|
|
; GFX12-NEXT: s_mul_i32 s6, s6, s9
|
|
; GFX12-NEXT: s_add_co_ci_u32 s25, s25, 0
|
|
; GFX12-NEXT: s_cmp_lg_u32 s23, 0
|
|
; GFX12-NEXT: s_mul_i32 s7, s7, s8
|
|
; GFX12-NEXT: s_add_co_ci_u32 s15, s25, s21
|
|
; GFX12-NEXT: s_add_co_ci_u32 s21, s22, s26
|
|
; GFX12-NEXT: s_cmp_lg_u32 s38, 0
|
|
; GFX12-NEXT: s_mul_i32 s0, s0, s8
|
|
; GFX12-NEXT: s_add_co_ci_u32 s1, s21, s1
|
|
; GFX12-NEXT: s_cmp_lg_u32 s37, 0
|
|
; GFX12-NEXT: s_add_co_ci_u32 s1, s1, s2
|
|
; GFX12-NEXT: s_cmp_lg_u32 s36, 0
|
|
; GFX12-NEXT: s_mov_b32 s2, s17
|
|
; GFX12-NEXT: s_add_co_ci_u32 s1, s1, s3
|
|
; GFX12-NEXT: s_cmp_lg_u32 s35, 0
|
|
; GFX12-NEXT: s_mov_b32 s3, s18
|
|
; GFX12-NEXT: s_add_co_ci_u32 s1, s1, s4
|
|
; GFX12-NEXT: s_cmp_lg_u32 s34, 0
|
|
; GFX12-NEXT: s_mov_b32 s4, s19
|
|
; GFX12-NEXT: s_add_co_ci_u32 s1, s1, s5
|
|
; GFX12-NEXT: s_cmp_lg_u32 s24, 0
|
|
; GFX12-NEXT: s_mov_b32 s5, s20
|
|
; GFX12-NEXT: s_add_co_ci_u32 s1, s1, s6
|
|
; GFX12-NEXT: s_mov_b32 s6, s15
|
|
; GFX12-NEXT: s_add_co_i32 s7, s1, s7
|
|
; GFX12-NEXT: s_mov_b32 s1, s16
|
|
; GFX12-NEXT: ; return to shader part epilog
|
|
;
|
|
; GFX1250-LABEL: s_mul_i256:
|
|
; GFX1250: ; %bb.0:
|
|
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
|
|
; GFX1250-NEXT: s_mul_i32 s17, s0, s10
|
|
; GFX1250-NEXT: s_mul_i32 s19, s1, s9
|
|
; GFX1250-NEXT: s_mul_hi_u32 s18, s0, s10
|
|
; GFX1250-NEXT: s_mul_hi_u32 s20, s1, s9
|
|
; GFX1250-NEXT: s_add_co_u32 s17, s19, s17
|
|
; GFX1250-NEXT: s_add_co_ci_u32 s18, s20, s18
|
|
; GFX1250-NEXT: s_mul_i32 s20, s2, s8
|
|
; GFX1250-NEXT: s_mul_hi_u32 s21, s2, s8
|
|
; GFX1250-NEXT: s_cselect_b32 s19, 1, 0
|
|
; GFX1250-NEXT: s_add_co_u32 s17, s20, s17
|
|
; GFX1250-NEXT: s_mul_hi_u32 s16, s0, s8
|
|
; GFX1250-NEXT: s_add_co_ci_u32 s18, s21, s18
|
|
; GFX1250-NEXT: s_mul_i32 s21, s0, s9
|
|
; GFX1250-NEXT: s_mul_hi_u32 s22, s0, s9
|
|
; GFX1250-NEXT: s_cselect_b32 s20, 1, 0
|
|
; GFX1250-NEXT: s_add_co_u32 s16, s21, s16
|
|
; GFX1250-NEXT: s_add_co_ci_u32 s17, s22, s17
|
|
; GFX1250-NEXT: s_mul_i32 s22, s1, s8
|
|
; GFX1250-NEXT: s_mul_hi_u32 s23, s1, s8
|
|
; GFX1250-NEXT: s_cselect_b32 s21, 1, 0
|
|
; GFX1250-NEXT: s_add_co_u32 s16, s22, s16
|
|
; GFX1250-NEXT: s_add_co_ci_u32 s17, s23, s17
|
|
; GFX1250-NEXT: s_mul_i32 s23, s0, s12
|
|
; GFX1250-NEXT: s_mul_i32 s25, s1, s11
|
|
; GFX1250-NEXT: s_mul_hi_u32 s24, s0, s12
|
|
; GFX1250-NEXT: s_mul_hi_u32 s26, s1, s11
|
|
; GFX1250-NEXT: s_cselect_b32 s22, 1, 0
|
|
; GFX1250-NEXT: s_add_co_u32 s23, s25, s23
|
|
; GFX1250-NEXT: s_add_co_ci_u32 s24, s26, s24
|
|
; GFX1250-NEXT: s_mul_i32 s26, s2, s10
|
|
; GFX1250-NEXT: s_mul_hi_u32 s27, s2, s10
|
|
; GFX1250-NEXT: s_cselect_b32 s25, 1, 0
|
|
; GFX1250-NEXT: s_add_co_u32 s23, s26, s23
|
|
; GFX1250-NEXT: s_add_co_ci_u32 s24, s27, s24
|
|
; GFX1250-NEXT: s_mul_i32 s27, s3, s9
|
|
; GFX1250-NEXT: s_mul_hi_u32 s28, s3, s9
|
|
; GFX1250-NEXT: s_cselect_b32 s26, 1, 0
|
|
; GFX1250-NEXT: s_add_co_u32 s23, s27, s23
|
|
; GFX1250-NEXT: s_add_co_ci_u32 s24, s28, s24
|
|
; GFX1250-NEXT: s_mul_i32 s28, s4, s8
|
|
; GFX1250-NEXT: s_mul_hi_u32 s29, s4, s8
|
|
; GFX1250-NEXT: s_cselect_b32 s27, 1, 0
|
|
; GFX1250-NEXT: s_add_co_u32 s23, s28, s23
|
|
; GFX1250-NEXT: s_add_co_ci_u32 s24, s29, s24
|
|
; GFX1250-NEXT: s_mul_i32 s29, s0, s11
|
|
; GFX1250-NEXT: s_mul_hi_u32 s30, s0, s11
|
|
; GFX1250-NEXT: s_cselect_b32 s28, 1, 0
|
|
; GFX1250-NEXT: s_add_co_u32 s18, s29, s18
|
|
; GFX1250-NEXT: s_add_co_ci_u32 s23, s30, s23
|
|
; GFX1250-NEXT: s_mul_i32 s30, s1, s10
|
|
; GFX1250-NEXT: s_mul_hi_u32 s31, s1, s10
|
|
; GFX1250-NEXT: s_cselect_b32 s29, 1, 0
|
|
; GFX1250-NEXT: s_add_co_u32 s18, s30, s18
|
|
; GFX1250-NEXT: s_add_co_ci_u32 s23, s31, s23
|
|
; GFX1250-NEXT: s_mul_i32 s31, s2, s9
|
|
; GFX1250-NEXT: s_mul_hi_u32 s33, s2, s9
|
|
; GFX1250-NEXT: s_cselect_b32 s30, 1, 0
|
|
; GFX1250-NEXT: s_add_co_u32 s18, s31, s18
|
|
; GFX1250-NEXT: s_add_co_ci_u32 s23, s33, s23
|
|
; GFX1250-NEXT: s_mul_i32 s33, s3, s8
|
|
; GFX1250-NEXT: s_mul_hi_u32 s34, s3, s8
|
|
; GFX1250-NEXT: s_cselect_b32 s31, 1, 0
|
|
; GFX1250-NEXT: s_add_co_u32 s18, s33, s18
|
|
; GFX1250-NEXT: s_add_co_ci_u32 s23, s34, s23
|
|
; GFX1250-NEXT: s_cselect_b32 s33, 1, 0
|
|
; GFX1250-NEXT: s_cmp_lg_u32 s22, 0
|
|
; GFX1250-NEXT: s_mul_hi_u32 s22, s0, s14
|
|
; GFX1250-NEXT: s_add_co_ci_u32 s18, s21, s18
|
|
; GFX1250-NEXT: s_cselect_b32 s21, 1, 0
|
|
; GFX1250-NEXT: s_cmp_lg_u32 s20, 0
|
|
; GFX1250-NEXT: s_mul_hi_u32 s34, s1, s13
|
|
; GFX1250-NEXT: s_add_co_ci_u32 s19, s19, 0
|
|
; GFX1250-NEXT: s_cmp_lg_u32 s21, 0
|
|
; GFX1250-NEXT: s_mul_i32 s21, s0, s14
|
|
; GFX1250-NEXT: s_add_co_ci_u32 s19, s19, s23
|
|
; GFX1250-NEXT: s_mul_i32 s23, s1, s13
|
|
; GFX1250-NEXT: s_cselect_b32 s20, 1, 0
|
|
; GFX1250-NEXT: s_add_co_u32 s21, s23, s21
|
|
; GFX1250-NEXT: s_mul_i32 s23, s2, s12
|
|
; GFX1250-NEXT: s_add_co_ci_u32 s22, s34, s22
|
|
; GFX1250-NEXT: s_mul_hi_u32 s34, s2, s12
|
|
; GFX1250-NEXT: s_add_co_u32 s21, s23, s21
|
|
; GFX1250-NEXT: s_mul_i32 s23, s3, s11
|
|
; GFX1250-NEXT: s_add_co_ci_u32 s22, s34, s22
|
|
; GFX1250-NEXT: s_mul_hi_u32 s34, s3, s11
|
|
; GFX1250-NEXT: s_add_co_u32 s21, s23, s21
|
|
; GFX1250-NEXT: s_mul_i32 s23, s4, s10
|
|
; GFX1250-NEXT: s_add_co_ci_u32 s22, s34, s22
|
|
; GFX1250-NEXT: s_mul_hi_u32 s34, s4, s10
|
|
; GFX1250-NEXT: s_add_co_u32 s21, s23, s21
|
|
; GFX1250-NEXT: s_mul_i32 s23, s5, s9
|
|
; GFX1250-NEXT: s_add_co_ci_u32 s22, s34, s22
|
|
; GFX1250-NEXT: s_mul_hi_u32 s34, s5, s9
|
|
; GFX1250-NEXT: s_add_co_u32 s21, s23, s21
|
|
; GFX1250-NEXT: s_mul_i32 s23, s6, s8
|
|
; GFX1250-NEXT: s_add_co_ci_u32 s22, s34, s22
|
|
; GFX1250-NEXT: s_mul_hi_u32 s34, s6, s8
|
|
; GFX1250-NEXT: s_add_co_u32 s21, s23, s21
|
|
; GFX1250-NEXT: s_mul_i32 s23, s0, s13
|
|
; GFX1250-NEXT: s_add_co_ci_u32 s22, s34, s22
|
|
; GFX1250-NEXT: s_mul_hi_u32 s34, s0, s13
|
|
; GFX1250-NEXT: s_add_co_u32 s23, s23, s24
|
|
; GFX1250-NEXT: s_add_co_ci_u32 s21, s34, s21
|
|
; GFX1250-NEXT: s_mul_i32 s34, s1, s12
|
|
; GFX1250-NEXT: s_mul_hi_u32 s35, s1, s12
|
|
; GFX1250-NEXT: s_cselect_b32 s24, 1, 0
|
|
; GFX1250-NEXT: s_add_co_u32 s23, s34, s23
|
|
; GFX1250-NEXT: s_add_co_ci_u32 s21, s35, s21
|
|
; GFX1250-NEXT: s_mul_i32 s35, s2, s11
|
|
; GFX1250-NEXT: s_mul_hi_u32 s36, s2, s11
|
|
; GFX1250-NEXT: s_cselect_b32 s34, 1, 0
|
|
; GFX1250-NEXT: s_add_co_u32 s23, s35, s23
|
|
; GFX1250-NEXT: s_add_co_ci_u32 s21, s36, s21
|
|
; GFX1250-NEXT: s_mul_i32 s36, s3, s10
|
|
; GFX1250-NEXT: s_mul_hi_u32 s37, s3, s10
|
|
; GFX1250-NEXT: s_cselect_b32 s35, 1, 0
|
|
; GFX1250-NEXT: s_add_co_u32 s23, s36, s23
|
|
; GFX1250-NEXT: s_add_co_ci_u32 s21, s37, s21
|
|
; GFX1250-NEXT: s_mul_i32 s37, s4, s9
|
|
; GFX1250-NEXT: s_mul_hi_u32 s38, s4, s9
|
|
; GFX1250-NEXT: s_cselect_b32 s36, 1, 0
|
|
; GFX1250-NEXT: s_add_co_u32 s23, s37, s23
|
|
; GFX1250-NEXT: s_add_co_ci_u32 s21, s38, s21
|
|
; GFX1250-NEXT: s_mul_i32 s38, s5, s8
|
|
; GFX1250-NEXT: s_mul_hi_u32 s39, s5, s8
|
|
; GFX1250-NEXT: s_cselect_b32 s37, 1, 0
|
|
; GFX1250-NEXT: s_add_co_u32 s23, s38, s23
|
|
; GFX1250-NEXT: s_add_co_ci_u32 s21, s39, s21
|
|
; GFX1250-NEXT: s_cselect_b32 s38, 1, 0
|
|
; GFX1250-NEXT: s_cmp_lg_u32 s30, 0
|
|
; GFX1250-NEXT: s_mul_i32 s1, s1, s14
|
|
; GFX1250-NEXT: s_add_co_ci_u32 s29, s29, 0
|
|
; GFX1250-NEXT: s_cmp_lg_u32 s31, 0
|
|
; GFX1250-NEXT: s_mul_i32 s2, s2, s13
|
|
; GFX1250-NEXT: s_add_co_ci_u32 s29, s29, 0
|
|
; GFX1250-NEXT: s_cmp_lg_u32 s33, 0
|
|
; GFX1250-NEXT: s_mul_i32 s3, s3, s12
|
|
; GFX1250-NEXT: s_add_co_ci_u32 s29, s29, 0
|
|
; GFX1250-NEXT: s_cmp_lg_u32 s20, 0
|
|
; GFX1250-NEXT: s_mul_i32 s4, s4, s11
|
|
; GFX1250-NEXT: s_add_co_ci_u32 s20, s29, s23
|
|
; GFX1250-NEXT: s_cselect_b32 s23, 1, 0
|
|
; GFX1250-NEXT: s_cmp_lg_u32 s26, 0
|
|
; GFX1250-NEXT: s_mul_i32 s26, s0, s15
|
|
; GFX1250-NEXT: s_add_co_ci_u32 s25, s25, 0
|
|
; GFX1250-NEXT: s_cmp_lg_u32 s27, 0
|
|
; GFX1250-NEXT: s_mul_i32 s5, s5, s10
|
|
; GFX1250-NEXT: s_add_co_ci_u32 s25, s25, 0
|
|
; GFX1250-NEXT: s_cmp_lg_u32 s28, 0
|
|
; GFX1250-NEXT: s_mul_i32 s6, s6, s9
|
|
; GFX1250-NEXT: s_add_co_ci_u32 s25, s25, 0
|
|
; GFX1250-NEXT: s_cmp_lg_u32 s23, 0
|
|
; GFX1250-NEXT: s_mul_i32 s7, s7, s8
|
|
; GFX1250-NEXT: s_add_co_ci_u32 s15, s25, s21
|
|
; GFX1250-NEXT: s_add_co_ci_u32 s21, s22, s26
|
|
; GFX1250-NEXT: s_cmp_lg_u32 s38, 0
|
|
; GFX1250-NEXT: s_mul_i32 s0, s0, s8
|
|
; GFX1250-NEXT: s_add_co_ci_u32 s1, s21, s1
|
|
; GFX1250-NEXT: s_cmp_lg_u32 s37, 0
|
|
; GFX1250-NEXT: s_add_co_ci_u32 s1, s1, s2
|
|
; GFX1250-NEXT: s_cmp_lg_u32 s36, 0
|
|
; GFX1250-NEXT: s_mov_b32 s2, s17
|
|
; GFX1250-NEXT: s_add_co_ci_u32 s1, s1, s3
|
|
; GFX1250-NEXT: s_cmp_lg_u32 s35, 0
|
|
; GFX1250-NEXT: s_mov_b32 s3, s18
|
|
; GFX1250-NEXT: s_add_co_ci_u32 s1, s1, s4
|
|
; GFX1250-NEXT: s_cmp_lg_u32 s34, 0
|
|
; GFX1250-NEXT: s_mov_b32 s4, s19
|
|
; GFX1250-NEXT: s_add_co_ci_u32 s1, s1, s5
|
|
; GFX1250-NEXT: s_cmp_lg_u32 s24, 0
|
|
; GFX1250-NEXT: s_mov_b32 s5, s20
|
|
; GFX1250-NEXT: s_add_co_ci_u32 s1, s1, s6
|
|
; GFX1250-NEXT: s_mov_b32 s6, s15
|
|
; GFX1250-NEXT: s_add_co_i32 s7, s1, s7
|
|
; GFX1250-NEXT: s_mov_b32 s1, s16
|
|
; GFX1250-NEXT: ; return to shader part epilog
|
|
%result = mul i256 %num, %den
|
|
%cast = bitcast i256 %result to <8 x i32>
|
|
ret <8 x i32> %cast
|
|
}
|
|
|
|
define i256 @v_mul_i256(i256 %num, i256 %den) {
|
|
; GFX7-LABEL: v_mul_i256:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0
|
|
; GFX7-NEXT: v_mul_lo_u32 v28, v4, v11
|
|
; GFX7-NEXT: v_mul_lo_u32 v29, v3, v12
|
|
; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v13, v[16:17]
|
|
; GFX7-NEXT: v_mul_lo_u32 v30, v2, v13
|
|
; GFX7-NEXT: v_mul_lo_u32 v27, v5, v10
|
|
; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[18:19]
|
|
; GFX7-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v3, v11, v[16:17]
|
|
; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v10, 0
|
|
; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v4, v10, v[18:19]
|
|
; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v1, v9, v[16:17]
|
|
; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v5, v9, v[20:21]
|
|
; GFX7-NEXT: v_cndmask_b32_e64 v26, 0, 1, s[4:5]
|
|
; GFX7-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v8, v[22:23]
|
|
; GFX7-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v6, v8, v[16:17]
|
|
; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v12, 0
|
|
; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v1, v11, v[16:17]
|
|
; GFX7-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5]
|
|
; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v10, v[20:21]
|
|
; GFX7-NEXT: v_addc_u32_e64 v20, s[4:5], 0, v24, s[4:5]
|
|
; GFX7-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v3, v9, v[16:17]
|
|
; GFX7-NEXT: v_addc_u32_e64 v16, s[4:5], 0, v20, s[4:5]
|
|
; GFX7-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v4, v8, v[24:25]
|
|
; GFX7-NEXT: v_addc_u32_e64 v24, s[4:5], 0, v16, s[4:5]
|
|
; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v13, v[21:22]
|
|
; GFX7-NEXT: v_mul_lo_u32 v25, v6, v9
|
|
; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[6:7], v1, v12, v[16:17]
|
|
; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[8:9], v2, v11, v[21:22]
|
|
; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[10:11], v3, v10, v[16:17]
|
|
; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[14:15], v0, v11, v[19:20]
|
|
; GFX7-NEXT: v_mad_u64_u32 v[12:13], s[12:13], v4, v9, v[21:22]
|
|
; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v26, vcc
|
|
; GFX7-NEXT: v_mad_u64_u32 v[19:20], vcc, v1, v10, v[16:17]
|
|
; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15]
|
|
; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc
|
|
; GFX7-NEXT: v_mad_u64_u32 v[10:11], vcc, v2, v9, v[19:20]
|
|
; GFX7-NEXT: v_mad_u64_u32 v[16:17], s[16:17], v0, v8, 0
|
|
; GFX7-NEXT: v_addc_u32_e32 v2, vcc, 0, v6, vcc
|
|
; GFX7-NEXT: v_mad_u64_u32 v[19:20], vcc, v3, v8, v[10:11]
|
|
; GFX7-NEXT: v_mad_u64_u32 v[21:22], s[14:15], v5, v8, v[12:13]
|
|
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v2, vcc
|
|
; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[16:17], v0, v9, v[17:18]
|
|
; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[16:17]
|
|
; GFX7-NEXT: v_mul_lo_u32 v0, v0, v15
|
|
; GFX7-NEXT: v_mad_u64_u32 v[11:12], vcc, v1, v8, v[2:3]
|
|
; GFX7-NEXT: v_addc_u32_e32 v3, vcc, v6, v19, vcc
|
|
; GFX7-NEXT: v_mul_lo_u32 v10, v1, v14
|
|
; GFX7-NEXT: v_addc_u32_e32 v4, vcc, v4, v20, vcc
|
|
; GFX7-NEXT: v_addc_u32_e32 v5, vcc, v5, v21, vcc
|
|
; GFX7-NEXT: v_addc_u32_e32 v6, vcc, v24, v22, vcc
|
|
; GFX7-NEXT: v_addc_u32_e32 v0, vcc, v23, v0, vcc
|
|
; GFX7-NEXT: v_addc_u32_e64 v0, vcc, v0, v10, s[14:15]
|
|
; GFX7-NEXT: v_addc_u32_e64 v0, vcc, v0, v30, s[12:13]
|
|
; GFX7-NEXT: v_addc_u32_e64 v0, vcc, v0, v29, s[10:11]
|
|
; GFX7-NEXT: v_addc_u32_e64 v0, vcc, v0, v28, s[8:9]
|
|
; GFX7-NEXT: v_addc_u32_e64 v0, vcc, v0, v27, s[6:7]
|
|
; GFX7-NEXT: v_addc_u32_e64 v0, vcc, v0, v25, s[4:5]
|
|
; GFX7-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v7, v8, v[0:1]
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, v16
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, v11
|
|
; GFX7-NEXT: v_mov_b32_e32 v2, v12
|
|
; GFX7-NEXT: v_mov_b32_e32 v7, v9
|
|
; GFX7-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX8-LABEL: v_mul_i256:
|
|
; GFX8: ; %bb.0:
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0
|
|
; GFX8-NEXT: v_mul_lo_u32 v28, v4, v11
|
|
; GFX8-NEXT: v_mul_lo_u32 v29, v3, v12
|
|
; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v13, v[16:17]
|
|
; GFX8-NEXT: v_mul_lo_u32 v30, v2, v13
|
|
; GFX8-NEXT: v_mul_lo_u32 v27, v5, v10
|
|
; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[18:19]
|
|
; GFX8-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v3, v11, v[16:17]
|
|
; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v10, 0
|
|
; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v4, v10, v[18:19]
|
|
; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v1, v9, v[16:17]
|
|
; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v5, v9, v[20:21]
|
|
; GFX8-NEXT: v_cndmask_b32_e64 v26, 0, 1, s[4:5]
|
|
; GFX8-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v8, v[22:23]
|
|
; GFX8-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v6, v8, v[16:17]
|
|
; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v12, 0
|
|
; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v1, v11, v[16:17]
|
|
; GFX8-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5]
|
|
; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v10, v[20:21]
|
|
; GFX8-NEXT: v_addc_u32_e64 v20, s[4:5], 0, v24, s[4:5]
|
|
; GFX8-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v3, v9, v[16:17]
|
|
; GFX8-NEXT: v_addc_u32_e64 v16, s[4:5], 0, v20, s[4:5]
|
|
; GFX8-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v4, v8, v[24:25]
|
|
; GFX8-NEXT: v_addc_u32_e64 v24, s[4:5], 0, v16, s[4:5]
|
|
; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v13, v[21:22]
|
|
; GFX8-NEXT: v_mul_lo_u32 v25, v6, v9
|
|
; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[6:7], v1, v12, v[16:17]
|
|
; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[8:9], v2, v11, v[21:22]
|
|
; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[10:11], v3, v10, v[16:17]
|
|
; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[14:15], v0, v11, v[19:20]
|
|
; GFX8-NEXT: v_mad_u64_u32 v[12:13], s[12:13], v4, v9, v[21:22]
|
|
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v26, vcc
|
|
; GFX8-NEXT: v_mad_u64_u32 v[19:20], vcc, v1, v10, v[16:17]
|
|
; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15]
|
|
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc
|
|
; GFX8-NEXT: v_mad_u64_u32 v[10:11], vcc, v2, v9, v[19:20]
|
|
; GFX8-NEXT: v_mad_u64_u32 v[16:17], s[16:17], v0, v8, 0
|
|
; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v6, vcc
|
|
; GFX8-NEXT: v_mad_u64_u32 v[19:20], vcc, v3, v8, v[10:11]
|
|
; GFX8-NEXT: v_mad_u64_u32 v[21:22], s[14:15], v5, v8, v[12:13]
|
|
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v2, vcc
|
|
; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[16:17], v0, v9, v[17:18]
|
|
; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[16:17]
|
|
; GFX8-NEXT: v_mul_lo_u32 v0, v0, v15
|
|
; GFX8-NEXT: v_mad_u64_u32 v[11:12], vcc, v1, v8, v[2:3]
|
|
; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v6, v19, vcc
|
|
; GFX8-NEXT: v_mul_lo_u32 v10, v1, v14
|
|
; GFX8-NEXT: v_addc_u32_e32 v4, vcc, v4, v20, vcc
|
|
; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v5, v21, vcc
|
|
; GFX8-NEXT: v_addc_u32_e32 v6, vcc, v24, v22, vcc
|
|
; GFX8-NEXT: v_addc_u32_e32 v0, vcc, v23, v0, vcc
|
|
; GFX8-NEXT: v_addc_u32_e64 v0, vcc, v0, v10, s[14:15]
|
|
; GFX8-NEXT: v_addc_u32_e64 v0, vcc, v0, v30, s[12:13]
|
|
; GFX8-NEXT: v_addc_u32_e64 v0, vcc, v0, v29, s[10:11]
|
|
; GFX8-NEXT: v_addc_u32_e64 v0, vcc, v0, v28, s[8:9]
|
|
; GFX8-NEXT: v_addc_u32_e64 v0, vcc, v0, v27, s[6:7]
|
|
; GFX8-NEXT: v_addc_u32_e64 v0, vcc, v0, v25, s[4:5]
|
|
; GFX8-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v7, v8, v[0:1]
|
|
; GFX8-NEXT: v_mov_b32_e32 v0, v16
|
|
; GFX8-NEXT: v_mov_b32_e32 v1, v11
|
|
; GFX8-NEXT: v_mov_b32_e32 v2, v12
|
|
; GFX8-NEXT: v_mov_b32_e32 v7, v9
|
|
; GFX8-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX9-LABEL: v_mul_i256:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v14, 0
|
|
; GFX9-NEXT: v_mul_lo_u32 v28, v4, v11
|
|
; GFX9-NEXT: v_mul_lo_u32 v29, v3, v12
|
|
; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v1, v13, v[16:17]
|
|
; GFX9-NEXT: v_mul_lo_u32 v30, v2, v13
|
|
; GFX9-NEXT: v_mul_lo_u32 v27, v5, v10
|
|
; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v12, v[18:19]
|
|
; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v3, v11, v[16:17]
|
|
; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v10, 0
|
|
; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v4, v10, v[18:19]
|
|
; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v1, v9, v[16:17]
|
|
; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v5, v9, v[20:21]
|
|
; GFX9-NEXT: v_cndmask_b32_e64 v26, 0, 1, s[4:5]
|
|
; GFX9-NEXT: v_mad_u64_u32 v[18:19], vcc, v2, v8, v[22:23]
|
|
; GFX9-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v6, v8, v[16:17]
|
|
; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v12, 0
|
|
; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v1, v11, v[16:17]
|
|
; GFX9-NEXT: v_cndmask_b32_e64 v24, 0, 1, s[4:5]
|
|
; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v2, v10, v[20:21]
|
|
; GFX9-NEXT: v_addc_co_u32_e64 v20, s[4:5], 0, v24, s[4:5]
|
|
; GFX9-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v3, v9, v[16:17]
|
|
; GFX9-NEXT: v_addc_co_u32_e64 v16, s[4:5], 0, v20, s[4:5]
|
|
; GFX9-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v4, v8, v[24:25]
|
|
; GFX9-NEXT: v_addc_co_u32_e64 v24, s[4:5], 0, v16, s[4:5]
|
|
; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v0, v13, v[21:22]
|
|
; GFX9-NEXT: v_mul_lo_u32 v25, v6, v9
|
|
; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[6:7], v1, v12, v[16:17]
|
|
; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[8:9], v2, v11, v[21:22]
|
|
; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[10:11], v3, v10, v[16:17]
|
|
; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[14:15], v0, v11, v[19:20]
|
|
; GFX9-NEXT: v_mad_u64_u32 v[12:13], s[12:13], v4, v9, v[21:22]
|
|
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v26, vcc
|
|
; GFX9-NEXT: v_mad_u64_u32 v[19:20], vcc, v1, v10, v[16:17]
|
|
; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15]
|
|
; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
|
|
; GFX9-NEXT: v_mad_u64_u32 v[10:11], vcc, v2, v9, v[19:20]
|
|
; GFX9-NEXT: v_mad_u64_u32 v[16:17], s[16:17], v0, v8, 0
|
|
; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v6, vcc
|
|
; GFX9-NEXT: v_mad_u64_u32 v[19:20], vcc, v3, v8, v[10:11]
|
|
; GFX9-NEXT: v_mad_u64_u32 v[21:22], s[14:15], v5, v8, v[12:13]
|
|
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v2, vcc
|
|
; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[16:17], v0, v9, v[17:18]
|
|
; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[16:17]
|
|
; GFX9-NEXT: v_mul_lo_u32 v0, v0, v15
|
|
; GFX9-NEXT: v_mad_u64_u32 v[11:12], vcc, v1, v8, v[2:3]
|
|
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v6, v19, vcc
|
|
; GFX9-NEXT: v_mul_lo_u32 v10, v1, v14
|
|
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v4, v20, vcc
|
|
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v21, vcc
|
|
; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v24, v22, vcc
|
|
; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, v23, v0, vcc
|
|
; GFX9-NEXT: v_addc_co_u32_e64 v0, vcc, v0, v10, s[14:15]
|
|
; GFX9-NEXT: v_addc_co_u32_e64 v0, vcc, v0, v30, s[12:13]
|
|
; GFX9-NEXT: v_addc_co_u32_e64 v0, vcc, v0, v29, s[10:11]
|
|
; GFX9-NEXT: v_addc_co_u32_e64 v0, vcc, v0, v28, s[8:9]
|
|
; GFX9-NEXT: v_addc_co_u32_e64 v0, vcc, v0, v27, s[6:7]
|
|
; GFX9-NEXT: v_addc_co_u32_e64 v0, vcc, v0, v25, s[4:5]
|
|
; GFX9-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v7, v8, v[0:1]
|
|
; GFX9-NEXT: v_mov_b32_e32 v0, v16
|
|
; GFX9-NEXT: v_mov_b32_e32 v1, v11
|
|
; GFX9-NEXT: v_mov_b32_e32 v2, v12
|
|
; GFX9-NEXT: v_mov_b32_e32 v7, v9
|
|
; GFX9-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX10-LABEL: v_mul_i256:
|
|
; GFX10: ; %bb.0:
|
|
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX10-NEXT: v_mov_b32_e32 v16, v0
|
|
; GFX10-NEXT: v_mov_b32_e32 v17, v1
|
|
; GFX10-NEXT: v_mov_b32_e32 v18, v2
|
|
; GFX10-NEXT: v_mov_b32_e32 v19, v3
|
|
; GFX10-NEXT: v_mov_b32_e32 v20, v4
|
|
; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v16, v14, 0
|
|
; GFX10-NEXT: v_mov_b32_e32 v21, v5
|
|
; GFX10-NEXT: v_mov_b32_e32 v0, v6
|
|
; GFX10-NEXT: v_mov_b32_e32 v22, v7
|
|
; GFX10-NEXT: v_mul_lo_u32 v31, v17, v14
|
|
; GFX10-NEXT: v_mul_lo_u32 v29, v20, v11
|
|
; GFX10-NEXT: v_mul_lo_u32 v30, v16, v15
|
|
; GFX10-NEXT: v_mad_u64_u32 v[3:4], s4, v17, v13, v[1:2]
|
|
; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v16, v12, 0
|
|
; GFX10-NEXT: v_mul_lo_u32 v27, v0, v9
|
|
; GFX10-NEXT: v_mad_u64_u32 v[5:6], s4, v18, v12, v[3:4]
|
|
; GFX10-NEXT: v_mad_u64_u32 v[3:4], s4, v17, v11, v[1:2]
|
|
; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, s4
|
|
; GFX10-NEXT: v_mad_u64_u32 v[25:26], s4, v16, v10, 0
|
|
; GFX10-NEXT: v_mad_u64_u32 v[1:2], s5, v19, v11, v[5:6]
|
|
; GFX10-NEXT: v_mad_u64_u32 v[5:6], vcc_lo, v18, v10, v[3:4]
|
|
; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo
|
|
; GFX10-NEXT: v_mad_u64_u32 v[3:4], s4, v20, v10, v[1:2]
|
|
; GFX10-NEXT: v_mad_u64_u32 v[1:2], vcc_lo, v19, v9, v[5:6]
|
|
; GFX10-NEXT: v_mad_u64_u32 v[23:24], s4, v21, v9, v[3:4]
|
|
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v7, vcc_lo
|
|
; GFX10-NEXT: v_mad_u64_u32 v[4:5], vcc_lo, v20, v8, v[1:2]
|
|
; GFX10-NEXT: v_mad_u64_u32 v[6:7], s4, v0, v8, v[23:24]
|
|
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v17, v9, v[25:26]
|
|
; GFX10-NEXT: v_add_co_ci_u32_e32 v25, vcc_lo, 0, v3, vcc_lo
|
|
; GFX10-NEXT: v_cndmask_b32_e64 v28, 0, 1, s4
|
|
; GFX10-NEXT: v_mul_lo_u32 v26, v21, v10
|
|
; GFX10-NEXT: v_mad_u64_u32 v[23:24], vcc_lo, v16, v13, v[5:6]
|
|
; GFX10-NEXT: v_mad_u64_u32 v[2:3], s4, v18, v8, v[0:1]
|
|
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s6, v16, v8, 0
|
|
; GFX10-NEXT: v_add_co_ci_u32_e64 v28, s4, 0, v28, s4
|
|
; GFX10-NEXT: v_mad_u64_u32 v[5:6], s5, v17, v12, v[23:24]
|
|
; GFX10-NEXT: v_mad_u64_u32 v[23:24], s6, v16, v11, v[3:4]
|
|
; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, 1, s6
|
|
; GFX10-NEXT: v_mad_u64_u32 v[3:4], s4, v18, v11, v[5:6]
|
|
; GFX10-NEXT: v_mad_u64_u32 v[5:6], s6, v17, v10, v[23:24]
|
|
; GFX10-NEXT: v_mul_lo_u32 v23, v19, v12
|
|
; GFX10-NEXT: v_mul_lo_u32 v24, v18, v13
|
|
; GFX10-NEXT: v_mad_u64_u32 v[11:12], s7, v19, v10, v[3:4]
|
|
; GFX10-NEXT: v_add_co_ci_u32_e64 v10, s6, 0, v14, s6
|
|
; GFX10-NEXT: v_mad_u64_u32 v[3:4], s6, v18, v9, v[5:6]
|
|
; GFX10-NEXT: v_add_co_ci_u32_e64 v14, s6, 0, v10, s6
|
|
; GFX10-NEXT: v_mad_u64_u32 v[5:6], s6, v20, v9, v[11:12]
|
|
; GFX10-NEXT: v_mad_u64_u32 v[10:11], s8, v16, v9, v[1:2]
|
|
; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s8
|
|
; GFX10-NEXT: v_mad_u64_u32 v[12:13], s8, v19, v8, v[3:4]
|
|
; GFX10-NEXT: v_add_co_ci_u32_e64 v16, s8, 0, v14, s8
|
|
; GFX10-NEXT: v_mad_u64_u32 v[14:15], s8, v21, v8, v[5:6]
|
|
; GFX10-NEXT: v_mad_u64_u32 v[1:2], s9, v17, v8, v[10:11]
|
|
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s9, v9, v12, s9
|
|
; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s9, v28, v13, s9
|
|
; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s9, v16, v14, s9
|
|
; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s9, v25, v15, s9
|
|
; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s9, v7, v30, s9
|
|
; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s8, v7, v31, s8
|
|
; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s6, v7, v24, s6
|
|
; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s6, v7, v23, s7
|
|
; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s4, v7, v29, s4
|
|
; GFX10-NEXT: v_add_co_ci_u32_e64 v7, s4, v7, v26, s5
|
|
; GFX10-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v7, v27, vcc_lo
|
|
; GFX10-NEXT: v_mad_u64_u32 v[7:8], s4, v22, v8, v[7:8]
|
|
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX11-LABEL: v_mul_i256:
|
|
; GFX11: ; %bb.0:
|
|
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
|
; GFX11-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1
|
|
; GFX11-NEXT: v_dual_mov_b32 v18, v2 :: v_dual_mov_b32 v19, v3
|
|
; GFX11-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v21, v5
|
|
; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v16, v14, 0
|
|
; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v23, v7
|
|
; GFX11-NEXT: v_mov_b32_e32 v22, v8
|
|
; GFX11-NEXT: v_mad_u64_u32 v[26:27], null, v16, v10, 0
|
|
; GFX11-NEXT: v_mul_lo_u32 v28, v0, v9
|
|
; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v17, v13, v[1:2]
|
|
; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v16, v12, 0
|
|
; GFX11-NEXT: v_mul_lo_u32 v30, v20, v11
|
|
; GFX11-NEXT: v_mul_lo_u32 v15, v16, v15
|
|
; GFX11-NEXT: v_mul_lo_u32 v14, v17, v14
|
|
; GFX11-NEXT: v_mad_u64_u32 v[5:6], null, v18, v12, v[3:4]
|
|
; GFX11-NEXT: v_mad_u64_u32 v[3:4], s0, v17, v11, v[1:2]
|
|
; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, 1, s0
|
|
; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v19, v11, v[5:6]
|
|
; GFX11-NEXT: v_mad_u64_u32 v[5:6], vcc_lo, v18, v10, v[3:4]
|
|
; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
|
|
; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v20, v10, v[1:2]
|
|
; GFX11-NEXT: v_mad_u64_u32 v[1:2], vcc_lo, v19, v9, v[5:6]
|
|
; GFX11-NEXT: v_mad_u64_u32 v[24:25], null, v21, v9, v[3:4]
|
|
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v7, vcc_lo
|
|
; GFX11-NEXT: v_mad_u64_u32 v[4:5], vcc_lo, v20, v22, v[1:2]
|
|
; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v0, v22, v[24:25]
|
|
; GFX11-NEXT: v_mad_u64_u32 v[0:1], s0, v17, v9, v[26:27]
|
|
; GFX11-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v3, vcc_lo
|
|
; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, s0
|
|
; GFX11-NEXT: v_mul_lo_u32 v27, v21, v10
|
|
; GFX11-NEXT: v_mad_u64_u32 v[24:25], vcc_lo, v16, v13, v[5:6]
|
|
; GFX11-NEXT: v_mad_u64_u32 v[2:3], s0, v18, v22, v[0:1]
|
|
; GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v8, s0
|
|
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v22, 0
|
|
; GFX11-NEXT: v_mad_u64_u32 v[5:6], s1, v17, v12, v[24:25]
|
|
; GFX11-NEXT: v_mad_u64_u32 v[24:25], s2, v16, v11, v[3:4]
|
|
; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, 1, s2
|
|
; GFX11-NEXT: v_mad_u64_u32 v[3:4], s0, v18, v11, v[5:6]
|
|
; GFX11-NEXT: v_mad_u64_u32 v[5:6], s2, v17, v10, v[24:25]
|
|
; GFX11-NEXT: v_mul_lo_u32 v24, v19, v12
|
|
; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, 0, v8, s2
|
|
; GFX11-NEXT: v_mul_lo_u32 v25, v18, v13
|
|
; GFX11-NEXT: v_mad_u64_u32 v[11:12], s3, v19, v10, v[3:4]
|
|
; GFX11-NEXT: v_mad_u64_u32 v[3:4], s2, v18, v9, v[5:6]
|
|
; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, v8, s2
|
|
; GFX11-NEXT: v_mad_u64_u32 v[5:6], s2, v20, v9, v[11:12]
|
|
; GFX11-NEXT: v_mad_u64_u32 v[10:11], s4, v16, v9, v[1:2]
|
|
; GFX11-NEXT: v_cndmask_b32_e64 v16, 0, 1, s4
|
|
; GFX11-NEXT: v_mad_u64_u32 v[8:9], s4, v19, v22, v[3:4]
|
|
; GFX11-NEXT: v_add_co_ci_u32_e64 v18, null, 0, v13, s4
|
|
; GFX11-NEXT: v_mad_u64_u32 v[12:13], s4, v21, v22, v[5:6]
|
|
; GFX11-NEXT: v_mad_u64_u32 v[1:2], s5, v17, v22, v[10:11]
|
|
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s5, v16, v8, s5
|
|
; GFX11-NEXT: v_add_co_ci_u32_e64 v4, s5, v29, v9, s5
|
|
; GFX11-NEXT: v_add_co_ci_u32_e64 v5, s5, v18, v12, s5
|
|
; GFX11-NEXT: v_add_co_ci_u32_e64 v6, s5, v26, v13, s5
|
|
; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v15, s5
|
|
; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v14, s4
|
|
; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v25, s2
|
|
; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v24, s3
|
|
; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v30, s0
|
|
; GFX11-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v27, s1
|
|
; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, v7, v28, vcc_lo
|
|
; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v23, v22, v[9:10]
|
|
; GFX11-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX12-LABEL: v_mul_i256:
|
|
; GFX12: ; %bb.0:
|
|
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX12-NEXT: s_wait_expcnt 0x0
|
|
; GFX12-NEXT: s_wait_samplecnt 0x0
|
|
; GFX12-NEXT: s_wait_bvhcnt 0x0
|
|
; GFX12-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1
|
|
; GFX12-NEXT: v_dual_mov_b32 v18, v2 :: v_dual_mov_b32 v19, v3
|
|
; GFX12-NEXT: v_dual_mov_b32 v20, v4 :: v_dual_mov_b32 v21, v5
|
|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
|
|
; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v16, v14, 0
|
|
; GFX12-NEXT: v_mov_b32_e32 v0, v6
|
|
; GFX12-NEXT: v_mov_b32_e32 v22, v7
|
|
; GFX12-NEXT: v_mad_co_u64_u32 v[25:26], null, v16, v10, 0
|
|
; GFX12-NEXT: v_mul_lo_u32 v31, v17, v14
|
|
; GFX12-NEXT: v_mul_lo_u32 v27, v0, v9
|
|
; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], null, v17, v13, v[1:2]
|
|
; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v16, v12, 0
|
|
; GFX12-NEXT: v_mul_lo_u32 v29, v20, v11
|
|
; GFX12-NEXT: v_mul_lo_u32 v30, v16, v15
|
|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
|
|
; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], null, v18, v12, v[3:4]
|
|
; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], s0, v17, v11, v[1:2]
|
|
; GFX12-NEXT: s_wait_alu 0xf1ff
|
|
; GFX12-NEXT: v_cndmask_b32_e64 v7, 0, 1, s0
|
|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
|
; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v19, v11, v[5:6]
|
|
; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], vcc_lo, v18, v10, v[3:4]
|
|
; GFX12-NEXT: s_wait_alu 0xfffd
|
|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
|
; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v7, vcc_lo
|
|
; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], null, v20, v10, v[1:2]
|
|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
|
|
; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], vcc_lo, v19, v9, v[5:6]
|
|
; GFX12-NEXT: v_mad_co_u64_u32 v[23:24], null, v21, v9, v[3:4]
|
|
; GFX12-NEXT: s_wait_alu 0xfffd
|
|
; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v7, vcc_lo
|
|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
|
; GFX12-NEXT: v_mad_co_u64_u32 v[4:5], vcc_lo, v20, v8, v[1:2]
|
|
; GFX12-NEXT: v_mad_co_u64_u32 v[6:7], null, v0, v8, v[23:24]
|
|
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[25:26]
|
|
; GFX12-NEXT: s_wait_alu 0xfffd
|
|
; GFX12-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v3, vcc_lo
|
|
; GFX12-NEXT: s_wait_alu 0xf1ff
|
|
; GFX12-NEXT: v_cndmask_b32_e64 v28, 0, 1, s0
|
|
; GFX12-NEXT: v_mul_lo_u32 v26, v21, v10
|
|
; GFX12-NEXT: v_mad_co_u64_u32 v[23:24], vcc_lo, v16, v13, v[5:6]
|
|
; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], s0, v18, v8, v[0:1]
|
|
; GFX12-NEXT: s_wait_alu 0xf1ff
|
|
; GFX12-NEXT: v_add_co_ci_u32_e64 v28, null, 0, v28, s0
|
|
; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v16, v8, 0
|
|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
|
|
; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], s1, v17, v12, v[23:24]
|
|
; GFX12-NEXT: v_mad_co_u64_u32 v[23:24], s2, v16, v11, v[3:4]
|
|
; GFX12-NEXT: s_wait_alu 0xf1ff
|
|
; GFX12-NEXT: v_cndmask_b32_e64 v14, 0, 1, s2
|
|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
|
; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], s0, v18, v11, v[5:6]
|
|
; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], s2, v17, v10, v[23:24]
|
|
; GFX12-NEXT: v_mul_lo_u32 v23, v19, v12
|
|
; GFX12-NEXT: v_mul_lo_u32 v24, v18, v13
|
|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2)
|
|
; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], s3, v19, v10, v[3:4]
|
|
; GFX12-NEXT: s_wait_alu 0xf1ff
|
|
; GFX12-NEXT: v_add_co_ci_u32_e64 v10, null, 0, v14, s2
|
|
; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], s2, v18, v9, v[5:6]
|
|
; GFX12-NEXT: s_wait_alu 0xf1ff
|
|
; GFX12-NEXT: v_add_co_ci_u32_e64 v14, null, 0, v10, s2
|
|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4)
|
|
; GFX12-NEXT: v_mad_co_u64_u32 v[5:6], s2, v20, v9, v[11:12]
|
|
; GFX12-NEXT: v_mad_co_u64_u32 v[10:11], s4, v16, v9, v[1:2]
|
|
; GFX12-NEXT: s_wait_alu 0xf1ff
|
|
; GFX12-NEXT: v_cndmask_b32_e64 v9, 0, 1, s4
|
|
; GFX12-NEXT: v_mad_co_u64_u32 v[12:13], s4, v19, v8, v[3:4]
|
|
; GFX12-NEXT: s_wait_alu 0xf1ff
|
|
; GFX12-NEXT: v_add_co_ci_u32_e64 v16, null, 0, v14, s4
|
|
; GFX12-NEXT: v_mad_co_u64_u32 v[14:15], s4, v21, v8, v[5:6]
|
|
; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], s5, v17, v8, v[10:11]
|
|
; GFX12-NEXT: s_wait_alu 0xf1ff
|
|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
|
|
; GFX12-NEXT: v_add_co_ci_u32_e64 v3, s5, v9, v12, s5
|
|
; GFX12-NEXT: s_wait_alu 0xf1ff
|
|
; GFX12-NEXT: v_add_co_ci_u32_e64 v4, s5, v28, v13, s5
|
|
; GFX12-NEXT: s_wait_alu 0xf1ff
|
|
; GFX12-NEXT: v_add_co_ci_u32_e64 v5, s5, v16, v14, s5
|
|
; GFX12-NEXT: s_wait_alu 0xf1ff
|
|
; GFX12-NEXT: v_add_co_ci_u32_e64 v6, s5, v25, v15, s5
|
|
; GFX12-NEXT: s_wait_alu 0xf1ff
|
|
; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v30, s5
|
|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v31, s4
|
|
; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v24, s2
|
|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v23, s3
|
|
; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v29, s0
|
|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
|
|
; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v26, s1
|
|
; GFX12-NEXT: s_wait_alu 0xfffd
|
|
; GFX12-NEXT: v_add_co_ci_u32_e64 v7, null, v7, v27, vcc_lo
|
|
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
|
|
; GFX12-NEXT: v_mad_co_u64_u32 v[7:8], null, v22, v8, v[7:8]
|
|
; GFX12-NEXT: s_setpc_b64 s[30:31]
|
|
;
|
|
; GFX1250-LABEL: v_mul_i256:
|
|
; GFX1250: ; %bb.0:
|
|
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
|
|
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-NEXT: v_dual_mov_b32 v16, v0 :: v_dual_mov_b32 v17, v1
|
|
; GFX1250-NEXT: v_mul_lo_u32 v30, v4, v11
|
|
; GFX1250-NEXT: v_mul_lo_u32 v29, v5, v10
|
|
; GFX1250-NEXT: v_mul_lo_u32 v31, v3, v12
|
|
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
|
; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v16, v14, 0
|
|
; GFX1250-NEXT: v_mul_lo_u32 v32, v2, v13
|
|
; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v17, v13, v[0:1]
|
|
; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v16, v12, 0
|
|
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
|
; GFX1250-NEXT: v_mad_nc_u64_u32 v[20:21], v2, v12, v[18:19]
|
|
; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s0, v17, v11, v[0:1]
|
|
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
|
|
; GFX1250-NEXT: v_cndmask_b32_e64 v22, 0, 1, s0
|
|
; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v3, v11, v[20:21]
|
|
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v2, v10, v[18:19]
|
|
; GFX1250-NEXT: v_add_co_ci_u32_e64 v22, null, 0, v22, vcc_lo
|
|
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
|
|
; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v4, v10, v[0:1]
|
|
; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], vcc_lo, v3, v9, v[20:21]
|
|
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
|
|
; GFX1250-NEXT: v_add_co_ci_u32_e64 v26, null, 0, v22, vcc_lo
|
|
; GFX1250-NEXT: v_mad_nc_u64_u32 v[20:21], v5, v9, v[18:19]
|
|
; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v16, v10, 0
|
|
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
; GFX1250-NEXT: v_mad_co_u64_u32 v[22:23], vcc_lo, v4, v8, v[0:1]
|
|
; GFX1250-NEXT: v_add_co_ci_u32_e64 v28, null, 0, v26, vcc_lo
|
|
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
|
|
; GFX1250-NEXT: v_mad_nc_u64_u32 v[24:25], v6, v8, v[20:21]
|
|
; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v9, v[18:19]
|
|
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
|
|
; GFX1250-NEXT: v_dual_mov_b32 v18, v23 :: v_dual_mov_b32 v19, v24
|
|
; GFX1250-NEXT: v_mul_lo_u32 v24, v6, v9
|
|
; GFX1250-NEXT: v_cndmask_b32_e64 v6, 0, 1, s0
|
|
; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], vcc_lo, v16, v13, v[18:19]
|
|
; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s0, v2, v8, v[0:1]
|
|
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
|
|
; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, null, 0, v6, s0
|
|
; GFX1250-NEXT: v_mad_co_u64_u32 v[0:1], s0, v17, v12, v[20:21]
|
|
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
|
|
; GFX1250-NEXT: v_dual_mov_b32 v20, v19 :: v_dual_mov_b32 v21, v22
|
|
; GFX1250-NEXT: v_mov_b32_e32 v13, v18
|
|
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
|
|
; GFX1250-NEXT: v_mad_co_u64_u32 v[22:23], s2, v16, v11, v[20:21]
|
|
; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], s1, v2, v11, v[0:1]
|
|
; GFX1250-NEXT: v_cndmask_b32_e64 v11, 0, 1, s2
|
|
; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v16, v8, 0
|
|
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
; GFX1250-NEXT: v_mad_co_u64_u32 v[26:27], s2, v17, v10, v[22:23]
|
|
; GFX1250-NEXT: v_add_co_ci_u32_e64 v33, null, 0, v11, s2
|
|
; GFX1250-NEXT: v_mad_co_u64_u32 v[22:23], s3, v3, v10, v[20:21]
|
|
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
|
|
; GFX1250-NEXT: v_mov_b32_e32 v12, v1
|
|
; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s2, v2, v9, v[26:27]
|
|
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
|
|
; GFX1250-NEXT: v_mad_co_u64_u32 v[20:21], s6, v16, v9, v[12:13]
|
|
; GFX1250-NEXT: v_mad_co_u64_u32 v[18:19], s4, v4, v9, v[22:23]
|
|
; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v33, s2
|
|
; GFX1250-NEXT: v_mul_lo_u32 v2, v16, v15
|
|
; GFX1250-NEXT: v_mul_lo_u32 v9, v17, v14
|
|
; GFX1250-NEXT: v_mad_co_u64_u32 v[12:13], s2, v3, v8, v[10:11]
|
|
; GFX1250-NEXT: v_cndmask_b32_e64 v3, 0, 1, s6
|
|
; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, s2
|
|
; GFX1250-NEXT: v_mad_co_u64_u32 v[10:11], s5, v5, v8, v[18:19]
|
|
; GFX1250-NEXT: v_mad_co_u64_u32 v[14:15], s2, v17, v8, v[20:21]
|
|
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
; GFX1250-NEXT: v_add_co_ci_u32_e64 v3, s2, v3, v12, s2
|
|
; GFX1250-NEXT: v_add_co_ci_u32_e64 v4, s2, v6, v13, s2
|
|
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
; GFX1250-NEXT: v_add_co_ci_u32_e64 v5, s2, v1, v10, s2
|
|
; GFX1250-NEXT: v_add_co_ci_u32_e64 v6, s2, v28, v11, s2
|
|
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
|
|
; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v25, v2, s2
|
|
; GFX1250-NEXT: v_mov_b32_e32 v2, v15
|
|
; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v9, s5
|
|
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v32, s4
|
|
; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v31, s3
|
|
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v30, s1
|
|
; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v29, s0
|
|
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
; GFX1250-NEXT: v_add_co_ci_u32_e64 v1, null, v1, v24, vcc_lo
|
|
; GFX1250-NEXT: v_mad_u32 v7, v7, v8, v1
|
|
; GFX1250-NEXT: v_mov_b32_e32 v1, v14
|
|
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
|
|
%result = mul i256 %num, %den
|
|
ret i256 %result
|
|
}
|
|
|
|
; Zero-extended 32x32->64 multiply by a constant, operands held in VGPRs.
;
; This is an amdgpu_ps function whose pointer arguments are not marked inreg;
; the expected code addresses %in through v[2:3] and %out through v[0:1].
; The IR loads an i32, zero-extends it to i64, multiplies by 80 (0x50) and
; stores the i64 result.
;
; Expected lowering, as pinned by the check lines below:
;   - every target emits one 64-bit multiply-add with a zero addend
;     (v_mad_u64_u32, v_mad_co_u64_u32 or v_mad_nc_u64_u32);
;   - gfx7, gfx8 and gfx9 first copy 0x50 into a VGPR, while gfx10 and later
;     pass 0x50 as a literal operand.
;
; The check lines are autogenerated (see the note at the top of this file);
; regenerate them with utils/update_llc_test_checks.py rather than editing
; them by hand.
define amdgpu_ps void @s_mul_u64_zext_with_vregs(ptr addrspace(1) %out, ptr addrspace(1) %in) {
|
|
; GFX7-LABEL: s_mul_u64_zext_with_vregs:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_mov_b32 s2, 0
|
|
; GFX7-NEXT: s_mov_b32 s3, 0xf000
|
|
; GFX7-NEXT: s_mov_b64 s[0:1], 0
|
|
; GFX7-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64
|
|
; GFX7-NEXT: v_mov_b32_e32 v5, 0x50
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v5, 0
|
|
; GFX7-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX8-LABEL: s_mul_u64_zext_with_vregs:
|
|
; GFX8: ; %bb.0:
|
|
; GFX8-NEXT: flat_load_dword v4, v[2:3]
|
|
; GFX8-NEXT: v_mov_b32_e32 v5, 0x50
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0
|
|
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
|
; GFX8-NEXT: s_endpgm
|
|
;
|
|
; GFX9-LABEL: s_mul_u64_zext_with_vregs:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: global_load_dword v4, v[2:3], off
|
|
; GFX9-NEXT: v_mov_b32_e32 v5, 0x50
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v5, 0
|
|
; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
|
|
; GFX9-NEXT: s_endpgm
|
|
;
|
|
; GFX10-LABEL: s_mul_u64_zext_with_vregs:
|
|
; GFX10: ; %bb.0:
|
|
; GFX10-NEXT: global_load_dword v4, v[2:3], off
|
|
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, 0x50, v4, 0
|
|
; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
|
|
; GFX10-NEXT: s_endpgm
|
|
;
|
|
; GFX11-LABEL: s_mul_u64_zext_with_vregs:
|
|
; GFX11: ; %bb.0:
|
|
; GFX11-NEXT: global_load_b32 v4, v[2:3], off
|
|
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0x50, v4, 0
|
|
; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
|
|
; GFX11-NEXT: s_endpgm
|
|
;
|
|
; GFX12-LABEL: s_mul_u64_zext_with_vregs:
|
|
; GFX12: ; %bb.0:
|
|
; GFX12-NEXT: global_load_b32 v4, v[2:3], off
|
|
; GFX12-NEXT: s_wait_loadcnt 0x0
|
|
; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, 0x50, v4, 0
|
|
; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off
|
|
; GFX12-NEXT: s_endpgm
|
|
;
|
|
; GFX1250-LABEL: s_mul_u64_zext_with_vregs:
|
|
; GFX1250: ; %bb.0:
|
|
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
|
|
; GFX1250-NEXT: global_load_b32 v4, v[2:3], off
|
|
; GFX1250-NEXT: s_wait_loadcnt 0x0
|
|
; GFX1250-NEXT: v_mad_nc_u64_u32 v[2:3], 0x50, v4, 0
|
|
; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off
|
|
; GFX1250-NEXT: s_endpgm
|
|
%val = load i32, ptr addrspace(1) %in, align 4
|
|
; zext leaves the upper 32 bits of %ext zero, so an unsigned 32x32->64 product
; is enough to compute the full i64 multiply.
%ext = zext i32 %val to i64
|
|
%mul = mul i64 %ext, 80
|
|
store i64 %mul, ptr addrspace(1) %out, align 8
|
|
ret void
|
|
}
|
|
|
|
; Zero-extended 32x32->64 multiply by a constant, operands held in SGPRs.
;
; This is an amdgpu_kernel function. The expected code fetches both pointer
; arguments with one scalar load from s[4:5] and the i32 operand with a second
; scalar load. The IR loads an i32, zero-extends it to i64, multiplies by 80
; (0x50) and stores the i64 result.
;
; Expected lowering, as pinned by the check lines below:
;   - gfx7 and gfx8 compute the low half with a scalar multiply (s_mul_i32 or
;     s_mulk_i32) and the high half with v_mul_hi_u32 followed by
;     v_readfirstlane_b32;
;   - gfx9, gfx10 and gfx11 use s_mul_i32 plus s_mul_hi_u32;
;   - gfx12 and gfx1250 zero the high word with s_mov_b32 and issue a single
;     s_mul_u64.
;
; The check lines are autogenerated (see the note at the top of this file);
; regenerate them with utils/update_llc_test_checks.py rather than editing
; them by hand.
define amdgpu_kernel void @s_mul_u64_zext_with_sregs(ptr addrspace(1) %out, ptr addrspace(1) %in) {
|
|
; GFX7-LABEL: s_mul_u64_zext_with_sregs:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, 0x50
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: s_load_dword s3, s[2:3], 0x0
|
|
; GFX7-NEXT: s_mov_b32 s2, -1
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: v_mul_hi_u32 v0, s3, v0
|
|
; GFX7-NEXT: s_mul_i32 s4, s3, 0x50
|
|
; GFX7-NEXT: s_mov_b32 s3, 0xf000
|
|
; GFX7-NEXT: v_readfirstlane_b32 s5, v0
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX8-LABEL: s_mul_u64_zext_with_sregs:
|
|
; GFX8: ; %bb.0:
|
|
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GFX8-NEXT: v_mov_b32_e32 v0, 0x50
|
|
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0
|
|
; GFX8-NEXT: v_mov_b32_e32 v3, s1
|
|
; GFX8-NEXT: v_mov_b32_e32 v2, s0
|
|
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX8-NEXT: v_mul_hi_u32 v0, s2, v0
|
|
; GFX8-NEXT: s_mulk_i32 s2, 0x50
|
|
; GFX8-NEXT: v_readfirstlane_b32 s3, v0
|
|
; GFX8-NEXT: v_mov_b32_e32 v0, s2
|
|
; GFX8-NEXT: v_mov_b32_e32 v1, s3
|
|
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
|
; GFX8-NEXT: s_endpgm
|
|
;
|
|
; GFX9-LABEL: s_mul_u64_zext_with_sregs:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GFX9-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NEXT: s_load_dword s3, s[2:3], 0x0
|
|
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NEXT: s_mul_i32 s2, s3, 0x50
|
|
; GFX9-NEXT: s_mul_hi_u32 s3, s3, 0x50
|
|
; GFX9-NEXT: v_mov_b32_e32 v0, s2
|
|
; GFX9-NEXT: v_mov_b32_e32 v1, s3
|
|
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
|
|
; GFX9-NEXT: s_endpgm
|
|
;
|
|
; GFX10-LABEL: s_mul_u64_zext_with_sregs:
|
|
; GFX10: ; %bb.0:
|
|
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GFX10-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-NEXT: s_load_dword s3, s[2:3], 0x0
|
|
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-NEXT: s_mul_i32 s2, s3, 0x50
|
|
; GFX10-NEXT: s_mul_hi_u32 s3, s3, 0x50
|
|
; GFX10-NEXT: v_mov_b32_e32 v0, s2
|
|
; GFX10-NEXT: v_mov_b32_e32 v1, s3
|
|
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
|
|
; GFX10-NEXT: s_endpgm
|
|
;
|
|
; GFX11-LABEL: s_mul_u64_zext_with_sregs:
|
|
; GFX11: ; %bb.0:
|
|
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
|
; GFX11-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX11-NEXT: s_load_b32 s3, s[2:3], 0x0
|
|
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX11-NEXT: s_mul_i32 s2, s3, 0x50
|
|
; GFX11-NEXT: s_mul_hi_u32 s3, s3, 0x50
|
|
; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
|
|
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
|
|
; GFX11-NEXT: s_endpgm
|
|
;
|
|
; GFX12-LABEL: s_mul_u64_zext_with_sregs:
|
|
; GFX12: ; %bb.0:
|
|
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
|
; GFX12-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX12-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
|
|
; GFX12-NEXT: s_mov_b32 s3, 0
|
|
; GFX12-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-NEXT: s_mul_u64 s[2:3], s[2:3], 0x50
|
|
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
|
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
|
|
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
|
|
; GFX12-NEXT: s_endpgm
|
|
;
|
|
; GFX1250-LABEL: s_mul_u64_zext_with_sregs:
|
|
; GFX1250: ; %bb.0:
|
|
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
|
|
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
|
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0
|
|
; GFX1250-NEXT: s_wait_xcnt 0x0
|
|
; GFX1250-NEXT: s_mov_b32 s3, 0
|
|
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-NEXT: s_mul_u64 s[2:3], s[2:3], 0x50
|
|
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
|
|
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
|
|
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
|
|
; GFX1250-NEXT: s_endpgm
|
|
%val = load i32, ptr addrspace(1) %in, align 4
|
|
; zext leaves the upper 32 bits of %ext zero, so an unsigned 32x32->64 product
; is enough to compute the full i64 multiply.
%ext = zext i32 %val to i64
|
|
%mul = mul i64 %ext, 80
|
|
store i64 %mul, ptr addrspace(1) %out, align 8
|
|
ret void
|
|
}
|
|
|
|
; Sign-extended 32x32->64 multiply by a constant, operands held in VGPRs.
;
; This is an amdgpu_ps function whose pointer arguments are not marked inreg;
; the expected code addresses %in through v[2:3] and %out through v[0:1].
; The IR loads an i32, sign-extends it to i64, multiplies by 80 (0x50) and
; stores the i64 result.
;
; Expected lowering, as pinned by the check lines below:
;   - gfx7 through gfx11 use an unsigned v_mad_u64_u32 for the low product,
;     build the sign word with v_ashrrev_i32 by 31, and fold it into the high
;     half with a second v_mad_u64_u32 (only the low result register of that
;     second multiply-add is part of the stored pair v[2:3]);
;   - gfx12 uses a single signed v_mad_co_i64_i32 and gfx1250 a single signed
;     v_mad_nc_i64_i32.
;
; The check lines are autogenerated (see the note at the top of this file);
; regenerate them with utils/update_llc_test_checks.py rather than editing
; them by hand.
define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addrspace(1) %in) {
|
|
; GFX7-LABEL: s_mul_u64_sext_with_vregs:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_mov_b32 s2, 0
|
|
; GFX7-NEXT: s_mov_b32 s3, 0xf000
|
|
; GFX7-NEXT: s_mov_b64 s[0:1], 0
|
|
; GFX7-NEXT: buffer_load_dword v4, v[2:3], s[0:3], 0 addr64
|
|
; GFX7-NEXT: v_mov_b32_e32 v6, 0x50
|
|
; GFX7-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX7-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v6, 0
|
|
; GFX7-NEXT: v_ashrrev_i32_e32 v7, 31, v4
|
|
; GFX7-NEXT: v_mov_b32_e32 v5, v3
|
|
; GFX7-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v7, v6, v[5:6]
|
|
; GFX7-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX8-LABEL: s_mul_u64_sext_with_vregs:
|
|
; GFX8: ; %bb.0:
|
|
; GFX8-NEXT: flat_load_dword v4, v[2:3]
|
|
; GFX8-NEXT: v_mov_b32_e32 v6, 0x50
|
|
; GFX8-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v6, 0
|
|
; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v4
|
|
; GFX8-NEXT: v_mov_b32_e32 v5, v3
|
|
; GFX8-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v7, v6, v[5:6]
|
|
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
|
|
; GFX8-NEXT: s_endpgm
|
|
;
|
|
; GFX9-LABEL: s_mul_u64_sext_with_vregs:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: global_load_dword v4, v[2:3], off
|
|
; GFX9-NEXT: v_mov_b32_e32 v6, 0x50
|
|
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], v4, v6, 0
|
|
; GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v4
|
|
; GFX9-NEXT: v_mov_b32_e32 v5, v3
|
|
; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[0:1], v7, v6, v[5:6]
|
|
; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
|
|
; GFX9-NEXT: s_endpgm
|
|
;
|
|
; GFX10-LABEL: s_mul_u64_sext_with_vregs:
|
|
; GFX10: ; %bb.0:
|
|
; GFX10-NEXT: global_load_dword v4, v[2:3], off
|
|
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, 0x50, v4, 0
|
|
; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v4
|
|
; GFX10-NEXT: v_mad_u64_u32 v[3:4], s0, 0x50, v4, v[3:4]
|
|
; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
|
|
; GFX10-NEXT: s_endpgm
|
|
;
|
|
; GFX11-LABEL: s_mul_u64_sext_with_vregs:
|
|
; GFX11: ; %bb.0:
|
|
; GFX11-NEXT: global_load_b32 v4, v[2:3], off
|
|
; GFX11-NEXT: s_waitcnt vmcnt(0)
|
|
; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, 0x50, v4, 0
|
|
; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v4
|
|
; GFX11-NEXT: v_mov_b32_e32 v5, v3
|
|
; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, 0x50, v6, v[5:6]
|
|
; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
|
|
; GFX11-NEXT: s_endpgm
|
|
;
|
|
; GFX12-LABEL: s_mul_u64_sext_with_vregs:
|
|
; GFX12: ; %bb.0:
|
|
; GFX12-NEXT: global_load_b32 v4, v[2:3], off
|
|
; GFX12-NEXT: s_wait_loadcnt 0x0
|
|
; GFX12-NEXT: v_mad_co_i64_i32 v[2:3], null, 0x50, v4, 0
|
|
; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off
|
|
; GFX12-NEXT: s_endpgm
|
|
;
|
|
; GFX1250-LABEL: s_mul_u64_sext_with_vregs:
|
|
; GFX1250: ; %bb.0:
|
|
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
|
|
; GFX1250-NEXT: global_load_b32 v4, v[2:3], off
|
|
; GFX1250-NEXT: s_wait_loadcnt 0x0
|
|
; GFX1250-NEXT: v_mad_nc_i64_i32 v[2:3], 0x50, v4, 0
|
|
; GFX1250-NEXT: global_store_b64 v[0:1], v[2:3], off
|
|
; GFX1250-NEXT: s_endpgm
|
|
%val = load i32, ptr addrspace(1) %in, align 4
|
|
; sext copies bit 31 of %val into the upper 32 bits of %ext, so the high half
; of the product depends on the sign of the loaded value.
%ext = sext i32 %val to i64
|
|
%mul = mul i64 %ext, 80
|
|
store i64 %mul, ptr addrspace(1) %out, align 8
|
|
ret void
|
|
}
|
|
|
|
; Sign-extended 32x32->64 multiply by a constant, operands held in SGPRs.
;
; This is an amdgpu_kernel function. The expected code fetches both pointer
; arguments with one scalar load from s[4:5] and the i32 operand with a second
; scalar load. The IR loads an i32, sign-extends it to i64, multiplies by 80
; (0x50) and stores the i64 result.
;
; Expected lowering, as pinned by the check lines below:
;   - gfx7 and gfx8 take the unsigned high half from v_mul_hi_u32 followed by
;     v_readfirstlane_b32, build the sign word with s_ashr_i32 by 31, scale it
;     with s_mulk_i32 and add the two with s_add_u32;
;   - gfx9, gfx10 and gfx11 do the same with s_mul_hi_u32 in place of the
;     vector multiply (gfx9 adds with s_add_u32, gfx10 and gfx11 with
;     s_add_i32);
;   - gfx12 and gfx1250 build the high word with s_ashr_i32 and issue a single
;     s_mul_u64.
;
; The check lines are autogenerated (see the note at the top of this file);
; regenerate them with utils/update_llc_test_checks.py rather than editing
; them by hand.
define amdgpu_kernel void @s_mul_u64_sext_with_sregs(ptr addrspace(1) %out, ptr addrspace(1) %in) {
|
|
; GFX7-LABEL: s_mul_u64_sext_with_sregs:
|
|
; GFX7: ; %bb.0:
|
|
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, 0x50
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: s_load_dword s3, s[2:3], 0x0
|
|
; GFX7-NEXT: s_mov_b32 s2, -1
|
|
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX7-NEXT: v_mul_hi_u32 v0, s3, v0
|
|
; GFX7-NEXT: s_ashr_i32 s5, s3, 31
|
|
; GFX7-NEXT: s_mul_i32 s4, s3, 0x50
|
|
; GFX7-NEXT: s_mulk_i32 s5, 0x50
|
|
; GFX7-NEXT: v_readfirstlane_b32 s3, v0
|
|
; GFX7-NEXT: s_add_u32 s5, s5, s3
|
|
; GFX7-NEXT: v_mov_b32_e32 v0, s4
|
|
; GFX7-NEXT: v_mov_b32_e32 v1, s5
|
|
; GFX7-NEXT: s_mov_b32 s3, 0xf000
|
|
; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
|
|
; GFX7-NEXT: s_endpgm
|
|
;
|
|
; GFX8-LABEL: s_mul_u64_sext_with_sregs:
|
|
; GFX8: ; %bb.0:
|
|
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GFX8-NEXT: v_mov_b32_e32 v0, 0x50
|
|
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX8-NEXT: s_load_dword s2, s[2:3], 0x0
|
|
; GFX8-NEXT: v_mov_b32_e32 v3, s1
|
|
; GFX8-NEXT: v_mov_b32_e32 v2, s0
|
|
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX8-NEXT: v_mul_hi_u32 v0, s2, v0
|
|
; GFX8-NEXT: s_ashr_i32 s3, s2, 31
|
|
; GFX8-NEXT: s_mulk_i32 s2, 0x50
|
|
; GFX8-NEXT: s_mulk_i32 s3, 0x50
|
|
; GFX8-NEXT: v_readfirstlane_b32 s4, v0
|
|
; GFX8-NEXT: s_add_u32 s3, s3, s4
|
|
; GFX8-NEXT: v_mov_b32_e32 v0, s2
|
|
; GFX8-NEXT: v_mov_b32_e32 v1, s3
|
|
; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
|
|
; GFX8-NEXT: s_endpgm
|
|
;
|
|
; GFX9-LABEL: s_mul_u64_sext_with_sregs:
|
|
; GFX9: ; %bb.0:
|
|
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GFX9-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NEXT: s_load_dword s3, s[2:3], 0x0
|
|
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX9-NEXT: s_ashr_i32 s4, s3, 31
|
|
; GFX9-NEXT: s_mul_i32 s2, s3, 0x50
|
|
; GFX9-NEXT: s_mul_hi_u32 s3, s3, 0x50
|
|
; GFX9-NEXT: s_mulk_i32 s4, 0x50
|
|
; GFX9-NEXT: s_add_u32 s3, s4, s3
|
|
; GFX9-NEXT: v_mov_b32_e32 v0, s2
|
|
; GFX9-NEXT: v_mov_b32_e32 v1, s3
|
|
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
|
|
; GFX9-NEXT: s_endpgm
|
|
;
|
|
; GFX10-LABEL: s_mul_u64_sext_with_sregs:
|
|
; GFX10: ; %bb.0:
|
|
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
|
|
; GFX10-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-NEXT: s_load_dword s2, s[2:3], 0x0
|
|
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX10-NEXT: s_ashr_i32 s3, s2, 31
|
|
; GFX10-NEXT: s_mul_hi_u32 s4, s2, 0x50
|
|
; GFX10-NEXT: s_mulk_i32 s3, 0x50
|
|
; GFX10-NEXT: s_mulk_i32 s2, 0x50
|
|
; GFX10-NEXT: s_add_i32 s3, s4, s3
|
|
; GFX10-NEXT: v_mov_b32_e32 v0, s2
|
|
; GFX10-NEXT: v_mov_b32_e32 v1, s3
|
|
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
|
|
; GFX10-NEXT: s_endpgm
|
|
;
|
|
; GFX11-LABEL: s_mul_u64_sext_with_sregs:
|
|
; GFX11: ; %bb.0:
|
|
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
|
; GFX11-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0
|
|
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
|
|
; GFX11-NEXT: s_ashr_i32 s3, s2, 31
|
|
; GFX11-NEXT: s_mul_hi_u32 s4, s2, 0x50
|
|
; GFX11-NEXT: s_mulk_i32 s3, 0x50
|
|
; GFX11-NEXT: s_mulk_i32 s2, 0x50
|
|
; GFX11-NEXT: s_add_i32 s3, s4, s3
|
|
; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
|
|
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
|
|
; GFX11-NEXT: s_endpgm
|
|
;
|
|
; GFX12-LABEL: s_mul_u64_sext_with_sregs:
|
|
; GFX12: ; %bb.0:
|
|
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
|
; GFX12-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX12-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0
|
|
; GFX12-NEXT: s_wait_kmcnt 0x0
|
|
; GFX12-NEXT: s_ashr_i32 s3, s2, 31
|
|
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
|
; GFX12-NEXT: s_mul_u64 s[2:3], s[2:3], 0x50
|
|
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
|
|
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
|
|
; GFX12-NEXT: s_endpgm
|
|
;
|
|
; GFX1250-LABEL: s_mul_u64_sext_with_sregs:
|
|
; GFX1250: ; %bb.0:
|
|
; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
|
|
; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
|
|
; GFX1250-NEXT: v_mov_b32_e32 v2, 0
|
|
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0
|
|
; GFX1250-NEXT: s_wait_kmcnt 0x0
|
|
; GFX1250-NEXT: s_ashr_i32 s3, s2, 31
|
|
; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
|
|
; GFX1250-NEXT: s_mul_u64 s[2:3], s[2:3], 0x50
|
|
; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
|
|
; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
|
|
; GFX1250-NEXT: s_endpgm
|
|
%val = load i32, ptr addrspace(1) %in, align 4
|
|
; sext copies bit 31 of %val into the upper 32 bits of %ext, so the high half
; of the product depends on the sign of the loaded value.
%ext = sext i32 %val to i64
|
|
%mul = mul i64 %ext, 80
|
|
store i64 %mul, ptr addrspace(1) %out, align 8
|
|
ret void
|
|
}
|