[AMDGPU] Lower S_ABSDIFF_I32 to VALU instructions (#167691)
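Add support for lowering the scalar S_ABSDIFF_I32 instruction to equivalent VALU operations (a subtract, a negation of the difference, and a signed max of the two) when the instruction has to be moved to the VALU.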
This commit is contained in:
parent
7b7a4222c8
commit
4cd836181f
@ -7824,6 +7824,11 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
|
||||
Inst.eraseFromParent();
|
||||
return;
|
||||
|
||||
case AMDGPU::S_ABSDIFF_I32:
|
||||
lowerScalarAbsDiff(Worklist, Inst);
|
||||
Inst.eraseFromParent();
|
||||
return;
|
||||
|
||||
case AMDGPU::S_CBRANCH_SCC0:
|
||||
case AMDGPU::S_CBRANCH_SCC1: {
|
||||
// Clear unused bits of vcc
|
||||
@ -8473,6 +8478,37 @@ void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
|
||||
addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
|
||||
}
|
||||
|
||||
/// Rewrite a scalar S_ABSDIFF_I32 as a short sequence of VALU instructions.
///
/// The following is emitted immediately before \p Inst:
///   Diff    = sub   Src1, Src2
///   NegDiff = sub   0, Diff
///   Abs     = max_i32 Diff, NegDiff
/// Every use of the original destination register is then redirected to Abs
/// and the users of Abs are queued on \p Worklist. \p Inst itself is not
/// erased here.
void SIInstrInfo::lowerScalarAbsDiff(SIInstrWorklist &Worklist,
                                     MachineInstr &Inst) const {
  MachineBasicBlock &Block = *Inst.getParent();
  MachineRegisterInfo &RegInfo = Block.getParent()->getRegInfo();
  MachineBasicBlock::iterator InsertPt = Inst;
  const DebugLoc &Loc = Inst.getDebugLoc();

  MachineOperand &DstOp = Inst.getOperand(0);
  MachineOperand &LHSOp = Inst.getOperand(1);
  MachineOperand &RHSOp = Inst.getOperand(2);

  // All intermediate values and the final result live in 32-bit VGPRs.
  Register DiffReg = RegInfo.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register NegDiffReg = RegInfo.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register AbsReg = RegInfo.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  // Use V_SUB_U32 when the subtarget reports hasAddNoCarry(); otherwise fall
  // back to V_SUB_CO_U32.
  const unsigned SubOpc =
      ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32;
  const MCInstrDesc &SubDesc = get(SubOpc);

  // Diff = Src1 - Src2.
  // NOTE(review): getReg() assumes both sources are register operands, and the
  // operands are passed through without legalization -- confirm that neither
  // an immediate nor an SGPR in the second source position can reach here.
  BuildMI(Block, InsertPt, Loc, SubDesc, DiffReg)
      .addReg(LHSOp.getReg())
      .addReg(RHSOp.getReg());

  // NegDiff = 0 - Diff.
  BuildMI(Block, InsertPt, Loc, SubDesc, NegDiffReg)
      .addImm(0)
      .addReg(DiffReg);

  // Abs = signed max(Diff, NegDiff).
  BuildMI(Block, InsertPt, Loc, get(AMDGPU::V_MAX_I32_e64), AbsReg)
      .addReg(DiffReg)
      .addReg(NegDiffReg);

  // Hand the new VGPR result to all former users of the scalar destination
  // and schedule them for conversion as well.
  RegInfo.replaceRegWith(DstOp.getReg(), AbsReg);
  addUsersToMoveToVALUWorklist(AbsReg, RegInfo, Worklist);
}
|
||||
|
||||
void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist,
|
||||
MachineInstr &Inst) const {
|
||||
MachineBasicBlock &MBB = *Inst.getParent();
|
||||
|
||||
@ -136,6 +136,8 @@ private:
|
||||
|
||||
void lowerScalarAbs(SIInstrWorklist &Worklist, MachineInstr &Inst) const;
|
||||
|
||||
/// Lower S_ABSDIFF_I32 to VALU instructions: a subtract, a negation of the
/// difference, and a signed max of the two.
void lowerScalarAbsDiff(SIInstrWorklist &Worklist, MachineInstr &Inst) const;
|
||||
|
||||
void lowerScalarXnor(SIInstrWorklist &Worklist, MachineInstr &Inst) const;
|
||||
|
||||
void splitScalarNotBinop(SIInstrWorklist &Worklist, MachineInstr &Inst,
|
||||
|
||||
@ -1,6 +1,44 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck %s
|
||||
|
||||
|
||||
define amdgpu_gs float @absdiff_valu_input_regression() {
|
||||
; CHECK-LABEL: absdiff_valu_input_regression:
|
||||
; CHECK: ; %bb.0: ; %bb
|
||||
; CHECK-NEXT: s_mov_b32 s0, 0
|
||||
; CHECK-NEXT: .LBB0_1: ; %bb1
|
||||
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: s_mov_b32 s1, s0
|
||||
; CHECK-NEXT: s_or_b32 s0, s0, 1
|
||||
; CHECK-NEXT: s_cmp_gt_i32 s1, 0
|
||||
; CHECK-NEXT: s_cbranch_scc1 .LBB0_1
|
||||
; CHECK-NEXT: ; %bb.2: ; %bb11
|
||||
; CHECK-NEXT: v_med3_i32 v0, s1, 0, 1
|
||||
; CHECK-NEXT: v_sub_u32_e32 v0, 0, v0
|
||||
; CHECK-NEXT: v_sub_u32_e32 v1, 0, v0
|
||||
; CHECK-NEXT: v_max_i32_e32 v0, v0, v1
|
||||
; CHECK-NEXT: ; return to shader part epilog
|
||||
bb:
|
||||
br label %bb1
|
||||
|
||||
bb1: ; preds = %bb1, %bb
|
||||
%i = phi i32 [ 0, %bb ], [ %i9, %bb1 ]
|
||||
%i2 = phi i32 [ 0, %bb ], [ %i5, %bb1 ]
|
||||
%i3 = or i32 %i2, 1
|
||||
%i4 = or i32 %i3, 0
|
||||
%i5 = call i32 @llvm.smax.i32(i32 %i, i32 0)
|
||||
%i6 = call i32 @llvm.umin.i32(i32 %i5, i32 1)
|
||||
%i7 = sub i32 0, %i6
|
||||
%i8 = call i32 @llvm.abs.i32(i32 %i7, i1 false)
|
||||
%i9 = or i32 %i, 1
|
||||
%i10 = icmp sgt i32 %i, 0
|
||||
br i1 %i10, label %bb1, label %bb11
|
||||
|
||||
bb11: ; preds = %bb1
|
||||
%i12 = bitcast i32 %i8 to float
|
||||
ret float %i12
|
||||
}
|
||||
|
||||
define amdgpu_ps i16 @absdiff_i16_false(i16 inreg %arg0, i16 inreg %arg1) {
|
||||
; CHECK-LABEL: absdiff_i16_false:
|
||||
; CHECK: ; %bb.0:
|
||||
|
||||
31
llvm/test/CodeGen/AMDGPU/move-to-valu-absdiff.mir
Normal file
31
llvm/test/CodeGen/AMDGPU/move-to-valu-absdiff.mir
Normal file
@ -0,0 +1,31 @@
|
||||
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
|
||||
# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=si-fix-sgpr-copies -o - %s | FileCheck --check-prefix=GFX8 %s
|
||||
# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass=si-fix-sgpr-copies -o - %s | FileCheck --check-prefix=GFX12 %s
|
||||
|
||||
---
|
||||
name: absdiff_i32
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0, $vgpr1, $vgpr2
|
||||
; GFX8-LABEL: name: absdiff_i32
|
||||
; GFX8: liveins: $vgpr0, $vgpr1, $vgpr2
|
||||
; GFX8-NEXT: {{ $}}
|
||||
; GFX8-NEXT: [[V_LSHL_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_LSHL_ADD_U32_e64 $vgpr0, $vgpr1, $vgpr2, implicit $exec
|
||||
; GFX8-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 10
|
||||
; GFX8-NEXT: [[V_SUB_CO_U32_e32_:%[0-9]+]]:vgpr_32 = V_SUB_CO_U32_e32 [[S_MOV_B32_]], [[V_LSHL_ADD_U32_e64_]], implicit-def $vcc, implicit $exec
|
||||
; GFX8-NEXT: [[V_SUB_CO_U32_e32_1:%[0-9]+]]:vgpr_32 = V_SUB_CO_U32_e32 0, [[V_SUB_CO_U32_e32_]], implicit-def $vcc, implicit $exec
|
||||
; GFX8-NEXT: [[V_MAX_I32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_I32_e64 [[V_SUB_CO_U32_e32_]], [[V_SUB_CO_U32_e32_1]], implicit $exec
|
||||
;
|
||||
; GFX12-LABEL: name: absdiff_i32
|
||||
; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2
|
||||
; GFX12-NEXT: {{ $}}
|
||||
; GFX12-NEXT: [[V_LSHL_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_LSHL_ADD_U32_e64 $vgpr0, $vgpr1, $vgpr2, implicit $exec
|
||||
; GFX12-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 10
|
||||
; GFX12-NEXT: [[V_SUB_U32_e32_:%[0-9]+]]:vgpr_32 = V_SUB_U32_e32 [[S_MOV_B32_]], [[V_LSHL_ADD_U32_e64_]], implicit $exec
|
||||
; GFX12-NEXT: [[V_SUB_U32_e32_1:%[0-9]+]]:vgpr_32 = V_SUB_U32_e32 0, [[V_SUB_U32_e32_]], implicit $exec
|
||||
; GFX12-NEXT: [[V_MAX_I32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_I32_e64 [[V_SUB_U32_e32_]], [[V_SUB_U32_e32_1]], implicit $exec
|
||||
%0:vgpr_32 = V_LSHL_ADD_U32_e64 $vgpr0, $vgpr1, $vgpr2, implicit $exec
|
||||
%1:sreg_32 = COPY %0:vgpr_32
|
||||
%2:sreg_32 = S_MOV_B32 10
|
||||
%3:sreg_32 = S_ABSDIFF_I32 killed %2:sreg_32, %1:sreg_32, implicit-def dead $scc
|
||||
...
|
||||
Loading…
x
Reference in New Issue
Block a user