Harsha Jagasia 2b1c6df5a6 [Hexagon] Performance regression with b2b
For code below:
        {
                r7 = addasl(r3,r0,#2)
                r8 = addasl(r3,r2,#2)
                r5 = memw(r3+r0<<#2)
                r6 = memw(r3+r2<<#2)
        }
        {
                p1 = cmp.gtu(r6,r5)
                if (p1.new) memw(r8+#0) = r5
                if (p1.new) memw(r7+#0) = r6
        }
        {
                r0 = mux(p1,r2,r4)

        }

In packetizer, a new packet is created for the cmp instruction since
there arent enough resources in previous packet. Also it is determined
that the cmp stalls by 2 cycles since it depends on the prior load of r5.
In current packetizer implementation, the predicated store is evaluated
for whether it can go in the same packet as compare, and since the compare
stalls, the stall of the predicated store does not matter and it can go in
the same packet as the cmp. However the predicated store will stall for
more cycles because of its dependence on the addasl instruction and to
avoid that stall we can put it in a new packet.

Improve the packetizer to check if an instruction being added to packet
will stall longer than instruction already in packet and if so create a
new packet.
2022-01-04 14:09:47 -08:00

65 lines
2.4 KiB
LLVM

; RUN: llc -march=hexagon -O3 < %s | FileCheck %s
; if instruction being considered for addition to packet has higher latency,
; end existing packet and start a new one.
; CHECK: .LBB0_4:
; CHECK: p{{[0-3]+}} = cmp.gtu(r{{[0-9]+}},r{{[0-9]+}})
; CHECK-NEXT: }
@array = external dso_local local_unnamed_addr global i32*, align 4
; Function Attrs: nofree norecurse nounwind
define dso_local void @NumSift(i32 %i, i32 %j) local_unnamed_addr #0 {
entry:
%add36 = shl i32 %i, 1
%cmp.not37 = icmp ugt i32 %add36, %j
br i1 %cmp.not37, label %while.end, label %while.body.lr.ph
while.body.lr.ph: ; preds = %entry
%0 = load i32*, i32** @array, align 4
%add16 = add i32 %j, 1
br label %while.body
while.body: ; preds = %while.body.lr.ph, %if.end17
%add39 = phi i32 [ %add36, %while.body.lr.ph ], [ %add, %if.end17 ]
%i.addr.038 = phi i32 [ %i, %while.body.lr.ph ], [ %i.addr.1, %if.end17 ]
%cmp2 = icmp ult i32 %add39, %j
br i1 %cmp2, label %if.then, label %if.end7
if.then: ; preds = %while.body
%arrayidx = getelementptr inbounds i32, i32* %0, i32 %add39
%1 = load i32, i32* %arrayidx, align 4
%add3 = or i32 %add39, 1
%arrayidx4 = getelementptr inbounds i32, i32* %0, i32 %add3
%2 = load i32, i32* %arrayidx4, align 4
%cmp5 = icmp ult i32 %1, %2
%spec.select = select i1 %cmp5, i32 %add3, i32 %add39
br label %if.end7
if.end7: ; preds = %if.then, %while.body
%k.0 = phi i32 [ %add39, %while.body ], [ %spec.select, %if.then ]
%arrayidx8 = getelementptr inbounds i32, i32* %0, i32 %i.addr.038
%3 = load i32, i32* %arrayidx8, align 4
%arrayidx9 = getelementptr inbounds i32, i32* %0, i32 %k.0
%4 = load i32, i32* %arrayidx9, align 4
%cmp10 = icmp ult i32 %3, %4
br i1 %cmp10, label %if.then11, label %if.end17
if.then11: ; preds = %if.end7
store i32 %3, i32* %arrayidx9, align 4
store i32 %4, i32* %arrayidx8, align 4
br label %if.end17
if.end17: ; preds = %if.end7, %if.then11
%i.addr.1 = phi i32 [ %k.0, %if.then11 ], [ %add16, %if.end7 ]
%add = shl i32 %i.addr.1, 1
%cmp.not = icmp ugt i32 %add, %j
br i1 %cmp.not, label %while.end, label %while.body
while.end: ; preds = %if.end17, %entry
ret void
}
attributes #0 = { "target-cpu"="hexagonv65" }