llvm-project/llvm/test/CodeGen/X86/fma-commute-loop.ll
Craig Topper b6e2796114 [X86][TwoAddressInstructionPass] Teach tryInstructionCommute to continue checking for commutable FMA operands in more cases.
Previously we would only check for another commutable operand if the first commute was an aggressive commute.

But if the instruction has two kill operands and neither is tied to the def at the start, we should consider either of them as a candidate to become the new def.

This improves the loop in the fma-commute-loop.ll test, which is derived from a Discourse post: https://llvm.discourse.group/t/unnecessary-vmovapd-instructions-generated-can-you-hint-in-favor-of-vfmadd231pd/582
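
For reference, the FMA3 suffix digits give the operand order in dst = opA * opB + opC, where operand 1 is the destination; only the 231 form uses the destination as the addend. A quick summary in AT&T syntax (background added here, not part of the original commit message):

    vfmadd132pd %zmm2, %zmm1, %zmm0    ## zmm0 = (zmm0 * zmm2) + zmm1
    vfmadd213pd %zmm2, %zmm1, %zmm0    ## zmm0 = (zmm1 * zmm0) + zmm2
    vfmadd231pd %zmm2, %zmm1, %zmm0    ## zmm0 = (zmm1 * zmm2) + zmm0

In an accumulation loop the 231 form is the profitable one: the destination is the accumulator itself, so the loop-carried value never has to be moved.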

Differential Revision: https://reviews.llvm.org/D75016
2020-03-01 16:38:08 -08:00
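
As a rough illustration of the difference this makes (the "before" sequence and register names are hypothetical, not the literal pre-patch output): if the two-address pass ties the FMA def to a multiplicand that is still live, the register allocator has to shuffle values through an extra register, whereas tying the def to the killed accumulator needs no copies at all.

    ## hypothetical "before": def tied to a still-live multiplicand
    vmovapd %zmm6, %zmm10                ## preserve the multiplicand for its later uses
    vfmadd213pd %zmm0, %zmm9, %zmm10     ## zmm10 = (zmm9 * zmm10) + zmm0
    vmovapd %zmm10, %zmm0                ## result back into the accumulator

    ## "after": def tied to the killed accumulator, as in the loop below
    vfmadd231pd %zmm9, %zmm6, %zmm0      ## zmm0 = (zmm6 * zmm9) + zmm0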


; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512f | FileCheck %s
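; All six FMAs in the loop should tie their def to the killed loop-carried
; accumulator, selecting the vfmadd231pd form and avoiding the extra vmovapd
; copies described in https://reviews.llvm.org/D75016.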
define void @eggs(<8 x double>* %arg, <8 x double>* %arg1, <8 x double>* %arg2, <8 x double>* %arg3, <8 x double>* %arg4, <8 x double>* %arg5, i64 %arg6, i64 %arg7, i64 %arg8, i64 %arg9, i64 %arg10, i64 %arg11, i64 %arg12, double* %arg13, double* %arg14) nounwind {
; CHECK-LABEL: eggs:
; CHECK: ## %bb.0: ## %bb
; CHECK-NEXT: pushq %r15
; CHECK-NEXT: pushq %r14
; CHECK-NEXT: pushq %r13
; CHECK-NEXT: pushq %r12
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r11
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r12
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r15
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r14
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rbx
; CHECK-NEXT: leaq (%rbx,%r14,8), %r14
; CHECK-NEXT: leaq (%rbx,%r15,8), %r15
; CHECK-NEXT: vxorpd %xmm0, %xmm0, %xmm0
; CHECK-NEXT: xorl %ebx, %ebx
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r13
; CHECK-NEXT: addq %r12, %r13
; CHECK-NEXT: addq {{[0-9]+}}(%rsp), %r12
; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
; CHECK-NEXT: vxorpd %xmm5, %xmm5, %xmm5
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: LBB0_1: ## %bb15
; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmovupd (%rax,%r11,8), %zmm6
; CHECK-NEXT: vmovupd (%rax,%r13,8), %zmm7
; CHECK-NEXT: vmovupd (%rax,%r12,8), %zmm8
; CHECK-NEXT: vbroadcastsd (%r15,%rbx,8), %zmm9
; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm0 = (zmm6 * zmm9) + zmm0
; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm1 = (zmm7 * zmm9) + zmm1
; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm2 = (zmm8 * zmm9) + zmm2
; CHECK-NEXT: vbroadcastsd (%r14,%rbx,8), %zmm9
; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm3 = (zmm9 * zmm6) + zmm3
; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm4 = (zmm9 * zmm7) + zmm4
; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm5 = (zmm8 * zmm9) + zmm5
; CHECK-NEXT: incq %rbx
; CHECK-NEXT: cmpq %rbx, %r10
; CHECK-NEXT: jne LBB0_1
; CHECK-NEXT: ## %bb.2: ## %bb51
; CHECK-NEXT: vmovapd %zmm0, (%rdi)
; CHECK-NEXT: vmovapd %zmm1, (%rsi)
; CHECK-NEXT: vmovapd %zmm2, (%rdx)
; CHECK-NEXT: vmovapd %zmm3, (%rcx)
; CHECK-NEXT: vmovapd %zmm4, (%r8)
; CHECK-NEXT: vmovapd %zmm5, (%r9)
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: popq %r12
; CHECK-NEXT: popq %r13
; CHECK-NEXT: popq %r14
; CHECK-NEXT: popq %r15
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
bb:
br label %bb15
bb15: ; preds = %bb15, %bb
%tmp = phi <8 x double> [ zeroinitializer, %bb ], [ %tmp38, %bb15 ]
%tmp16 = phi <8 x double> [ zeroinitializer, %bb ], [ %tmp39, %bb15 ]
%tmp17 = phi <8 x double> [ zeroinitializer, %bb ], [ %tmp40, %bb15 ]
%tmp18 = phi <8 x double> [ zeroinitializer, %bb ], [ %tmp46, %bb15 ]
%tmp19 = phi <8 x double> [ zeroinitializer, %bb ], [ %tmp47, %bb15 ]
%tmp20 = phi <8 x double> [ zeroinitializer, %bb ], [ %tmp48, %bb15 ]
%tmp21 = phi i64 [ 0, %bb ], [ %tmp49, %bb15 ]
%tmp22 = getelementptr inbounds double, double* %arg14, i64 %arg11
%tmp23 = bitcast double* %tmp22 to <8 x double>*
%tmp24 = load <8 x double>, <8 x double>* %tmp23, align 8
%tmp25 = add i64 %arg10, %arg6
%tmp26 = getelementptr inbounds double, double* %arg14, i64 %tmp25
%tmp27 = bitcast double* %tmp26 to <8 x double>*
%tmp28 = load <8 x double>, <8 x double>* %tmp27, align 8
%tmp29 = add i64 %arg10, %arg7
%tmp30 = getelementptr inbounds double, double* %arg14, i64 %tmp29
%tmp31 = bitcast double* %tmp30 to <8 x double>*
%tmp32 = load <8 x double>, <8 x double>* %tmp31, align 8
%tmp33 = add i64 %tmp21, %arg8
%tmp34 = getelementptr inbounds double, double* %arg13, i64 %tmp33
%tmp35 = load double, double* %tmp34, align 8
%tmp36 = insertelement <8 x double> undef, double %tmp35, i32 0
%tmp37 = shufflevector <8 x double> %tmp36, <8 x double> undef, <8 x i32> zeroinitializer
%tmp38 = call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %tmp24, <8 x double> %tmp37, <8 x double> %tmp)
%tmp39 = call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %tmp28, <8 x double> %tmp37, <8 x double> %tmp16)
%tmp40 = call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %tmp32, <8 x double> %tmp37, <8 x double> %tmp17)
%tmp41 = add i64 %tmp21, %arg9
%tmp42 = getelementptr inbounds double, double* %arg13, i64 %tmp41
%tmp43 = load double, double* %tmp42, align 8
%tmp44 = insertelement <8 x double> undef, double %tmp43, i32 0
%tmp45 = shufflevector <8 x double> %tmp44, <8 x double> undef, <8 x i32> zeroinitializer
%tmp46 = call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %tmp24, <8 x double> %tmp45, <8 x double> %tmp18)
%tmp47 = call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %tmp28, <8 x double> %tmp45, <8 x double> %tmp19)
%tmp48 = call <8 x double> @llvm.fmuladd.v8f64(<8 x double> %tmp32, <8 x double> %tmp45, <8 x double> %tmp20)
%tmp49 = add nuw nsw i64 %tmp21, 1
%tmp50 = icmp eq i64 %tmp49, %arg12
br i1 %tmp50, label %bb51, label %bb15
bb51: ; preds = %bb15
store <8 x double> %tmp38, <8 x double>* %arg
store <8 x double> %tmp39, <8 x double>* %arg1
store <8 x double> %tmp40, <8 x double>* %arg2
store <8 x double> %tmp46, <8 x double>* %arg3
store <8 x double> %tmp47, <8 x double>* %arg4
store <8 x double> %tmp48, <8 x double>* %arg5
ret void
}
declare <8 x double> @llvm.fmuladd.v8f64(<8 x double>, <8 x double>, <8 x double>)