If the combined shuffle mask requires zero elements, we don't currently have much chance of matching them against the expected source vector. This patch uses the SelectionDAG::MaskedVectorIsZero wrapper to attempt to determine if the expected element we want to use is already known to be zero. I've also tightened up the ExpectedMask assertion to always be in range - we're never giving it a target shuffle mask that has sentinels at all - allowing us to remove some of the confusing bounds checks. This attempts to address some of the regressions uncovered by D129150 where we more aggressively fold shuffles as AND / 'clear' masks which results in more combined shuffles using SM_SentinelZero. Differential Revision: https://reviews.llvm.org/D129207
121 lines
5.4 KiB
LLVM
121 lines
5.4 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
|
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=ALL,AVX
|
|
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX2
|
|
|
|
; https://bugs.llvm.org/show_bug.cgi?id=51615
|
|
; We cannot replace a wide volatile load with a broadcast-from-memory,
|
|
; because that would narrow the load, which isn't legal for volatiles.
|
|
|
|
@g0 = external dso_local global <2 x double>, align 16
|
|
; Two-element volatile source: a volatile <2 x double> load from @g0 whose
; element 0 is splatted into an otherwise-zero <8 x double> and stored.
; In the expected output for both run lines the full 16-byte load stays a
; separate vmovaps from g0(%rip), and the splat (vmovddup with +avx,
; vbroadcastsd with +avx2) reads the loaded register rather than memory.
define void @volatile_load_2_elts() {
|
|
; AVX-LABEL: volatile_load_2_elts:
|
|
; AVX: # %bb.0:
|
|
; AVX-NEXT: vmovaps g0(%rip), %xmm0
|
|
; AVX-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0]
|
|
; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2
|
|
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3,4,5],ymm2[6,7]
|
|
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
|
|
; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1]
|
|
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7]
|
|
; AVX-NEXT: vmovaps %ymm0, (%rax)
|
|
; AVX-NEXT: vmovaps %ymm1, (%rax)
|
|
; AVX-NEXT: vzeroupper
|
|
; AVX-NEXT: retq
|
|
;
|
|
; AVX2-LABEL: volatile_load_2_elts:
|
|
; AVX2: # %bb.0:
|
|
; AVX2-NEXT: vmovaps g0(%rip), %xmm0
|
|
; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0
|
|
; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
|
|
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
|
|
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
|
|
; AVX2-NEXT: vmovaps %ymm0, (%rax)
|
|
; AVX2-NEXT: vmovaps %ymm2, (%rax)
|
|
; AVX2-NEXT: vzeroupper
|
|
; AVX2-NEXT: retq
|
|
; IR under test. %i1 places element 0 of the loaded vector in lanes 1 and 3
; (lanes 0 and 2 are undef). In %shuffle1, mask indices 4-7 select from the
; zeroinitializer operand, so only result lanes 2 and 5 (mask values 3 and 1)
; carry the loaded element; every other lane is zero. The result is stored
; with a volatile store through an undef pointer.
%i = load volatile <2 x double>, ptr @g0, align 16
|
|
%i1 = shufflevector <2 x double> %i, <2 x double> poison, <4 x i32> <i32 undef, i32 0, i32 undef, i32 0>
|
|
%shuffle1 = shufflevector <4 x double> %i1, <4 x double> zeroinitializer, <8 x i32> <i32 6, i32 7, i32 3, i32 6, i32 7, i32 1, i32 7, i32 1>
|
|
store volatile <8 x double> %shuffle1, ptr undef, align 64
|
|
ret void
|
|
}
|
|
|
|
@g1 = external dso_local global <1 x double>, align 16
|
|
; Single-element volatile source: a volatile <1 x double> load from @g1 whose
; only element is splatted into an otherwise-zero <8 x double> and stored.
; Here the expected output for both run lines is a vbroadcastsd directly from
; g1(%rip).
; NOTE(review): presumably this is allowed because the broadcast reads the same
; 8 bytes as the original load, so nothing is narrowed - confirm.
define void @volatile_load_1_elt() {
|
|
; ALL-LABEL: volatile_load_1_elt:
|
|
; ALL: # %bb.0:
|
|
; ALL-NEXT: vbroadcastsd g1(%rip), %ymm0
|
|
; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
|
|
; ALL-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
|
|
; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
|
|
; ALL-NEXT: vmovaps %ymm0, (%rax)
|
|
; ALL-NEXT: vmovaps %ymm2, (%rax)
|
|
; ALL-NEXT: vzeroupper
|
|
; ALL-NEXT: retq
|
|
; IR under test. %i1 places the loaded element in lanes 1 and 3 (lanes 0 and 2
; are undef). In %shuffle1, mask indices 4-7 select from the zeroinitializer
; operand, so only result lanes 2 and 5 carry the loaded element; every other
; lane is zero. The result is stored with a volatile store through an undef
; pointer.
%i = load volatile <1 x double>, ptr @g1, align 16
|
|
%i1 = shufflevector <1 x double> %i, <1 x double> poison, <4 x i32> <i32 undef, i32 0, i32 undef, i32 0>
|
|
%shuffle1 = shufflevector <4 x double> %i1, <4 x double> zeroinitializer, <8 x i32> <i32 6, i32 7, i32 3, i32 6, i32 7, i32 1, i32 7, i32 1>
|
|
store volatile <8 x double> %shuffle1, ptr undef, align 64
|
|
ret void
|
|
}
|
|
|
|
@g2 = external dso_local global <2 x float>, align 16
|
|
; Bitcast variant: a volatile <2 x float> load from @g2 is bitcast to
; <1 x double>, and that single 64-bit element is splatted into an
; otherwise-zero <8 x double> and stored.
; The expected output for both run lines is a vbroadcastsd directly from
; g2(%rip).
; NOTE(review): presumably this is allowed because the broadcast still reads
; the full 8 bytes of the <2 x float> load - confirm.
define void @volatile_load_2_elts_bitcast() {
|
|
; ALL-LABEL: volatile_load_2_elts_bitcast:
|
|
; ALL: # %bb.0:
|
|
; ALL-NEXT: vbroadcastsd g2(%rip), %ymm0
|
|
; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
|
|
; ALL-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7]
|
|
; ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
|
|
; ALL-NEXT: vmovaps %ymm0, (%rax)
|
|
; ALL-NEXT: vmovaps %ymm2, (%rax)
|
|
; ALL-NEXT: vzeroupper
|
|
; ALL-NEXT: retq
|
|
; IR under test. The two loaded floats are reinterpreted as one double (%i).
; %i1 places that element in lanes 1 and 3 (lanes 0 and 2 are undef). In
; %shuffle1, mask indices 4-7 select from the zeroinitializer operand, so only
; result lanes 2 and 5 carry the loaded element; every other lane is zero. The
; result is stored with a volatile store through an undef pointer.
%i0 = load volatile <2 x float>, ptr @g2, align 16
|
|
%i = bitcast <2 x float> %i0 to <1 x double>
|
|
%i1 = shufflevector <1 x double> %i, <1 x double> poison, <4 x i32> <i32 undef, i32 0, i32 undef, i32 0>
|
|
%shuffle1 = shufflevector <4 x double> %i1, <4 x double> zeroinitializer, <8 x i32> <i32 6, i32 7, i32 3, i32 6, i32 7, i32 1, i32 7, i32 1>
|
|
store volatile <8 x double> %shuffle1, ptr undef, align 64
|
|
ret void
|
|
}
|
|
|
|
; Non-volatile case inside a loop nest: the low 32 bits of element 0 of a
; <2 x i64> load from %arg are stored as a scalar to %arg12 and also placed in
; lanes 0 and 4 of an <8 x i32> that is stored to %arg13.
; The expected output reads (%rdi) twice: a scalar movl for the i32 store and a
; vbroadcastss from memory for the vector store.
; The body contains no ret; control only moves between %bb16 and %bb17.
define void @elts_from_consecutive_loads(ptr %arg, ptr %arg12, ptr %arg13, float %arg14, i1 %arg15) {
|
|
; ALL-LABEL: elts_from_consecutive_loads:
|
|
; ALL: # %bb.0: # %bb
|
|
; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
|
|
; ALL-NEXT: .p2align 4, 0x90
|
|
; ALL-NEXT: .LBB3_1: # %bb16
|
|
; ALL-NEXT: # =>This Loop Header: Depth=1
|
|
; ALL-NEXT: # Child Loop BB3_2 Depth 2
|
|
; ALL-NEXT: testb $1, %cl
|
|
; ALL-NEXT: je .LBB3_1
|
|
; ALL-NEXT: .p2align 4, 0x90
|
|
; ALL-NEXT: .LBB3_2: # %bb17
|
|
; ALL-NEXT: # Parent Loop BB3_1 Depth=1
|
|
; ALL-NEXT: # => This Inner Loop Header: Depth=2
|
|
; ALL-NEXT: movl (%rdi), %eax
|
|
; ALL-NEXT: vbroadcastss (%rdi), %ymm2
|
|
; ALL-NEXT: movl %eax, (%rsi)
|
|
; ALL-NEXT: vmovaps %ymm2, (%rdx)
|
|
; ALL-NEXT: vucomiss %xmm1, %xmm0
|
|
; ALL-NEXT: jne .LBB3_2
|
|
; ALL-NEXT: jp .LBB3_2
|
|
; ALL-NEXT: jmp .LBB3_1
|
|
; Entry block: falls straight through to the outer loop header.
bb:
|
|
br label %bb16
|
|
|
|
; Outer loop header: branches back to itself until %arg15 is true, then enters
; the inner loop.
bb16: ; preds = %bb17, %bb16, %bb
|
|
br i1 %arg15, label %bb17, label %bb16
|
|
|
|
; Inner loop body: repeats while %arg14 compares unordered-or-not-equal to 0.0;
; otherwise control returns to %bb16.
bb17: ; preds = %bb17, %bb16
|
|
%tmp = load <2 x i64>, ptr %arg, align 16
|
|
%tmp18 = extractelement <2 x i64> %tmp, i32 0
|
|
%tmp19 = trunc i64 %tmp18 to i32
|
|
store i32 %tmp19, ptr %arg12, align 4
|
|
; Insert the truncated value into lane 0 of an all-ones-valued (i32 1) vector.
%tmp20 = insertelement <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i32 %tmp19, i32 0
|
|
; Lanes 0 and 4 of the result take lane 0 of %tmp20; the remaining lanes are
; undef in the mask.
%tmp21 = shufflevector <8 x i32> %tmp20, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 0, i32 undef, i32 undef, i32 undef>
|
|
store <8 x i32> %tmp21, ptr %arg13, align 32
|
|
%tmp22 = fcmp une float %arg14, 0.000000e+00
|
|
br i1 %tmp22, label %bb17, label %bb16
|
|
}
|