
This extends the existing fold, which concatenates X and Y if they are sequential subvectors extracted from the same source. By using combineConcatVectorOps we can recognise other patterns where X and Y can be concatenated for free (e.g. sequential loads, concatenating repeated instructions, etc.), which allows the VPERMV3 fold to be much more aggressive.

This required combineConcatVectorOps to be extended to handle the additional case of "concat(extract_subvector(x,lo), extract_subvector(x,hi)) -> extract_subvector(x)", similar to the original VPERMV3 fold, where "x" is larger than the concat result type.

This also exposes more cases where we have repeated vector/subvector loads with multiple uses - e.g. where we load a ymm and the lo/hi xmm halves independently. In the past we've always considered this to be relatively benign, but I'm not certain whether we should now do more to keep these loads from being split.
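For illustration, here is a minimal IR sketch (my own example, not taken from the patch) of the pattern the new combineConcatVectorOps case recognises - concatenating two adjacent subvectors of a wider source is really just a single, wider extract_subvector, so no actual concatenation is needed:

; Hypothetical sketch: %lo and %hi are adjacent <4 x i32> subvectors of a
; <16 x i32> source, so concat(%lo, %hi) == extract_subvector(%x, 4).
define <8 x i32> @concat_adjacent_subvectors(<16 x i32> %x) {
  %lo = shufflevector <16 x i32> %x, <16 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %hi = shufflevector <16 x i32> %x, <16 x i32> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
  %cat = shufflevector <4 x i32> %lo, <4 x i32> %hi, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i32> %cat
}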
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver4 | FileCheck %s

define <2 x i32> @PR97968(<16 x i32> %a0) {
; CHECK-LABEL: PR97968:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [2,7,2,7]
; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %sub0 = shufflevector <16 x i32> %a0, <16 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %sub1 = shufflevector <16 x i32> %a0, <16 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %elt0 = extractelement <4 x i32> %sub0, i64 2
  %elt7 = extractelement <4 x i32> %sub1, i64 3
  %scl0 = insertelement <2 x i32> undef, i32 %elt0, i32 0
  %scl1 = insertelement <2 x i32> %scl0, i32 %elt7, i32 1
  ret <2 x i32> %scl1
}
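As a sketch of the "sequential loads" case mentioned above (a hypothetical example of my own, not part of this patch), a two-source cross-lane shuffle of adjacent loads should now be foldable to a single wider load plus a one-source permute:

; Hypothetical example: %x and %y are adjacent 32-byte loads from the same
; base pointer, so they can be concatenated into a single 64-byte load for
; free, letting the two-source VPERMV3 shuffle (vpermt2d) become a
; one-source VPERMV shuffle (vpermd) of the wider load.
define <8 x i32> @concat_seq_loads(ptr %p) {
  %p.hi = getelementptr inbounds i8, ptr %p, i64 32
  %x = load <8 x i32>, ptr %p, align 32
  %y = load <8 x i32>, ptr %p.hi, align 32
  %shuf = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7, i32 8>
  ret <8 x i32> %shuf
}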