getVectorInstrCostHelper would return costs of zero for vector inserts/extracts that move data between GPR and vector registers, if there was no 'real' use, i.e. there was no corresponding existing instruction. This meant that passes like LoopVectorize and SLPVectorizer, which likely are the main users of the interface, would underestimate the cost of insert/extracts that move data between GPR and vector registers, which have non-trivial costs. The patch removes the special case and only returns costs of zero for lane 0 if there is no need to transfer between integer and vector registers. This impacts a number of SLP tests, and most of them look like general improvements. I think the change should make things more accurate for any AArch64 target, but if not it could also just be Apple CPU specific. I am seeing +2% end-to-end improvements on SLP-heavy workloads. PR: https://github.com/llvm/llvm-project/pull/146526
78 lines
4.4 KiB
LLVM
78 lines
4.4 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
; RUN: opt -mtriple=arm64-apple-ios -S -passes=slp-vectorizer < %s | FileCheck %s

; vectorization requires a vector GEP + extracts, but the cost is offset by being able to efficiently vectorize the rest of the tree
;
; The function builds four independent chains. Chain k (k = 0..3) loads an i32
; from %base1[k] and from %base2[k], zero-extends both to i64, subtracts them,
; and uses the difference as an i32-element index into %base_gep. The four
; resulting pointers are passed to @use_4.
;
; NOTE(review): the CHECK lines below match fully scalar output (the same
; instruction sequence as the input), while the comment above and the function
; name describe the vectorized form. Confirm which outcome is intended before
; relying on either; the assertions look like they were regenerated after a
; cost-model change.
define void @should_vectorize_gep(ptr %base1, ptr %base2, ptr %base_gep) {
; CHECK-LABEL: define void @should_vectorize_gep
; CHECK-SAME: (ptr [[BASE1:%.*]], ptr [[BASE2:%.*]], ptr [[BASE_GEP:%.*]]) {
; CHECK-NEXT: bb:
; CHECK-NEXT: [[LOAD1:%.*]] = load i32, ptr [[BASE1]], align 2
; CHECK-NEXT: [[ZEXT1:%.*]] = zext i32 [[LOAD1]] to i64
; CHECK-NEXT: [[LOAD2:%.*]] = load i32, ptr [[BASE2]], align 2
; CHECK-NEXT: [[ZEXT2:%.*]] = zext i32 [[LOAD2]] to i64
; CHECK-NEXT: [[SUB:%.*]] = sub i64 [[ZEXT1]], [[ZEXT2]]
; CHECK-NEXT: [[GETELEMENTPTR_RES_1:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[SUB]]
; CHECK-NEXT: [[GETELEMENTPTR1:%.*]] = getelementptr i32, ptr [[BASE1]], i64 1
; CHECK-NEXT: [[GETELEMENTPTR2:%.*]] = getelementptr i32, ptr [[BASE2]], i64 1
; CHECK-NEXT: [[LOAD3:%.*]] = load i32, ptr [[GETELEMENTPTR1]], align 2
; CHECK-NEXT: [[ZEXT3:%.*]] = zext i32 [[LOAD3]] to i64
; CHECK-NEXT: [[LOAD4:%.*]] = load i32, ptr [[GETELEMENTPTR2]], align 2
; CHECK-NEXT: [[ZEXT4:%.*]] = zext i32 [[LOAD4]] to i64
; CHECK-NEXT: [[SUB2:%.*]] = sub i64 [[ZEXT3]], [[ZEXT4]]
; CHECK-NEXT: [[GETELEMENTPTR_RES_2:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[SUB2]]
; CHECK-NEXT: [[GETELEMENTPTR3:%.*]] = getelementptr i32, ptr [[BASE1]], i64 2
; CHECK-NEXT: [[GETELEMENTPTR4:%.*]] = getelementptr i32, ptr [[BASE2]], i64 2
; CHECK-NEXT: [[LOAD5:%.*]] = load i32, ptr [[GETELEMENTPTR3]], align 2
; CHECK-NEXT: [[ZEXT5:%.*]] = zext i32 [[LOAD5]] to i64
; CHECK-NEXT: [[LOAD6:%.*]] = load i32, ptr [[GETELEMENTPTR4]], align 2
; CHECK-NEXT: [[ZEXT6:%.*]] = zext i32 [[LOAD6]] to i64
; CHECK-NEXT: [[SUB3:%.*]] = sub i64 [[ZEXT5]], [[ZEXT6]]
; CHECK-NEXT: [[GETELEMENTPTR_RES_3:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[SUB3]]
; CHECK-NEXT: [[GETELEMENTPTR5:%.*]] = getelementptr i32, ptr [[BASE1]], i64 3
; CHECK-NEXT: [[GETELEMENTPTR6:%.*]] = getelementptr i32, ptr [[BASE2]], i64 3
; CHECK-NEXT: [[LOAD7:%.*]] = load i32, ptr [[GETELEMENTPTR5]], align 2
; CHECK-NEXT: [[ZEXT7:%.*]] = zext i32 [[LOAD7]] to i64
; CHECK-NEXT: [[LOAD8:%.*]] = load i32, ptr [[GETELEMENTPTR6]], align 2
; CHECK-NEXT: [[ZEXT8:%.*]] = zext i32 [[LOAD8]] to i64
; CHECK-NEXT: [[SUB4:%.*]] = sub i64 [[ZEXT7]], [[ZEXT8]]
; CHECK-NEXT: [[GETELEMENTPTR_RES_4:%.*]] = getelementptr i32, ptr [[BASE_GEP]], i64 [[SUB4]]
; CHECK-NEXT: call void @use_4(ptr [[GETELEMENTPTR_RES_1]], ptr [[GETELEMENTPTR_RES_2]], ptr [[GETELEMENTPTR_RES_3]], ptr [[GETELEMENTPTR_RES_4]])
; CHECK-NEXT: ret void
;
bb:
  ; Chain 0: index = zext(base1[0]) - zext(base2[0]); no offset GEP is needed
  ; because the base pointers are loaded from directly.
  %load1 = load i32, ptr %base1, align 2
  %zext1 = zext i32 %load1 to i64
  %load2 = load i32, ptr %base2, align 2
  %zext2 = zext i32 %load2 to i64
  %sub = sub i64 %zext1, %zext2
  %getelementptr.res.1 = getelementptr i32, ptr %base_gep, i64 %sub

  ; Chain 1: index = zext(base1[1]) - zext(base2[1]).
  %getelementptr1 = getelementptr i32, ptr %base1, i64 1
  %getelementptr2 = getelementptr i32, ptr %base2, i64 1
  %load3 = load i32, ptr %getelementptr1, align 2
  %zext3 = zext i32 %load3 to i64
  %load4 = load i32, ptr %getelementptr2, align 2
  %zext4 = zext i32 %load4 to i64
  %sub2 = sub i64 %zext3, %zext4
  %getelementptr.res.2 = getelementptr i32, ptr %base_gep, i64 %sub2

  ; Chain 2: index = zext(base1[2]) - zext(base2[2]).
  %getelementptr3 = getelementptr i32, ptr %base1, i64 2
  %getelementptr4 = getelementptr i32, ptr %base2, i64 2
  %load5 = load i32, ptr %getelementptr3, align 2
  %zext5 = zext i32 %load5 to i64
  %load6 = load i32, ptr %getelementptr4, align 2
  %zext6 = zext i32 %load6 to i64
  %sub3 = sub i64 %zext5, %zext6
  %getelementptr.res.3 = getelementptr i32, ptr %base_gep, i64 %sub3

  ; Chain 3: index = zext(base1[3]) - zext(base2[3]).
  %getelementptr5 = getelementptr i32, ptr %base1, i64 3
  %getelementptr6 = getelementptr i32, ptr %base2, i64 3
  %load7 = load i32, ptr %getelementptr5, align 2
  %zext7 = zext i32 %load7 to i64
  %load8 = load i32, ptr %getelementptr6, align 2
  %zext8 = zext i32 %load8 to i64
  %sub4 = sub i64 %zext7, %zext8
  %getelementptr.res.4 = getelementptr i32, ptr %base_gep, i64 %sub4

  ; All four computed pointers are consumed by a single external call.
  call void @use_4(ptr %getelementptr.res.1, ptr %getelementptr.res.2, ptr %getelementptr.res.3, ptr %getelementptr.res.4)
  ret void
}

; External function taking the four computed pointers; only declared here.
declare void @use_4(ptr, ptr, ptr, ptr)