This change enables the LoadStoreVectorizer to merge and vectorize
contiguous chains even when their scalar element types differ, as long
as the total bitwidth matches. To do so, we rebase offsets between
chains, normalize value types to a common integer type, and insert the
necessary casts around loads and stores. This uncovers more
vectorization opportunities and explains the expected codegen updates
across AMDGPU tests.
Key changes:
- Chain merging
  - Build contiguous subchains and then merge adjacent ones when:
    - They refer to the same underlying pointer object and address space.
    - They are either all loads or all stores.
    - A constant leader-to-leader delta exists.
    - Rebasing one chain into the other's coordinate space does not make
      the chains overlap.
    - All elements have equal total bit width.
  - Rebase the second chain by the computed delta and append it to the
    first.
- Type normalization and casting
  - Normalize merged chains to a common integer type sized to the total
    bits.
  - For loads: create a new load of the normalized type, copy metadata,
    and cast back to the original type for uses if needed.
  - For stores: bitcast the value to the normalized type and store that.
  - Insert zext/trunc for integer size changes; use bit-or-pointer casts
    when sizes match.
- Cleanups
  - Erase replaced instructions and DCE pointer operands when safe.
- New helpers: computeLeaderDelta, chainsOverlapAfterRebase,
rebaseChain, normalizeChainToType, and allElemsMatchTotalBits.
Impact:
- Increases vectorization opportunities across mixed-typed but
size-compatible access chains.
- Large set of expected AMDGPU codegen diffs due to more/changed
vectorization.
This PR resolves #97715.
216 lines
8.8 KiB
LLVM
216 lines
8.8 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
|
|
; RUN: opt -S -passes=instcombine %s | FileCheck %s
|
|
|
|
; Constant source array (8 x i32, values 0..7). The i32 tests below memcpy it
; into a local alloca and then load from that alloca.
@test.data = private unnamed_addr constant [8 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7], align 4
|
|
; Constant source array of 8 null pointers, used the same way by the
; pointer-typed tests below.
@test.ptrdata = private unnamed_addr constant [8 x ptr] [ptr null, ptr null, ptr null, ptr null, ptr null, ptr null, ptr null, ptr null], align 8
|
|
|
|
; Verify that InstCombine copies range metadata when cloning a load as part of
|
|
; replacing an alloca initialized via memcpy from a constant. OK
|
|
; Input pattern: %data is an alloca filled by a 32-byte memcpy from the
; constant global @test.data and afterwards only indexed and read.
; Expected output: the alloca and memcpy are gone, the GEP indexes @test.data
; directly, and the load still carries its !range attachment.
define i32 @copy_range_metadata_after_memcpy(i64 %x) {
|
|
; CHECK-LABEL: define i32 @copy_range_metadata_after_memcpy(
|
|
; CHECK-SAME: i64 [[X:%.*]]) {
|
|
; CHECK-NEXT: [[ENTRY:.*:]]
|
|
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr @test.data, i64 [[X]]
|
|
; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !range [[RNG0:![0-9]+]]
|
|
; CHECK-NEXT: ret i32 [[L]]
|
|
;
|
|
entry:
|
|
%data = alloca [8 x i32], align 4
|
|
; Copy all 8 x i32 (32 bytes) of the constant into the local buffer.
call void @llvm.memcpy.p0.p0.i64(ptr align 4 %data, ptr align 4 @test.data, i64 32, i1 false)
|
|
%arrayidx = getelementptr inbounds [8 x i32], ptr %data, i64 0, i64 %x
|
|
; The load carrying the metadata under test (!range !0).
%l = load i32, ptr %arrayidx, align 4, !range !0
|
|
ret i32 %l
|
|
}
|
|
|
|
; memcpy intrinsic used by the tests here to initialize each alloca:
; (destination ptr, source ptr, i64 byte count, i1 flag -- always passed as
; false by the calls in this file).
declare void @llvm.memcpy.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i1)
|
|
|
|
; Operand of the !range attachment on the load in
; @copy_range_metadata_after_memcpy.
!0 = !{i32 0, i32 100}
|
|
|
|
; Verify TBAA metadata on a cloned load is preserved. OK
|
|
; Input pattern: %data is an alloca filled by a 32-byte memcpy from
; @test.data; the loaded element is stored through %sink and also returned.
; Expected output: the GEP indexes @test.data directly, the load keeps its
; !tbaa attachment, and the store to %sink is unchanged.
define i32 @copy_tbaa_metadata_after_memcpy(i64 %x, ptr %sink) {
|
|
; CHECK-LABEL: define i32 @copy_tbaa_metadata_after_memcpy(
|
|
; CHECK-SAME: i64 [[X:%.*]], ptr [[SINK:%.*]]) {
|
|
; CHECK-NEXT: [[ENTRY:.*:]]
|
|
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr @test.data, i64 [[X]]
|
|
; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[SCALAR_TYPE_TBAA1:![0-9]+]]
|
|
; CHECK-NEXT: store i32 [[L]], ptr [[SINK]], align 4
|
|
; CHECK-NEXT: ret i32 [[L]]
|
|
;
|
|
entry:
|
|
%data = alloca [8 x i32], align 4
|
|
; Copy all 8 x i32 (32 bytes) of the constant into the local buffer.
call void @llvm.memcpy.p0.p0.i64(ptr align 4 %data, ptr align 4 @test.data, i64 32, i1 false)
|
|
%arrayidx = getelementptr inbounds [8 x i32], ptr %data, i64 0, i64 %x
|
|
; The load carrying the metadata under test (!tbaa !1).
%l = load i32, ptr %arrayidx, align 4, !tbaa !1
|
|
store i32 %l, ptr %sink, align 4
|
|
ret i32 %l
|
|
}
|
|
|
|
; TBAA nodes for @copy_tbaa_metadata_after_memcpy: !1 is the node attached to
; the load via !tbaa, !2 is the "scalar type" node it references, and !3 is
; the "root" node that !2 references.
!1 = !{!2, !2, i64 0}
|
|
!2 = !{!"scalar type", !3}
|
|
!3 = !{!"root"}
|
|
|
|
; Verify dereferenceable_or_null metadata on a cloned load is preserved
|
|
; when the loaded value type is a pointer. OK
|
|
; Input pattern: %data is an alloca filled by a 64-byte memcpy from the
; constant global @test.ptrdata, then indexed by %x and loaded with a
; !dereferenceable_or_null attachment.
; NOTE(review): the expected output below is only `ret ptr null`; no load
; survives, so this test does not observe the attachment on a cloned load.
; Presumably the load folds away because every element of @test.ptrdata is
; null -- confirm, and consider a source global with distinct non-null
; elements if the attachment itself is meant to be checked.
define ptr @copy_deref_or_null_metadata_after_memcpy(i64 %x) {
|
|
; CHECK-LABEL: define ptr @copy_deref_or_null_metadata_after_memcpy(
|
|
; CHECK-SAME: i64 [[X:%.*]]) {
|
|
; CHECK-NEXT: [[ENTRY:.*:]]
|
|
; CHECK-NEXT: ret ptr null
|
|
;
|
|
entry:
|
|
%data = alloca [8 x ptr], align 8
|
|
; Copy all 8 pointers (64 bytes) of the constant into the local buffer.
call void @llvm.memcpy.p0.p0.i64(ptr align 8 %data, ptr align 8 @test.ptrdata, i64 64, i1 false)
|
|
%arrayidx = getelementptr inbounds [8 x ptr], ptr %data, i64 0, i64 %x
|
|
; The load carrying the metadata under test (!dereferenceable_or_null !4).
%l = load ptr, ptr %arrayidx, align 8, !dereferenceable_or_null !4
|
|
ret ptr %l
|
|
}
|
|
|
|
; Operand of the !dereferenceable_or_null attachment on the load in
; @copy_deref_or_null_metadata_after_memcpy.
!4 = !{i64 8}
|
|
|
|
; Verify nonnull metadata on a cloned load is preserved
|
|
; when the loaded value type is a pointer. OK
|
|
; Input pattern: %data is an alloca filled by a 64-byte memcpy from the
; constant global @test.ptrdata, then indexed by %x and loaded with a
; !nonnull attachment.
; NOTE(review): the expected output below is only `ret ptr null`; no load
; survives, so this test does not observe the attachment on a cloned load.
; Presumably the load folds away because every element of @test.ptrdata is
; null (which also contradicts the !nonnull claim on the input load) --
; confirm, and consider a source global with non-null elements if the
; attachment itself is meant to be checked.
define ptr @copy_nonnull_metadata_after_memcpy(i64 %x) {
|
|
; CHECK-LABEL: define ptr @copy_nonnull_metadata_after_memcpy(
|
|
; CHECK-SAME: i64 [[X:%.*]]) {
|
|
; CHECK-NEXT: [[ENTRY:.*:]]
|
|
; CHECK-NEXT: ret ptr null
|
|
;
|
|
entry:
|
|
%data = alloca [8 x ptr], align 8
|
|
; Copy all 8 pointers (64 bytes) of the constant into the local buffer.
call void @llvm.memcpy.p0.p0.i64(ptr align 8 %data, ptr align 8 @test.ptrdata, i64 64, i1 false)
|
|
%arrayidx = getelementptr inbounds [8 x ptr], ptr %data, i64 0, i64 %x
|
|
; The load carrying the metadata under test (!nonnull !5).
%l = load ptr, ptr %arrayidx, align 8, !nonnull !5
|
|
ret ptr %l
|
|
}
|
|
|
|
; Empty node shared by two attachments in this file: !nonnull in
; @copy_nonnull_metadata_after_memcpy and !invariant.load in
; @copy_invariant_load_metadata_after_memcpy.
!5 = !{}
|
|
|
|
; Verify invariant.load metadata on a cloned load is preserved. OK
|
|
; Input pattern: %data is an alloca filled by a 32-byte memcpy from the
; constant global @test.data and afterwards only indexed and read.
; Expected output: the GEP indexes @test.data directly and the load still
; carries its !invariant.load attachment.
define i32 @copy_invariant_load_metadata_after_memcpy(i64 %x) {
|
|
; CHECK-LABEL: define i32 @copy_invariant_load_metadata_after_memcpy(
|
|
; CHECK-SAME: i64 [[X:%.*]]) {
|
|
; CHECK-NEXT: [[ENTRY:.*:]]
|
|
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr @test.data, i64 [[X]]
|
|
; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !invariant.load [[META4:![0-9]+]]
|
|
; CHECK-NEXT: ret i32 [[L]]
|
|
;
|
|
entry:
|
|
%data = alloca [8 x i32], align 4
|
|
; Copy all 8 x i32 (32 bytes) of the constant into the local buffer.
call void @llvm.memcpy.p0.p0.i64(ptr align 4 %data, ptr align 4 @test.data, i64 32, i1 false)
|
|
%arrayidx = getelementptr inbounds [8 x i32], ptr %data, i64 0, i64 %x
|
|
; The load carrying the metadata under test (!invariant.load !5).
%l = load i32, ptr %arrayidx, align 4, !invariant.load !5
|
|
ret i32 %l
|
|
}
|
|
|
|
; Verify alias.scope and noalias metadata on a cloned load are preserved. OK
|
|
; Input pattern: %data is an alloca filled by a 32-byte memcpy from the
; constant global @test.data and afterwards only indexed and read. The load
; uses the same node (!6) for both !alias.scope and !noalias.
; Expected output: the GEP indexes @test.data directly and the load keeps
; both attachments, again referring to one shared node.
define i32 @copy_aliasscope_noalias_metadata_after_memcpy(i64 %x) {
|
|
; CHECK-LABEL: define i32 @copy_aliasscope_noalias_metadata_after_memcpy(
|
|
; CHECK-SAME: i64 [[X:%.*]]) {
|
|
; CHECK-NEXT: [[ENTRY:.*:]]
|
|
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr @test.data, i64 [[X]]
|
|
; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !alias.scope [[META5:![0-9]+]], !noalias [[META5]]
|
|
; CHECK-NEXT: ret i32 [[L]]
|
|
;
|
|
entry:
|
|
%data = alloca [8 x i32], align 4
|
|
; Copy all 8 x i32 (32 bytes) of the constant into the local buffer.
call void @llvm.memcpy.p0.p0.i64(ptr align 4 %data, ptr align 4 @test.data, i64 32, i1 false)
|
|
%arrayidx = getelementptr inbounds [8 x i32], ptr %data, i64 0, i64 %x
|
|
; The load carrying the metadata under test (!alias.scope and !noalias).
%l = load i32, ptr %arrayidx, align 4, !alias.scope !6, !noalias !6
|
|
ret i32 %l
|
|
}
|
|
|
|
; Verify nontemporal metadata on a cloned load is preserved. OK
|
|
; Input pattern: %data is an alloca filled by a 32-byte memcpy from the
; constant global @test.data and afterwards only indexed and read.
; Expected output: the GEP indexes @test.data directly and the load still
; carries its !nontemporal attachment.
define i32 @copy_nontemporal_metadata_after_memcpy(i64 %x) {
|
|
; CHECK-LABEL: define i32 @copy_nontemporal_metadata_after_memcpy(
|
|
; CHECK-SAME: i64 [[X:%.*]]) {
|
|
; CHECK-NEXT: [[ENTRY:.*:]]
|
|
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr @test.data, i64 [[X]]
|
|
; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !nontemporal [[META8:![0-9]+]]
|
|
; CHECK-NEXT: ret i32 [[L]]
|
|
;
|
|
entry:
|
|
%data = alloca [8 x i32], align 4
|
|
; Copy all 8 x i32 (32 bytes) of the constant into the local buffer.
call void @llvm.memcpy.p0.p0.i64(ptr align 4 %data, ptr align 4 @test.data, i64 32, i1 false)
|
|
%arrayidx = getelementptr inbounds [8 x i32], ptr %data, i64 0, i64 %x
|
|
; The load carrying the metadata under test (!nontemporal !9).
%l = load i32, ptr %arrayidx, align 4, !nontemporal !9
|
|
ret i32 %l
|
|
}
|
|
|
|
; Verify access group metadata on a cloned load is preserved. OK
|
|
; Input pattern: %data is an alloca filled by a 32-byte memcpy from the
; constant global @test.data and afterwards only indexed and read.
; Expected output: the GEP indexes @test.data directly and the load still
; carries its !llvm.access.group attachment.
define i32 @copy_access_group_metadata_after_memcpy(i64 %x) {
|
|
; CHECK-LABEL: define i32 @copy_access_group_metadata_after_memcpy(
|
|
; CHECK-SAME: i64 [[X:%.*]]) {
|
|
; CHECK-NEXT: [[ENTRY:.*:]]
|
|
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr @test.data, i64 [[X]]
|
|
; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP9:![0-9]+]]
|
|
; CHECK-NEXT: ret i32 [[L]]
|
|
;
|
|
entry:
|
|
%data = alloca [8 x i32], align 4
|
|
; Copy all 8 x i32 (32 bytes) of the constant into the local buffer.
call void @llvm.memcpy.p0.p0.i64(ptr align 4 %data, ptr align 4 @test.data, i64 32, i1 false)
|
|
%arrayidx = getelementptr inbounds [8 x i32], ptr %data, i64 0, i64 %x
|
|
; The load carrying the metadata under test (!llvm.access.group !10).
%l = load i32, ptr %arrayidx, align 4, !llvm.access.group !10
|
|
ret i32 %l
|
|
}
|
|
|
|
; Verify noalias.addrspace metadata on a cloned load is preserved.
|
|
; Input pattern: %data is an alloca filled by a 32-byte memcpy from the
; constant global @test.data and afterwards only indexed and read.
; Expected output: the GEP indexes @test.data directly and the load still
; carries its !noalias.addrspace attachment.
define i32 @copy_noalias_addrspace_metadata_after_memcpy(i64 %x) {
|
|
; CHECK-LABEL: define i32 @copy_noalias_addrspace_metadata_after_memcpy(
|
|
; CHECK-SAME: i64 [[X:%.*]]) {
|
|
; CHECK-NEXT: [[ENTRY:.*:]]
|
|
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr @test.data, i64 [[X]]
|
|
; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !noalias.addrspace [[META10:![0-9]+]]
|
|
; CHECK-NEXT: ret i32 [[L]]
|
|
;
|
|
entry:
|
|
%data = alloca [8 x i32], align 4
|
|
; Copy all 8 x i32 (32 bytes) of the constant into the local buffer.
call void @llvm.memcpy.p0.p0.i64(ptr align 4 %data, ptr align 4 @test.data, i64 32, i1 false)
|
|
%arrayidx = getelementptr inbounds [8 x i32], ptr %data, i64 0, i64 %x
|
|
; The load carrying the metadata under test (!noalias.addrspace !12).
%l = load i32, ptr %arrayidx, align 4, !noalias.addrspace !12
|
|
ret i32 %l
|
|
}
|
|
|
|
; Verify llvm.mem.parallel_loop_access metadata on a cloned load is preserved. OK
|
|
; Input pattern: %data is an alloca filled by a 32-byte memcpy from the
; constant global @test.data and afterwards only indexed and read.
; Expected output: the GEP indexes @test.data directly and the load still
; carries its !llvm.mem.parallel_loop_access attachment.
define i32 @copy_mem_parallel_loop_access_metadata_after_memcpy(i64 %x) {
|
|
; CHECK-LABEL: define i32 @copy_mem_parallel_loop_access_metadata_after_memcpy(
|
|
; CHECK-SAME: i64 [[X:%.*]]) {
|
|
; CHECK-NEXT: [[ENTRY:.*:]]
|
|
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr @test.data, i64 [[X]]
|
|
; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !llvm.mem.parallel_loop_access [[META11:![0-9]+]]
|
|
; CHECK-NEXT: ret i32 [[L]]
|
|
;
|
|
entry:
|
|
%data = alloca [8 x i32], align 4
|
|
; Copy all 8 x i32 (32 bytes) of the constant into the local buffer.
call void @llvm.memcpy.p0.p0.i64(ptr align 4 %data, ptr align 4 @test.data, i64 32, i1 false)
|
|
%arrayidx = getelementptr inbounds [8 x i32], ptr %data, i64 0, i64 %x
|
|
; The load carrying the metadata under test (!llvm.mem.parallel_loop_access !13).
%l = load i32, ptr %arrayidx, align 4, !llvm.mem.parallel_loop_access !13
|
|
ret i32 %l
|
|
}
|
|
|
|
; Nodes for the remaining tests. The numbering skips !11; no node with that
; id is defined in this part of the file.
; !6 is used for both !alias.scope and !noalias in
; @copy_aliasscope_noalias_metadata_after_memcpy; it lists !7, which in turn
; references itself and !8.
!6 = !{!7}
|
|
!7 = distinct !{!7, !8}
|
|
!8 = distinct !{!8}
|
|
; Operand of the !nontemporal attachment in
; @copy_nontemporal_metadata_after_memcpy.
!9 = !{i32 1}
|
|
; Node attached via !llvm.access.group in
; @copy_access_group_metadata_after_memcpy.
!10 = distinct !{}
|
|
; Operand of the !noalias.addrspace attachment in
; @copy_noalias_addrspace_metadata_after_memcpy.
!12 = !{i32 5, i32 6}
|
|
; !13 is attached via !llvm.mem.parallel_loop_access in
; @copy_mem_parallel_loop_access_metadata_after_memcpy; it lists !14.
!13 = !{!14}
|
|
!14 = distinct !{}
|
|
|
|
|
|
|
|
;.
|
|
; CHECK: [[RNG0]] = !{i32 0, i32 100}
|
|
; CHECK: [[SCALAR_TYPE_TBAA1]] = !{[[META2:![0-9]+]], [[META2]], i64 0}
|
|
; CHECK: [[META2]] = !{!"scalar type", [[META3:![0-9]+]]}
|
|
; CHECK: [[META3]] = !{!"root"}
|
|
; CHECK: [[META4]] = !{}
|
|
; CHECK: [[META5]] = !{[[META6:![0-9]+]]}
|
|
; CHECK: [[META6]] = distinct !{[[META6]], [[META7:![0-9]+]]}
|
|
; CHECK: [[META7]] = distinct !{[[META7]]}
|
|
; CHECK: [[META8]] = !{i32 1}
|
|
; CHECK: [[ACC_GRP9]] = distinct !{}
|
|
; CHECK: [[META10]] = !{i32 5, i32 6}
|
|
; CHECK: [[META11]] = !{[[META12:![0-9]+]]}
|
|
; CHECK: [[META12]] = distinct !{}
|
|
;.
|