Fabian Ritter f2749f6645
[LowerMemIntrinsics][AMDGPU] Optimize memset.pattern lowering (#185901)
This patch changes the lowering of the [experimental.memset.pattern intrinsic](https://llvm.org/docs/LangRef.html#llvm-experimental-memset-pattern-intrinsic)
to match the optimized memset and memcpy lowering when possible. (The tl;dr of
memset.pattern is that it is like memset, except that you can use it to set
values that are wider than a single byte.)

The memset.pattern lowering now queries `TTI::getMemcpyLoopLoweringType` for a
preferred memory access type. If the size of that type is a multiple of the set
value's type, and if both types have consistent store and alloc sizes (since
memset.pattern behaves in a way that is not well suited to access widening
if the store and alloc sizes differ), the memset.pattern is lowered into two loops:
a main loop that stores a sufficiently wide vector splat of the SetValue with
the preferred memory access type and a residual loop that covers the remaining
set values individually.

In contrast to the memset lowering, this patch doesn't include a specialized
lowering for residual loops with known constant lengths. Loops that are
statically known to be unreachable will not be emitted.

For backends that don't override `TTI::getMemcpyLoopLoweringType`, the
generated code is mostly unchanged except for more consistent basic block
names, no more `br i1 false` for memset.patterns with known size, and a flipped
loop condition for memset.patterns with known size (see test changes).

This is a follow-up to a similar patch for memset: #169040
2026-03-13 10:37:33 +01:00

128 lines
6.4 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -mtriple=riscv64 -passes=pre-isel-intrinsic-lowering -S -o - %s | FileCheck %s
; Constant count of 1 with an i128 pattern.
; The expected output branches straight into the store loop (no guard compare
; before it), stores the i128 value through a GEP indexed by i128 elements,
; and exits once the incremented index is no longer below 1.
; NOTE(review): the trailing `i1 0` operand is presumably the is-volatile
; flag -- confirm against the LangRef entry for the intrinsic.
define void @memset_pattern_i128_1(ptr %a, i128 %value) nounwind {
; CHECK-LABEL: define void @memset_pattern_i128_1(
; CHECK-SAME: ptr [[A:%.*]], i128 [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: br label %[[LOADSTORELOOP:.*]]
; CHECK: [[LOADSTORELOOP]]:
; CHECK-NEXT: [[TMP2:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i128, ptr [[A]], i64 [[TMP2]]
; CHECK-NEXT: store i128 [[VALUE]], ptr [[TMP1]], align 1
; CHECK-NEXT: [[TMP3]] = add i64 [[TMP2]], 1
; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 1
; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT:.*]]
; CHECK: [[SPLIT]]:
; CHECK-NEXT: ret void
;
tail call void @llvm.experimental.memset.pattern(ptr %a, i128 %value, i64 1, i1 0)
ret void
}
; Constant count of 16 with an i128 pattern.
; The expected output has the same shape as the count-1 case: an unguarded
; store loop whose back-edge is taken while the incremented index is below
; the constant count (16 here).
define void @memset_pattern_i128_16(ptr %a, i128 %value) nounwind {
; CHECK-LABEL: define void @memset_pattern_i128_16(
; CHECK-SAME: ptr [[A:%.*]], i128 [[VALUE:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: br label %[[LOADSTORELOOP:.*]]
; CHECK: [[LOADSTORELOOP]]:
; CHECK-NEXT: [[TMP2:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i128, ptr [[A]], i64 [[TMP2]]
; CHECK-NEXT: store i128 [[VALUE]], ptr [[TMP1]], align 1
; CHECK-NEXT: [[TMP3]] = add i64 [[TMP2]], 1
; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 16
; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT:.*]]
; CHECK: [[SPLIT]]:
; CHECK-NEXT: ret void
;
tail call void @llvm.experimental.memset.pattern(ptr %a, i128 %value, i64 16, i1 0)
ret void
}
; Runtime count %x with an i127 pattern (a width that is not a whole number
; of bytes).
; The expected output first compares %x against 0 and skips the loop when the
; count is zero; otherwise it stores the i127 value once per iteration through
; a GEP indexed by i127 elements until the incremented index reaches %x.
define void @memset_pattern_i127_x(ptr %a, i127 %value, i64 %x) nounwind {
; CHECK-LABEL: define void @memset_pattern_i127_x(
; CHECK-SAME: ptr [[A:%.*]], i127 [[VALUE:%.*]], i64 [[X:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = icmp ne i64 [[X]], 0
; CHECK-NEXT: br i1 [[TMP1]], label %[[LOADSTORELOOP:.*]], label %[[SPLIT:.*]]
; CHECK: [[LOADSTORELOOP]]:
; CHECK-NEXT: [[TMP3:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOADSTORELOOP]] ]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i127, ptr [[A]], i64 [[TMP3]]
; CHECK-NEXT: store i127 [[VALUE]], ptr [[TMP2]], align 1
; CHECK-NEXT: [[TMP4]] = add i64 [[TMP3]], 1
; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], [[X]]
; CHECK-NEXT: br i1 [[TMP5]], label %[[LOADSTORELOOP]], label %[[SPLIT]]
; CHECK: [[SPLIT]]:
; CHECK-NEXT: ret void
;
tail call void @llvm.experimental.memset.pattern(ptr %a, i127 %value, i64 %x, i1 0)
ret void
}
; Runtime count %x with an i128 pattern.
; The expected output guards the loop with a `%x != 0` compare, then stores
; the i128 value per iteration (align 1, since the call gives no alignment on
; the destination) until the incremented index reaches %x.
define void @memset_pattern_i128_x(ptr %a, i128 %value, i64 %x) nounwind {
; CHECK-LABEL: define void @memset_pattern_i128_x(
; CHECK-SAME: ptr [[A:%.*]], i128 [[VALUE:%.*]], i64 [[X:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = icmp ne i64 [[X]], 0
; CHECK-NEXT: br i1 [[TMP1]], label %[[LOADSTORELOOP:.*]], label %[[SPLIT:.*]]
; CHECK: [[LOADSTORELOOP]]:
; CHECK-NEXT: [[TMP2:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], %[[LOADSTORELOOP]] ]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i128, ptr [[A]], i64 [[TMP2]]
; CHECK-NEXT: store i128 [[VALUE]], ptr [[TMP4]], align 1
; CHECK-NEXT: [[TMP6]] = add i64 [[TMP2]], 1
; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP6]], [[X]]
; CHECK-NEXT: br i1 [[TMP5]], label %[[LOADSTORELOOP]], label %[[SPLIT]]
; CHECK: [[SPLIT]]:
; CHECK-NEXT: ret void
;
tail call void @llvm.experimental.memset.pattern(ptr %a, i128 %value, i64 %x, i1 0)
ret void
}
; Runtime count %x with an i256 pattern.
; The expected output matches the i128 runtime-count case with the element
; type widened to i256: a zero-count guard followed by a one-store-per-
; iteration loop bounded by %x.
define void @memset_pattern_i256_x(ptr %a, i256 %value, i64 %x) nounwind {
; CHECK-LABEL: define void @memset_pattern_i256_x(
; CHECK-SAME: ptr [[A:%.*]], i256 [[VALUE:%.*]], i64 [[X:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = icmp ne i64 [[X]], 0
; CHECK-NEXT: br i1 [[TMP1]], label %[[LOADSTORELOOP:.*]], label %[[SPLIT:.*]]
; CHECK: [[LOADSTORELOOP]]:
; CHECK-NEXT: [[TMP2:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP6:%.*]], %[[LOADSTORELOOP]] ]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i256, ptr [[A]], i64 [[TMP2]]
; CHECK-NEXT: store i256 [[VALUE]], ptr [[TMP4]], align 1
; CHECK-NEXT: [[TMP6]] = add i64 [[TMP2]], 1
; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP6]], [[X]]
; CHECK-NEXT: br i1 [[TMP5]], label %[[LOADSTORELOOP]], label %[[SPLIT]]
; CHECK: [[SPLIT]]:
; CHECK-NEXT: ret void
;
tail call void @llvm.experimental.memset.pattern(ptr %a, i256 %value, i64 %x, i1 0)
ret void
}
; The alignment of each emitted store should be the common alignment of the
; destination pointer and the pattern stride (the allocation size of the
; pattern type).
; The function issues two calls that differ only in the destination
; alignment: the `align 1` call is expected to produce `align 1` stores in
; the first loop, and the `align 2` call is expected to produce `align 2`
; stores in the second loop. Each loop has its own zero-count guard on %x.
define void @memset_pattern_i15_x_alignment(ptr %a, i15 %value, i64 %x) nounwind {
; CHECK-LABEL: define void @memset_pattern_i15_x_alignment(
; CHECK-SAME: ptr [[A:%.*]], i15 [[VALUE:%.*]], i64 [[X:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = icmp ne i64 [[X]], 0
; CHECK-NEXT: br i1 [[TMP1]], label %[[LOADSTORELOOP:.*]], label %[[SPLIT:.*]]
; CHECK: [[LOADSTORELOOP]]:
; CHECK-NEXT: [[TMP3:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOADSTORELOOP]] ]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i15, ptr [[A]], i64 [[TMP3]]
; CHECK-NEXT: store i15 [[VALUE]], ptr [[TMP2]], align 1
; CHECK-NEXT: [[TMP4]] = add i64 [[TMP3]], 1
; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], [[X]]
; CHECK-NEXT: br i1 [[TMP5]], label %[[LOADSTORELOOP]], label %[[SPLIT]]
; CHECK: [[SPLIT]]:
; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[X]], 0
; CHECK-NEXT: br i1 [[TMP6]], label %[[LOADSTORELOOP2:.*]], label %[[SPLIT1:.*]]
; CHECK: [[LOADSTORELOOP2]]:
; CHECK-NEXT: [[TMP11:%.*]] = phi i64 [ 0, %[[SPLIT]] ], [ [[TMP9:%.*]], %[[LOADSTORELOOP2]] ]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i15, ptr [[A]], i64 [[TMP11]]
; CHECK-NEXT: store i15 [[VALUE]], ptr [[TMP8]], align 2
; CHECK-NEXT: [[TMP9]] = add i64 [[TMP11]], 1
; CHECK-NEXT: [[TMP10:%.*]] = icmp ult i64 [[TMP9]], [[X]]
; CHECK-NEXT: br i1 [[TMP10]], label %[[LOADSTORELOOP2]], label %[[SPLIT1]]
; CHECK: [[SPLIT1]]:
; CHECK-NEXT: ret void
;
call void @llvm.experimental.memset.pattern(ptr align 1 %a, i15 %value, i64 %x, i1 0)
call void @llvm.experimental.memset.pattern(ptr align 2 %a, i15 %value, i64 %x, i1 0)
ret void
}