llvm-project/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics-threshold.ll
Fabian Ritter d24a6754ce
[LowerMemIntrinsics] Optimize memset lowering (#169040)
This patch changes the memset lowering to match the optimized memcpy lowering.
The memset lowering now queries TTI.getMemcpyLoopLoweringType for a preferred
memory access type. If that type is larger than a byte, the memset is lowered
into two loops: a main loop that stores a sufficiently wide vector splat of the
SetValue with the preferred memory access type and a residual loop that covers
the remaining bytes individually. If the memset size is statically known, the
residual loop is replaced by a sequence of stores.

This improves memset performance on gfx1030 (AMDGPU) in microbenchmarks by
around 7-20x.

I'm planning similar treatment for memset.pattern as a follow-up PR.

For SWDEV-543208.
2026-02-04 13:35:13 +01:00

91 lines
4.9 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -mtriple=amdgcn-- -pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=8 %s | FileCheck -check-prefix=OPT8 %s
; RUN: opt -S -mtriple=amdgcn-- -pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=4 %s | FileCheck -check-prefix=OPT4 %s
; RUN: opt -S -mtriple=amdgcn-- -pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=0 %s | FileCheck -check-prefix=OPT0 %s
; RUN: opt -S -mtriple=amdgcn-- -pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=-1 %s | FileCheck -check-prefix=OPT_NEG %s
; RUN: opt -S -mtriple=amdgcn-- -passes=pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=8 %s | FileCheck -check-prefix=OPT8 %s
; RUN: opt -S -mtriple=amdgcn-- -passes=pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=4 %s | FileCheck -check-prefix=OPT4 %s
; RUN: opt -S -mtriple=amdgcn-- -passes=pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=0 %s | FileCheck -check-prefix=OPT0 %s
; RUN: opt -S -mtriple=amdgcn-- -passes=pre-isel-intrinsic-lowering -mem-intrinsic-expand-size=-1 %s | FileCheck -check-prefix=OPT_NEG %s
; Test that the -mem-intrinsic-expand-size flag works.
; Make sure we can always eliminate the intrinsic, even at 0.
; Zero-length memset with a constant size operand.
; Expected results per the assertions below: with -mem-intrinsic-expand-size
; set to 8, 4 or -1 the intrinsic call is left in place; with the value 0 the
; call is removed entirely and only the return remains.
define amdgpu_kernel void @memset_size_0(ptr addrspace(1) %dst, i8 %val) {
; OPT8-LABEL: @memset_size_0(
; OPT8-NEXT: call void @llvm.memset.p1.i64(ptr addrspace(1) [[DST:%.*]], i8 [[VAL:%.*]], i64 0, i1 false)
; OPT8-NEXT: ret void
;
; OPT4-LABEL: @memset_size_0(
; OPT4-NEXT: call void @llvm.memset.p1.i64(ptr addrspace(1) [[DST:%.*]], i8 [[VAL:%.*]], i64 0, i1 false)
; OPT4-NEXT: ret void
;
; OPT0-LABEL: @memset_size_0(
; OPT0-NEXT: ret void
;
; OPT_NEG-LABEL: @memset_size_0(
; OPT_NEG-NEXT: call void @llvm.memset.p1.i64(ptr addrspace(1) [[DST:%.*]], i8 [[VAL:%.*]], i64 0, i1 false)
; OPT_NEG-NEXT: ret void
;
call void @llvm.memset.p1.i64(ptr addrspace(1) %dst, i8 %val, i64 0, i1 false)
ret void
}
; 4-byte memset with a constant size operand.
; Expected results per the assertions below: with -mem-intrinsic-expand-size
; set to 8, 4 or -1 the intrinsic call is left in place (a size equal to the
; threshold is not expanded); with the value 0 the call is replaced by a single
; i32 store of the set value splatted across a <4 x i8> vector, at align 1.
define amdgpu_kernel void @memset_size_4(ptr addrspace(1) %dst, i8 %val) {
; OPT8-LABEL: @memset_size_4(
; OPT8-NEXT: call void @llvm.memset.p1.i64(ptr addrspace(1) [[DST:%.*]], i8 [[VAL:%.*]], i64 4, i1 false)
; OPT8-NEXT: ret void
;
; OPT4-LABEL: @memset_size_4(
; OPT4-NEXT: call void @llvm.memset.p1.i64(ptr addrspace(1) [[DST:%.*]], i8 [[VAL:%.*]], i64 4, i1 false)
; OPT4-NEXT: ret void
;
; OPT0-LABEL: @memset_size_4(
; OPT0-NEXT: [[SETVALUE_SPLAT_SPLATINSERT:%.*]] = insertelement <4 x i8> poison, i8 [[VAL:%.*]], i64 0
; OPT0-NEXT: [[SETVALUE_SPLAT_SPLAT:%.*]] = shufflevector <4 x i8> [[SETVALUE_SPLAT_SPLATINSERT]], <4 x i8> poison, <4 x i32> zeroinitializer
; OPT0-NEXT: [[SETVALUE_SPLAT_CAST:%.*]] = bitcast <4 x i8> [[SETVALUE_SPLAT_SPLAT]] to i32
; OPT0-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 0
; OPT0-NEXT: store i32 [[SETVALUE_SPLAT_CAST]], ptr addrspace(1) [[TMP1]], align 1
; OPT0-NEXT: ret void
;
; OPT_NEG-LABEL: @memset_size_4(
; OPT_NEG-NEXT: call void @llvm.memset.p1.i64(ptr addrspace(1) [[DST:%.*]], i8 [[VAL:%.*]], i64 4, i1 false)
; OPT_NEG-NEXT: ret void
;
call void @llvm.memset.p1.i64(ptr addrspace(1) %dst, i8 %val, i64 4, i1 false)
ret void
}
; 8-byte memset with a constant size operand.
; Expected results per the assertions below: with -mem-intrinsic-expand-size
; set to 8 or -1 the intrinsic call is left in place; with the value 4 or 0 the
; call is replaced by a single i64 store of the set value splatted across an
; <8 x i8> vector, at align 1.
define amdgpu_kernel void @memset_size_8(ptr addrspace(1) %dst, i8 %val) {
; OPT8-LABEL: @memset_size_8(
; OPT8-NEXT: call void @llvm.memset.p1.i64(ptr addrspace(1) [[DST:%.*]], i8 [[VAL:%.*]], i64 8, i1 false)
; OPT8-NEXT: ret void
;
; OPT4-LABEL: @memset_size_8(
; OPT4-NEXT: [[SETVALUE_SPLAT_SPLATINSERT:%.*]] = insertelement <8 x i8> poison, i8 [[VAL:%.*]], i64 0
; OPT4-NEXT: [[SETVALUE_SPLAT_SPLAT:%.*]] = shufflevector <8 x i8> [[SETVALUE_SPLAT_SPLATINSERT]], <8 x i8> poison, <8 x i32> zeroinitializer
; OPT4-NEXT: [[SETVALUE_SPLAT_CAST:%.*]] = bitcast <8 x i8> [[SETVALUE_SPLAT_SPLAT]] to i64
; OPT4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 0
; OPT4-NEXT: store i64 [[SETVALUE_SPLAT_CAST]], ptr addrspace(1) [[TMP1]], align 1
; OPT4-NEXT: ret void
;
; OPT0-LABEL: @memset_size_8(
; OPT0-NEXT: [[SETVALUE_SPLAT_SPLATINSERT:%.*]] = insertelement <8 x i8> poison, i8 [[VAL:%.*]], i64 0
; OPT0-NEXT: [[SETVALUE_SPLAT_SPLAT:%.*]] = shufflevector <8 x i8> [[SETVALUE_SPLAT_SPLATINSERT]], <8 x i8> poison, <8 x i32> zeroinitializer
; OPT0-NEXT: [[SETVALUE_SPLAT_CAST:%.*]] = bitcast <8 x i8> [[SETVALUE_SPLAT_SPLAT]] to i64
; OPT0-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST:%.*]], i64 0
; OPT0-NEXT: store i64 [[SETVALUE_SPLAT_CAST]], ptr addrspace(1) [[TMP1]], align 1
; OPT0-NEXT: ret void
;
; OPT_NEG-LABEL: @memset_size_8(
; OPT_NEG-NEXT: call void @llvm.memset.p1.i64(ptr addrspace(1) [[DST:%.*]], i8 [[VAL:%.*]], i64 8, i1 false)
; OPT_NEG-NEXT: ret void
;
call void @llvm.memset.p1.i64(ptr addrspace(1) %dst, i8 %val, i64 8, i1 false)
ret void
}
; Declaration of the memset intrinsic called by the test functions in this
; file: destination pointer in addrspace(1), i8 fill value, i64 byte count and
; an immediate i1 flag. Attribute group #0 carries its function attributes.
declare void @llvm.memset.p1.i64(ptr addrspace(1) nocapture writeonly, i8, i64, i1 immarg) #0
attributes #0 = { argmemonly nounwind willreturn writeonly }