Alex Bradbury 3877039fd1
[LoopIdiom] Select llvm.experimental.memset.pattern intrinsic rather than memset_pattern16 libcall (#126736)
In order to keep the change as incremental as possible, this only
introduces the memset.pattern intrinsic in cases where memset_pattern16
would have been used. Future patches can enable it on targets that don't
have the intrinsic, and select it in cases where the libcall isn't
directly usable. As the memset.pattern intrinsic takes the number of
times to store the pattern as an argument unlike memset_pattern16 which
takes the number of bytes to write, we no longer try to form an i128
pattern.

Special care is taken for cases where multiple stores in the same loop
iteration were combined to form a single pattern. For such cases, we
inherit the limitation that loops such as the following are supported:

```
for (unsigned i = 0; i < 2 * n; i += 2) {
  f[i] = 2;
  f[i+1] = 2;
}
```

But the following doesn't result in a memset.pattern (even though it
could be, by forming an appropriate pattern):
```
for (unsigned i = 0; i < 2 * n; i += 2) {
  f[i] = 2;
  f[i+1] = 3;
}
```

Addressing this existing deficiency is left for a follow-up due to a
desire not to change too much at once (i.e. to target equivalence to the
current codegen).

A command line option is introduced to force the selection of the
intrinsic even in cases it wouldn't be (i.e. in cases where the libcall
wouldn't have been selected). This is intended as a transitionary option
for testing and experimentation, to be removed at a later point.

The only platforms this should impact are those that have the memset_pattern16 libcall (Apple platforms). Testing performed to check for no unexpected codegen changes is described here https://github.com/llvm/llvm-project/pull/126736#issuecomment-3005097468
2025-07-09 13:48:15 +01:00

127 lines
5.5 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
; RUN: opt -passes=loop-idiom < %s -S | FileCheck %s
target datalayout = "e-p:64:64:64:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-apple-darwin10.0.0"
;void test(int *f, unsigned n) {
; for (unsigned i = 0; i < 2 * n; i += 2) {
; f[i] = 0;
; f[i+1] = 0;
; }
;}
define void @test(ptr %f, i32 %n) nounwind ssp {
; CHECK-LABEL: @test(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = shl i32 [[N:%.*]], 1
; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[TMP0]], 0
; CHECK-NEXT: br i1 [[CMP1]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
; CHECK: for.body.preheader:
; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], -1
; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 1
; CHECK-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 3
; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 8
; CHECK-NEXT: call void @llvm.memset.p0.i32(ptr align 4 [[F:%.*]], i8 0, i32 [[TMP4]], i1 false)
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[F]], i32 [[INDVARS_IV]]
; CHECK-NEXT: [[TMP5:%.*]] = or disjoint i32 [[INDVARS_IV]], 1
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[F]], i32 [[TMP5]]
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i32 [[INDVARS_IV]], 2
; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INDVARS_IV_NEXT]], [[TMP0]]
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]]
; CHECK: for.end.loopexit:
; CHECK-NEXT: br label [[FOR_END]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
entry:
%0 = shl i32 %n, 1
%cmp1 = icmp eq i32 %0, 0
br i1 %cmp1, label %for.end, label %for.body.preheader
for.body.preheader: ; preds = %entry
br label %for.body
for.body: ; preds = %for.body.preheader, %for.body
%indvars.iv = phi i32 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
%arrayidx = getelementptr inbounds i32, ptr %f, i32 %indvars.iv
store i32 0, ptr %arrayidx, align 4
%1 = or disjoint i32 %indvars.iv, 1
%arrayidx2 = getelementptr inbounds i32, ptr %f, i32 %1
store i32 0, ptr %arrayidx2, align 4
%indvars.iv.next = add nuw nsw i32 %indvars.iv, 2
%cmp = icmp ult i32 %indvars.iv.next, %0
br i1 %cmp, label %for.body, label %for.end.loopexit
for.end.loopexit: ; preds = %for.body
br label %for.end
for.end: ; preds = %for.end.loopexit, %entry
ret void
}
;void test_pattern(int *f, unsigned n) {
; for (unsigned i = 0; i < 2 * n; i += 2) {
; f[i] = 2;
; f[i+1] = 2;
; }
;}
define void @test_pattern(ptr %f, i32 %n) nounwind ssp {
; CHECK-LABEL: @test_pattern(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[MUL:%.*]] = shl i32 [[N:%.*]], 1
; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[MUL]], 0
; CHECK-NEXT: br i1 [[CMP1]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
; CHECK: for.body.preheader:
; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[MUL]], -1
; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 1
; CHECK-NEXT: [[TMP2:%.*]] = add nuw i32 [[TMP1]], 1
; CHECK-NEXT: [[TMP3:%.*]] = mul i32 [[TMP2]], 2
; CHECK-NEXT: call void @llvm.experimental.memset.pattern.p0.i32.i32(ptr align 4 [[F:%.*]], i32 2, i32 [[TMP3]], i1 false)
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[F]], i32 [[INDVARS_IV]]
; CHECK-NEXT: [[X1:%.*]] = or disjoint i32 [[INDVARS_IV]], 1
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[F]], i32 [[X1]]
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i32 [[INDVARS_IV]], 2
; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INDVARS_IV_NEXT]], [[MUL]]
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]]
; CHECK: for.end.loopexit:
; CHECK-NEXT: br label [[FOR_END]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
entry:
%mul = shl i32 %n, 1
%cmp1 = icmp eq i32 %mul, 0
br i1 %cmp1, label %for.end, label %for.body.preheader
for.body.preheader: ; preds = %entry
br label %for.body
for.body: ; preds = %for.body.preheader, %for.body
%indvars.iv = phi i32 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
%arrayidx = getelementptr inbounds i32, ptr %f, i32 %indvars.iv
store i32 2, ptr %arrayidx, align 4
%x1 = or disjoint i32 %indvars.iv, 1
%arrayidx2 = getelementptr inbounds i32, ptr %f, i32 %x1
store i32 2, ptr %arrayidx2, align 4
%indvars.iv.next = add nuw nsw i32 %indvars.iv, 2
%cmp = icmp ult i32 %indvars.iv.next, %mul
br i1 %cmp, label %for.body, label %for.end.loopexit
for.end.loopexit: ; preds = %for.body
br label %for.end
for.end: ; preds = %for.end.loopexit, %entry
ret void
}
;.
; CHECK: attributes #[[ATTR0:[0-9]+]] = { nounwind ssp }
; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) }
;.