Alex Bradbury 3877039fd1
[LoopIdiom] Select llvm.experimental.memset.pattern intrinsic rather than memset_pattern16 libcall (#126736)
In order to keep the change as incremental as possible, this only
introduces the memset.pattern intrinsic in cases where memset_pattern16
would have been used. Future patches can enable it on targets that don't
have the intrinsic, and select it in cases where the libcall isn't
directly usable. As the memset.pattern intrinsic takes the number of
times to store the pattern as an argument unlike memset_pattern16 which
takes the number of bytes to write, we no longer try to form an i128
pattern.

Special care is taken for cases where multiple stores in the same loop
iteration were combined to form a single pattern. For such cases, we
inherit the limitation that loops such as the following are supported:

```
for (unsigned i = 0; i < 2 * n; i += 2) {
  f[i] = 2;
  f[i+1] = 2;
}
```

But the following doesn't result in a memset.pattern (even though it
could be, by forming an appropriate pattern):
```
for (unsigned i = 0; i < 2 * n; i += 2) {
  f[i] = 2;
  f[i+1] = 3;
}
```

Addressing this existing deficiency is left for a follow-up due to a
desire not to change too much at once (i.e. to target equivalence to the
current codegen).

A command line option is introduced to force the selection of the
intrinsic even in cases it wouldn't be (i.e. in cases where the libcall
wouldn't have been selected). This is intended as a transitionary option
for testing and experimentation, to be removed at a later point.

The only platforms this should impact are those that have the memset_pattern16 libcall (Apple platforms). Testing performed to check for no unexpected codegen changes is described here https://github.com/llvm/llvm-project/pull/126736#issuecomment-3005097468
2025-07-09 13:48:15 +01:00

293 lines
13 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
; RUN: opt -passes=loop-idiom < %s -S | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-apple-darwin10.0.0"
%struct.foo = type { i32, i32 }
%struct.foo1 = type { i32, i32, i32 }
;void bar1(foo_t *f, unsigned n) {
; for (unsigned i = 0; i < n; ++i) {
; f[i].a = 2;
; f[i].b = 2;
; }
;}
define void @bar1(ptr %f, i32 %n) nounwind ssp {
; CHECK-LABEL: @bar1(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP1]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
; CHECK: for.body.preheader:
; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
; CHECK-NEXT: call void @llvm.experimental.memset.pattern.p0.i32.i64(ptr align 4 [[F:%.*]], i32 2, i64 [[TMP1]], i1 false)
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_FOO:%.*]], ptr [[F]], i64 [[INDVARS_IV]], i32 0
; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_FOO]], ptr [[F]], i64 [[INDVARS_IV]], i32 1
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[LFTR_WIDEIV]], [[N]]
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]]
; CHECK: for.end.loopexit:
; CHECK-NEXT: br label [[FOR_END]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
entry:
%cmp1 = icmp eq i32 %n, 0
br i1 %cmp1, label %for.end, label %for.body.preheader
for.body.preheader: ; preds = %entry
br label %for.body
for.body: ; preds = %for.body.preheader, %for.body
%indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
%a = getelementptr inbounds %struct.foo, ptr %f, i64 %indvars.iv, i32 0
store i32 2, ptr %a, align 4
%b = getelementptr inbounds %struct.foo, ptr %f, i64 %indvars.iv, i32 1
store i32 2, ptr %b, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp ne i32 %lftr.wideiv, %n
br i1 %exitcond, label %for.body, label %for.end.loopexit
for.end.loopexit: ; preds = %for.body
br label %for.end
for.end: ; preds = %for.end.loopexit, %entry
ret void
}
;void bar2(foo_t *f, unsigned n) {
; for (unsigned i = 0; i < n; ++i) {
; f[i].b = 2;
; f[i].a = 2;
; }
;}
define void @bar2(ptr %f, i32 %n) nounwind ssp {
; CHECK-LABEL: @bar2(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP1]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
; CHECK: for.body.preheader:
; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
; CHECK-NEXT: call void @llvm.experimental.memset.pattern.p0.i32.i64(ptr align 4 [[F:%.*]], i32 2, i64 [[TMP1]], i1 false)
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_FOO:%.*]], ptr [[F]], i64 [[INDVARS_IV]], i32 1
; CHECK-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_FOO]], ptr [[F]], i64 [[INDVARS_IV]], i32 0
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[LFTR_WIDEIV]], [[N]]
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]]
; CHECK: for.end.loopexit:
; CHECK-NEXT: br label [[FOR_END]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
entry:
%cmp1 = icmp eq i32 %n, 0
br i1 %cmp1, label %for.end, label %for.body.preheader
for.body.preheader: ; preds = %entry
br label %for.body
for.body: ; preds = %for.body.preheader, %for.body
%indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
%b = getelementptr inbounds %struct.foo, ptr %f, i64 %indvars.iv, i32 1
store i32 2, ptr %b, align 4
%a = getelementptr inbounds %struct.foo, ptr %f, i64 %indvars.iv, i32 0
store i32 2, ptr %a, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp ne i32 %lftr.wideiv, %n
br i1 %exitcond, label %for.body, label %for.end.loopexit
for.end.loopexit: ; preds = %for.body
br label %for.end
for.end: ; preds = %for.end.loopexit, %entry
ret void
}
;void bar3(foo_t *f, unsigned n) {
; for (unsigned i = n; i > 0; --i) {
; f[i].a = 2;
; f[i].b = 2;
; }
;}
define void @bar3(ptr nocapture %f, i32 %n) nounwind ssp {
; CHECK-LABEL: @bar3(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP1]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
; CHECK: for.body.preheader:
; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 3
; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[N]], -1
; CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[TMP2]] to i64
; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 3
; CHECK-NEXT: [[TMP5:%.*]] = sub i64 [[TMP1]], [[TMP4]]
; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[F:%.*]], i64 [[TMP5]]
; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP0]], 2
; CHECK-NEXT: call void @llvm.experimental.memset.pattern.p0.i32.i64(ptr align 4 [[UGLYGEP]], i32 2, i64 [[TMP7]], i1 false)
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[TMP0]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_FOO:%.*]], ptr [[F]], i64 [[INDVARS_IV]], i32 0
; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_FOO]], ptr [[F]], i64 [[INDVARS_IV]], i32 1
; CHECK-NEXT: [[TMP6:%.*]] = trunc i64 [[INDVARS_IV]] to i32
; CHECK-NEXT: [[DEC:%.*]] = add i32 [[TMP6]], -1
; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[DEC]], 0
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]]
; CHECK: for.end.loopexit:
; CHECK-NEXT: br label [[FOR_END]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
entry:
%cmp1 = icmp eq i32 %n, 0
br i1 %cmp1, label %for.end, label %for.body.preheader
for.body.preheader: ; preds = %entry
%0 = zext i32 %n to i64
br label %for.body
for.body: ; preds = %for.body.preheader, %for.body
%indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
%a = getelementptr inbounds %struct.foo, ptr %f, i64 %indvars.iv, i32 0
store i32 2, ptr %a, align 4
%b = getelementptr inbounds %struct.foo, ptr %f, i64 %indvars.iv, i32 1
store i32 2, ptr %b, align 4
%1 = trunc i64 %indvars.iv to i32
%dec = add i32 %1, -1
%cmp = icmp eq i32 %dec, 0
%indvars.iv.next = add nsw i64 %indvars.iv, -1
br i1 %cmp, label %for.end.loopexit, label %for.body
for.end.loopexit: ; preds = %for.body
br label %for.end
for.end: ; preds = %for.end.loopexit, %entry
ret void
}
;void bar4(foo_t *f, unsigned n) {
; for (unsigned i = 0; i < n; ++i) {
; f[i].a = 0;
; f[i].b = 1;
; }
;}
define void @bar4(ptr nocapture %f, i32 %n) nounwind ssp {
; CHECK-LABEL: @bar4(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP1]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
; CHECK: for.body.preheader:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_FOO:%.*]], ptr [[F:%.*]], i64 [[INDVARS_IV]], i32 0
; CHECK-NEXT: store i32 0, ptr [[A]], align 4
; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_FOO]], ptr [[F]], i64 [[INDVARS_IV]], i32 1
; CHECK-NEXT: store i32 1, ptr [[B]], align 4
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[LFTR_WIDEIV]], [[N]]
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]]
; CHECK: for.end.loopexit:
; CHECK-NEXT: br label [[FOR_END]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
entry:
%cmp1 = icmp eq i32 %n, 0
br i1 %cmp1, label %for.end, label %for.body.preheader
for.body.preheader: ; preds = %entry
br label %for.body
for.body: ; preds = %for.body.preheader, %for.body
%indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
%a = getelementptr inbounds %struct.foo, ptr %f, i64 %indvars.iv, i32 0
store i32 0, ptr %a, align 4
%b = getelementptr inbounds %struct.foo, ptr %f, i64 %indvars.iv, i32 1
store i32 1, ptr %b, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp ne i32 %lftr.wideiv, %n
br i1 %exitcond, label %for.body, label %for.end.loopexit
for.end.loopexit: ; preds = %for.body
br label %for.end
for.end: ; preds = %for.end.loopexit, %entry
ret void
}
;void bar5(foo1_t *f, unsigned n) {
; for (unsigned i = 0; i < n; ++i) {
; f[i].a = 1;
; f[i].b = 1;
; }
;}
define void @bar5(ptr nocapture %f, i32 %n) nounwind ssp {
; CHECK-LABEL: @bar5(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[CMP1]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
; CHECK: for.body.preheader:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_FOO1:%.*]], ptr [[F:%.*]], i64 [[INDVARS_IV]], i32 0
; CHECK-NEXT: store i32 1, ptr [[A]], align 4
; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_FOO1]], ptr [[F]], i64 [[INDVARS_IV]], i32 1
; CHECK-NEXT: store i32 1, ptr [[B]], align 4
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[LFTR_WIDEIV]], [[N]]
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]]
; CHECK: for.end.loopexit:
; CHECK-NEXT: br label [[FOR_END]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
entry:
%cmp1 = icmp eq i32 %n, 0
br i1 %cmp1, label %for.end, label %for.body.preheader
for.body.preheader: ; preds = %entry
br label %for.body
for.body: ; preds = %for.body.preheader, %for.body
%indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
%a = getelementptr inbounds %struct.foo1, ptr %f, i64 %indvars.iv, i32 0
store i32 1, ptr %a, align 4
%b = getelementptr inbounds %struct.foo1, ptr %f, i64 %indvars.iv, i32 1
store i32 1, ptr %b, align 4
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%lftr.wideiv = trunc i64 %indvars.iv.next to i32
%exitcond = icmp ne i32 %lftr.wideiv, %n
br i1 %exitcond, label %for.body, label %for.end.loopexit
for.end.loopexit: ; preds = %for.body
br label %for.end
for.end: ; preds = %for.end.loopexit, %entry
ret void
}
;.
; CHECK: attributes #[[ATTR0:[0-9]+]] = { nounwind ssp }
; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) }
;.