vaibhav 384a5b00a7
[LAA] Use MaxStride instead of CommonStride to calculate MaxVF (#98142)
We bail out from MaxVF calculation if the strides are not the same.
Instead, we are dependent on runtime checks, though not yet implemented.
We could instead use the MaxStride to conservatively use an upper bound.

This handles cases like the following:
```c
#define LEN 256 * 256
float a[LEN];

void gather() {
  for (int i = 0; i < LEN - 1024 - 255; i++) {
  #pragma clang loop interleave(disable)
  #pragma clang loop unroll(disable)
    for (int j = 0; j < 256; j++)
      a[i + j + 1024] += a[j * 4 + i];
  }
}
```

---------

Co-authored-by: Florian Hahn <flo@fhahn.com>
2025-05-07 21:02:21 +01:00

157 lines
5.7 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
; RUN: opt -passes="print<access-info>" %s 2>&1 | FileCheck %s
; Global array of 65536 floats; the test functions in this file index into it with getelementptr.
@a = dso_local local_unnamed_addr global [65536 x float] zeroinitializer, align 16
; Generated from the following C code:
; #define LEN 256 * 256
; float a[LEN];
;
; void different_strides() {
; for (int i = 0; i < LEN - 1024 - 255; i++) {
; #pragma clang loop interleave(disable)
; #pragma clang loop unroll(disable)
; for (int j = 0; j < 256; j++)
; a[i + j + 1024] += a[j * 4 + i];
; }
; }
; The load and store have different strides (16 and 4 bytes respectively), but the store
; is always at a safe positive distance away from the load, thus BackwardVectorizable.
define void @different_strides_backward_vectorizable() {
; CHECK-LABEL: 'different_strides_backward_vectorizable'
; CHECK-NEXT: inner.body:
; CHECK-NEXT: Memory dependences are safe with a maximum safe vector width of 2048 bits
; CHECK-NEXT: Dependences:
; CHECK-NEXT: BackwardVectorizable:
; CHECK-NEXT: %3 = load float, ptr %arrayidx, align 4 ->
; CHECK-NEXT: store float %add9, ptr %arrayidx8, align 4
; CHECK-EMPTY:
; CHECK-NEXT: Forward:
; CHECK-NEXT: %5 = load float, ptr %arrayidx8, align 4 ->
; CHECK-NEXT: store float %add9, ptr %arrayidx8, align 4
; CHECK-EMPTY:
; CHECK-NEXT: Run-time memory checks:
; CHECK-NEXT: Grouped accesses:
; CHECK-EMPTY:
; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
; CHECK-NEXT: SCEV assumptions:
; CHECK-EMPTY:
; CHECK-NEXT: Expressions re-written:
; CHECK-NEXT: outer.header:
; CHECK-NEXT: Report: loop is not the innermost loop
; CHECK-NEXT: Dependences:
; CHECK-NEXT: Run-time memory checks:
; CHECK-NEXT: Grouped accesses:
; CHECK-EMPTY:
; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
; CHECK-NEXT: SCEV assumptions:
; CHECK-EMPTY:
; CHECK-NEXT: Expressions re-written:
;
; Two-level loop nest over @a. The expected output above covers both loops:
; the inner loop's dependences and the outer loop's "not the innermost loop" report.
; The SSA value names below appear verbatim in the expected output, so they must
; not be renamed without regenerating the assertions.
entry:
br label %outer.header
; Outer loop: %i is the outer induction variable, starting at 0.
outer.header:
%i = phi i64 [ 0, %entry ], [ %i.next, %outer.latch ]
; %0 = %i + 1024 is the loop-invariant part of the store index.
%0 = add nuw nsw i64 %i, 1024
br label %inner.body
; Inner loop: %j is the inner induction variable, starting at 0.
inner.body:
%j = phi i64 [ 0, %outer.header ], [ %j.next, %inner.body ]
; Strided load index: %2 = 4 * %j + %i, so this address advances 4 floats
; (16 bytes) per inner iteration.
%1 = shl nuw nsw i64 %j, 2
%2 = add nuw nsw i64 %1, %i
%arrayidx = getelementptr inbounds [65536 x float], ptr @a, i64 0, i64 %2
%3 = load float, ptr %arrayidx, align 4
; Unit-stride index: %4 = %i + 1024 + %j, advancing 1 float (4 bytes) per inner
; iteration. This address is both loaded (%5) and stored to.
; Within one iteration the store index minus the strided load index is
; 1024 - 3 * %j, which stays positive for every %j in [0, 255].
%4 = add nuw nsw i64 %0, %j
%arrayidx8 = getelementptr inbounds [65536 x float], ptr @a, i64 0, i64 %4
%5 = load float, ptr %arrayidx8, align 4
%add9 = fadd fast float %5, %3
store float %add9, ptr %arrayidx8, align 4
; The inner loop exits once %j.next reaches 256, i.e. after 256 iterations.
%j.next = add nuw nsw i64 %j, 1
%exitcond.not = icmp eq i64 %j.next, 256
br i1 %exitcond.not, label %outer.latch, label %inner.body
outer.latch:
; The outer loop exits once %i.next reaches 64257 (= 65536 - 1024 - 255).
%i.next = add nuw nsw i64 %i, 1
%outerexitcond.not = icmp eq i64 %i.next, 64257
br i1 %outerexitcond.not, label %exit, label %outer.header
exit:
ret void
}
; Generated from the following C code (in the IR below, LEN is 256 and LEN2 is 65536):
; void different_stride_and_not_vectorizable(){
; for(int i = 0; i < LEN2; i++){
; for(int j = 0 ; j < LEN; j++){
; a[i + j + LEN] += a[i + 4*j];
; }
; }
; }
; The load and store have different strides, but the store and load are not at a
; safe distance away from each other, thus not safe for vectorization.
define void @different_stride_and_not_vectorizable() {
; CHECK-LABEL: 'different_stride_and_not_vectorizable'
; CHECK-NEXT: inner.body:
; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
; CHECK-NEXT: Unknown data dependence.
; CHECK-NEXT: Dependences:
; CHECK-NEXT: Unknown:
; CHECK-NEXT: %3 = load float, ptr %arrayidx, align 4 ->
; CHECK-NEXT: store float %add9, ptr %arrayidx8, align 4
; CHECK-EMPTY:
; CHECK-NEXT: Forward:
; CHECK-NEXT: %5 = load float, ptr %arrayidx8, align 4 ->
; CHECK-NEXT: store float %add9, ptr %arrayidx8, align 4
; CHECK-EMPTY:
; CHECK-NEXT: Run-time memory checks:
; CHECK-NEXT: Grouped accesses:
; CHECK-EMPTY:
; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
; CHECK-NEXT: SCEV assumptions:
; CHECK-EMPTY:
; CHECK-NEXT: Expressions re-written:
; CHECK-NEXT: outer.header:
; CHECK-NEXT: Report: loop is not the innermost loop
; CHECK-NEXT: Dependences:
; CHECK-NEXT: Run-time memory checks:
; CHECK-NEXT: Grouped accesses:
; CHECK-EMPTY:
; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
; CHECK-NEXT: SCEV assumptions:
; CHECK-EMPTY:
; CHECK-NEXT: Expressions re-written:
;
; Two-level loop nest over @a with a store offset of 256 elements.
; The SSA value names below appear verbatim in the expected output, so they must
; not be renamed without regenerating the assertions.
entry:
br label %outer.header
; Outer loop: %i is the outer induction variable, starting at 0.
outer.header:
%i = phi i64 [ 0, %entry ], [ %i.next, %outer.latch ]
; %0 = %i + 256 is the loop-invariant part of the store index.
%0 = add nuw nsw i64 %i, 256
br label %inner.body
; Inner loop: %j is the inner induction variable, starting at 0.
inner.body:
%j = phi i64 [ 0, %outer.header ], [ %j.next, %inner.body ]
; Strided load index: %2 = 4 * %j + %i, so this address advances 4 floats
; (16 bytes) per inner iteration.
%1 = shl nuw nsw i64 %j, 2
%2 = add nuw nsw i64 %1, %i
%arrayidx = getelementptr inbounds [65536 x float], ptr @a, i64 0, i64 %2
%3 = load float, ptr %arrayidx, align 4
; Unit-stride index: %4 = %i + 256 + %j, advancing 1 float (4 bytes) per inner
; iteration. This address is both loaded (%5) and stored to.
; Within one iteration the store index minus the strided load index is
; 256 - 3 * %j, which changes sign inside the 256 inner iterations (positive up
; to %j = 85, negative from %j = 86); presumably this is why the dependence is
; expected to be reported as Unknown.
%4 = add nuw nsw i64 %0, %j
%arrayidx8 = getelementptr inbounds [65536 x float], ptr @a, i64 0, i64 %4
%5 = load float, ptr %arrayidx8, align 4
%add9 = fadd fast float %5, %3
store float %add9, ptr %arrayidx8, align 4
; The inner loop exits once %j.next reaches 256, i.e. after 256 iterations.
%j.next = add nuw nsw i64 %j, 1
%exitcond.not = icmp eq i64 %j.next, 256
br i1 %exitcond.not, label %outer.latch, label %inner.body
outer.latch:
; The outer loop exits once %i.next reaches 65536.
; NOTE(review): with %i up to 65535, both indices (%2 up to 66555, %4 up to
; 66046) run past the 65536-element array; presumably irrelevant to the
; dependence analysis under test - confirm.
%i.next = add nuw nsw i64 %i, 1
%exitcond29.not = icmp eq i64 %i.next, 65536
br i1 %exitcond29.not, label %exit, label %outer.header
exit:
ret void
}