Luke Lau fe4f6c1a58
[RISCV] Cost bf16/f16 vector non-unit memory accesses as legal without zvfhmin/zvfbfmin (#150882)
When vectorizing with predication, some loops that were previously
vectorized without zvfhmin/zvfbfmin will no longer be vectorized, because
the masked load/store or gather/scatter cost is now reported as invalid.

This is due to a discrepancy: for these costs we check
isLegalElementTypeForRVV, but for regular memory accesses we don't.

But for bf16 and f16 vectors we don't actually need the extension
support for loads and stores, so this adds a new function which takes
this into account.
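
A minimal sketch of that distinction (the new function's name and signature are assumptions for illustration; the declared fallback stands in for the existing isLegalElementTypeForRVV check, which ties f16/bf16 legality to zvfhmin/zvfbfmin and e.g. i64 to zve64x):

```cpp
#include "llvm/IR/Type.h"
using namespace llvm;

// Stand-in for the existing element-type legality check (requires
// zvfhmin/zvfbfmin for f16/bf16, zve64x for i64, and so on).
bool isLegalElementTypeForRVV(Type *ScalarTy);

// Sketch of a load/store-specific query: f16/bf16 loads and stores only move
// bits and don't need zvfhmin/zvfbfmin, so accept them up front and defer
// everything else to the stricter element-type check.
bool isLegalLoadStoreElementTypeForRVV(Type *ScalarTy) {
  if (ScalarTy->isHalfTy() || ScalarTy->isBFloatTy())
    return true;
  return isLegalElementTypeForRVV(ScalarTy);
}
```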

For regular memory accesses we should probably also return an invalid cost
for e.g. i64 elements on zve32x, but it doesn't look like we have tests for
this yet.
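
A rough illustration of what that could look like (a sketch only: InstructionCost and its getInvalid() factory are the real LLVM API, but the helper and its parameters are hypothetical):

```cpp
#include "llvm/Support/InstructionCost.h"
using namespace llvm;

// Hypothetical guard for the regular (unmasked) memory-op cost path: if the
// element type isn't legal for RVV at all (e.g. i64 with only zve32x), hand
// back an invalid cost so the vectorizer discards that plan instead of
// mis-costing it.
InstructionCost guardRegularMemOpCost(bool ElementTypeLegalForRVV,
                                      InstructionCost LegalCost) {
  if (!ElementTypeLegalForRVV)
    return InstructionCost::getInvalid();
  return LegalCost;
}
```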

We should probably also not be vectorizing these bf16/f16 loops in the first
place if we don't have zvfhmin/zvfbfmin and zfhmin/zfbfmin; I think this is
due to the scalar costs being too cheap. I've added tests for this in
a100f6367205c6a909d68027af6a8675a8091bd9, to be fixed in another patch.
2025-07-28 22:59:49 +08:00

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt < %s -passes=loop-vectorize -mtriple riscv64 -mattr=+v -S | FileCheck %s -check-prefix=NO-ZVFHMIN
; RUN: opt < %s -passes=loop-vectorize -mtriple riscv64 -mattr=+v -S -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue | FileCheck %s -check-prefix=NO-ZVFHMIN-PREDICATED
; RUN: opt < %s -passes=loop-vectorize -mtriple riscv64 -mattr=+v,+zvfhmin -S | FileCheck %s -check-prefix=ZVFHMIN
define void @fadd(ptr noalias %a, ptr noalias %b, i64 %n) {
; NO-ZVFHMIN-LABEL: define void @fadd(
; NO-ZVFHMIN-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
; NO-ZVFHMIN-NEXT: [[ENTRY:.*]]:
; NO-ZVFHMIN-NEXT: br label %[[LOOP:.*]]
; NO-ZVFHMIN: [[LOOP]]:
; NO-ZVFHMIN-NEXT: [[I:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ]
; NO-ZVFHMIN-NEXT: [[A_GEP:%.*]] = getelementptr half, ptr [[A]], i64 [[I]]
; NO-ZVFHMIN-NEXT: [[B_GEP:%.*]] = getelementptr half, ptr [[B]], i64 [[I]]
; NO-ZVFHMIN-NEXT: [[X:%.*]] = load half, ptr [[A_GEP]], align 2
; NO-ZVFHMIN-NEXT: [[Y:%.*]] = load half, ptr [[B_GEP]], align 2
; NO-ZVFHMIN-NEXT: [[Z:%.*]] = fadd half [[X]], [[Y]]
; NO-ZVFHMIN-NEXT: store half [[Z]], ptr [[A_GEP]], align 2
; NO-ZVFHMIN-NEXT: [[I_NEXT]] = add i64 [[I]], 1
; NO-ZVFHMIN-NEXT: [[DONE:%.*]] = icmp eq i64 [[I_NEXT]], [[N]]
; NO-ZVFHMIN-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[LOOP]]
; NO-ZVFHMIN: [[EXIT]]:
; NO-ZVFHMIN-NEXT: ret void
;
; NO-ZVFHMIN-PREDICATED-LABEL: define void @fadd(
; NO-ZVFHMIN-PREDICATED-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
; NO-ZVFHMIN-PREDICATED-NEXT: [[ENTRY:.*]]:
; NO-ZVFHMIN-PREDICATED-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; NO-ZVFHMIN-PREDICATED: [[VECTOR_PH]]:
; NO-ZVFHMIN-PREDICATED-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 15
; NO-ZVFHMIN-PREDICATED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16
; NO-ZVFHMIN-PREDICATED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; NO-ZVFHMIN-PREDICATED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
; NO-ZVFHMIN-PREDICATED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
; NO-ZVFHMIN-PREDICATED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer
; NO-ZVFHMIN-PREDICATED-NEXT: br label %[[VECTOR_BODY:.*]]
; NO-ZVFHMIN-PREDICATED: [[VECTOR_BODY]]:
; NO-ZVFHMIN-PREDICATED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; NO-ZVFHMIN-PREDICATED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i64> poison, i64 [[INDEX]], i64 0
; NO-ZVFHMIN-PREDICATED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT1]], <16 x i64> poison, <16 x i32> zeroinitializer
; NO-ZVFHMIN-PREDICATED-NEXT: [[VEC_IV:%.*]] = add <16 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
; NO-ZVFHMIN-PREDICATED-NEXT: [[TMP0:%.*]] = icmp ule <16 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]]
; NO-ZVFHMIN-PREDICATED-NEXT: [[TMP1:%.*]] = getelementptr half, ptr [[A]], i64 [[INDEX]]
; NO-ZVFHMIN-PREDICATED-NEXT: [[TMP2:%.*]] = getelementptr half, ptr [[B]], i64 [[INDEX]]
; NO-ZVFHMIN-PREDICATED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x half> @llvm.masked.load.v16f16.p0(ptr [[TMP1]], i32 2, <16 x i1> [[TMP0]], <16 x half> poison)
; NO-ZVFHMIN-PREDICATED-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <16 x half> @llvm.masked.load.v16f16.p0(ptr [[TMP2]], i32 2, <16 x i1> [[TMP0]], <16 x half> poison)
; NO-ZVFHMIN-PREDICATED-NEXT: [[TMP3:%.*]] = fadd <16 x half> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD3]]
; NO-ZVFHMIN-PREDICATED-NEXT: call void @llvm.masked.store.v16f16.p0(<16 x half> [[TMP3]], ptr [[TMP1]], i32 2, <16 x i1> [[TMP0]])
; NO-ZVFHMIN-PREDICATED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
; NO-ZVFHMIN-PREDICATED-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; NO-ZVFHMIN-PREDICATED-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; NO-ZVFHMIN-PREDICATED: [[MIDDLE_BLOCK]]:
; NO-ZVFHMIN-PREDICATED-NEXT: br label %[[EXIT:.*]]
; NO-ZVFHMIN-PREDICATED: [[SCALAR_PH]]:
; NO-ZVFHMIN-PREDICATED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
; NO-ZVFHMIN-PREDICATED-NEXT: br label %[[LOOP:.*]]
; NO-ZVFHMIN-PREDICATED: [[LOOP]]:
; NO-ZVFHMIN-PREDICATED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ]
; NO-ZVFHMIN-PREDICATED-NEXT: [[A_GEP:%.*]] = getelementptr half, ptr [[A]], i64 [[I]]
; NO-ZVFHMIN-PREDICATED-NEXT: [[B_GEP:%.*]] = getelementptr half, ptr [[B]], i64 [[I]]
; NO-ZVFHMIN-PREDICATED-NEXT: [[X:%.*]] = load half, ptr [[A_GEP]], align 2
; NO-ZVFHMIN-PREDICATED-NEXT: [[Y:%.*]] = load half, ptr [[B_GEP]], align 2
; NO-ZVFHMIN-PREDICATED-NEXT: [[Z:%.*]] = fadd half [[X]], [[Y]]
; NO-ZVFHMIN-PREDICATED-NEXT: store half [[Z]], ptr [[A_GEP]], align 2
; NO-ZVFHMIN-PREDICATED-NEXT: [[I_NEXT]] = add i64 [[I]], 1
; NO-ZVFHMIN-PREDICATED-NEXT: [[DONE:%.*]] = icmp eq i64 [[I_NEXT]], [[N]]
; NO-ZVFHMIN-PREDICATED-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
; NO-ZVFHMIN-PREDICATED: [[EXIT]]:
; NO-ZVFHMIN-PREDICATED-NEXT: ret void
;
; ZVFHMIN-LABEL: define void @fadd(
; ZVFHMIN-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
; ZVFHMIN-NEXT: [[ENTRY:.*]]:
; ZVFHMIN-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
; ZVFHMIN-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 8
; ZVFHMIN-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP8]]
; ZVFHMIN-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; ZVFHMIN: [[VECTOR_PH]]:
; ZVFHMIN-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
; ZVFHMIN-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 8
; ZVFHMIN-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP10]]
; ZVFHMIN-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
; ZVFHMIN-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
; ZVFHMIN-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP12]], 8
; ZVFHMIN-NEXT: br label %[[VECTOR_BODY:.*]]
; ZVFHMIN: [[VECTOR_BODY]]:
; ZVFHMIN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; ZVFHMIN-NEXT: [[TMP1:%.*]] = getelementptr half, ptr [[A]], i64 [[INDEX]]
; ZVFHMIN-NEXT: [[TMP2:%.*]] = getelementptr half, ptr [[B]], i64 [[INDEX]]
; ZVFHMIN-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x half>, ptr [[TMP1]], align 2
; ZVFHMIN-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x half>, ptr [[TMP2]], align 2
; ZVFHMIN-NEXT: [[TMP11:%.*]] = fadd <vscale x 8 x half> [[WIDE_LOAD]], [[WIDE_LOAD1]]
; ZVFHMIN-NEXT: store <vscale x 8 x half> [[TMP11]], ptr [[TMP1]], align 2
; ZVFHMIN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; ZVFHMIN-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; ZVFHMIN-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; ZVFHMIN: [[MIDDLE_BLOCK]]:
; ZVFHMIN-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; ZVFHMIN-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
; ZVFHMIN: [[SCALAR_PH]]:
; ZVFHMIN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
; ZVFHMIN-NEXT: br label %[[LOOP:.*]]
; ZVFHMIN: [[LOOP]]:
; ZVFHMIN-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ]
; ZVFHMIN-NEXT: [[A_GEP:%.*]] = getelementptr half, ptr [[A]], i64 [[I]]
; ZVFHMIN-NEXT: [[B_GEP:%.*]] = getelementptr half, ptr [[B]], i64 [[I]]
; ZVFHMIN-NEXT: [[X:%.*]] = load half, ptr [[A_GEP]], align 2
; ZVFHMIN-NEXT: [[Y:%.*]] = load half, ptr [[B_GEP]], align 2
; ZVFHMIN-NEXT: [[Z:%.*]] = fadd half [[X]], [[Y]]
; ZVFHMIN-NEXT: store half [[Z]], ptr [[A_GEP]], align 2
; ZVFHMIN-NEXT: [[I_NEXT]] = add i64 [[I]], 1
; ZVFHMIN-NEXT: [[DONE:%.*]] = icmp eq i64 [[I_NEXT]], [[N]]
; ZVFHMIN-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
; ZVFHMIN: [[EXIT]]:
; ZVFHMIN-NEXT: ret void
;
entry:
br label %loop
loop:
%i = phi i64 [0, %entry], [%i.next, %loop]
%a.gep = getelementptr half, ptr %a, i64 %i
%b.gep = getelementptr half, ptr %b, i64 %i
%x = load half, ptr %a.gep
%y = load half, ptr %b.gep
%z = fadd half %x, %y
store half %z, ptr %a.gep
%i.next = add i64 %i, 1
%done = icmp eq i64 %i.next, %n
br i1 %done, label %exit, label %loop
exit:
ret void
}
;.
; NO-ZVFHMIN-PREDICATED: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
; NO-ZVFHMIN-PREDICATED: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
; NO-ZVFHMIN-PREDICATED: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
; NO-ZVFHMIN-PREDICATED: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
;.
; ZVFHMIN: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
; ZVFHMIN: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
; ZVFHMIN: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
; ZVFHMIN: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
;.