[RISCV] Cost bf16/f16 vector non-unit memory accesses as legal without zvfhmin/zvfbfmin (#150882)
When vectorizing with predication, some loops that were previously vectorized without zvfhmin/zvfbfmin are no longer vectorized, because the masked load/store and gather/scatter costs come back as illegal. The cause is a discrepancy: for those costs we check isLegalElementTypeForRVV, but for regular memory accesses we don't. Since bf16 and f16 vectors don't actually need the extensions for plain loads and stores, this adds a new function, isLegalLoadStoreElementTypeForRVV, that takes this into account (see the sketch below the commit metadata).

For regular memory accesses we should probably also return an invalid cost for, e.g., i64 elements on zve32x, but we don't appear to have tests for that yet. We also probably shouldn't be vectorizing these bf16/f16 loops in the first place when zvfhmin/zvfbfmin and zfhmin/zfbfmin are unavailable; I think this is due to the scalar costs being too cheap. I've added tests for this in a100f6367205c6a909d68027af6a8675a8091bd9, to be fixed in a follow-up patch.
commit fe4f6c1a58 (parent 4b1d5b8d4f)
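For context, a minimal standalone sketch of the distinction this patch draws, under stated assumptions: the Subtarget struct, enum, and flag names below are simplified stand-ins for the real LLVM subtarget features, not the actual API. The point is that f16/bf16 are legal element types for vector arithmetic only with zvfhmin/zvfbfmin, while plain loads and stores of those types only need vector support for the element width.

#include <cstdio>

// Hypothetical, simplified stand-ins for the RISC-V subtarget features
// involved (not the real LLVM Subtarget interface).
struct Subtarget {
  bool HasVInstructionsF16Minimal;  // zvfhmin
  bool HasVInstructionsBF16Minimal; // zvfbfmin
  bool HasVInstructionsI64;         // 64-bit vector elements (not zve32x)
};

enum class ElemTy { I8, I16, I32, I64, F16, BF16, F32, F64 };

// Mirrors the pre-existing check: f16/bf16 are legal element types for
// arithmetic only if the matching "min" extension is present.
bool isLegalElementType(const Subtarget &ST, ElemTy T) {
  switch (T) {
  case ElemTy::F16:  return ST.HasVInstructionsF16Minimal;
  case ElemTy::BF16: return ST.HasVInstructionsBF16Minimal;
  case ElemTy::I64:
  case ElemTy::F64:  return ST.HasVInstructionsI64;
  default:           return true;
  }
}

// Mirrors the new check: plain loads/stores only care about the element
// width, so f16/bf16 are fine even without zvfhmin/zvfbfmin.
bool isLegalLoadStoreElementType(const Subtarget &ST, ElemTy T) {
  switch (T) {
  case ElemTy::I64:
  case ElemTy::F64:  return ST.HasVInstructionsI64;
  default:           return true;
  }
}

int main() {
  Subtarget ST{/*zvfhmin=*/false, /*zvfbfmin=*/false, /*I64=*/true}; // e.g. +v only
  std::printf("f16 arithmetic legal:  %d\n", isLegalElementType(ST, ElemTy::F16));          // 0
  std::printf("f16 load/store legal:  %d\n", isLegalLoadStoreElementType(ST, ElemTy::F16)); // 1
}

The masked-access legality hooks in the diff below switch from the first kind of predicate to the second, which is what restores the cheap masked load/store costs in the cost-model test changes.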
@@ -2739,6 +2739,27 @@ bool RISCVTargetLowering::isLegalElementTypeForRVV(EVT ScalarTy) const {
   }
 }
 
+bool RISCVTargetLowering::isLegalLoadStoreElementTypeForRVV(
+    EVT ScalarTy) const {
+  if (!ScalarTy.isSimple())
+    return false;
+  switch (ScalarTy.getSimpleVT().SimpleTy) {
+  case MVT::iPTR:
+    return Subtarget.is64Bit() ? Subtarget.hasVInstructionsI64() : true;
+  case MVT::i8:
+  case MVT::i16:
+  case MVT::i32:
+  case MVT::f16:
+  case MVT::bf16:
+  case MVT::f32:
+    return true;
+  case MVT::i64:
+  case MVT::f64:
+    return Subtarget.hasVInstructionsI64();
+  default:
+    return false;
+  }
+}
+
 unsigned RISCVTargetLowering::combineRepeatedFPDivisors() const {
   return NumRepeatedDivisors;
@@ -24239,7 +24260,7 @@ bool RISCVTargetLowering::isLegalStridedLoadStore(EVT DataType,
     return false;
 
   EVT ScalarType = DataType.getScalarType();
-  if (!isLegalElementTypeForRVV(ScalarType))
+  if (!isLegalLoadStoreElementTypeForRVV(ScalarType))
     return false;
 
   if (!Subtarget.enableUnalignedVectorMem() &&
@@ -384,6 +384,7 @@ public:
   bool shouldRemoveExtendFromGSIndex(SDValue Extend, EVT DataVT) const override;
 
   bool isLegalElementTypeForRVV(EVT ScalarTy) const;
+  bool isLegalLoadStoreElementTypeForRVV(EVT ScalarTy) const;
 
   bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override;
 
@@ -32,7 +32,7 @@ bool RISCVTargetLowering::isLegalInterleavedAccessType(
   if (!isTypeLegal(VT))
     return false;
 
-  if (!isLegalElementTypeForRVV(VT.getScalarType()) ||
+  if (!isLegalLoadStoreElementTypeForRVV(VT.getScalarType()) ||
       !allowsMemoryAccessForAlignment(VTy->getContext(), DL, VT, AddrSpace,
                                       Alignment))
     return false;
@@ -265,7 +265,7 @@ public:
     if (!ST->enableUnalignedVectorMem() && Alignment < ElemType.getStoreSize())
       return false;
 
-    return TLI->isLegalElementTypeForRVV(ElemType);
+    return TLI->isLegalLoadStoreElementTypeForRVV(ElemType);
   }
 
   bool isLegalMaskedLoad(Type *DataType, Align Alignment,
@@ -297,7 +297,7 @@ public:
     if (!ST->enableUnalignedVectorMem() && Alignment < ElemType.getStoreSize())
       return false;
 
-    return TLI->isLegalElementTypeForRVV(ElemType);
+    return TLI->isLegalLoadStoreElementTypeForRVV(ElemType);
   }
 
   bool isLegalMaskedGather(Type *DataType, Align Alignment) const override {
@@ -13,14 +13,14 @@ define void @fixed() {
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i32> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i32> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i64> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %v2f16 = call <2 x half> @llvm.masked.load.v2f16.p0(ptr undef, i32 8, <2 x i1> undef, <2 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %v4f16 = call <4 x half> @llvm.masked.load.v4f16.p0(ptr undef, i32 8, <4 x i1> undef, <4 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %v8f16 = call <8 x half> @llvm.masked.load.v8f16.p0(ptr undef, i32 8, <8 x i1> undef, <8 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = call <2 x half> @llvm.masked.load.v2f16.p0(ptr undef, i32 8, <2 x i1> undef, <2 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v4f16 = call <4 x half> @llvm.masked.load.v4f16.p0(ptr undef, i32 8, <4 x i1> undef, <4 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8f16 = call <8 x half> @llvm.masked.load.v8f16.p0(ptr undef, i32 8, <8 x i1> undef, <8 x half> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 8, <2 x i1> undef, <2 x float> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4f32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 8, <4 x i1> undef, <4 x float> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 8, <2 x i1> undef, <2 x double> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i64> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 223 for instruction: %v32f16 = call <32 x half> @llvm.masked.load.v32f16.p0(ptr undef, i32 8, <32 x i1> undef, <32 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v32f16 = call <32 x half> @llvm.masked.load.v32f16.p0(ptr undef, i32 8, <32 x i1> undef, <32 x half> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 entry:
@@ -874,3 +874,79 @@ define void @load_factor2_fp128(ptr %ptr) {
   %v1 = shufflevector <4 x fp128> %interleaved.vec, <4 x fp128> poison, <2 x i32> <i32 1, i32 3>
   ret void
 }
+
+define void @load_factor2_f32(ptr %ptr) {
+; RV32-LABEL: @load_factor2_f32(
+; RV32-NEXT: [[TMP1:%.*]] = call { <8 x float>, <8 x float> } @llvm.riscv.seg2.load.mask.v8f32.p0.i32(ptr [[PTR:%.*]], <8 x i1> splat (i1 true), i32 8)
+; RV32-NEXT: [[TMP2:%.*]] = extractvalue { <8 x float>, <8 x float> } [[TMP1]], 1
+; RV32-NEXT: [[TMP3:%.*]] = extractvalue { <8 x float>, <8 x float> } [[TMP1]], 0
+; RV32-NEXT: ret void
+;
+; RV64-LABEL: @load_factor2_f32(
+; RV64-NEXT: [[TMP1:%.*]] = call { <8 x float>, <8 x float> } @llvm.riscv.seg2.load.mask.v8f32.p0.i64(ptr [[PTR:%.*]], <8 x i1> splat (i1 true), i64 8)
+; RV64-NEXT: [[TMP2:%.*]] = extractvalue { <8 x float>, <8 x float> } [[TMP1]], 1
+; RV64-NEXT: [[TMP3:%.*]] = extractvalue { <8 x float>, <8 x float> } [[TMP1]], 0
+; RV64-NEXT: ret void
+;
+  %interleaved.vec = load <16 x float>, ptr %ptr
+  %v0 = shufflevector <16 x float> %interleaved.vec, <16 x float> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %v1 = shufflevector <16 x float> %interleaved.vec, <16 x float> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  ret void
+}
+
+define void @load_factor2_f64(ptr %ptr) {
+; RV32-LABEL: @load_factor2_f64(
+; RV32-NEXT: [[TMP1:%.*]] = call { <8 x double>, <8 x double> } @llvm.riscv.seg2.load.mask.v8f64.p0.i32(ptr [[PTR:%.*]], <8 x i1> splat (i1 true), i32 8)
+; RV32-NEXT: [[TMP2:%.*]] = extractvalue { <8 x double>, <8 x double> } [[TMP1]], 1
+; RV32-NEXT: [[TMP3:%.*]] = extractvalue { <8 x double>, <8 x double> } [[TMP1]], 0
+; RV32-NEXT: ret void
+;
+; RV64-LABEL: @load_factor2_f64(
+; RV64-NEXT: [[TMP1:%.*]] = call { <8 x double>, <8 x double> } @llvm.riscv.seg2.load.mask.v8f64.p0.i64(ptr [[PTR:%.*]], <8 x i1> splat (i1 true), i64 8)
+; RV64-NEXT: [[TMP2:%.*]] = extractvalue { <8 x double>, <8 x double> } [[TMP1]], 1
+; RV64-NEXT: [[TMP3:%.*]] = extractvalue { <8 x double>, <8 x double> } [[TMP1]], 0
+; RV64-NEXT: ret void
+;
+  %interleaved.vec = load <16 x double>, ptr %ptr
+  %v0 = shufflevector <16 x double> %interleaved.vec, <16 x double> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %v1 = shufflevector <16 x double> %interleaved.vec, <16 x double> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  ret void
+}
+
+define void @load_factor2_bf16(ptr %ptr) {
+; RV32-LABEL: @load_factor2_bf16(
+; RV32-NEXT: [[INTERLEAVED_VEC:%.*]] = load <16 x bfloat>, ptr [[PTR:%.*]], align 32
+; RV32-NEXT: [[V0:%.*]] = shufflevector <16 x bfloat> [[INTERLEAVED_VEC]], <16 x bfloat> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; RV32-NEXT: [[V1:%.*]] = shufflevector <16 x bfloat> [[INTERLEAVED_VEC]], <16 x bfloat> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; RV32-NEXT: ret void
+;
+; RV64-LABEL: @load_factor2_bf16(
+; RV64-NEXT: [[INTERLEAVED_VEC:%.*]] = load <16 x bfloat>, ptr [[PTR:%.*]], align 32
+; RV64-NEXT: [[V0:%.*]] = shufflevector <16 x bfloat> [[INTERLEAVED_VEC]], <16 x bfloat> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; RV64-NEXT: [[V1:%.*]] = shufflevector <16 x bfloat> [[INTERLEAVED_VEC]], <16 x bfloat> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; RV64-NEXT: ret void
+;
+  %interleaved.vec = load <16 x bfloat>, ptr %ptr
+  %v0 = shufflevector <16 x bfloat> %interleaved.vec, <16 x bfloat> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %v1 = shufflevector <16 x bfloat> %interleaved.vec, <16 x bfloat> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  ret void
+}
+
+define void @load_factor2_f16(ptr %ptr) {
+; RV32-LABEL: @load_factor2_f16(
+; RV32-NEXT: [[INTERLEAVED_VEC:%.*]] = load <16 x half>, ptr [[PTR:%.*]], align 32
+; RV32-NEXT: [[V0:%.*]] = shufflevector <16 x half> [[INTERLEAVED_VEC]], <16 x half> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; RV32-NEXT: [[V1:%.*]] = shufflevector <16 x half> [[INTERLEAVED_VEC]], <16 x half> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; RV32-NEXT: ret void
+;
+; RV64-LABEL: @load_factor2_f16(
+; RV64-NEXT: [[INTERLEAVED_VEC:%.*]] = load <16 x half>, ptr [[PTR:%.*]], align 32
+; RV64-NEXT: [[V0:%.*]] = shufflevector <16 x half> [[INTERLEAVED_VEC]], <16 x half> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; RV64-NEXT: [[V1:%.*]] = shufflevector <16 x half> [[INTERLEAVED_VEC]], <16 x half> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; RV64-NEXT: ret void
+;
+  %interleaved.vec = load <16 x half>, ptr %ptr
+  %v0 = shufflevector <16 x half> %interleaved.vec, <16 x half> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %v1 = shufflevector <16 x half> %interleaved.vec, <16 x half> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  ret void
+}
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt < %s -passes=loop-vectorize -mtriple riscv64 -mattr=+v -S | FileCheck %s -check-prefix=NO-ZVFBFMIN
+; RUN: opt < %s -passes=loop-vectorize -mtriple riscv64 -mattr=+v -S -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue | FileCheck %s -check-prefix=NO-ZVFBFMIN-PREDICATED
 ; RUN: opt < %s -passes=loop-vectorize -mtriple riscv64 -mattr=+v,+zvfbfmin -S | FileCheck %s -check-prefix=ZVFBFMIN
 
 define void @fadd(ptr noalias %a, ptr noalias %b, i64 %n) {
@@ -21,6 +22,52 @@ define void @fadd(ptr noalias %a, ptr noalias %b, i64 %n) {
 ; NO-ZVFBFMIN: [[EXIT]]:
 ; NO-ZVFBFMIN-NEXT: ret void
 ;
+; NO-ZVFBFMIN-PREDICATED-LABEL: define void @fadd(
+; NO-ZVFBFMIN-PREDICATED-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[ENTRY:.*]]:
+; NO-ZVFBFMIN-PREDICATED-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; NO-ZVFBFMIN-PREDICATED: [[VECTOR_PH]]:
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 15
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer
+; NO-ZVFBFMIN-PREDICATED-NEXT: br label %[[VECTOR_BODY:.*]]
+; NO-ZVFBFMIN-PREDICATED: [[VECTOR_BODY]]:
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i64> poison, i64 [[INDEX]], i64 0
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT1]], <16 x i64> poison, <16 x i32> zeroinitializer
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[VEC_IV:%.*]] = add <16 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[TMP0:%.*]] = icmp ule <16 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]]
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[TMP1:%.*]] = getelementptr bfloat, ptr [[A]], i64 [[INDEX]]
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[TMP2:%.*]] = getelementptr bfloat, ptr [[B]], i64 [[INDEX]]
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x bfloat> @llvm.masked.load.v16bf16.p0(ptr [[TMP1]], i32 2, <16 x i1> [[TMP0]], <16 x bfloat> poison)
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <16 x bfloat> @llvm.masked.load.v16bf16.p0(ptr [[TMP2]], i32 2, <16 x i1> [[TMP0]], <16 x bfloat> poison)
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[TMP3:%.*]] = fadd <16 x bfloat> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD3]]
+; NO-ZVFBFMIN-PREDICATED-NEXT: call void @llvm.masked.store.v16bf16.p0(<16 x bfloat> [[TMP3]], ptr [[TMP1]], i32 2, <16 x i1> [[TMP0]])
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-ZVFBFMIN-PREDICATED-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NO-ZVFBFMIN-PREDICATED: [[MIDDLE_BLOCK]]:
+; NO-ZVFBFMIN-PREDICATED-NEXT: br label %[[EXIT:.*]]
+; NO-ZVFBFMIN-PREDICATED: [[SCALAR_PH]]:
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
+; NO-ZVFBFMIN-PREDICATED-NEXT: br label %[[LOOP:.*]]
+; NO-ZVFBFMIN-PREDICATED: [[LOOP]]:
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ]
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[A_GEP:%.*]] = getelementptr bfloat, ptr [[A]], i64 [[I]]
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[B_GEP:%.*]] = getelementptr bfloat, ptr [[B]], i64 [[I]]
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[X:%.*]] = load bfloat, ptr [[A_GEP]], align 2
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[Y:%.*]] = load bfloat, ptr [[B_GEP]], align 2
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[Z:%.*]] = fadd bfloat [[X]], [[Y]]
+; NO-ZVFBFMIN-PREDICATED-NEXT: store bfloat [[Z]], ptr [[A_GEP]], align 2
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[I_NEXT]] = add i64 [[I]], 1
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[DONE:%.*]] = icmp eq i64 [[I_NEXT]], [[N]]
+; NO-ZVFBFMIN-PREDICATED-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; NO-ZVFBFMIN-PREDICATED: [[EXIT]]:
+; NO-ZVFBFMIN-PREDICATED-NEXT: ret void
+;
 ; ZVFBFMIN-LABEL: define void @fadd(
 ; ZVFBFMIN-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
 ; ZVFBFMIN-NEXT: [[ENTRY:.*]]:
@@ -133,6 +180,60 @@ define void @vfwmaccbf16.vv(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64
 ; NO-ZVFBFMIN: [[EXIT]]:
 ; NO-ZVFBFMIN-NEXT: ret void
 ;
+; NO-ZVFBFMIN-PREDICATED-LABEL: define void @vfwmaccbf16.vv(
+; NO-ZVFBFMIN-PREDICATED-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[ENTRY:.*]]:
+; NO-ZVFBFMIN-PREDICATED-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; NO-ZVFBFMIN-PREDICATED: [[VECTOR_PH]]:
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 3
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; NO-ZVFBFMIN-PREDICATED-NEXT: br label %[[VECTOR_BODY:.*]]
+; NO-ZVFBFMIN-PREDICATED: [[VECTOR_BODY]]:
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[I:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[I]], i64 0
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1, i64 2, i64 3>
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[TMP0:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]]
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[A_GEP:%.*]] = getelementptr bfloat, ptr [[A]], i64 [[I]]
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[B_GEP:%.*]] = getelementptr bfloat, ptr [[B]], i64 [[I]]
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[C_GEP:%.*]] = getelementptr float, ptr [[C]], i64 [[I]]
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x bfloat> @llvm.masked.load.v4bf16.p0(ptr [[A_GEP]], i32 2, <4 x i1> [[TMP0]], <4 x bfloat> poison)
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <4 x bfloat> @llvm.masked.load.v4bf16.p0(ptr [[B_GEP]], i32 2, <4 x i1> [[TMP0]], <4 x bfloat> poison)
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0(ptr [[C_GEP]], i32 4, <4 x i1> [[TMP0]], <4 x float> poison)
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[TMP4:%.*]] = fpext <4 x bfloat> [[WIDE_MASKED_LOAD]] to <4 x float>
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[TMP5:%.*]] = fpext <4 x bfloat> [[WIDE_MASKED_LOAD3]] to <4 x float>
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x float> [[WIDE_MASKED_LOAD4]])
+; NO-ZVFBFMIN-PREDICATED-NEXT: call void @llvm.masked.store.v4f32.p0(<4 x float> [[TMP6]], ptr [[C_GEP]], i32 4, <4 x i1> [[TMP0]])
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[INDEX_NEXT]] = add i64 [[I]], 4
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-ZVFBFMIN-PREDICATED-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; NO-ZVFBFMIN-PREDICATED: [[MIDDLE_BLOCK]]:
+; NO-ZVFBFMIN-PREDICATED-NEXT: br label %[[EXIT:.*]]
+; NO-ZVFBFMIN-PREDICATED: [[SCALAR_PH]]:
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
+; NO-ZVFBFMIN-PREDICATED-NEXT: br label %[[LOOP:.*]]
+; NO-ZVFBFMIN-PREDICATED: [[LOOP]]:
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ]
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[A_GEP1:%.*]] = getelementptr bfloat, ptr [[A]], i64 [[I1]]
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[B_GEP1:%.*]] = getelementptr bfloat, ptr [[B]], i64 [[I1]]
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[C_GEP1:%.*]] = getelementptr float, ptr [[C]], i64 [[I1]]
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[X:%.*]] = load bfloat, ptr [[A_GEP1]], align 2
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[Y:%.*]] = load bfloat, ptr [[B_GEP1]], align 2
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[Z:%.*]] = load float, ptr [[C_GEP1]], align 4
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[X_EXT:%.*]] = fpext bfloat [[X]] to float
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[Y_EXT:%.*]] = fpext bfloat [[Y]] to float
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[FMULADD:%.*]] = call float @llvm.fmuladd.f32(float [[X_EXT]], float [[Y_EXT]], float [[Z]])
+; NO-ZVFBFMIN-PREDICATED-NEXT: store float [[FMULADD]], ptr [[C_GEP1]], align 4
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[I_NEXT]] = add i64 [[I1]], 1
+; NO-ZVFBFMIN-PREDICATED-NEXT: [[DONE:%.*]] = icmp eq i64 [[I_NEXT]], [[N]]
+; NO-ZVFBFMIN-PREDICATED-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; NO-ZVFBFMIN-PREDICATED: [[EXIT]]:
+; NO-ZVFBFMIN-PREDICATED-NEXT: ret void
+;
 ; ZVFBFMIN-LABEL: define void @vfwmaccbf16.vv(
 ; ZVFBFMIN-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
 ; ZVFBFMIN-NEXT: [[ENTRY:.*]]:
@@ -213,6 +314,13 @@ exit:
 ; NO-ZVFBFMIN: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
 ; NO-ZVFBFMIN: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
 ;.
+; NO-ZVFBFMIN-PREDICATED: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; NO-ZVFBFMIN-PREDICATED: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; NO-ZVFBFMIN-PREDICATED: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; NO-ZVFBFMIN-PREDICATED: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; NO-ZVFBFMIN-PREDICATED: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; NO-ZVFBFMIN-PREDICATED: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+;.
 ; ZVFBFMIN: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
 ; ZVFBFMIN: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
 ; ZVFBFMIN: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt < %s -passes=loop-vectorize -mtriple riscv64 -mattr=+v -S | FileCheck %s -check-prefix=NO-ZVFHMIN
+; RUN: opt < %s -passes=loop-vectorize -mtriple riscv64 -mattr=+v -S -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue | FileCheck %s -check-prefix=NO-ZVFHMIN-PREDICATED
 ; RUN: opt < %s -passes=loop-vectorize -mtriple riscv64 -mattr=+v,+zvfhmin -S | FileCheck %s -check-prefix=ZVFHMIN
 
 define void @fadd(ptr noalias %a, ptr noalias %b, i64 %n) {
@@ -21,6 +22,52 @@ define void @fadd(ptr noalias %a, ptr noalias %b, i64 %n) {
 ; NO-ZVFHMIN: [[EXIT]]:
 ; NO-ZVFHMIN-NEXT: ret void
 ;
+; NO-ZVFHMIN-PREDICATED-LABEL: define void @fadd(
+; NO-ZVFHMIN-PREDICATED-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; NO-ZVFHMIN-PREDICATED-NEXT: [[ENTRY:.*]]:
+; NO-ZVFHMIN-PREDICATED-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; NO-ZVFHMIN-PREDICATED: [[VECTOR_PH]]:
+; NO-ZVFHMIN-PREDICATED-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 15
+; NO-ZVFHMIN-PREDICATED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 16
+; NO-ZVFHMIN-PREDICATED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
+; NO-ZVFHMIN-PREDICATED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
+; NO-ZVFHMIN-PREDICATED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0
+; NO-ZVFHMIN-PREDICATED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer
+; NO-ZVFHMIN-PREDICATED-NEXT: br label %[[VECTOR_BODY:.*]]
+; NO-ZVFHMIN-PREDICATED: [[VECTOR_BODY]]:
+; NO-ZVFHMIN-PREDICATED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-ZVFHMIN-PREDICATED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i64> poison, i64 [[INDEX]], i64 0
+; NO-ZVFHMIN-PREDICATED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT1]], <16 x i64> poison, <16 x i32> zeroinitializer
+; NO-ZVFHMIN-PREDICATED-NEXT: [[VEC_IV:%.*]] = add <16 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
+; NO-ZVFHMIN-PREDICATED-NEXT: [[TMP0:%.*]] = icmp ule <16 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]]
+; NO-ZVFHMIN-PREDICATED-NEXT: [[TMP1:%.*]] = getelementptr half, ptr [[A]], i64 [[INDEX]]
+; NO-ZVFHMIN-PREDICATED-NEXT: [[TMP2:%.*]] = getelementptr half, ptr [[B]], i64 [[INDEX]]
+; NO-ZVFHMIN-PREDICATED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x half> @llvm.masked.load.v16f16.p0(ptr [[TMP1]], i32 2, <16 x i1> [[TMP0]], <16 x half> poison)
+; NO-ZVFHMIN-PREDICATED-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <16 x half> @llvm.masked.load.v16f16.p0(ptr [[TMP2]], i32 2, <16 x i1> [[TMP0]], <16 x half> poison)
+; NO-ZVFHMIN-PREDICATED-NEXT: [[TMP3:%.*]] = fadd <16 x half> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD3]]
+; NO-ZVFHMIN-PREDICATED-NEXT: call void @llvm.masked.store.v16f16.p0(<16 x half> [[TMP3]], ptr [[TMP1]], i32 2, <16 x i1> [[TMP0]])
+; NO-ZVFHMIN-PREDICATED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; NO-ZVFHMIN-PREDICATED-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-ZVFHMIN-PREDICATED-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NO-ZVFHMIN-PREDICATED: [[MIDDLE_BLOCK]]:
+; NO-ZVFHMIN-PREDICATED-NEXT: br label %[[EXIT:.*]]
+; NO-ZVFHMIN-PREDICATED: [[SCALAR_PH]]:
+; NO-ZVFHMIN-PREDICATED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ]
+; NO-ZVFHMIN-PREDICATED-NEXT: br label %[[LOOP:.*]]
+; NO-ZVFHMIN-PREDICATED: [[LOOP]]:
+; NO-ZVFHMIN-PREDICATED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ]
+; NO-ZVFHMIN-PREDICATED-NEXT: [[A_GEP:%.*]] = getelementptr half, ptr [[A]], i64 [[I]]
+; NO-ZVFHMIN-PREDICATED-NEXT: [[B_GEP:%.*]] = getelementptr half, ptr [[B]], i64 [[I]]
+; NO-ZVFHMIN-PREDICATED-NEXT: [[X:%.*]] = load half, ptr [[A_GEP]], align 2
+; NO-ZVFHMIN-PREDICATED-NEXT: [[Y:%.*]] = load half, ptr [[B_GEP]], align 2
+; NO-ZVFHMIN-PREDICATED-NEXT: [[Z:%.*]] = fadd half [[X]], [[Y]]
+; NO-ZVFHMIN-PREDICATED-NEXT: store half [[Z]], ptr [[A_GEP]], align 2
+; NO-ZVFHMIN-PREDICATED-NEXT: [[I_NEXT]] = add i64 [[I]], 1
+; NO-ZVFHMIN-PREDICATED-NEXT: [[DONE:%.*]] = icmp eq i64 [[I_NEXT]], [[N]]
+; NO-ZVFHMIN-PREDICATED-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; NO-ZVFHMIN-PREDICATED: [[EXIT]]:
+; NO-ZVFHMIN-PREDICATED-NEXT: ret void
+;
 ; ZVFHMIN-LABEL: define void @fadd(
 ; ZVFHMIN-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
 ; ZVFHMIN-NEXT: [[ENTRY:.*]]:
@@ -84,6 +131,11 @@ exit:
   ret void
 }
 ;.
+; NO-ZVFHMIN-PREDICATED: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; NO-ZVFHMIN-PREDICATED: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; NO-ZVFHMIN-PREDICATED: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; NO-ZVFHMIN-PREDICATED: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.
 ; ZVFHMIN: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
 ; ZVFHMIN: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
 ; ZVFHMIN: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}