From bfb6f47e9ea463555833934ef714b03ee78eb01e Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood@arm.com>
Date: Mon, 28 Nov 2022 11:43:51 +0000
Subject: [PATCH] [SVE] Change some bfloat lane intrinsics to use i32
 immediates

Almost all of the other SVE LLVM IR intrinsics take i32 values
for lane indices or other immediates. We should bring the bfloat
intrinsics in line with that. It will also make it easier to
add support for the SVE2.1 float intrinsics in future, since
they reuse the same underlying instruction classes.

I've maintained backwards compatibility with the old i64 variants
and used the autoupgrade mechanism.

Differential Revision: https://reviews.llvm.org/D138788
---
 clang/include/clang/Basic/arm_sve.td          |  6 +--
 .../aarch64-sve-intrinsics/acle_sve_bfdot.c   |  8 ++--
 .../aarch64-sve-intrinsics/acle_sve_bfmlalb.c |  8 ++--
 .../aarch64-sve-intrinsics/acle_sve_bfmlalt.c |  8 ++--
 llvm/include/llvm/IR/IntrinsicsAArch64.td     |  8 ++--
 llvm/lib/IR/AutoUpgrade.cpp                   | 25 ++++++++++
 .../lib/Target/AArch64/AArch64SVEInstrInfo.td |  8 ++--
 llvm/lib/Target/AArch64/SVEInstrFormats.td    | 20 ++++----
 .../Bitcode/upgrade-aarch64-sve-intrinsics.ll | 30 ++++++++++++
 .../CodeGen/AArch64/sve-intrinsics-bfloat.ll  | 46 +++++++++----------
 10 files changed, 111 insertions(+), 56 deletions(-)
diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index 175b572ffdab..6c24f0423238 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -537,9 +537,9 @@ let ArchGuard = "defined(__ARM_FEATURE_SVE_BF16)" in {
   def SVBFDOT_N      : SInst<"svbfdot[_n_{0}]",      "MMda",  "b", MergeNone, "aarch64_sve_bfdot",        [IsOverloadNone]>;
   def SVBFMLAL_N     : SInst<"svbfmlalb[_n_{0}]",    "MMda",  "b", MergeNone, "aarch64_sve_bfmlalb",      [IsOverloadNone]>;
   def SVBFMLALT_N    : SInst<"svbfmlalt[_n_{0}]",    "MMda",  "b", MergeNone, "aarch64_sve_bfmlalt",      [IsOverloadNone]>;
-  def SVBFDOT_LANE   : SInst<"svbfdot_lane[_{0}]",   "MMddn", "b", MergeNone, "aarch64_sve_bfdot_lane",   [IsOverloadNone], [ImmCheck<3, ImmCheck0_3>]>;
-  def SVBFMLALB_LANE : SInst<"svbfmlalb_lane[_{0}]", "MMddn", "b", MergeNone, "aarch64_sve_bfmlalb_lane", [IsOverloadNone], [ImmCheck<3, ImmCheck0_7>]>;
-  def SVBFMLALT_LANE : SInst<"svbfmlalt_lane[_{0}]", "MMddn", "b", MergeNone, "aarch64_sve_bfmlalt_lane", [IsOverloadNone], [ImmCheck<3, ImmCheck0_7>]>;
+  def SVBFDOT_LANE   : SInst<"svbfdot_lane[_{0}]",   "MMddi", "b", MergeNone, "aarch64_sve_bfdot_lane_v2",   [IsOverloadNone], [ImmCheck<3, ImmCheck0_3>]>;
+  def SVBFMLALB_LANE : SInst<"svbfmlalb_lane[_{0}]", "MMddi", "b", MergeNone, "aarch64_sve_bfmlalb_lane_v2", [IsOverloadNone], [ImmCheck<3, ImmCheck0_7>]>;
+  def SVBFMLALT_LANE : SInst<"svbfmlalt_lane[_{0}]", "MMddi", "b", MergeNone, "aarch64_sve_bfmlalt_lane_v2", [IsOverloadNone], [ImmCheck<3, ImmCheck0_7>]>;
 }
 
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_bfdot.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_bfdot.c
index 7735a3173d38..454b4b546a9d 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_bfdot.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_bfdot.c
@@ -31,12 +31,12 @@ svfloat32_t test_bfdot_f32(svfloat32_t x, svbfloat16_t y, svbfloat16_t z) {
 
 // CHECK-LABEL: @test_bfdot_lane_0_f32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane(<vscale x 4 x float> [[X:%.*]], <vscale x 8 x bfloat> [[Y:%.*]], <vscale x 8 x bfloat> [[Z:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane.v2(<vscale x 4 x float> [[X:%.*]], <vscale x 8 x bfloat> [[Y:%.*]], <vscale x 8 x bfloat> [[Z:%.*]], i32 0)
 // CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
 //
 // CPP-CHECK-LABEL: @_Z21test_bfdot_lane_0_f32u13__SVFloat32_tu14__SVBFloat16_tu14__SVBFloat16_t(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane(<vscale x 4 x float> [[X:%.*]], <vscale x 8 x bfloat> [[Y:%.*]], <vscale x 8 x bfloat> [[Z:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane.v2(<vscale x 4 x float> [[X:%.*]], <vscale x 8 x bfloat> [[Y:%.*]], <vscale x 8 x bfloat> [[Z:%.*]], i32 0)
 // CPP-CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
 //
 svfloat32_t test_bfdot_lane_0_f32(svfloat32_t x, svbfloat16_t y, svbfloat16_t z) {
@@ -45,12 +45,12 @@ svfloat32_t test_bfdot_lane_0_f32(svfloat32_t x, svbfloat16_t y, svbfloat16_t z)
 
 // CHECK-LABEL: @test_bfdot_lane_3_f32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane(<vscale x 4 x float> [[X:%.*]], <vscale x 8 x bfloat> [[Y:%.*]], <vscale x 8 x bfloat> [[Z:%.*]], i64 3)
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane.v2(<vscale x 4 x float> [[X:%.*]], <vscale x 8 x bfloat> [[Y:%.*]], <vscale x 8 x bfloat> [[Z:%.*]], i32 3)
 // CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
 //
 // CPP-CHECK-LABEL: @_Z21test_bfdot_lane_3_f32u13__SVFloat32_tu14__SVBFloat16_tu14__SVBFloat16_t(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane(<vscale x 4 x float> [[X:%.*]], <vscale x 8 x bfloat> [[Y:%.*]], <vscale x 8 x bfloat> [[Z:%.*]], i64 3)
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane.v2(<vscale x 4 x float> [[X:%.*]], <vscale x 8 x bfloat> [[Y:%.*]], <vscale x 8 x bfloat> [[Z:%.*]], i32 3)
 // CPP-CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
 //
 svfloat32_t test_bfdot_lane_3_f32(svfloat32_t x, svbfloat16_t y, svbfloat16_t z) {
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_bfmlalb.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_bfmlalb.c
index 0727ffbb93cf..c736cf8104c9 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_bfmlalb.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_bfmlalb.c
@@ -31,12 +31,12 @@ svfloat32_t test_svbfmlalb_f32(svfloat32_t x, svbfloat16_t y, svbfloat16_t z) {
 
 // CHECK-LABEL: @test_bfmlalb_lane_0_f32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float> [[X:%.*]], <vscale x 8 x bfloat> [[Y:%.*]], <vscale x 8 x bfloat> [[Z:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane.v2(<vscale x 4 x float> [[X:%.*]], <vscale x 8 x bfloat> [[Y:%.*]], <vscale x 8 x bfloat> [[Z:%.*]], i32 0)
 // CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
 //
 // CPP-CHECK-LABEL: @_Z23test_bfmlalb_lane_0_f32u13__SVFloat32_tu14__SVBFloat16_tu14__SVBFloat16_t(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float> [[X:%.*]], <vscale x 8 x bfloat> [[Y:%.*]], <vscale x 8 x bfloat> [[Z:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane.v2(<vscale x 4 x float> [[X:%.*]], <vscale x 8 x bfloat> [[Y:%.*]], <vscale x 8 x bfloat> [[Z:%.*]], i32 0)
 // CPP-CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
 //
 svfloat32_t test_bfmlalb_lane_0_f32(svfloat32_t x, svbfloat16_t y, svbfloat16_t z) {
@@ -45,12 +45,12 @@ svfloat32_t test_bfmlalb_lane_0_f32(svfloat32_t x, svbfloat16_t y, svbfloat16_t
 
 // CHECK-LABEL: @test_bfmlalb_lane_7_f32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float> [[X:%.*]], <vscale x 8 x bfloat> [[Y:%.*]], <vscale x 8 x bfloat> [[Z:%.*]], i64 7)
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane.v2(<vscale x 4 x float> [[X:%.*]], <vscale x 8 x bfloat> [[Y:%.*]], <vscale x 8 x bfloat> [[Z:%.*]], i32 7)
 // CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
 //
 // CPP-CHECK-LABEL: @_Z23test_bfmlalb_lane_7_f32u13__SVFloat32_tu14__SVBFloat16_tu14__SVBFloat16_t(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float> [[X:%.*]], <vscale x 8 x bfloat> [[Y:%.*]], <vscale x 8 x bfloat> [[Z:%.*]], i64 7)
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane.v2(<vscale x 4 x float> [[X:%.*]], <vscale x 8 x bfloat> [[Y:%.*]], <vscale x 8 x bfloat> [[Z:%.*]], i32 7)
 // CPP-CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
 //
 svfloat32_t test_bfmlalb_lane_7_f32(svfloat32_t x, svbfloat16_t y, svbfloat16_t z) {
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_bfmlalt.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_bfmlalt.c
index 24e2da9aa470..7888043e820f 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_bfmlalt.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_bfmlalt.c
@@ -31,12 +31,12 @@ svfloat32_t test_svbfmlalt_f32(svfloat32_t x, svbfloat16_t y, svbfloat16_t z) {
 
 // CHECK-LABEL: @test_bfmlalt_lane_0_f32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float> [[X:%.*]], <vscale x 8 x bfloat> [[Y:%.*]], <vscale x 8 x bfloat> [[Z:%.*]], i64 0)
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane.v2(<vscale x 4 x float> [[X:%.*]], <vscale x 8 x bfloat> [[Y:%.*]], <vscale x 8 x bfloat> [[Z:%.*]], i32 0)
 // CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
 //
 // CPP-CHECK-LABEL: @_Z23test_bfmlalt_lane_0_f32u13__SVFloat32_tu14__SVBFloat16_tu14__SVBFloat16_t(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float> [[X:%.*]], <vscale x 8 x bfloat> [[Y:%.*]], <vscale x 8 x bfloat> [[Z:%.*]], i64 0)
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane.v2(<vscale x 4 x float> [[X:%.*]], <vscale x 8 x bfloat> [[Y:%.*]], <vscale x 8 x bfloat> [[Z:%.*]], i32 0)
 // CPP-CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
 //
 svfloat32_t test_bfmlalt_lane_0_f32(svfloat32_t x, svbfloat16_t y, svbfloat16_t z) {
@@ -45,12 +45,12 @@ svfloat32_t test_bfmlalt_lane_0_f32(svfloat32_t x, svbfloat16_t y, svbfloat16_t
 
 // CHECK-LABEL: @test_bfmlalt_lane_7_f32(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float> [[X:%.*]], <vscale x 8 x bfloat> [[Y:%.*]], <vscale x 8 x bfloat> [[Z:%.*]], i64 7)
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane.v2(<vscale x 4 x float> [[X:%.*]], <vscale x 8 x bfloat> [[Y:%.*]], <vscale x 8 x bfloat> [[Z:%.*]], i32 7)
 // CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
 //
 // CPP-CHECK-LABEL: @_Z23test_bfmlalt_lane_7_f32u13__SVFloat32_tu14__SVBFloat16_tu14__SVBFloat16_t(
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float> [[X:%.*]], <vscale x 8 x bfloat> [[Y:%.*]], <vscale x 8 x bfloat> [[Z:%.*]], i64 7)
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane.v2(<vscale x 4 x float> [[X:%.*]], <vscale x 8 x bfloat> [[Y:%.*]], <vscale x 8 x bfloat> [[Z:%.*]], i32 7)
 // CPP-CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
 //
 svfloat32_t test_bfmlalt_lane_7_f32(svfloat32_t x, svbfloat16_t y, svbfloat16_t z) {
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 960200ddf9d9..c65446e8bafa 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -1516,7 +1516,7 @@ class SVE_4Vec_BF16
 
 class SVE_4Vec_BF16_Indexed
     : DefaultAttrsIntrinsic<[llvm_nxv4f32_ty],
-                [llvm_nxv4f32_ty, llvm_nxv8bf16_ty, llvm_nxv8bf16_ty, llvm_i64_ty],
+                [llvm_nxv4f32_ty, llvm_nxv8bf16_ty, llvm_nxv8bf16_ty, llvm_i32_ty],
                 [IntrNoMem, ImmArg<ArgIndex<3>>]>;
 
 //
@@ -2519,9 +2519,9 @@ def int_aarch64_sve_bfmlalt : SVE_4Vec_BF16;
 
 def int_aarch64_sve_bfmmla  : SVE_4Vec_BF16;
 
-def int_aarch64_sve_bfdot_lane   : SVE_4Vec_BF16_Indexed;
-def int_aarch64_sve_bfmlalb_lane : SVE_4Vec_BF16_Indexed;
-def int_aarch64_sve_bfmlalt_lane : SVE_4Vec_BF16_Indexed;
+def int_aarch64_sve_bfdot_lane_v2   : SVE_4Vec_BF16_Indexed;
+def int_aarch64_sve_bfmlalb_lane_v2 : SVE_4Vec_BF16_Indexed;
+def int_aarch64_sve_bfmlalt_lane_v2 : SVE_4Vec_BF16_Indexed;
 }
 
 //
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index b5c7818dfae2..ef4aac5b0cde 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -605,6 +605,21 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
                                         F->arg_begin()->getType());
       return true;
     }
+    if (Name == "aarch64.sve.bfdot.lane") {
+      NewFn = Intrinsic::getDeclaration(F->getParent(),
+                                        Intrinsic::aarch64_sve_bfdot_lane_v2);
+      return true;
+    }
+    if (Name == "aarch64.sve.bfmlalb.lane") {
+      NewFn = Intrinsic::getDeclaration(F->getParent(),
+                                        Intrinsic::aarch64_sve_bfmlalb_lane_v2);
+      return true;
+    }
+    if (Name == "aarch64.sve.bfmlalt.lane") {
+      NewFn = Intrinsic::getDeclaration(F->getParent(),
+                                        Intrinsic::aarch64_sve_bfmlalt_lane_v2);
+      return true;
+    }
     static const Regex LdRegex("^aarch64\\.sve\\.ld[234](.nxv[a-z0-9]+|$)");
     if (LdRegex.match(Name)) {
       Type *ScalarTy =
@@ -3955,6 +3970,16 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) {
     NewCall = Builder.CreateCall(NewFn, Args);
     break;
   }
+  case Intrinsic::aarch64_sve_bfmlalb_lane_v2:
+  case Intrinsic::aarch64_sve_bfmlalt_lane_v2:
+  case Intrinsic::aarch64_sve_bfdot_lane_v2: {
+    LLVMContext &Ctx = F->getParent()->getContext();
+    SmallVector<Value *, 4> Args(CI->args());
+    Args[3] = ConstantInt::get(Type::getInt32Ty(Ctx),
+                               cast<ConstantInt>(Args[3])->getZExtValue());
+    NewCall = Builder.CreateCall(NewFn, Args);
+    break;
+  }
   case Intrinsic::aarch64_sve_ld3_sret:
   case Intrinsic::aarch64_sve_ld4_sret:
   case Intrinsic::aarch64_sve_ld2_sret: {
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 597c7a27cf33..7cc525e698f5 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -2162,8 +2162,8 @@ let Predicates = [HasSVEorSME] in {
 } // End HasSVEorSME
 
 let Predicates = [HasBF16, HasSVEorSME] in {
-  defm BFDOT_ZZZ    : sve_bfloat_dot<"bfdot", int_aarch64_sve_bfdot>;
-  defm BFDOT_ZZI    : sve_bfloat_dot_indexed<"bfdot", int_aarch64_sve_bfdot_lane>;
+  defm BFDOT_ZZZ    : sve_float_dot<0b1, "bfdot", nxv8bf16, int_aarch64_sve_bfdot>;
+  defm BFDOT_ZZI    : sve_float_dot_indexed<0b1, "bfdot", nxv8bf16, int_aarch64_sve_bfdot_lane_v2>;
 } // End HasBF16, HasSVEorSME
 
 let Predicates = [HasBF16, HasSVE] in {
@@ -2173,8 +2173,8 @@ let Predicates = [HasBF16, HasSVE] in {
 let Predicates = [HasBF16, HasSVEorSME] in {
   defm BFMLALB_ZZZ : sve_bfloat_matmul_longvecl<0b0, 0b0, "bfmlalb", int_aarch64_sve_bfmlalb>;
   defm BFMLALT_ZZZ : sve_bfloat_matmul_longvecl<0b1, 0b0, "bfmlalt", int_aarch64_sve_bfmlalt>;
-  defm BFMLALB_ZZZI : sve_bfloat_matmul_longvecl_idx<0b0, 0b0, "bfmlalb", int_aarch64_sve_bfmlalb_lane>;
-  defm BFMLALT_ZZZI : sve_bfloat_matmul_longvecl_idx<0b1, 0b0, "bfmlalt", int_aarch64_sve_bfmlalt_lane>;
+  defm BFMLALB_ZZZI : sve_bfloat_matmul_longvecl_idx<0b0, 0b0, "bfmlalb", int_aarch64_sve_bfmlalb_lane_v2>;
+  defm BFMLALT_ZZZI : sve_bfloat_matmul_longvecl_idx<0b1, 0b0, "bfmlalt", int_aarch64_sve_bfmlalt_lane_v2>;
   defm BFCVT_ZPmZ   : sve_bfloat_convert<0b1, "bfcvt",   int_aarch64_sve_fcvt_bf16f32>;
   defm BFCVTNT_ZPmZ : sve_bfloat_convert<0b0, "bfcvtnt", int_aarch64_sve_fcvtnt_bf16f32>;
 } // End HasBF16, HasSVEorSME
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index cfccbd766709..a0fa88200d95 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -8317,13 +8317,13 @@ class sve_float_dot<bit bf, string asm>
   let DestructiveInstType = DestructiveOther;
 }
 
-multiclass sve_bfloat_dot<string asm, SDPatternOperator op> {
-  def NAME : sve_float_dot<0b1, asm>;
-  def : SVE_3_Op_Pat<nxv4f32, op, nxv4f32, nxv8bf16, nxv8bf16 ,!cast<Instruction>(NAME)>;
+multiclass sve_float_dot<bit bf, string asm, ValueType InVT, SDPatternOperator op> {
+  def NAME : sve_float_dot<bf, asm>;
+  def : SVE_3_Op_Pat<nxv4f32, op, nxv4f32, InVT, InVT, !cast<Instruction>(NAME)>;
 }
 
 class sve_float_dot_indexed<bit bf, string asm>
-: I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR3b16:$Zm, VectorIndexS:$iop),
+: I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR3b16:$Zm, VectorIndexS32b:$iop),
     asm, "\t$Zda, $Zn, $Zm$iop", "", []>, Sched<[]> {
   bits<5> Zda;
   bits<5> Zn;
@@ -8342,9 +8342,9 @@ class sve_float_dot_indexed<bit bf, string asm>
   let DestructiveInstType = DestructiveOther;
 }
 
-multiclass sve_bfloat_dot_indexed<string asm, SDPatternOperator op> {
-  def NAME : sve_float_dot_indexed<0b1, asm>;
-  def : SVE_4_Op_Imm_Pat<nxv4f32, op, nxv4f32, nxv8bf16, nxv8bf16, i64, VectorIndexS_timm, !cast<Instruction>(NAME)>;
+multiclass sve_float_dot_indexed<bit bf, string asm, ValueType InVT, SDPatternOperator op> {
+  def NAME : sve_float_dot_indexed<bf, asm>;
+  def : SVE_4_Op_Imm_Pat<nxv4f32, op, nxv4f32, InVT, InVT, i32, VectorIndexS32b_timm, !cast<Instruction>(NAME)>;
 }
 
 class sve_bfloat_matmul<string asm>
@@ -8383,7 +8383,7 @@ multiclass sve_bfloat_matmul_longvecl<bit BT, bit sub, string asm, SDPatternOper
 }
 
 class sve_bfloat_matmul_longvecl_idx<bit BT, bit sub, string asm>
-: I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR3b16:$Zm, VectorIndexH:$iop),
+: I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR3b16:$Zm, VectorIndexH32b:$iop),
     asm, "\t$Zda, $Zn, $Zm$iop", "", []>, Sched<[]> {
   bits<5> Zda;
   bits<5> Zn;
@@ -8407,7 +8407,7 @@ class sve_bfloat_matmul_longvecl_idx<bit BT, bit sub, string asm>
 
 multiclass sve_bfloat_matmul_longvecl_idx<bit BT, bit sub, string asm, SDPatternOperator op> {
   def NAME : sve_bfloat_matmul_longvecl_idx<BT, sub, asm>;
-  def : SVE_4_Op_Imm_Pat<nxv4f32, op, nxv4f32, nxv8bf16, nxv8bf16, i64, VectorIndexH_timm, !cast<Instruction>(NAME)>;
+  def : SVE_4_Op_Imm_Pat<nxv4f32, op, nxv4f32, nxv8bf16, nxv8bf16, i32, VectorIndexH32b_timm, !cast<Instruction>(NAME)>;
 }
 
 class sve_bfloat_convert<bit N, string asm>
@@ -8784,7 +8784,7 @@ class sve2p1_two_way_dot_vv<string mnemonic, bit u>
 
 // SVE two-way dot product (indexed)
 class sve2p1_two_way_dot_vvi<string mnemonic, bit u>
-    : I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR3b16:$Zm, VectorIndexS:$i2),
+    : I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR3b16:$Zm, VectorIndexS32b:$i2),
         mnemonic, "\t$Zda, $Zn, $Zm$i2",
         "", []>, Sched<[]> {
   bits<5> Zda;
diff --git a/llvm/test/Bitcode/upgrade-aarch64-sve-intrinsics.ll b/llvm/test/Bitcode/upgrade-aarch64-sve-intrinsics.ll
index c21a6b7f1539..234151aa8c36 100644
--- a/llvm/test/Bitcode/upgrade-aarch64-sve-intrinsics.ll
+++ b/llvm/test/Bitcode/upgrade-aarch64-sve-intrinsics.ll
@@ -160,6 +160,33 @@ define <vscale x 4 x i32> @get_tuple2_nxv8i32_elt1(<vscale x 8 x i32> %tuple) {
   ret <vscale x 4 x i32> %ext
 }
 
+; bfdot
+define <vscale x 4 x float> @bfdot_lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
+; CHECK-LABEL: @bfdot_lane
+; CHECK:       %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane.v2(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i32 0)
+; CHECK-NEXT:  ret <vscale x 4 x float> %out
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 0)
+  ret <vscale x 4 x float> %out
+}
+
+; bfmlalb
+define <vscale x 4 x float> @bfmlalb_lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
+; CHECK-LABEL: @bfmlalb_lane
+; CHECK:       %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane.v2(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i32 0)
+; CHECK-NEXT:  ret <vscale x 4 x float> %out
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 0)
+  ret <vscale x 4 x float> %out
+}
+
+; bfmlalt
+define <vscale x 4 x float> @bfmlalt_lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c) nounwind {
+; CHECK-LABEL: @bfmlalt_lane
+; CHECK:       %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane.v2(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i32 0)
+; CHECK-NEXT:  ret <vscale x 4 x float> %out
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 0)
+  ret <vscale x 4 x float> %out
+}
+
 declare  <vscale x 32 x i8> @llvm.aarch64.sve.tuple.create2.nxv32i8.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
 declare  <vscale x 32 x i8> @llvm.aarch64.sve.tuple.create2.nxv32i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
 declare  <vscale x 32 x i8> @llvm.aarch64.sve.tuple.create2(<vscale x 16 x i8>, <vscale x 16 x i8>)
@@ -168,3 +195,6 @@ declare <vscale x 64 x i8> @llvm.aarch64.sve.tuple.create4.nxv64i8.nxv16i8(<vsca
 declare <vscale x 16 x i8> @llvm.aarch64.sve.tuple.create1.nxv16i8.nxv16i8(<vscale x 16 x i8>)
 declare <vscale x 8 x i32> @llvm.aarch64.sve.tuple.set.nxv8i32.nxv4i32(<vscale x 8 x i32>, i32, <vscale x 4 x i32>)
 declare <vscale x 4 x i32> @llvm.aarch64.sve.tuple.get.nxv8i32(<vscale x 8 x i32>, i32)
+declare <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i64)
+declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i64)
+declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i64)
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-bfloat.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-bfloat.ll
index a3989a97dc98..7d1e63e23a2c 100644
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-bfloat.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-bfloat.ll
@@ -19,7 +19,7 @@ define <vscale x 4 x float> @bfdot_lane_0_f32(<vscale x 4 x float> %a, <vscale x
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    bfdot z0.s, z1.h, z2.h[0]
 ; CHECK-NEXT:    ret
-  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 0)
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane.v2(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i32 0)
   ret <vscale x 4 x float> %out
 }
 
@@ -28,7 +28,7 @@ define <vscale x 4 x float> @bfdot_lane_1_f32(<vscale x 4 x float> %a, <vscale x
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    bfdot z0.s, z1.h, z2.h[1]
 ; CHECK-NEXT:    ret
-  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 1)
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane.v2(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i32 1)
   ret <vscale x 4 x float> %out
 }
 
@@ -37,7 +37,7 @@ define <vscale x 4 x float> @bfdot_lane_2_f32(<vscale x 4 x float> %a, <vscale x
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    bfdot z0.s, z1.h, z2.h[2]
 ; CHECK-NEXT:    ret
-  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 2)
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane.v2(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i32 2)
   ret <vscale x 4 x float> %out
 }
 
@@ -46,7 +46,7 @@ define <vscale x 4 x float> @bfdot_lane_3_f32(<vscale x 4 x float> %a, <vscale x
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    bfdot z0.s, z1.h, z2.h[3]
 ; CHECK-NEXT:    ret
-  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 3)
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane.v2(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i32 3)
   ret <vscale x 4 x float> %out
 }
 
@@ -68,7 +68,7 @@ define <vscale x 4 x float> @bfmlalb_lane_0_f32(<vscale x 4 x float> %a, <vscale
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    bfmlalb z0.s, z1.h, z2.h[0]
 ; CHECK-NEXT:    ret
-  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 0)
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane.v2(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i32 0)
   ret <vscale x 4 x float> %out
 }
 
@@ -77,7 +77,7 @@ define <vscale x 4 x float> @bfmlalb_lane_1_f32(<vscale x 4 x float> %a, <vscale
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    bfmlalb z0.s, z1.h, z2.h[1]
 ; CHECK-NEXT:    ret
-  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 1)
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane.v2(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i32 1)
   ret <vscale x 4 x float> %out
 }
 
@@ -86,7 +86,7 @@ define <vscale x 4 x float> @bfmlalb_lane_2_f32(<vscale x 4 x float> %a, <vscale
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    bfmlalb z0.s, z1.h, z2.h[2]
 ; CHECK-NEXT:    ret
-  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 2)
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane.v2(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i32 2)
   ret <vscale x 4 x float> %out
 }
 
@@ -95,7 +95,7 @@ define <vscale x 4 x float> @bfmlalb_lane_3_f32(<vscale x 4 x float> %a, <vscale
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    bfmlalb z0.s, z1.h, z2.h[3]
 ; CHECK-NEXT:    ret
-  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 3)
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane.v2(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i32 3)
   ret <vscale x 4 x float> %out
 }
 
@@ -104,7 +104,7 @@ define <vscale x 4 x float> @bfmlalb_lane_4_f32(<vscale x 4 x float> %a, <vscale
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    bfmlalb z0.s, z1.h, z2.h[4]
 ; CHECK-NEXT:    ret
-  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 4)
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane.v2(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i32 4)
   ret <vscale x 4 x float> %out
 }
 
@@ -113,7 +113,7 @@ define <vscale x 4 x float> @bfmlalb_lane_5_f32(<vscale x 4 x float> %a, <vscale
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    bfmlalb z0.s, z1.h, z2.h[5]
 ; CHECK-NEXT:    ret
-  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 5)
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane.v2(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i32 5)
   ret <vscale x 4 x float> %out
 }
 
@@ -122,7 +122,7 @@ define <vscale x 4 x float> @bfmlalb_lane_6_f32(<vscale x 4 x float> %a, <vscale
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    bfmlalb z0.s, z1.h, z2.h[6]
 ; CHECK-NEXT:    ret
-  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 6)
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane.v2(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i32 6)
   ret <vscale x 4 x float> %out
 }
 
@@ -131,7 +131,7 @@ define <vscale x 4 x float> @bfmlalb_lane_7_f32(<vscale x 4 x float> %a, <vscale
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    bfmlalb z0.s, z1.h, z2.h[7]
 ; CHECK-NEXT:    ret
-  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 7)
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane.v2(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i32 7)
   ret <vscale x 4 x float> %out
 }
 
@@ -153,7 +153,7 @@ define <vscale x 4 x float> @bfmlalt_lane_0_f32(<vscale x 4 x float> %a, <vscale
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    bfmlalt z0.s, z1.h, z2.h[0]
 ; CHECK-NEXT:    ret
-  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 0)
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane.v2(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i32 0)
   ret <vscale x 4 x float> %out
 }
 
@@ -162,7 +162,7 @@ define <vscale x 4 x float> @bfmlalt_lane_1_f32(<vscale x 4 x float> %a, <vscale
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    bfmlalt z0.s, z1.h, z2.h[1]
 ; CHECK-NEXT:    ret
-  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 1)
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane.v2(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i32 1)
   ret <vscale x 4 x float> %out
 }
 
@@ -171,7 +171,7 @@ define <vscale x 4 x float> @bfmlalt_lane_2_f32(<vscale x 4 x float> %a, <vscale
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    bfmlalt z0.s, z1.h, z2.h[2]
 ; CHECK-NEXT:    ret
-  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 2)
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane.v2(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i32 2)
   ret <vscale x 4 x float> %out
 }
 
@@ -180,7 +180,7 @@ define <vscale x 4 x float> @bfmlalt_lane_3_f32(<vscale x 4 x float> %a, <vscale
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    bfmlalt z0.s, z1.h, z2.h[3]
 ; CHECK-NEXT:    ret
-  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 3)
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane.v2(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i32 3)
   ret <vscale x 4 x float> %out
 }
 
@@ -189,7 +189,7 @@ define <vscale x 4 x float> @bfmlalt_lane_4_f32(<vscale x 4 x float> %a, <vscale
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    bfmlalt z0.s, z1.h, z2.h[4]
 ; CHECK-NEXT:    ret
-  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 4)
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane.v2(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i32 4)
   ret <vscale x 4 x float> %out
 }
 
@@ -198,7 +198,7 @@ define <vscale x 4 x float> @bfmlalt_lane_5_f32(<vscale x 4 x float> %a, <vscale
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    bfmlalt z0.s, z1.h, z2.h[5]
 ; CHECK-NEXT:    ret
-  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 5)
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane.v2(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i32 5)
   ret <vscale x 4 x float> %out
 }
 
@@ -207,7 +207,7 @@ define <vscale x 4 x float> @bfmlalt_lane_6_f32(<vscale x 4 x float> %a, <vscale
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    bfmlalt z0.s, z1.h, z2.h[6]
 ; CHECK-NEXT:    ret
-  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 6)
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane.v2(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i32 6)
   ret <vscale x 4 x float> %out
 }
 
@@ -216,7 +216,7 @@ define <vscale x 4 x float> @bfmlalt_lane_7_f32(<vscale x 4 x float> %a, <vscale
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    bfmlalt z0.s, z1.h, z2.h[7]
 ; CHECK-NEXT:    ret
-  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i64 7)
+  %out = call <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane.v2(<vscale x 4 x float> %a, <vscale x 8 x bfloat> %b, <vscale x 8 x bfloat> %c, i32 7)
   ret <vscale x 4 x float> %out
 }
 
@@ -260,11 +260,11 @@ define <vscale x 8 x bfloat> @fcvtnt_bf16_f32(<vscale x 8 x bfloat> %a, <vscale
 }
 
 declare <vscale x 4 x float> @llvm.aarch64.sve.bfdot(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
-declare <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i64)
+declare <vscale x 4 x float> @llvm.aarch64.sve.bfdot.lane.v2(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)
 declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
-declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i64)
+declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlalb.lane.v2(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)
 declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
-declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i64)
+declare <vscale x 4 x float> @llvm.aarch64.sve.bfmlalt.lane.v2(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)
 declare <vscale x 4 x float> @llvm.aarch64.sve.bfmmla(<vscale x 4 x float>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
 declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fcvt.bf16f32(<vscale x 8 x bfloat>, <vscale x 8 x i1>, <vscale x 4 x float>)
 declare <vscale x 8 x bfloat> @llvm.aarch64.sve.fcvtnt.bf16f32(<vscale x 8 x bfloat>, <vscale x 8 x i1>, <vscale x 4 x float>)