[Hexagon] Improve QFP Optimizer (#166647)

This patch enhances HexagonQFPOptimizer in multiple ways: 1. Refactor the code for better readability and maintainability. 2. Optimize vabs,vneg and vilog2 converts The three instruction mentioned can be optimized like below: ```v1.sf = v0.qf32 v2.qf = vneg v1.sf``` to ```v2.qf = vneg v0.qf32``` This optimization eliminates one conversion and is applicable to both qf32 and qf16 types. 3. Enable vsub fusion with mixed arguments Previously, QFPOptimizer did not fuse partial qfloat operands with vsub. This update allows selective use of vsub_hf_mix, vsub_sf_mix, vsub_qf16_mix, and vsub_qf32_mix when appropriate. It also enables QFP simplifications involving vector pair subregisters. Example scenario in a machine basic block targeting Hexagon: ```v1.qf32 = ... // result of a vadd v2.sf = v1.qf32 v3.qf32 = vmpy(v2.sf, v2.sf)``` 4. Remove redundant conversions Under certain conditions, we previously bailed out before removing qf-to-sf/hf conversions. This patch removes that bailout, enabling more aggressive elimination of unnecessary conversions. 5. Don't optimize equals feeding into multiply: Removing converts feeding into multiply loses precision. This patch avoids optimizing multiplies along with giving the users an option to enable this by a flag. Patch By: Fateme Hosseini Co-authored-by: Kaushik Kulkarni <quic_kauskulk@quicinc.com> Co-authored-by: Santanu Das <santdas@qti.qualcomm.com>
2025-11-06 14:02:15 -06:00 · 2025-11-06 14:02:15 -06:00 · 8d0df57340
commit 8d0df57340
parent 316236b1c0
8 changed files with 863 additions and 37 deletions
--- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
@ -4753,6 +4753,19 @@ bool HexagonInstrInfo::getBundleNoShuf(const MachineInstr &MIB) const {
  return (Operand.isImm() && (Operand.getImm() & memShufDisabledMask) != 0);
 }

+bool HexagonInstrInfo::isQFPMul(const MachineInstr *MI) const {
+  return (MI->getOpcode() == Hexagon::V6_vmpy_qf16_hf ||
+          MI->getOpcode() == Hexagon::V6_vmpy_qf16_mix_hf ||
+          MI->getOpcode() == Hexagon::V6_vmpy_qf32_hf ||
+          MI->getOpcode() == Hexagon::V6_vmpy_qf32_mix_hf ||
+          MI->getOpcode() == Hexagon::V6_vmpy_qf32_sf ||
+          MI->getOpcode() == Hexagon::V6_vmpy_qf16_mix_hf ||
+          MI->getOpcode() == Hexagon::V6_vmpy_qf16 ||
+          MI->getOpcode() == Hexagon::V6_vmpy_qf32_mix_hf ||
+          MI->getOpcode() == Hexagon::V6_vmpy_qf32_qf16 ||
+          MI->getOpcode() == Hexagon::V6_vmpy_qf32);
+}
+
 // Addressing mode relations.
 short HexagonInstrInfo::changeAddrMode_abs_io(short Opc) const {
  return Opc >= 0 ? Hexagon::changeAddrMode_abs_io(Opc) : Opc;
--- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
+++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
@ -532,6 +532,7 @@ public:
  }

  MCInst getNop() const override;
+  bool isQFPMul(const MachineInstr *MF) const;
 };

 /// \brief Create RegSubRegPair from a register MachineOperand
--- a/llvm/lib/Target/Hexagon/HexagonQFPOptimizer.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonQFPOptimizer.cpp
@ -58,7 +58,7 @@
 // are PHI inst.
 //
 //===----------------------------------------------------------------------===//
-#include <unordered_set>
+
 #define HEXAGON_QFP_OPTIMIZER "QFP optimizer pass"

 #include "Hexagon.h"
@ -86,6 +86,9 @@ using namespace llvm;
 cl::opt<bool>
    DisableQFOptimizer("disable-qfp-opt", cl::init(false),
                       cl::desc("Disable optimization of Qfloat operations."));
+cl::opt<bool> DisableQFOptForMul(
+    "disable-qfp-opt-mul", cl::init(true),
+    cl::desc("Disable optimization of Qfloat operations for multiply."));

 namespace {
 const std::map<unsigned short, unsigned short> QFPInstMap{
@ -101,11 +104,21 @@ const std::map<unsigned short, unsigned short> QFPInstMap{
    {Hexagon::V6_vmpy_qf16_mix_hf, Hexagon::V6_vmpy_qf16},
    {Hexagon::V6_vmpy_qf32_hf, Hexagon::V6_vmpy_qf32_mix_hf},
    {Hexagon::V6_vmpy_qf32_mix_hf, Hexagon::V6_vmpy_qf32_qf16},
-    {Hexagon::V6_vmpy_qf32_sf, Hexagon::V6_vmpy_qf32}};
+    {Hexagon::V6_vmpy_qf32_sf, Hexagon::V6_vmpy_qf32},
+    {Hexagon::V6_vilog2_sf, Hexagon::V6_vilog2_qf32},
+    {Hexagon::V6_vilog2_hf, Hexagon::V6_vilog2_qf16},
+    {Hexagon::V6_vabs_qf32_sf, Hexagon::V6_vabs_qf32_qf32},
+    {Hexagon::V6_vabs_qf16_hf, Hexagon::V6_vabs_qf16_qf16},
+    {Hexagon::V6_vneg_qf32_sf, Hexagon::V6_vneg_qf32_qf32},
+    {Hexagon::V6_vneg_qf16_hf, Hexagon::V6_vneg_qf16_qf16}};
 } // namespace

-namespace {
+namespace llvm {
+FunctionPass *createHexagonQFPOptimizer();
+void initializeHexagonQFPOptimizerPass(PassRegistry &);
+} // namespace llvm

+namespace {
 struct HexagonQFPOptimizer : public MachineFunctionPass {
 public:
  static char ID;
@ -116,6 +129,10 @@ public:

  bool optimizeQfp(MachineInstr *MI, MachineBasicBlock *MBB);

+  bool optimizeQfpTwoOp(MachineInstr *MI, MachineBasicBlock *MBB);
+
+  bool optimizeQfpOneOp(MachineInstr *MI, MachineBasicBlock *MBB);
+
  StringRef getPassName() const override { return HEXAGON_QFP_OPTIMIZER; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
@ -142,19 +159,69 @@ FunctionPass *llvm::createHexagonQFPOptimizer() {
 bool HexagonQFPOptimizer::optimizeQfp(MachineInstr *MI,
                                      MachineBasicBlock *MBB) {

-  // Early exit:
-  // - if instruction is invalid or has too few operands (QFP ops need 2 sources
-  // + 1 dest),
-  // - or does not have a transformation mapping.
-  if (MI->getNumOperands() < 3)
+  if (MI->getNumOperands() == 2)
+    return optimizeQfpOneOp(MI, MBB);
+  else if (MI->getNumOperands() == 3)
+    return optimizeQfpTwoOp(MI, MBB);
+  else
    return false;
+}
+
+bool HexagonQFPOptimizer::optimizeQfpOneOp(MachineInstr *MI,
+                                           MachineBasicBlock *MBB) {
+
+  unsigned Op0F = 0;
+  auto It = QFPInstMap.find(MI->getOpcode());
+  if (It == QFPInstMap.end())
+    return false;
+
+  unsigned short InstTy = It->second;
+  // Get the reachind defs of MI
+  MachineInstr *DefMI = MRI->getVRegDef(MI->getOperand(1).getReg());
+  MachineOperand &Res = MI->getOperand(0);
+  if (!Res.isReg())
+    return false;
+
+  LLVM_DEBUG(dbgs() << "\n[Reaching Defs of operands]: "; DefMI->dump());
+  MachineInstr *ReachDefDef = nullptr;
+
+  // Get the reaching def of the reaching def to check for W reg def
+  if (DefMI->getNumOperands() > 1 && DefMI->getOperand(1).isReg() &&
+      DefMI->getOperand(1).getReg().isVirtual())
+    ReachDefDef = MRI->getVRegDef(DefMI->getOperand(1).getReg());
+  unsigned ReachDefOp = DefMI->getOpcode();
+  MachineInstrBuilder MIB;
+
+  // Check if the reaching def is a conversion
+  if (ReachDefOp == Hexagon::V6_vconv_sf_qf32 ||
+      ReachDefOp == Hexagon::V6_vconv_hf_qf16) {
+
+    // Return if the reaching def of reaching def is W type
+    if (ReachDefDef && MRI->getRegClass(ReachDefDef->getOperand(0).getReg()) ==
+                           &Hexagon::HvxWRRegClass)
+      return false;
+
+    // Analyze the use operands of the conversion to get their KILL status
+    MachineOperand &SrcOp = DefMI->getOperand(1);
+    Op0F = getKillRegState(SrcOp.isKill());
+    SrcOp.setIsKill(false);
+    MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), HII->get(InstTy), Res.getReg())
+              .addReg(SrcOp.getReg(), Op0F, SrcOp.getSubReg());
+    LLVM_DEBUG(dbgs() << "\n[Inserting]: "; MIB.getInstr()->dump());
+    return true;
+  }
+  return false;
+}
+
+bool HexagonQFPOptimizer::optimizeQfpTwoOp(MachineInstr *MI,
+                                           MachineBasicBlock *MBB) {
+
+  unsigned Op0F = 0;
+  unsigned Op1F = 0;
  auto It = QFPInstMap.find(MI->getOpcode());
  if (It == QFPInstMap.end())
    return false;
  unsigned short InstTy = It->second;
-
-  unsigned Op0F = 0;
-  unsigned Op1F = 0;
  // Get the reaching defs of MI, DefMI1 and DefMI2
  MachineInstr *DefMI1 = nullptr;
  MachineInstr *DefMI2 = nullptr;
@ -167,6 +234,9 @@ bool HexagonQFPOptimizer::optimizeQfp(MachineInstr *MI,
    return false;

  MachineOperand &Res = MI->getOperand(0);
+  if (!Res.isReg())
+    return false;
+
  MachineInstr *Inst1 = nullptr;
  MachineInstr *Inst2 = nullptr;
  LLVM_DEBUG(dbgs() << "\n[Reaching Defs of operands]: "; DefMI1->dump();
@ -185,7 +255,8 @@ bool HexagonQFPOptimizer::optimizeQfp(MachineInstr *MI,
  unsigned Def2OP = DefMI2->getOpcode();

  MachineInstrBuilder MIB;
-  // Case 1: Both reaching defs of MI are qf to sf/hf conversions
+
+  // Check if the both the reaching defs of MI are qf to sf/hf conversions
  if ((Def1OP == Hexagon::V6_vconv_sf_qf32 &&
       Def2OP == Hexagon::V6_vconv_sf_qf32) ||
      (Def1OP == Hexagon::V6_vconv_hf_qf16 &&
@ -226,7 +297,7 @@ bool HexagonQFPOptimizer::optimizeQfp(MachineInstr *MI,
    LLVM_DEBUG(dbgs() << "\n[Inserting]: "; MIB.getInstr()->dump());
    return true;

-    // Case 2: Left operand is conversion to sf/hf
+    // Check if left operand's reaching def is a conversion to sf/hf
  } else if (((Def1OP == Hexagon::V6_vconv_sf_qf32 &&
               Def2OP != Hexagon::V6_vconv_sf_qf32) ||
              (Def1OP == Hexagon::V6_vconv_hf_qf16 &&
@ -250,7 +321,7 @@ bool HexagonQFPOptimizer::optimizeQfp(MachineInstr *MI,
    LLVM_DEBUG(dbgs() << "\n[Inserting]: "; MIB.getInstr()->dump());
    return true;

-    // Case 2: Left operand is conversion to sf/hf
+    // Check if right operand's reaching def is a conversion to sf/hf
  } else if (((Def1OP != Hexagon::V6_vconv_sf_qf32 &&
               Def2OP == Hexagon::V6_vconv_sf_qf32) ||
              (Def1OP != Hexagon::V6_vconv_hf_qf16 &&
@ -258,13 +329,6 @@ bool HexagonQFPOptimizer::optimizeQfp(MachineInstr *MI,
             !DefMI1->isPHI() &&
             (MI->getOpcode() != Hexagon::V6_vmpy_qf32_sf)) {
    // The second operand of original instruction is converted.
-    // In "mix" instructions, "qf" operand is always the first operand.
-
-    // Caveat: vsub is not commutative w.r.t operands.
-    if (InstTy == Hexagon::V6_vsub_qf16_mix ||
-        InstTy == Hexagon::V6_vsub_qf32_mix)
-      return false;
-
    if (Inst2 && MRI->getRegClass(Inst2->getOperand(0).getReg()) ==
                     &Hexagon::HvxWRRegClass)
      return false;
@ -275,10 +339,26 @@ bool HexagonQFPOptimizer::optimizeQfp(MachineInstr *MI,
    Op1F = getKillRegState(Src2.isKill());
    Src2.setIsKill(false);
    Op0F = getKillRegState(Src1.isKill());
-    MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), HII->get(InstTy), Res.getReg())
-              .addReg(Src2.getReg(), Op1F,
-                      Src2.getSubReg()) // Notice the operands are flipped.
-              .addReg(Src1.getReg(), Op0F, Src1.getSubReg());
+    if (InstTy == Hexagon::V6_vsub_qf16_mix ||
+        InstTy == Hexagon::V6_vsub_qf32_mix) {
+      if (!HST->useHVXV81Ops())
+        // vsub_(hf|sf)_mix insts are only avlbl on hvx81+
+        return false;
+      // vsub is not commutative w.r.t. operands -> treat it as a special case
+      // to choose the correct mix instruction.
+      if (Def2OP == Hexagon::V6_vconv_sf_qf32)
+        InstTy = Hexagon::V6_vsub_sf_mix;
+      else if (Def2OP == Hexagon::V6_vconv_hf_qf16)
+        InstTy = Hexagon::V6_vsub_hf_mix;
+      MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), HII->get(InstTy), Res.getReg())
+                .addReg(Src1.getReg(), Op0F, Src1.getSubReg())
+                .addReg(Src2.getReg(), Op1F, Src2.getSubReg());
+    } else {
+      MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), HII->get(InstTy), Res.getReg())
+                .addReg(Src2.getReg(), Op1F,
+                        Src2.getSubReg()) // Notice the operands are flipped.
+                .addReg(Src1.getReg(), Op0F, Src1.getSubReg());
+    }
    LLVM_DEBUG(dbgs() << "\n[Inserting]: "; MIB.getInstr()->dump());
    return true;
  }
@ -309,15 +389,18 @@ bool HexagonQFPOptimizer::runOnMachineFunction(MachineFunction &MF) {
    while (MII != MBBI->instr_end()) {
      MachineInstr *MI = &*MII;
      ++MII; // As MI might be removed.
-
-      if (QFPInstMap.count(MI->getOpcode()) &&
-          MI->getOpcode() != Hexagon::V6_vconv_sf_qf32 &&
-          MI->getOpcode() != Hexagon::V6_vconv_hf_qf16) {
-        LLVM_DEBUG(dbgs() << "\n###Analyzing for removal: "; MI->dump());
-        if (optimizeQfp(MI, MBB)) {
-          MI->eraseFromParent();
-          LLVM_DEBUG(dbgs() << "\t....Removing....");
-          Changed = true;
+      if (QFPInstMap.count(MI->getOpcode())) {
+        auto OpC = MI->getOpcode();
+        if (DisableQFOptForMul && HII->isQFPMul(MI))
+          continue;
+        if (OpC != Hexagon::V6_vconv_sf_qf32 &&
+            OpC != Hexagon::V6_vconv_hf_qf16) {
+          LLVM_DEBUG(dbgs() << "\n###Analyzing for removal: "; MI->dump());
+          if (optimizeQfp(MI, MBB)) {
+            MI->eraseFromParent();
+            LLVM_DEBUG(dbgs() << "\t....Removing....");
+            Changed = true;
+          }
        }
      }
    }
--- a/llvm/test/CodeGen/Hexagon/autohvx/xqf-fixup-qfp1.ll
+++ b/llvm/test/CodeGen/Hexagon/autohvx/xqf-fixup-qfp1.ll
@ -0,0 +1,372 @@
+; REQUIRES: hexagon-registered-target, silver
+; This tests correct handling of register spills and fills of
+; qf operands during register allocation.
+
+; RUN: llc -mcpu=hexagonv79 -mattr=+hvx-length128b,+hvxv79,+hvx-ieee-fp,+hvx-qfloat,-long-calls -debug-only=handle-qfp %s 2>&1 -o - | FileCheck %s --check-prefixes V79-81,V79
+; RUN: llc -mcpu=hexagonv81 -mattr=+hvx-length128b,+hvxv81,+hvx-ieee-fp,+hvx-qfloat,-long-calls -debug-only=handle-qfp %s 2>&1 -o - | FileCheck %s --check-prefixes V79-81,V81
+
+; V79-81: Finding uses of:   renamable $w{{[0-9]+}} = V6_vmpy_qf32_hf
+; V79-81: Inserting after conv:   [[VREG0:\$v[0-9]+]] = V6_vconv_sf_qf32 killed renamable [[VREG0]]
+; V79-81-NEXT: Inserting after conv:   [[VREG1:\$v[0-9]+]] = V6_vconv_sf_qf32 killed renamable [[VREG1]]
+; V79-81: Finding uses of:   renamable $w{{[0-9]+}} = V6_vmpy_qf32_hf
+; V79-81: Inserting after conv:   [[VREG2:\$v[0-9]+]] = V6_vconv_sf_qf32 killed renamable [[VREG2]]
+; V79-81-NEXT: Inserting after conv:   [[VREG3:\$v[0-9]+]] = V6_vconv_sf_qf32 killed renamable [[VREG3]]
+; V79-81: Finding uses of:   renamable $w{{[0-9]+}} = V6_vmpy_qf32_hf
+; V79-81-DAG: Inserting after conv:   [[VREG4:\$v[0-9]+]] = V6_vconv_sf_qf32 killed renamable [[VREG4]]
+; V79-81-DAG: Inserting after conv:   [[VREG5:\$v[0-9]+]] = V6_vconv_sf_qf32 killed renamable [[VREG5]]
+; V79-81-DAG: Inserting new instruction:   $v{{[0-9]+}} = V6_vadd_sf killed renamable [[VREG2]], killed renamable [[VREG0]]
+; V79-81-DAG: Inserting new instruction:   $v{{[0-9]+}} = V6_vsub_sf killed renamable $v{{[0-9]+}}, killed renamable $v{{[0-9]+}}
+;
+; V79-81: Analyzing convert instruction:   renamable [[VREG6:\$v[0-9]+]] = V6_vconv_hf_qf32 killed renamable $w{{[0-9]+}}
+; V79: Inserting new instruction:   [[VREG30:\$v[0-9]+]] = V6_vd0
+; V79-NEXT: Inserting new instruction:   [[VREG7:\$v[0-9]+]] = V6_vadd_sf killed renamable [[VREG7]], killed [[VREG30]]
+; V79: Inserting new instruction:   [[VREG30]] = V6_vd0
+; V79-NEXT: Inserting new instruction:   [[VREG8:\$v[0-9]+]] = V6_vadd_sf killed renamable [[VREG8]], killed [[VREG30]]
+; V81: Inserting new instruction:  [[VREG7:\$v[0-9]+]] = V6_vconv_qf32_sf killed renamable [[VREG7]]
+; V81: Inserting new instruction:  [[VREG8:\$v[0-9]+]] = V6_vconv_qf32_sf killed renamable [[VREG8]]
+
+; V79-81: Analyzing convert instruction:   renamable [[VREG9:\$v[0-9]+]] = V6_vconv_sf_qf32 killed renamable $v{{[0-9]+}}
+; V79: Inserting new instruction:   [[VREG30]] = V6_vd0
+; V79-NEXT: Inserting new instruction:   [[VREG10:\$v[0-9]+]] = V6_vadd_sf killed renamable [[VREG10]], killed [[VREG30]]
+; V81: Inserting new instruction:  [[VREG8:\$v[0-9]+]] = V6_vconv_qf32_sf killed renamable [[VREG8]]
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon"
+
+@.str.1 = private unnamed_addr constant [9 x i8] c"0x%08lx \00", align 1
+@.str.3 = private unnamed_addr constant [173 x i8] c"/prj/qct/llvm/devops/aether/hexbuild/test_trees/MASTER/test/regress/features/hexagon/arch_v68/hvx_ieee_fp/hvx_ieee_fp_test.c:126 0 && \22ERROR: Failed to acquire HVX unit.\\n\22\00", align 1
+@__func__.main = private unnamed_addr constant [5 x i8] c"main\00", align 1
+@.str.5 = private unnamed_addr constant [33 x i8] c"half -3 converted to vhf = %.2f\0A\00", align 1
+@.str.6 = private unnamed_addr constant [35 x i8] c"uhalf 32k converted to vhf = %.2f\0A\00", align 1
+@.str.7 = private unnamed_addr constant [32 x i8] c"sf 0.5 converted to vhf = %.2f\0A\00", align 1
+@.str.8 = private unnamed_addr constant [32 x i8] c"vhf 4.0 conveted to ubyte = %d\0A\00", align 1
+@.str.9 = private unnamed_addr constant [32 x i8] c"vhf 2.0 conveted to uhalf = %d\0A\00", align 1
+@.str.10 = private unnamed_addr constant [30 x i8] c"byte 4 conveted to hf = %.2f\0A\00", align 1
+@.str.11 = private unnamed_addr constant [31 x i8] c"ubyte 4 conveted to hf = %.2f\0A\00", align 1
+@.str.12 = private unnamed_addr constant [27 x i8] c"hf -3 conveted to sf = %f\0A\00", align 1
+@.str.13 = private unnamed_addr constant [31 x i8] c"vhf 4.0 conveted to byte = %d\0A\00", align 1
+@.str.14 = private unnamed_addr constant [31 x i8] c"vhf 4.0 conveted to half = %d\0A\00", align 1
+@.str.16 = private unnamed_addr constant [33 x i8] c"max of hf 2.0 and hf 4.0 = %.2f\0A\00", align 1
+@.str.17 = private unnamed_addr constant [33 x i8] c"min of hf 2.0 and hf 4.0 = %.2f\0A\00", align 1
+@.str.18 = private unnamed_addr constant [32 x i8] c"max of sf 0.5 and sf 0.25 = %f\0A\00", align 1
+@.str.19 = private unnamed_addr constant [32 x i8] c"min of sf 0.5 and sf 0.25 = %f\0A\00", align 1
+@.str.21 = private unnamed_addr constant [25 x i8] c"negate of hf 4.0 = %.2f\0A\00", align 1
+@.str.22 = private unnamed_addr constant [23 x i8] c"abs of hf -6.0 = %.2f\0A\00", align 1
+@.str.23 = private unnamed_addr constant [23 x i8] c"negate of sf 0.5 = %f\0A\00", align 1
+@.str.24 = private unnamed_addr constant [22 x i8] c"abs of sf -0.25 = %f\0A\00", align 1
+@.str.26 = private unnamed_addr constant [32 x i8] c"hf add of 4.0 and -6.0  = %.2f\0A\00", align 1
+@.str.27 = private unnamed_addr constant [32 x i8] c"hf sub of 4.0 and -6.0  = %.2f\0A\00", align 1
+@.str.28 = private unnamed_addr constant [31 x i8] c"sf add of 0.5 and -0.25  = %f\0A\00", align 1
+@.str.29 = private unnamed_addr constant [31 x i8] c"sf sub of 0.5 and -0.25  = %f\0A\00", align 1
+@.str.30 = private unnamed_addr constant [36 x i8] c"sf add of hf 4.0 and hf -6.0  = %f\0A\00", align 1
+@.str.31 = private unnamed_addr constant [36 x i8] c"sf sub of hf 4.0 and hf -6.0  = %f\0A\00", align 1
+@.str.33 = private unnamed_addr constant [32 x i8] c"hf mpy of 4.0 and -6.0  = %.2f\0A\00", align 1
+@.str.34 = private unnamed_addr constant [35 x i8] c"hf accmpy of 4.0 and -6.0  = %.2f\0A\00", align 1
+@.str.35 = private unnamed_addr constant [36 x i8] c"sf mpy of hf 4.0 and hf -6.0  = %f\0A\00", align 1
+@.str.36 = private unnamed_addr constant [39 x i8] c"sf accmpy of hf 4.0 and hf -6.0  = %f\0A\00", align 1
+@.str.37 = private unnamed_addr constant [31 x i8] c"sf mpy of 0.5 and -0.25  = %f\0A\00", align 1
+@.str.39 = private unnamed_addr constant [25 x i8] c"w copy from sf 0.5 = %f\0A\00", align 1
+@str = private unnamed_addr constant [35 x i8] c"ERROR: Failed to acquire HVX unit.\00", align 1
+@str.40 = private unnamed_addr constant [25 x i8] c"\0AConversion intructions\0A\00", align 1
+@str.41 = private unnamed_addr constant [23 x i8] c"\0AMin/Max instructions\0A\00", align 1
+@str.42 = private unnamed_addr constant [23 x i8] c"\0Aabs/neg instructions\0A\00", align 1
+@str.43 = private unnamed_addr constant [23 x i8] c"\0Aadd/sub instructions\0A\00", align 1
+@str.44 = private unnamed_addr constant [24 x i8] c"\0Amultiply instructions\0A\00", align 1
+@str.45 = private unnamed_addr constant [19 x i8] c"\0Acopy instruction\0A\00", align 1
+
+declare dso_local void @print_vector_words(<32 x i32> noundef %x) local_unnamed_addr #0
+
+; Function Attrs: nofree nounwind optsize
+declare dso_local noundef i32 @printf(ptr nocapture noundef readonly, ...) local_unnamed_addr #0
+
+; Function Attrs: nounwind optsize
+define dso_local i32 @main(i32 noundef %argc, ptr nocapture noundef readnone %argv) local_unnamed_addr #1 {
+entry:
+  %call = tail call i32 @acquire_vector_unit(i8 noundef zeroext 0) #6
+  %tobool.not = icmp eq i32 %call, 0
+  br i1 %tobool.not, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  %puts = tail call i32 @puts(ptr nonnull dereferenceable(1) @str)
+  tail call void @_Assert(ptr noundef nonnull @.str.3, ptr noundef nonnull @__func__.main) #7
+  unreachable
+
+if.end:                                           ; preds = %entry
+  tail call void @set_double_vector_mode() #6
+  %0 = tail call <32 x i32> @llvm.hexagon.V6.lvsplath.128B(i32 16384)
+  %1 = tail call <32 x i32> @llvm.hexagon.V6.lvsplath.128B(i32 17408)
+  %2 = tail call <32 x i32> @llvm.hexagon.V6.lvsplath.128B(i32 -14848)
+  %3 = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1056964608)
+  %4 = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1048576000)
+  %5 = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 -1098907648)
+  %6 = tail call <32 x i32> @llvm.hexagon.V6.lvsplath.128B(i32 -3)
+  %7 = tail call <32 x i32> @llvm.hexagon.V6.lvsplath.128B(i32 32768)
+  %puts147 = tail call i32 @puts(ptr nonnull dereferenceable(1) @str.40)
+  %8 = tail call <32 x i32> @llvm.hexagon.V6.vcvt.hf.h.128B(<32 x i32> %6)
+  %bc.i = bitcast <32 x i32> %8 to <64 x half>
+  %9 = extractelement <64 x half> %bc.i, i64 0
+  %conv = fpext half %9 to double
+  %call12 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.5, double noundef %conv) #6
+  %10 = tail call <32 x i32> @llvm.hexagon.V6.vcvt.hf.uh.128B(<32 x i32> %7)
+  %bc.i153 = bitcast <32 x i32> %10 to <64 x half>
+  %11 = extractelement <64 x half> %bc.i153, i64 0
+  %conv14 = fpext half %11 to double
+  %call15 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.6, double noundef %conv14) #6
+  %12 = tail call <32 x i32> @llvm.hexagon.V6.vcvt.hf.sf.128B(<32 x i32> %3, <32 x i32> %3)
+  %bc.i155 = bitcast <32 x i32> %12 to <64 x half>
+  %13 = extractelement <64 x half> %bc.i155, i64 0
+  %conv17 = fpext half %13 to double
+  %call18 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.7, double noundef %conv17) #6
+  %14 = tail call <32 x i32> @llvm.hexagon.V6.vcvt.ub.hf.128B(<32 x i32> %1, <32 x i32> %1)
+  %15 = bitcast <32 x i32> %14 to <128 x i8>
+  %conv.i = extractelement <128 x i8> %15, i64 0
+  %conv20 = zext i8 %conv.i to i32
+  %call21 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.8, i32 noundef %conv20) #6
+  %16 = tail call <32 x i32> @llvm.hexagon.V6.vcvt.uh.hf.128B(<32 x i32> %0)
+  %17 = bitcast <32 x i32> %16 to <64 x i16>
+  %conv.i157 = extractelement <64 x i16> %17, i64 0
+  %conv23 = sext i16 %conv.i157 to i32
+  %call24 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.9, i32 noundef %conv23) #6
+  %18 = tail call <64 x i32> @llvm.hexagon.V6.vcvt.hf.b.128B(<32 x i32> %14)
+  %bc.i158 = bitcast <64 x i32> %18 to <128 x half>
+  %19 = extractelement <128 x half> %bc.i158, i64 0
+  %conv26 = fpext half %19 to double
+  %call27 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.10, double noundef %conv26) #6
+  %20 = tail call <64 x i32> @llvm.hexagon.V6.vcvt.hf.ub.128B(<32 x i32> %14)
+  %bc.i159 = bitcast <64 x i32> %20 to <128 x half>
+  %21 = extractelement <128 x half> %bc.i159, i64 0
+  %conv29 = fpext half %21 to double
+  %call30 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.11, double noundef %conv29) #6
+  %22 = tail call <64 x i32> @llvm.hexagon.V6.vcvt.sf.hf.128B(<32 x i32> %8)
+  %bc.i161 = bitcast <64 x i32> %22 to <64 x float>
+  %23 = extractelement <64 x float> %bc.i161, i64 0
+  %conv32 = fpext float %23 to double
+  %call33 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.12, double noundef %conv32) #6
+  %24 = tail call <32 x i32> @llvm.hexagon.V6.vcvt.b.hf.128B(<32 x i32> %1, <32 x i32> %1)
+  %25 = bitcast <32 x i32> %24 to <128 x i8>
+  %conv.i162 = extractelement <128 x i8> %25, i64 0
+  %conv35 = zext i8 %conv.i162 to i32
+  %call36 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.13, i32 noundef %conv35) #6
+  %26 = tail call <32 x i32> @llvm.hexagon.V6.vcvt.h.hf.128B(<32 x i32> %1)
+  %27 = bitcast <32 x i32> %26 to <64 x i16>
+  %conv.i163 = extractelement <64 x i16> %27, i64 0
+  %conv38 = sext i16 %conv.i163 to i32
+  %call39 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.14, i32 noundef %conv38) #6
+  %28 = tail call <32 x i32> @llvm.hexagon.V6.vfmax.hf.128B(<32 x i32> %0, <32 x i32> %1)
+  %puts148 = tail call i32 @puts(ptr nonnull dereferenceable(1) @str.41)
+  %bc.i164 = bitcast <32 x i32> %28 to <64 x half>
+  %29 = extractelement <64 x half> %bc.i164, i64 0
+  %conv42 = fpext half %29 to double
+  %call43 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.16, double noundef %conv42) #6
+  %30 = tail call <32 x i32> @llvm.hexagon.V6.vfmin.hf.128B(<32 x i32> %0, <32 x i32> %1)
+  %bc.i166 = bitcast <32 x i32> %30 to <64 x half>
+  %31 = extractelement <64 x half> %bc.i166, i64 0
+  %conv45 = fpext half %31 to double
+  %call46 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.17, double noundef %conv45) #6
+  %32 = tail call <32 x i32> @llvm.hexagon.V6.vfmax.sf.128B(<32 x i32> %3, <32 x i32> %4)
+  %bc.i168 = bitcast <32 x i32> %32 to <32 x float>
+  %33 = extractelement <32 x float> %bc.i168, i64 0
+  %conv48 = fpext float %33 to double
+  %call49 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.18, double noundef %conv48) #6
+  %34 = tail call <32 x i32> @llvm.hexagon.V6.vfmin.sf.128B(<32 x i32> %3, <32 x i32> %4)
+  %bc.i169 = bitcast <32 x i32> %34 to <32 x float>
+  %35 = extractelement <32 x float> %bc.i169, i64 0
+  %conv51 = fpext float %35 to double
+  %call52 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.19, double noundef %conv51) #6
+  %puts149 = tail call i32 @puts(ptr nonnull dereferenceable(1) @str.42)
+  %36 = tail call <32 x i32> @llvm.hexagon.V6.vfneg.hf.128B(<32 x i32> %1)
+  %bc.i170 = bitcast <32 x i32> %36 to <64 x half>
+  %37 = extractelement <64 x half> %bc.i170, i64 0
+  %conv55 = fpext half %37 to double
+  %call56 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.21, double noundef %conv55) #6
+  %38 = tail call <32 x i32> @llvm.hexagon.V6.vabs.hf.128B(<32 x i32> %2)
+  %bc.i172 = bitcast <32 x i32> %38 to <64 x half>
+  %39 = extractelement <64 x half> %bc.i172, i64 0
+  %conv58 = fpext half %39 to double
+  %call59 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.22, double noundef %conv58) #6
+  %40 = tail call <32 x i32> @llvm.hexagon.V6.vfneg.sf.128B(<32 x i32> %3)
+  %bc.i174 = bitcast <32 x i32> %40 to <32 x float>
+  %41 = extractelement <32 x float> %bc.i174, i64 0
+  %conv61 = fpext float %41 to double
+  %call62 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.23, double noundef %conv61) #6
+  %42 = tail call <32 x i32> @llvm.hexagon.V6.vabs.sf.128B(<32 x i32> %5)
+  %bc.i175 = bitcast <32 x i32> %42 to <32 x float>
+  %43 = extractelement <32 x float> %bc.i175, i64 0
+  %conv64 = fpext float %43 to double
+  %call65 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.24, double noundef %conv64) #6
+  %puts150 = tail call i32 @puts(ptr nonnull dereferenceable(1) @str.43)
+  %44 = tail call <32 x i32> @llvm.hexagon.V6.vadd.hf.hf.128B(<32 x i32> %1, <32 x i32> %2)
+  %bc.i176 = bitcast <32 x i32> %44 to <64 x half>
+  %45 = extractelement <64 x half> %bc.i176, i64 0
+  %conv68 = fpext half %45 to double
+  %call69 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.26, double noundef %conv68) #6
+  %46 = tail call <32 x i32> @llvm.hexagon.V6.vsub.hf.hf.128B(<32 x i32> %1, <32 x i32> %2)
+  %bc.i178 = bitcast <32 x i32> %46 to <64 x half>
+  %47 = extractelement <64 x half> %bc.i178, i64 0
+  %conv71 = fpext half %47 to double
+  %call72 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.27, double noundef %conv71) #6
+  %48 = tail call <32 x i32> @llvm.hexagon.V6.vadd.sf.sf.128B(<32 x i32> %3, <32 x i32> %5)
+  %bc.i180 = bitcast <32 x i32> %48 to <32 x float>
+  %49 = extractelement <32 x float> %bc.i180, i64 0
+  %conv74 = fpext float %49 to double
+  %call75 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.28, double noundef %conv74) #6
+  %50 = tail call <32 x i32> @llvm.hexagon.V6.vsub.sf.sf.128B(<32 x i32> %3, <32 x i32> %5)
+  %bc.i181 = bitcast <32 x i32> %50 to <32 x float>
+  %51 = extractelement <32 x float> %bc.i181, i64 0
+  %conv77 = fpext float %51 to double
+  %call78 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.29, double noundef %conv77) #6
+  %52 = tail call <64 x i32> @llvm.hexagon.V6.vadd.sf.hf.128B(<32 x i32> %1, <32 x i32> %2)
+  %bc.i182 = bitcast <64 x i32> %52 to <64 x float>
+  %53 = extractelement <64 x float> %bc.i182, i64 0
+  %conv80 = fpext float %53 to double
+  %call81 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.30, double noundef %conv80) #6
+  %54 = tail call <64 x i32> @llvm.hexagon.V6.vsub.sf.hf.128B(<32 x i32> %1, <32 x i32> %2)
+  %bc.i183 = bitcast <64 x i32> %54 to <64 x float>
+  %55 = extractelement <64 x float> %bc.i183, i64 0
+  %conv83 = fpext float %55 to double
+  %call84 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.31, double noundef %conv83) #6
+  %puts151 = tail call i32 @puts(ptr nonnull dereferenceable(1) @str.44)
+  %56 = tail call <32 x i32> @llvm.hexagon.V6.vmpy.hf.hf.128B(<32 x i32> %1, <32 x i32> %2)
+  %bc.i184 = bitcast <32 x i32> %56 to <64 x half>
+  %57 = extractelement <64 x half> %bc.i184, i64 0
+  %conv87 = fpext half %57 to double
+  %call88 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.33, double noundef %conv87) #6
+  %58 = tail call <32 x i32> @llvm.hexagon.V6.vmpy.hf.hf.acc.128B(<32 x i32> %56, <32 x i32> %1, <32 x i32> %2)
+  %bc.i186 = bitcast <32 x i32> %58 to <64 x half>
+  %59 = extractelement <64 x half> %bc.i186, i64 0
+  %conv90 = fpext half %59 to double
+  %call91 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.34, double noundef %conv90) #6
+  %60 = tail call <64 x i32> @llvm.hexagon.V6.vmpy.sf.hf.128B(<32 x i32> %1, <32 x i32> %2)
+  %bc.i188 = bitcast <64 x i32> %60 to <64 x float>
+  %61 = extractelement <64 x float> %bc.i188, i64 0
+  %conv93 = fpext float %61 to double
+  %call94 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.35, double noundef %conv93) #6
+  %62 = tail call <64 x i32> @llvm.hexagon.V6.vmpy.sf.hf.acc.128B(<64 x i32> %60, <32 x i32> %1, <32 x i32> %2)
+  %bc.i189 = bitcast <64 x i32> %62 to <64 x float>
+  %63 = extractelement <64 x float> %bc.i189, i64 0
+  %conv96 = fpext float %63 to double
+  %call97 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.36, double noundef %conv96) #6
+  %64 = tail call <32 x i32> @llvm.hexagon.V6.vmpy.sf.sf.128B(<32 x i32> %3, <32 x i32> %5)
+  %bc.i190 = bitcast <32 x i32> %64 to <32 x float>
+  %65 = extractelement <32 x float> %bc.i190, i64 0
+  %conv99 = fpext float %65 to double
+  %call100 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.37, double noundef %conv99) #6
+  %puts152 = tail call i32 @puts(ptr nonnull dereferenceable(1) @str.45)
+  %66 = tail call <32 x i32> @llvm.hexagon.V6.vassign.fp.128B(<32 x i32> %3)
+  %bc.i191 = bitcast <32 x i32> %66 to <32 x float>
+  %67 = extractelement <32 x float> %bc.i191, i64 0
+  %conv103 = fpext float %67 to double
+  %call104 = tail call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str.39, double noundef %conv103) #6
+  ret i32 0
+}
+
+; Function Attrs: optsize
+declare dso_local i32 @acquire_vector_unit(i8 noundef zeroext) local_unnamed_addr #2
+
+; Function Attrs: noreturn nounwind optsize
+declare dso_local void @_Assert(ptr noundef, ptr noundef) local_unnamed_addr #3
+
+; Function Attrs: optsize
+declare dso_local void @set_double_vector_mode(...) local_unnamed_addr #2
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <32 x i32> @llvm.hexagon.V6.vcvt.hf.h.128B(<32 x i32>) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <32 x i32> @llvm.hexagon.V6.vcvt.hf.uh.128B(<32 x i32>) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <32 x i32> @llvm.hexagon.V6.vcvt.hf.sf.128B(<32 x i32>, <32 x i32>) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <32 x i32> @llvm.hexagon.V6.vcvt.ub.hf.128B(<32 x i32>, <32 x i32>) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <32 x i32> @llvm.hexagon.V6.vcvt.uh.hf.128B(<32 x i32>) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <64 x i32> @llvm.hexagon.V6.vcvt.hf.b.128B(<32 x i32>) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <64 x i32> @llvm.hexagon.V6.vcvt.hf.ub.128B(<32 x i32>) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <64 x i32> @llvm.hexagon.V6.vcvt.sf.hf.128B(<32 x i32>) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <32 x i32> @llvm.hexagon.V6.vcvt.b.hf.128B(<32 x i32>, <32 x i32>) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <32 x i32> @llvm.hexagon.V6.vcvt.h.hf.128B(<32 x i32>) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <32 x i32> @llvm.hexagon.V6.vfmax.hf.128B(<32 x i32>, <32 x i32>) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <32 x i32> @llvm.hexagon.V6.vfmin.hf.128B(<32 x i32>, <32 x i32>) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <32 x i32> @llvm.hexagon.V6.vfmax.sf.128B(<32 x i32>, <32 x i32>) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <32 x i32> @llvm.hexagon.V6.vfmin.sf.128B(<32 x i32>, <32 x i32>) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <32 x i32> @llvm.hexagon.V6.vfneg.hf.128B(<32 x i32>) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <32 x i32> @llvm.hexagon.V6.vabs.hf.128B(<32 x i32>) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <32 x i32> @llvm.hexagon.V6.vfneg.sf.128B(<32 x i32>) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <32 x i32> @llvm.hexagon.V6.vabs.sf.128B(<32 x i32>) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <32 x i32> @llvm.hexagon.V6.vadd.hf.hf.128B(<32 x i32>, <32 x i32>) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <32 x i32> @llvm.hexagon.V6.vsub.hf.hf.128B(<32 x i32>, <32 x i32>) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <32 x i32> @llvm.hexagon.V6.vadd.sf.sf.128B(<32 x i32>, <32 x i32>) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <32 x i32> @llvm.hexagon.V6.vsub.sf.sf.128B(<32 x i32>, <32 x i32>) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <64 x i32> @llvm.hexagon.V6.vadd.sf.hf.128B(<32 x i32>, <32 x i32>) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <64 x i32> @llvm.hexagon.V6.vsub.sf.hf.128B(<32 x i32>, <32 x i32>) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <32 x i32> @llvm.hexagon.V6.vmpy.hf.hf.128B(<32 x i32>, <32 x i32>) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <32 x i32> @llvm.hexagon.V6.vmpy.hf.hf.acc.128B(<32 x i32>, <32 x i32>, <32 x i32>) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <64 x i32> @llvm.hexagon.V6.vmpy.sf.hf.128B(<32 x i32>, <32 x i32>) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <64 x i32> @llvm.hexagon.V6.vmpy.sf.hf.acc.128B(<64 x i32>, <32 x i32>, <32 x i32>) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <32 x i32> @llvm.hexagon.V6.vmpy.sf.sf.128B(<32 x i32>, <32 x i32>) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <32 x i32> @llvm.hexagon.V6.vassign.fp.128B(<32 x i32>) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <32 x i32> @llvm.hexagon.V6.lvsplath.128B(i32) #4
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none)
+declare <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32) #4
+
+; Function Attrs: nofree nounwind
+declare noundef i32 @putchar(i32 noundef) local_unnamed_addr #5
+
+; Function Attrs: nofree nounwind
+declare noundef i32 @puts(ptr nocapture noundef readonly) local_unnamed_addr #5
--- a/llvm/test/CodeGen/Hexagon/hvx-vsub-qf-sf-mix.ll
+++ b/llvm/test/CodeGen/Hexagon/hvx-vsub-qf-sf-mix.ll
@ -0,0 +1,60 @@
+;; RUN: llc --mtriple=hexagon --mcpu=hexagonv81 --mattr=+hvxv81,+hvx-length128b %s -o - | FileCheck %s
+
+define void @mul_and_sub_1(ptr readonly %A, ptr readonly %B, ptr readonly %C, ptr writeonly %D) {
+entry:
+  %AVec = load <32 x float>, ptr %A, align 4
+  %BVec = load <32 x float>, ptr %B, align 4
+  %CVec = load <32 x float>, ptr %C, align 4
+  %AtBVec = fmul <32 x float> %AVec, %BVec
+
+  %DVec = fsub <32 x float> %CVec, %AtBVec
+  store <32 x float> %DVec, ptr %D, align 4
+  ret void
+}
+;; CHECK: mul_and_sub_1
+;; CHECK: vsub(v{{[0-9]+}}.sf,v{{[0-9]+}}.qf32)
+
+
+define void @mul_and_sub_2(ptr readonly %A, ptr readonly %B, ptr readonly %C, ptr writeonly %D) {
+entry:
+  %AVec = load <32 x float>, ptr %A, align 4
+  %BVec = load <32 x float>, ptr %B, align 4
+  %CVec = load <32 x float>, ptr %C, align 4
+  %AtBVec = fmul <32 x float> %AVec, %BVec
+
+  %DVec = fsub <32 x float> %AtBVec, %CVec
+  store <32 x float> %DVec, ptr %D, align 4
+  ret void
+}
+;; CHECK: mul_and_sub_2
+;; CHECK: vsub(v{{[0-9]+}}.qf32,v{{[0-9]+}}.sf)
+
+
+define void @mul_and_sub_3(ptr readonly %A, ptr readonly %B, ptr readonly %C, ptr writeonly %D) {
+entry:
+  %AVec = load <64 x half>, ptr %A, align 4
+  %BVec = load <64 x half>, ptr %B, align 4
+  %CVec = load <64 x half>, ptr %C, align 4
+  %AtBVec = fmul <64 x half> %AVec, %BVec
+
+  %DVec = fsub <64 x half> %CVec, %AtBVec
+  store <64 x half> %DVec, ptr %D, align 4
+  ret void
+}
+;; CHECK: mul_and_sub_3
+;; CHECK: vsub(v{{[0-9]+}}.hf,v{{[0-9]+}}.qf16)
+
+
+define void @mul_and_sub_4(ptr readonly %A, ptr readonly %B, ptr readonly %C, ptr writeonly %D) {
+entry:
+  %AVec = load <64 x half>, ptr %A, align 4
+  %BVec = load <64 x half>, ptr %B, align 4
+  %CVec = load <64 x half>, ptr %C, align 4
+  %AtBVec = fmul <64 x half> %AVec, %BVec
+
+  %DVec = fsub <64 x half> %AtBVec, %CVec
+  store <64 x half> %DVec, ptr %D, align 4
+  ret void
+}
+;; CHECK: mul_and_sub_4
+;; CHECK: vsub(v{{[0-9]+}}.qf16,v{{[0-9]+}}.hf)
--- a/llvm/test/CodeGen/Hexagon/qfpopt-rem-conv-add.ll
+++ b/llvm/test/CodeGen/Hexagon/qfpopt-rem-conv-add.ll
@ -2,7 +2,7 @@
 ; type as first parameter instead of a sf type without
 ; any conversion instruction of type sf = qf32

-; RUN: llc -mtriple=hexagon < %s -o - | FileCheck %s
+; RUN: llc -mtriple=hexagon -mattr=+hvx-length128b,+hvxv75,+v75 < %s -o - | FileCheck %s

 ; CHECK: [[V2:v[0-9]+]] = vxor([[V2]],[[V2]])
 ; CHECK: [[V0:v[0-9]+]].qf32 = vmpy([[V0]].sf,[[V2]].sf)
@ -17,5 +17,3 @@ entry:
  store <64 x half> %conv17.ripple.vectorized, ptr %out_ptr, align 2
  ret void
 }
-
-attributes #0 = { "target-features"="+hvx-length128b,+hvxv75,+v75,-long-calls,-small-data" }
--- a/llvm/test/CodeGen/Hexagon/vect-qfp.mir
+++ b/llvm/test/CodeGen/Hexagon/vect-qfp.mir
@ -0,0 +1,202 @@
+# RUN: llc -march=hexagon -mcpu=hexagonv68 -mattr=+hvxv68,+hvx-length128b \
+# RUN: -run-pass hexagon-qfp-optimizer -disable-qfp-opt-mul=false %s -o - | FileCheck %s --check-prefix=MUL-ENABLED
+# RUN: llc -march=hexagon -mcpu=hexagonv68 -mattr=+hvxv68,+hvx-length128b \
+# RUN: -run-pass hexagon-qfp-optimizer %s -o - | FileCheck %s --check-prefix=DEFAULT
+# MUL-ENABLED-LABEL: name: qfpAdd32
+# MUL-ENABLED: V6_vconv_sf_qf32
+# MUL-ENABLED-NEXT: V6_vadd_qf32_mix
+# MUL-ENABLED-NEXT: V6_vconv_sf_qf32
+# MUL-ENABLED-NEXT: V6_vS32Ub_ai
+# MUL-ENABLED-NEXT: V6_vadd_qf32
+# DEFAULT-LABEL: name: qfpAdd32
+# DEFAULT: V6_vconv_sf_qf32
+# DEFAULT-NEXT: V6_vadd_qf32_mix
+# DEFAULT-NEXT: V6_vconv_sf_qf32
+# DEFAULT-NEXT: V6_vS32Ub_ai
+# DEFAULT-NEXT: V6_vadd_qf32
+---
+name: qfpAdd32
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $r0, $r1, $r2, $r3
+    %0:intregs = COPY $r0
+    %1:intregs = COPY $r1
+    %2:intregs = COPY $r2
+    %3:intregs = COPY $r3
+    %4:hvxvr = V6_vL32Ub_ai %0:intregs, 0
+    %5:hvxvr = V6_vL32Ub_ai %1:intregs, 0
+    %6:hvxvr = V6_vadd_sf %4:hvxvr, %5:hvxvr
+    %7:hvxvr = V6_vconv_sf_qf32 %6:hvxvr
+    %8:hvxvr = V6_vadd_sf %5:hvxvr, %7:hvxvr
+    %9:hvxvr = V6_vconv_sf_qf32 %8:hvxvr
+    V6_vS32Ub_ai %2:intregs, 0, %9:hvxvr
+    %10:hvxvr = V6_vadd_sf %7:hvxvr, %9:hvxvr
+    %11:hvxvr = V6_vconv_sf_qf32 %10:hvxvr
+    V6_vS32Ub_ai %3:intregs, 0, %11:hvxvr
+...
+# MUL-ENABLED-LABEL: name: qfpAdd16
+# MUL-ENABLED: V6_vconv_hf_qf16
+# MUL-ENABLED-NEXT: V6_vadd_qf16_mix
+# MUL-ENABLED-NEXT: V6_vconv_hf_qf16
+# MUL-ENABLED-NEXT: V6_vS32Ub_ai
+# MUL-ENABLED-NEXT: V6_vadd_qf16
+# DEFAULT-LABEL: name: qfpAdd16
+# DEFAULT: V6_vconv_hf_qf16
+# DEFAULT-NEXT: V6_vadd_qf16_mix
+# DEFAULT-NEXT: V6_vconv_hf_qf16
+# DEFAULT-NEXT: V6_vS32Ub_ai
+# DEFAULT-NEXT: V6_vadd_qf16
+---
+name: qfpAdd16
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $r0, $r1, $r2, $r3
+    %0:intregs = COPY $r0
+    %1:intregs = COPY $r1
+    %2:intregs = COPY $r2
+    %3:intregs = COPY $r3
+    %4:hvxvr = V6_vL32Ub_ai %0:intregs, 0
+    %5:hvxvr = V6_vL32Ub_ai %1:intregs, 0
+    %6:hvxvr = V6_vadd_hf %4:hvxvr, %5:hvxvr
+    %7:hvxvr = V6_vconv_hf_qf16 %6:hvxvr
+    %8:hvxvr = V6_vadd_hf %5:hvxvr, %7:hvxvr
+    %9:hvxvr = V6_vconv_hf_qf16 %8:hvxvr
+    V6_vS32Ub_ai %2:intregs, 0, %9:hvxvr
+    %10:hvxvr = V6_vadd_hf %7:hvxvr, %9:hvxvr
+    %11:hvxvr = V6_vconv_hf_qf16 %10:hvxvr
+    V6_vS32Ub_ai %3:intregs, 0, %11:hvxvr
+...
+# MUL-ENABLED-LABEL: name: qfpSub32
+# MUL-ENABLED: V6_vconv_sf_qf32
+# MUL-ENABLED-NEXT: V6_vsub_qf32_mix
+# MUL-ENABLED-NEXT: V6_vconv_sf_qf32
+# MUL-ENABLED-NEXT: V6_vS32Ub_ai
+# MUL-ENABLED-NEXT: V6_vsub_qf32
+# DEFAULT-LABEL: name: qfpSub32
+# DEFAULT: V6_vconv_sf_qf32
+# DEFAULT-NEXT: V6_vsub_qf32_mix
+# DEFAULT-NEXT: V6_vconv_sf_qf32
+# DEFAULT-NEXT: V6_vS32Ub_ai
+# DEFAULT-NEXT: V6_vsub_qf32
+---
+name: qfpSub32
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $r0, $r1, $r2, $r3
+    %0:intregs = COPY $r0
+    %1:intregs = COPY $r1
+    %2:intregs = COPY $r2
+    %3:intregs = COPY $r3
+    %4:hvxvr = V6_vL32Ub_ai %0:intregs, 0
+    %5:hvxvr = V6_vL32Ub_ai %1:intregs, 0
+    %6:hvxvr = V6_vsub_sf %4:hvxvr, %5:hvxvr
+    %7:hvxvr = V6_vconv_sf_qf32 %6:hvxvr
+    %8:hvxvr = V6_vsub_sf %7:hvxvr, %5:hvxvr
+    %9:hvxvr = V6_vconv_sf_qf32 %8:hvxvr
+    V6_vS32Ub_ai %2:intregs, 0, %9:hvxvr
+    %10:hvxvr = V6_vsub_sf %7:hvxvr, %9:hvxvr
+    %11:hvxvr = V6_vconv_sf_qf32 %10:hvxvr
+    V6_vS32Ub_ai %3:intregs, 0, %11:hvxvr
+...
+# MUL-ENABLED-LABEL: name: qfpSub16
+# MUL-ENABLED: V6_vconv_hf_qf16
+# MUL-ENABLED-NEXT: V6_vsub_qf16_mix
+# MUL-ENABLED-NEXT: V6_vconv_hf_qf16
+# MUL-ENABLED-NEXT: V6_vS32Ub_ai
+# MUL-ENABLED-NEXT: V6_vsub_qf16
+# DEFAULT-LABEL: name: qfpSub16
+# DEFAULT: V6_vconv_hf_qf16
+# DEFAULT-NEXT: V6_vsub_qf16_mix
+# DEFAULT-NEXT: V6_vconv_hf_qf16
+# DEFAULT-NEXT: V6_vS32Ub_ai
+# DEFAULT-NEXT: V6_vsub_qf16
+---
+name: qfpSub16
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $r0, $r1, $r2, $r3
+    %0:intregs = COPY $r0
+    %1:intregs = COPY $r1
+    %2:intregs = COPY $r2
+    %3:intregs = COPY $r3
+    %4:hvxvr = V6_vL32Ub_ai %0:intregs, 0
+    %5:hvxvr = V6_vL32Ub_ai %1:intregs, 0
+    %6:hvxvr = V6_vsub_hf %4:hvxvr, %5:hvxvr
+    %7:hvxvr = V6_vconv_hf_qf16 %6:hvxvr
+    %8:hvxvr = V6_vsub_hf %7:hvxvr, %5:hvxvr
+    %9:hvxvr = V6_vconv_hf_qf16 %8:hvxvr
+    V6_vS32Ub_ai %2:intregs, 0, %9:hvxvr
+    %10:hvxvr = V6_vsub_hf %7:hvxvr, %9:hvxvr
+    %11:hvxvr = V6_vconv_hf_qf16 %10:hvxvr
+    V6_vS32Ub_ai %3:intregs, 0, %11:hvxvr
+...
+# MUL-ENABLED-LABEL: name: qfpMul32
+# MUL-ENABLED: V6_vmpy_qf32_sf
+# MUL-ENABLED-NEXT: V6_vconv_sf_qf32
+# MUL-ENABLED-NEXT: V6_vmpy_qf32_sf
+# MUL-ENABLED-NEXT: V6_vconv_sf_qf32
+# MUL-ENABLED-NEXT: V6_vmpy_qf32
+# MUL-ENABLED-NEXT: V6_vS32Ub_ai
+# DEFAULT-LABEL: name: qfpMul32
+# DEFAULT: V6_vmpy_qf32_sf
+# DEFAULT-NEXT: V6_vconv_sf_qf32
+# DEFAULT-NEXT: V6_vmpy_qf32_sf
+# DEFAULT-NEXT: V6_vconv_sf_qf32
+# DEFAULT-NEXT: V6_vmpy_qf32_sf
+# DEFAULT-NEXT: V6_vS32Ub_ai
+---
+name: qfpMul32
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $r0, $r1, $r2, $r3
+    %0:intregs = COPY $r0
+    %1:intregs = COPY $r1
+    %2:intregs = COPY $r2
+    %3:intregs = COPY $r3
+    %4:hvxvr = V6_vL32Ub_ai %0:intregs, 0
+    %5:hvxvr = V6_vL32Ub_ai %1:intregs, 0
+    %6:hvxvr = V6_vL32Ub_ai %2:intregs, 0
+    %7:hvxvr = V6_vmpy_qf32_sf %4:hvxvr, %5:hvxvr
+    %8:hvxvr = V6_vconv_sf_qf32 %7:hvxvr
+    %9:hvxvr = V6_vmpy_qf32_sf %5:hvxvr, %6:hvxvr
+    %10:hvxvr = V6_vconv_sf_qf32 %9:hvxvr
+    %11:hvxvr = V6_vmpy_qf32_sf %8:hvxvr, %10:hvxvr
+    V6_vS32Ub_ai %3:intregs, 0, %11:hvxvr
+...
+# MUL-ENABLED-LABEL: name: qfpMul16
+# MUL-ENABLED: V6_vconv_hf_qf16
+# MUL-ENABLED-NEXT: V6_vmpy_qf16_mix_hf
+# MUL-ENABLED-NEXT: V6_vconv_hf_qf16
+# MUL-ENABLED-NEXT: V6_vS32Ub_ai
+# MUL-ENABLED-NEXT: V6_vmpy_qf16
+# DEFAULT-LABEL: name: qfpMul16
+# DEFAULT: V6_vconv_hf_qf16
+# DEFAULT-NEXT: V6_vmpy_qf16_hf
+# DEFAULT-NEXT: V6_vconv_hf_qf16
+# DEFAULT-NEXT: V6_vS32Ub_ai
+# DEFAULT-NEXT: V6_vmpy_qf16_hf
+---
+name: qfpMul16
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $r0, $r1, $r2, $r3
+    %0:intregs = COPY $r0
+    %1:intregs = COPY $r1
+    %2:intregs = COPY $r2
+    %3:intregs = COPY $r3
+    %4:hvxvr = V6_vL32Ub_ai %0:intregs, 0
+    %5:hvxvr = V6_vL32Ub_ai %1:intregs, 0
+    %6:hvxvr = V6_vmpy_qf16_hf %4:hvxvr, %5:hvxvr
+    %7:hvxvr = V6_vconv_hf_qf16 %6:hvxvr
+    %8:hvxvr = V6_vmpy_qf16_hf %5:hvxvr, %7:hvxvr
+    %9:hvxvr = V6_vconv_hf_qf16 %8:hvxvr
+    V6_vS32Ub_ai %2:intregs, 0, %9:hvxvr
+    %10:hvxvr = V6_vmpy_qf16_hf %7:hvxvr, %9:hvxvr
+    %11:hvxvr = V6_vconv_hf_qf16 %10:hvxvr
+    V6_vS32Ub_ai %3:intregs, 0, %11:hvxvr
--- a/llvm/test/CodeGen/Hexagon/vect/vect-qfp-unary.mir
+++ b/llvm/test/CodeGen/Hexagon/vect/vect-qfp-unary.mir
@ -0,0 +1,97 @@
+# RUN: llc -march=hexagon -mcpu=hexagonv68 -mattr=+hvxv68,+hvx-length128b \
+# RUN: -run-pass hexagon-qfp-optimizer %s -o - | FileCheck %s
+
+
+# CHECK: name: qfp_vilog32
+# CHECK: V6_vilog2_qf32
+---
+name: qfp_vilog32
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    liveins: $r0, $r1, $r2, $r3
+    $v0 = V6_vL32Ub_ai $r0, 0
+    $v1 = V6_vconv_sf_qf32 $v0
+    $v2 = V6_vilog2_sf $v1
+    V6_vS32Ub_ai $r2, 0, $v2
+...
+
+# CHECK-LABEL: name: qfp_vilog16
+# CHECK: V6_vilog2_qf16
+---
+name: qfp_vilog16
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    liveins: $r0, $r1, $r2, $r3
+    $v0 = V6_vL32Ub_ai $r0, 0
+    $v1 = V6_vconv_hf_qf16 $v0
+    $v2 = V6_vilog2_hf $v1
+    V6_vS32Ub_ai $r2, 0, $v2
+...
+
+# CHECK: name: qfp_vneg32
+# CHECK: V6_vneg_qf32_qf32
+---
+name: qfp_vneg32
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    liveins: $r0, $r1, $r2, $r3
+    $v0 = V6_vL32Ub_ai $r0, 0
+    $v1 = V6_vconv_sf_qf32 $v0
+    $v2 = V6_vneg_qf32_sf $v1
+    $v3 = V6_vconv_sf_qf32 $v2
+    V6_vS32Ub_ai $r2, 0, $v3
+...
+
+# CHECK-LABEL: name: qfp_vneg16
+# CHECK: V6_vneg_qf16_qf16
+---
+name: qfp_vneg16
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    liveins: $r0, $r1, $r2, $r3
+    $v0 = V6_vL32Ub_ai $r0, 0
+    $v1 = V6_vconv_hf_qf16 $v0
+    $v2 = V6_vneg_qf16_hf $v1
+    $v3 = V6_vconv_hf_qf16 $v2
+    V6_vS32Ub_ai $r2, 0, $v3
+...
+
+# CHECK: name: qfp_vabs32
+# CHECK: V6_vabs_qf32_qf32
+---
+name: qfp_vabs32
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    liveins: $r0, $r1, $r2, $r3
+    $v0 = V6_vL32Ub_ai $r0, 0
+    $v1 = V6_vconv_sf_qf32 $v0
+    $v2 = V6_vabs_qf32_sf $v1
+    $v3 = V6_vconv_sf_qf32 $v2
+    V6_vS32Ub_ai $r2, 0, $v3
+...
+
+# CHECK-LABEL: name: qfp_vabs16
+# CHECK: V6_vabs_qf16_qf16
+---
+name: qfp_vabs16
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    liveins: $r0, $r1, $r2, $r3
+    $v0 = V6_vL32Ub_ai $r0, 0
+    $v1 = V6_vconv_hf_qf16 $v0
+    $v2 = V6_vabs_qf16_hf $v1
+    $v3 = V6_vconv_hf_qf16 $v2
+    V6_vS32Ub_ai $r2, 0, $v3
+...