Compare commits

...

29 Commits

Author SHA1 Message Date
Simon Pilgrim
2a79ef66eb
[AMDGPU] canCreateUndefOrPoisonForTargetNode - BFE_I32/U32 can't create poison/undef (#154932)
Add AMDGPUTargetLowering::canCreateUndefOrPoisonForTargetNode handler
and tag BFE_I32/U32 nodes as they can only propagate poison, not create
poison/undef.

Fighting some of the remaining regressions in #152107
2025-08-22 12:14:45 +00:00
Jungwook Park
b149fc7755
[mlir][scf] Quick fix to scf.execute_region no_inline (#154931)
The asm printer should exclude the `no_inline` attribute when printing the
optional attributes at the bottom.
2025-08-22 13:11:27 +01:00
Michael Halkenhäuser
7c1d2467f1
Reland: [OpenMP] Add ompTest library to OpenMP (#154786)
Reland of https://github.com/llvm/llvm-project/pull/147381

Added changes to fix observed BuildBot failures:
* CMake version (reduced minimum to `3.20`, was: `3.22`)
* GoogleTest linking (missing `./build/lib/libllvm_gtest.a`)
* Related header issue (missing `#include "llvm/Support/raw_os_ostream.h"`)

Original message

Description
===========
OpenMP Tooling Interface Testing Library (ompTest)

ompTest is a unit testing framework for testing OpenMP implementations. It offers a simple-to-use framework that allows a tester to check for OMPT events in addition to regular unit testing code, supported by linking against GoogleTest by default. It also facilitates writing concise tests while bridging the semantic gap between the unit under test and the OMPT-event testing.

Background
==========
This library has been developed to provide a means of testing OMPT implementations with reasonable effort. In particular, asynchronous or unordered events are supported and can be verified with ease, which can prove challenging with LIT-based tests. Additionally, since the assertions are part of the code being tested, ompTest can reference all corresponding variables during an assertion.

Basic Usage
===========
OMPT event assertions are placed before the code that shall be tested. These assertions can either be provided as one block or interleaved with the test code. There are two types of asserters: (1) sequenced, "order-sensitive" asserters and (2) set, "unordered" asserters. Once the test is run, the corresponding events are triggered by the OpenMP runtime and can be observed. Each observed event notifies the asserters, which then determine whether the test should pass or fail.

Example (partial, interleaved)
==============================
```c++
  int N = 100000;
  int a[N];
  int b[N];

  OMPT_ASSERT_SEQUENCE(Target, TARGET, BEGIN, 0);
  OMPT_ASSERT_SEQUENCE(TargetDataOp, ALLOC, N * sizeof(int)); // a ?
  OMPT_ASSERT_SEQUENCE(TargetDataOp, H2D, N * sizeof(int), &a);
  OMPT_ASSERT_SEQUENCE(TargetDataOp, ALLOC, N * sizeof(int)); // b ?
  OMPT_ASSERT_SEQUENCE(TargetDataOp, H2D, N * sizeof(int), &b);
  OMPT_ASSERT_SEQUENCE(TargetSubmit, 1);
  OMPT_ASSERT_SEQUENCE(TargetDataOp, D2H, N * sizeof(int), nullptr, &b);
  OMPT_ASSERT_SEQUENCE(TargetDataOp, D2H, N * sizeof(int), nullptr, &a);
  OMPT_ASSERT_SEQUENCE(TargetDataOp, DELETE);
  OMPT_ASSERT_SEQUENCE(TargetDataOp, DELETE);
  OMPT_ASSERT_SEQUENCE(Target, TARGET, END, 0);

#pragma omp target parallel for
  for (int j = 0; j < N; j++)
    a[j] = b[j];
```

References
==========
This work has been presented at SC'24 workshops, see: https://ieeexplore.ieee.org/document/10820689

Current State and Future Work
=============================
ompTest's development was mostly device-centric and aimed at OMPT device callbacks and device-side tracing. Consequently, a substantial part of host-related events or features may not be supported in its current state. However, we are confident that the related functionality can be added and that ompTest provides a general foundation for future OpenMP and especially OMPT testing. This PR will allow us to upstream the corresponding features, like OMPT device-side tracing, in the future with a significantly reduced risk of introducing regressions in the process.

Build
=====
ompTest is linked against LLVM's GoogleTest by default, but can also be built 'standalone'. Additionally, it comes with a set of unit tests, which in turn require GoogleTest (overriding a standalone build). The unit tests are added to the `check-openmp` target.

Use the following parameters to perform the corresponding build: 
`LIBOMPTEST_BUILD_STANDALONE` (Default: ${OPENMP_STANDALONE_BUILD})
`LIBOMPTEST_BUILD_UNITTESTS` (Default: OFF)

---------

Co-authored-by: Jan-Patrick Lehr <JanPatrick.Lehr@amd.com>
Co-authored-by: Joachim <protze@rz.rwth-aachen.de>
Co-authored-by: Joachim Jenke <jenke@itc.rwth-aachen.de>
2025-08-22 13:56:12 +02:00
Leandro Lacerda
15a192cde5
[libc] Enable double math functions on the GPU (#154857)
This patch adds the `acos` math function to the NVPTX build. It also
adds the `sincos` math function to the `math.h` header.
2025-08-22 06:52:13 -05:00
paperchalice
2014890c09
[SelectionDAG] Remove UnsafeFPMath in visitFP_ROUND (#154768)
Remove the `UnsafeFPMath` use in the `visitFP_ROUND` part; it blocks some
bugfixes related to clang, and the ultimate goal is to remove the
`resetTargetOptions` method in `TargetMachine` (see the FIXME in
`resetTargetOptions`).
See also
https://discourse.llvm.org/t/rfc-honor-pragmas-with-ffp-contract-fast

https://discourse.llvm.org/t/allowfpopfusion-vs-sdnodeflags-hasallowcontract

Now all `UnsafeFPMath` uses in LLVMCodeGen are eliminated.
2025-08-22 19:46:33 +08:00
Simon Pilgrim
d8769bb5b7 [AMDGPU] bf16-conversions.ll - regenerate checks
Reduce diffs in #152107
2025-08-22 12:20:50 +01:00
Lang Hames
3292edb7b4
[orc-rt] Add C and C++ APIs for WrapperFunctionResult. (#154927)
orc_rt_WrapperFunctionResult is a byte buffer with inline storage and a
built-in error state. It is intended as a general-purpose return type for
functions that return a serialized result (e.g. for communication across
ABIs or via IPC/RPC).

orc_rt_WrapperFunctionResult contains a small amount of inline storage,
allowing it to avoid heap allocation for small return types (e.g. bools,
chars, pointers).
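
As a purely illustrative sketch (not the orc-rt API; all names below are invented), the small-buffer idea described above looks roughly like this, omitting the error state for brevity:

```c++
#include <cstddef>
#include <cstdlib>
#include <cstring>

class SmallByteResult {
  static constexpr std::size_t InlineSize = sizeof(char *);
  union Storage {
    char Inline[InlineSize]; // small payloads live here (no heap allocation)
    char *OutOfBand;         // larger payloads are heap-allocated
  } Data{};
  std::size_t Size = 0;

public:
  SmallByteResult() = default;
  SmallByteResult(const SmallByteResult &) = delete; // keep the sketch simple
  SmallByteResult &operator=(const SmallByteResult &) = delete;
  SmallByteResult(SmallByteResult &&Other) noexcept
      : Data(Other.Data), Size(Other.Size) {
    Other.Size = 0; // moved-from object no longer owns the out-of-band buffer
  }

  // Copy N bytes, choosing inline or heap storage based on the payload size.
  static SmallByteResult fromRange(const char *Bytes, std::size_t N) {
    SmallByteResult R;
    R.Size = N;
    char *Dst = (N <= InlineSize)
                    ? R.Data.Inline
                    : (R.Data.OutOfBand = static_cast<char *>(std::malloc(N)));
    std::memcpy(Dst, Bytes, N);
    return R;
  }

  const char *data() const {
    return Size <= InlineSize ? Data.Inline : Data.OutOfBand;
  }
  std::size_t size() const { return Size; }

  ~SmallByteResult() {
    if (Size > InlineSize)
      std::free(Data.OutOfBand);
  }
};
```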
2025-08-22 21:18:30 +10:00
Mehdi Amini
d2b810e24f [MLIR] Apply clang-tidy fixes for readability-identifier-naming in DataFlowFramework.cpp (NFC) 2025-08-22 04:12:50 -07:00
Mehdi Amini
a8aacb1b66 [MLIR] Apply clang-tidy fixes for misc-use-internal-linkage in toy Tutorial (NFC) 2025-08-22 04:12:50 -07:00
Mehdi Amini
d2dee948a4 [MLIR] Improve clang-tidy script
This just helps to better keep track of the failures.
2025-08-22 04:12:50 -07:00
Jacek Caban
a6fcd1a663
[LLD][COFF] Set isUsedInRegularObj for target symbols in resolveAlternateNames (#154837)
Fixes: #154595

Prior to commit bbc8346e6bb543b0a87f52114fed7d766446bee1, this flag was
set by `insert()` from `addUndefined()`. Set it explicitly now.
2025-08-22 13:05:19 +02:00
Ramkumar Ramachandra
2975e674ec
[VPlan] Improve style in match_combine_or (NFC) (#154793) 2025-08-22 12:01:42 +01:00
Hans Wennborg
ee5367bedb Revert "[compiler-rt]: fix CodeQL format-string warnings via explicit casts (#153843)"
It broke the build:

compiler-rt/lib/hwasan/hwasan_thread.cpp:177:11: error: unknown type name 'ssize_t'; did you mean 'size_t'?
   177 |          (ssize_t)unique_id_, (void *)this, (void *)stack_bottom(),
       |           ^~~~~~~
       |           size_t

> This change addresses CodeQL format-string warnings across multiple
> sanitizer libraries by adding explicit casts to ensure that printf-style
> format specifiers match the actual argument types.
>
> Key updates:
> - Cast pointer arguments to (void*) when used with %p.
> - Use appropriate integer types and specifiers (e.g., size_t -> %zu,
> ssize_t -> %zd) to avoid mismatches.
> - Fix format specifier mismatches across xray, memprof, lsan, hwasan,
> dfsan.
>
> These changes are no-ops at runtime but improve type safety, silence
> static analysis warnings, and reduce the risk of UB in variadic calls.

This reverts commit d3d5751a39452327690b4e011a23de8327f02e86.
2025-08-22 12:50:53 +02:00
Lang Hames
d5af08a221
[orc-rt] Add inline specifier to orc_rt::make_error. (#154922)
Prevents linker errors for duplicate definitions when make_error is used
from more than one file.
2025-08-22 20:37:10 +10:00
nerix
d6fcaef281
[LLDB][Value] Require type size when reading a scalar (#153386)
When reading a value as a scalar, the type size is required. It is
returned as a `std::optional`, but this optional isn't checked for scalar
values; it is accessed unconditionally.

This came up in the
[Shell/Process/Windows/msstl_smoke.cpp](4e10b62442/lldb/test/Shell/Process/Windows/msstl_smoke.cpp)
test. There, LLDB breaks at the function entry, so none of the locals are
initialized yet. Most values will contain garbage. The [`std::list`
synthetic
provider](4e10b62442/lldb/source/Plugins/Language/CPlusPlus/GenericList.cpp (L517))
tries to read the value using `GetData`. However, in
[`ValueObject::GetData`](4e10b62442/lldb/source/ValueObject/ValueObject.cpp (L766)),
[`ValueObjectChild::UpdateValue`](88c993fbc5/lldb/source/ValueObject/ValueObjectChild.cpp (L102))
fails because the parent already failed to read its data, so `m_value`
won't have a compiler type, thus the size can't be read.
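
A minimal, hypothetical sketch of the pattern the fix follows (names are illustrative; the real code works with `Value`/`CompilerType` in LLDB): treat the byte size as an optional and fail gracefully instead of dereferencing it unconditionally.

```c++
#include <cstdint>
#include <optional>

// ByteSize stands in for what a type-size query (e.g. an optional byte size
// from the compiler type) would return.
bool readScalar(std::optional<uint64_t> ByteSize) {
  if (!ByteSize || *ByteSize == 0)
    return false; // no usable type size: report failure instead of crashing
  // ... read *ByteSize bytes and build the scalar ...
  return true;
}
```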
2025-08-22 12:26:03 +02:00
Ross Brunton
17dbb92612
[Offload][NFC] Use tablegen names rather than name parameter for API (#154736) 2025-08-22 11:13:57 +01:00
tangaac
8439777131
[LoongArch] Pre-commit tests for vecreduce_and/or/... (#154879) 2025-08-22 17:52:43 +08:00
YafetBeyene
fda24dbc16
[BOLT] Add dump-dot-func option for selective function CFG dumping (#153007)
## Change:
* Added a `--dump-dot-func` command-line option that allows users to dump
CFGs only for specific functions instead of dumping all functions (the
only currently available option being `--dump-dot-all`)

## Usage:
* Users can now specify function names or regex patterns (e.g.,
`--dump-dot-func=main,helper` or `--dump-dot-func="init.*"`) to generate
.dot files only for functions of interest
* Aims to save time when analysing specific functions in large binaries
(e.g., only dumping graphs for performance-critical functions identified
through profiling) and to reduce the output clutter of generating
thousands of unnecessary .dot files

## Testing
The introduced test `dump-dot-func.test` confirms the new option does
the following:

- [x] 1. `dump-dot-func` can correctly filter a specified function
- [x] 2. Can achieve the above with regexes
- [x] 3. Can do 1. with a list of functions
- [x] No option specified creates no dot files
- [x] Passing in a non-existent function generates no dumping messages
- [x] `dump-dot-all` continues to work as expected
2025-08-22 10:51:09 +01:00
Ivan Kosarev
7594b4b8d1 [AMDGPU] Fix compilation errors. 2025-08-22 10:30:43 +01:00
Abhinav Garg
bfc16510c7
[AMDGPU] Regenerate test case to cover gfx10 check lines. (#154909)
Check lines for GFX10 are missing in this test case. Regenerate to fix the
test case.
2025-08-22 15:00:28 +05:30
Nikolas Klauser
fd52f4d232
[libc++][NFC] Simplify the special member functions of the node containers (#154707)
This patch does two things:
- Remove exception specifications of `= default`ed special member
functions
- `= default` special member functions

The first part is NFC because the explicit specification does exactly
the same as the implicit specification. The second is NFC because it
does exactly what the `= default`ed special member does.
2025-08-22 11:24:28 +02:00
Florian Hahn
8bc038daf2
[InstComb] Allow more users for (add (ptrtoint %B), %O) to GEP transform. (#153566)
Generalize the logic from
https://github.com/llvm/llvm-project/pull/153421 to support additional
cases where the pointer is only used as an integer.

Alive2 Proof: https://alive2.llvm.org/ce/z/po58pP

This enables vectorizing std::find for some cases, if additional
assumptions are provided: https://godbolt.org/z/94oq3576E

Depends on https://github.com/llvm/llvm-project/pull/15342.

PR: https://github.com/llvm/llvm-project/pull/153566
2025-08-22 10:17:12 +01:00
Ivan Kosarev
faca8c9ed4
[AMDGPU][NFC] Only include CodeGenPassBuilder.h where needed. (#154769)
Saves around 125-210 MB of compilation memory usage per source for
roughly one third of our backend sources, ~60 MB on average.
2025-08-22 10:05:06 +01:00
Simon Pilgrim
1b4fe26343
[clang][x86] Add release note entries describing recent work on making SSE intrinsics generic and usable with constexpr (#154737)
I haven't created an exhaustive list of intrinsic changes, but I suppose I could if people see a strong need for it.
2025-08-22 09:59:10 +01:00
Baranov Victor
00a405f666
[clang-tidy][NFC] Fix "llvm-prefer-static-over-anonymous-namespace" warnings 1/N (#153885) 2025-08-22 11:54:17 +03:00
Hans Wennborg
8bf105cb01
[asan] Build the Windows runtime with /hotpatch (#154694)
Win/ASan relies on the runtime's functions being 16-byte aligned so it
can intercept them with hotpatching. This used to be true (but not
guaranteed) until #149444.

Passing /hotpatch will give us enough alignment and generally ensure
that the functions are hotpatchable.
2025-08-22 10:40:04 +02:00
Bjorn Pettersson
2d3167f8d8
[SeparateConstOffsetFromGEP] Avoid miscompiles related to trunc nuw/nsw (#154582)
Drop poison-generating flags on trunc when distributing trunc over
add/sub/or. We need to do this since, for example,
(add (trunc nuw A), (trunc nuw B)) is more poisonous than
(trunc nuw (add A, B)). For instance, with an i32 -> i16 truncation,
A = 0x10000 and B = 0xFFFF0001 give (add A, B) = 1, so
(trunc nuw (add A, B)) is well-defined, while (trunc nuw A) is poison
because the truncation drops a non-zero bit.

In some situations it is pessimistic to drop the flags, such as when
the add in the example above also has the nuw flag. For now we keep it
simple and always drop the flags.

Worth mentioning is that we drop the flags when cloning
instructions and rebuilding the chain. This is done after the
"allowsPreservingNUW" checks in ConstantOffsetExtractor::Extract.
So we still take the "trunc nuw" into consideration when determining
if nuw can be preserved in the gep (which should be ok since that
check also requires that all the involved binary operations have nuw).

Fixes #154116
2025-08-22 10:27:57 +02:00
Bjorn Pettersson
4ff7ac2330
[SeparateConstOffsetFromGEP] Add test case with trunc nuw/nsw showing miscompile
Pre-commit a test case for issue #154116. When redistributing
trunc over add/sub/or we may need to drop poison-generating flags
from the trunc.
2025-08-22 10:26:09 +02:00
Simon Pilgrim
8d7df8bba1
[X86] Allow AVX2 per-element shift intrinsics to be used in constexpr (#154780)
This handles constant folding for the AVX2 per-element shift intrinsics, which handle out-of-bounds shift amounts (logical result = 0, arithmetic result = sign-bit splat).

AVX512 intrinsics will follow in follow-up patches.

First stage of #154287
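
A minimal usage sketch of what this enables, assuming a clang build that already contains this change (plus the earlier constexpr vector initializers) and C++ compilation with `-mavx2`:

```c++
#include <immintrin.h>

constexpr __m256i Val = _mm256_setr_epi32(1, 2, 4, 8, 16, 32, 64, 128);
constexpr __m256i Amt = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 31, 33);
constexpr __m256i Res = _mm256_sllv_epi32(Val, Amt); // folded at compile time

static_assert(((__v8si)Res)[1] == 4, "2 << 1 == 4");
static_assert(((__v8si)Res)[7] == 0,
              "out-of-bounds count (33) folds to 0 for logical shifts");
```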
2025-08-22 09:24:24 +01:00
166 changed files with 10126 additions and 731 deletions

View File

@ -138,6 +138,12 @@
Dump function CFGs to graphviz format after each stage;enable '-print-loops'
for color-coded blocks
- `--dump-dot-func=<func1,func2,func3...>`
Dump function CFGs to graphviz format for specified functions only;
takes function name patterns (regex supported). Note: C++ function names
must be passed using their mangled names
- `--dump-linux-exceptions`
Dump Linux kernel exception table

View File

@ -15,6 +15,12 @@
#include "llvm/Support/CommandLine.h"
namespace llvm {
namespace bolt {
class BinaryFunction;
}
} // namespace llvm
namespace opts {
enum HeatmapModeKind {
@ -100,6 +106,9 @@ extern llvm::cl::opt<unsigned> Verbosity;
/// Return true if we should process all functions in the binary.
bool processAllFunctions();
/// Return true if we should dump dot graphs for the given function.
bool shouldDumpDot(const llvm::bolt::BinaryFunction &Function);
enum GadgetScannerKind { GS_PACRET, GS_PAUTH, GS_ALL };
extern llvm::cl::bits<GadgetScannerKind> GadgetScannersToRun;

View File

@ -52,6 +52,7 @@ namespace opts {
extern cl::opt<bool> PrintAll;
extern cl::opt<bool> PrintDynoStats;
extern cl::opt<bool> DumpDotAll;
extern bool shouldDumpDot(const bolt::BinaryFunction &Function);
extern cl::opt<std::string> AsmDump;
extern cl::opt<bolt::PLTCall::OptType> PLT;
extern cl::opt<bolt::IdenticalCodeFolding::ICFLevel, false,
@ -340,7 +341,7 @@ Error BinaryFunctionPassManager::runPasses() {
Function.print(BC.outs(), Message);
if (opts::DumpDotAll)
if (opts::shouldDumpDot(Function))
Function.dumpGraphForPass(PassIdName);
}
}

View File

@ -115,6 +115,35 @@ cl::opt<bool> DumpDotAll(
"enable '-print-loops' for color-coded blocks"),
cl::Hidden, cl::cat(BoltCategory));
cl::list<std::string> DumpDotFunc(
"dump-dot-func", cl::CommaSeparated,
cl::desc(
"dump function CFGs to graphviz format for specified functions only;"
"takes function name patterns (regex supported)"),
cl::value_desc("func1,func2,func3,..."), cl::Hidden, cl::cat(BoltCategory));
bool shouldDumpDot(const bolt::BinaryFunction &Function) {
// If dump-dot-all is enabled, dump all functions
if (DumpDotAll)
return !Function.isIgnored();
// If no specific functions specified in dump-dot-func, don't dump any
if (DumpDotFunc.empty())
return false;
if (Function.isIgnored())
return false;
// Check if function matches any of the specified patterns
for (const std::string &Name : DumpDotFunc) {
if (Function.hasNameRegex(Name)) {
return true;
}
}
return false;
}
static cl::list<std::string>
ForceFunctionNames("funcs",
cl::CommaSeparated,
@ -3569,7 +3598,7 @@ void RewriteInstance::postProcessFunctions() {
if (opts::PrintAll || opts::PrintCFG)
Function.print(BC->outs(), "after building cfg");
if (opts::DumpDotAll)
if (opts::shouldDumpDot(Function))
Function.dumpGraphForPass("00_build-cfg");
if (opts::PrintLoopInfo) {

View File

@ -0,0 +1,24 @@
#include <iostream>
// Multiple functions to test selective dumping
int add(int a, int b) { return a + b; }
int multiply(int a, int b) { return a * b; }
int main_helper() {
std::cout << "Helper function" << std::endl;
return 42;
}
int main_secondary() { return add(5, 3); }
void other_function() { std::cout << "Other function" << std::endl; }
int main() {
int result = add(10, 20);
result = multiply(result, 2);
main_helper();
main_secondary();
other_function();
return result;
}

View File

@ -0,0 +1,52 @@
# Test the --dump-dot-func option with multiple functions
# (includes tests for both mangled/unmangled names)
RUN: %clang++ %p/Inputs/multi-func.cpp -o %t.exe -Wl,-q
# Test 1: --dump-dot-func with specific function name (mangled)
RUN: llvm-bolt %t.exe -o %t.bolt1 --dump-dot-func=_Z3addii -v=1 2>&1 | FileCheck %s --check-prefix=ADD
# Test 2: --dump-dot-func with regex pattern (main.*)
RUN: llvm-bolt %t.exe -o %t.bolt2 --dump-dot-func="main.*" -v=1 2>&1 | FileCheck %s --check-prefix=MAIN-REGEX
# Test 3: --dump-dot-func with multiple specific functions (mangled names)
RUN: llvm-bolt %t.exe -o %t.bolt3 --dump-dot-func=_Z3addii,_Z8multiplyii -v=1 2>&1 | FileCheck %s --check-prefix=MULTI
# Test 4: No option specified should create no dot files
RUN: llvm-bolt %t.exe -o %t.bolt4 2>&1 | FileCheck %s --check-prefix=NONE
# Test 5: --dump-dot-func with non-existent function
RUN: llvm-bolt %t.exe -o %t.bolt5 --dump-dot-func=nonexistent -v=1 2>&1 | FileCheck %s --check-prefix=NONEXISTENT
# Test 6: Backward compatibility - --dump-dot-all should still work
RUN: llvm-bolt %t.exe -o %t.bolt6 --dump-dot-all -v=1 2>&1 | FileCheck %s --check-prefix=ALL
# Test 7: Test with unmangled function name (main function)
RUN: llvm-bolt %t.exe -o %t.bolt7 --dump-dot-func=main -v=1 2>&1 | FileCheck %s --check-prefix=MAIN-UNMANGLED
# Check that specific functions are dumped
ADD: BOLT-INFO: dumping CFG to _Z3addii-00_build-cfg.dot
ADD-NOT: BOLT-INFO: dumping CFG to main-00_build-cfg.dot
ADD-NOT: BOLT-INFO: dumping CFG to _Z8multiplyii-00_build-cfg.dot
ADD-NOT: BOLT-INFO: dumping CFG to _Z11main_helperv-00_build-cfg.dot
MAIN-REGEX-DAG: BOLT-INFO: dumping CFG to main-00_build-cfg.dot
MAIN-REGEX-NOT: BOLT-INFO: dumping CFG to _Z3addii-00_build-cfg.dot
MAIN-REGEX-NOT: BOLT-INFO: dumping CFG to _Z8multiplyii-00_build-cfg.dot
MULTI-DAG: BOLT-INFO: dumping CFG to _Z3addii-00_build-cfg.dot
MULTI-DAG: BOLT-INFO: dumping CFG to _Z8multiplyii-00_build-cfg.dot
MULTI-NOT: BOLT-INFO: dumping CFG to main-00_build-cfg.dot
MULTI-NOT: BOLT-INFO: dumping CFG to _Z11main_helperv-00_build-cfg.dot
# Should be no dumping messages when no option is specified
NONE-NOT: BOLT-INFO: dumping CFG
# Should be no dumping messages for non-existent function
NONEXISTENT-NOT: BOLT-INFO: dumping CFG
ALL: BOLT-INFO: dumping CFG to main-00_build-cfg.dot
MAIN-UNMANGLED: BOLT-INFO: dumping CFG to main-00_build-cfg.dot
MAIN-UNMANGLED-NOT: BOLT-INFO: dumping CFG to _Z3addii-00_build-cfg.dot
MAIN-UNMANGLED-NOT: BOLT-INFO: dumping CFG to _Z8multiplyii-00_build-cfg.dot

View File

@ -15,14 +15,12 @@ using namespace clang::ast_matchers;
namespace clang::tidy::bugprone {
namespace {
// Determine if the result of an expression is "stored" in some way.
// It is true if the value is stored into a variable or used as initialization
// or passed to a function or constructor.
// For this use case compound assignments are not counted as a "store" (the 'E'
// expression should have pointer type).
bool isExprValueStored(const Expr *E, ASTContext &C) {
static bool isExprValueStored(const Expr *E, ASTContext &C) {
E = E->IgnoreParenCasts();
// Get first non-paren, non-cast parent.
ParentMapContext &PMap = C.getParentMapContext();
@ -49,6 +47,8 @@ bool isExprValueStored(const Expr *E, ASTContext &C) {
return isa<CallExpr, CXXConstructExpr>(ParentE);
}
namespace {
AST_MATCHER_P(CXXTryStmt, hasHandlerFor,
ast_matchers::internal::Matcher<QualType>, InnerMatcher) {
for (unsigned NH = Node.getNumHandlers(), I = 0; I < NH; ++I) {

View File

@ -14,10 +14,8 @@ using namespace clang::ast_matchers;
namespace clang::tidy::bugprone {
namespace {
bool isConcatenatedLiteralsOnPurpose(ASTContext *Ctx,
const StringLiteral *Lit) {
static bool isConcatenatedLiteralsOnPurpose(ASTContext *Ctx,
const StringLiteral *Lit) {
// String literals surrounded by parentheses are assumed to be on purpose.
// i.e.: const char* Array[] = { ("a" "b" "c"), "d", [...] };
@ -58,6 +56,8 @@ bool isConcatenatedLiteralsOnPurpose(ASTContext *Ctx,
return false;
}
namespace {
AST_MATCHER_P(StringLiteral, isConcatenatedLiteral, unsigned,
MaxConcatenatedTokens) {
return Node.getNumConcatenated() > 1 &&

View File

@ -46,7 +46,9 @@ enum class ConversionKind {
ToLongDouble
};
ConversionKind classifyConversionFunc(const FunctionDecl *FD) {
} // namespace
static ConversionKind classifyConversionFunc(const FunctionDecl *FD) {
return llvm::StringSwitch<ConversionKind>(FD->getName())
.Cases("atoi", "atol", ConversionKind::ToInt)
.Case("atoll", ConversionKind::ToLongInt)
@ -54,8 +56,8 @@ ConversionKind classifyConversionFunc(const FunctionDecl *FD) {
.Default(ConversionKind::None);
}
ConversionKind classifyFormatString(StringRef Fmt, const LangOptions &LO,
const TargetInfo &TI) {
static ConversionKind classifyFormatString(StringRef Fmt, const LangOptions &LO,
const TargetInfo &TI) {
// Scan the format string for the first problematic format specifier, then
// report that as the conversion type. This will miss additional conversion
// specifiers, but that is acceptable behavior.
@ -128,7 +130,7 @@ ConversionKind classifyFormatString(StringRef Fmt, const LangOptions &LO,
return H.get();
}
StringRef classifyConversionType(ConversionKind K) {
static StringRef classifyConversionType(ConversionKind K) {
switch (K) {
case ConversionKind::None:
llvm_unreachable("Unexpected conversion kind");
@ -148,7 +150,7 @@ StringRef classifyConversionType(ConversionKind K) {
llvm_unreachable("Unknown conversion kind");
}
StringRef classifyReplacement(ConversionKind K) {
static StringRef classifyReplacement(ConversionKind K) {
switch (K) {
case ConversionKind::None:
llvm_unreachable("Unexpected conversion kind");
@ -173,7 +175,6 @@ StringRef classifyReplacement(ConversionKind K) {
}
llvm_unreachable("Unknown conversion kind");
}
} // unnamed namespace
void StrToNumCheck::check(const MatchFinder::MatchResult &Result) {
const auto *Call = Result.Nodes.getNodeAs<CallExpr>("expr");

View File

@ -59,7 +59,9 @@ AST_MATCHER(FunctionDecl, isPlacementOverload) {
return true;
}
OverloadedOperatorKind getCorrespondingOverload(const FunctionDecl *FD) {
} // namespace
static OverloadedOperatorKind getCorrespondingOverload(const FunctionDecl *FD) {
switch (FD->getOverloadedOperator()) {
default:
break;
@ -75,7 +77,7 @@ OverloadedOperatorKind getCorrespondingOverload(const FunctionDecl *FD) {
llvm_unreachable("Not an overloaded allocation operator");
}
const char *getOperatorName(OverloadedOperatorKind K) {
static const char *getOperatorName(OverloadedOperatorKind K) {
switch (K) {
default:
break;
@ -91,13 +93,14 @@ const char *getOperatorName(OverloadedOperatorKind K) {
llvm_unreachable("Not an overloaded allocation operator");
}
bool areCorrespondingOverloads(const FunctionDecl *LHS,
const FunctionDecl *RHS) {
static bool areCorrespondingOverloads(const FunctionDecl *LHS,
const FunctionDecl *RHS) {
return RHS->getOverloadedOperator() == getCorrespondingOverload(LHS);
}
bool hasCorrespondingOverloadInBaseClass(const CXXMethodDecl *MD,
const CXXRecordDecl *RD = nullptr) {
static bool
hasCorrespondingOverloadInBaseClass(const CXXMethodDecl *MD,
const CXXRecordDecl *RD = nullptr) {
if (RD) {
// Check the methods in the given class and accessible to derived classes.
for (const auto *BMD : RD->methods())
@ -124,8 +127,6 @@ bool hasCorrespondingOverloadInBaseClass(const CXXMethodDecl *MD,
return false;
}
} // anonymous namespace
void NewDeleteOverloadsCheck::registerMatchers(MatchFinder *Finder) {
// Match all operator new and operator delete overloads (including the array
// forms). Do not match implicit operators, placement operators, or

View File

@ -395,16 +395,12 @@ void MacroToEnumCallbacks::Endif(SourceLocation Loc, SourceLocation IfLoc) {
--CurrentFile->ConditionScopes;
}
namespace {
template <size_t N>
bool textEquals(const char (&Needle)[N], const char *HayStack) {
static bool textEquals(const char (&Needle)[N], const char *HayStack) {
return StringRef{HayStack, N - 1} == Needle;
}
template <size_t N> size_t len(const char (&)[N]) { return N - 1; }
} // namespace
template <size_t N> static size_t len(const char (&)[N]) { return N - 1; }
void MacroToEnumCallbacks::PragmaDirective(SourceLocation Loc,
PragmaIntroducerKind Introducer) {

View File

@ -16,14 +16,13 @@ using namespace clang::ast_matchers;
namespace clang::tidy::modernize {
namespace {
static constexpr char ConstructorCall[] = "constructorCall";
static constexpr char ResetCall[] = "resetCall";
static constexpr char NewExpression[] = "newExpression";
constexpr char ConstructorCall[] = "constructorCall";
constexpr char ResetCall[] = "resetCall";
constexpr char NewExpression[] = "newExpression";
std::string getNewExprName(const CXXNewExpr *NewExpr, const SourceManager &SM,
const LangOptions &Lang) {
static std::string getNewExprName(const CXXNewExpr *NewExpr,
const SourceManager &SM,
const LangOptions &Lang) {
StringRef WrittenName = Lexer::getSourceText(
CharSourceRange::getTokenRange(
NewExpr->getAllocatedTypeSourceInfo()->getTypeLoc().getSourceRange()),
@ -34,8 +33,6 @@ std::string getNewExprName(const CXXNewExpr *NewExpr, const SourceManager &SM,
return WrittenName.str();
}
} // namespace
const char MakeSmartPtrCheck::PointerType[] = "pointerType";
MakeSmartPtrCheck::MakeSmartPtrCheck(StringRef Name, ClangTidyContext *Context,

View File

@ -19,9 +19,7 @@ using namespace clang::ast_matchers;
namespace clang::tidy::modernize {
namespace {
bool containsEscapes(StringRef HayStack, StringRef Escapes) {
static bool containsEscapes(StringRef HayStack, StringRef Escapes) {
size_t BackSlash = HayStack.find('\\');
if (BackSlash == StringRef::npos)
return false;
@ -35,16 +33,16 @@ bool containsEscapes(StringRef HayStack, StringRef Escapes) {
return true;
}
bool isRawStringLiteral(StringRef Text) {
static bool isRawStringLiteral(StringRef Text) {
// Already a raw string literal if R comes before ".
const size_t QuotePos = Text.find('"');
assert(QuotePos != StringRef::npos);
return (QuotePos > 0) && (Text[QuotePos - 1] == 'R');
}
bool containsEscapedCharacters(const MatchFinder::MatchResult &Result,
const StringLiteral *Literal,
const CharsBitSet &DisallowedChars) {
static bool containsEscapedCharacters(const MatchFinder::MatchResult &Result,
const StringLiteral *Literal,
const CharsBitSet &DisallowedChars) {
// FIXME: Handle L"", u8"", u"" and U"" literals.
if (!Literal->isOrdinary())
return false;
@ -64,14 +62,12 @@ bool containsEscapedCharacters(const MatchFinder::MatchResult &Result,
return containsEscapes(Text, R"('\"?x01)");
}
bool containsDelimiter(StringRef Bytes, const std::string &Delimiter) {
static bool containsDelimiter(StringRef Bytes, const std::string &Delimiter) {
return Bytes.find(Delimiter.empty()
? std::string(R"lit()")lit")
: (")" + Delimiter + R"(")")) != StringRef::npos;
}
} // namespace
RawStringLiteralCheck::RawStringLiteralCheck(StringRef Name,
ClangTidyContext *Context)
: ClangTidyCheck(Name, Context),

View File

@ -29,12 +29,13 @@
using namespace clang::ast_matchers;
namespace clang::tidy::objc {
namespace {
static constexpr StringRef WeakText = "__weak";
static constexpr StringRef StrongText = "__strong";
static constexpr StringRef UnsafeUnretainedText = "__unsafe_unretained";
namespace {
/// Matches ObjCIvarRefExpr, DeclRefExpr, or MemberExpr that reference
/// Objective-C object (or block) variables or fields whose object lifetimes
/// are not __unsafe_unretained.
@ -49,6 +50,8 @@ AST_POLYMORPHIC_MATCHER(isObjCManagedLifetime,
QT.getQualifiers().getObjCLifetime() > Qualifiers::OCL_ExplicitNone;
}
} // namespace
static std::optional<FixItHint>
fixItHintReplacementForOwnershipString(StringRef Text, CharSourceRange Range,
StringRef Ownership) {
@ -93,8 +96,6 @@ fixItHintForVarDecl(const VarDecl *VD, const SourceManager &SM,
return FixItHint::CreateInsertion(Range.getBegin(), "__unsafe_unretained ");
}
} // namespace
void NSInvocationArgumentLifetimeCheck::registerMatchers(MatchFinder *Finder) {
Finder->addMatcher(
traverse(

View File

@ -27,11 +27,14 @@ enum NamingStyle {
CategoryProperty = 2,
};
} // namespace
/// For now we will only fix 'CamelCase' or 'abc_CamelCase' property to
/// 'camelCase' or 'abc_camelCase'. For other cases the users need to
/// come up with a proper name by their own.
/// FIXME: provide fix for snake_case to snakeCase
FixItHint generateFixItHint(const ObjCPropertyDecl *Decl, NamingStyle Style) {
static FixItHint generateFixItHint(const ObjCPropertyDecl *Decl,
NamingStyle Style) {
auto Name = Decl->getName();
auto NewName = Decl->getName().str();
size_t Index = 0;
@ -50,7 +53,7 @@ FixItHint generateFixItHint(const ObjCPropertyDecl *Decl, NamingStyle Style) {
return {};
}
std::string validPropertyNameRegex(bool UsedInMatcher) {
static std::string validPropertyNameRegex(bool UsedInMatcher) {
// Allow any of these names:
// foo
// fooBar
@ -72,13 +75,13 @@ std::string validPropertyNameRegex(bool UsedInMatcher) {
return StartMatcher + "([a-z]|[A-Z][A-Z0-9])[a-z0-9A-Z]*$";
}
bool hasCategoryPropertyPrefix(llvm::StringRef PropertyName) {
static bool hasCategoryPropertyPrefix(llvm::StringRef PropertyName) {
auto RegexExp =
llvm::Regex("^[a-zA-Z][a-zA-Z0-9]*_[a-zA-Z0-9][a-zA-Z0-9_]+$");
return RegexExp.match(PropertyName);
}
bool prefixedPropertyNameValid(llvm::StringRef PropertyName) {
static bool prefixedPropertyNameValid(llvm::StringRef PropertyName) {
size_t Start = PropertyName.find_first_of('_');
assert(Start != llvm::StringRef::npos && Start + 1 < PropertyName.size());
auto Prefix = PropertyName.substr(0, Start);
@ -88,7 +91,6 @@ bool prefixedPropertyNameValid(llvm::StringRef PropertyName) {
auto RegexExp = llvm::Regex(llvm::StringRef(validPropertyNameRegex(false)));
return RegexExp.match(PropertyName.substr(Start + 1));
}
} // namespace
void PropertyDeclarationCheck::registerMatchers(MatchFinder *Finder) {
Finder->addMatcher(objcPropertyDecl(

View File

@ -17,7 +17,6 @@
#include <optional>
namespace clang::tidy::performance {
namespace {
using namespace ::clang::ast_matchers;
using llvm::StringRef;
@ -30,8 +29,8 @@ static constexpr StringRef MethodDeclId = "methodDecl";
static constexpr StringRef FunctionDeclId = "functionDecl";
static constexpr StringRef OldVarDeclId = "oldVarDecl";
void recordFixes(const VarDecl &Var, ASTContext &Context,
DiagnosticBuilder &Diagnostic) {
static void recordFixes(const VarDecl &Var, ASTContext &Context,
DiagnosticBuilder &Diagnostic) {
Diagnostic << utils::fixit::changeVarDeclToReference(Var, Context);
if (!Var.getType().isLocalConstQualified()) {
if (std::optional<FixItHint> Fix = utils::fixit::addQualifierToVarDecl(
@ -40,8 +39,8 @@ void recordFixes(const VarDecl &Var, ASTContext &Context,
}
}
std::optional<SourceLocation> firstLocAfterNewLine(SourceLocation Loc,
SourceManager &SM) {
static std::optional<SourceLocation> firstLocAfterNewLine(SourceLocation Loc,
SourceManager &SM) {
bool Invalid = false;
const char *TextAfter = SM.getCharacterData(Loc, &Invalid);
if (Invalid) {
@ -51,8 +50,8 @@ std::optional<SourceLocation> firstLocAfterNewLine(SourceLocation Loc,
return Loc.getLocWithOffset(TextAfter[Offset] == '\0' ? Offset : Offset + 1);
}
void recordRemoval(const DeclStmt &Stmt, ASTContext &Context,
DiagnosticBuilder &Diagnostic) {
static void recordRemoval(const DeclStmt &Stmt, ASTContext &Context,
DiagnosticBuilder &Diagnostic) {
auto &SM = Context.getSourceManager();
// Attempt to remove trailing comments as well.
auto Tok = utils::lexer::findNextTokenSkippingComments(Stmt.getEndLoc(), SM,
@ -74,6 +73,8 @@ void recordRemoval(const DeclStmt &Stmt, ASTContext &Context,
}
}
namespace {
AST_MATCHER_FUNCTION_P(StatementMatcher,
isRefReturningMethodCallWithConstOverloads,
std::vector<StringRef>, ExcludedContainerTypes) {
@ -130,6 +131,8 @@ AST_MATCHER_FUNCTION_P(StatementMatcher, initializerReturnsReferenceToConst,
hasUnaryOperand(OldVarDeclRef)))));
}
} // namespace
// This checks that the variable itself is only used as const, and also makes
// sure that it does not reference another variable that could be modified in
// the BlockStmt. It does this by checking the following:
@ -180,13 +183,13 @@ static bool isInitializingVariableImmutable(
return false;
}
bool isVariableUnused(const VarDecl &Var, const Stmt &BlockStmt,
ASTContext &Context) {
static bool isVariableUnused(const VarDecl &Var, const Stmt &BlockStmt,
ASTContext &Context) {
return allDeclRefExprs(Var, BlockStmt, Context).empty();
}
const SubstTemplateTypeParmType *getSubstitutedType(const QualType &Type,
ASTContext &Context) {
static const SubstTemplateTypeParmType *
getSubstitutedType(const QualType &Type, ASTContext &Context) {
auto Matches = match(
qualType(anyOf(substTemplateTypeParmType().bind("subst"),
hasDescendant(substTemplateTypeParmType().bind("subst")))),
@ -194,9 +197,9 @@ const SubstTemplateTypeParmType *getSubstitutedType(const QualType &Type,
return selectFirst<SubstTemplateTypeParmType>("subst", Matches);
}
bool differentReplacedTemplateParams(const QualType &VarType,
const QualType &InitializerType,
ASTContext &Context) {
static bool differentReplacedTemplateParams(const QualType &VarType,
const QualType &InitializerType,
ASTContext &Context) {
if (const SubstTemplateTypeParmType *VarTmplType =
getSubstitutedType(VarType, Context)) {
if (const SubstTemplateTypeParmType *InitializerTmplType =
@ -212,8 +215,8 @@ bool differentReplacedTemplateParams(const QualType &VarType,
return false;
}
QualType constructorArgumentType(const VarDecl *OldVar,
const BoundNodes &Nodes) {
static QualType constructorArgumentType(const VarDecl *OldVar,
const BoundNodes &Nodes) {
if (OldVar) {
return OldVar->getType();
}
@ -224,8 +227,6 @@ QualType constructorArgumentType(const VarDecl *OldVar,
return MethodDecl->getReturnType();
}
} // namespace
UnnecessaryCopyInitialization::UnnecessaryCopyInitialization(
StringRef Name, ClangTidyContext *Context)
: ClangTidyCheck(Name, Context),

View File

@ -21,16 +21,14 @@ using namespace clang::ast_matchers;
namespace clang::tidy::performance {
namespace {
std::string paramNameOrIndex(StringRef Name, size_t Index) {
static std::string paramNameOrIndex(StringRef Name, size_t Index) {
return (Name.empty() ? llvm::Twine('#') + llvm::Twine(Index + 1)
: llvm::Twine('\'') + Name + llvm::Twine('\''))
.str();
}
bool hasLoopStmtAncestor(const DeclRefExpr &DeclRef, const Decl &Decl,
ASTContext &Context) {
static bool hasLoopStmtAncestor(const DeclRefExpr &DeclRef, const Decl &Decl,
ASTContext &Context) {
auto Matches = match(
traverse(TK_AsIs,
decl(forEachDescendant(declRefExpr(
@ -41,8 +39,6 @@ bool hasLoopStmtAncestor(const DeclRefExpr &DeclRef, const Decl &Decl,
return Matches.empty();
}
} // namespace
UnnecessaryValueParamCheck::UnnecessaryValueParamCheck(
StringRef Name, ClangTidyContext *Context)
: ClangTidyCheck(Name, Context),

View File

@ -122,15 +122,15 @@ AST_MATCHER(EnumDecl, hasSequentialInitialValues) {
return !AllEnumeratorsArePowersOfTwo;
}
std::string getName(const EnumDecl *Decl) {
} // namespace
static std::string getName(const EnumDecl *Decl) {
if (!Decl->getDeclName())
return "<unnamed>";
return Decl->getQualifiedNameAsString();
}
} // namespace
EnumInitialValueCheck::EnumInitialValueCheck(StringRef Name,
ClangTidyContext *Context)
: ClangTidyCheck(Name, Context),

View File

@ -144,6 +144,8 @@ struct CognitiveComplexity final {
void account(SourceLocation Loc, unsigned short Nesting, Criteria C);
};
} // namespace
// All the possible messages that can be output. The choice of the message
// to use is based of the combination of the CognitiveComplexity::Criteria.
// It would be nice to have it in CognitiveComplexity struct, but then it is
@ -163,23 +165,27 @@ static const std::array<const StringRef, 4> Msgs = {{
}};
// Criteria is a bitset, thus a few helpers are needed.
CognitiveComplexity::Criteria operator|(CognitiveComplexity::Criteria LHS,
CognitiveComplexity::Criteria RHS) {
static CognitiveComplexity::Criteria
operator|(CognitiveComplexity::Criteria LHS,
CognitiveComplexity::Criteria RHS) {
return static_cast<CognitiveComplexity::Criteria>(llvm::to_underlying(LHS) |
llvm::to_underlying(RHS));
}
CognitiveComplexity::Criteria operator&(CognitiveComplexity::Criteria LHS,
CognitiveComplexity::Criteria RHS) {
static CognitiveComplexity::Criteria
operator&(CognitiveComplexity::Criteria LHS,
CognitiveComplexity::Criteria RHS) {
return static_cast<CognitiveComplexity::Criteria>(llvm::to_underlying(LHS) &
llvm::to_underlying(RHS));
}
CognitiveComplexity::Criteria &operator|=(CognitiveComplexity::Criteria &LHS,
CognitiveComplexity::Criteria RHS) {
static CognitiveComplexity::Criteria &
operator|=(CognitiveComplexity::Criteria &LHS,
CognitiveComplexity::Criteria RHS) {
LHS = operator|(LHS, RHS);
return LHS;
}
CognitiveComplexity::Criteria &operator&=(CognitiveComplexity::Criteria &LHS,
CognitiveComplexity::Criteria RHS) {
static CognitiveComplexity::Criteria &
operator&=(CognitiveComplexity::Criteria &LHS,
CognitiveComplexity::Criteria RHS) {
LHS = operator&(LHS, RHS);
return LHS;
}
@ -199,6 +205,8 @@ void CognitiveComplexity::account(SourceLocation Loc, unsigned short Nesting,
Total += Increase;
}
namespace {
class FunctionASTVisitor final
: public RecursiveASTVisitor<FunctionASTVisitor> {
using Base = RecursiveASTVisitor<FunctionASTVisitor>;

View File

@ -41,9 +41,11 @@ AST_MATCHER(Stmt, isNULLMacroExpansion) {
return isNULLMacroExpansion(&Node, Finder->getASTContext());
}
StringRef getZeroLiteralToCompareWithForType(CastKind CastExprKind,
QualType Type,
ASTContext &Context) {
} // namespace
static StringRef getZeroLiteralToCompareWithForType(CastKind CastExprKind,
QualType Type,
ASTContext &Context) {
switch (CastExprKind) {
case CK_IntegralToBoolean:
return Type->isUnsignedIntegerType() ? "0u" : "0";
@ -62,15 +64,15 @@ StringRef getZeroLiteralToCompareWithForType(CastKind CastExprKind,
}
}
bool isUnaryLogicalNotOperator(const Stmt *Statement) {
static bool isUnaryLogicalNotOperator(const Stmt *Statement) {
const auto *UnaryOperatorExpr = dyn_cast<UnaryOperator>(Statement);
return UnaryOperatorExpr && UnaryOperatorExpr->getOpcode() == UO_LNot;
}
void fixGenericExprCastToBool(DiagnosticBuilder &Diag,
const ImplicitCastExpr *Cast, const Stmt *Parent,
ASTContext &Context,
bool UseUpperCaseLiteralSuffix) {
static void fixGenericExprCastToBool(DiagnosticBuilder &Diag,
const ImplicitCastExpr *Cast,
const Stmt *Parent, ASTContext &Context,
bool UseUpperCaseLiteralSuffix) {
// In case of expressions like (! integer), we should remove the redundant not
// operator and use inverted comparison (integer == 0).
bool InvertComparison =
@ -133,8 +135,8 @@ void fixGenericExprCastToBool(DiagnosticBuilder &Diag,
Diag << FixItHint::CreateInsertion(EndLoc, EndLocInsertion);
}
StringRef getEquivalentBoolLiteralForExpr(const Expr *Expression,
ASTContext &Context) {
static StringRef getEquivalentBoolLiteralForExpr(const Expr *Expression,
ASTContext &Context) {
if (isNULLMacroExpansion(Expression, Context)) {
return "false";
}
@ -161,7 +163,7 @@ StringRef getEquivalentBoolLiteralForExpr(const Expr *Expression,
return {};
}
bool needsSpacePrefix(SourceLocation Loc, ASTContext &Context) {
static bool needsSpacePrefix(SourceLocation Loc, ASTContext &Context) {
SourceRange PrefixRange(Loc.getLocWithOffset(-1), Loc);
StringRef SpaceBeforeStmtStr = Lexer::getSourceText(
CharSourceRange::getCharRange(PrefixRange), Context.getSourceManager(),
@ -173,9 +175,10 @@ bool needsSpacePrefix(SourceLocation Loc, ASTContext &Context) {
return !AllowedCharacters.contains(SpaceBeforeStmtStr.back());
}
void fixGenericExprCastFromBool(DiagnosticBuilder &Diag,
const ImplicitCastExpr *Cast,
ASTContext &Context, StringRef OtherType) {
static void fixGenericExprCastFromBool(DiagnosticBuilder &Diag,
const ImplicitCastExpr *Cast,
ASTContext &Context,
StringRef OtherType) {
if (!Context.getLangOpts().CPlusPlus) {
Diag << FixItHint::CreateInsertion(Cast->getBeginLoc(),
(Twine("(") + OtherType + ")").str());
@ -200,8 +203,9 @@ void fixGenericExprCastFromBool(DiagnosticBuilder &Diag,
}
}
StringRef getEquivalentForBoolLiteral(const CXXBoolLiteralExpr *BoolLiteral,
QualType DestType, ASTContext &Context) {
static StringRef
getEquivalentForBoolLiteral(const CXXBoolLiteralExpr *BoolLiteral,
QualType DestType, ASTContext &Context) {
// Prior to C++11, false literal could be implicitly converted to pointer.
if (!Context.getLangOpts().CPlusPlus11 &&
(DestType->isPointerType() || DestType->isMemberPointerType()) &&
@ -222,8 +226,8 @@ StringRef getEquivalentForBoolLiteral(const CXXBoolLiteralExpr *BoolLiteral,
return BoolLiteral->getValue() ? "1" : "0";
}
bool isCastAllowedInCondition(const ImplicitCastExpr *Cast,
ASTContext &Context) {
static bool isCastAllowedInCondition(const ImplicitCastExpr *Cast,
ASTContext &Context) {
std::queue<const Stmt *> Q;
Q.push(Cast);
@ -251,8 +255,6 @@ bool isCastAllowedInCondition(const ImplicitCastExpr *Cast,
return false;
}
} // anonymous namespace
ImplicitBoolConversionCheck::ImplicitBoolConversionCheck(
StringRef Name, ClangTidyContext *Context)
: ClangTidyCheck(Name, Context),

View File

@ -28,8 +28,11 @@ AST_MATCHER_P(QualType, hasUnqualifiedType,
enum class Qualifier { Const, Volatile, Restrict };
std::optional<Token> findQualToken(const VarDecl *Decl, Qualifier Qual,
const MatchFinder::MatchResult &Result) {
} // namespace
static std::optional<Token>
findQualToken(const VarDecl *Decl, Qualifier Qual,
const MatchFinder::MatchResult &Result) {
// Since either of the locs can be in a macro, use `makeFileCharRange` to be
// sure that we have a consistent `CharSourceRange`, located entirely in the
// source file.
@ -58,7 +61,7 @@ std::optional<Token> findQualToken(const VarDecl *Decl, Qualifier Qual,
*Result.SourceManager);
}
std::optional<SourceRange>
static std::optional<SourceRange>
getTypeSpecifierLocation(const VarDecl *Var,
const MatchFinder::MatchResult &Result) {
SourceRange TypeSpecifier(
@ -73,8 +76,8 @@ getTypeSpecifierLocation(const VarDecl *Var,
return TypeSpecifier;
}
std::optional<SourceRange> mergeReplacementRange(SourceRange &TypeSpecifier,
const Token &ConstToken) {
static std::optional<SourceRange>
mergeReplacementRange(SourceRange &TypeSpecifier, const Token &ConstToken) {
if (TypeSpecifier.getBegin().getLocWithOffset(-1) == ConstToken.getEndLoc()) {
TypeSpecifier.setBegin(ConstToken.getLocation());
return std::nullopt;
@ -86,21 +89,19 @@ std::optional<SourceRange> mergeReplacementRange(SourceRange &TypeSpecifier,
return SourceRange(ConstToken.getLocation(), ConstToken.getEndLoc());
}
bool isPointerConst(QualType QType) {
static bool isPointerConst(QualType QType) {
QualType Pointee = QType->getPointeeType();
assert(!Pointee.isNull() && "can't have a null Pointee");
return Pointee.isConstQualified();
}
bool isAutoPointerConst(QualType QType) {
static bool isAutoPointerConst(QualType QType) {
QualType Pointee =
cast<AutoType>(QType->getPointeeType().getTypePtr())->desugar();
assert(!Pointee.isNull() && "can't have a null Pointee");
return Pointee.isConstQualified();
}
} // namespace
QualifiedAutoCheck::QualifiedAutoCheck(StringRef Name,
ClangTidyContext *Context)
: ClangTidyCheck(Name, Context),

View File

@ -14,19 +14,18 @@ using namespace clang::ast_matchers;
namespace clang::tidy::readability {
namespace {
static const char *const RedundantReturnDiag =
"redundant return statement at the end "
"of a function with a void return type";
static const char *const RedundantContinueDiag =
"redundant continue statement at the "
"end of loop statement";
const char *const RedundantReturnDiag = "redundant return statement at the end "
"of a function with a void return type";
const char *const RedundantContinueDiag = "redundant continue statement at the "
"end of loop statement";
bool isLocationInMacroExpansion(const SourceManager &SM, SourceLocation Loc) {
static bool isLocationInMacroExpansion(const SourceManager &SM,
SourceLocation Loc) {
return SM.isMacroBodyExpansion(Loc) || SM.isMacroArgExpansion(Loc);
}
} // namespace
void RedundantControlFlowCheck::registerMatchers(MatchFinder *Finder) {
Finder->addMatcher(
functionDecl(isDefinition(), returns(voidType()),

View File

@ -13,16 +13,14 @@
namespace clang::tidy::utils::type_traits {
namespace {
bool classHasTrivialCopyAndDestroy(QualType Type) {
static bool classHasTrivialCopyAndDestroy(QualType Type) {
auto *Record = Type->getAsCXXRecordDecl();
return Record && Record->hasDefinition() &&
!Record->hasNonTrivialCopyConstructor() &&
!Record->hasNonTrivialDestructor();
}
bool hasDeletedCopyConstructor(QualType Type) {
static bool hasDeletedCopyConstructor(QualType Type) {
auto *Record = Type->getAsCXXRecordDecl();
if (!Record || !Record->hasDefinition())
return false;
@ -33,8 +31,6 @@ bool hasDeletedCopyConstructor(QualType Type) {
return false;
}
} // namespace
std::optional<bool> isExpensiveToCopy(QualType Type,
const ASTContext &Context) {
if (Type->isDependentType() || Type->isIncompleteType())

View File

@ -309,6 +309,13 @@ NVPTX Support
X86 Support
^^^^^^^^^^^
- More SSE, AVX and AVX512 intrinsics, including initializers and general
arithmetic can now be used in C++ constant expressions.
- Some SSE, AVX and AVX512 intrinsics have been converted to wrap
generic __builtin intrinsics.
- NOTE: Please avoid use of the __builtin_ia32_* intrinsics - these are not
guaranteed to exist in future releases, or match behaviour with previous
releases of clang or other compilers.
Arm and AArch64 Support
^^^^^^^^^^^^^^^^^^^^^^^

View File

@ -627,11 +627,23 @@ let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] i
let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
def pmuldq256 : X86Builtin<"_Vector<4, long long int>(_Vector<8, int>, _Vector<8, int>)">;
def pmuludq256 : X86Builtin<"_Vector<4, long long int>(_Vector<8, int>, _Vector<8, int>)">;
}
let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
def pmulhuw256 : X86Builtin<"_Vector<16, unsigned short>(_Vector<16, unsigned short>, _Vector<16, unsigned short>)">;
def pmulhw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
def psllv8si : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
def psrav8si : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
def psrlv8si : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
def psllv4di : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
def psrlv4di : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
}
let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
def psllv4si : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
def psrav4si : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
def psrlv4si : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
def psllv2di : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
def psrlv2di : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
}
let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
@ -654,46 +666,6 @@ let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
def maskstoreq : X86Builtin<"void(_Vector<2, long long int *>, _Vector<2, long long int>, _Vector<2, long long int>)">;
}
let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
def psllv8si : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
}
let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
def psllv4si : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
}
let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
def psllv4di : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
}
let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
def psllv2di : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
}
let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
def psrav8si : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
}
let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
def psrav4si : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
}
let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
def psrlv8si : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
}
let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
def psrlv4si : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
}
let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
def psrlv4di : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
}
let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
def psrlv2di : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
}
let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
def gatherd_pd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, double const *, _Vector<4, int>, _Vector<2, double>, _Constant char)">;
}

View File

@ -11669,13 +11669,24 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
case clang::X86::BI__builtin_ia32_pmulhuw512:
case clang::X86::BI__builtin_ia32_pmulhw128:
case clang::X86::BI__builtin_ia32_pmulhw256:
case clang::X86::BI__builtin_ia32_pmulhw512: {
case clang::X86::BI__builtin_ia32_pmulhw512:
case clang::X86::BI__builtin_ia32_psllv2di:
case clang::X86::BI__builtin_ia32_psllv4di:
case clang::X86::BI__builtin_ia32_psllv4si:
case clang::X86::BI__builtin_ia32_psllv8si:
case clang::X86::BI__builtin_ia32_psrav4si:
case clang::X86::BI__builtin_ia32_psrav8si:
case clang::X86::BI__builtin_ia32_psrlv2di:
case clang::X86::BI__builtin_ia32_psrlv4di:
case clang::X86::BI__builtin_ia32_psrlv4si:
case clang::X86::BI__builtin_ia32_psrlv8si:{
APValue SourceLHS, SourceRHS;
if (!EvaluateAsRValue(Info, E->getArg(0), SourceLHS) ||
!EvaluateAsRValue(Info, E->getArg(1), SourceRHS))
return false;
QualType DestEltTy = E->getType()->castAs<VectorType>()->getElementType();
bool DestUnsigned = DestEltTy->isUnsignedIntegerOrEnumerationType();
unsigned SourceLen = SourceLHS.getVectorLength();
SmallVector<APValue, 4> ResultElements;
ResultElements.reserve(SourceLen);
@ -11687,12 +11698,12 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
case Builtin::BI__builtin_elementwise_add_sat:
ResultElements.push_back(APValue(
APSInt(LHS.isSigned() ? LHS.sadd_sat(RHS) : LHS.uadd_sat(RHS),
DestEltTy->isUnsignedIntegerOrEnumerationType())));
DestUnsigned)));
break;
case Builtin::BI__builtin_elementwise_sub_sat:
ResultElements.push_back(APValue(
APSInt(LHS.isSigned() ? LHS.ssub_sat(RHS) : LHS.usub_sat(RHS),
DestEltTy->isUnsignedIntegerOrEnumerationType())));
DestUnsigned)));
break;
case clang::X86::BI__builtin_ia32_pmulhuw128:
case clang::X86::BI__builtin_ia32_pmulhuw256:
@ -11706,6 +11717,40 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
ResultElements.push_back(APValue(APSInt(llvm::APIntOps::mulhs(LHS, RHS),
/*isUnsigned=*/false)));
break;
case clang::X86::BI__builtin_ia32_psllv2di:
case clang::X86::BI__builtin_ia32_psllv4di:
case clang::X86::BI__builtin_ia32_psllv4si:
case clang::X86::BI__builtin_ia32_psllv8si:
if (RHS.uge(RHS.getBitWidth())) {
ResultElements.push_back(
APValue(APSInt(APInt::getZero(RHS.getBitWidth()), DestUnsigned)));
break;
}
ResultElements.push_back(
APValue(APSInt(LHS.shl(RHS.getZExtValue()), DestUnsigned)));
break;
case clang::X86::BI__builtin_ia32_psrav4si:
case clang::X86::BI__builtin_ia32_psrav8si:
if (RHS.uge(RHS.getBitWidth())) {
ResultElements.push_back(
APValue(APSInt(LHS.ashr(RHS.getBitWidth() - 1), DestUnsigned)));
break;
}
ResultElements.push_back(
APValue(APSInt(LHS.ashr(RHS.getZExtValue()), DestUnsigned)));
break;
case clang::X86::BI__builtin_ia32_psrlv2di:
case clang::X86::BI__builtin_ia32_psrlv4di:
case clang::X86::BI__builtin_ia32_psrlv4si:
case clang::X86::BI__builtin_ia32_psrlv8si:
if (RHS.uge(RHS.getBitWidth())) {
ResultElements.push_back(
APValue(APSInt(APInt::getZero(RHS.getBitWidth()), DestUnsigned)));
break;
}
ResultElements.push_back(
APValue(APSInt(LHS.lshr(RHS.getZExtValue()), DestUnsigned)));
break;
}
}

View File

@ -3721,7 +3721,7 @@ _mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y)
/// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
/// bits).
/// \returns A 256-bit vector of [8 x i32] containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_sllv_epi32(__m256i __X, __m256i __Y)
{
return (__m256i)__builtin_ia32_psllv8si((__v8si)__X, (__v8si)__Y);
@ -3743,7 +3743,7 @@ _mm256_sllv_epi32(__m256i __X, __m256i __Y)
/// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
/// bits).
/// \returns A 128-bit vector of [4 x i32] containing the result.
static __inline__ __m128i __DEFAULT_FN_ATTRS128
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_sllv_epi32(__m128i __X, __m128i __Y)
{
return (__m128i)__builtin_ia32_psllv4si((__v4si)__X, (__v4si)__Y);
@ -3765,7 +3765,7 @@ _mm_sllv_epi32(__m128i __X, __m128i __Y)
/// A 256-bit vector of [4 x i64] containing the unsigned shift counts (in
/// bits).
/// \returns A 256-bit vector of [4 x i64] containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_sllv_epi64(__m256i __X, __m256i __Y)
{
return (__m256i)__builtin_ia32_psllv4di((__v4di)__X, (__v4di)__Y);
@ -3787,7 +3787,7 @@ _mm256_sllv_epi64(__m256i __X, __m256i __Y)
/// A 128-bit vector of [2 x i64] containing the unsigned shift counts (in
/// bits).
/// \returns A 128-bit vector of [2 x i64] containing the result.
static __inline__ __m128i __DEFAULT_FN_ATTRS128
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_sllv_epi64(__m128i __X, __m128i __Y)
{
return (__m128i)__builtin_ia32_psllv2di((__v2di)__X, (__v2di)__Y);
@ -3810,7 +3810,7 @@ _mm_sllv_epi64(__m128i __X, __m128i __Y)
/// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
/// bits).
/// \returns A 256-bit vector of [8 x i32] containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_srav_epi32(__m256i __X, __m256i __Y)
{
return (__m256i)__builtin_ia32_psrav8si((__v8si)__X, (__v8si)__Y);
@ -3833,7 +3833,7 @@ _mm256_srav_epi32(__m256i __X, __m256i __Y)
/// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
/// bits).
/// \returns A 128-bit vector of [4 x i32] containing the result.
static __inline__ __m128i __DEFAULT_FN_ATTRS128
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_srav_epi32(__m128i __X, __m128i __Y)
{
return (__m128i)__builtin_ia32_psrav4si((__v4si)__X, (__v4si)__Y);
@ -3855,7 +3855,7 @@ _mm_srav_epi32(__m128i __X, __m128i __Y)
/// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
/// bits).
/// \returns A 256-bit vector of [8 x i32] containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_srlv_epi32(__m256i __X, __m256i __Y)
{
return (__m256i)__builtin_ia32_psrlv8si((__v8si)__X, (__v8si)__Y);
@ -3877,7 +3877,7 @@ _mm256_srlv_epi32(__m256i __X, __m256i __Y)
/// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
/// bits).
/// \returns A 128-bit vector of [4 x i32] containing the result.
static __inline__ __m128i __DEFAULT_FN_ATTRS128
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_srlv_epi32(__m128i __X, __m128i __Y)
{
return (__m128i)__builtin_ia32_psrlv4si((__v4si)__X, (__v4si)__Y);
@ -3899,7 +3899,7 @@ _mm_srlv_epi32(__m128i __X, __m128i __Y)
/// A 256-bit vector of [4 x i64] containing the unsigned shift counts (in
/// bits).
/// \returns A 256-bit vector of [4 x i64] containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_srlv_epi64(__m256i __X, __m256i __Y)
{
return (__m256i)__builtin_ia32_psrlv4di((__v4di)__X, (__v4di)__Y);
@ -3921,7 +3921,7 @@ _mm256_srlv_epi64(__m256i __X, __m256i __Y)
/// A 128-bit vector of [2 x i64] containing the unsigned shift counts (in
/// bits).
/// \returns A 128-bit vector of [2 x i64] containing the result.
static __inline__ __m128i __DEFAULT_FN_ATTRS128
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_srlv_epi64(__m128i __X, __m128i __Y)
{
return (__m128i)__builtin_ia32_psrlv2di((__v2di)__X, (__v2di)__Y);

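With the declarations above switched to the `_CONSTEXPR` attribute variants, these variable shifts can participate in constant expressions. A minimal usage sketch, assuming a clang build that contains this change and `-mavx2`:

```c++
#include <immintrin.h>

// Folded entirely at compile time once the builtin is constexpr-evaluable.
constexpr __m128i Shifted =
    _mm_sllv_epi32((__m128i)(__v4si){1, 2, 3, 4}, (__m128i)(__v4si){0, 1, 2, 3});
// Per-element result: {1 << 0, 2 << 1, 3 << 2, 4 << 3} == {1, 4, 12, 32}.
```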

@ -327,7 +327,6 @@ __m256i test_mm256_cvtepi8_epi16(__m128i a) {
// CHECK: sext <16 x i8> %{{.*}} to <16 x i16>
return _mm256_cvtepi8_epi16(a);
}
TEST_CONSTEXPR(match_v16hi(_mm256_cvtepi8_epi16(_mm_setr_epi8(-3, 2, -1, 0, 1, -2, 3, -4, 5, -6, 7, -8, 9, -10, 11, -12)), -3, 2, -1, 0, 1, -2, 3, -4, 5, -6, 7, -8, 9, -10, 11, -12));
__m256i test_mm256_cvtepi8_epi32(__m128i a) {
@ -336,7 +335,6 @@ __m256i test_mm256_cvtepi8_epi32(__m128i a) {
// CHECK: sext <8 x i8> %{{.*}} to <8 x i32>
return _mm256_cvtepi8_epi32(a);
}
TEST_CONSTEXPR(match_v8si(_mm256_cvtepi8_epi32(_mm_setr_epi8(-3, 2, -1, 0, 1, -2, 3, -4, 5, -6, 7, -8, 9, -10, 11, -12)), -3, 2, -1, 0, 1, -2, 3, -4));
__m256i test_mm256_cvtepi8_epi64(__m128i a) {
@ -345,7 +343,6 @@ __m256i test_mm256_cvtepi8_epi64(__m128i a) {
// CHECK: sext <4 x i8> %{{.*}} to <4 x i64>
return _mm256_cvtepi8_epi64(a);
}
TEST_CONSTEXPR(match_v4di(_mm256_cvtepi8_epi64(_mm_setr_epi8(-3, 2, -1, 0, 1, -2, 3, -4, 5, -6, 7, -8, 9, -10, 11, -12)), -3, 2, -1, 0));
__m256i test_mm256_cvtepi16_epi32(__m128i a) {
@ -353,7 +350,6 @@ __m256i test_mm256_cvtepi16_epi32(__m128i a) {
// CHECK: sext <8 x i16> %{{.*}} to <8 x i32>
return _mm256_cvtepi16_epi32(a);
}
TEST_CONSTEXPR(match_v8si(_mm256_cvtepi16_epi32(_mm_setr_epi16(-300, 2, -1, 0, 1, -2, 3, -4)), -300, 2, -1, 0, 1, -2, 3, -4));
__m256i test_mm256_cvtepi16_epi64(__m128i a) {
@ -362,7 +358,6 @@ __m256i test_mm256_cvtepi16_epi64(__m128i a) {
// CHECK: sext <4 x i16> %{{.*}} to <4 x i64>
return _mm256_cvtepi16_epi64(a);
}
TEST_CONSTEXPR(match_v4di(_mm256_cvtepi16_epi64(_mm_setr_epi16(-300, 2, -1, 0, 1, -2, 3, -4)), -300, 2, -1, 0));
__m256i test_mm256_cvtepi32_epi64(__m128i a) {
@ -370,7 +365,6 @@ __m256i test_mm256_cvtepi32_epi64(__m128i a) {
// CHECK: sext <4 x i32> %{{.*}} to <4 x i64>
return _mm256_cvtepi32_epi64(a);
}
TEST_CONSTEXPR(match_v4di(_mm256_cvtepi32_epi64(_mm_setr_epi32(-70000, 2, -1, 0)), -70000, 2, -1, 0));
__m256i test_mm256_cvtepu8_epi16(__m128i a) {
@ -378,7 +372,6 @@ __m256i test_mm256_cvtepu8_epi16(__m128i a) {
// CHECK: zext <16 x i8> %{{.*}} to <16 x i16>
return _mm256_cvtepu8_epi16(a);
}
TEST_CONSTEXPR(match_v16hi(_mm256_cvtepu8_epi16(_mm_setr_epi8(-3, 2, -1, 0, 1, -2, 3, -4, 5, -6, 7, -8, 9, -10, 11, -12)), 253, 2, 255, 0, 1, 254, 3, 252, 5, 250, 7, 248, 9, 246, 11, 244));
__m256i test_mm256_cvtepu8_epi32(__m128i a) {
@ -387,7 +380,6 @@ __m256i test_mm256_cvtepu8_epi32(__m128i a) {
// CHECK: zext <8 x i8> %{{.*}} to <8 x i32>
return _mm256_cvtepu8_epi32(a);
}
TEST_CONSTEXPR(match_v8si(_mm256_cvtepu8_epi32(_mm_setr_epi8(-3, 2, -1, 0, 1, -2, 3, -4, 5, -6, 7, -8, 9, -10, 11, -12)), 253, 2, 255, 0, 1, 254, 3, 252));
__m256i test_mm256_cvtepu8_epi64(__m128i a) {
@ -396,7 +388,6 @@ __m256i test_mm256_cvtepu8_epi64(__m128i a) {
// CHECK: zext <4 x i8> %{{.*}} to <4 x i64>
return _mm256_cvtepu8_epi64(a);
}
TEST_CONSTEXPR(match_v4di(_mm256_cvtepu8_epi64(_mm_setr_epi8(-3, 2, -1, 0, 1, -2, 3, -4, 5, -6, 7, -8, 9, -10, 11, -12)), 253, 2, 255, 0));
__m256i test_mm256_cvtepu16_epi32(__m128i a) {
@ -404,7 +395,6 @@ __m256i test_mm256_cvtepu16_epi32(__m128i a) {
// CHECK: zext <8 x i16> {{.*}} to <8 x i32>
return _mm256_cvtepu16_epi32(a);
}
TEST_CONSTEXPR(match_v8si(_mm256_cvtepu16_epi32(_mm_setr_epi16(-300, 2, -1, 0, 1, -2, 3, -4)), 65236, 2, 65535, 0, 1, 65534, 3, 65532));
__m256i test_mm256_cvtepu16_epi64(__m128i a) {
@ -413,7 +403,6 @@ __m256i test_mm256_cvtepu16_epi64(__m128i a) {
// CHECK: zext <4 x i16> %{{.*}} to <4 x i64>
return _mm256_cvtepu16_epi64(a);
}
TEST_CONSTEXPR(match_v4di(_mm256_cvtepu16_epi64(_mm_setr_epi16(-300, 2, -1, 0, 1, -2, 3, -4)), 65236, 2, 65535, 0));
__m256i test_mm256_cvtepu32_epi64(__m128i a) {
@ -421,7 +410,6 @@ __m256i test_mm256_cvtepu32_epi64(__m128i a) {
// CHECK: zext <4 x i32> %{{.*}} to <4 x i64>
return _mm256_cvtepu32_epi64(a);
}
TEST_CONSTEXPR(match_v4di(_mm256_cvtepu32_epi64(_mm_setr_epi32(-70000, 2, -1, 0)), 4294897296, 2, 4294967295, 0));
__m128i test0_mm256_extracti128_si256_0(__m256i a) {
@ -1120,24 +1108,28 @@ __m128i test_mm_sllv_epi32(__m128i a, __m128i b) {
// CHECK: call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
return _mm_sllv_epi32(a, b);
}
TEST_CONSTEXPR(match_v4si(_mm_sllv_epi32((__m128i)(__v4si){1, -2, 3, -4}, (__m128i)(__v4si){1, 2, 3, -4}), 2, -8, 24, 0));
__m256i test_mm256_sllv_epi32(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_sllv_epi32
// CHECK: call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
return _mm256_sllv_epi32(a, b);
}
TEST_CONSTEXPR(match_v8si(_mm256_sllv_epi32((__m256i)(__v8si){1, -2, 3, -4, 5, -6, 7, -8}, (__m256i)(__v8si){1, 2, 3, 4, -17, 31, 33, 29}), 2, -8, 24, -64, 0, 0, 0, 0));
__m128i test_mm_sllv_epi64(__m128i a, __m128i b) {
// CHECK-LABEL: test_mm_sllv_epi64
// CHECK: call {{.*}}<2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
return _mm_sllv_epi64(a, b);
}
TEST_CONSTEXPR(match_m128i(_mm_sllv_epi64((__m128i)(__v2di){1, -3}, (__m128i)(__v2di){8, 63}), 256, 0x8000000000000000ULL));
__m256i test_mm256_sllv_epi64(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_sllv_epi64
// CHECK: call {{.*}}<4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %{{.*}}, <4 x i64> %{{.*}})
return _mm256_sllv_epi64(a, b);
}
TEST_CONSTEXPR(match_m256i(_mm256_sllv_epi64((__m256i)(__v4di){1, -2, 3, -4}, (__m256i)(__v4di){1, 2, 3, -4}), 2, -8, 24, 0));
__m256i test_mm256_sra_epi16(__m256i a, __m128i b) {
// CHECK-LABEL: test_mm256_sra_epi16
@ -1180,12 +1172,14 @@ __m128i test_mm_srav_epi32(__m128i a, __m128i b) {
// CHECK: call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
return _mm_srav_epi32(a, b);
}
TEST_CONSTEXPR(match_v4si(_mm_srav_epi32((__m128i)(__v4si){1, -2, 3, -4}, (__m128i)(__v4si){1, 2, 3, -4}), 0, -1, 0, -1));
__m256i test_mm256_srav_epi32(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_srav_epi32
// CHECK: call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
return _mm256_srav_epi32(a, b);
}
TEST_CONSTEXPR(match_v8si(_mm256_srav_epi32((__m256i)(__v8si){1, -2, 3, -4, 5, -6, 7, -8}, (__m256i)(__v8si){1, 2, 3, 4, -17, 31, 33, 29}), 0, -1, 0, -1, 0, -1, 0, -1));
__m256i test_mm256_srl_epi16(__m256i a, __m128i b) {
// CHECK-LABEL: test_mm256_srl_epi16
@ -1252,24 +1246,28 @@ __m128i test_mm_srlv_epi32(__m128i a, __m128i b) {
// CHECK: call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
return _mm_srlv_epi32(a, b);
}
TEST_CONSTEXPR(match_v4si(_mm_srlv_epi32((__m128i)(__v4si){1, -2, 3, -4}, (__m128i)(__v4si){1, 2, 3, -4}), 0, 1073741823, 0, 0));
__m256i test_mm256_srlv_epi32(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_srlv_epi32
// CHECK: call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
return _mm256_srlv_epi32(a, b);
}
TEST_CONSTEXPR(match_v8si(_mm256_srlv_epi32((__m256i)(__v8si){1, -2, 3, -4, 5, -6, 7, -8}, (__m256i)(__v8si){1, 2, 3, 4, -17, 31, 33, 29}), 0, 1073741823, 0, 268435455, 0, 1, 0, 7));
__m128i test_mm_srlv_epi64(__m128i a, __m128i b) {
// CHECK-LABEL: test_mm_srlv_epi64
// CHECK: call {{.*}}<2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
return _mm_srlv_epi64(a, b);
}
TEST_CONSTEXPR(match_m128i(_mm_srlv_epi64((__m128i)(__v2di){1, -3}, (__m128i)(__v2di){8, 63}), 0, 1));
__m256i test_mm256_srlv_epi64(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_srlv_epi64
// CHECK: call {{.*}}<4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %{{.*}}, <4 x i64> %{{.*}})
return _mm256_srlv_epi64(a, b);
}
TEST_CONSTEXPR(match_m256i(_mm256_srlv_epi64((__m256i)(__v4di){1, -2, 3, -4}, (__m256i)(__v4di){1, 2, 3, -4}), 0, 0x3FFFFFFFFFFFFFFFULL, 0, 0));
__m256i test_mm256_stream_load_si256(__m256i const *a) {
// CHECK-LABEL: test_mm256_stream_load_si256


@ -106,6 +106,12 @@ if(MSVC)
endif()
set(ASAN_CFLAGS ${SANITIZER_COMMON_CFLAGS})
# Win/ASan relies on the runtime functions being hotpatchable. See
# https://github.com/llvm/llvm-project/pull/149444
if(MSVC)
list(APPEND ASAN_CFLAGS /hotpatch)
endif()
append_list_if(MSVC /Zl ASAN_CFLAGS)
set(ASAN_COMMON_DEFINITIONS "")


@ -792,7 +792,7 @@ static void PrintNoOriginTrackingWarning() {
static void PrintNoTaintWarning(const void *address) {
Decorator d;
Printf(" %sDFSan: no tainted value at %zx%s\n", d.Warning(), (uptr)address,
Printf(" %sDFSan: no tainted value at %x%s\n", d.Warning(), address,
d.Default());
}


@ -176,7 +176,7 @@ static void HwasanFormatMemoryUsage(InternalScopedString &s) {
"HWASAN pid: %d rss: %zd threads: %zd stacks: %zd"
" thr_aux: %zd stack_depot: %zd uniq_stacks: %zd"
" heap: %zd",
(int)internal_getpid(), GetRSS(), thread_stats.n_live_threads,
internal_getpid(), GetRSS(), thread_stats.n_live_threads,
thread_stats.total_stack_size,
thread_stats.n_live_threads * thread_list.MemoryUsedPerThread(),
sds.allocated, sds.n_uniq_ids, asc[AllocatorStatMapped]);
@ -692,7 +692,7 @@ void __hwasan_handle_longjmp(const void *sp_dst) {
"WARNING: HWASan is ignoring requested __hwasan_handle_longjmp: "
"stack top: %p; target %p; distance: %p (%zd)\n"
"False positive error reports may follow\n",
(void *)sp, (void *)dst, (void *)(dst - sp), dst - sp);
(void *)sp, (void *)dst, dst - sp, dst - sp);
return;
}
TagMemory(sp, dst - sp, 0);


@ -41,7 +41,7 @@ static inline bool malloc_bisect(StackTrace *stack, uptr orig_size) {
if (h < left || h > right)
return false;
if (flags()->malloc_bisect_dump) {
Printf("[alloc] %u %zu\n", (u32)h, orig_size);
Printf("[alloc] %u %zu\n", h, orig_size);
stack->Print();
}
return true;


@ -306,9 +306,8 @@ static void PrintStackAllocations(const StackAllocationsRingBuffer *sa,
"%p is located %zd bytes %s a %zd-byte local variable %s "
"[%p,%p) "
"in %s %s\n",
(void *)untagged_addr, offset, whence, local.size, local.name,
(void *)best_beg, (void *)(best_beg + local.size),
local.function_name, location.data());
untagged_addr, offset, whence, local.size, local.name, best_beg,
best_beg + local.size, local.function_name, location.data());
location.clear();
Printf("%s\n", d.Default());
}
@ -739,8 +738,8 @@ void BaseReport::PrintHeapOrGlobalCandidate() const {
Printf("%s", d.Location());
Printf("%p is located %zd bytes %s a %zd-byte region [%p,%p)\n",
untagged_addr, offset, whence,
candidate.heap.end - candidate.heap.begin,
(void *)candidate.heap.begin, (void *)candidate.heap.end);
candidate.heap.end - candidate.heap.begin, candidate.heap.begin,
candidate.heap.end);
Printf("%s", d.Allocation());
Printf("allocated by thread T%u here:\n", candidate.heap.thread_id);
Printf("%s", d.Default());
@ -763,11 +762,11 @@ void BaseReport::PrintHeapOrGlobalCandidate() const {
Printf(
"%p is located %zd bytes %s a %zd-byte global variable "
"%s [%p,%p) in %s\n",
(void *)untagged_addr,
untagged_addr,
candidate.after ? untagged_addr - (info.start + info.size)
: info.start - untagged_addr,
candidate.after ? "after" : "before", info.size, info.name,
(void *)info.start, (void *)(info.start + info.size), module_name);
info.start, info.start + info.size, module_name);
} else {
uptr size = GetGlobalSizeFromDescriptor(candidate.untagged_addr);
if (size == 0)
@ -775,14 +774,14 @@ void BaseReport::PrintHeapOrGlobalCandidate() const {
Printf(
"%p is located %s a global variable in "
"\n #0 0x%x (%s+0x%x)\n",
(void *)untagged_addr, candidate.after ? "after" : "before",
(void *)candidate.untagged_addr, module_name, (u32)module_address);
untagged_addr, candidate.after ? "after" : "before",
candidate.untagged_addr, module_name, module_address);
else
Printf(
"%p is located %s a %zd-byte global variable in "
"\n #0 0x%x (%s+0x%x)\n",
(void *)untagged_addr, candidate.after ? "after" : "before", size,
(void *)candidate.untagged_addr, module_name, (u32)module_address);
untagged_addr, candidate.after ? "after" : "before", size,
candidate.untagged_addr, module_name, module_address);
}
Printf("%s", d.Default());
}
@ -793,8 +792,8 @@ void BaseReport::PrintAddressDescription() const {
int num_descriptions_printed = 0;
if (MemIsShadow(untagged_addr)) {
Printf("%s%p is HWAsan shadow memory.\n%s", d.Location(),
(void *)untagged_addr, d.Default());
Printf("%s%p is HWAsan shadow memory.\n%s", d.Location(), untagged_addr,
d.Default());
return;
}
@ -803,7 +802,7 @@ void BaseReport::PrintAddressDescription() const {
Printf(
"%s[%p,%p) is a %s %s heap chunk; "
"size: %zd offset: %zd\n%s",
d.Location(), (void *)heap.begin, (void *)(heap.begin + heap.size),
d.Location(), heap.begin, heap.begin + heap.size,
heap.from_small_heap ? "small" : "large",
heap.is_allocated ? "allocated" : "unallocated", heap.size,
untagged_addr - heap.begin, d.Default());
@ -822,8 +821,8 @@ void BaseReport::PrintAddressDescription() const {
Printf("%s", d.Error());
Printf("\nCause: stack tag-mismatch\n");
Printf("%s", d.Location());
Printf("Address %p is located in stack of thread T%zd\n",
(void *)untagged_addr, (ssize)sa.thread_id());
Printf("Address %p is located in stack of thread T%zd\n", untagged_addr,
sa.thread_id());
Printf("%s", d.Default());
announce_by_id(sa.thread_id());
PrintStackAllocations(sa.get(), ptr_tag, untagged_addr);
@ -843,9 +842,9 @@ void BaseReport::PrintAddressDescription() const {
Printf("\nCause: use-after-free\n");
Printf("%s", d.Location());
Printf("%p is located %zd bytes inside a %zd-byte region [%p,%p)\n",
(void *)untagged_addr, untagged_addr - UntagAddr(har.tagged_addr),
(ssize)har.requested_size, UntagAddr(har.tagged_addr),
(void *)(UntagAddr(har.tagged_addr) + har.requested_size));
untagged_addr, untagged_addr - UntagAddr(har.tagged_addr),
har.requested_size, UntagAddr(har.tagged_addr),
UntagAddr(har.tagged_addr) + har.requested_size);
Printf("%s", d.Allocation());
Printf("freed by thread T%u here:\n", ha.free_thread_id);
Printf("%s", d.Default());
@ -859,7 +858,7 @@ void BaseReport::PrintAddressDescription() const {
// Print a developer note: the index of this heap object
// in the thread's deallocation ring buffer.
Printf("hwasan_dev_note_heap_rb_distance: %zd %zd\n", ha.ring_index + 1,
(ssize)flags()->heap_history_size);
flags()->heap_history_size);
Printf("hwasan_dev_note_num_matching_addrs: %zd\n", ha.num_matching_addrs);
Printf("hwasan_dev_note_num_matching_addrs_4b: %zd\n",
ha.num_matching_addrs_4b);
@ -916,11 +915,10 @@ InvalidFreeReport::~InvalidFreeReport() {
const Thread *thread = GetCurrentThread();
if (thread) {
Report("ERROR: %s: %s on address %p at pc %p on thread T%zd\n",
SanitizerToolName, bug_type, (void *)untagged_addr, (void *)pc,
(ssize)thread->unique_id());
SanitizerToolName, bug_type, untagged_addr, pc, thread->unique_id());
} else {
Report("ERROR: %s: %s on address %p at pc %p on unknown thread\n",
SanitizerToolName, bug_type, (void *)untagged_addr, (void *)pc);
SanitizerToolName, bug_type, untagged_addr, pc);
}
Printf("%s", d.Access());
if (shadow.addr) {
@ -969,8 +967,7 @@ TailOverwrittenReport::~TailOverwrittenReport() {
Printf("%s", d.Error());
const char *bug_type = "allocation-tail-overwritten";
Report("ERROR: %s: %s; heap object [%p,%p) of size %zd\n", SanitizerToolName,
bug_type, (void *)untagged_addr, (void *)(untagged_addr + orig_size),
orig_size);
bug_type, untagged_addr, untagged_addr + orig_size, orig_size);
Printf("\n%s", d.Default());
Printf(
"Stack of invalid access unknown. Issue detected at deallocation "
@ -1040,7 +1037,7 @@ TagMismatchReport::~TagMismatchReport() {
uptr pc = GetTopPc(stack);
Printf("%s", d.Error());
Report("ERROR: %s: %s on address %p at pc %p\n", SanitizerToolName, bug_type,
(void *)untagged_addr, (void *)pc);
untagged_addr, pc);
Thread *t = GetCurrentThread();
@ -1052,12 +1049,12 @@ TagMismatchReport::~TagMismatchReport() {
GetShortTagCopy(MemToShadow(untagged_addr + mismatch_offset));
Printf(
"%s of size %zu at %p tags: %02x/%02x(%02x) (ptr/mem) in thread T%zd\n",
is_store ? "WRITE" : "READ", access_size, (void *)untagged_addr,
ptr_tag, mem_tag, short_tag, (ssize)t->unique_id());
is_store ? "WRITE" : "READ", access_size, untagged_addr, ptr_tag,
mem_tag, short_tag, t->unique_id());
} else {
Printf("%s of size %zu at %p tags: %02x/%02x (ptr/mem) in thread T%zd\n",
is_store ? "WRITE" : "READ", access_size, (void *)untagged_addr,
ptr_tag, mem_tag, (ssize)t->unique_id());
is_store ? "WRITE" : "READ", access_size, untagged_addr, ptr_tag,
mem_tag, t->unique_id());
}
if (mismatch_offset)
Printf("Invalid access starting at offset %zu\n", mismatch_offset);
@ -1096,7 +1093,7 @@ void ReportTagMismatch(StackTrace *stack, uptr tagged_addr, uptr access_size,
// See the frame breakdown defined in __hwasan_tag_mismatch (from
// hwasan_tag_mismatch_{aarch64,riscv64}.S).
void ReportRegisters(const uptr *frame, uptr pc) {
Printf("\nRegisters where the failure occurred (pc %p):\n", (void *)pc);
Printf("\nRegisters where the failure occurred (pc %p):\n", pc);
// We explicitly print a single line (4 registers/line) each iteration to
// reduce the amount of logcat error messages printed. Each Printf() will


@ -173,10 +173,9 @@ uptr Thread::stack_size() {
}
void Thread::Print(const char *Prefix) {
Printf("%sT%zd %p stack: [%p,%p) sz: %zd tls: [%p,%p)\n", Prefix,
(ssize_t)unique_id_, (void *)this, (void *)stack_bottom(),
(void *)stack_top(), stack_top() - stack_bottom(), (void *)tls_begin(),
(void *)tls_end());
Printf("%sT%zd %p stack: [%p,%p) sz: %zd tls: [%p,%p)\n", Prefix, unique_id_,
(void *)this, stack_bottom(), stack_top(),
stack_top() - stack_bottom(), tls_begin(), tls_end());
}
static u32 xorshift(u32 state) {


@ -806,7 +806,7 @@ static bool ReportUnsuspendedThreads(
succeded = false;
Report(
"Running thread %zu was not suspended. False leaks are possible.\n",
(usize)os_id);
os_id);
}
}
return succeded;


@ -29,7 +29,7 @@ static void ProtectGap(uptr addr, uptr size) {
Printf("protect_shadow_gap=0:"
" not protecting shadow gap, allocating gap's shadow\n"
"|| `[%p, %p]` || ShadowGap's shadow ||\n",
(void *)GapShadowBeg, (void *)GapShadowEnd);
GapShadowBeg, GapShadowEnd);
ReserveShadowMemoryRange(GapShadowBeg, GapShadowEnd,
"unprotected gap shadow");
return;


@ -105,7 +105,7 @@ __xray_register_sleds(const XRaySledEntry *SledsBegin,
}
if (Verbosity())
Report("Registering %d new functions!\n", (int)SledMap.Functions);
Report("Registering %d new functions!\n", SledMap.Functions);
{
SpinMutexLock Guard(&XRayInstrMapMutex);


@ -308,8 +308,7 @@ XRayPatchingStatus controlPatchingObjectUnchecked(bool Enable, int32_t ObjId) {
return XRayPatchingStatus::NOT_INITIALIZED;
if (Verbosity())
Report("Patching object %d with %d functions.\n", ObjId,
(int)InstrMap.Entries);
Report("Patching object %d with %d functions.\n", ObjId, InstrMap.Entries);
// Check if the corresponding DSO has been unloaded.
if (!InstrMap.Loaded) {


@ -280,6 +280,7 @@ set(TARGET_LIBC_ENTRYPOINTS
set(TARGET_LIBM_ENTRYPOINTS
# math.h entrypoints
libc.src.math.acos
libc.src.math.acosf
libc.src.math.acoshf
libc.src.math.asin


@ -2432,14 +2432,6 @@ functions:
return_type: double
arguments:
- type: double
- name: sincosf
standards:
- gnu
return_type: void
arguments:
- type: float
- type: float *
- type: float *
- name: sinf
standards:
- stdc
@ -2453,6 +2445,22 @@ functions:
arguments:
- type: _Float16
guard: LIBC_TYPES_HAS_FLOAT16
- name: sincos
standards:
- gnu
return_type: void
arguments:
- type: double
- type: double *
- type: double *
- name: sincosf
standards:
- gnu
return_type: void
arguments:
- type: float
- type: float *
- type: float *
- name: sinhf
standards:
- stdc

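The entries above declare the GNU `sincos` and `sincosf` signatures. A minimal usage sketch, assuming the target libc exports them:

```c++
#include <cstdio>

// Declarations matching the YAML entries above (GNU extension, not ISO C).
extern "C" void sincos(double x, double *sinp, double *cosp);
extern "C" void sincosf(float x, float *sinp, float *cosp);

int main() {
  double s = 0.0, c = 0.0;
  sincos(1.0, &s, &c); // one call computes both values
  std::printf("sin(1)=%f cos(1)=%f\n", s, c);
  return 0;
}
```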

@ -978,11 +978,11 @@ public:
# ifndef _LIBCPP_CXX03_LANG
_LIBCPP_HIDE_FROM_ABI map(map&& __m) noexcept(is_nothrow_move_constructible<__base>::value) = default;
_LIBCPP_HIDE_FROM_ABI map(map&& __m) = default;
_LIBCPP_HIDE_FROM_ABI map(map&& __m, const allocator_type& __a);
_LIBCPP_HIDE_FROM_ABI map& operator=(map&& __m) noexcept(is_nothrow_move_assignable<__base>::value) = default;
_LIBCPP_HIDE_FROM_ABI map& operator=(map&& __m) = default;
_LIBCPP_HIDE_FROM_ABI map(initializer_list<value_type> __il, const key_compare& __comp = key_compare())
: __tree_(__vc(__comp)) {
@ -1646,12 +1646,11 @@ public:
# ifndef _LIBCPP_CXX03_LANG
_LIBCPP_HIDE_FROM_ABI multimap(multimap&& __m) noexcept(is_nothrow_move_constructible<__base>::value) = default;
_LIBCPP_HIDE_FROM_ABI multimap(multimap&& __m) = default;
_LIBCPP_HIDE_FROM_ABI multimap(multimap&& __m, const allocator_type& __a);
_LIBCPP_HIDE_FROM_ABI multimap&
operator=(multimap&& __m) noexcept(is_nothrow_move_assignable<__base>::value) = default;
_LIBCPP_HIDE_FROM_ABI multimap& operator=(multimap&& __m) = default;
_LIBCPP_HIDE_FROM_ABI multimap(initializer_list<value_type> __il, const key_compare& __comp = key_compare())
: __tree_(__vc(__comp)) {


@ -667,7 +667,7 @@ public:
_LIBCPP_HIDE_FROM_ABI set& operator=(const set& __s) = default;
# ifndef _LIBCPP_CXX03_LANG
_LIBCPP_HIDE_FROM_ABI set(set&& __s) noexcept(is_nothrow_move_constructible<__base>::value) = default;
_LIBCPP_HIDE_FROM_ABI set(set&& __s) = default;
# endif // _LIBCPP_CXX03_LANG
_LIBCPP_HIDE_FROM_ABI explicit set(const allocator_type& __a) : __tree_(__a) {}
@ -699,10 +699,7 @@ public:
return *this;
}
_LIBCPP_HIDE_FROM_ABI set& operator=(set&& __s) noexcept(is_nothrow_move_assignable<__base>::value) {
__tree_ = std::move(__s.__tree_);
return *this;
}
_LIBCPP_HIDE_FROM_ABI set& operator=(set&& __s) = default;
# endif // _LIBCPP_CXX03_LANG
_LIBCPP_HIDE_FROM_ABI ~set() { static_assert(sizeof(std::__diagnose_non_const_comparator<_Key, _Compare>()), ""); }
@ -1126,7 +1123,7 @@ public:
_LIBCPP_HIDE_FROM_ABI multiset& operator=(const multiset& __s) = default;
# ifndef _LIBCPP_CXX03_LANG
_LIBCPP_HIDE_FROM_ABI multiset(multiset&& __s) noexcept(is_nothrow_move_constructible<__base>::value) = default;
_LIBCPP_HIDE_FROM_ABI multiset(multiset&& __s) = default;
_LIBCPP_HIDE_FROM_ABI multiset(multiset&& __s, const allocator_type& __a);
# endif // _LIBCPP_CXX03_LANG
@ -1158,10 +1155,7 @@ public:
return *this;
}
_LIBCPP_HIDE_FROM_ABI multiset& operator=(multiset&& __s) _NOEXCEPT_(is_nothrow_move_assignable<__base>::value) {
__tree_ = std::move(__s.__tree_);
return *this;
}
_LIBCPP_HIDE_FROM_ABI multiset& operator=(multiset&& __s) = default;
# endif // _LIBCPP_CXX03_LANG
_LIBCPP_HIDE_FROM_ABI ~multiset() {


@ -1049,8 +1049,7 @@ public:
_LIBCPP_HIDE_FROM_ABI unordered_map(const unordered_map& __u) = default;
_LIBCPP_HIDE_FROM_ABI unordered_map(const unordered_map& __u, const allocator_type& __a);
# ifndef _LIBCPP_CXX03_LANG
_LIBCPP_HIDE_FROM_ABI unordered_map(unordered_map&& __u)
_NOEXCEPT_(is_nothrow_move_constructible<__table>::value) = default;
_LIBCPP_HIDE_FROM_ABI unordered_map(unordered_map&& __u) = default;
_LIBCPP_HIDE_FROM_ABI unordered_map(unordered_map&& __u, const allocator_type& __a);
_LIBCPP_HIDE_FROM_ABI unordered_map(initializer_list<value_type> __il);
_LIBCPP_HIDE_FROM_ABI
@ -1102,8 +1101,7 @@ public:
_LIBCPP_HIDE_FROM_ABI unordered_map& operator=(const unordered_map& __u) = default;
# ifndef _LIBCPP_CXX03_LANG
_LIBCPP_HIDE_FROM_ABI unordered_map& operator=(unordered_map&& __u)
_NOEXCEPT_(is_nothrow_move_assignable<__table>::value) = default;
_LIBCPP_HIDE_FROM_ABI unordered_map& operator=(unordered_map&& __u) = default;
_LIBCPP_HIDE_FROM_ABI unordered_map& operator=(initializer_list<value_type> __il);
# endif // _LIBCPP_CXX03_LANG
@ -1823,8 +1821,7 @@ public:
_LIBCPP_HIDE_FROM_ABI unordered_multimap(const unordered_multimap& __u) = default;
_LIBCPP_HIDE_FROM_ABI unordered_multimap(const unordered_multimap& __u, const allocator_type& __a);
# ifndef _LIBCPP_CXX03_LANG
_LIBCPP_HIDE_FROM_ABI unordered_multimap(unordered_multimap&& __u)
_NOEXCEPT_(is_nothrow_move_constructible<__table>::value) = default;
_LIBCPP_HIDE_FROM_ABI unordered_multimap(unordered_multimap&& __u) = default;
_LIBCPP_HIDE_FROM_ABI unordered_multimap(unordered_multimap&& __u, const allocator_type& __a);
_LIBCPP_HIDE_FROM_ABI unordered_multimap(initializer_list<value_type> __il);
_LIBCPP_HIDE_FROM_ABI unordered_multimap(
@ -1876,8 +1873,7 @@ public:
_LIBCPP_HIDE_FROM_ABI unordered_multimap& operator=(const unordered_multimap& __u) = default;
# ifndef _LIBCPP_CXX03_LANG
_LIBCPP_HIDE_FROM_ABI unordered_multimap& operator=(unordered_multimap&& __u)
_NOEXCEPT_(is_nothrow_move_assignable<__table>::value) = default;
_LIBCPP_HIDE_FROM_ABI unordered_multimap& operator=(unordered_multimap&& __u) = default;
_LIBCPP_HIDE_FROM_ABI unordered_multimap& operator=(initializer_list<value_type> __il);
# endif // _LIBCPP_CXX03_LANG


@ -706,7 +706,7 @@ public:
_LIBCPP_HIDE_FROM_ABI unordered_set(const unordered_set& __u) = default;
_LIBCPP_HIDE_FROM_ABI unordered_set(const unordered_set& __u, const allocator_type& __a);
# ifndef _LIBCPP_CXX03_LANG
_LIBCPP_HIDE_FROM_ABI unordered_set(unordered_set&& __u) _NOEXCEPT_(is_nothrow_move_constructible<__table>::value);
_LIBCPP_HIDE_FROM_ABI unordered_set(unordered_set&& __u) = default;
_LIBCPP_HIDE_FROM_ABI unordered_set(unordered_set&& __u, const allocator_type& __a);
_LIBCPP_HIDE_FROM_ABI unordered_set(initializer_list<value_type> __il);
_LIBCPP_HIDE_FROM_ABI
@ -735,8 +735,7 @@ public:
_LIBCPP_HIDE_FROM_ABI unordered_set& operator=(const unordered_set& __u) = default;
# ifndef _LIBCPP_CXX03_LANG
_LIBCPP_HIDE_FROM_ABI unordered_set& operator=(unordered_set&& __u)
_NOEXCEPT_(is_nothrow_move_assignable<__table>::value) = default;
_LIBCPP_HIDE_FROM_ABI unordered_set& operator=(unordered_set&& __u) = default;
_LIBCPP_HIDE_FROM_ABI unordered_set& operator=(initializer_list<value_type> __il);
# endif // _LIBCPP_CXX03_LANG
@ -1076,11 +1075,6 @@ unordered_set<_Value, _Hash, _Pred, _Alloc>::unordered_set(const unordered_set&
# ifndef _LIBCPP_CXX03_LANG
template <class _Value, class _Hash, class _Pred, class _Alloc>
inline unordered_set<_Value, _Hash, _Pred, _Alloc>::unordered_set(unordered_set&& __u)
_NOEXCEPT_(is_nothrow_move_constructible<__table>::value)
: __table_(std::move(__u.__table_)) {}
template <class _Value, class _Hash, class _Pred, class _Alloc>
unordered_set<_Value, _Hash, _Pred, _Alloc>::unordered_set(unordered_set&& __u, const allocator_type& __a)
: __table_(std::move(__u.__table_), __a) {
@ -1294,8 +1288,7 @@ public:
_LIBCPP_HIDE_FROM_ABI unordered_multiset(const unordered_multiset& __u) = default;
_LIBCPP_HIDE_FROM_ABI unordered_multiset(const unordered_multiset& __u, const allocator_type& __a);
# ifndef _LIBCPP_CXX03_LANG
_LIBCPP_HIDE_FROM_ABI unordered_multiset(unordered_multiset&& __u)
_NOEXCEPT_(is_nothrow_move_constructible<__table>::value);
_LIBCPP_HIDE_FROM_ABI unordered_multiset(unordered_multiset&& __u) = default;
_LIBCPP_HIDE_FROM_ABI unordered_multiset(unordered_multiset&& __u, const allocator_type& __a);
_LIBCPP_HIDE_FROM_ABI unordered_multiset(initializer_list<value_type> __il);
_LIBCPP_HIDE_FROM_ABI unordered_multiset(
@ -1324,8 +1317,7 @@ public:
_LIBCPP_HIDE_FROM_ABI unordered_multiset& operator=(const unordered_multiset& __u) = default;
# ifndef _LIBCPP_CXX03_LANG
_LIBCPP_HIDE_FROM_ABI unordered_multiset& operator=(unordered_multiset&& __u)
_NOEXCEPT_(is_nothrow_move_assignable<__table>::value) = default;
_LIBCPP_HIDE_FROM_ABI unordered_multiset& operator=(unordered_multiset&& __u) = default;
_LIBCPP_HIDE_FROM_ABI unordered_multiset& operator=(initializer_list<value_type> __il);
# endif // _LIBCPP_CXX03_LANG
@ -1675,11 +1667,6 @@ unordered_multiset<_Value, _Hash, _Pred, _Alloc>::unordered_multiset(
# ifndef _LIBCPP_CXX03_LANG
template <class _Value, class _Hash, class _Pred, class _Alloc>
inline unordered_multiset<_Value, _Hash, _Pred, _Alloc>::unordered_multiset(unordered_multiset&& __u)
_NOEXCEPT_(is_nothrow_move_constructible<__table>::value)
: __table_(std::move(__u.__table_)) {}
template <class _Value, class _Hash, class _Pred, class _Alloc>
unordered_multiset<_Value, _Hash, _Pred, _Alloc>::unordered_multiset(
unordered_multiset&& __u, const allocator_type& __a)


@ -1399,6 +1399,7 @@ void SymbolTable::resolveAlternateNames() {
auto toUndef = dyn_cast<Undefined>(toSym);
if (toUndef && (!toUndef->weakAlias || toUndef->isAntiDep))
continue;
toSym->isUsedInRegularObj = true;
if (toSym->isLazy())
forceLazy(toSym);
u->setWeakAlias(toSym);


@ -0,0 +1,25 @@
; REQUIRES: x86
; RUN: mkdir -p %t.dir
; RUN: llvm-as -o %t.obj %s
; RUN: lld-link -out:%t.dll -dll -noentry %t.obj -export:test
target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-windows-msvc19.33.0"
$alt = comdat any
@alt = weak_odr dso_local global i32 0, comdat, align 4
@ext = external dso_local global i32, align 4
; Function Attrs: noinline nounwind optnone uwtable
define dso_local i32 @test() #0 {
entry:
%0 = load i32, ptr @ext, align 4
ret i32 %0
}
attributes #0 = { noinline nounwind optnone uwtable }
!llvm.linker.options = !{!0}
!0 = !{!"/alternatename:ext=alt"}


@ -347,6 +347,9 @@ Status Value::GetValueAsData(ExecutionContext *exe_ctx, DataExtractor &data,
else
data.SetAddressByteSize(sizeof(void *));
if (!type_size)
return Status::FromErrorString("type does not have a size");
uint32_t result_byte_size = *type_size;
if (m_value.GetData(data, result_byte_size))
return error; // Success;


@ -15,6 +15,7 @@ add_lldb_unittest(LLDBCoreTests
SourceManagerTest.cpp
TelemetryTest.cpp
UniqueCStringMapTest.cpp
Value.cpp
LINK_COMPONENTS
Support


@ -0,0 +1,39 @@
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "lldb/Core/Value.h"
#include "Plugins/Platform/MacOSX/PlatformMacOSX.h"
#include "Plugins/TypeSystem/Clang/TypeSystemClang.h"
#include "TestingSupport/SubsystemRAII.h"
#include "TestingSupport/Symbol/ClangTestUtils.h"
#include "lldb/Utility/DataExtractor.h"
#include "gtest/gtest.h"
using namespace lldb_private;
using namespace lldb_private::clang_utils;
TEST(ValueTest, GetValueAsData) {
SubsystemRAII<FileSystem, HostInfo, PlatformMacOSX> subsystems;
auto holder = std::make_unique<clang_utils::TypeSystemClangHolder>("test");
auto *clang = holder->GetAST();
Value v(Scalar(42));
DataExtractor extractor;
// no compiler type
Status status = v.GetValueAsData(nullptr, extractor, nullptr);
ASSERT_TRUE(status.Fail());
// with compiler type
v.SetCompilerType(clang->GetBasicType(lldb::BasicType::eBasicTypeChar));
status = v.GetValueAsData(nullptr, extractor, nullptr);
ASSERT_TRUE(status.Success());
}


@ -18983,7 +18983,9 @@ SDValue DAGCombiner::visitFP_ROUND(SDNode *N) {
// single-step fp_round we want to fold to.
// In other words, double rounding isn't the same as rounding.
// Also, this is a value preserving truncation iff both fp_round's are.
if (DAG.getTarget().Options.UnsafeFPMath || N0IsTrunc)
if ((N->getFlags().hasAllowContract() &&
N0->getFlags().hasAllowContract()) ||
N0IsTrunc)
return DAG.getNode(
ISD::FP_ROUND, DL, VT, N0.getOperand(0),
DAG.getIntPtrConstant(NIsTrunc && N0IsTrunc, DL, /*isTarget=*/true));
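The "double rounding isn't the same as rounding" caveat can be seen with a concrete value. An illustrative sketch (numbers chosen for the example, not taken from the patch; assumes a target where clang provides `_Float16`):

```c++
#include <cmath>
#include <cstdio>

int main() {
  // x lies just above the midpoint between the adjacent _Float16 values
  // 1.0 and 1.0 + 2^-10, but the 2^-30 term is lost when rounding to float.
  double x = 1.0 + std::ldexp(1.0, -11) + std::ldexp(1.0, -30);
  _Float16 Direct = static_cast<_Float16>(x);                       // 1.0009765625
  _Float16 TwoStep = static_cast<_Float16>(static_cast<float>(x));  // 1.0 (ties to even)
  std::printf("direct=%g two-step=%g\n", (double)Direct, (double)TwoStep);
  return 0;
}
```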


@ -18,6 +18,7 @@
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
@ -26,6 +27,7 @@
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/KnownFPClass.h"


@ -27,6 +27,7 @@
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/ErrorHandling.h"
#ifdef EXPENSIVE_CHECKS


@ -6135,6 +6135,19 @@ unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr(
}
}
bool AMDGPUTargetLowering::canCreateUndefOrPoisonForTargetNode(
SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
unsigned Opcode = Op.getOpcode();
switch (Opcode) {
case AMDGPUISD::BFE_I32:
case AMDGPUISD::BFE_U32:
return false;
}
return TargetLowering::canCreateUndefOrPoisonForTargetNode(
Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
}
bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(
SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN,
unsigned Depth) const {


@ -323,6 +323,12 @@ public:
const MachineRegisterInfo &MRI,
unsigned Depth = 0) const override;
bool canCreateUndefOrPoisonForTargetNode(SDValue Op,
const APInt &DemandedElts,
const SelectionDAG &DAG,
bool PoisonOnly, bool ConsiderFlags,
unsigned Depth) const override;
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts,
const SelectionDAG &DAG, bool SNaN = false,
unsigned Depth = 0) const override;


@ -21,6 +21,7 @@
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Utils/Local.h"


@ -225,6 +225,7 @@
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/InstSimplifyFolder.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/Utils/Local.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/AttributeMask.h"
@ -243,6 +244,7 @@
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/ReplaceConstant.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/AMDGPUAddrSpace.h"
#include "llvm/Support/Alignment.h"


@ -190,12 +190,14 @@
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/ReplaceConstant.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"


@ -22,7 +22,6 @@
namespace llvm {
class AsmPrinter;
class MCContext;
} // namespace llvm
class AMDGPUMCInstLower {
MCContext &Ctx;
@ -66,4 +65,5 @@ static inline const MCExpr *lowerAddrSpaceCast(const TargetMachine &TM,
return nullptr;
}
} // namespace
} // namespace llvm
#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUMCINSTLOWER_H


@ -90,6 +90,7 @@
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Passes/CodeGenPassBuilder.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/FormatVariadic.h"
@ -125,6 +126,44 @@ using namespace llvm;
using namespace llvm::PatternMatch;
namespace {
//===----------------------------------------------------------------------===//
// AMDGPU CodeGen Pass Builder interface.
//===----------------------------------------------------------------------===//
class AMDGPUCodeGenPassBuilder
: public CodeGenPassBuilder<AMDGPUCodeGenPassBuilder, GCNTargetMachine> {
using Base = CodeGenPassBuilder<AMDGPUCodeGenPassBuilder, GCNTargetMachine>;
public:
AMDGPUCodeGenPassBuilder(GCNTargetMachine &TM,
const CGPassBuilderOption &Opts,
PassInstrumentationCallbacks *PIC);
void addIRPasses(AddIRPass &) const;
void addCodeGenPrepare(AddIRPass &) const;
void addPreISel(AddIRPass &addPass) const;
void addILPOpts(AddMachinePass &) const;
void addAsmPrinter(AddMachinePass &, CreateMCStreamer) const;
Error addInstSelector(AddMachinePass &) const;
void addPreRewrite(AddMachinePass &) const;
void addMachineSSAOptimization(AddMachinePass &) const;
void addPostRegAlloc(AddMachinePass &) const;
void addPreEmitPass(AddMachinePass &) const;
void addPreEmitRegAlloc(AddMachinePass &) const;
Error addRegAssignmentOptimized(AddMachinePass &) const;
void addPreRegAlloc(AddMachinePass &) const;
void addOptimizedRegAlloc(AddMachinePass &) const;
void addPreSched2(AddMachinePass &) const;
/// Check if a pass is enabled given \p Opt option. The option always
/// overrides defaults if explicitly used. Otherwise its default will be used
/// given that a pass shall work at an optimization \p Level minimum.
bool isPassEnabled(const cl::opt<bool> &Opt,
CodeGenOptLevel Level = CodeGenOptLevel::Default) const;
void addEarlyCSEOrGVNPass(AddIRPass &) const;
void addStraightLineScalarOptimizationPasses(AddIRPass &) const;
};
class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> {
public:
SGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)


@ -18,7 +18,6 @@
#include "llvm/CodeGen/CodeGenTargetMachineImpl.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/Passes/CodeGenPassBuilder.h"
#include <optional>
#include <utility>
@ -158,44 +157,6 @@ public:
}
};
//===----------------------------------------------------------------------===//
// AMDGPU CodeGen Pass Builder interface.
//===----------------------------------------------------------------------===//
class AMDGPUCodeGenPassBuilder
: public CodeGenPassBuilder<AMDGPUCodeGenPassBuilder, GCNTargetMachine> {
using Base = CodeGenPassBuilder<AMDGPUCodeGenPassBuilder, GCNTargetMachine>;
public:
AMDGPUCodeGenPassBuilder(GCNTargetMachine &TM,
const CGPassBuilderOption &Opts,
PassInstrumentationCallbacks *PIC);
void addIRPasses(AddIRPass &) const;
void addCodeGenPrepare(AddIRPass &) const;
void addPreISel(AddIRPass &addPass) const;
void addILPOpts(AddMachinePass &) const;
void addAsmPrinter(AddMachinePass &, CreateMCStreamer) const;
Error addInstSelector(AddMachinePass &) const;
void addPreRewrite(AddMachinePass &) const;
void addMachineSSAOptimization(AddMachinePass &) const;
void addPostRegAlloc(AddMachinePass &) const;
void addPreEmitPass(AddMachinePass &) const;
void addPreEmitRegAlloc(AddMachinePass &) const;
Error addRegAssignmentOptimized(AddMachinePass &) const;
void addPreRegAlloc(AddMachinePass &) const;
void addOptimizedRegAlloc(AddMachinePass &) const;
void addPreSched2(AddMachinePass &) const;
/// Check if a pass is enabled given \p Opt option. The option always
/// overrides defaults if explicitly used. Otherwise its default will be used
/// given that a pass shall work at an optimization \p Level minimum.
bool isPassEnabled(const cl::opt<bool> &Opt,
CodeGenOptLevel Level = CodeGenOptLevel::Default) const;
void addEarlyCSEOrGVNPass(AddIRPass &) const;
void addStraightLineScalarOptimizationPasses(AddIRPass &) const;
};
} // end namespace llvm
#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETMACHINE_H


@ -21,6 +21,7 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/Passes/CodeGenPassBuilder.h"
using namespace llvm;


@ -20,6 +20,8 @@
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
using namespace llvm;
namespace {
class R600MCInstLower : public AMDGPUMCInstLower {
public:


@ -19,6 +19,7 @@
#include "R600MachineFunctionInfo.h"
#include "R600MachineScheduler.h"
#include "R600TargetTransformInfo.h"
#include "llvm/Passes/CodeGenPassBuilder.h"
#include "llvm/Transforms/Scalar.h"
#include <optional>
@ -46,6 +47,21 @@ static MachineSchedRegistry R600SchedRegistry("r600",
"Run R600's custom scheduler",
createR600MachineScheduler);
//===----------------------------------------------------------------------===//
// R600 CodeGen Pass Builder interface.
//===----------------------------------------------------------------------===//
class R600CodeGenPassBuilder
: public CodeGenPassBuilder<R600CodeGenPassBuilder, R600TargetMachine> {
public:
R600CodeGenPassBuilder(R600TargetMachine &TM, const CGPassBuilderOption &Opts,
PassInstrumentationCallbacks *PIC);
void addPreISel(AddIRPass &addPass) const;
void addAsmPrinter(AddMachinePass &, CreateMCStreamer) const;
Error addInstSelector(AddMachinePass &) const;
};
//===----------------------------------------------------------------------===//
// R600 Target Machine (R600 -> Cayman)
//===----------------------------------------------------------------------===//


@ -57,21 +57,6 @@ public:
createMachineScheduler(MachineSchedContext *C) const override;
};
//===----------------------------------------------------------------------===//
// R600 CodeGen Pass Builder interface.
//===----------------------------------------------------------------------===//
class R600CodeGenPassBuilder
: public CodeGenPassBuilder<R600CodeGenPassBuilder, R600TargetMachine> {
public:
R600CodeGenPassBuilder(R600TargetMachine &TM, const CGPassBuilderOption &Opts,
PassInstrumentationCallbacks *PIC);
void addPreISel(AddIRPass &addPass) const;
void addAsmPrinter(AddMachinePass &, CreateMCStreamer) const;
Error addInstSelector(AddMachinePass &) const;
};
} // end namespace llvm
#endif // LLVM_LIB_TARGET_AMDGPU_R600TARGETMACHINE_H


@ -16,12 +16,14 @@
#include "GCNSubtarget.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/InitializePasses.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"


@ -32,11 +32,15 @@
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachinePassManager.h"
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/IR/Dominators.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/DebugCounter.h"
#include "llvm/TargetParser/TargetParser.h"
using namespace llvm;
#define DEBUG_TYPE "si-insert-waitcnts"


@ -17,6 +17,7 @@
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachinePassManager.h"
#include "llvm/InitializePasses.h"
using namespace llvm;


@ -24,6 +24,7 @@
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/InitializePasses.h"
using namespace llvm;


@ -80,6 +80,7 @@
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/Dominators.h"
using namespace llvm;


@ -23,6 +23,7 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/CodeGen/VirtRegMap.h"
#include "llvm/InitializePasses.h"
using namespace llvm;


@ -2073,15 +2073,23 @@ Instruction *InstCombinerImpl::visitIntToPtr(IntToPtrInst &CI) {
}
// Replace (inttoptr (add (ptrtoint %Base), %Offset)) with
// (getelementptr i8, %Base, %Offset) if all users are ICmps.
// (getelementptr i8, %Base, %Offset) if the pointer is only used as integer
// value.
Value *Base;
Value *Offset;
auto UsesPointerAsInt = [](User *U) {
if (isa<ICmpInst, PtrToIntInst>(U))
return true;
if (auto *P = dyn_cast<PHINode>(U))
return P->hasOneUse() && isa<ICmpInst, PtrToIntInst>(*P->user_begin());
return false;
};
if (match(CI.getOperand(0),
m_OneUse(m_c_Add(m_PtrToIntSameSize(DL, m_Value(Base)),
m_Value(Offset)))) &&
CI.getType()->getPointerAddressSpace() ==
Base->getType()->getPointerAddressSpace() &&
all_of(CI.users(), IsaPred<ICmpInst>)) {
all_of(CI.users(), UsesPointerAsInt)) {
return GetElementPtrInst::Create(Builder.getInt8Ty(), Base, Offset);
}


@ -642,6 +642,13 @@ Value *ConstantOffsetExtractor::applyExts(Value *V) {
Instruction *Ext = I->clone();
Ext->setOperand(0, Current);
// In ConstantOffsetExtractor::find we do not analyze nuw/nsw for trunc, so
// we assume that it is ok to redistribute trunc over add/sub/or. But for
// example (add (trunc nuw A), (trunc nuw B)) is more poisonous than (trunc
// nuw (add A, B))). To make such redistributions legal we drop all the
// poison generating flags from cloned trunc instructions here.
if (isa<TruncInst>(Ext))
Ext->dropPoisonGeneratingFlags();
Ext->insertBefore(*IP->getParent(), IP);
Current = Ext;
}


@ -153,11 +153,7 @@ template <typename LTy, typename RTy> struct match_combine_or {
match_combine_or(const LTy &Left, const RTy &Right) : L(Left), R(Right) {}
template <typename ITy> bool match(ITy *V) const {
if (L.match(V))
return true;
if (R.match(V))
return true;
return false;
return L.match(V) || R.match(V);
}
};


@ -344,7 +344,7 @@ define amdgpu_ps void @fptrunc_f32_to_bf16(float %a, ptr %out) {
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
; GFX1250-NEXT: flat_store_b16 v[2:3], v0
; GFX1250-NEXT: flat_store_b16 v[2:3], v0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
entry:
%a.cvt = fptrunc float %a to bfloat
@ -380,7 +380,7 @@ define amdgpu_ps void @fptrunc_f32_to_bf16_abs(float %a, ptr %out) {
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, |v0|, s0
; GFX1250-NEXT: flat_store_b16 v[2:3], v0
; GFX1250-NEXT: flat_store_b16 v[2:3], v0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
entry:
%a.abs = call float @llvm.fabs.f32(float %a)
@ -417,7 +417,7 @@ define amdgpu_ps void @fptrunc_f32_to_bf16_neg(float %a, ptr %out) {
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, -v0, s0
; GFX1250-NEXT: flat_store_b16 v[2:3], v0
; GFX1250-NEXT: flat_store_b16 v[2:3], v0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
entry:
%a.neg = fneg float %a
@ -480,7 +480,7 @@ define amdgpu_ps void @fptrunc_f64_to_bf16(double %a, ptr %out) {
; GFX1250-NEXT: s_or_b32 vcc_lo, vcc_lo, s0
; GFX1250-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
; GFX1250-NEXT: flat_store_b16 v[2:3], v0
; GFX1250-NEXT: flat_store_b16 v[2:3], v0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
entry:
%a.cvt = fptrunc double %a to bfloat
@ -543,7 +543,7 @@ define amdgpu_ps void @fptrunc_f64_to_bf16_neg(double %a, ptr %out) {
; GFX1250-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
; GFX1250-NEXT: flat_store_b16 v[2:3], v0
; GFX1250-NEXT: flat_store_b16 v[2:3], v0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
entry:
%a.neg = fneg double %a
@ -607,7 +607,7 @@ define amdgpu_ps void @fptrunc_f64_to_bf16_abs(double %a, ptr %out) {
; GFX1250-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
; GFX1250-NEXT: flat_store_b16 v[2:3], v0
; GFX1250-NEXT: flat_store_b16 v[2:3], v0 scope:SCOPE_SE
; GFX1250-NEXT: s_endpgm
entry:
%a.abs = call double @llvm.fabs.f64(double %a)


@ -1582,28 +1582,22 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1
; SI-NEXT: s_nop 1
; SI-NEXT: v_div_fmas_f64 v[4:5], v[12:13], v[6:7], v[10:11]
; SI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
; SI-NEXT: v_readfirstlane_b32 s2, v5
; SI-NEXT: s_bfe_u32 s0, s2, 0xb0014
; SI-NEXT: s_add_i32 s3, s0, 0xfffffc01
; SI-NEXT: s_mov_b32 s1, 0xfffff
; SI-NEXT: s_mov_b32 s0, s6
; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], s3
; SI-NEXT: v_not_b32_e32 v6, s0
; SI-NEXT: v_and_b32_e32 v6, v4, v6
; SI-NEXT: v_not_b32_e32 v7, s1
; SI-NEXT: v_and_b32_e32 v5, v5, v7
; SI-NEXT: s_and_b32 s0, s2, 0x80000000
; SI-NEXT: s_cmp_lt_i32 s3, 0
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc
; SI-NEXT: v_mov_b32_e32 v7, s0
; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
; SI-NEXT: s_cmp_gt_i32 s3, 51
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_mov_b32_e32 v7, s2
; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
; SI-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; SI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
; SI-NEXT: v_readfirstlane_b32 s0, v4
; SI-NEXT: v_readfirstlane_b32 s1, v5
; SI-NEXT: s_bfe_u32 s2, s1, 0xb0014
; SI-NEXT: s_add_i32 s8, s2, 0xfffffc01
; SI-NEXT: s_mov_b32 s3, 0xfffff
; SI-NEXT: s_mov_b32 s2, s6
; SI-NEXT: s_lshr_b64 s[2:3], s[2:3], s8
; SI-NEXT: s_andn2_b64 s[2:3], s[0:1], s[2:3]
; SI-NEXT: s_and_b32 s9, s1, 0x80000000
; SI-NEXT: s_cmp_lt_i32 s8, 0
; SI-NEXT: s_cselect_b32 s2, 0, s2
; SI-NEXT: s_cselect_b32 s3, s9, s3
; SI-NEXT: s_cmp_gt_i32 s8, 51
; SI-NEXT: s_cselect_b32 s1, s1, s3
; SI-NEXT: s_cselect_b32 s0, s0, s2
; SI-NEXT: v_fma_f64 v[0:1], -s[0:1], v[2:3], v[0:1]
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@ -1859,28 +1853,22 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1)
; SI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
; SI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
; SI-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
; SI-NEXT: v_readfirstlane_b32 s6, v5
; SI-NEXT: s_bfe_u32 s4, s6, 0xb0014
; SI-NEXT: s_add_i32 s7, s4, 0xfffffc01
; SI-NEXT: s_mov_b32 s5, 0xfffff
; SI-NEXT: s_mov_b32 s4, s2
; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], s7
; SI-NEXT: v_not_b32_e32 v6, s4
; SI-NEXT: v_and_b32_e32 v6, v4, v6
; SI-NEXT: v_not_b32_e32 v7, s5
; SI-NEXT: v_and_b32_e32 v5, v5, v7
; SI-NEXT: s_and_b32 s4, s6, 0x80000000
; SI-NEXT: s_cmp_lt_i32 s7, 0
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc
; SI-NEXT: v_mov_b32_e32 v7, s4
; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
; SI-NEXT: s_cmp_gt_i32 s7, 51
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_mov_b32_e32 v7, s6
; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
; SI-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; SI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
; SI-NEXT: v_readfirstlane_b32 s4, v4
; SI-NEXT: v_readfirstlane_b32 s5, v5
; SI-NEXT: s_bfe_u32 s6, s5, 0xb0014
; SI-NEXT: s_add_i32 s8, s6, 0xfffffc01
; SI-NEXT: s_mov_b32 s7, 0xfffff
; SI-NEXT: s_mov_b32 s6, s2
; SI-NEXT: s_lshr_b64 s[6:7], s[6:7], s8
; SI-NEXT: s_andn2_b64 s[6:7], s[4:5], s[6:7]
; SI-NEXT: s_and_b32 s9, s5, 0x80000000
; SI-NEXT: s_cmp_lt_i32 s8, 0
; SI-NEXT: s_cselect_b32 s6, 0, s6
; SI-NEXT: s_cselect_b32 s7, s9, s7
; SI-NEXT: s_cmp_gt_i32 s8, 51
; SI-NEXT: s_cselect_b32 s5, s5, s7
; SI-NEXT: s_cselect_b32 s4, s4, s6
; SI-NEXT: v_fma_f64 v[0:1], -s[4:5], v[2:3], v[0:1]
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
@ -2109,28 +2097,22 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(
; SI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
; SI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
; SI-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
; SI-NEXT: v_readfirstlane_b32 s6, v5
; SI-NEXT: s_bfe_u32 s4, s6, 0xb0014
; SI-NEXT: s_add_i32 s7, s4, 0xfffffc01
; SI-NEXT: s_mov_b32 s5, 0xfffff
; SI-NEXT: s_mov_b32 s4, s2
; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], s7
; SI-NEXT: v_not_b32_e32 v6, s4
; SI-NEXT: v_and_b32_e32 v6, v4, v6
; SI-NEXT: v_not_b32_e32 v7, s5
; SI-NEXT: v_and_b32_e32 v5, v5, v7
; SI-NEXT: s_and_b32 s4, s6, 0x80000000
; SI-NEXT: s_cmp_lt_i32 s7, 0
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc
; SI-NEXT: v_mov_b32_e32 v7, s4
; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
; SI-NEXT: s_cmp_gt_i32 s7, 51
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_mov_b32_e32 v7, s6
; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
; SI-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; SI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
; SI-NEXT: v_readfirstlane_b32 s4, v4
; SI-NEXT: v_readfirstlane_b32 s5, v5
; SI-NEXT: s_bfe_u32 s6, s5, 0xb0014
; SI-NEXT: s_add_i32 s8, s6, 0xfffffc01
; SI-NEXT: s_mov_b32 s7, 0xfffff
; SI-NEXT: s_mov_b32 s6, s2
; SI-NEXT: s_lshr_b64 s[6:7], s[6:7], s8
; SI-NEXT: s_andn2_b64 s[6:7], s[4:5], s[6:7]
; SI-NEXT: s_and_b32 s9, s5, 0x80000000
; SI-NEXT: s_cmp_lt_i32 s8, 0
; SI-NEXT: s_cselect_b32 s6, 0, s6
; SI-NEXT: s_cselect_b32 s7, s9, s7
; SI-NEXT: s_cmp_gt_i32 s8, 51
; SI-NEXT: s_cselect_b32 s5, s5, s7
; SI-NEXT: s_cselect_b32 s4, s4, s6
; SI-NEXT: v_fma_f64 v[0:1], -s[4:5], v[2:3], v[0:1]
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
@ -5251,27 +5233,22 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: s_nop 1
; SI-NEXT: v_div_fmas_f64 v[8:9], v[16:17], v[10:11], v[14:15]
; SI-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
; SI-NEXT: v_readfirstlane_b32 s8, v9
; SI-NEXT: s_bfe_u32 s0, s8, 0xb0014
; SI-NEXT: s_add_i32 s9, s0, 0xfffffc01
; SI-NEXT: v_readfirstlane_b32 s0, v8
; SI-NEXT: v_readfirstlane_b32 s1, v9
; SI-NEXT: s_bfe_u32 s2, s1, 0xb0014
; SI-NEXT: s_add_i32 s10, s2, 0xfffffc01
; SI-NEXT: s_mov_b32 s3, 0xfffff
; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s9
; SI-NEXT: v_not_b32_e32 v10, s0
; SI-NEXT: v_and_b32_e32 v10, v8, v10
; SI-NEXT: v_not_b32_e32 v11, s1
; SI-NEXT: v_and_b32_e32 v9, v9, v11
; SI-NEXT: s_and_b32 s0, s8, 0x80000000
; SI-NEXT: s_cmp_lt_i32 s9, 0
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_cndmask_b32_e64 v10, v10, 0, vcc
; SI-NEXT: v_mov_b32_e32 v11, s0
; SI-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc
; SI-NEXT: s_cmp_gt_i32 s9, 51
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_mov_b32_e32 v11, s8
; SI-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc
; SI-NEXT: v_cndmask_b32_e32 v8, v10, v8, vcc
; SI-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
; SI-NEXT: s_mov_b32 s2, s6
; SI-NEXT: s_lshr_b64 s[8:9], s[2:3], s10
; SI-NEXT: s_andn2_b64 s[8:9], s[0:1], s[8:9]
; SI-NEXT: s_and_b32 s11, s1, 0x80000000
; SI-NEXT: s_cmp_lt_i32 s10, 0
; SI-NEXT: s_cselect_b32 s8, 0, s8
; SI-NEXT: s_cselect_b32 s9, s11, s9
; SI-NEXT: s_cmp_gt_i32 s10, 51
; SI-NEXT: s_cselect_b32 s1, s1, s9
; SI-NEXT: s_cselect_b32 s0, s0, s8
; SI-NEXT: v_fma_f64 v[2:3], -s[0:1], v[6:7], v[2:3]
; SI-NEXT: v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[0:1]
; SI-NEXT: v_rcp_f64_e32 v[8:9], v[6:7]
; SI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
@ -5287,26 +5264,20 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: s_nop 1
; SI-NEXT: v_div_fmas_f64 v[6:7], v[14:15], v[8:9], v[12:13]
; SI-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
; SI-NEXT: v_readfirstlane_b32 s8, v7
; SI-NEXT: s_bfe_u32 s0, s8, 0xb0014
; SI-NEXT: s_add_i32 s9, s0, 0xfffffc01
; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s9
; SI-NEXT: v_not_b32_e32 v8, s0
; SI-NEXT: v_and_b32_e32 v8, v6, v8
; SI-NEXT: v_not_b32_e32 v9, s1
; SI-NEXT: v_and_b32_e32 v7, v7, v9
; SI-NEXT: s_and_b32 s0, s8, 0x80000000
; SI-NEXT: s_cmp_lt_i32 s9, 0
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_cndmask_b32_e64 v8, v8, 0, vcc
; SI-NEXT: v_mov_b32_e32 v9, s0
; SI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
; SI-NEXT: s_cmp_gt_i32 s9, 51
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_mov_b32_e32 v9, s8
; SI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
; SI-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc
; SI-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
; SI-NEXT: v_readfirstlane_b32 s0, v6
; SI-NEXT: v_readfirstlane_b32 s1, v7
; SI-NEXT: s_bfe_u32 s8, s1, 0xb0014
; SI-NEXT: s_addk_i32 s8, 0xfc01
; SI-NEXT: s_lshr_b64 s[2:3], s[2:3], s8
; SI-NEXT: s_andn2_b64 s[2:3], s[0:1], s[2:3]
; SI-NEXT: s_and_b32 s9, s1, 0x80000000
; SI-NEXT: s_cmp_lt_i32 s8, 0
; SI-NEXT: s_cselect_b32 s2, 0, s2
; SI-NEXT: s_cselect_b32 s3, s9, s3
; SI-NEXT: s_cmp_gt_i32 s8, 51
; SI-NEXT: s_cselect_b32 s1, s1, s3
; SI-NEXT: s_cselect_b32 s0, s0, s2
; SI-NEXT: v_fma_f64 v[0:1], -s[0:1], v[4:5], v[0:1]
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_endpgm
;


@@ -18,6 +18,16 @@ define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: gather4_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_mov_b32 s12, exec_lo
; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-NEXT: image_gather4 v[0:3], v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-TRUE16-LABEL: gather4_2d:
; GFX11-TRUE16: ; %bb.0: ; %main_body
; GFX11-TRUE16-NEXT: s_mov_b32 s12, exec_lo
@@ -74,6 +84,16 @@ define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: gather4_cube:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_mov_b32 s12, exec_lo
; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-NEXT: image_gather4 v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-TRUE16-LABEL: gather4_cube:
; GFX11-TRUE16: ; %bb.0: ; %main_body
; GFX11-TRUE16-NEXT: s_mov_b32 s12, exec_lo
@@ -132,6 +152,16 @@ define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: gather4_2darray:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_mov_b32 s12, exec_lo
; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-NEXT: image_gather4 v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-TRUE16-LABEL: gather4_2darray:
; GFX11-TRUE16: ; %bb.0: ; %main_body
; GFX11-TRUE16-NEXT: s_mov_b32 s12, exec_lo
@@ -190,6 +220,16 @@ define amdgpu_ps <4 x float> @gather4_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: gather4_c_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_mov_b32 s12, exec_lo
; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100
; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-NEXT: image_gather4_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-TRUE16-LABEL: gather4_c_2d:
; GFX11-TRUE16: ; %bb.0: ; %main_body
; GFX11-TRUE16-NEXT: s_mov_b32 s12, exec_lo
@@ -246,6 +286,16 @@ define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: gather4_cl_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_mov_b32 s12, exec_lo
; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-NEXT: image_gather4_cl v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-TRUE16-LABEL: gather4_cl_2d:
; GFX11-TRUE16: ; %bb.0: ; %main_body
; GFX11-TRUE16-NEXT: s_mov_b32 s12, exec_lo
@@ -306,6 +356,16 @@ define amdgpu_ps <4 x float> @gather4_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: gather4_c_cl_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_mov_b32 s12, exec_lo
; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100
; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-NEXT: image_gather4_c_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-TRUE16-LABEL: gather4_c_cl_2d:
; GFX11-TRUE16: ; %bb.0: ; %main_body
; GFX11-TRUE16-NEXT: s_mov_b32 s12, exec_lo
@@ -362,6 +422,16 @@ define amdgpu_ps <4 x float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: gather4_b_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_mov_b32 s12, exec_lo
; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100
; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-NEXT: image_gather4_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-TRUE16-LABEL: gather4_b_2d:
; GFX11-TRUE16: ; %bb.0: ; %main_body
; GFX11-TRUE16-NEXT: s_mov_b32 s12, exec_lo
@@ -418,6 +488,16 @@ define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: gather4_c_b_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_mov_b32 s12, exec_lo
; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100
; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-NEXT: image_gather4_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-TRUE16-LABEL: gather4_c_b_2d:
; GFX11-TRUE16: ; %bb.0: ; %main_body
; GFX11-TRUE16-NEXT: s_mov_b32 s12, exec_lo
@@ -476,6 +556,16 @@ define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: gather4_b_cl_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_mov_b32 s12, exec_lo
; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100
; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-NEXT: image_gather4_b_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-TRUE16-LABEL: gather4_b_cl_2d:
; GFX11-TRUE16: ; %bb.0: ; %main_body
; GFX11-TRUE16-NEXT: s_mov_b32 s12, exec_lo
@@ -538,6 +628,16 @@ define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: gather4_c_b_cl_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_mov_b32 s12, exec_lo
; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100
; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-NEXT: image_gather4_c_b_cl v[0:3], [v0, v1, v2, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-TRUE16-LABEL: gather4_c_b_cl_2d:
; GFX11-TRUE16: ; %bb.0: ; %main_body
; GFX11-TRUE16-NEXT: s_mov_b32 s12, exec_lo
@@ -591,6 +691,13 @@ define amdgpu_ps <4 x float> @gather4_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: gather4_l_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX10-NEXT: image_gather4_l v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-TRUE16-LABEL: gather4_l_2d:
; GFX11-TRUE16: ; %bb.0: ; %main_body
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l
@@ -636,6 +743,13 @@ define amdgpu_ps <4 x float> @gather4_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: gather4_c_l_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100
; GFX10-NEXT: image_gather4_c_l v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-TRUE16-LABEL: gather4_c_l_2d:
; GFX11-TRUE16: ; %bb.0: ; %main_body
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l
@@ -677,6 +791,13 @@ define amdgpu_ps <4 x float> @gather4_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: gather4_lz_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX10-NEXT: image_gather4_lz v[0:3], v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-TRUE16-LABEL: gather4_lz_2d:
; GFX11-TRUE16: ; %bb.0: ; %main_body
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
@@ -718,6 +839,13 @@ define amdgpu_ps <4 x float> @gather4_c_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: gather4_c_lz_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100
; GFX10-NEXT: image_gather4_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-TRUE16-LABEL: gather4_c_lz_2d:
; GFX11-TRUE16: ; %bb.0: ; %main_body
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l
@@ -773,5 +901,4 @@ attributes #0 = { nounwind }
attributes #1 = { nounwind readonly }
attributes #2 = { nounwind readnone }
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX10: {{.*}}
; GFX12: {{.*}}


@@ -76,12 +76,13 @@ define amdgpu_kernel void @v_round_f64(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_movk_i32 s4, 0xfc01
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s3, 0xfffff
; SI-NEXT: v_mov_b32_e32 v8, 0x3ff00000
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_bfe_u32 v4, v3, 20, 11
; SI-NEXT: v_add_i32_e32 v6, vcc, 0xfffffc01, v4
; SI-NEXT: v_add_i32_e32 v6, vcc, s4, v4
; SI-NEXT: v_lshr_b64 v[4:5], s[2:3], v6
; SI-NEXT: v_and_b32_e32 v7, 0x80000000, v3
; SI-NEXT: v_not_b32_e32 v5, v5


@@ -0,0 +1,95 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s
define void @vec_reduce_and_v32i8(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_and_v32i8:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 78
; CHECK-NEXT: xvshuf4i.b $xr1, $xr1, 228
; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvbsrl.v $xr1, $xr1, 8
; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvsrli.d $xr1, $xr1, 32
; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvshuf4i.b $xr1, $xr1, 14
; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvrepl128vei.b $xr1, $xr1, 1
; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvstelm.b $xr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <32 x i8>, ptr %src
%res = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> %v)
store i8 %res, ptr %dst
ret void
}
define void @vec_reduce_and_v16i16(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_and_v16i16:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 78
; CHECK-NEXT: xvshuf4i.h $xr1, $xr1, 228
; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvbsrl.v $xr1, $xr1, 8
; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvshuf4i.h $xr1, $xr1, 14
; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvrepl128vei.h $xr1, $xr1, 1
; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvstelm.h $xr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <16 x i16>, ptr %src
%res = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> %v)
store i16 %res, ptr %dst
ret void
}
define void @vec_reduce_and_v8i32(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_and_v8i32:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 78
; CHECK-NEXT: xvshuf4i.w $xr1, $xr1, 228
; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvshuf4i.w $xr1, $xr1, 14
; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvrepl128vei.w $xr1, $xr1, 1
; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvstelm.w $xr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <8 x i32>, ptr %src
%res = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> %v)
store i32 %res, ptr %dst
ret void
}
define void @vec_reduce_and_v4i64(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_and_v4i64:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_0)
; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI3_0)
; CHECK-NEXT: xvpermi.d $xr2, $xr0, 78
; CHECK-NEXT: xvshuf.d $xr1, $xr0, $xr2
; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvrepl128vei.d $xr1, $xr1, 1
; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvstelm.d $xr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <4 x i64>, ptr %src
%res = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> %v)
store i64 %res, ptr %dst
ret void
}


@@ -0,0 +1,95 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s
define void @vec_reduce_or_v32i8(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_or_v32i8:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 78
; CHECK-NEXT: xvshuf4i.b $xr1, $xr1, 228
; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvbsrl.v $xr1, $xr1, 8
; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvsrli.d $xr1, $xr1, 32
; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvshuf4i.b $xr1, $xr1, 14
; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvrepl128vei.b $xr1, $xr1, 1
; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvstelm.b $xr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <32 x i8>, ptr %src
%res = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> %v)
store i8 %res, ptr %dst
ret void
}
define void @vec_reduce_or_v16i16(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_or_v16i16:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 78
; CHECK-NEXT: xvshuf4i.h $xr1, $xr1, 228
; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvbsrl.v $xr1, $xr1, 8
; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvshuf4i.h $xr1, $xr1, 14
; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvrepl128vei.h $xr1, $xr1, 1
; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvstelm.h $xr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <16 x i16>, ptr %src
%res = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> %v)
store i16 %res, ptr %dst
ret void
}
define void @vec_reduce_or_v8i32(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_or_v8i32:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 78
; CHECK-NEXT: xvshuf4i.w $xr1, $xr1, 228
; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvshuf4i.w $xr1, $xr1, 14
; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvrepl128vei.w $xr1, $xr1, 1
; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvstelm.w $xr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <8 x i32>, ptr %src
%res = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %v)
store i32 %res, ptr %dst
ret void
}
define void @vec_reduce_or_v4i64(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_or_v4i64:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_0)
; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI3_0)
; CHECK-NEXT: xvpermi.d $xr2, $xr0, 78
; CHECK-NEXT: xvshuf.d $xr1, $xr0, $xr2
; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvrepl128vei.d $xr1, $xr1, 1
; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvstelm.d $xr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <4 x i64>, ptr %src
%res = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %v)
store i64 %res, ptr %dst
ret void
}


@@ -0,0 +1,95 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s
define void @vec_reduce_smax_v32i8(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_smax_v32i8:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 78
; CHECK-NEXT: xvshuf4i.b $xr1, $xr1, 228
; CHECK-NEXT: xvmax.b $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvbsrl.v $xr1, $xr1, 8
; CHECK-NEXT: xvmax.b $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvsrli.d $xr1, $xr1, 32
; CHECK-NEXT: xvmax.b $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvshuf4i.b $xr1, $xr1, 14
; CHECK-NEXT: xvmax.b $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvrepl128vei.b $xr1, $xr1, 1
; CHECK-NEXT: xvmax.b $xr0, $xr0, $xr1
; CHECK-NEXT: xvstelm.b $xr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <32 x i8>, ptr %src
%res = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> %v)
store i8 %res, ptr %dst
ret void
}
define void @vec_reduce_smax_v16i16(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_smax_v16i16:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 78
; CHECK-NEXT: xvshuf4i.h $xr1, $xr1, 228
; CHECK-NEXT: xvmax.h $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvbsrl.v $xr1, $xr1, 8
; CHECK-NEXT: xvmax.h $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvshuf4i.h $xr1, $xr1, 14
; CHECK-NEXT: xvmax.h $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvrepl128vei.h $xr1, $xr1, 1
; CHECK-NEXT: xvmax.h $xr0, $xr0, $xr1
; CHECK-NEXT: xvstelm.h $xr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <16 x i16>, ptr %src
%res = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> %v)
store i16 %res, ptr %dst
ret void
}
define void @vec_reduce_smax_v8i32(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_smax_v8i32:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 78
; CHECK-NEXT: xvshuf4i.w $xr1, $xr1, 228
; CHECK-NEXT: xvmax.w $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvshuf4i.w $xr1, $xr1, 14
; CHECK-NEXT: xvmax.w $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvrepl128vei.w $xr1, $xr1, 1
; CHECK-NEXT: xvmax.w $xr0, $xr0, $xr1
; CHECK-NEXT: xvstelm.w $xr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <8 x i32>, ptr %src
%res = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> %v)
store i32 %res, ptr %dst
ret void
}
define void @vec_reduce_smax_v4i64(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_smax_v4i64:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_0)
; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI3_0)
; CHECK-NEXT: xvpermi.d $xr2, $xr0, 78
; CHECK-NEXT: xvshuf.d $xr1, $xr0, $xr2
; CHECK-NEXT: xvmax.d $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvrepl128vei.d $xr1, $xr1, 1
; CHECK-NEXT: xvmax.d $xr0, $xr0, $xr1
; CHECK-NEXT: xvstelm.d $xr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <4 x i64>, ptr %src
%res = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> %v)
store i64 %res, ptr %dst
ret void
}


@@ -0,0 +1,95 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s
define void @vec_reduce_smin_v32i8(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_smin_v32i8:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 78
; CHECK-NEXT: xvshuf4i.b $xr1, $xr1, 228
; CHECK-NEXT: xvmin.b $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvbsrl.v $xr1, $xr1, 8
; CHECK-NEXT: xvmin.b $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvsrli.d $xr1, $xr1, 32
; CHECK-NEXT: xvmin.b $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvshuf4i.b $xr1, $xr1, 14
; CHECK-NEXT: xvmin.b $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvrepl128vei.b $xr1, $xr1, 1
; CHECK-NEXT: xvmin.b $xr0, $xr0, $xr1
; CHECK-NEXT: xvstelm.b $xr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <32 x i8>, ptr %src
%res = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> %v)
store i8 %res, ptr %dst
ret void
}
define void @vec_reduce_smin_v16i16(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_smin_v16i16:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 78
; CHECK-NEXT: xvshuf4i.h $xr1, $xr1, 228
; CHECK-NEXT: xvmin.h $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvbsrl.v $xr1, $xr1, 8
; CHECK-NEXT: xvmin.h $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvshuf4i.h $xr1, $xr1, 14
; CHECK-NEXT: xvmin.h $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvrepl128vei.h $xr1, $xr1, 1
; CHECK-NEXT: xvmin.h $xr0, $xr0, $xr1
; CHECK-NEXT: xvstelm.h $xr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <16 x i16>, ptr %src
%res = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> %v)
store i16 %res, ptr %dst
ret void
}
define void @vec_reduce_smin_v8i32(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_smin_v8i32:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 78
; CHECK-NEXT: xvshuf4i.w $xr1, $xr1, 228
; CHECK-NEXT: xvmin.w $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvshuf4i.w $xr1, $xr1, 14
; CHECK-NEXT: xvmin.w $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvrepl128vei.w $xr1, $xr1, 1
; CHECK-NEXT: xvmin.w $xr0, $xr0, $xr1
; CHECK-NEXT: xvstelm.w $xr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <8 x i32>, ptr %src
%res = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> %v)
store i32 %res, ptr %dst
ret void
}
define void @vec_reduce_smin_v4i64(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_smin_v4i64:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_0)
; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI3_0)
; CHECK-NEXT: xvpermi.d $xr2, $xr0, 78
; CHECK-NEXT: xvshuf.d $xr1, $xr0, $xr2
; CHECK-NEXT: xvmin.d $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvrepl128vei.d $xr1, $xr1, 1
; CHECK-NEXT: xvmin.d $xr0, $xr0, $xr1
; CHECK-NEXT: xvstelm.d $xr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <4 x i64>, ptr %src
%res = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> %v)
store i64 %res, ptr %dst
ret void
}


@@ -0,0 +1,95 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s
define void @vec_reduce_umax_v32i8(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_umax_v32i8:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 78
; CHECK-NEXT: xvshuf4i.b $xr1, $xr1, 228
; CHECK-NEXT: xvmax.bu $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvbsrl.v $xr1, $xr1, 8
; CHECK-NEXT: xvmax.bu $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvsrli.d $xr1, $xr1, 32
; CHECK-NEXT: xvmax.bu $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvshuf4i.b $xr1, $xr1, 14
; CHECK-NEXT: xvmax.bu $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvrepl128vei.b $xr1, $xr1, 1
; CHECK-NEXT: xvmax.bu $xr0, $xr0, $xr1
; CHECK-NEXT: xvstelm.b $xr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <32 x i8>, ptr %src
%res = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> %v)
store i8 %res, ptr %dst
ret void
}
define void @vec_reduce_umax_v16i16(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_umax_v16i16:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 78
; CHECK-NEXT: xvshuf4i.h $xr1, $xr1, 228
; CHECK-NEXT: xvmax.hu $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvbsrl.v $xr1, $xr1, 8
; CHECK-NEXT: xvmax.hu $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvshuf4i.h $xr1, $xr1, 14
; CHECK-NEXT: xvmax.hu $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvrepl128vei.h $xr1, $xr1, 1
; CHECK-NEXT: xvmax.hu $xr0, $xr0, $xr1
; CHECK-NEXT: xvstelm.h $xr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <16 x i16>, ptr %src
%res = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> %v)
store i16 %res, ptr %dst
ret void
}
define void @vec_reduce_umax_v8i32(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_umax_v8i32:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 78
; CHECK-NEXT: xvshuf4i.w $xr1, $xr1, 228
; CHECK-NEXT: xvmax.wu $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvshuf4i.w $xr1, $xr1, 14
; CHECK-NEXT: xvmax.wu $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvrepl128vei.w $xr1, $xr1, 1
; CHECK-NEXT: xvmax.wu $xr0, $xr0, $xr1
; CHECK-NEXT: xvstelm.w $xr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <8 x i32>, ptr %src
%res = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> %v)
store i32 %res, ptr %dst
ret void
}
define void @vec_reduce_umax_v4i64(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_umax_v4i64:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_0)
; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI3_0)
; CHECK-NEXT: xvpermi.d $xr2, $xr0, 78
; CHECK-NEXT: xvshuf.d $xr1, $xr0, $xr2
; CHECK-NEXT: xvmax.du $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvrepl128vei.d $xr1, $xr1, 1
; CHECK-NEXT: xvmax.du $xr0, $xr0, $xr1
; CHECK-NEXT: xvstelm.d $xr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <4 x i64>, ptr %src
%res = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> %v)
store i64 %res, ptr %dst
ret void
}


@@ -0,0 +1,95 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s
define void @vec_reduce_umin_v32i8(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_umin_v32i8:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 78
; CHECK-NEXT: xvshuf4i.b $xr1, $xr1, 228
; CHECK-NEXT: xvmin.bu $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvbsrl.v $xr1, $xr1, 8
; CHECK-NEXT: xvmin.bu $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvsrli.d $xr1, $xr1, 32
; CHECK-NEXT: xvmin.bu $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvshuf4i.b $xr1, $xr1, 14
; CHECK-NEXT: xvmin.bu $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvrepl128vei.b $xr1, $xr1, 1
; CHECK-NEXT: xvmin.bu $xr0, $xr0, $xr1
; CHECK-NEXT: xvstelm.b $xr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <32 x i8>, ptr %src
%res = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> %v)
store i8 %res, ptr %dst
ret void
}
define void @vec_reduce_umin_v16i16(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_umin_v16i16:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 78
; CHECK-NEXT: xvshuf4i.h $xr1, $xr1, 228
; CHECK-NEXT: xvmin.hu $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvbsrl.v $xr1, $xr1, 8
; CHECK-NEXT: xvmin.hu $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvshuf4i.h $xr1, $xr1, 14
; CHECK-NEXT: xvmin.hu $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvrepl128vei.h $xr1, $xr1, 1
; CHECK-NEXT: xvmin.hu $xr0, $xr0, $xr1
; CHECK-NEXT: xvstelm.h $xr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <16 x i16>, ptr %src
%res = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> %v)
store i16 %res, ptr %dst
ret void
}
define void @vec_reduce_umin_v8i32(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_umin_v8i32:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 78
; CHECK-NEXT: xvshuf4i.w $xr1, $xr1, 228
; CHECK-NEXT: xvmin.wu $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvshuf4i.w $xr1, $xr1, 14
; CHECK-NEXT: xvmin.wu $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvrepl128vei.w $xr1, $xr1, 1
; CHECK-NEXT: xvmin.wu $xr0, $xr0, $xr1
; CHECK-NEXT: xvstelm.w $xr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <8 x i32>, ptr %src
%res = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> %v)
store i32 %res, ptr %dst
ret void
}
define void @vec_reduce_umin_v4i64(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_umin_v4i64:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_0)
; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI3_0)
; CHECK-NEXT: xvpermi.d $xr2, $xr0, 78
; CHECK-NEXT: xvshuf.d $xr1, $xr0, $xr2
; CHECK-NEXT: xvmin.du $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvrepl128vei.d $xr1, $xr1, 1
; CHECK-NEXT: xvmin.du $xr0, $xr0, $xr1
; CHECK-NEXT: xvstelm.d $xr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <4 x i64>, ptr %src
%res = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> %v)
store i64 %res, ptr %dst
ret void
}


@@ -0,0 +1,95 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s
define void @vec_reduce_xor_v32i8(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_xor_v32i8:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 78
; CHECK-NEXT: xvshuf4i.b $xr1, $xr1, 228
; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvbsrl.v $xr1, $xr1, 8
; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvsrli.d $xr1, $xr1, 32
; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvshuf4i.b $xr1, $xr1, 14
; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvrepl128vei.b $xr1, $xr1, 1
; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvstelm.b $xr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <32 x i8>, ptr %src
%res = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> %v)
store i8 %res, ptr %dst
ret void
}
define void @vec_reduce_xor_v16i16(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_xor_v16i16:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 78
; CHECK-NEXT: xvshuf4i.h $xr1, $xr1, 228
; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvbsrl.v $xr1, $xr1, 8
; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvshuf4i.h $xr1, $xr1, 14
; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvrepl128vei.h $xr1, $xr1, 1
; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvstelm.h $xr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <16 x i16>, ptr %src
%res = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> %v)
store i16 %res, ptr %dst
ret void
}
define void @vec_reduce_xor_v8i32(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_xor_v8i32:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 78
; CHECK-NEXT: xvshuf4i.w $xr1, $xr1, 228
; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvshuf4i.w $xr1, $xr1, 14
; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvrepl128vei.w $xr1, $xr1, 1
; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvstelm.w $xr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <8 x i32>, ptr %src
%res = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> %v)
store i32 %res, ptr %dst
ret void
}
define void @vec_reduce_xor_v4i64(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_xor_v4i64:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_0)
; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI3_0)
; CHECK-NEXT: xvpermi.d $xr2, $xr0, 78
; CHECK-NEXT: xvshuf.d $xr1, $xr0, $xr2
; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvrepl128vei.d $xr1, $xr1, 1
; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvstelm.d $xr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <4 x i64>, ptr %src
%res = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> %v)
store i64 %res, ptr %dst
ret void
}


@@ -0,0 +1,168 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s
define void @vec_reduce_and_v16i8(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_and_v16i8:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8
; CHECK-NEXT: vand.v $vr0, $vr0, $vr1
; CHECK-NEXT: vsrli.d $vr1, $vr0, 32
; CHECK-NEXT: vand.v $vr0, $vr0, $vr1
; CHECK-NEXT: vshuf4i.b $vr1, $vr0, 14
; CHECK-NEXT: vand.v $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.b $vr1, $vr0, 1
; CHECK-NEXT: vand.v $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <16 x i8>, ptr %src
%res = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> %v)
store i8 %res, ptr %dst
ret void
}
define void @vec_reduce_and_v8i8(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_and_v8i8:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.d $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0
; CHECK-NEXT: vsrli.d $vr1, $vr0, 32
; CHECK-NEXT: vand.v $vr0, $vr0, $vr1
; CHECK-NEXT: vshuf4i.b $vr1, $vr0, 14
; CHECK-NEXT: vand.v $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.b $vr1, $vr0, 1
; CHECK-NEXT: vand.v $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <8 x i8>, ptr %src
%res = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> %v)
store i8 %res, ptr %dst
ret void
}
define void @vec_reduce_and_v4i8(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_and_v4i8:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.w $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 0
; CHECK-NEXT: vshuf4i.b $vr1, $vr0, 14
; CHECK-NEXT: vand.v $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.b $vr1, $vr0, 1
; CHECK-NEXT: vand.v $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <4 x i8>, ptr %src
%res = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> %v)
store i8 %res, ptr %dst
ret void
}
define void @vec_reduce_and_v2i8(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_and_v2i8:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.h $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.h $vr0, $a0, 0
; CHECK-NEXT: vreplvei.b $vr1, $vr0, 1
; CHECK-NEXT: vand.v $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <2 x i8>, ptr %src
%res = call i8 @llvm.vector.reduce.and.v2i8(<2 x i8> %v)
store i8 %res, ptr %dst
ret void
}
define void @vec_reduce_and_v8i16(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_and_v8i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8
; CHECK-NEXT: vand.v $vr0, $vr0, $vr1
; CHECK-NEXT: vshuf4i.h $vr1, $vr0, 14
; CHECK-NEXT: vand.v $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.h $vr1, $vr0, 1
; CHECK-NEXT: vand.v $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <8 x i16>, ptr %src
%res = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> %v)
store i16 %res, ptr %dst
ret void
}
define void @vec_reduce_and_v4i16(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_and_v4i16:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.d $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0
; CHECK-NEXT: vshuf4i.h $vr1, $vr0, 14
; CHECK-NEXT: vand.v $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.h $vr1, $vr0, 1
; CHECK-NEXT: vand.v $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <4 x i16>, ptr %src
%res = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> %v)
store i16 %res, ptr %dst
ret void
}
define void @vec_reduce_and_v2i16(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_and_v2i16:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.w $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 0
; CHECK-NEXT: vreplvei.h $vr1, $vr0, 1
; CHECK-NEXT: vand.v $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <2 x i16>, ptr %src
%res = call i16 @llvm.vector.reduce.and.v2i16(<2 x i16> %v)
store i16 %res, ptr %dst
ret void
}
define void @vec_reduce_and_v4i32(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_and_v4i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vshuf4i.w $vr1, $vr0, 14
; CHECK-NEXT: vand.v $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.w $vr1, $vr0, 1
; CHECK-NEXT: vand.v $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <4 x i32>, ptr %src
%res = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %v)
store i32 %res, ptr %dst
ret void
}
define void @vec_reduce_and_v2i32(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_and_v2i32:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.d $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0
; CHECK-NEXT: vreplvei.w $vr1, $vr0, 1
; CHECK-NEXT: vand.v $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <2 x i32>, ptr %src
%res = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> %v)
store i32 %res, ptr %dst
ret void
}
define void @vec_reduce_and_v2i64(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_and_v2i64:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vreplvei.d $vr1, $vr0, 1
; CHECK-NEXT: vand.v $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.d $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <2 x i64>, ptr %src
%res = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> %v)
store i64 %res, ptr %dst
ret void
}


@@ -0,0 +1,168 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s
define void @vec_reduce_or_v16i8(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_or_v16i8:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8
; CHECK-NEXT: vor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vsrli.d $vr1, $vr0, 32
; CHECK-NEXT: vor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vshuf4i.b $vr1, $vr0, 14
; CHECK-NEXT: vor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.b $vr1, $vr0, 1
; CHECK-NEXT: vor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <16 x i8>, ptr %src
%res = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> %v)
store i8 %res, ptr %dst
ret void
}
define void @vec_reduce_or_v8i8(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_or_v8i8:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.d $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0
; CHECK-NEXT: vsrli.d $vr1, $vr0, 32
; CHECK-NEXT: vor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vshuf4i.b $vr1, $vr0, 14
; CHECK-NEXT: vor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.b $vr1, $vr0, 1
; CHECK-NEXT: vor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <8 x i8>, ptr %src
%res = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> %v)
store i8 %res, ptr %dst
ret void
}
define void @vec_reduce_or_v4i8(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_or_v4i8:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.w $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 0
; CHECK-NEXT: vshuf4i.b $vr1, $vr0, 14
; CHECK-NEXT: vor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.b $vr1, $vr0, 1
; CHECK-NEXT: vor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <4 x i8>, ptr %src
%res = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> %v)
store i8 %res, ptr %dst
ret void
}
define void @vec_reduce_or_v2i8(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_or_v2i8:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.h $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.h $vr0, $a0, 0
; CHECK-NEXT: vreplvei.b $vr1, $vr0, 1
; CHECK-NEXT: vor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <2 x i8>, ptr %src
%res = call i8 @llvm.vector.reduce.or.v2i8(<2 x i8> %v)
store i8 %res, ptr %dst
ret void
}
define void @vec_reduce_or_v8i16(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_or_v8i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8
; CHECK-NEXT: vor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vshuf4i.h $vr1, $vr0, 14
; CHECK-NEXT: vor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.h $vr1, $vr0, 1
; CHECK-NEXT: vor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <8 x i16>, ptr %src
%res = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> %v)
store i16 %res, ptr %dst
ret void
}
define void @vec_reduce_or_v4i16(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_or_v4i16:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.d $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0
; CHECK-NEXT: vshuf4i.h $vr1, $vr0, 14
; CHECK-NEXT: vor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.h $vr1, $vr0, 1
; CHECK-NEXT: vor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <4 x i16>, ptr %src
%res = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> %v)
store i16 %res, ptr %dst
ret void
}
define void @vec_reduce_or_v2i16(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_or_v2i16:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.w $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 0
; CHECK-NEXT: vreplvei.h $vr1, $vr0, 1
; CHECK-NEXT: vor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <2 x i16>, ptr %src
%res = call i16 @llvm.vector.reduce.or.v2i16(<2 x i16> %v)
store i16 %res, ptr %dst
ret void
}
define void @vec_reduce_or_v4i32(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_or_v4i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vshuf4i.w $vr1, $vr0, 14
; CHECK-NEXT: vor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.w $vr1, $vr0, 1
; CHECK-NEXT: vor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <4 x i32>, ptr %src
%res = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %v)
store i32 %res, ptr %dst
ret void
}
define void @vec_reduce_or_v2i32(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_or_v2i32:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.d $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0
; CHECK-NEXT: vreplvei.w $vr1, $vr0, 1
; CHECK-NEXT: vor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <2 x i32>, ptr %src
%res = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> %v)
store i32 %res, ptr %dst
ret void
}
define void @vec_reduce_or_v2i64(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_or_v2i64:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vreplvei.d $vr1, $vr0, 1
; CHECK-NEXT: vor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.d $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <2 x i64>, ptr %src
%res = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> %v)
store i64 %res, ptr %dst
ret void
}


@@ -0,0 +1,168 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s
define void @vec_reduce_smax_v16i8(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_smax_v16i8:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8
; CHECK-NEXT: vmax.b $vr0, $vr0, $vr1
; CHECK-NEXT: vsrli.d $vr1, $vr0, 32
; CHECK-NEXT: vmax.b $vr0, $vr0, $vr1
; CHECK-NEXT: vshuf4i.b $vr1, $vr0, 14
; CHECK-NEXT: vmax.b $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.b $vr1, $vr0, 1
; CHECK-NEXT: vmax.b $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <16 x i8>, ptr %src
%res = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %v)
store i8 %res, ptr %dst
ret void
}
define void @vec_reduce_smax_v8i8(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_smax_v8i8:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.d $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0
; CHECK-NEXT: vsrli.d $vr1, $vr0, 32
; CHECK-NEXT: vmax.b $vr0, $vr0, $vr1
; CHECK-NEXT: vshuf4i.b $vr1, $vr0, 14
; CHECK-NEXT: vmax.b $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.b $vr1, $vr0, 1
; CHECK-NEXT: vmax.b $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <8 x i8>, ptr %src
%res = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> %v)
store i8 %res, ptr %dst
ret void
}
define void @vec_reduce_smax_v4i8(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_smax_v4i8:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.w $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 0
; CHECK-NEXT: vshuf4i.b $vr1, $vr0, 14
; CHECK-NEXT: vmax.b $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.b $vr1, $vr0, 1
; CHECK-NEXT: vmax.b $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <4 x i8>, ptr %src
%res = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> %v)
store i8 %res, ptr %dst
ret void
}
define void @vec_reduce_smax_v2i8(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_smax_v2i8:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.h $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.h $vr0, $a0, 0
; CHECK-NEXT: vreplvei.b $vr1, $vr0, 1
; CHECK-NEXT: vmax.b $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <2 x i8>, ptr %src
%res = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> %v)
store i8 %res, ptr %dst
ret void
}
define void @vec_reduce_smax_v8i16(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_smax_v8i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8
; CHECK-NEXT: vmax.h $vr0, $vr0, $vr1
; CHECK-NEXT: vshuf4i.h $vr1, $vr0, 14
; CHECK-NEXT: vmax.h $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.h $vr1, $vr0, 1
; CHECK-NEXT: vmax.h $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <8 x i16>, ptr %src
%res = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %v)
store i16 %res, ptr %dst
ret void
}
define void @vec_reduce_smax_v4i16(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_smax_v4i16:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.d $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0
; CHECK-NEXT: vshuf4i.h $vr1, $vr0, 14
; CHECK-NEXT: vmax.h $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.h $vr1, $vr0, 1
; CHECK-NEXT: vmax.h $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <4 x i16>, ptr %src
%res = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> %v)
store i16 %res, ptr %dst
ret void
}
define void @vec_reduce_smax_v2i16(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_smax_v2i16:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.w $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 0
; CHECK-NEXT: vreplvei.h $vr1, $vr0, 1
; CHECK-NEXT: vmax.h $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <2 x i16>, ptr %src
%res = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> %v)
store i16 %res, ptr %dst
ret void
}
define void @vec_reduce_smax_v4i32(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_smax_v4i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vshuf4i.w $vr1, $vr0, 14
; CHECK-NEXT: vmax.w $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.w $vr1, $vr0, 1
; CHECK-NEXT: vmax.w $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <4 x i32>, ptr %src
%res = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %v)
store i32 %res, ptr %dst
ret void
}
define void @vec_reduce_smax_v2i32(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_smax_v2i32:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.d $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0
; CHECK-NEXT: vreplvei.w $vr1, $vr0, 1
; CHECK-NEXT: vmax.w $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <2 x i32>, ptr %src
%res = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> %v)
store i32 %res, ptr %dst
ret void
}
define void @vec_reduce_smax_v2i64(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_smax_v2i64:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vreplvei.d $vr1, $vr0, 1
; CHECK-NEXT: vmax.d $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.d $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <2 x i64>, ptr %src
%res = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> %v)
store i64 %res, ptr %dst
ret void
}


@@ -0,0 +1,168 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s
define void @vec_reduce_smin_v16i8(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_smin_v16i8:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8
; CHECK-NEXT: vmin.b $vr0, $vr0, $vr1
; CHECK-NEXT: vsrli.d $vr1, $vr0, 32
; CHECK-NEXT: vmin.b $vr0, $vr0, $vr1
; CHECK-NEXT: vshuf4i.b $vr1, $vr0, 14
; CHECK-NEXT: vmin.b $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.b $vr1, $vr0, 1
; CHECK-NEXT: vmin.b $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <16 x i8>, ptr %src
%res = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %v)
store i8 %res, ptr %dst
ret void
}
define void @vec_reduce_smin_v8i8(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_smin_v8i8:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.d $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0
; CHECK-NEXT: vsrli.d $vr1, $vr0, 32
; CHECK-NEXT: vmin.b $vr0, $vr0, $vr1
; CHECK-NEXT: vshuf4i.b $vr1, $vr0, 14
; CHECK-NEXT: vmin.b $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.b $vr1, $vr0, 1
; CHECK-NEXT: vmin.b $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <8 x i8>, ptr %src
%res = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> %v)
store i8 %res, ptr %dst
ret void
}
define void @vec_reduce_smin_v4i8(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_smin_v4i8:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.w $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 0
; CHECK-NEXT: vshuf4i.b $vr1, $vr0, 14
; CHECK-NEXT: vmin.b $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.b $vr1, $vr0, 1
; CHECK-NEXT: vmin.b $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <4 x i8>, ptr %src
%res = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> %v)
store i8 %res, ptr %dst
ret void
}
define void @vec_reduce_smin_v2i8(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_smin_v2i8:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.h $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.h $vr0, $a0, 0
; CHECK-NEXT: vreplvei.b $vr1, $vr0, 1
; CHECK-NEXT: vmin.b $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <2 x i8>, ptr %src
%res = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> %v)
store i8 %res, ptr %dst
ret void
}
define void @vec_reduce_smin_v8i16(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_smin_v8i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8
; CHECK-NEXT: vmin.h $vr0, $vr0, $vr1
; CHECK-NEXT: vshuf4i.h $vr1, $vr0, 14
; CHECK-NEXT: vmin.h $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.h $vr1, $vr0, 1
; CHECK-NEXT: vmin.h $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <8 x i16>, ptr %src
%res = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %v)
store i16 %res, ptr %dst
ret void
}
define void @vec_reduce_smin_v4i16(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_smin_v4i16:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.d $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0
; CHECK-NEXT: vshuf4i.h $vr1, $vr0, 14
; CHECK-NEXT: vmin.h $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.h $vr1, $vr0, 1
; CHECK-NEXT: vmin.h $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <4 x i16>, ptr %src
%res = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> %v)
store i16 %res, ptr %dst
ret void
}
define void @vec_reduce_smin_v2i16(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_smin_v2i16:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.w $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 0
; CHECK-NEXT: vreplvei.h $vr1, $vr0, 1
; CHECK-NEXT: vmin.h $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <2 x i16>, ptr %src
%res = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> %v)
store i16 %res, ptr %dst
ret void
}
define void @vec_reduce_smin_v4i32(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_smin_v4i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vshuf4i.w $vr1, $vr0, 14
; CHECK-NEXT: vmin.w $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.w $vr1, $vr0, 1
; CHECK-NEXT: vmin.w $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <4 x i32>, ptr %src
%res = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %v)
store i32 %res, ptr %dst
ret void
}
define void @vec_reduce_smin_v2i32(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_smin_v2i32:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.d $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0
; CHECK-NEXT: vreplvei.w $vr1, $vr0, 1
; CHECK-NEXT: vmin.w $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <2 x i32>, ptr %src
%res = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> %v)
store i32 %res, ptr %dst
ret void
}
define void @vec_reduce_smin_v2i64(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_smin_v2i64:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vreplvei.d $vr1, $vr0, 1
; CHECK-NEXT: vmin.d $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.d $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <2 x i64>, ptr %src
%res = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> %v)
store i64 %res, ptr %dst
ret void
}


@@ -0,0 +1,168 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s
define void @vec_reduce_umax_v16i8(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_umax_v16i8:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8
; CHECK-NEXT: vmax.bu $vr0, $vr0, $vr1
; CHECK-NEXT: vsrli.d $vr1, $vr0, 32
; CHECK-NEXT: vmax.bu $vr0, $vr0, $vr1
; CHECK-NEXT: vshuf4i.b $vr1, $vr0, 14
; CHECK-NEXT: vmax.bu $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.b $vr1, $vr0, 1
; CHECK-NEXT: vmax.bu $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <16 x i8>, ptr %src
%res = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %v)
store i8 %res, ptr %dst
ret void
}
define void @vec_reduce_umax_v8i8(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_umax_v8i8:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.d $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0
; CHECK-NEXT: vsrli.d $vr1, $vr0, 32
; CHECK-NEXT: vmax.bu $vr0, $vr0, $vr1
; CHECK-NEXT: vshuf4i.b $vr1, $vr0, 14
; CHECK-NEXT: vmax.bu $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.b $vr1, $vr0, 1
; CHECK-NEXT: vmax.bu $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <8 x i8>, ptr %src
%res = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> %v)
store i8 %res, ptr %dst
ret void
}
define void @vec_reduce_umax_v4i8(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_umax_v4i8:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.w $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 0
; CHECK-NEXT: vshuf4i.b $vr1, $vr0, 14
; CHECK-NEXT: vmax.bu $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.b $vr1, $vr0, 1
; CHECK-NEXT: vmax.bu $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <4 x i8>, ptr %src
%res = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> %v)
store i8 %res, ptr %dst
ret void
}
define void @vec_reduce_umax_v2i8(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_umax_v2i8:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.h $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.h $vr0, $a0, 0
; CHECK-NEXT: vreplvei.b $vr1, $vr0, 1
; CHECK-NEXT: vmax.bu $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <2 x i8>, ptr %src
%res = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> %v)
store i8 %res, ptr %dst
ret void
}
define void @vec_reduce_umax_v8i16(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_umax_v8i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8
; CHECK-NEXT: vmax.hu $vr0, $vr0, $vr1
; CHECK-NEXT: vshuf4i.h $vr1, $vr0, 14
; CHECK-NEXT: vmax.hu $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.h $vr1, $vr0, 1
; CHECK-NEXT: vmax.hu $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <8 x i16>, ptr %src
%res = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %v)
store i16 %res, ptr %dst
ret void
}
define void @vec_reduce_umax_v4i16(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_umax_v4i16:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.d $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0
; CHECK-NEXT: vshuf4i.h $vr1, $vr0, 14
; CHECK-NEXT: vmax.hu $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.h $vr1, $vr0, 1
; CHECK-NEXT: vmax.hu $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <4 x i16>, ptr %src
%res = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> %v)
store i16 %res, ptr %dst
ret void
}
define void @vec_reduce_umax_v2i16(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_umax_v2i16:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.w $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 0
; CHECK-NEXT: vreplvei.h $vr1, $vr0, 1
; CHECK-NEXT: vmax.hu $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <2 x i16>, ptr %src
%res = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> %v)
store i16 %res, ptr %dst
ret void
}
define void @vec_reduce_umax_v4i32(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_umax_v4i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vshuf4i.w $vr1, $vr0, 14
; CHECK-NEXT: vmax.wu $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.w $vr1, $vr0, 1
; CHECK-NEXT: vmax.wu $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <4 x i32>, ptr %src
%res = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %v)
store i32 %res, ptr %dst
ret void
}
define void @vec_reduce_umax_v2i32(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_umax_v2i32:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.d $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0
; CHECK-NEXT: vreplvei.w $vr1, $vr0, 1
; CHECK-NEXT: vmax.wu $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <2 x i32>, ptr %src
%res = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> %v)
store i32 %res, ptr %dst
ret void
}
define void @vec_reduce_umax_v2i64(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_umax_v2i64:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vreplvei.d $vr1, $vr0, 1
; CHECK-NEXT: vmax.du $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.d $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <2 x i64>, ptr %src
%res = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> %v)
store i64 %res, ptr %dst
ret void
}

@ -0,0 +1,168 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s
define void @vec_reduce_umin_v16i8(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_umin_v16i8:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8
; CHECK-NEXT: vmin.bu $vr0, $vr0, $vr1
; CHECK-NEXT: vsrli.d $vr1, $vr0, 32
; CHECK-NEXT: vmin.bu $vr0, $vr0, $vr1
; CHECK-NEXT: vshuf4i.b $vr1, $vr0, 14
; CHECK-NEXT: vmin.bu $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.b $vr1, $vr0, 1
; CHECK-NEXT: vmin.bu $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <16 x i8>, ptr %src
%res = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %v)
store i8 %res, ptr %dst
ret void
}
define void @vec_reduce_umin_v8i8(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_umin_v8i8:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.d $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0
; CHECK-NEXT: vsrli.d $vr1, $vr0, 32
; CHECK-NEXT: vmin.bu $vr0, $vr0, $vr1
; CHECK-NEXT: vshuf4i.b $vr1, $vr0, 14
; CHECK-NEXT: vmin.bu $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.b $vr1, $vr0, 1
; CHECK-NEXT: vmin.bu $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <8 x i8>, ptr %src
%res = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> %v)
store i8 %res, ptr %dst
ret void
}
define void @vec_reduce_umin_v4i8(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_umin_v4i8:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.w $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 0
; CHECK-NEXT: vshuf4i.b $vr1, $vr0, 14
; CHECK-NEXT: vmin.bu $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.b $vr1, $vr0, 1
; CHECK-NEXT: vmin.bu $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <4 x i8>, ptr %src
%res = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> %v)
store i8 %res, ptr %dst
ret void
}
define void @vec_reduce_umin_v2i8(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_umin_v2i8:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.h $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.h $vr0, $a0, 0
; CHECK-NEXT: vreplvei.b $vr1, $vr0, 1
; CHECK-NEXT: vmin.bu $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <2 x i8>, ptr %src
%res = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> %v)
store i8 %res, ptr %dst
ret void
}
define void @vec_reduce_umin_v8i16(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_umin_v8i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8
; CHECK-NEXT: vmin.hu $vr0, $vr0, $vr1
; CHECK-NEXT: vshuf4i.h $vr1, $vr0, 14
; CHECK-NEXT: vmin.hu $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.h $vr1, $vr0, 1
; CHECK-NEXT: vmin.hu $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <8 x i16>, ptr %src
%res = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %v)
store i16 %res, ptr %dst
ret void
}
define void @vec_reduce_umin_v4i16(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_umin_v4i16:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.d $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0
; CHECK-NEXT: vshuf4i.h $vr1, $vr0, 14
; CHECK-NEXT: vmin.hu $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.h $vr1, $vr0, 1
; CHECK-NEXT: vmin.hu $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <4 x i16>, ptr %src
%res = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> %v)
store i16 %res, ptr %dst
ret void
}
define void @vec_reduce_umin_v2i16(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_umin_v2i16:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.w $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 0
; CHECK-NEXT: vreplvei.h $vr1, $vr0, 1
; CHECK-NEXT: vmin.hu $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <2 x i16>, ptr %src
%res = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> %v)
store i16 %res, ptr %dst
ret void
}
define void @vec_reduce_umin_v4i32(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_umin_v4i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vshuf4i.w $vr1, $vr0, 14
; CHECK-NEXT: vmin.wu $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.w $vr1, $vr0, 1
; CHECK-NEXT: vmin.wu $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <4 x i32>, ptr %src
%res = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %v)
store i32 %res, ptr %dst
ret void
}
define void @vec_reduce_umin_v2i32(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_umin_v2i32:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.d $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0
; CHECK-NEXT: vreplvei.w $vr1, $vr0, 1
; CHECK-NEXT: vmin.wu $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <2 x i32>, ptr %src
%res = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> %v)
store i32 %res, ptr %dst
ret void
}
define void @vec_reduce_umin_v2i64(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_umin_v2i64:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vreplvei.d $vr1, $vr0, 1
; CHECK-NEXT: vmin.du $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.d $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <2 x i64>, ptr %src
%res = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> %v)
store i64 %res, ptr %dst
ret void
}

@ -0,0 +1,168 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s
define void @vec_reduce_xor_v16i8(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_xor_v16i8:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8
; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vsrli.d $vr1, $vr0, 32
; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vshuf4i.b $vr1, $vr0, 14
; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.b $vr1, $vr0, 1
; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <16 x i8>, ptr %src
%res = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> %v)
store i8 %res, ptr %dst
ret void
}
define void @vec_reduce_xor_v8i8(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_xor_v8i8:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.d $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0
; CHECK-NEXT: vsrli.d $vr1, $vr0, 32
; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vshuf4i.b $vr1, $vr0, 14
; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.b $vr1, $vr0, 1
; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <8 x i8>, ptr %src
%res = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> %v)
store i8 %res, ptr %dst
ret void
}
define void @vec_reduce_xor_v4i8(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_xor_v4i8:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.w $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 0
; CHECK-NEXT: vshuf4i.b $vr1, $vr0, 14
; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.b $vr1, $vr0, 1
; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <4 x i8>, ptr %src
%res = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> %v)
store i8 %res, ptr %dst
ret void
}
define void @vec_reduce_xor_v2i8(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_xor_v2i8:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.h $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.h $vr0, $a0, 0
; CHECK-NEXT: vreplvei.b $vr1, $vr0, 1
; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <2 x i8>, ptr %src
%res = call i8 @llvm.vector.reduce.xor.v2i8(<2 x i8> %v)
store i8 %res, ptr %dst
ret void
}
define void @vec_reduce_xor_v8i16(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_xor_v8i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8
; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vshuf4i.h $vr1, $vr0, 14
; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.h $vr1, $vr0, 1
; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <8 x i16>, ptr %src
%res = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> %v)
store i16 %res, ptr %dst
ret void
}
define void @vec_reduce_xor_v4i16(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_xor_v4i16:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.d $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0
; CHECK-NEXT: vshuf4i.h $vr1, $vr0, 14
; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.h $vr1, $vr0, 1
; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <4 x i16>, ptr %src
%res = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> %v)
store i16 %res, ptr %dst
ret void
}
define void @vec_reduce_xor_v2i16(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_xor_v2i16:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.w $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 0
; CHECK-NEXT: vreplvei.h $vr1, $vr0, 1
; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <2 x i16>, ptr %src
%res = call i16 @llvm.vector.reduce.xor.v2i16(<2 x i16> %v)
store i16 %res, ptr %dst
ret void
}
define void @vec_reduce_xor_v4i32(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_xor_v4i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vshuf4i.w $vr1, $vr0, 14
; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.w $vr1, $vr0, 1
; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <4 x i32>, ptr %src
%res = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %v)
store i32 %res, ptr %dst
ret void
}
define void @vec_reduce_xor_v2i32(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_xor_v2i32:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.d $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0
; CHECK-NEXT: vreplvei.w $vr1, $vr0, 1
; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <2 x i32>, ptr %src
%res = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> %v)
store i32 %res, ptr %dst
ret void
}
define void @vec_reduce_xor_v2i64(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_xor_v2i64:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vreplvei.d $vr1, $vr0, 1
; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.d $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <2 x i64>, ptr %src
%res = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> %v)
store i64 %res, ptr %dst
ret void
}

@ -1,20 +1,53 @@
; RUN: llc < %s | FileCheck %s --check-prefix=CHECK --check-prefix=SAFE
; RUN: llc < %s -enable-unsafe-fp-math | FileCheck %s --check-prefix=CHECK --check-prefix=UNSAFE
; RUN: llc < %s | FileCheck %s
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64--"
; CHECK-LABEL: double_rounding_safe:
; CHECK: callq __trunctfdf2
; CHECK-NEXT: cvtsd2ss %xmm0
define void @double_rounding_safe(ptr %x, ptr %f) {
entry:
%x.fp128 = load fp128, ptr %x, align 16
%x.double = fptrunc fp128 %x.fp128 to double
%x.float = fptrunc double %x.double to float
store float %x.float, ptr %f, align 4
ret void
}
; CHECK-LABEL: double_rounding_contract_fst:
; CHECK: callq __trunctfdf2
; CHECK-NEXT: cvtsd2ss %xmm0
define void @double_rounding_contract_fst(ptr %x, ptr %f) {
entry:
%x.fp128 = load fp128, ptr %x, align 16
%x.double = fptrunc contract fp128 %x.fp128 to double
%x.float = fptrunc double %x.double to float
store float %x.float, ptr %f, align 4
ret void
}
; CHECK-LABEL: double_rounding_contract_snd:
; CHECK: callq __trunctfdf2
; CHECK-NEXT: cvtsd2ss %xmm0
define void @double_rounding_contract_snd(ptr %x, ptr %f) {
entry:
%x.fp128 = load fp128, ptr %x, align 16
%x.double = fptrunc fp128 %x.fp128 to double
%x.float = fptrunc contract double %x.double to float
store float %x.float, ptr %f, align 4
ret void
}
; CHECK-LABEL: double_rounding:
; SAFE: callq __trunctfdf2
; SAFE-NEXT: cvtsd2ss %xmm0
; UNSAFE: callq __trunctfsf2
; UNSAFE-NOT: cvt
; CHECK: callq __trunctfsf2
; CHECK-NOT: cvt
define void @double_rounding(ptr %x, ptr %f) {
entry:
%0 = load fp128, ptr %x, align 16
%1 = fptrunc fp128 %0 to double
%2 = fptrunc double %1 to float
store float %2, ptr %f, align 4
%x.fp128 = load fp128, ptr %x, align 16
%x.double = fptrunc contract fp128 %x.fp128 to double
%x.float = fptrunc contract double %x.double to float
store float %x.float, ptr %f, align 4
ret void
}

@ -0,0 +1,69 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -passes=instcombine -S %s | FileCheck %s
define i64 @inttoptr_used_by_phi_with_ptrtoint(i1 %c, ptr %src, ptr %p2) {
; CHECK-LABEL: define i64 @inttoptr_used_by_phi_with_ptrtoint(
; CHECK-SAME: i1 [[C:%.*]], ptr [[SRC:%.*]], ptr [[P2:%.*]]) {
; CHECK-NEXT: br i1 [[C]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
; CHECK-NEXT: [[P:%.*]] = getelementptr i8, ptr [[SRC]], i64 10
; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[P]] to i64
; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[ELSE]]:
; CHECK-NEXT: br label %[[EXIT]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: [[PHI:%.*]] = phi i64 [ [[TMP1]], %[[THEN]] ], [ 0, %[[ELSE]] ]
; CHECK-NEXT: ret i64 [[PHI]]
;
%i = ptrtoint ptr %src to i64
%a = add i64 %i, 10
%p = inttoptr i64 %a to ptr
br i1 %c, label %then, label %else
then:
br label %exit
else:
br label %exit
exit:
%phi = phi ptr [ %p, %then ], [ null, %else ]
%i.2 = ptrtoint ptr %phi to i64
ret i64 %i.2
}
declare void @foo(ptr)
define i64 @inttoptr_used_by_phi_with_other_users(i1 %c, ptr %src, ptr %p2) {
; CHECK-LABEL: define i64 @inttoptr_used_by_phi_with_other_users(
; CHECK-SAME: i1 [[C:%.*]], ptr [[SRC:%.*]], ptr [[P2:%.*]]) {
; CHECK-NEXT: [[I:%.*]] = ptrtoint ptr [[SRC]] to i64
; CHECK-NEXT: [[A:%.*]] = add i64 [[I]], 10
; CHECK-NEXT: br i1 [[C]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
; CHECK-NEXT: br label %[[EXIT:.*]]
; CHECK: [[ELSE]]:
; CHECK-NEXT: br label %[[EXIT]]
; CHECK: [[EXIT]]:
; CHECK-NEXT: [[PHI:%.*]] = phi i64 [ [[A]], %[[THEN]] ], [ 0, %[[ELSE]] ]
; CHECK-NEXT: [[P:%.*]] = inttoptr i64 [[A]] to ptr
; CHECK-NEXT: call void @foo(ptr [[P]])
; CHECK-NEXT: ret i64 [[PHI]]
;
%i = ptrtoint ptr %src to i64
%a = add i64 %i, 10
%p = inttoptr i64 %a to ptr
br i1 %c, label %then, label %else
then:
br label %exit
else:
br label %exit
exit:
%phi = phi ptr [ %p, %then ], [ null, %else ]
call void @foo(ptr %p)
%i.2 = ptrtoint ptr %phi to i64
ret i64 %i.2
}

@ -9,27 +9,34 @@ define i64 @std_find_i16_constant_offset_with_assumptions(ptr %first.coerce, i16
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[FIRST_COERCE]], i64 2) ]
; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[FIRST_COERCE]], i64 256) ]
; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[FIRST_COERCE]] to i64
; CHECK-NEXT: [[COERCE_VAL_PI_I:%.*]] = add i64 [[TMP0]], 256
; CHECK-NEXT: [[COERCE_VAL_IP:%.*]] = inttoptr i64 [[COERCE_VAL_PI_I]] to ptr
; CHECK-NEXT: [[CMP_NOT6_I_I:%.*]] = icmp eq ptr [[FIRST_COERCE]], [[COERCE_VAL_IP]]
; CHECK-NEXT: br i1 [[CMP_NOT6_I_I]], label %[[RETURN:.*]], label %[[LOOP_HEADER:.*]]
; CHECK: [[LOOP_HEADER]]:
; CHECK-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ [[FIRST_COERCE]], %[[ENTRY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[PTR_IV]], align 2
; CHECK-NEXT: [[CMP2_I_I:%.*]] = icmp eq i16 [[TMP1]], [[S]]
; CHECK-NEXT: br i1 [[CMP2_I_I]], label %[[RETURN_LOOPEXIT:.*]], label %[[LOOP_LATCH]]
; CHECK: [[LOOP_LATCH]]:
; CHECK-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds nuw i8, ptr [[PTR_IV]], i64 2
; CHECK-NEXT: [[CMP_NOT_I_I:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[COERCE_VAL_IP]]
; CHECK-NEXT: br i1 [[CMP_NOT_I_I]], label %[[RETURN_LOOPEXIT]], label %[[LOOP_HEADER]]
; CHECK: [[RETURN_LOOPEXIT]]:
; CHECK-NEXT: [[MERGE_PH:%.*]] = phi ptr [ [[COERCE_VAL_IP]], %[[LOOP_LATCH]] ], [ [[PTR_IV]], %[[LOOP_HEADER]] ]
; CHECK-NEXT: [[DOTPRE:%.*]] = ptrtoint ptr [[MERGE_PH]] to i64
; CHECK-NEXT: [[COERCE_VAL_IP:%.*]] = getelementptr i8, ptr [[FIRST_COERCE]], i64 256
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[S]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[FIRST_COERCE]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[NEXT_GEP]], align 2
; CHECK-NEXT: [[TMP0:%.*]] = icmp eq <8 x i16> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i1> [[TMP0]] to i8
; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i8 [[TMP1]], 0
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
; CHECK-NEXT: [[TMP4:%.*]] = or i1 [[TMP2]], [[TMP3]]
; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_SPLIT]]:
; CHECK-NEXT: br i1 [[TMP2]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[RETURN:.*]]
; CHECK: [[VECTOR_EARLY_EXIT]]:
; CHECK-NEXT: [[TMP5:%.*]] = tail call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> [[TMP0]], i1 true)
; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], [[TMP5]]
; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[FIRST_COERCE]], i64 [[TMP7]]
; CHECK-NEXT: br label %[[RETURN]]
; CHECK: [[RETURN]]:
; CHECK-NEXT: [[RES_PRE_PHI:%.*]] = phi i64 [ [[DOTPRE]], %[[RETURN_LOOPEXIT]] ], [ [[TMP0]], %[[ENTRY]] ]
; CHECK-NEXT: ret i64 [[RES_PRE_PHI]]
; CHECK-NEXT: [[__FIRST_ADDR_0_LCSSA_I_I_PH:%.*]] = phi ptr [ [[TMP8]], %[[VECTOR_EARLY_EXIT]] ], [ [[COERCE_VAL_IP]], %[[MIDDLE_SPLIT]] ]
; CHECK-NEXT: [[DOTPRE:%.*]] = ptrtoint ptr [[__FIRST_ADDR_0_LCSSA_I_I_PH]] to i64
; CHECK-NEXT: ret i64 [[DOTPRE]]
;
entry:
%first = alloca { ptr }, align 8
@ -71,27 +78,21 @@ define i64 @std_find_i16_constant_offset_no_assumptions(ptr %first.coerce, i16 n
; CHECK-LABEL: define i64 @std_find_i16_constant_offset_no_assumptions(
; CHECK-SAME: ptr [[FIRST_COERCE:%.*]], i16 noundef signext [[S:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] {
; CHECK-NEXT: [[ENTRY:.*]]:
; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr [[FIRST_COERCE]] to i64
; CHECK-NEXT: [[COERCE_VAL_PI_I:%.*]] = add i64 [[TMP0]], 256
; CHECK-NEXT: [[COERCE_VAL_IP:%.*]] = inttoptr i64 [[COERCE_VAL_PI_I]] to ptr
; CHECK-NEXT: [[CMP_NOT6_I_I:%.*]] = icmp eq ptr [[FIRST_COERCE]], [[COERCE_VAL_IP]]
; CHECK-NEXT: br i1 [[CMP_NOT6_I_I]], label %[[RETURN:.*]], label %[[LOOP_HEADER:.*]]
; CHECK-NEXT: [[COERCE_VAL_IP:%.*]] = getelementptr i8, ptr [[FIRST_COERCE]], i64 256
; CHECK-NEXT: br label %[[LOOP_HEADER:.*]]
; CHECK: [[LOOP_HEADER]]:
; CHECK-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ [[FIRST_COERCE]], %[[ENTRY]] ]
; CHECK-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[FIRST_COERCE]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ]
; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[PTR_IV]], align 2
; CHECK-NEXT: [[CMP2_I_I:%.*]] = icmp eq i16 [[TMP1]], [[S]]
; CHECK-NEXT: br i1 [[CMP2_I_I]], label %[[RETURN_LOOPEXIT:.*]], label %[[LOOP_LATCH]]
; CHECK-NEXT: br i1 [[CMP2_I_I]], label %[[RETURN:.*]], label %[[LOOP_LATCH]]
; CHECK: [[LOOP_LATCH]]:
; CHECK-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds nuw i8, ptr [[PTR_IV]], i64 2
; CHECK-NEXT: [[CMP_NOT_I_I:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[COERCE_VAL_IP]]
; CHECK-NEXT: br i1 [[CMP_NOT_I_I]], label %[[RETURN_LOOPEXIT]], label %[[LOOP_HEADER]]
; CHECK: [[RETURN_LOOPEXIT]]:
; CHECK-NEXT: br i1 [[CMP_NOT_I_I]], label %[[RETURN]], label %[[LOOP_HEADER]]
; CHECK: [[RETURN]]:
; CHECK-NEXT: [[MERGE_PH:%.*]] = phi ptr [ [[COERCE_VAL_IP]], %[[LOOP_LATCH]] ], [ [[PTR_IV]], %[[LOOP_HEADER]] ]
; CHECK-NEXT: [[DOTPRE:%.*]] = ptrtoint ptr [[MERGE_PH]] to i64
; CHECK-NEXT: br label %[[RETURN]]
; CHECK: [[RETURN]]:
; CHECK-NEXT: [[RES_PRE_PHI:%.*]] = phi i64 [ [[DOTPRE]], %[[RETURN_LOOPEXIT]] ], [ [[TMP0]], %[[ENTRY]] ]
; CHECK-NEXT: ret i64 [[RES_PRE_PHI]]
; CHECK-NEXT: ret i64 [[DOTPRE]]
;
entry:
%first = alloca { ptr }, align 8
@ -128,3 +129,8 @@ return:
}
declare void @llvm.assume(i1 noundef)
;.
; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
;.

@ -311,7 +311,7 @@ entry:
define ptr @nuw_inbounds_implies_nuw_inbounds_trunc_nuw(ptr %p, i128 %i) {
; CHECK-LABEL: @nuw_inbounds_implies_nuw_inbounds_trunc_nuw(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = trunc nuw i128 [[I:%.*]] to i64
; CHECK-NEXT: [[TMP0:%.*]] = trunc i128 [[I:%.*]] to i64
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i32, ptr [[P:%.*]], i64 [[TMP0]]
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 4
; CHECK-NEXT: ret ptr [[ARRAYIDX2]]

@ -0,0 +1,32 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -passes=separate-const-offset-from-gep -S | FileCheck %s
; Verify that we drop "nuw" from trunc.
define ptr @pr154116_nuw(ptr %p, i128 %i) {
; CHECK-LABEL: define ptr @pr154116_nuw(
; CHECK-SAME: ptr [[P:%.*]], i128 [[I:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[TMP1:%.*]] = trunc i128 [[I]] to i64
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP1]]
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr i8, ptr [[TMP2]], i64 80
; CHECK-NEXT: ret ptr [[ARRAYIDX2]]
;
%idx = add i128 %i, 20
%idx.conv = trunc nuw i128 %idx to i64
%arrayidx = getelementptr i32, ptr %p, i64 %idx.conv
ret ptr %arrayidx
}
; Verify that we drop "nsw" from trunc.
define ptr @pr154116_nsw(ptr %p, i128 %i) {
; CHECK-LABEL: define ptr @pr154116_nsw(
; CHECK-SAME: ptr [[P:%.*]], i128 [[I:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = trunc i128 [[I]] to i64
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP1]]
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr i8, ptr [[TMP2]], i64 4
; CHECK-NEXT: ret ptr [[ARRAYIDX2]]
;
%idx = add i128 %i, 1
%idx.conv = trunc nsw i128 %idx to i64
%arrayidx = getelementptr i32, ptr %p, i64 %idx.conv
ret ptr %arrayidx
}

@ -10,6 +10,7 @@
#include "AMDGPUUnitTests.h"
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "gtest/gtest.h"
using namespace llvm;

@ -120,7 +120,7 @@ void ASTDumper::dump(NumberExprAST *num) {
/// [ [ 1, 2 ], [ 3, 4 ] ]
/// We print out such array with the dimensions spelled out at every level:
/// <2,2>[<2>[ 1, 2 ], <2>[ 3, 4 ] ]
void printLitHelper(ExprAST *litOrNum) {
static void printLitHelper(ExprAST *litOrNum) {
// Inside a literal expression we can have either a number or another literal
if (auto *num = llvm::dyn_cast<NumberExprAST>(litOrNum)) {
llvm::errs() << num->getValue();

@ -39,7 +39,8 @@ static cl::opt<enum Action>
cl::values(clEnumValN(DumpAST, "ast", "output the AST dump")));
/// Returns a Toy AST resulting from parsing the file or a nullptr on error.
std::unique_ptr<toy::ModuleAST> parseInputFile(llvm::StringRef filename) {
static std::unique_ptr<toy::ModuleAST>
parseInputFile(llvm::StringRef filename) {
llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> fileOrErr =
llvm::MemoryBuffer::getFileOrSTDIN(filename);
if (std::error_code ec = fileOrErr.getError()) {

@ -120,7 +120,7 @@ void ASTDumper::dump(NumberExprAST *num) {
/// [ [ 1, 2 ], [ 3, 4 ] ]
/// We print out such array with the dimensions spelled out at every level:
/// <2,2>[<2>[ 1, 2 ], <2>[ 3, 4 ] ]
void printLitHelper(ExprAST *litOrNum) {
static void printLitHelper(ExprAST *litOrNum) {
// Inside a literal expression we can have either a number or another literal
if (auto *num = llvm::dyn_cast<NumberExprAST>(litOrNum)) {
llvm::errs() << num->getValue();

Some files were not shown because too many files have changed in this diff.