Compare commits


4 Commits

Author SHA1 Message Date
pvanhout 6cd7c41646 [AMDGPU] Expand scratch atomics to flat atomics if GAS is enabled 2025-08-22 12:45:16 +02:00
pvanhout 3c6b5f75a5 [AMDGPU] Precommit memory legalizer tests for private AS 2025-08-22 12:45:15 +02:00
pvanhout 9cdf588d22 Rename "Expand" to "ExpandCustom" 2025-08-22 12:44:48 +02:00
pvanhout d05704bce4 [CodeGen][TLI] Allow targets to custom expand atomic load/stores
Loads didn't have the `Expand` option in `AtomicExpandPass`. Stores had `Expand`, but it didn't defer to TLI and instead performed an action directly.
Move the old behavior to an `XChg` expansion and make `Expand` behave like all other instructions.
2025-08-22 10:14:26 +02:00
178 changed files with 118200 additions and 10156 deletions
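
Context note, not part of the changeset: a minimal, self-contained C++ sketch of the dispatch described by commit d05704bce4, assuming a simple address-space check in place of the real TLI query. The enum members and function names here are illustrative only; the in-tree enum is TargetLoweringBase::AtomicExpansionKind, the exact hook this patch adds is not shown here, and 9cdf588d22 later renames the new kind to "ExpandCustom".

```cpp
#include <cstdio>

// Conceptual stand-ins; the real LLVM enum has more members.
enum class AtomicExpansionKind { None, CmpXChg, XChg, Expand };

// Assumed target policy: only private-address-space atomics request a custom
// expansion (in the spirit of the AMDGPU scratch->flat rewrite in 6cd7c41646).
AtomicExpansionKind shouldExpandAtomicStore(bool IsPrivateAS) {
  return IsPrivateAS ? AtomicExpansionKind::Expand : AtomicExpansionKind::None;
}

void runAtomicExpand(bool IsPrivateAS) {
  switch (shouldExpandAtomicStore(IsPrivateAS)) {
  case AtomicExpansionKind::XChg:
    // Old `Expand` behavior for stores: rewrite the store as an atomicrmw xchg.
    std::puts("expand store to atomicrmw xchg");
    break;
  case AtomicExpansionKind::Expand:
    // New behavior: defer to the target's custom expansion hook.
    std::puts("call the target's custom expansion");
    break;
  default:
    std::puts("keep the atomic store as-is");
    break;
  }
}

int main() {
  runAtomicExpand(/*IsPrivateAS=*/true);
  runAtomicExpand(/*IsPrivateAS=*/false);
  return 0;
}
```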

View File

@ -138,12 +138,6 @@
Dump function CFGs to graphviz format after each stage;enable '-print-loops'
for color-coded blocks
- `--dump-dot-func=<func1,func2,func3...>`
Dump function CFGs to graphviz format for specified functions only;
takes function name patterns (regex supported). Note: C++ function names
must be passed using their mangled names
- `--dump-linux-exceptions`
Dump Linux kernel exception table

View File

@ -15,12 +15,6 @@
#include "llvm/Support/CommandLine.h"
namespace llvm {
namespace bolt {
class BinaryFunction;
}
} // namespace llvm
namespace opts {
enum HeatmapModeKind {
@ -106,9 +100,6 @@ extern llvm::cl::opt<unsigned> Verbosity;
/// Return true if we should process all functions in the binary.
bool processAllFunctions();
/// Return true if we should dump dot graphs for the given function.
bool shouldDumpDot(const llvm::bolt::BinaryFunction &Function);
enum GadgetScannerKind { GS_PACRET, GS_PAUTH, GS_ALL };
extern llvm::cl::bits<GadgetScannerKind> GadgetScannersToRun;

View File

@ -52,7 +52,6 @@ namespace opts {
extern cl::opt<bool> PrintAll;
extern cl::opt<bool> PrintDynoStats;
extern cl::opt<bool> DumpDotAll;
extern bool shouldDumpDot(const bolt::BinaryFunction &Function);
extern cl::opt<std::string> AsmDump;
extern cl::opt<bolt::PLTCall::OptType> PLT;
extern cl::opt<bolt::IdenticalCodeFolding::ICFLevel, false,
@ -341,7 +340,7 @@ Error BinaryFunctionPassManager::runPasses() {
Function.print(BC.outs(), Message);
if (opts::shouldDumpDot(Function))
if (opts::DumpDotAll)
Function.dumpGraphForPass(PassIdName);
}
}

View File

@ -115,35 +115,6 @@ cl::opt<bool> DumpDotAll(
"enable '-print-loops' for color-coded blocks"),
cl::Hidden, cl::cat(BoltCategory));
cl::list<std::string> DumpDotFunc(
"dump-dot-func", cl::CommaSeparated,
cl::desc(
"dump function CFGs to graphviz format for specified functions only;"
"takes function name patterns (regex supported)"),
cl::value_desc("func1,func2,func3,..."), cl::Hidden, cl::cat(BoltCategory));
bool shouldDumpDot(const bolt::BinaryFunction &Function) {
// If dump-dot-all is enabled, dump all functions
if (DumpDotAll)
return !Function.isIgnored();
// If no specific functions specified in dump-dot-func, don't dump any
if (DumpDotFunc.empty())
return false;
if (Function.isIgnored())
return false;
// Check if function matches any of the specified patterns
for (const std::string &Name : DumpDotFunc) {
if (Function.hasNameRegex(Name)) {
return true;
}
}
return false;
}
static cl::list<std::string>
ForceFunctionNames("funcs",
cl::CommaSeparated,
@ -3598,7 +3569,7 @@ void RewriteInstance::postProcessFunctions() {
if (opts::PrintAll || opts::PrintCFG)
Function.print(BC->outs(), "after building cfg");
if (opts::shouldDumpDot(Function))
if (opts::DumpDotAll)
Function.dumpGraphForPass("00_build-cfg");
if (opts::PrintLoopInfo) {

View File

@ -1,24 +0,0 @@
#include <iostream>
// Multiple functions to test selective dumping
int add(int a, int b) { return a + b; }
int multiply(int a, int b) { return a * b; }
int main_helper() {
std::cout << "Helper function" << std::endl;
return 42;
}
int main_secondary() { return add(5, 3); }
void other_function() { std::cout << "Other function" << std::endl; }
int main() {
int result = add(10, 20);
result = multiply(result, 2);
main_helper();
main_secondary();
other_function();
return result;
}

View File

@ -1,52 +0,0 @@
# Test the --dump-dot-func option with multiple functions
# (includes tests for both mangled/unmangled names)
RUN: %clang++ %p/Inputs/multi-func.cpp -o %t.exe -Wl,-q
# Test 1: --dump-dot-func with specific function name (mangled)
RUN: llvm-bolt %t.exe -o %t.bolt1 --dump-dot-func=_Z3addii -v=1 2>&1 | FileCheck %s --check-prefix=ADD
# Test 2: --dump-dot-func with regex pattern (main.*)
RUN: llvm-bolt %t.exe -o %t.bolt2 --dump-dot-func="main.*" -v=1 2>&1 | FileCheck %s --check-prefix=MAIN-REGEX
# Test 3: --dump-dot-func with multiple specific functions (mangled names)
RUN: llvm-bolt %t.exe -o %t.bolt3 --dump-dot-func=_Z3addii,_Z8multiplyii -v=1 2>&1 | FileCheck %s --check-prefix=MULTI
# Test 4: No option specified should create no dot files
RUN: llvm-bolt %t.exe -o %t.bolt4 2>&1 | FileCheck %s --check-prefix=NONE
# Test 5: --dump-dot-func with non-existent function
RUN: llvm-bolt %t.exe -o %t.bolt5 --dump-dot-func=nonexistent -v=1 2>&1 | FileCheck %s --check-prefix=NONEXISTENT
# Test 6: Backward compatibility - --dump-dot-all should still work
RUN: llvm-bolt %t.exe -o %t.bolt6 --dump-dot-all -v=1 2>&1 | FileCheck %s --check-prefix=ALL
# Test 7: Test with unmangled function name (main function)
RUN: llvm-bolt %t.exe -o %t.bolt7 --dump-dot-func=main -v=1 2>&1 | FileCheck %s --check-prefix=MAIN-UNMANGLED
# Check that specific functions are dumped
ADD: BOLT-INFO: dumping CFG to _Z3addii-00_build-cfg.dot
ADD-NOT: BOLT-INFO: dumping CFG to main-00_build-cfg.dot
ADD-NOT: BOLT-INFO: dumping CFG to _Z8multiplyii-00_build-cfg.dot
ADD-NOT: BOLT-INFO: dumping CFG to _Z11main_helperv-00_build-cfg.dot
MAIN-REGEX-DAG: BOLT-INFO: dumping CFG to main-00_build-cfg.dot
MAIN-REGEX-NOT: BOLT-INFO: dumping CFG to _Z3addii-00_build-cfg.dot
MAIN-REGEX-NOT: BOLT-INFO: dumping CFG to _Z8multiplyii-00_build-cfg.dot
MULTI-DAG: BOLT-INFO: dumping CFG to _Z3addii-00_build-cfg.dot
MULTI-DAG: BOLT-INFO: dumping CFG to _Z8multiplyii-00_build-cfg.dot
MULTI-NOT: BOLT-INFO: dumping CFG to main-00_build-cfg.dot
MULTI-NOT: BOLT-INFO: dumping CFG to _Z11main_helperv-00_build-cfg.dot
# Should be no dumping messages when no option is specified
NONE-NOT: BOLT-INFO: dumping CFG
# Should be no dumping messages for non-existent function
NONEXISTENT-NOT: BOLT-INFO: dumping CFG
ALL: BOLT-INFO: dumping CFG to main-00_build-cfg.dot
MAIN-UNMANGLED: BOLT-INFO: dumping CFG to main-00_build-cfg.dot
MAIN-UNMANGLED-NOT: BOLT-INFO: dumping CFG to _Z3addii-00_build-cfg.dot
MAIN-UNMANGLED-NOT: BOLT-INFO: dumping CFG to _Z8multiplyii-00_build-cfg.dot

View File

@ -15,12 +15,14 @@ using namespace clang::ast_matchers;
namespace clang::tidy::bugprone {
namespace {
// Determine if the result of an expression is "stored" in some way.
// It is true if the value is stored into a variable or used as initialization
// or passed to a function or constructor.
// For this use case compound assignments are not counted as a "store" (the 'E'
// expression should have pointer type).
static bool isExprValueStored(const Expr *E, ASTContext &C) {
bool isExprValueStored(const Expr *E, ASTContext &C) {
E = E->IgnoreParenCasts();
// Get first non-paren, non-cast parent.
ParentMapContext &PMap = C.getParentMapContext();
@ -47,8 +49,6 @@ static bool isExprValueStored(const Expr *E, ASTContext &C) {
return isa<CallExpr, CXXConstructExpr>(ParentE);
}
namespace {
AST_MATCHER_P(CXXTryStmt, hasHandlerFor,
ast_matchers::internal::Matcher<QualType>, InnerMatcher) {
for (unsigned NH = Node.getNumHandlers(), I = 0; I < NH; ++I) {

View File

@ -14,8 +14,10 @@ using namespace clang::ast_matchers;
namespace clang::tidy::bugprone {
static bool isConcatenatedLiteralsOnPurpose(ASTContext *Ctx,
const StringLiteral *Lit) {
namespace {
bool isConcatenatedLiteralsOnPurpose(ASTContext *Ctx,
const StringLiteral *Lit) {
// String literals surrounded by parentheses are assumed to be on purpose.
// i.e.: const char* Array[] = { ("a" "b" "c"), "d", [...] };
@ -56,8 +58,6 @@ static bool isConcatenatedLiteralsOnPurpose(ASTContext *Ctx,
return false;
}
namespace {
AST_MATCHER_P(StringLiteral, isConcatenatedLiteral, unsigned,
MaxConcatenatedTokens) {
return Node.getNumConcatenated() > 1 &&

View File

@ -46,9 +46,7 @@ enum class ConversionKind {
ToLongDouble
};
} // namespace
static ConversionKind classifyConversionFunc(const FunctionDecl *FD) {
ConversionKind classifyConversionFunc(const FunctionDecl *FD) {
return llvm::StringSwitch<ConversionKind>(FD->getName())
.Cases("atoi", "atol", ConversionKind::ToInt)
.Case("atoll", ConversionKind::ToLongInt)
@ -56,8 +54,8 @@ static ConversionKind classifyConversionFunc(const FunctionDecl *FD) {
.Default(ConversionKind::None);
}
static ConversionKind classifyFormatString(StringRef Fmt, const LangOptions &LO,
const TargetInfo &TI) {
ConversionKind classifyFormatString(StringRef Fmt, const LangOptions &LO,
const TargetInfo &TI) {
// Scan the format string for the first problematic format specifier, then
// report that as the conversion type. This will miss additional conversion
// specifiers, but that is acceptable behavior.
@ -130,7 +128,7 @@ static ConversionKind classifyFormatString(StringRef Fmt, const LangOptions &LO,
return H.get();
}
static StringRef classifyConversionType(ConversionKind K) {
StringRef classifyConversionType(ConversionKind K) {
switch (K) {
case ConversionKind::None:
llvm_unreachable("Unexpected conversion kind");
@ -150,7 +148,7 @@ static StringRef classifyConversionType(ConversionKind K) {
llvm_unreachable("Unknown conversion kind");
}
static StringRef classifyReplacement(ConversionKind K) {
StringRef classifyReplacement(ConversionKind K) {
switch (K) {
case ConversionKind::None:
llvm_unreachable("Unexpected conversion kind");
@ -175,6 +173,7 @@ static StringRef classifyReplacement(ConversionKind K) {
}
llvm_unreachable("Unknown conversion kind");
}
} // unnamed namespace
void StrToNumCheck::check(const MatchFinder::MatchResult &Result) {
const auto *Call = Result.Nodes.getNodeAs<CallExpr>("expr");

View File

@ -59,9 +59,7 @@ AST_MATCHER(FunctionDecl, isPlacementOverload) {
return true;
}
} // namespace
static OverloadedOperatorKind getCorrespondingOverload(const FunctionDecl *FD) {
OverloadedOperatorKind getCorrespondingOverload(const FunctionDecl *FD) {
switch (FD->getOverloadedOperator()) {
default:
break;
@ -77,7 +75,7 @@ static OverloadedOperatorKind getCorrespondingOverload(const FunctionDecl *FD) {
llvm_unreachable("Not an overloaded allocation operator");
}
static const char *getOperatorName(OverloadedOperatorKind K) {
const char *getOperatorName(OverloadedOperatorKind K) {
switch (K) {
default:
break;
@ -93,14 +91,13 @@ static const char *getOperatorName(OverloadedOperatorKind K) {
llvm_unreachable("Not an overloaded allocation operator");
}
static bool areCorrespondingOverloads(const FunctionDecl *LHS,
const FunctionDecl *RHS) {
bool areCorrespondingOverloads(const FunctionDecl *LHS,
const FunctionDecl *RHS) {
return RHS->getOverloadedOperator() == getCorrespondingOverload(LHS);
}
static bool
hasCorrespondingOverloadInBaseClass(const CXXMethodDecl *MD,
const CXXRecordDecl *RD = nullptr) {
bool hasCorrespondingOverloadInBaseClass(const CXXMethodDecl *MD,
const CXXRecordDecl *RD = nullptr) {
if (RD) {
// Check the methods in the given class and accessible to derived classes.
for (const auto *BMD : RD->methods())
@ -127,6 +124,8 @@ hasCorrespondingOverloadInBaseClass(const CXXMethodDecl *MD,
return false;
}
} // anonymous namespace
void NewDeleteOverloadsCheck::registerMatchers(MatchFinder *Finder) {
// Match all operator new and operator delete overloads (including the array
// forms). Do not match implicit operators, placement operators, or

View File

@ -395,12 +395,16 @@ void MacroToEnumCallbacks::Endif(SourceLocation Loc, SourceLocation IfLoc) {
--CurrentFile->ConditionScopes;
}
namespace {
template <size_t N>
static bool textEquals(const char (&Needle)[N], const char *HayStack) {
bool textEquals(const char (&Needle)[N], const char *HayStack) {
return StringRef{HayStack, N - 1} == Needle;
}
template <size_t N> static size_t len(const char (&)[N]) { return N - 1; }
template <size_t N> size_t len(const char (&)[N]) { return N - 1; }
} // namespace
void MacroToEnumCallbacks::PragmaDirective(SourceLocation Loc,
PragmaIntroducerKind Introducer) {

View File

@ -16,13 +16,14 @@ using namespace clang::ast_matchers;
namespace clang::tidy::modernize {
static constexpr char ConstructorCall[] = "constructorCall";
static constexpr char ResetCall[] = "resetCall";
static constexpr char NewExpression[] = "newExpression";
namespace {
static std::string getNewExprName(const CXXNewExpr *NewExpr,
const SourceManager &SM,
const LangOptions &Lang) {
constexpr char ConstructorCall[] = "constructorCall";
constexpr char ResetCall[] = "resetCall";
constexpr char NewExpression[] = "newExpression";
std::string getNewExprName(const CXXNewExpr *NewExpr, const SourceManager &SM,
const LangOptions &Lang) {
StringRef WrittenName = Lexer::getSourceText(
CharSourceRange::getTokenRange(
NewExpr->getAllocatedTypeSourceInfo()->getTypeLoc().getSourceRange()),
@ -33,6 +34,8 @@ static std::string getNewExprName(const CXXNewExpr *NewExpr,
return WrittenName.str();
}
} // namespace
const char MakeSmartPtrCheck::PointerType[] = "pointerType";
MakeSmartPtrCheck::MakeSmartPtrCheck(StringRef Name, ClangTidyContext *Context,

View File

@ -19,7 +19,9 @@ using namespace clang::ast_matchers;
namespace clang::tidy::modernize {
static bool containsEscapes(StringRef HayStack, StringRef Escapes) {
namespace {
bool containsEscapes(StringRef HayStack, StringRef Escapes) {
size_t BackSlash = HayStack.find('\\');
if (BackSlash == StringRef::npos)
return false;
@ -33,16 +35,16 @@ static bool containsEscapes(StringRef HayStack, StringRef Escapes) {
return true;
}
static bool isRawStringLiteral(StringRef Text) {
bool isRawStringLiteral(StringRef Text) {
// Already a raw string literal if R comes before ".
const size_t QuotePos = Text.find('"');
assert(QuotePos != StringRef::npos);
return (QuotePos > 0) && (Text[QuotePos - 1] == 'R');
}
static bool containsEscapedCharacters(const MatchFinder::MatchResult &Result,
const StringLiteral *Literal,
const CharsBitSet &DisallowedChars) {
bool containsEscapedCharacters(const MatchFinder::MatchResult &Result,
const StringLiteral *Literal,
const CharsBitSet &DisallowedChars) {
// FIXME: Handle L"", u8"", u"" and U"" literals.
if (!Literal->isOrdinary())
return false;
@ -62,12 +64,14 @@ static bool containsEscapedCharacters(const MatchFinder::MatchResult &Result,
return containsEscapes(Text, R"('\"?x01)");
}
static bool containsDelimiter(StringRef Bytes, const std::string &Delimiter) {
bool containsDelimiter(StringRef Bytes, const std::string &Delimiter) {
return Bytes.find(Delimiter.empty()
? std::string(R"lit()")lit")
: (")" + Delimiter + R"(")")) != StringRef::npos;
}
} // namespace
RawStringLiteralCheck::RawStringLiteralCheck(StringRef Name,
ClangTidyContext *Context)
: ClangTidyCheck(Name, Context),

View File

@ -29,13 +29,12 @@
using namespace clang::ast_matchers;
namespace clang::tidy::objc {
namespace {
static constexpr StringRef WeakText = "__weak";
static constexpr StringRef StrongText = "__strong";
static constexpr StringRef UnsafeUnretainedText = "__unsafe_unretained";
namespace {
/// Matches ObjCIvarRefExpr, DeclRefExpr, or MemberExpr that reference
/// Objective-C object (or block) variables or fields whose object lifetimes
/// are not __unsafe_unretained.
@ -50,8 +49,6 @@ AST_POLYMORPHIC_MATCHER(isObjCManagedLifetime,
QT.getQualifiers().getObjCLifetime() > Qualifiers::OCL_ExplicitNone;
}
} // namespace
static std::optional<FixItHint>
fixItHintReplacementForOwnershipString(StringRef Text, CharSourceRange Range,
StringRef Ownership) {
@ -96,6 +93,8 @@ fixItHintForVarDecl(const VarDecl *VD, const SourceManager &SM,
return FixItHint::CreateInsertion(Range.getBegin(), "__unsafe_unretained ");
}
} // namespace
void NSInvocationArgumentLifetimeCheck::registerMatchers(MatchFinder *Finder) {
Finder->addMatcher(
traverse(

View File

@ -27,14 +27,11 @@ enum NamingStyle {
CategoryProperty = 2,
};
} // namespace
/// For now we will only fix 'CamelCase' or 'abc_CamelCase' property to
/// 'camelCase' or 'abc_camelCase'. For other cases the users need to
/// come up with a proper name by their own.
/// FIXME: provide fix for snake_case to snakeCase
static FixItHint generateFixItHint(const ObjCPropertyDecl *Decl,
NamingStyle Style) {
FixItHint generateFixItHint(const ObjCPropertyDecl *Decl, NamingStyle Style) {
auto Name = Decl->getName();
auto NewName = Decl->getName().str();
size_t Index = 0;
@ -53,7 +50,7 @@ static FixItHint generateFixItHint(const ObjCPropertyDecl *Decl,
return {};
}
static std::string validPropertyNameRegex(bool UsedInMatcher) {
std::string validPropertyNameRegex(bool UsedInMatcher) {
// Allow any of these names:
// foo
// fooBar
@ -75,13 +72,13 @@ static std::string validPropertyNameRegex(bool UsedInMatcher) {
return StartMatcher + "([a-z]|[A-Z][A-Z0-9])[a-z0-9A-Z]*$";
}
static bool hasCategoryPropertyPrefix(llvm::StringRef PropertyName) {
bool hasCategoryPropertyPrefix(llvm::StringRef PropertyName) {
auto RegexExp =
llvm::Regex("^[a-zA-Z][a-zA-Z0-9]*_[a-zA-Z0-9][a-zA-Z0-9_]+$");
return RegexExp.match(PropertyName);
}
static bool prefixedPropertyNameValid(llvm::StringRef PropertyName) {
bool prefixedPropertyNameValid(llvm::StringRef PropertyName) {
size_t Start = PropertyName.find_first_of('_');
assert(Start != llvm::StringRef::npos && Start + 1 < PropertyName.size());
auto Prefix = PropertyName.substr(0, Start);
@ -91,6 +88,7 @@ static bool prefixedPropertyNameValid(llvm::StringRef PropertyName) {
auto RegexExp = llvm::Regex(llvm::StringRef(validPropertyNameRegex(false)));
return RegexExp.match(PropertyName.substr(Start + 1));
}
} // namespace
void PropertyDeclarationCheck::registerMatchers(MatchFinder *Finder) {
Finder->addMatcher(objcPropertyDecl(

View File

@ -17,6 +17,7 @@
#include <optional>
namespace clang::tidy::performance {
namespace {
using namespace ::clang::ast_matchers;
using llvm::StringRef;
@ -29,8 +30,8 @@ static constexpr StringRef MethodDeclId = "methodDecl";
static constexpr StringRef FunctionDeclId = "functionDecl";
static constexpr StringRef OldVarDeclId = "oldVarDecl";
static void recordFixes(const VarDecl &Var, ASTContext &Context,
DiagnosticBuilder &Diagnostic) {
void recordFixes(const VarDecl &Var, ASTContext &Context,
DiagnosticBuilder &Diagnostic) {
Diagnostic << utils::fixit::changeVarDeclToReference(Var, Context);
if (!Var.getType().isLocalConstQualified()) {
if (std::optional<FixItHint> Fix = utils::fixit::addQualifierToVarDecl(
@ -39,8 +40,8 @@ static void recordFixes(const VarDecl &Var, ASTContext &Context,
}
}
static std::optional<SourceLocation> firstLocAfterNewLine(SourceLocation Loc,
SourceManager &SM) {
std::optional<SourceLocation> firstLocAfterNewLine(SourceLocation Loc,
SourceManager &SM) {
bool Invalid = false;
const char *TextAfter = SM.getCharacterData(Loc, &Invalid);
if (Invalid) {
@ -50,8 +51,8 @@ static std::optional<SourceLocation> firstLocAfterNewLine(SourceLocation Loc,
return Loc.getLocWithOffset(TextAfter[Offset] == '\0' ? Offset : Offset + 1);
}
static void recordRemoval(const DeclStmt &Stmt, ASTContext &Context,
DiagnosticBuilder &Diagnostic) {
void recordRemoval(const DeclStmt &Stmt, ASTContext &Context,
DiagnosticBuilder &Diagnostic) {
auto &SM = Context.getSourceManager();
// Attempt to remove trailing comments as well.
auto Tok = utils::lexer::findNextTokenSkippingComments(Stmt.getEndLoc(), SM,
@ -73,8 +74,6 @@ static void recordRemoval(const DeclStmt &Stmt, ASTContext &Context,
}
}
namespace {
AST_MATCHER_FUNCTION_P(StatementMatcher,
isRefReturningMethodCallWithConstOverloads,
std::vector<StringRef>, ExcludedContainerTypes) {
@ -131,8 +130,6 @@ AST_MATCHER_FUNCTION_P(StatementMatcher, initializerReturnsReferenceToConst,
hasUnaryOperand(OldVarDeclRef)))));
}
} // namespace
// This checks that the variable itself is only used as const, and also makes
// sure that it does not reference another variable that could be modified in
// the BlockStmt. It does this by checking the following:
@ -183,13 +180,13 @@ static bool isInitializingVariableImmutable(
return false;
}
static bool isVariableUnused(const VarDecl &Var, const Stmt &BlockStmt,
ASTContext &Context) {
bool isVariableUnused(const VarDecl &Var, const Stmt &BlockStmt,
ASTContext &Context) {
return allDeclRefExprs(Var, BlockStmt, Context).empty();
}
static const SubstTemplateTypeParmType *
getSubstitutedType(const QualType &Type, ASTContext &Context) {
const SubstTemplateTypeParmType *getSubstitutedType(const QualType &Type,
ASTContext &Context) {
auto Matches = match(
qualType(anyOf(substTemplateTypeParmType().bind("subst"),
hasDescendant(substTemplateTypeParmType().bind("subst")))),
@ -197,9 +194,9 @@ getSubstitutedType(const QualType &Type, ASTContext &Context) {
return selectFirst<SubstTemplateTypeParmType>("subst", Matches);
}
static bool differentReplacedTemplateParams(const QualType &VarType,
const QualType &InitializerType,
ASTContext &Context) {
bool differentReplacedTemplateParams(const QualType &VarType,
const QualType &InitializerType,
ASTContext &Context) {
if (const SubstTemplateTypeParmType *VarTmplType =
getSubstitutedType(VarType, Context)) {
if (const SubstTemplateTypeParmType *InitializerTmplType =
@ -215,8 +212,8 @@ static bool differentReplacedTemplateParams(const QualType &VarType,
return false;
}
static QualType constructorArgumentType(const VarDecl *OldVar,
const BoundNodes &Nodes) {
QualType constructorArgumentType(const VarDecl *OldVar,
const BoundNodes &Nodes) {
if (OldVar) {
return OldVar->getType();
}
@ -227,6 +224,8 @@ static QualType constructorArgumentType(const VarDecl *OldVar,
return MethodDecl->getReturnType();
}
} // namespace
UnnecessaryCopyInitialization::UnnecessaryCopyInitialization(
StringRef Name, ClangTidyContext *Context)
: ClangTidyCheck(Name, Context),

View File

@ -21,14 +21,16 @@ using namespace clang::ast_matchers;
namespace clang::tidy::performance {
static std::string paramNameOrIndex(StringRef Name, size_t Index) {
namespace {
std::string paramNameOrIndex(StringRef Name, size_t Index) {
return (Name.empty() ? llvm::Twine('#') + llvm::Twine(Index + 1)
: llvm::Twine('\'') + Name + llvm::Twine('\''))
.str();
}
static bool hasLoopStmtAncestor(const DeclRefExpr &DeclRef, const Decl &Decl,
ASTContext &Context) {
bool hasLoopStmtAncestor(const DeclRefExpr &DeclRef, const Decl &Decl,
ASTContext &Context) {
auto Matches = match(
traverse(TK_AsIs,
decl(forEachDescendant(declRefExpr(
@ -39,6 +41,8 @@ static bool hasLoopStmtAncestor(const DeclRefExpr &DeclRef, const Decl &Decl,
return Matches.empty();
}
} // namespace
UnnecessaryValueParamCheck::UnnecessaryValueParamCheck(
StringRef Name, ClangTidyContext *Context)
: ClangTidyCheck(Name, Context),

View File

@ -122,15 +122,15 @@ AST_MATCHER(EnumDecl, hasSequentialInitialValues) {
return !AllEnumeratorsArePowersOfTwo;
}
} // namespace
static std::string getName(const EnumDecl *Decl) {
std::string getName(const EnumDecl *Decl) {
if (!Decl->getDeclName())
return "<unnamed>";
return Decl->getQualifiedNameAsString();
}
} // namespace
EnumInitialValueCheck::EnumInitialValueCheck(StringRef Name,
ClangTidyContext *Context)
: ClangTidyCheck(Name, Context),

View File

@ -144,8 +144,6 @@ struct CognitiveComplexity final {
void account(SourceLocation Loc, unsigned short Nesting, Criteria C);
};
} // namespace
// All the possible messages that can be output. The choice of the message
// to use is based of the combination of the CognitiveComplexity::Criteria.
// It would be nice to have it in CognitiveComplexity struct, but then it is
@ -165,27 +163,23 @@ static const std::array<const StringRef, 4> Msgs = {{
}};
// Criteria is a bitset, thus a few helpers are needed.
static CognitiveComplexity::Criteria
operator|(CognitiveComplexity::Criteria LHS,
CognitiveComplexity::Criteria RHS) {
CognitiveComplexity::Criteria operator|(CognitiveComplexity::Criteria LHS,
CognitiveComplexity::Criteria RHS) {
return static_cast<CognitiveComplexity::Criteria>(llvm::to_underlying(LHS) |
llvm::to_underlying(RHS));
}
static CognitiveComplexity::Criteria
operator&(CognitiveComplexity::Criteria LHS,
CognitiveComplexity::Criteria RHS) {
CognitiveComplexity::Criteria operator&(CognitiveComplexity::Criteria LHS,
CognitiveComplexity::Criteria RHS) {
return static_cast<CognitiveComplexity::Criteria>(llvm::to_underlying(LHS) &
llvm::to_underlying(RHS));
}
static CognitiveComplexity::Criteria &
operator|=(CognitiveComplexity::Criteria &LHS,
CognitiveComplexity::Criteria RHS) {
CognitiveComplexity::Criteria &operator|=(CognitiveComplexity::Criteria &LHS,
CognitiveComplexity::Criteria RHS) {
LHS = operator|(LHS, RHS);
return LHS;
}
static CognitiveComplexity::Criteria &
operator&=(CognitiveComplexity::Criteria &LHS,
CognitiveComplexity::Criteria RHS) {
CognitiveComplexity::Criteria &operator&=(CognitiveComplexity::Criteria &LHS,
CognitiveComplexity::Criteria RHS) {
LHS = operator&(LHS, RHS);
return LHS;
}
@ -205,8 +199,6 @@ void CognitiveComplexity::account(SourceLocation Loc, unsigned short Nesting,
Total += Increase;
}
namespace {
class FunctionASTVisitor final
: public RecursiveASTVisitor<FunctionASTVisitor> {
using Base = RecursiveASTVisitor<FunctionASTVisitor>;

View File

@ -41,11 +41,9 @@ AST_MATCHER(Stmt, isNULLMacroExpansion) {
return isNULLMacroExpansion(&Node, Finder->getASTContext());
}
} // namespace
static StringRef getZeroLiteralToCompareWithForType(CastKind CastExprKind,
QualType Type,
ASTContext &Context) {
StringRef getZeroLiteralToCompareWithForType(CastKind CastExprKind,
QualType Type,
ASTContext &Context) {
switch (CastExprKind) {
case CK_IntegralToBoolean:
return Type->isUnsignedIntegerType() ? "0u" : "0";
@ -64,15 +62,15 @@ static StringRef getZeroLiteralToCompareWithForType(CastKind CastExprKind,
}
}
static bool isUnaryLogicalNotOperator(const Stmt *Statement) {
bool isUnaryLogicalNotOperator(const Stmt *Statement) {
const auto *UnaryOperatorExpr = dyn_cast<UnaryOperator>(Statement);
return UnaryOperatorExpr && UnaryOperatorExpr->getOpcode() == UO_LNot;
}
static void fixGenericExprCastToBool(DiagnosticBuilder &Diag,
const ImplicitCastExpr *Cast,
const Stmt *Parent, ASTContext &Context,
bool UseUpperCaseLiteralSuffix) {
void fixGenericExprCastToBool(DiagnosticBuilder &Diag,
const ImplicitCastExpr *Cast, const Stmt *Parent,
ASTContext &Context,
bool UseUpperCaseLiteralSuffix) {
// In case of expressions like (! integer), we should remove the redundant not
// operator and use inverted comparison (integer == 0).
bool InvertComparison =
@ -135,8 +133,8 @@ static void fixGenericExprCastToBool(DiagnosticBuilder &Diag,
Diag << FixItHint::CreateInsertion(EndLoc, EndLocInsertion);
}
static StringRef getEquivalentBoolLiteralForExpr(const Expr *Expression,
ASTContext &Context) {
StringRef getEquivalentBoolLiteralForExpr(const Expr *Expression,
ASTContext &Context) {
if (isNULLMacroExpansion(Expression, Context)) {
return "false";
}
@ -163,7 +161,7 @@ static StringRef getEquivalentBoolLiteralForExpr(const Expr *Expression,
return {};
}
static bool needsSpacePrefix(SourceLocation Loc, ASTContext &Context) {
bool needsSpacePrefix(SourceLocation Loc, ASTContext &Context) {
SourceRange PrefixRange(Loc.getLocWithOffset(-1), Loc);
StringRef SpaceBeforeStmtStr = Lexer::getSourceText(
CharSourceRange::getCharRange(PrefixRange), Context.getSourceManager(),
@ -175,10 +173,9 @@ static bool needsSpacePrefix(SourceLocation Loc, ASTContext &Context) {
return !AllowedCharacters.contains(SpaceBeforeStmtStr.back());
}
static void fixGenericExprCastFromBool(DiagnosticBuilder &Diag,
const ImplicitCastExpr *Cast,
ASTContext &Context,
StringRef OtherType) {
void fixGenericExprCastFromBool(DiagnosticBuilder &Diag,
const ImplicitCastExpr *Cast,
ASTContext &Context, StringRef OtherType) {
if (!Context.getLangOpts().CPlusPlus) {
Diag << FixItHint::CreateInsertion(Cast->getBeginLoc(),
(Twine("(") + OtherType + ")").str());
@ -203,9 +200,8 @@ static void fixGenericExprCastFromBool(DiagnosticBuilder &Diag,
}
}
static StringRef
getEquivalentForBoolLiteral(const CXXBoolLiteralExpr *BoolLiteral,
QualType DestType, ASTContext &Context) {
StringRef getEquivalentForBoolLiteral(const CXXBoolLiteralExpr *BoolLiteral,
QualType DestType, ASTContext &Context) {
// Prior to C++11, false literal could be implicitly converted to pointer.
if (!Context.getLangOpts().CPlusPlus11 &&
(DestType->isPointerType() || DestType->isMemberPointerType()) &&
@ -226,8 +222,8 @@ getEquivalentForBoolLiteral(const CXXBoolLiteralExpr *BoolLiteral,
return BoolLiteral->getValue() ? "1" : "0";
}
static bool isCastAllowedInCondition(const ImplicitCastExpr *Cast,
ASTContext &Context) {
bool isCastAllowedInCondition(const ImplicitCastExpr *Cast,
ASTContext &Context) {
std::queue<const Stmt *> Q;
Q.push(Cast);
@ -255,6 +251,8 @@ static bool isCastAllowedInCondition(const ImplicitCastExpr *Cast,
return false;
}
} // anonymous namespace
ImplicitBoolConversionCheck::ImplicitBoolConversionCheck(
StringRef Name, ClangTidyContext *Context)
: ClangTidyCheck(Name, Context),

View File

@ -28,11 +28,8 @@ AST_MATCHER_P(QualType, hasUnqualifiedType,
enum class Qualifier { Const, Volatile, Restrict };
} // namespace
static std::optional<Token>
findQualToken(const VarDecl *Decl, Qualifier Qual,
const MatchFinder::MatchResult &Result) {
std::optional<Token> findQualToken(const VarDecl *Decl, Qualifier Qual,
const MatchFinder::MatchResult &Result) {
// Since either of the locs can be in a macro, use `makeFileCharRange` to be
// sure that we have a consistent `CharSourceRange`, located entirely in the
// source file.
@ -61,7 +58,7 @@ findQualToken(const VarDecl *Decl, Qualifier Qual,
*Result.SourceManager);
}
static std::optional<SourceRange>
std::optional<SourceRange>
getTypeSpecifierLocation(const VarDecl *Var,
const MatchFinder::MatchResult &Result) {
SourceRange TypeSpecifier(
@ -76,8 +73,8 @@ getTypeSpecifierLocation(const VarDecl *Var,
return TypeSpecifier;
}
static std::optional<SourceRange>
mergeReplacementRange(SourceRange &TypeSpecifier, const Token &ConstToken) {
std::optional<SourceRange> mergeReplacementRange(SourceRange &TypeSpecifier,
const Token &ConstToken) {
if (TypeSpecifier.getBegin().getLocWithOffset(-1) == ConstToken.getEndLoc()) {
TypeSpecifier.setBegin(ConstToken.getLocation());
return std::nullopt;
@ -89,19 +86,21 @@ mergeReplacementRange(SourceRange &TypeSpecifier, const Token &ConstToken) {
return SourceRange(ConstToken.getLocation(), ConstToken.getEndLoc());
}
static bool isPointerConst(QualType QType) {
bool isPointerConst(QualType QType) {
QualType Pointee = QType->getPointeeType();
assert(!Pointee.isNull() && "can't have a null Pointee");
return Pointee.isConstQualified();
}
static bool isAutoPointerConst(QualType QType) {
bool isAutoPointerConst(QualType QType) {
QualType Pointee =
cast<AutoType>(QType->getPointeeType().getTypePtr())->desugar();
assert(!Pointee.isNull() && "can't have a null Pointee");
return Pointee.isConstQualified();
}
} // namespace
QualifiedAutoCheck::QualifiedAutoCheck(StringRef Name,
ClangTidyContext *Context)
: ClangTidyCheck(Name, Context),

View File

@ -14,18 +14,19 @@ using namespace clang::ast_matchers;
namespace clang::tidy::readability {
static const char *const RedundantReturnDiag =
"redundant return statement at the end "
"of a function with a void return type";
static const char *const RedundantContinueDiag =
"redundant continue statement at the "
"end of loop statement";
namespace {
static bool isLocationInMacroExpansion(const SourceManager &SM,
SourceLocation Loc) {
const char *const RedundantReturnDiag = "redundant return statement at the end "
"of a function with a void return type";
const char *const RedundantContinueDiag = "redundant continue statement at the "
"end of loop statement";
bool isLocationInMacroExpansion(const SourceManager &SM, SourceLocation Loc) {
return SM.isMacroBodyExpansion(Loc) || SM.isMacroArgExpansion(Loc);
}
} // namespace
void RedundantControlFlowCheck::registerMatchers(MatchFinder *Finder) {
Finder->addMatcher(
functionDecl(isDefinition(), returns(voidType()),

View File

@ -13,14 +13,16 @@
namespace clang::tidy::utils::type_traits {
static bool classHasTrivialCopyAndDestroy(QualType Type) {
namespace {
bool classHasTrivialCopyAndDestroy(QualType Type) {
auto *Record = Type->getAsCXXRecordDecl();
return Record && Record->hasDefinition() &&
!Record->hasNonTrivialCopyConstructor() &&
!Record->hasNonTrivialDestructor();
}
static bool hasDeletedCopyConstructor(QualType Type) {
bool hasDeletedCopyConstructor(QualType Type) {
auto *Record = Type->getAsCXXRecordDecl();
if (!Record || !Record->hasDefinition())
return false;
@ -31,6 +33,8 @@ static bool hasDeletedCopyConstructor(QualType Type) {
return false;
}
} // namespace
std::optional<bool> isExpensiveToCopy(QualType Type,
const ASTContext &Context) {
if (Type->isDependentType() || Type->isIncompleteType())

View File

@ -309,13 +309,6 @@ NVPTX Support
X86 Support
^^^^^^^^^^^
- More SSE, AVX and AVX512 intrinsics, including initializers and general
arithmetic can now be used in C++ constant expressions.
- Some SSE, AVX and AVX512 intrinsics have been converted to wrap
generic __builtin intrinsics.
- NOTE: Please avoid use of the __builtin_ia32_* intrinsics - these are not
guaranteed to exist in future releases, or match behaviour with previous
releases of clang or other compilers.
Arm and AArch64 Support
^^^^^^^^^^^^^^^^^^^^^^^

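Illustrative aside, not part of the diff: the release-note bullets above describe _mm* intrinsics usable in C++ constant expressions. A minimal example using one of the conversion intrinsics that this changeset covers with TEST_CONSTEXPR; it assumes a Clang new enough to mark these intrinsics constexpr and must be built with -mavx2 in C++ mode.

```cpp
// Hedged sketch: _mm256_cvtepi8_epi16 sign-extends 16 x i8 to 16 x i16 and,
// per the TEST_CONSTEXPR coverage later in this changeset, can be evaluated
// in a constant expression by sufficiently new Clang.
#include <immintrin.h>

constexpr __m256i Widened = _mm256_cvtepi8_epi16(
    _mm_setr_epi8(-3, 2, -1, 0, 1, -2, 3, -4, 5, -6, 7, -8, 9, -10, 11, -12));

// Prefer the _mm* wrappers over raw __builtin_ia32_* calls, which the note
// above says are not guaranteed to stay stable across releases.
static_assert(sizeof(Widened) == 32, "256-bit result");

int main() { return 0; }
```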
View File

@ -627,23 +627,11 @@ let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] i
let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
def pmuldq256 : X86Builtin<"_Vector<4, long long int>(_Vector<8, int>, _Vector<8, int>)">;
def pmuludq256 : X86Builtin<"_Vector<4, long long int>(_Vector<8, int>, _Vector<8, int>)">;
def pmulhuw256 : X86Builtin<"_Vector<16, unsigned short>(_Vector<16, unsigned short>, _Vector<16, unsigned short>)">;
def pmulhw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
def psllv8si : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
def psrav8si : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
def psrlv8si : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
def psllv4di : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
def psrlv4di : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
}
let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
def psllv4si : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
def psrav4si : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
def psrlv4si : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
def psllv2di : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
def psrlv2di : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
def pmulhuw256 : X86Builtin<"_Vector<16, unsigned short>(_Vector<16, unsigned short>, _Vector<16, unsigned short>)">;
def pmulhw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
}
let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
@ -666,6 +654,46 @@ let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
def maskstoreq : X86Builtin<"void(_Vector<2, long long int *>, _Vector<2, long long int>, _Vector<2, long long int>)">;
}
let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
def psllv8si : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
}
let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
def psllv4si : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
}
let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
def psllv4di : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
}
let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
def psllv2di : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
}
let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
def psrav8si : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
}
let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
def psrav4si : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
}
let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
def psrlv8si : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
}
let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
def psrlv4si : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
}
let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
def psrlv4di : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
}
let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
def psrlv2di : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
}
let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
def gatherd_pd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, double const *, _Vector<4, int>, _Vector<2, double>, _Constant char)">;
}

View File

@ -11669,24 +11669,13 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
case clang::X86::BI__builtin_ia32_pmulhuw512:
case clang::X86::BI__builtin_ia32_pmulhw128:
case clang::X86::BI__builtin_ia32_pmulhw256:
case clang::X86::BI__builtin_ia32_pmulhw512:
case clang::X86::BI__builtin_ia32_psllv2di:
case clang::X86::BI__builtin_ia32_psllv4di:
case clang::X86::BI__builtin_ia32_psllv4si:
case clang::X86::BI__builtin_ia32_psllv8si:
case clang::X86::BI__builtin_ia32_psrav4si:
case clang::X86::BI__builtin_ia32_psrav8si:
case clang::X86::BI__builtin_ia32_psrlv2di:
case clang::X86::BI__builtin_ia32_psrlv4di:
case clang::X86::BI__builtin_ia32_psrlv4si:
case clang::X86::BI__builtin_ia32_psrlv8si:{
case clang::X86::BI__builtin_ia32_pmulhw512: {
APValue SourceLHS, SourceRHS;
if (!EvaluateAsRValue(Info, E->getArg(0), SourceLHS) ||
!EvaluateAsRValue(Info, E->getArg(1), SourceRHS))
return false;
QualType DestEltTy = E->getType()->castAs<VectorType>()->getElementType();
bool DestUnsigned = DestEltTy->isUnsignedIntegerOrEnumerationType();
unsigned SourceLen = SourceLHS.getVectorLength();
SmallVector<APValue, 4> ResultElements;
ResultElements.reserve(SourceLen);
@ -11698,12 +11687,12 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
case Builtin::BI__builtin_elementwise_add_sat:
ResultElements.push_back(APValue(
APSInt(LHS.isSigned() ? LHS.sadd_sat(RHS) : LHS.uadd_sat(RHS),
DestUnsigned)));
DestEltTy->isUnsignedIntegerOrEnumerationType())));
break;
case Builtin::BI__builtin_elementwise_sub_sat:
ResultElements.push_back(APValue(
APSInt(LHS.isSigned() ? LHS.ssub_sat(RHS) : LHS.usub_sat(RHS),
DestUnsigned)));
DestEltTy->isUnsignedIntegerOrEnumerationType())));
break;
case clang::X86::BI__builtin_ia32_pmulhuw128:
case clang::X86::BI__builtin_ia32_pmulhuw256:
@ -11717,40 +11706,6 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
ResultElements.push_back(APValue(APSInt(llvm::APIntOps::mulhs(LHS, RHS),
/*isUnsigned=*/false)));
break;
case clang::X86::BI__builtin_ia32_psllv2di:
case clang::X86::BI__builtin_ia32_psllv4di:
case clang::X86::BI__builtin_ia32_psllv4si:
case clang::X86::BI__builtin_ia32_psllv8si:
if (RHS.uge(RHS.getBitWidth())) {
ResultElements.push_back(
APValue(APSInt(APInt::getZero(RHS.getBitWidth()), DestUnsigned)));
break;
}
ResultElements.push_back(
APValue(APSInt(LHS.shl(RHS.getZExtValue()), DestUnsigned)));
break;
case clang::X86::BI__builtin_ia32_psrav4si:
case clang::X86::BI__builtin_ia32_psrav8si:
if (RHS.uge(RHS.getBitWidth())) {
ResultElements.push_back(
APValue(APSInt(LHS.ashr(RHS.getBitWidth() - 1), DestUnsigned)));
break;
}
ResultElements.push_back(
APValue(APSInt(LHS.ashr(RHS.getZExtValue()), DestUnsigned)));
break;
case clang::X86::BI__builtin_ia32_psrlv2di:
case clang::X86::BI__builtin_ia32_psrlv4di:
case clang::X86::BI__builtin_ia32_psrlv4si:
case clang::X86::BI__builtin_ia32_psrlv8si:
if (RHS.uge(RHS.getBitWidth())) {
ResultElements.push_back(
APValue(APSInt(APInt::getZero(RHS.getBitWidth()), DestUnsigned)));
break;
}
ResultElements.push_back(
APValue(APSInt(LHS.lshr(RHS.getZExtValue()), DestUnsigned)));
break;
}
}

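Context note, not part of the changeset: the hunk above removes the constant-evaluator cases for the AVX2 variable-shift builtins. Below is a short standalone C++ sketch of the per-lane rule that code applied (helper names invented for illustration); the first printed line reproduces the values from the TEST_CONSTEXPR expectation for _mm_sllv_epi32 removed further down.

```cpp
// Standalone sketch (not LLVM code) of the per-lane behavior of the removed
// evaluator: a lane's shift count >= the element width yields 0 for
// psllv/psrlv and the sign fill for psrav.
#include <cstdint>
#include <cstdio>

int32_t sllv_lane(int32_t v, uint32_t count) {              // psllv
  return count >= 32 ? 0 : int32_t(uint32_t(v) << count);
}
int32_t srlv_lane(int32_t v, uint32_t count) {               // psrlv
  return count >= 32 ? 0 : int32_t(uint32_t(v) >> count);
}
int32_t srav_lane(int32_t v, uint32_t count) {               // psrav
  return v >> (count >= 32 ? 31 : count);  // arithmetic shift of a signed value
}

int main() {
  // Matches the removed TEST_CONSTEXPR expectation:
  // _mm_sllv_epi32({1,-2,3,-4}, {1,2,3,-4}) == {2, -8, 24, 0}.
  std::printf("%d %d %d %d\n", sllv_lane(1, 1), sllv_lane(-2, 2),
              sllv_lane(3, 3), sllv_lane(-4, uint32_t(-4)));
  // Out-of-range counts: psrav keeps the sign (-1), psrlv produces 0.
  std::printf("%d %d\n", srav_lane(-4, 40), srlv_lane(-4, 40));
  return 0;
}
```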
View File

@ -3721,7 +3721,7 @@ _mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y)
/// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
/// bits).
/// \returns A 256-bit vector of [8 x i32] containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_sllv_epi32(__m256i __X, __m256i __Y)
{
return (__m256i)__builtin_ia32_psllv8si((__v8si)__X, (__v8si)__Y);
@ -3743,7 +3743,7 @@ _mm256_sllv_epi32(__m256i __X, __m256i __Y)
/// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
/// bits).
/// \returns A 128-bit vector of [4 x i32] containing the result.
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_sllv_epi32(__m128i __X, __m128i __Y)
{
return (__m128i)__builtin_ia32_psllv4si((__v4si)__X, (__v4si)__Y);
@ -3765,7 +3765,7 @@ _mm_sllv_epi32(__m128i __X, __m128i __Y)
/// A 256-bit vector of [4 x i64] containing the unsigned shift counts (in
/// bits).
/// \returns A 256-bit vector of [4 x i64] containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_sllv_epi64(__m256i __X, __m256i __Y)
{
return (__m256i)__builtin_ia32_psllv4di((__v4di)__X, (__v4di)__Y);
@ -3787,7 +3787,7 @@ _mm256_sllv_epi64(__m256i __X, __m256i __Y)
/// A 128-bit vector of [2 x i64] containing the unsigned shift counts (in
/// bits).
/// \returns A 128-bit vector of [2 x i64] containing the result.
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_sllv_epi64(__m128i __X, __m128i __Y)
{
return (__m128i)__builtin_ia32_psllv2di((__v2di)__X, (__v2di)__Y);
@ -3810,7 +3810,7 @@ _mm_sllv_epi64(__m128i __X, __m128i __Y)
/// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
/// bits).
/// \returns A 256-bit vector of [8 x i32] containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_srav_epi32(__m256i __X, __m256i __Y)
{
return (__m256i)__builtin_ia32_psrav8si((__v8si)__X, (__v8si)__Y);
@ -3833,7 +3833,7 @@ _mm256_srav_epi32(__m256i __X, __m256i __Y)
/// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
/// bits).
/// \returns A 128-bit vector of [4 x i32] containing the result.
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_srav_epi32(__m128i __X, __m128i __Y)
{
return (__m128i)__builtin_ia32_psrav4si((__v4si)__X, (__v4si)__Y);
@ -3855,7 +3855,7 @@ _mm_srav_epi32(__m128i __X, __m128i __Y)
/// A 256-bit vector of [8 x i32] containing the unsigned shift counts (in
/// bits).
/// \returns A 256-bit vector of [8 x i32] containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_srlv_epi32(__m256i __X, __m256i __Y)
{
return (__m256i)__builtin_ia32_psrlv8si((__v8si)__X, (__v8si)__Y);
@ -3877,7 +3877,7 @@ _mm256_srlv_epi32(__m256i __X, __m256i __Y)
/// A 128-bit vector of [4 x i32] containing the unsigned shift counts (in
/// bits).
/// \returns A 128-bit vector of [4 x i32] containing the result.
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_srlv_epi32(__m128i __X, __m128i __Y)
{
return (__m128i)__builtin_ia32_psrlv4si((__v4si)__X, (__v4si)__Y);
@ -3899,7 +3899,7 @@ _mm_srlv_epi32(__m128i __X, __m128i __Y)
/// A 256-bit vector of [4 x i64] containing the unsigned shift counts (in
/// bits).
/// \returns A 256-bit vector of [4 x i64] containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_srlv_epi64(__m256i __X, __m256i __Y)
{
return (__m256i)__builtin_ia32_psrlv4di((__v4di)__X, (__v4di)__Y);
@ -3921,7 +3921,7 @@ _mm256_srlv_epi64(__m256i __X, __m256i __Y)
/// A 128-bit vector of [2 x i64] containing the unsigned shift counts (in
/// bits).
/// \returns A 128-bit vector of [2 x i64] containing the result.
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_srlv_epi64(__m128i __X, __m128i __Y)
{
return (__m128i)__builtin_ia32_psrlv2di((__v2di)__X, (__v2di)__Y);

View File

@ -327,6 +327,7 @@ __m256i test_mm256_cvtepi8_epi16(__m128i a) {
// CHECK: sext <16 x i8> %{{.*}} to <16 x i16>
return _mm256_cvtepi8_epi16(a);
}
TEST_CONSTEXPR(match_v16hi(_mm256_cvtepi8_epi16(_mm_setr_epi8(-3, 2, -1, 0, 1, -2, 3, -4, 5, -6, 7, -8, 9, -10, 11, -12)), -3, 2, -1, 0, 1, -2, 3, -4, 5, -6, 7, -8, 9, -10, 11, -12));
__m256i test_mm256_cvtepi8_epi32(__m128i a) {
@ -335,6 +336,7 @@ __m256i test_mm256_cvtepi8_epi32(__m128i a) {
// CHECK: sext <8 x i8> %{{.*}} to <8 x i32>
return _mm256_cvtepi8_epi32(a);
}
TEST_CONSTEXPR(match_v8si(_mm256_cvtepi8_epi32(_mm_setr_epi8(-3, 2, -1, 0, 1, -2, 3, -4, 5, -6, 7, -8, 9, -10, 11, -12)), -3, 2, -1, 0, 1, -2, 3, -4));
__m256i test_mm256_cvtepi8_epi64(__m128i a) {
@ -343,6 +345,7 @@ __m256i test_mm256_cvtepi8_epi64(__m128i a) {
// CHECK: sext <4 x i8> %{{.*}} to <4 x i64>
return _mm256_cvtepi8_epi64(a);
}
TEST_CONSTEXPR(match_v4di(_mm256_cvtepi8_epi64(_mm_setr_epi8(-3, 2, -1, 0, 1, -2, 3, -4, 5, -6, 7, -8, 9, -10, 11, -12)), -3, 2, -1, 0));
__m256i test_mm256_cvtepi16_epi32(__m128i a) {
@ -350,6 +353,7 @@ __m256i test_mm256_cvtepi16_epi32(__m128i a) {
// CHECK: sext <8 x i16> %{{.*}} to <8 x i32>
return _mm256_cvtepi16_epi32(a);
}
TEST_CONSTEXPR(match_v8si(_mm256_cvtepi16_epi32(_mm_setr_epi16(-300, 2, -1, 0, 1, -2, 3, -4)), -300, 2, -1, 0, 1, -2, 3, -4));
__m256i test_mm256_cvtepi16_epi64(__m128i a) {
@ -358,6 +362,7 @@ __m256i test_mm256_cvtepi16_epi64(__m128i a) {
// CHECK: sext <4 x i16> %{{.*}} to <4 x i64>
return _mm256_cvtepi16_epi64(a);
}
TEST_CONSTEXPR(match_v4di(_mm256_cvtepi16_epi64(_mm_setr_epi16(-300, 2, -1, 0, 1, -2, 3, -4)), -300, 2, -1, 0));
__m256i test_mm256_cvtepi32_epi64(__m128i a) {
@ -365,6 +370,7 @@ __m256i test_mm256_cvtepi32_epi64(__m128i a) {
// CHECK: sext <4 x i32> %{{.*}} to <4 x i64>
return _mm256_cvtepi32_epi64(a);
}
TEST_CONSTEXPR(match_v4di(_mm256_cvtepi32_epi64(_mm_setr_epi32(-70000, 2, -1, 0)), -70000, 2, -1, 0));
__m256i test_mm256_cvtepu8_epi16(__m128i a) {
@ -372,6 +378,7 @@ __m256i test_mm256_cvtepu8_epi16(__m128i a) {
// CHECK: zext <16 x i8> %{{.*}} to <16 x i16>
return _mm256_cvtepu8_epi16(a);
}
TEST_CONSTEXPR(match_v16hi(_mm256_cvtepu8_epi16(_mm_setr_epi8(-3, 2, -1, 0, 1, -2, 3, -4, 5, -6, 7, -8, 9, -10, 11, -12)), 253, 2, 255, 0, 1, 254, 3, 252, 5, 250, 7, 248, 9, 246, 11, 244));
__m256i test_mm256_cvtepu8_epi32(__m128i a) {
@ -380,6 +387,7 @@ __m256i test_mm256_cvtepu8_epi32(__m128i a) {
// CHECK: zext <8 x i8> %{{.*}} to <8 x i32>
return _mm256_cvtepu8_epi32(a);
}
TEST_CONSTEXPR(match_v8si(_mm256_cvtepu8_epi32(_mm_setr_epi8(-3, 2, -1, 0, 1, -2, 3, -4, 5, -6, 7, -8, 9, -10, 11, -12)), 253, 2, 255, 0, 1, 254, 3, 252));
__m256i test_mm256_cvtepu8_epi64(__m128i a) {
@ -388,6 +396,7 @@ __m256i test_mm256_cvtepu8_epi64(__m128i a) {
// CHECK: zext <4 x i8> %{{.*}} to <4 x i64>
return _mm256_cvtepu8_epi64(a);
}
TEST_CONSTEXPR(match_v4di(_mm256_cvtepu8_epi64(_mm_setr_epi8(-3, 2, -1, 0, 1, -2, 3, -4, 5, -6, 7, -8, 9, -10, 11, -12)), 253, 2, 255, 0));
__m256i test_mm256_cvtepu16_epi32(__m128i a) {
@ -395,6 +404,7 @@ __m256i test_mm256_cvtepu16_epi32(__m128i a) {
// CHECK: zext <8 x i16> {{.*}} to <8 x i32>
return _mm256_cvtepu16_epi32(a);
}
TEST_CONSTEXPR(match_v8si(_mm256_cvtepu16_epi32(_mm_setr_epi16(-300, 2, -1, 0, 1, -2, 3, -4)), 65236, 2, 65535, 0, 1, 65534, 3, 65532));
__m256i test_mm256_cvtepu16_epi64(__m128i a) {
@ -403,6 +413,7 @@ __m256i test_mm256_cvtepu16_epi64(__m128i a) {
// CHECK: zext <4 x i16> %{{.*}} to <4 x i64>
return _mm256_cvtepu16_epi64(a);
}
TEST_CONSTEXPR(match_v4di(_mm256_cvtepu16_epi64(_mm_setr_epi16(-300, 2, -1, 0, 1, -2, 3, -4)), 65236, 2, 65535, 0));
__m256i test_mm256_cvtepu32_epi64(__m128i a) {
@ -410,6 +421,7 @@ __m256i test_mm256_cvtepu32_epi64(__m128i a) {
// CHECK: zext <4 x i32> %{{.*}} to <4 x i64>
return _mm256_cvtepu32_epi64(a);
}
TEST_CONSTEXPR(match_v4di(_mm256_cvtepu32_epi64(_mm_setr_epi32(-70000, 2, -1, 0)), 4294897296, 2, 4294967295, 0));
__m128i test0_mm256_extracti128_si256_0(__m256i a) {
@ -1108,28 +1120,24 @@ __m128i test_mm_sllv_epi32(__m128i a, __m128i b) {
// CHECK: call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
return _mm_sllv_epi32(a, b);
}
TEST_CONSTEXPR(match_v4si(_mm_sllv_epi32((__m128i)(__v4si){1, -2, 3, -4}, (__m128i)(__v4si){1, 2, 3, -4}), 2, -8, 24, 0));
__m256i test_mm256_sllv_epi32(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_sllv_epi32
// CHECK: call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
return _mm256_sllv_epi32(a, b);
}
TEST_CONSTEXPR(match_v8si(_mm256_sllv_epi32((__m256i)(__v8si){1, -2, 3, -4, 5, -6, 7, -8}, (__m256i)(__v8si){1, 2, 3, 4, -17, 31, 33, 29}), 2, -8, 24, -64, 0, 0, 0, 0));
__m128i test_mm_sllv_epi64(__m128i a, __m128i b) {
// CHECK-LABEL: test_mm_sllv_epi64
// CHECK: call {{.*}}<2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
return _mm_sllv_epi64(a, b);
}
TEST_CONSTEXPR(match_m128i(_mm_sllv_epi64((__m128i)(__v2di){1, -3}, (__m128i)(__v2di){8, 63}), 256, 0x8000000000000000ULL));
__m256i test_mm256_sllv_epi64(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_sllv_epi64
// CHECK: call {{.*}}<4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %{{.*}}, <4 x i64> %{{.*}})
return _mm256_sllv_epi64(a, b);
}
TEST_CONSTEXPR(match_m256i(_mm256_sllv_epi64((__m256i)(__v4di){1, -2, 3, -4}, (__m256i)(__v4di){1, 2, 3, -4}), 2, -8, 24, 0));
__m256i test_mm256_sra_epi16(__m256i a, __m128i b) {
// CHECK-LABEL: test_mm256_sra_epi16
@ -1172,14 +1180,12 @@ __m128i test_mm_srav_epi32(__m128i a, __m128i b) {
// CHECK: call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
return _mm_srav_epi32(a, b);
}
TEST_CONSTEXPR(match_v4si(_mm_srav_epi32((__m128i)(__v4si){1, -2, 3, -4}, (__m128i)(__v4si){1, 2, 3, -4}), 0, -1, 0, -1));
__m256i test_mm256_srav_epi32(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_srav_epi32
// CHECK: call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
return _mm256_srav_epi32(a, b);
}
TEST_CONSTEXPR(match_v8si(_mm256_srav_epi32((__m256i)(__v8si){1, -2, 3, -4, 5, -6, 7, -8}, (__m256i)(__v8si){1, 2, 3, 4, -17, 31, 33, 29}), 0, -1, 0, -1, 0, -1, 0, -1));
__m256i test_mm256_srl_epi16(__m256i a, __m128i b) {
// CHECK-LABEL: test_mm256_srl_epi16
@ -1246,28 +1252,24 @@ __m128i test_mm_srlv_epi32(__m128i a, __m128i b) {
// CHECK: call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %{{.*}}, <4 x i32> %{{.*}})
return _mm_srlv_epi32(a, b);
}
TEST_CONSTEXPR(match_v4si(_mm_srlv_epi32((__m128i)(__v4si){1, -2, 3, -4}, (__m128i)(__v4si){1, 2, 3, -4}), 0, 1073741823, 0, 0));
__m256i test_mm256_srlv_epi32(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_srlv_epi32
// CHECK: call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}})
return _mm256_srlv_epi32(a, b);
}
TEST_CONSTEXPR(match_v8si(_mm256_srlv_epi32((__m256i)(__v8si){1, -2, 3, -4, 5, -6, 7, -8}, (__m256i)(__v8si){1, 2, 3, 4, -17, 31, 33, 29}), 0, 1073741823, 0, 268435455, 0, 1, 0, 7));
__m128i test_mm_srlv_epi64(__m128i a, __m128i b) {
// CHECK-LABEL: test_mm_srlv_epi64
// CHECK: call {{.*}}<2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
return _mm_srlv_epi64(a, b);
}
TEST_CONSTEXPR(match_m128i(_mm_srlv_epi64((__m128i)(__v2di){1, -3}, (__m128i)(__v2di){8, 63}), 0, 1));
__m256i test_mm256_srlv_epi64(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_srlv_epi64
// CHECK: call {{.*}}<4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %{{.*}}, <4 x i64> %{{.*}})
return _mm256_srlv_epi64(a, b);
}
TEST_CONSTEXPR(match_m256i(_mm256_srlv_epi64((__m256i)(__v4di){1, -2, 3, -4}, (__m256i)(__v4di){1, 2, 3, -4}), 0, 0x3FFFFFFFFFFFFFFFULL, 0, 0));
__m256i test_mm256_stream_load_si256(__m256i const *a) {
// CHECK-LABEL: test_mm256_stream_load_si256

View File

@ -106,12 +106,6 @@ if(MSVC)
endif()
set(ASAN_CFLAGS ${SANITIZER_COMMON_CFLAGS})
# Win/ASan relies on the runtime functions being hotpatchable. See
# https://github.com/llvm/llvm-project/pull/149444
if(MSVC)
list(APPEND ASAN_CFLAGS /hotpatch)
endif()
append_list_if(MSVC /Zl ASAN_CFLAGS)
set(ASAN_COMMON_DEFINITIONS "")

View File

@ -792,7 +792,7 @@ static void PrintNoOriginTrackingWarning() {
static void PrintNoTaintWarning(const void *address) {
Decorator d;
Printf(" %sDFSan: no tainted value at %x%s\n", d.Warning(), address,
Printf(" %sDFSan: no tainted value at %zx%s\n", d.Warning(), (uptr)address,
d.Default());
}

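Aside, not sanitizer code: this hunk and the HWASan hunks that follow apply the same pattern, matching each Printf conversion to an argument of the expected type, e.g. %p with an explicit (void *) cast, or a cast to an integer of the right width instead of passing a pointer to %x. A minimal illustration:

```cpp
// Minimal illustration (not from the diff) of the format-fix pattern:
// print pointers with %p, or cast them to a size_t-width integer for %zx,
// rather than passing a pointer where %x expects an unsigned int.
#include <cstdint>
#include <cstdio>

int main() {
  const int object = 42;
  const void *address = &object;

  // Before: printf("no tainted value at %x\n", address);  // mismatched types
  std::printf("no tainted value at %zx\n",
              static_cast<size_t>(reinterpret_cast<uintptr_t>(address)));
  std::printf("no tainted value at %p\n", address);
  return 0;
}
```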
View File

@ -176,7 +176,7 @@ static void HwasanFormatMemoryUsage(InternalScopedString &s) {
"HWASAN pid: %d rss: %zd threads: %zd stacks: %zd"
" thr_aux: %zd stack_depot: %zd uniq_stacks: %zd"
" heap: %zd",
internal_getpid(), GetRSS(), thread_stats.n_live_threads,
(int)internal_getpid(), GetRSS(), thread_stats.n_live_threads,
thread_stats.total_stack_size,
thread_stats.n_live_threads * thread_list.MemoryUsedPerThread(),
sds.allocated, sds.n_uniq_ids, asc[AllocatorStatMapped]);
@ -692,7 +692,7 @@ void __hwasan_handle_longjmp(const void *sp_dst) {
"WARNING: HWASan is ignoring requested __hwasan_handle_longjmp: "
"stack top: %p; target %p; distance: %p (%zd)\n"
"False positive error reports may follow\n",
(void *)sp, (void *)dst, dst - sp, dst - sp);
(void *)sp, (void *)dst, (void *)(dst - sp), dst - sp);
return;
}
TagMemory(sp, dst - sp, 0);

View File

@ -41,7 +41,7 @@ static inline bool malloc_bisect(StackTrace *stack, uptr orig_size) {
if (h < left || h > right)
return false;
if (flags()->malloc_bisect_dump) {
Printf("[alloc] %u %zu\n", h, orig_size);
Printf("[alloc] %u %zu\n", (u32)h, orig_size);
stack->Print();
}
return true;

View File

@ -306,8 +306,9 @@ static void PrintStackAllocations(const StackAllocationsRingBuffer *sa,
"%p is located %zd bytes %s a %zd-byte local variable %s "
"[%p,%p) "
"in %s %s\n",
untagged_addr, offset, whence, local.size, local.name, best_beg,
best_beg + local.size, local.function_name, location.data());
(void *)untagged_addr, offset, whence, local.size, local.name,
(void *)best_beg, (void *)(best_beg + local.size),
local.function_name, location.data());
location.clear();
Printf("%s\n", d.Default());
}
@ -738,8 +739,8 @@ void BaseReport::PrintHeapOrGlobalCandidate() const {
Printf("%s", d.Location());
Printf("%p is located %zd bytes %s a %zd-byte region [%p,%p)\n",
untagged_addr, offset, whence,
candidate.heap.end - candidate.heap.begin, candidate.heap.begin,
candidate.heap.end);
candidate.heap.end - candidate.heap.begin,
(void *)candidate.heap.begin, (void *)candidate.heap.end);
Printf("%s", d.Allocation());
Printf("allocated by thread T%u here:\n", candidate.heap.thread_id);
Printf("%s", d.Default());
@ -762,11 +763,11 @@ void BaseReport::PrintHeapOrGlobalCandidate() const {
Printf(
"%p is located %zd bytes %s a %zd-byte global variable "
"%s [%p,%p) in %s\n",
untagged_addr,
(void *)untagged_addr,
candidate.after ? untagged_addr - (info.start + info.size)
: info.start - untagged_addr,
candidate.after ? "after" : "before", info.size, info.name,
info.start, info.start + info.size, module_name);
(void *)info.start, (void *)(info.start + info.size), module_name);
} else {
uptr size = GetGlobalSizeFromDescriptor(candidate.untagged_addr);
if (size == 0)
@ -774,14 +775,14 @@ void BaseReport::PrintHeapOrGlobalCandidate() const {
Printf(
"%p is located %s a global variable in "
"\n #0 0x%x (%s+0x%x)\n",
untagged_addr, candidate.after ? "after" : "before",
candidate.untagged_addr, module_name, module_address);
(void *)untagged_addr, candidate.after ? "after" : "before",
(void *)candidate.untagged_addr, module_name, (u32)module_address);
else
Printf(
"%p is located %s a %zd-byte global variable in "
"\n #0 0x%x (%s+0x%x)\n",
untagged_addr, candidate.after ? "after" : "before", size,
candidate.untagged_addr, module_name, module_address);
(void *)untagged_addr, candidate.after ? "after" : "before", size,
(void *)candidate.untagged_addr, module_name, (u32)module_address);
}
Printf("%s", d.Default());
}
@ -792,8 +793,8 @@ void BaseReport::PrintAddressDescription() const {
int num_descriptions_printed = 0;
if (MemIsShadow(untagged_addr)) {
Printf("%s%p is HWAsan shadow memory.\n%s", d.Location(), untagged_addr,
d.Default());
Printf("%s%p is HWAsan shadow memory.\n%s", d.Location(),
(void *)untagged_addr, d.Default());
return;
}
@ -802,7 +803,7 @@ void BaseReport::PrintAddressDescription() const {
Printf(
"%s[%p,%p) is a %s %s heap chunk; "
"size: %zd offset: %zd\n%s",
d.Location(), heap.begin, heap.begin + heap.size,
d.Location(), (void *)heap.begin, (void *)(heap.begin + heap.size),
heap.from_small_heap ? "small" : "large",
heap.is_allocated ? "allocated" : "unallocated", heap.size,
untagged_addr - heap.begin, d.Default());
@ -821,8 +822,8 @@ void BaseReport::PrintAddressDescription() const {
Printf("%s", d.Error());
Printf("\nCause: stack tag-mismatch\n");
Printf("%s", d.Location());
Printf("Address %p is located in stack of thread T%zd\n", untagged_addr,
sa.thread_id());
Printf("Address %p is located in stack of thread T%zd\n",
(void *)untagged_addr, (ssize)sa.thread_id());
Printf("%s", d.Default());
announce_by_id(sa.thread_id());
PrintStackAllocations(sa.get(), ptr_tag, untagged_addr);
@ -842,9 +843,9 @@ void BaseReport::PrintAddressDescription() const {
Printf("\nCause: use-after-free\n");
Printf("%s", d.Location());
Printf("%p is located %zd bytes inside a %zd-byte region [%p,%p)\n",
untagged_addr, untagged_addr - UntagAddr(har.tagged_addr),
har.requested_size, UntagAddr(har.tagged_addr),
UntagAddr(har.tagged_addr) + har.requested_size);
(void *)untagged_addr, untagged_addr - UntagAddr(har.tagged_addr),
(ssize)har.requested_size, UntagAddr(har.tagged_addr),
(void *)(UntagAddr(har.tagged_addr) + har.requested_size));
Printf("%s", d.Allocation());
Printf("freed by thread T%u here:\n", ha.free_thread_id);
Printf("%s", d.Default());
@ -858,7 +859,7 @@ void BaseReport::PrintAddressDescription() const {
// Print a developer note: the index of this heap object
// in the thread's deallocation ring buffer.
Printf("hwasan_dev_note_heap_rb_distance: %zd %zd\n", ha.ring_index + 1,
flags()->heap_history_size);
(ssize)flags()->heap_history_size);
Printf("hwasan_dev_note_num_matching_addrs: %zd\n", ha.num_matching_addrs);
Printf("hwasan_dev_note_num_matching_addrs_4b: %zd\n",
ha.num_matching_addrs_4b);
@ -915,10 +916,11 @@ InvalidFreeReport::~InvalidFreeReport() {
const Thread *thread = GetCurrentThread();
if (thread) {
Report("ERROR: %s: %s on address %p at pc %p on thread T%zd\n",
SanitizerToolName, bug_type, untagged_addr, pc, thread->unique_id());
SanitizerToolName, bug_type, (void *)untagged_addr, (void *)pc,
(ssize)thread->unique_id());
} else {
Report("ERROR: %s: %s on address %p at pc %p on unknown thread\n",
SanitizerToolName, bug_type, untagged_addr, pc);
SanitizerToolName, bug_type, (void *)untagged_addr, (void *)pc);
}
Printf("%s", d.Access());
if (shadow.addr) {
@ -967,7 +969,8 @@ TailOverwrittenReport::~TailOverwrittenReport() {
Printf("%s", d.Error());
const char *bug_type = "allocation-tail-overwritten";
Report("ERROR: %s: %s; heap object [%p,%p) of size %zd\n", SanitizerToolName,
bug_type, untagged_addr, untagged_addr + orig_size, orig_size);
bug_type, (void *)untagged_addr, (void *)(untagged_addr + orig_size),
orig_size);
Printf("\n%s", d.Default());
Printf(
"Stack of invalid access unknown. Issue detected at deallocation "
@ -1037,7 +1040,7 @@ TagMismatchReport::~TagMismatchReport() {
uptr pc = GetTopPc(stack);
Printf("%s", d.Error());
Report("ERROR: %s: %s on address %p at pc %p\n", SanitizerToolName, bug_type,
untagged_addr, pc);
(void *)untagged_addr, (void *)pc);
Thread *t = GetCurrentThread();
@ -1049,12 +1052,12 @@ TagMismatchReport::~TagMismatchReport() {
GetShortTagCopy(MemToShadow(untagged_addr + mismatch_offset));
Printf(
"%s of size %zu at %p tags: %02x/%02x(%02x) (ptr/mem) in thread T%zd\n",
is_store ? "WRITE" : "READ", access_size, untagged_addr, ptr_tag,
mem_tag, short_tag, t->unique_id());
is_store ? "WRITE" : "READ", access_size, (void *)untagged_addr,
ptr_tag, mem_tag, short_tag, (ssize)t->unique_id());
} else {
Printf("%s of size %zu at %p tags: %02x/%02x (ptr/mem) in thread T%zd\n",
is_store ? "WRITE" : "READ", access_size, untagged_addr, ptr_tag,
mem_tag, t->unique_id());
is_store ? "WRITE" : "READ", access_size, (void *)untagged_addr,
ptr_tag, mem_tag, (ssize)t->unique_id());
}
if (mismatch_offset)
Printf("Invalid access starting at offset %zu\n", mismatch_offset);
@ -1093,7 +1096,7 @@ void ReportTagMismatch(StackTrace *stack, uptr tagged_addr, uptr access_size,
// See the frame breakdown defined in __hwasan_tag_mismatch (from
// hwasan_tag_mismatch_{aarch64,riscv64}.S).
void ReportRegisters(const uptr *frame, uptr pc) {
Printf("\nRegisters where the failure occurred (pc %p):\n", pc);
Printf("\nRegisters where the failure occurred (pc %p):\n", (void *)pc);
// We explicitly print a single line (4 registers/line) each iteration to
// reduce the amount of logcat error messages printed. Each Printf() will

View File

@ -173,9 +173,10 @@ uptr Thread::stack_size() {
}
void Thread::Print(const char *Prefix) {
Printf("%sT%zd %p stack: [%p,%p) sz: %zd tls: [%p,%p)\n", Prefix, unique_id_,
(void *)this, stack_bottom(), stack_top(),
stack_top() - stack_bottom(), tls_begin(), tls_end());
Printf("%sT%zd %p stack: [%p,%p) sz: %zd tls: [%p,%p)\n", Prefix,
(ssize_t)unique_id_, (void *)this, (void *)stack_bottom(),
(void *)stack_top(), stack_top() - stack_bottom(), (void *)tls_begin(),
(void *)tls_end());
}
static u32 xorshift(u32 state) {

View File

@ -806,7 +806,7 @@ static bool ReportUnsuspendedThreads(
succeded = false;
Report(
"Running thread %zu was not suspended. False leaks are possible.\n",
os_id);
(usize)os_id);
}
}
return succeded;

View File

@ -29,7 +29,7 @@ static void ProtectGap(uptr addr, uptr size) {
Printf("protect_shadow_gap=0:"
" not protecting shadow gap, allocating gap's shadow\n"
"|| `[%p, %p]` || ShadowGap's shadow ||\n",
GapShadowBeg, GapShadowEnd);
(void *)GapShadowBeg, (void *)GapShadowEnd);
ReserveShadowMemoryRange(GapShadowBeg, GapShadowEnd,
"unprotected gap shadow");
return;

View File

@ -105,7 +105,7 @@ __xray_register_sleds(const XRaySledEntry *SledsBegin,
}
if (Verbosity())
Report("Registering %d new functions!\n", SledMap.Functions);
Report("Registering %d new functions!\n", (int)SledMap.Functions);
{
SpinMutexLock Guard(&XRayInstrMapMutex);

View File

@ -308,7 +308,8 @@ XRayPatchingStatus controlPatchingObjectUnchecked(bool Enable, int32_t ObjId) {
return XRayPatchingStatus::NOT_INITIALIZED;
if (Verbosity())
Report("Patching object %d with %d functions.\n", ObjId, InstrMap.Entries);
Report("Patching object %d with %d functions.\n", ObjId,
(int)InstrMap.Entries);
// Check if the corresponding DSO has been unloaded.
if (!InstrMap.Loaded) {

View File

@ -280,7 +280,6 @@ set(TARGET_LIBC_ENTRYPOINTS
set(TARGET_LIBM_ENTRYPOINTS
# math.h entrypoints
libc.src.math.acos
libc.src.math.acosf
libc.src.math.acoshf
libc.src.math.asin

View File

@ -2432,6 +2432,14 @@ functions:
return_type: double
arguments:
- type: double
- name: sincosf
standards:
- gnu
return_type: void
arguments:
- type: float
- type: float *
- type: float *
- name: sinf
standards:
- stdc
@ -2445,22 +2453,6 @@ functions:
arguments:
- type: _Float16
guard: LIBC_TYPES_HAS_FLOAT16
- name: sincos
standards:
- gnu
return_type: void
arguments:
- type: double
- type: double *
- type: double *
- name: sincosf
standards:
- gnu
return_type: void
arguments:
- type: float
- type: float *
- type: float *
- name: sinhf
standards:
- stdc

View File

@ -978,11 +978,11 @@ public:
# ifndef _LIBCPP_CXX03_LANG
_LIBCPP_HIDE_FROM_ABI map(map&& __m) = default;
_LIBCPP_HIDE_FROM_ABI map(map&& __m) noexcept(is_nothrow_move_constructible<__base>::value) = default;
_LIBCPP_HIDE_FROM_ABI map(map&& __m, const allocator_type& __a);
_LIBCPP_HIDE_FROM_ABI map& operator=(map&& __m) = default;
_LIBCPP_HIDE_FROM_ABI map& operator=(map&& __m) noexcept(is_nothrow_move_assignable<__base>::value) = default;
_LIBCPP_HIDE_FROM_ABI map(initializer_list<value_type> __il, const key_compare& __comp = key_compare())
: __tree_(__vc(__comp)) {
@ -1646,11 +1646,12 @@ public:
# ifndef _LIBCPP_CXX03_LANG
_LIBCPP_HIDE_FROM_ABI multimap(multimap&& __m) = default;
_LIBCPP_HIDE_FROM_ABI multimap(multimap&& __m) noexcept(is_nothrow_move_constructible<__base>::value) = default;
_LIBCPP_HIDE_FROM_ABI multimap(multimap&& __m, const allocator_type& __a);
_LIBCPP_HIDE_FROM_ABI multimap& operator=(multimap&& __m) = default;
_LIBCPP_HIDE_FROM_ABI multimap&
operator=(multimap&& __m) noexcept(is_nothrow_move_assignable<__base>::value) = default;
_LIBCPP_HIDE_FROM_ABI multimap(initializer_list<value_type> __il, const key_compare& __comp = key_compare())
: __tree_(__vc(__comp)) {

View File

@ -667,7 +667,7 @@ public:
_LIBCPP_HIDE_FROM_ABI set& operator=(const set& __s) = default;
# ifndef _LIBCPP_CXX03_LANG
_LIBCPP_HIDE_FROM_ABI set(set&& __s) = default;
_LIBCPP_HIDE_FROM_ABI set(set&& __s) noexcept(is_nothrow_move_constructible<__base>::value) = default;
# endif // _LIBCPP_CXX03_LANG
_LIBCPP_HIDE_FROM_ABI explicit set(const allocator_type& __a) : __tree_(__a) {}
@ -699,7 +699,10 @@ public:
return *this;
}
_LIBCPP_HIDE_FROM_ABI set& operator=(set&& __s) = default;
_LIBCPP_HIDE_FROM_ABI set& operator=(set&& __s) noexcept(is_nothrow_move_assignable<__base>::value) {
__tree_ = std::move(__s.__tree_);
return *this;
}
# endif // _LIBCPP_CXX03_LANG
_LIBCPP_HIDE_FROM_ABI ~set() { static_assert(sizeof(std::__diagnose_non_const_comparator<_Key, _Compare>()), ""); }
@ -1123,7 +1126,7 @@ public:
_LIBCPP_HIDE_FROM_ABI multiset& operator=(const multiset& __s) = default;
# ifndef _LIBCPP_CXX03_LANG
_LIBCPP_HIDE_FROM_ABI multiset(multiset&& __s) = default;
_LIBCPP_HIDE_FROM_ABI multiset(multiset&& __s) noexcept(is_nothrow_move_constructible<__base>::value) = default;
_LIBCPP_HIDE_FROM_ABI multiset(multiset&& __s, const allocator_type& __a);
# endif // _LIBCPP_CXX03_LANG
@ -1155,7 +1158,10 @@ public:
return *this;
}
_LIBCPP_HIDE_FROM_ABI multiset& operator=(multiset&& __s) = default;
_LIBCPP_HIDE_FROM_ABI multiset& operator=(multiset&& __s) _NOEXCEPT_(is_nothrow_move_assignable<__base>::value) {
__tree_ = std::move(__s.__tree_);
return *this;
}
# endif // _LIBCPP_CXX03_LANG
_LIBCPP_HIDE_FROM_ABI ~multiset() {

View File

@ -1049,7 +1049,8 @@ public:
_LIBCPP_HIDE_FROM_ABI unordered_map(const unordered_map& __u) = default;
_LIBCPP_HIDE_FROM_ABI unordered_map(const unordered_map& __u, const allocator_type& __a);
# ifndef _LIBCPP_CXX03_LANG
_LIBCPP_HIDE_FROM_ABI unordered_map(unordered_map&& __u) = default;
_LIBCPP_HIDE_FROM_ABI unordered_map(unordered_map&& __u)
_NOEXCEPT_(is_nothrow_move_constructible<__table>::value) = default;
_LIBCPP_HIDE_FROM_ABI unordered_map(unordered_map&& __u, const allocator_type& __a);
_LIBCPP_HIDE_FROM_ABI unordered_map(initializer_list<value_type> __il);
_LIBCPP_HIDE_FROM_ABI
@ -1101,7 +1102,8 @@ public:
_LIBCPP_HIDE_FROM_ABI unordered_map& operator=(const unordered_map& __u) = default;
# ifndef _LIBCPP_CXX03_LANG
_LIBCPP_HIDE_FROM_ABI unordered_map& operator=(unordered_map&& __u) = default;
_LIBCPP_HIDE_FROM_ABI unordered_map& operator=(unordered_map&& __u)
_NOEXCEPT_(is_nothrow_move_assignable<__table>::value) = default;
_LIBCPP_HIDE_FROM_ABI unordered_map& operator=(initializer_list<value_type> __il);
# endif // _LIBCPP_CXX03_LANG
@ -1821,7 +1823,8 @@ public:
_LIBCPP_HIDE_FROM_ABI unordered_multimap(const unordered_multimap& __u) = default;
_LIBCPP_HIDE_FROM_ABI unordered_multimap(const unordered_multimap& __u, const allocator_type& __a);
# ifndef _LIBCPP_CXX03_LANG
_LIBCPP_HIDE_FROM_ABI unordered_multimap(unordered_multimap&& __u) = default;
_LIBCPP_HIDE_FROM_ABI unordered_multimap(unordered_multimap&& __u)
_NOEXCEPT_(is_nothrow_move_constructible<__table>::value) = default;
_LIBCPP_HIDE_FROM_ABI unordered_multimap(unordered_multimap&& __u, const allocator_type& __a);
_LIBCPP_HIDE_FROM_ABI unordered_multimap(initializer_list<value_type> __il);
_LIBCPP_HIDE_FROM_ABI unordered_multimap(
@ -1873,7 +1876,8 @@ public:
_LIBCPP_HIDE_FROM_ABI unordered_multimap& operator=(const unordered_multimap& __u) = default;
# ifndef _LIBCPP_CXX03_LANG
_LIBCPP_HIDE_FROM_ABI unordered_multimap& operator=(unordered_multimap&& __u) = default;
_LIBCPP_HIDE_FROM_ABI unordered_multimap& operator=(unordered_multimap&& __u)
_NOEXCEPT_(is_nothrow_move_assignable<__table>::value) = default;
_LIBCPP_HIDE_FROM_ABI unordered_multimap& operator=(initializer_list<value_type> __il);
# endif // _LIBCPP_CXX03_LANG

View File

@ -706,7 +706,7 @@ public:
_LIBCPP_HIDE_FROM_ABI unordered_set(const unordered_set& __u) = default;
_LIBCPP_HIDE_FROM_ABI unordered_set(const unordered_set& __u, const allocator_type& __a);
# ifndef _LIBCPP_CXX03_LANG
_LIBCPP_HIDE_FROM_ABI unordered_set(unordered_set&& __u) = default;
_LIBCPP_HIDE_FROM_ABI unordered_set(unordered_set&& __u) _NOEXCEPT_(is_nothrow_move_constructible<__table>::value);
_LIBCPP_HIDE_FROM_ABI unordered_set(unordered_set&& __u, const allocator_type& __a);
_LIBCPP_HIDE_FROM_ABI unordered_set(initializer_list<value_type> __il);
_LIBCPP_HIDE_FROM_ABI
@ -735,7 +735,8 @@ public:
_LIBCPP_HIDE_FROM_ABI unordered_set& operator=(const unordered_set& __u) = default;
# ifndef _LIBCPP_CXX03_LANG
_LIBCPP_HIDE_FROM_ABI unordered_set& operator=(unordered_set&& __u) = default;
_LIBCPP_HIDE_FROM_ABI unordered_set& operator=(unordered_set&& __u)
_NOEXCEPT_(is_nothrow_move_assignable<__table>::value) = default;
_LIBCPP_HIDE_FROM_ABI unordered_set& operator=(initializer_list<value_type> __il);
# endif // _LIBCPP_CXX03_LANG
@ -1075,6 +1076,11 @@ unordered_set<_Value, _Hash, _Pred, _Alloc>::unordered_set(const unordered_set&
# ifndef _LIBCPP_CXX03_LANG
template <class _Value, class _Hash, class _Pred, class _Alloc>
inline unordered_set<_Value, _Hash, _Pred, _Alloc>::unordered_set(unordered_set&& __u)
_NOEXCEPT_(is_nothrow_move_constructible<__table>::value)
: __table_(std::move(__u.__table_)) {}
template <class _Value, class _Hash, class _Pred, class _Alloc>
unordered_set<_Value, _Hash, _Pred, _Alloc>::unordered_set(unordered_set&& __u, const allocator_type& __a)
: __table_(std::move(__u.__table_), __a) {
@ -1288,7 +1294,8 @@ public:
_LIBCPP_HIDE_FROM_ABI unordered_multiset(const unordered_multiset& __u) = default;
_LIBCPP_HIDE_FROM_ABI unordered_multiset(const unordered_multiset& __u, const allocator_type& __a);
# ifndef _LIBCPP_CXX03_LANG
_LIBCPP_HIDE_FROM_ABI unordered_multiset(unordered_multiset&& __u) = default;
_LIBCPP_HIDE_FROM_ABI unordered_multiset(unordered_multiset&& __u)
_NOEXCEPT_(is_nothrow_move_constructible<__table>::value);
_LIBCPP_HIDE_FROM_ABI unordered_multiset(unordered_multiset&& __u, const allocator_type& __a);
_LIBCPP_HIDE_FROM_ABI unordered_multiset(initializer_list<value_type> __il);
_LIBCPP_HIDE_FROM_ABI unordered_multiset(
@ -1317,7 +1324,8 @@ public:
_LIBCPP_HIDE_FROM_ABI unordered_multiset& operator=(const unordered_multiset& __u) = default;
# ifndef _LIBCPP_CXX03_LANG
_LIBCPP_HIDE_FROM_ABI unordered_multiset& operator=(unordered_multiset&& __u) = default;
_LIBCPP_HIDE_FROM_ABI unordered_multiset& operator=(unordered_multiset&& __u)
_NOEXCEPT_(is_nothrow_move_assignable<__table>::value) = default;
_LIBCPP_HIDE_FROM_ABI unordered_multiset& operator=(initializer_list<value_type> __il);
# endif // _LIBCPP_CXX03_LANG
@ -1667,6 +1675,11 @@ unordered_multiset<_Value, _Hash, _Pred, _Alloc>::unordered_multiset(
# ifndef _LIBCPP_CXX03_LANG
template <class _Value, class _Hash, class _Pred, class _Alloc>
inline unordered_multiset<_Value, _Hash, _Pred, _Alloc>::unordered_multiset(unordered_multiset&& __u)
_NOEXCEPT_(is_nothrow_move_constructible<__table>::value)
: __table_(std::move(__u.__table_)) {}
template <class _Value, class _Hash, class _Pred, class _Alloc>
unordered_multiset<_Value, _Hash, _Pred, _Alloc>::unordered_multiset(
unordered_multiset&& __u, const allocator_type& __a)

View File

@ -1399,7 +1399,6 @@ void SymbolTable::resolveAlternateNames() {
auto toUndef = dyn_cast<Undefined>(toSym);
if (toUndef && (!toUndef->weakAlias || toUndef->isAntiDep))
continue;
toSym->isUsedInRegularObj = true;
if (toSym->isLazy())
forceLazy(toSym);
u->setWeakAlias(toSym);

View File

@ -1,25 +0,0 @@
; REQUIRES: x86
; RUN: mkdir -p %t.dir
; RUN: llvm-as -o %t.obj %s
; RUN: lld-link -out:%t.dll -dll -noentry %t.obj -export:test
target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-windows-msvc19.33.0"
$alt = comdat any
@alt = weak_odr dso_local global i32 0, comdat, align 4
@ext = external dso_local global i32, align 4
; Function Attrs: noinline nounwind optnone uwtable
define dso_local i32 @test() #0 {
entry:
%0 = load i32, ptr @ext, align 4
ret i32 %0
}
attributes #0 = { noinline nounwind optnone uwtable }
!llvm.linker.options = !{!0}
!0 = !{!"/alternatename:ext=alt"}

View File

@ -347,9 +347,6 @@ Status Value::GetValueAsData(ExecutionContext *exe_ctx, DataExtractor &data,
else
data.SetAddressByteSize(sizeof(void *));
if (!type_size)
return Status::FromErrorString("type does not have a size");
uint32_t result_byte_size = *type_size;
if (m_value.GetData(data, result_byte_size))
return error; // Success;

View File

@ -15,7 +15,6 @@ add_lldb_unittest(LLDBCoreTests
SourceManagerTest.cpp
TelemetryTest.cpp
UniqueCStringMapTest.cpp
Value.cpp
LINK_COMPONENTS
Support

View File

@ -1,39 +0,0 @@
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "lldb/Core/Value.h"
#include "Plugins/Platform/MacOSX/PlatformMacOSX.h"
#include "Plugins/TypeSystem/Clang/TypeSystemClang.h"
#include "TestingSupport/SubsystemRAII.h"
#include "TestingSupport/Symbol/ClangTestUtils.h"
#include "lldb/Utility/DataExtractor.h"
#include "gtest/gtest.h"
using namespace lldb_private;
using namespace lldb_private::clang_utils;
TEST(ValueTest, GetValueAsData) {
SubsystemRAII<FileSystem, HostInfo, PlatformMacOSX> subsystems;
auto holder = std::make_unique<clang_utils::TypeSystemClangHolder>("test");
auto *clang = holder->GetAST();
Value v(Scalar(42));
DataExtractor extractor;
// no compiler type
Status status = v.GetValueAsData(nullptr, extractor, nullptr);
ASSERT_TRUE(status.Fail());
// with compiler type
v.SetCompilerType(clang->GetBasicType(lldb::BasicType::eBasicTypeChar));
status = v.GetValueAsData(nullptr, extractor, nullptr);
ASSERT_TRUE(status.Success());
}

View File

@ -268,6 +268,7 @@ public:
CmpArithIntrinsic, // Use a target-specific intrinsic for special compare
// operations; used by X86.
Expand, // Generic expansion in terms of other atomic operations.
CustomExpand, // Custom target-specific expansion using TLI hooks.
// Rewrite to a non-atomic form for use in a known non-preemptible
// environment.
@ -2273,6 +2274,18 @@ public:
"Generic atomicrmw expansion unimplemented on this target");
}
/// Perform an atomic store in a target-specific way.
virtual void emitExpandAtomicStore(StoreInst *SI) const {
llvm_unreachable(
"Generic atomic store expansion unimplemented on this target");
}
/// Perform an atomic load in a target-specific way.
virtual void emitExpandAtomicLoad(LoadInst *LI) const {
llvm_unreachable(
"Generic atomic load expansion unimplemented on this target");
}
/// Perform a cmpxchg expansion using a target-specific method.
virtual void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const {
llvm_unreachable("Generic cmpxchg expansion unimplemented on this target");
@ -2377,8 +2390,8 @@ public:
}
/// Returns how the given (atomic) store should be expanded by the IR-level
/// AtomicExpand pass into. For instance AtomicExpansionKind::Expand will try
/// to use an atomicrmw xchg.
/// AtomicExpand pass into. For instance AtomicExpansionKind::CustomExpand
/// will try to use an atomicrmw xchg.
virtual AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const {
return AtomicExpansionKind::None;
}

View File

@ -84,7 +84,7 @@ private:
bool expandAtomicLoadToCmpXchg(LoadInst *LI);
StoreInst *convertAtomicStoreToIntegerType(StoreInst *SI);
bool tryExpandAtomicStore(StoreInst *SI);
void expandAtomicStore(StoreInst *SI);
void expandAtomicStoreToXChg(StoreInst *SI);
bool tryExpandAtomicRMW(AtomicRMWInst *AI);
AtomicRMWInst *convertAtomicXchgToIntegerType(AtomicRMWInst *RMWI);
Value *
@ -537,6 +537,9 @@ bool AtomicExpandImpl::tryExpandAtomicLoad(LoadInst *LI) {
case TargetLoweringBase::AtomicExpansionKind::NotAtomic:
LI->setAtomic(AtomicOrdering::NotAtomic);
return true;
case TargetLoweringBase::AtomicExpansionKind::CustomExpand:
TLI->emitExpandAtomicLoad(LI);
return true;
default:
llvm_unreachable("Unhandled case in tryExpandAtomicLoad");
}
@ -546,8 +549,11 @@ bool AtomicExpandImpl::tryExpandAtomicStore(StoreInst *SI) {
switch (TLI->shouldExpandAtomicStoreInIR(SI)) {
case TargetLoweringBase::AtomicExpansionKind::None:
return false;
case TargetLoweringBase::AtomicExpansionKind::CustomExpand:
TLI->emitExpandAtomicStore(SI);
return true;
case TargetLoweringBase::AtomicExpansionKind::Expand:
expandAtomicStore(SI);
expandAtomicStoreToXChg(SI);
return true;
case TargetLoweringBase::AtomicExpansionKind::NotAtomic:
SI->setAtomic(AtomicOrdering::NotAtomic);
@ -620,7 +626,7 @@ StoreInst *AtomicExpandImpl::convertAtomicStoreToIntegerType(StoreInst *SI) {
return NewSI;
}
void AtomicExpandImpl::expandAtomicStore(StoreInst *SI) {
void AtomicExpandImpl::expandAtomicStoreToXChg(StoreInst *SI) {
// This function is only called on atomic stores that are too large to be
// atomic if implemented as a native store. So we replace them by an
// atomic swap, that can be implemented for example as a ldrex/strex on ARM
@ -741,7 +747,7 @@ bool AtomicExpandImpl::tryExpandAtomicRMW(AtomicRMWInst *AI) {
}
case TargetLoweringBase::AtomicExpansionKind::NotAtomic:
return lowerAtomicRMWInst(AI);
case TargetLoweringBase::AtomicExpansionKind::Expand:
case TargetLoweringBase::AtomicExpansionKind::CustomExpand:
TLI->emitExpandAtomicRMW(AI);
return true;
default:
@ -1695,7 +1701,7 @@ bool AtomicExpandImpl::tryExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
return true;
case TargetLoweringBase::AtomicExpansionKind::NotAtomic:
return lowerAtomicCmpXchgInst(CI);
case TargetLoweringBase::AtomicExpansionKind::Expand: {
case TargetLoweringBase::AtomicExpansionKind::CustomExpand: {
TLI->emitExpandAtomicCmpXchg(CI);
return true;
}
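
Note on the two store paths dispatched above: `Expand` keeps the generic IR-level rewrite (now renamed `expandAtomicStoreToXChg`), while `CustomExpand` hands the instruction to the target via `emitExpandAtomicStore`. A minimal sketch of what the generic xchg path produces, with illustrative types and value names (not taken from this patch):

; a store too wide to remain a native atomic store
store atomic i128 %val, ptr %addr seq_cst, align 16

; after expandAtomicStoreToXChg: the store becomes a swap whose result is unused,
; and the new atomicrmw is then lowered again through tryExpandAtomicRMW
%unused = atomicrmw xchg ptr %addr, i128 %val seq_cst, align 16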

View File

@ -18983,9 +18983,7 @@ SDValue DAGCombiner::visitFP_ROUND(SDNode *N) {
// single-step fp_round we want to fold to.
// In other words, double rounding isn't the same as rounding.
// Also, this is a value preserving truncation iff both fp_round's are.
if ((N->getFlags().hasAllowContract() &&
N0->getFlags().hasAllowContract()) ||
N0IsTrunc)
if (DAG.getTarget().Options.UnsafeFPMath || N0IsTrunc)
return DAG.getNode(
ISD::FP_ROUND, DL, VT, N0.getOperand(0),
DAG.getIntPtrConstant(NIsTrunc && N0IsTrunc, DL, /*isTarget=*/true));

View File

@ -18,7 +18,6 @@
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
@ -27,7 +26,6 @@
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/KnownFPClass.h"

View File

@ -27,7 +27,6 @@
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/ErrorHandling.h"
#ifdef EXPENSIVE_CHECKS

View File

@ -6135,19 +6135,6 @@ unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr(
}
}
bool AMDGPUTargetLowering::canCreateUndefOrPoisonForTargetNode(
SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
unsigned Opcode = Op.getOpcode();
switch (Opcode) {
case AMDGPUISD::BFE_I32:
case AMDGPUISD::BFE_U32:
return false;
}
return TargetLowering::canCreateUndefOrPoisonForTargetNode(
Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
}
bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(
SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN,
unsigned Depth) const {

View File

@ -323,12 +323,6 @@ public:
const MachineRegisterInfo &MRI,
unsigned Depth = 0) const override;
bool canCreateUndefOrPoisonForTargetNode(SDValue Op,
const APInt &DemandedElts,
const SelectionDAG &DAG,
bool PoisonOnly, bool ConsiderFlags,
unsigned Depth) const override;
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts,
const SelectionDAG &DAG, bool SNaN = false,
unsigned Depth = 0) const override;

View File

@ -21,7 +21,6 @@
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Utils/Local.h"

View File

@ -225,7 +225,6 @@
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/InstSimplifyFolder.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/Utils/Local.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/AttributeMask.h"
@ -244,7 +243,6 @@
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/ReplaceConstant.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/AMDGPUAddrSpace.h"
#include "llvm/Support/Alignment.h"

View File

@ -190,14 +190,12 @@
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/ReplaceConstant.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"

View File

@ -22,6 +22,7 @@
namespace llvm {
class AsmPrinter;
class MCContext;
} // namespace llvm
class AMDGPUMCInstLower {
MCContext &Ctx;
@ -65,5 +66,4 @@ static inline const MCExpr *lowerAddrSpaceCast(const TargetMachine &TM,
return nullptr;
}
} // namespace
} // namespace llvm
#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUMCINSTLOWER_H

View File

@ -90,7 +90,6 @@
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Passes/CodeGenPassBuilder.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/FormatVariadic.h"
@ -126,44 +125,6 @@ using namespace llvm;
using namespace llvm::PatternMatch;
namespace {
//===----------------------------------------------------------------------===//
// AMDGPU CodeGen Pass Builder interface.
//===----------------------------------------------------------------------===//
class AMDGPUCodeGenPassBuilder
: public CodeGenPassBuilder<AMDGPUCodeGenPassBuilder, GCNTargetMachine> {
using Base = CodeGenPassBuilder<AMDGPUCodeGenPassBuilder, GCNTargetMachine>;
public:
AMDGPUCodeGenPassBuilder(GCNTargetMachine &TM,
const CGPassBuilderOption &Opts,
PassInstrumentationCallbacks *PIC);
void addIRPasses(AddIRPass &) const;
void addCodeGenPrepare(AddIRPass &) const;
void addPreISel(AddIRPass &addPass) const;
void addILPOpts(AddMachinePass &) const;
void addAsmPrinter(AddMachinePass &, CreateMCStreamer) const;
Error addInstSelector(AddMachinePass &) const;
void addPreRewrite(AddMachinePass &) const;
void addMachineSSAOptimization(AddMachinePass &) const;
void addPostRegAlloc(AddMachinePass &) const;
void addPreEmitPass(AddMachinePass &) const;
void addPreEmitRegAlloc(AddMachinePass &) const;
Error addRegAssignmentOptimized(AddMachinePass &) const;
void addPreRegAlloc(AddMachinePass &) const;
void addOptimizedRegAlloc(AddMachinePass &) const;
void addPreSched2(AddMachinePass &) const;
/// Check if a pass is enabled given \p Opt option. The option always
/// overrides defaults if explicitly used. Otherwise its default will be used
/// given that a pass shall work at an optimization \p Level minimum.
bool isPassEnabled(const cl::opt<bool> &Opt,
CodeGenOptLevel Level = CodeGenOptLevel::Default) const;
void addEarlyCSEOrGVNPass(AddIRPass &) const;
void addStraightLineScalarOptimizationPasses(AddIRPass &) const;
};
class SGPRRegisterRegAlloc : public RegisterRegAllocBase<SGPRRegisterRegAlloc> {
public:
SGPRRegisterRegAlloc(const char *N, const char *D, FunctionPassCtor C)

View File

@ -18,6 +18,7 @@
#include "llvm/CodeGen/CodeGenTargetMachineImpl.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/Passes/CodeGenPassBuilder.h"
#include <optional>
#include <utility>
@ -157,6 +158,44 @@ public:
}
};
//===----------------------------------------------------------------------===//
// AMDGPU CodeGen Pass Builder interface.
//===----------------------------------------------------------------------===//
class AMDGPUCodeGenPassBuilder
: public CodeGenPassBuilder<AMDGPUCodeGenPassBuilder, GCNTargetMachine> {
using Base = CodeGenPassBuilder<AMDGPUCodeGenPassBuilder, GCNTargetMachine>;
public:
AMDGPUCodeGenPassBuilder(GCNTargetMachine &TM,
const CGPassBuilderOption &Opts,
PassInstrumentationCallbacks *PIC);
void addIRPasses(AddIRPass &) const;
void addCodeGenPrepare(AddIRPass &) const;
void addPreISel(AddIRPass &addPass) const;
void addILPOpts(AddMachinePass &) const;
void addAsmPrinter(AddMachinePass &, CreateMCStreamer) const;
Error addInstSelector(AddMachinePass &) const;
void addPreRewrite(AddMachinePass &) const;
void addMachineSSAOptimization(AddMachinePass &) const;
void addPostRegAlloc(AddMachinePass &) const;
void addPreEmitPass(AddMachinePass &) const;
void addPreEmitRegAlloc(AddMachinePass &) const;
Error addRegAssignmentOptimized(AddMachinePass &) const;
void addPreRegAlloc(AddMachinePass &) const;
void addOptimizedRegAlloc(AddMachinePass &) const;
void addPreSched2(AddMachinePass &) const;
/// Check if a pass is enabled given \p Opt option. The option always
/// overrides defaults if explicitly used. Otherwise its default will be used
/// given that a pass shall work at an optimization \p Level minimum.
bool isPassEnabled(const cl::opt<bool> &Opt,
CodeGenOptLevel Level = CodeGenOptLevel::Default) const;
void addEarlyCSEOrGVNPass(AddIRPass &) const;
void addStraightLineScalarOptimizationPasses(AddIRPass &) const;
};
} // end namespace llvm
#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETMACHINE_H

View File

@ -21,7 +21,6 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/Passes/CodeGenPassBuilder.h"
using namespace llvm;

View File

@ -20,8 +20,6 @@
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
using namespace llvm;
namespace {
class R600MCInstLower : public AMDGPUMCInstLower {
public:

View File

@ -19,7 +19,6 @@
#include "R600MachineFunctionInfo.h"
#include "R600MachineScheduler.h"
#include "R600TargetTransformInfo.h"
#include "llvm/Passes/CodeGenPassBuilder.h"
#include "llvm/Transforms/Scalar.h"
#include <optional>
@ -47,21 +46,6 @@ static MachineSchedRegistry R600SchedRegistry("r600",
"Run R600's custom scheduler",
createR600MachineScheduler);
//===----------------------------------------------------------------------===//
// R600 CodeGen Pass Builder interface.
//===----------------------------------------------------------------------===//
class R600CodeGenPassBuilder
: public CodeGenPassBuilder<R600CodeGenPassBuilder, R600TargetMachine> {
public:
R600CodeGenPassBuilder(R600TargetMachine &TM, const CGPassBuilderOption &Opts,
PassInstrumentationCallbacks *PIC);
void addPreISel(AddIRPass &addPass) const;
void addAsmPrinter(AddMachinePass &, CreateMCStreamer) const;
Error addInstSelector(AddMachinePass &) const;
};
//===----------------------------------------------------------------------===//
// R600 Target Machine (R600 -> Cayman)
//===----------------------------------------------------------------------===//

View File

@ -57,6 +57,21 @@ public:
createMachineScheduler(MachineSchedContext *C) const override;
};
//===----------------------------------------------------------------------===//
// R600 CodeGen Pass Builder interface.
//===----------------------------------------------------------------------===//
class R600CodeGenPassBuilder
: public CodeGenPassBuilder<R600CodeGenPassBuilder, R600TargetMachine> {
public:
R600CodeGenPassBuilder(R600TargetMachine &TM, const CGPassBuilderOption &Opts,
PassInstrumentationCallbacks *PIC);
void addPreISel(AddIRPass &addPass) const;
void addAsmPrinter(AddMachinePass &, CreateMCStreamer) const;
Error addInstSelector(AddMachinePass &) const;
};
} // end namespace llvm
#endif // LLVM_LIB_TARGET_AMDGPU_R600TARGETMACHINE_H

View File

@ -16,14 +16,12 @@
#include "GCNSubtarget.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/InitializePasses.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"

View File

@ -17808,11 +17808,19 @@ static bool flatInstrMayAccessPrivate(const Instruction *I) {
!AMDGPU::hasValueInRangeLikeMetadata(*MD, AMDGPUAS::PRIVATE_ADDRESS);
}
static TargetLowering::AtomicExpansionKind
getPrivateAtomicExpansionKind(const GCNSubtarget &STI) {
// For GAS, lower to flat atomic.
return STI.hasGloballyAddressableScratch()
? TargetLowering::AtomicExpansionKind::Expand
: TargetLowering::AtomicExpansionKind::NotAtomic;
}
TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
unsigned AS = RMW->getPointerAddressSpace();
if (AS == AMDGPUAS::PRIVATE_ADDRESS)
return AtomicExpansionKind::NotAtomic;
return getPrivateAtomicExpansionKind(*getSubtarget());
// 64-bit flat atomics that dynamically reside in private memory will silently
// be dropped.
@ -17823,7 +17831,7 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
if (AS == AMDGPUAS::FLAT_ADDRESS &&
DL.getTypeSizeInBits(RMW->getType()) == 64 &&
flatInstrMayAccessPrivate(RMW))
return AtomicExpansionKind::Expand;
return AtomicExpansionKind::CustomExpand;
auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
OptimizationRemarkEmitter ORE(RMW->getFunction());
@ -17898,7 +17906,7 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
// does. InstCombine transforms these with 0 to or, so undo that.
if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
ConstVal && ConstVal->isNullValue())
return AtomicExpansionKind::Expand;
return AtomicExpansionKind::CustomExpand;
}
// If the allocation could be in remote, fine-grained memory, the rmw
@ -18027,9 +18035,9 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
// fadd.
if (Subtarget->hasLDSFPAtomicAddF32()) {
if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
return AtomicExpansionKind::Expand;
return AtomicExpansionKind::CustomExpand;
if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
return AtomicExpansionKind::Expand;
return AtomicExpansionKind::CustomExpand;
}
}
}
@ -18083,14 +18091,14 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
return LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
? AtomicExpansionKind::NotAtomic
? getPrivateAtomicExpansionKind(*getSubtarget())
: AtomicExpansionKind::None;
}
TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
? AtomicExpansionKind::NotAtomic
? getPrivateAtomicExpansionKind(*getSubtarget())
: AtomicExpansionKind::None;
}
@ -18098,7 +18106,7 @@ TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
unsigned AddrSpace = CmpX->getPointerAddressSpace();
if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
return AtomicExpansionKind::NotAtomic;
return getPrivateAtomicExpansionKind(*getSubtarget());
if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
return AtomicExpansionKind::None;
@ -18109,7 +18117,7 @@ SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
// If a 64-bit flat atomic may alias private, we need to avoid using the
// atomic in the private case.
return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::Expand
return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::CustomExpand
: AtomicExpansionKind::None;
}
@ -18468,9 +18476,24 @@ void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
Builder.CreateBr(ExitBB);
}
static void convertScratchAtomicToFlatAtomic(Instruction *I,
unsigned PtrOpIdx) {
Value *PtrOp = I->getOperand(PtrOpIdx);
assert(PtrOp->getType()->getPointerAddressSpace() ==
AMDGPUAS::PRIVATE_ADDRESS);
Type *FlatPtr = PointerType::get(I->getContext(), AMDGPUAS::FLAT_ADDRESS);
Value *ASCast = CastInst::CreatePointerCast(PtrOp, FlatPtr, "scratch.ascast",
I->getIterator());
I->setOperand(PtrOpIdx, ASCast);
}
void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
AtomicRMWInst::BinOp Op = AI->getOperation();
if (AI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
return convertScratchAtomicToFlatAtomic(AI, AI->getPointerOperandIndex());
if (Op == AtomicRMWInst::Sub || Op == AtomicRMWInst::Or ||
Op == AtomicRMWInst::Xor) {
if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
@ -18493,9 +18516,28 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
}
void SITargetLowering::emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const {
if (CI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
return convertScratchAtomicToFlatAtomic(CI, CI->getPointerOperandIndex());
emitExpandAtomicAddrSpacePredicate(CI);
}
void SITargetLowering::emitExpandAtomicLoad(LoadInst *LI) const {
if (LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
return convertScratchAtomicToFlatAtomic(LI, LI->getPointerOperandIndex());
llvm_unreachable(
"Expand Atomic Load only handles SCRATCH -> FLAT conversion");
}
void SITargetLowering::emitExpandAtomicStore(StoreInst *SI) const {
if (SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
return convertScratchAtomicToFlatAtomic(SI, SI->getPointerOperandIndex());
llvm_unreachable(
"Expand Atomic Store only handles SCRATCH -> FLAT conversion");
}
LoadInst *
SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
IRBuilder<> Builder(AI);

View File

@ -562,6 +562,8 @@ public:
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const;
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override;
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override;
void emitExpandAtomicLoad(LoadInst *LI) const override;
void emitExpandAtomicStore(StoreInst *SI) const override;
LoadInst *
lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override;

View File

@ -32,15 +32,11 @@
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachinePassManager.h"
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/IR/Dominators.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/DebugCounter.h"
#include "llvm/TargetParser/TargetParser.h"
using namespace llvm;
#define DEBUG_TYPE "si-insert-waitcnts"

View File

@ -17,7 +17,6 @@
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachinePassManager.h"
#include "llvm/InitializePasses.h"
using namespace llvm;

View File

@ -24,7 +24,6 @@
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/InitializePasses.h"
using namespace llvm;

View File

@ -80,7 +80,6 @@
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/Dominators.h"
using namespace llvm;

View File

@ -23,7 +23,6 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/CodeGen/VirtRegMap.h"
#include "llvm/InitializePasses.h"
using namespace llvm;

View File

@ -7893,7 +7893,7 @@ LoongArchTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
if (Size < 32 && (AI->getOperation() == AtomicRMWInst::And ||
AI->getOperation() == AtomicRMWInst::Or ||
AI->getOperation() == AtomicRMWInst::Xor))
return AtomicExpansionKind::Expand;
return AtomicExpansionKind::CustomExpand;
if (AI->getOperation() == AtomicRMWInst::Nand || Size < 32)
return AtomicExpansionKind::CmpXChg;
}

View File

@ -2073,23 +2073,15 @@ Instruction *InstCombinerImpl::visitIntToPtr(IntToPtrInst &CI) {
}
// Replace (inttoptr (add (ptrtoint %Base), %Offset)) with
// (getelementptr i8, %Base, %Offset) if the pointer is only used as integer
// value.
// (getelementptr i8, %Base, %Offset) if all users are ICmps.
Value *Base;
Value *Offset;
auto UsesPointerAsInt = [](User *U) {
if (isa<ICmpInst, PtrToIntInst>(U))
return true;
if (auto *P = dyn_cast<PHINode>(U))
return P->hasOneUse() && isa<ICmpInst, PtrToIntInst>(*P->user_begin());
return false;
};
if (match(CI.getOperand(0),
m_OneUse(m_c_Add(m_PtrToIntSameSize(DL, m_Value(Base)),
m_Value(Offset)))) &&
CI.getType()->getPointerAddressSpace() ==
Base->getType()->getPointerAddressSpace() &&
all_of(CI.users(), UsesPointerAsInt)) {
all_of(CI.users(), IsaPred<ICmpInst>)) {
return GetElementPtrInst::Create(Builder.getInt8Ty(), Base, Offset);
}
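
For reference, the fold that remains after this change (only applied when every user of the inttoptr is an icmp, the add has a single use, and the address spaces match) looks roughly like this in IR; names are illustrative:

%base.int = ptrtoint ptr %base to i64
%sum = add i64 %base.int, %off
%p = inttoptr i64 %sum to ptr
%cmp = icmp eq ptr %p, %q

; becomes
%p1 = getelementptr i8, ptr %base, i64 %off
%cmp = icmp eq ptr %p1, %q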

View File

@ -642,13 +642,6 @@ Value *ConstantOffsetExtractor::applyExts(Value *V) {
Instruction *Ext = I->clone();
Ext->setOperand(0, Current);
// In ConstantOffsetExtractor::find we do not analyze nuw/nsw for trunc, so
// we assume that it is ok to redistribute trunc over add/sub/or. But for
// example (add (trunc nuw A), (trunc nuw B)) is more poisonous than (trunc
// nuw (add A, B))). To make such redistributions legal we drop all the
// poison generating flags from cloned trunc instructions here.
if (isa<TruncInst>(Ext))
Ext->dropPoisonGeneratingFlags();
Ext->insertBefore(*IP->getParent(), IP);
Current = Ext;
}

View File

@ -153,7 +153,11 @@ template <typename LTy, typename RTy> struct match_combine_or {
match_combine_or(const LTy &Left, const RTy &Right) : L(Left), R(Right) {}
template <typename ITy> bool match(ITy *V) const {
return L.match(V) || R.match(V);
if (L.match(V))
return true;
if (R.match(V))
return true;
return false;
}
};

View File

@ -344,7 +344,7 @@ define amdgpu_ps void @fptrunc_f32_to_bf16(float %a, ptr %out) {
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
; GFX1250-NEXT: flat_store_b16 v[2:3], v0 scope:SCOPE_SE
; GFX1250-NEXT: flat_store_b16 v[2:3], v0
; GFX1250-NEXT: s_endpgm
entry:
%a.cvt = fptrunc float %a to bfloat
@ -380,7 +380,7 @@ define amdgpu_ps void @fptrunc_f32_to_bf16_abs(float %a, ptr %out) {
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, |v0|, s0
; GFX1250-NEXT: flat_store_b16 v[2:3], v0 scope:SCOPE_SE
; GFX1250-NEXT: flat_store_b16 v[2:3], v0
; GFX1250-NEXT: s_endpgm
entry:
%a.abs = call float @llvm.fabs.f32(float %a)
@ -417,7 +417,7 @@ define amdgpu_ps void @fptrunc_f32_to_bf16_neg(float %a, ptr %out) {
; GFX1250: ; %bb.0: ; %entry
; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, -v0, s0
; GFX1250-NEXT: flat_store_b16 v[2:3], v0 scope:SCOPE_SE
; GFX1250-NEXT: flat_store_b16 v[2:3], v0
; GFX1250-NEXT: s_endpgm
entry:
%a.neg = fneg float %a
@ -480,7 +480,7 @@ define amdgpu_ps void @fptrunc_f64_to_bf16(double %a, ptr %out) {
; GFX1250-NEXT: s_or_b32 vcc_lo, vcc_lo, s0
; GFX1250-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
; GFX1250-NEXT: flat_store_b16 v[2:3], v0 scope:SCOPE_SE
; GFX1250-NEXT: flat_store_b16 v[2:3], v0
; GFX1250-NEXT: s_endpgm
entry:
%a.cvt = fptrunc double %a to bfloat
@ -543,7 +543,7 @@ define amdgpu_ps void @fptrunc_f64_to_bf16_neg(double %a, ptr %out) {
; GFX1250-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
; GFX1250-NEXT: flat_store_b16 v[2:3], v0 scope:SCOPE_SE
; GFX1250-NEXT: flat_store_b16 v[2:3], v0
; GFX1250-NEXT: s_endpgm
entry:
%a.neg = fneg double %a
@ -607,7 +607,7 @@ define amdgpu_ps void @fptrunc_f64_to_bf16_abs(double %a, ptr %out) {
; GFX1250-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
; GFX1250-NEXT: flat_store_b16 v[2:3], v0 scope:SCOPE_SE
; GFX1250-NEXT: flat_store_b16 v[2:3], v0
; GFX1250-NEXT: s_endpgm
entry:
%a.abs = call double @llvm.fabs.f64(double %a)

View File

@ -1582,22 +1582,28 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1
; SI-NEXT: s_nop 1
; SI-NEXT: v_div_fmas_f64 v[4:5], v[12:13], v[6:7], v[10:11]
; SI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
; SI-NEXT: v_readfirstlane_b32 s0, v4
; SI-NEXT: v_readfirstlane_b32 s1, v5
; SI-NEXT: s_bfe_u32 s2, s1, 0xb0014
; SI-NEXT: s_add_i32 s8, s2, 0xfffffc01
; SI-NEXT: s_mov_b32 s3, 0xfffff
; SI-NEXT: s_mov_b32 s2, s6
; SI-NEXT: s_lshr_b64 s[2:3], s[2:3], s8
; SI-NEXT: s_andn2_b64 s[2:3], s[0:1], s[2:3]
; SI-NEXT: s_and_b32 s9, s1, 0x80000000
; SI-NEXT: s_cmp_lt_i32 s8, 0
; SI-NEXT: s_cselect_b32 s2, 0, s2
; SI-NEXT: s_cselect_b32 s3, s9, s3
; SI-NEXT: s_cmp_gt_i32 s8, 51
; SI-NEXT: s_cselect_b32 s1, s1, s3
; SI-NEXT: s_cselect_b32 s0, s0, s2
; SI-NEXT: v_fma_f64 v[0:1], -s[0:1], v[2:3], v[0:1]
; SI-NEXT: v_readfirstlane_b32 s2, v5
; SI-NEXT: s_bfe_u32 s0, s2, 0xb0014
; SI-NEXT: s_add_i32 s3, s0, 0xfffffc01
; SI-NEXT: s_mov_b32 s1, 0xfffff
; SI-NEXT: s_mov_b32 s0, s6
; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], s3
; SI-NEXT: v_not_b32_e32 v6, s0
; SI-NEXT: v_and_b32_e32 v6, v4, v6
; SI-NEXT: v_not_b32_e32 v7, s1
; SI-NEXT: v_and_b32_e32 v5, v5, v7
; SI-NEXT: s_and_b32 s0, s2, 0x80000000
; SI-NEXT: s_cmp_lt_i32 s3, 0
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc
; SI-NEXT: v_mov_b32_e32 v7, s0
; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
; SI-NEXT: s_cmp_gt_i32 s3, 51
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_mov_b32_e32 v7, s2
; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
; SI-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; SI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@ -1853,22 +1859,28 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1)
; SI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
; SI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
; SI-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
; SI-NEXT: v_readfirstlane_b32 s4, v4
; SI-NEXT: v_readfirstlane_b32 s5, v5
; SI-NEXT: s_bfe_u32 s6, s5, 0xb0014
; SI-NEXT: s_add_i32 s8, s6, 0xfffffc01
; SI-NEXT: s_mov_b32 s7, 0xfffff
; SI-NEXT: s_mov_b32 s6, s2
; SI-NEXT: s_lshr_b64 s[6:7], s[6:7], s8
; SI-NEXT: s_andn2_b64 s[6:7], s[4:5], s[6:7]
; SI-NEXT: s_and_b32 s9, s5, 0x80000000
; SI-NEXT: s_cmp_lt_i32 s8, 0
; SI-NEXT: s_cselect_b32 s6, 0, s6
; SI-NEXT: s_cselect_b32 s7, s9, s7
; SI-NEXT: s_cmp_gt_i32 s8, 51
; SI-NEXT: s_cselect_b32 s5, s5, s7
; SI-NEXT: s_cselect_b32 s4, s4, s6
; SI-NEXT: v_fma_f64 v[0:1], -s[4:5], v[2:3], v[0:1]
; SI-NEXT: v_readfirstlane_b32 s6, v5
; SI-NEXT: s_bfe_u32 s4, s6, 0xb0014
; SI-NEXT: s_add_i32 s7, s4, 0xfffffc01
; SI-NEXT: s_mov_b32 s5, 0xfffff
; SI-NEXT: s_mov_b32 s4, s2
; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], s7
; SI-NEXT: v_not_b32_e32 v6, s4
; SI-NEXT: v_and_b32_e32 v6, v4, v6
; SI-NEXT: v_not_b32_e32 v7, s5
; SI-NEXT: v_and_b32_e32 v5, v5, v7
; SI-NEXT: s_and_b32 s4, s6, 0x80000000
; SI-NEXT: s_cmp_lt_i32 s7, 0
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc
; SI-NEXT: v_mov_b32_e32 v7, s4
; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
; SI-NEXT: s_cmp_gt_i32 s7, 51
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_mov_b32_e32 v7, s6
; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
; SI-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; SI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
@ -2097,22 +2109,28 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(
; SI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5]
; SI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
; SI-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
; SI-NEXT: v_readfirstlane_b32 s4, v4
; SI-NEXT: v_readfirstlane_b32 s5, v5
; SI-NEXT: s_bfe_u32 s6, s5, 0xb0014
; SI-NEXT: s_add_i32 s8, s6, 0xfffffc01
; SI-NEXT: s_mov_b32 s7, 0xfffff
; SI-NEXT: s_mov_b32 s6, s2
; SI-NEXT: s_lshr_b64 s[6:7], s[6:7], s8
; SI-NEXT: s_andn2_b64 s[6:7], s[4:5], s[6:7]
; SI-NEXT: s_and_b32 s9, s5, 0x80000000
; SI-NEXT: s_cmp_lt_i32 s8, 0
; SI-NEXT: s_cselect_b32 s6, 0, s6
; SI-NEXT: s_cselect_b32 s7, s9, s7
; SI-NEXT: s_cmp_gt_i32 s8, 51
; SI-NEXT: s_cselect_b32 s5, s5, s7
; SI-NEXT: s_cselect_b32 s4, s4, s6
; SI-NEXT: v_fma_f64 v[0:1], -s[4:5], v[2:3], v[0:1]
; SI-NEXT: v_readfirstlane_b32 s6, v5
; SI-NEXT: s_bfe_u32 s4, s6, 0xb0014
; SI-NEXT: s_add_i32 s7, s4, 0xfffffc01
; SI-NEXT: s_mov_b32 s5, 0xfffff
; SI-NEXT: s_mov_b32 s4, s2
; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], s7
; SI-NEXT: v_not_b32_e32 v6, s4
; SI-NEXT: v_and_b32_e32 v6, v4, v6
; SI-NEXT: v_not_b32_e32 v7, s5
; SI-NEXT: v_and_b32_e32 v5, v5, v7
; SI-NEXT: s_and_b32 s4, s6, 0x80000000
; SI-NEXT: s_cmp_lt_i32 s7, 0
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc
; SI-NEXT: v_mov_b32_e32 v7, s4
; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
; SI-NEXT: s_cmp_gt_i32 s7, 51
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_mov_b32_e32 v7, s6
; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
; SI-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; SI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
@ -5233,22 +5251,27 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: s_nop 1
; SI-NEXT: v_div_fmas_f64 v[8:9], v[16:17], v[10:11], v[14:15]
; SI-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
; SI-NEXT: v_readfirstlane_b32 s0, v8
; SI-NEXT: v_readfirstlane_b32 s1, v9
; SI-NEXT: s_bfe_u32 s2, s1, 0xb0014
; SI-NEXT: s_add_i32 s10, s2, 0xfffffc01
; SI-NEXT: v_readfirstlane_b32 s8, v9
; SI-NEXT: s_bfe_u32 s0, s8, 0xb0014
; SI-NEXT: s_add_i32 s9, s0, 0xfffffc01
; SI-NEXT: s_mov_b32 s3, 0xfffff
; SI-NEXT: s_mov_b32 s2, s6
; SI-NEXT: s_lshr_b64 s[8:9], s[2:3], s10
; SI-NEXT: s_andn2_b64 s[8:9], s[0:1], s[8:9]
; SI-NEXT: s_and_b32 s11, s1, 0x80000000
; SI-NEXT: s_cmp_lt_i32 s10, 0
; SI-NEXT: s_cselect_b32 s8, 0, s8
; SI-NEXT: s_cselect_b32 s9, s11, s9
; SI-NEXT: s_cmp_gt_i32 s10, 51
; SI-NEXT: s_cselect_b32 s1, s1, s9
; SI-NEXT: s_cselect_b32 s0, s0, s8
; SI-NEXT: v_fma_f64 v[2:3], -s[0:1], v[6:7], v[2:3]
; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s9
; SI-NEXT: v_not_b32_e32 v10, s0
; SI-NEXT: v_and_b32_e32 v10, v8, v10
; SI-NEXT: v_not_b32_e32 v11, s1
; SI-NEXT: v_and_b32_e32 v9, v9, v11
; SI-NEXT: s_and_b32 s0, s8, 0x80000000
; SI-NEXT: s_cmp_lt_i32 s9, 0
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_cndmask_b32_e64 v10, v10, 0, vcc
; SI-NEXT: v_mov_b32_e32 v11, s0
; SI-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc
; SI-NEXT: s_cmp_gt_i32 s9, 51
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_mov_b32_e32 v11, s8
; SI-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc
; SI-NEXT: v_cndmask_b32_e32 v8, v10, v8, vcc
; SI-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
; SI-NEXT: v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[0:1]
; SI-NEXT: v_rcp_f64_e32 v[8:9], v[6:7]
; SI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
@ -5264,20 +5287,26 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
; SI-NEXT: s_nop 1
; SI-NEXT: v_div_fmas_f64 v[6:7], v[14:15], v[8:9], v[12:13]
; SI-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
; SI-NEXT: v_readfirstlane_b32 s0, v6
; SI-NEXT: v_readfirstlane_b32 s1, v7
; SI-NEXT: s_bfe_u32 s8, s1, 0xb0014
; SI-NEXT: s_addk_i32 s8, 0xfc01
; SI-NEXT: s_lshr_b64 s[2:3], s[2:3], s8
; SI-NEXT: s_andn2_b64 s[2:3], s[0:1], s[2:3]
; SI-NEXT: s_and_b32 s9, s1, 0x80000000
; SI-NEXT: s_cmp_lt_i32 s8, 0
; SI-NEXT: s_cselect_b32 s2, 0, s2
; SI-NEXT: s_cselect_b32 s3, s9, s3
; SI-NEXT: s_cmp_gt_i32 s8, 51
; SI-NEXT: s_cselect_b32 s1, s1, s3
; SI-NEXT: s_cselect_b32 s0, s0, s2
; SI-NEXT: v_fma_f64 v[0:1], -s[0:1], v[4:5], v[0:1]
; SI-NEXT: v_readfirstlane_b32 s8, v7
; SI-NEXT: s_bfe_u32 s0, s8, 0xb0014
; SI-NEXT: s_add_i32 s9, s0, 0xfffffc01
; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s9
; SI-NEXT: v_not_b32_e32 v8, s0
; SI-NEXT: v_and_b32_e32 v8, v6, v8
; SI-NEXT: v_not_b32_e32 v9, s1
; SI-NEXT: v_and_b32_e32 v7, v7, v9
; SI-NEXT: s_and_b32 s0, s8, 0x80000000
; SI-NEXT: s_cmp_lt_i32 s9, 0
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_cndmask_b32_e64 v8, v8, 0, vcc
; SI-NEXT: v_mov_b32_e32 v9, s0
; SI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
; SI-NEXT: s_cmp_gt_i32 s9, 51
; SI-NEXT: s_cselect_b64 vcc, -1, 0
; SI-NEXT: v_mov_b32_e32 v9, s8
; SI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
; SI-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc
; SI-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_endpgm
;

View File

@ -86,15 +86,3 @@ entry:
store atomic i32 %val, ptr addrspace(3) %dst syncscope("wavefront") unordered, align 4
ret void
}
; GCN: scratch_atomic_store:
; CU: scratch_store_b32 off, v{{.*}}, s{{.*}} scope:SCOPE_SE
; NOCU: scratch_store_b32 off, v{{.*}}, s{{.*}} scope:SCOPE_SE
; GCN: .amdhsa_kernel scratch_atomic_store
; CU: .amdhsa_uses_cu_stores 1
; NOCU: .amdhsa_uses_cu_stores 0
define amdgpu_kernel void @scratch_atomic_store(ptr addrspace(5) %dst, i32 %val) {
entry:
store atomic i32 %val, ptr addrspace(5) %dst syncscope("wavefront") unordered, align 4
ret void
}

View File

@ -18,16 +18,6 @@ define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: gather4_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_mov_b32 s12, exec_lo
; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-NEXT: image_gather4 v[0:3], v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-TRUE16-LABEL: gather4_2d:
; GFX11-TRUE16: ; %bb.0: ; %main_body
; GFX11-TRUE16-NEXT: s_mov_b32 s12, exec_lo
@ -84,16 +74,6 @@ define amdgpu_ps <4 x float> @gather4_cube(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: gather4_cube:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_mov_b32 s12, exec_lo
; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-NEXT: image_gather4 v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_CUBE a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-TRUE16-LABEL: gather4_cube:
; GFX11-TRUE16: ; %bb.0: ; %main_body
; GFX11-TRUE16-NEXT: s_mov_b32 s12, exec_lo
@ -152,16 +132,6 @@ define amdgpu_ps <4 x float> @gather4_2darray(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: gather4_2darray:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_mov_b32 s12, exec_lo
; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-NEXT: image_gather4 v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-TRUE16-LABEL: gather4_2darray:
; GFX11-TRUE16: ; %bb.0: ; %main_body
; GFX11-TRUE16-NEXT: s_mov_b32 s12, exec_lo
@ -220,16 +190,6 @@ define amdgpu_ps <4 x float> @gather4_c_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: gather4_c_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_mov_b32 s12, exec_lo
; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100
; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-NEXT: image_gather4_c v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-TRUE16-LABEL: gather4_c_2d:
; GFX11-TRUE16: ; %bb.0: ; %main_body
; GFX11-TRUE16-NEXT: s_mov_b32 s12, exec_lo
@ -286,16 +246,6 @@ define amdgpu_ps <4 x float> @gather4_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: gather4_cl_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_mov_b32 s12, exec_lo
; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-NEXT: image_gather4_cl v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-TRUE16-LABEL: gather4_cl_2d:
; GFX11-TRUE16: ; %bb.0: ; %main_body
; GFX11-TRUE16-NEXT: s_mov_b32 s12, exec_lo
@ -356,16 +306,6 @@ define amdgpu_ps <4 x float> @gather4_c_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: gather4_c_cl_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_mov_b32 s12, exec_lo
; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100
; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-NEXT: image_gather4_c_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-TRUE16-LABEL: gather4_c_cl_2d:
; GFX11-TRUE16: ; %bb.0: ; %main_body
; GFX11-TRUE16-NEXT: s_mov_b32 s12, exec_lo
@ -422,16 +362,6 @@ define amdgpu_ps <4 x float> @gather4_b_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: gather4_b_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_mov_b32 s12, exec_lo
; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100
; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-NEXT: image_gather4_b v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-TRUE16-LABEL: gather4_b_2d:
; GFX11-TRUE16: ; %bb.0: ; %main_body
; GFX11-TRUE16-NEXT: s_mov_b32 s12, exec_lo
@ -488,16 +418,6 @@ define amdgpu_ps <4 x float> @gather4_c_b_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: gather4_c_b_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_mov_b32 s12, exec_lo
; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100
; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-NEXT: image_gather4_c_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-TRUE16-LABEL: gather4_c_b_2d:
; GFX11-TRUE16: ; %bb.0: ; %main_body
; GFX11-TRUE16-NEXT: s_mov_b32 s12, exec_lo
@ -556,16 +476,6 @@ define amdgpu_ps <4 x float> @gather4_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: gather4_b_cl_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_mov_b32 s12, exec_lo
; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100
; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-NEXT: image_gather4_b_cl v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-TRUE16-LABEL: gather4_b_cl_2d:
; GFX11-TRUE16: ; %bb.0: ; %main_body
; GFX11-TRUE16-NEXT: s_mov_b32 s12, exec_lo
@ -628,16 +538,6 @@ define amdgpu_ps <4 x float> @gather4_c_b_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: gather4_c_b_cl_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: s_mov_b32 s12, exec_lo
; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-NEXT: v_perm_b32 v2, v3, v2, 0x5040100
; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-NEXT: image_gather4_c_b_cl v[0:3], [v0, v1, v2, v4], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-TRUE16-LABEL: gather4_c_b_cl_2d:
; GFX11-TRUE16: ; %bb.0: ; %main_body
; GFX11-TRUE16-NEXT: s_mov_b32 s12, exec_lo
@ -691,13 +591,6 @@ define amdgpu_ps <4 x float> @gather4_l_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: gather4_l_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX10-NEXT: image_gather4_l v[0:3], [v0, v2], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-TRUE16-LABEL: gather4_l_2d:
; GFX11-TRUE16: ; %bb.0: ; %main_body
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l
@ -743,13 +636,6 @@ define amdgpu_ps <4 x float> @gather4_c_l_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: gather4_c_l_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100
; GFX10-NEXT: image_gather4_c_l v[0:3], [v0, v1, v3], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-TRUE16-LABEL: gather4_c_l_2d:
; GFX11-TRUE16: ; %bb.0: ; %main_body
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l
@ -791,13 +677,6 @@ define amdgpu_ps <4 x float> @gather4_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: gather4_lz_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
; GFX10-NEXT: image_gather4_lz v[0:3], v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-TRUE16-LABEL: gather4_lz_2d:
; GFX11-TRUE16: ; %bb.0: ; %main_body
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
@ -839,13 +718,6 @@ define amdgpu_ps <4 x float> @gather4_c_lz_2d(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: gather4_c_lz_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_perm_b32 v1, v2, v1, 0x5040100
; GFX10-NEXT: image_gather4_c_lz v[0:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
;
; GFX11-TRUE16-LABEL: gather4_c_lz_2d:
; GFX11-TRUE16: ; %bb.0: ; %main_body
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l
@ -901,4 +773,5 @@ attributes #0 = { nounwind }
attributes #1 = { nounwind readonly }
attributes #2 = { nounwind readnone }
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX10: {{.*}}
; GFX12: {{.*}}

View File

@ -76,13 +76,12 @@ define amdgpu_kernel void @v_round_f64(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_movk_i32 s4, 0xfc01
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s3, 0xfffff
; SI-NEXT: v_mov_b32_e32 v8, 0x3ff00000
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_bfe_u32 v4, v3, 20, 11
; SI-NEXT: v_add_i32_e32 v6, vcc, s4, v4
; SI-NEXT: v_add_i32_e32 v6, vcc, 0xfffffc01, v4
; SI-NEXT: v_lshr_b64 v[4:5], s[2:3], v6
; SI-NEXT: v_and_b32_e32 v7, 0x80000000, v3
; SI-NEXT: v_not_b32_e32 v5, v5

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -1,95 +0,0 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s
define void @vec_reduce_and_v32i8(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_and_v32i8:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 78
; CHECK-NEXT: xvshuf4i.b $xr1, $xr1, 228
; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvbsrl.v $xr1, $xr1, 8
; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvsrli.d $xr1, $xr1, 32
; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvshuf4i.b $xr1, $xr1, 14
; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvrepl128vei.b $xr1, $xr1, 1
; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvstelm.b $xr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <32 x i8>, ptr %src
%res = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> %v)
store i8 %res, ptr %dst
ret void
}
define void @vec_reduce_and_v16i16(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_and_v16i16:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 78
; CHECK-NEXT: xvshuf4i.h $xr1, $xr1, 228
; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvbsrl.v $xr1, $xr1, 8
; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvshuf4i.h $xr1, $xr1, 14
; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvrepl128vei.h $xr1, $xr1, 1
; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvstelm.h $xr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <16 x i16>, ptr %src
%res = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> %v)
store i16 %res, ptr %dst
ret void
}
define void @vec_reduce_and_v8i32(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_and_v8i32:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 78
; CHECK-NEXT: xvshuf4i.w $xr1, $xr1, 228
; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvshuf4i.w $xr1, $xr1, 14
; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvrepl128vei.w $xr1, $xr1, 1
; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvstelm.w $xr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <8 x i32>, ptr %src
%res = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> %v)
store i32 %res, ptr %dst
ret void
}
define void @vec_reduce_and_v4i64(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_and_v4i64:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_0)
; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI3_0)
; CHECK-NEXT: xvpermi.d $xr2, $xr0, 78
; CHECK-NEXT: xvshuf.d $xr1, $xr0, $xr2
; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvrepl128vei.d $xr1, $xr1, 1
; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvstelm.d $xr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <4 x i64>, ptr %src
%res = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> %v)
store i64 %res, ptr %dst
ret void
}

View File

@ -1,95 +0,0 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s
define void @vec_reduce_or_v32i8(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_or_v32i8:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 78
; CHECK-NEXT: xvshuf4i.b $xr1, $xr1, 228
; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvbsrl.v $xr1, $xr1, 8
; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvsrli.d $xr1, $xr1, 32
; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvshuf4i.b $xr1, $xr1, 14
; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvrepl128vei.b $xr1, $xr1, 1
; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvstelm.b $xr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <32 x i8>, ptr %src
%res = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> %v)
store i8 %res, ptr %dst
ret void
}
define void @vec_reduce_or_v16i16(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_or_v16i16:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 78
; CHECK-NEXT: xvshuf4i.h $xr1, $xr1, 228
; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvbsrl.v $xr1, $xr1, 8
; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvshuf4i.h $xr1, $xr1, 14
; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvrepl128vei.h $xr1, $xr1, 1
; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvstelm.h $xr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <16 x i16>, ptr %src
%res = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> %v)
store i16 %res, ptr %dst
ret void
}
define void @vec_reduce_or_v8i32(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_or_v8i32:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 78
; CHECK-NEXT: xvshuf4i.w $xr1, $xr1, 228
; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvshuf4i.w $xr1, $xr1, 14
; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvrepl128vei.w $xr1, $xr1, 1
; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvstelm.w $xr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <8 x i32>, ptr %src
%res = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %v)
store i32 %res, ptr %dst
ret void
}
define void @vec_reduce_or_v4i64(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_or_v4i64:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_0)
; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI3_0)
; CHECK-NEXT: xvpermi.d $xr2, $xr0, 78
; CHECK-NEXT: xvshuf.d $xr1, $xr0, $xr2
; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvrepl128vei.d $xr1, $xr1, 1
; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvstelm.d $xr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <4 x i64>, ptr %src
%res = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %v)
store i64 %res, ptr %dst
ret void
}

View File

@ -1,95 +0,0 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s
define void @vec_reduce_smax_v32i8(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_smax_v32i8:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 78
; CHECK-NEXT: xvshuf4i.b $xr1, $xr1, 228
; CHECK-NEXT: xvmax.b $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvbsrl.v $xr1, $xr1, 8
; CHECK-NEXT: xvmax.b $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvsrli.d $xr1, $xr1, 32
; CHECK-NEXT: xvmax.b $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvshuf4i.b $xr1, $xr1, 14
; CHECK-NEXT: xvmax.b $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvrepl128vei.b $xr1, $xr1, 1
; CHECK-NEXT: xvmax.b $xr0, $xr0, $xr1
; CHECK-NEXT: xvstelm.b $xr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <32 x i8>, ptr %src
%res = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> %v)
store i8 %res, ptr %dst
ret void
}
define void @vec_reduce_smax_v16i16(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_smax_v16i16:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 78
; CHECK-NEXT: xvshuf4i.h $xr1, $xr1, 228
; CHECK-NEXT: xvmax.h $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvbsrl.v $xr1, $xr1, 8
; CHECK-NEXT: xvmax.h $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvshuf4i.h $xr1, $xr1, 14
; CHECK-NEXT: xvmax.h $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvrepl128vei.h $xr1, $xr1, 1
; CHECK-NEXT: xvmax.h $xr0, $xr0, $xr1
; CHECK-NEXT: xvstelm.h $xr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <16 x i16>, ptr %src
%res = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> %v)
store i16 %res, ptr %dst
ret void
}
define void @vec_reduce_smax_v8i32(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_smax_v8i32:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 78
; CHECK-NEXT: xvshuf4i.w $xr1, $xr1, 228
; CHECK-NEXT: xvmax.w $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvshuf4i.w $xr1, $xr1, 14
; CHECK-NEXT: xvmax.w $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvrepl128vei.w $xr1, $xr1, 1
; CHECK-NEXT: xvmax.w $xr0, $xr0, $xr1
; CHECK-NEXT: xvstelm.w $xr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <8 x i32>, ptr %src
%res = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> %v)
store i32 %res, ptr %dst
ret void
}
define void @vec_reduce_smax_v4i64(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_smax_v4i64:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_0)
; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI3_0)
; CHECK-NEXT: xvpermi.d $xr2, $xr0, 78
; CHECK-NEXT: xvshuf.d $xr1, $xr0, $xr2
; CHECK-NEXT: xvmax.d $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvrepl128vei.d $xr1, $xr1, 1
; CHECK-NEXT: xvmax.d $xr0, $xr0, $xr1
; CHECK-NEXT: xvstelm.d $xr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <4 x i64>, ptr %src
%res = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> %v)
store i64 %res, ptr %dst
ret void
}

View File

@ -1,95 +0,0 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s
define void @vec_reduce_smin_v32i8(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_smin_v32i8:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 78
; CHECK-NEXT: xvshuf4i.b $xr1, $xr1, 228
; CHECK-NEXT: xvmin.b $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvbsrl.v $xr1, $xr1, 8
; CHECK-NEXT: xvmin.b $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvsrli.d $xr1, $xr1, 32
; CHECK-NEXT: xvmin.b $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvshuf4i.b $xr1, $xr1, 14
; CHECK-NEXT: xvmin.b $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvrepl128vei.b $xr1, $xr1, 1
; CHECK-NEXT: xvmin.b $xr0, $xr0, $xr1
; CHECK-NEXT: xvstelm.b $xr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <32 x i8>, ptr %src
%res = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> %v)
store i8 %res, ptr %dst
ret void
}
define void @vec_reduce_smin_v16i16(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_smin_v16i16:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 78
; CHECK-NEXT: xvshuf4i.h $xr1, $xr1, 228
; CHECK-NEXT: xvmin.h $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvbsrl.v $xr1, $xr1, 8
; CHECK-NEXT: xvmin.h $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvshuf4i.h $xr1, $xr1, 14
; CHECK-NEXT: xvmin.h $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvrepl128vei.h $xr1, $xr1, 1
; CHECK-NEXT: xvmin.h $xr0, $xr0, $xr1
; CHECK-NEXT: xvstelm.h $xr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <16 x i16>, ptr %src
%res = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> %v)
store i16 %res, ptr %dst
ret void
}
define void @vec_reduce_smin_v8i32(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_smin_v8i32:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 78
; CHECK-NEXT: xvshuf4i.w $xr1, $xr1, 228
; CHECK-NEXT: xvmin.w $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvshuf4i.w $xr1, $xr1, 14
; CHECK-NEXT: xvmin.w $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvrepl128vei.w $xr1, $xr1, 1
; CHECK-NEXT: xvmin.w $xr0, $xr0, $xr1
; CHECK-NEXT: xvstelm.w $xr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <8 x i32>, ptr %src
%res = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> %v)
store i32 %res, ptr %dst
ret void
}
define void @vec_reduce_smin_v4i64(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_smin_v4i64:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_0)
; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI3_0)
; CHECK-NEXT: xvpermi.d $xr2, $xr0, 78
; CHECK-NEXT: xvshuf.d $xr1, $xr0, $xr2
; CHECK-NEXT: xvmin.d $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvrepl128vei.d $xr1, $xr1, 1
; CHECK-NEXT: xvmin.d $xr0, $xr0, $xr1
; CHECK-NEXT: xvstelm.d $xr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <4 x i64>, ptr %src
%res = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> %v)
store i64 %res, ptr %dst
ret void
}

View File

@ -1,95 +0,0 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s
define void @vec_reduce_umax_v32i8(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_umax_v32i8:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 78
; CHECK-NEXT: xvshuf4i.b $xr1, $xr1, 228
; CHECK-NEXT: xvmax.bu $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvbsrl.v $xr1, $xr1, 8
; CHECK-NEXT: xvmax.bu $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvsrli.d $xr1, $xr1, 32
; CHECK-NEXT: xvmax.bu $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvshuf4i.b $xr1, $xr1, 14
; CHECK-NEXT: xvmax.bu $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvrepl128vei.b $xr1, $xr1, 1
; CHECK-NEXT: xvmax.bu $xr0, $xr0, $xr1
; CHECK-NEXT: xvstelm.b $xr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <32 x i8>, ptr %src
%res = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> %v)
store i8 %res, ptr %dst
ret void
}
define void @vec_reduce_umax_v16i16(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_umax_v16i16:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 78
; CHECK-NEXT: xvshuf4i.h $xr1, $xr1, 228
; CHECK-NEXT: xvmax.hu $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvbsrl.v $xr1, $xr1, 8
; CHECK-NEXT: xvmax.hu $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvshuf4i.h $xr1, $xr1, 14
; CHECK-NEXT: xvmax.hu $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvrepl128vei.h $xr1, $xr1, 1
; CHECK-NEXT: xvmax.hu $xr0, $xr0, $xr1
; CHECK-NEXT: xvstelm.h $xr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <16 x i16>, ptr %src
%res = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> %v)
store i16 %res, ptr %dst
ret void
}
define void @vec_reduce_umax_v8i32(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_umax_v8i32:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 78
; CHECK-NEXT: xvshuf4i.w $xr1, $xr1, 228
; CHECK-NEXT: xvmax.wu $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvshuf4i.w $xr1, $xr1, 14
; CHECK-NEXT: xvmax.wu $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvrepl128vei.w $xr1, $xr1, 1
; CHECK-NEXT: xvmax.wu $xr0, $xr0, $xr1
; CHECK-NEXT: xvstelm.w $xr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <8 x i32>, ptr %src
%res = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> %v)
store i32 %res, ptr %dst
ret void
}
define void @vec_reduce_umax_v4i64(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_umax_v4i64:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_0)
; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI3_0)
; CHECK-NEXT: xvpermi.d $xr2, $xr0, 78
; CHECK-NEXT: xvshuf.d $xr1, $xr0, $xr2
; CHECK-NEXT: xvmax.du $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvrepl128vei.d $xr1, $xr1, 1
; CHECK-NEXT: xvmax.du $xr0, $xr0, $xr1
; CHECK-NEXT: xvstelm.d $xr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <4 x i64>, ptr %src
%res = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> %v)
store i64 %res, ptr %dst
ret void
}

View File

@ -1,95 +0,0 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s
define void @vec_reduce_umin_v32i8(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_umin_v32i8:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 78
; CHECK-NEXT: xvshuf4i.b $xr1, $xr1, 228
; CHECK-NEXT: xvmin.bu $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvbsrl.v $xr1, $xr1, 8
; CHECK-NEXT: xvmin.bu $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvsrli.d $xr1, $xr1, 32
; CHECK-NEXT: xvmin.bu $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvshuf4i.b $xr1, $xr1, 14
; CHECK-NEXT: xvmin.bu $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvrepl128vei.b $xr1, $xr1, 1
; CHECK-NEXT: xvmin.bu $xr0, $xr0, $xr1
; CHECK-NEXT: xvstelm.b $xr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <32 x i8>, ptr %src
%res = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> %v)
store i8 %res, ptr %dst
ret void
}
define void @vec_reduce_umin_v16i16(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_umin_v16i16:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 78
; CHECK-NEXT: xvshuf4i.h $xr1, $xr1, 228
; CHECK-NEXT: xvmin.hu $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvbsrl.v $xr1, $xr1, 8
; CHECK-NEXT: xvmin.hu $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvshuf4i.h $xr1, $xr1, 14
; CHECK-NEXT: xvmin.hu $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvrepl128vei.h $xr1, $xr1, 1
; CHECK-NEXT: xvmin.hu $xr0, $xr0, $xr1
; CHECK-NEXT: xvstelm.h $xr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <16 x i16>, ptr %src
%res = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> %v)
store i16 %res, ptr %dst
ret void
}
define void @vec_reduce_umin_v8i32(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_umin_v8i32:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 78
; CHECK-NEXT: xvshuf4i.w $xr1, $xr1, 228
; CHECK-NEXT: xvmin.wu $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvshuf4i.w $xr1, $xr1, 14
; CHECK-NEXT: xvmin.wu $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvrepl128vei.w $xr1, $xr1, 1
; CHECK-NEXT: xvmin.wu $xr0, $xr0, $xr1
; CHECK-NEXT: xvstelm.w $xr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <8 x i32>, ptr %src
%res = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> %v)
store i32 %res, ptr %dst
ret void
}
define void @vec_reduce_umin_v4i64(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_umin_v4i64:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_0)
; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI3_0)
; CHECK-NEXT: xvpermi.d $xr2, $xr0, 78
; CHECK-NEXT: xvshuf.d $xr1, $xr0, $xr2
; CHECK-NEXT: xvmin.du $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvrepl128vei.d $xr1, $xr1, 1
; CHECK-NEXT: xvmin.du $xr0, $xr0, $xr1
; CHECK-NEXT: xvstelm.d $xr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <4 x i64>, ptr %src
%res = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> %v)
store i64 %res, ptr %dst
ret void
}

View File

@ -1,95 +0,0 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc --mtriple=loongarch64 --mattr=+lasx %s -o - | FileCheck %s
define void @vec_reduce_xor_v32i8(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_xor_v32i8:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 78
; CHECK-NEXT: xvshuf4i.b $xr1, $xr1, 228
; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvbsrl.v $xr1, $xr1, 8
; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvsrli.d $xr1, $xr1, 32
; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvshuf4i.b $xr1, $xr1, 14
; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvrepl128vei.b $xr1, $xr1, 1
; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvstelm.b $xr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <32 x i8>, ptr %src
%res = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> %v)
store i8 %res, ptr %dst
ret void
}
define void @vec_reduce_xor_v16i16(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_xor_v16i16:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 78
; CHECK-NEXT: xvshuf4i.h $xr1, $xr1, 228
; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvbsrl.v $xr1, $xr1, 8
; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvshuf4i.h $xr1, $xr1, 14
; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvrepl128vei.h $xr1, $xr1, 1
; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvstelm.h $xr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <16 x i16>, ptr %src
%res = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> %v)
store i16 %res, ptr %dst
ret void
}
define void @vec_reduce_xor_v8i32(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_xor_v8i32:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 78
; CHECK-NEXT: xvshuf4i.w $xr1, $xr1, 228
; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvshuf4i.w $xr1, $xr1, 14
; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvrepl128vei.w $xr1, $xr1, 1
; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvstelm.w $xr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <8 x i32>, ptr %src
%res = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> %v)
store i32 %res, ptr %dst
ret void
}
define void @vec_reduce_xor_v4i64(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_xor_v4i64:
; CHECK: # %bb.0:
; CHECK-NEXT: xvld $xr0, $a0, 0
; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_0)
; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI3_0)
; CHECK-NEXT: xvpermi.d $xr2, $xr0, 78
; CHECK-NEXT: xvshuf.d $xr1, $xr0, $xr2
; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
; CHECK-NEXT: xvrepl128vei.d $xr1, $xr1, 1
; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvstelm.d $xr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <4 x i64>, ptr %src
%res = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> %v)
store i64 %res, ptr %dst
ret void
}

View File

@ -1,168 +0,0 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s
define void @vec_reduce_and_v16i8(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_and_v16i8:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8
; CHECK-NEXT: vand.v $vr0, $vr0, $vr1
; CHECK-NEXT: vsrli.d $vr1, $vr0, 32
; CHECK-NEXT: vand.v $vr0, $vr0, $vr1
; CHECK-NEXT: vshuf4i.b $vr1, $vr0, 14
; CHECK-NEXT: vand.v $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.b $vr1, $vr0, 1
; CHECK-NEXT: vand.v $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <16 x i8>, ptr %src
%res = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> %v)
store i8 %res, ptr %dst
ret void
}
define void @vec_reduce_and_v8i8(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_and_v8i8:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.d $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0
; CHECK-NEXT: vsrli.d $vr1, $vr0, 32
; CHECK-NEXT: vand.v $vr0, $vr0, $vr1
; CHECK-NEXT: vshuf4i.b $vr1, $vr0, 14
; CHECK-NEXT: vand.v $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.b $vr1, $vr0, 1
; CHECK-NEXT: vand.v $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <8 x i8>, ptr %src
%res = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> %v)
store i8 %res, ptr %dst
ret void
}
define void @vec_reduce_and_v4i8(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_and_v4i8:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.w $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 0
; CHECK-NEXT: vshuf4i.b $vr1, $vr0, 14
; CHECK-NEXT: vand.v $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.b $vr1, $vr0, 1
; CHECK-NEXT: vand.v $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <4 x i8>, ptr %src
%res = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> %v)
store i8 %res, ptr %dst
ret void
}
define void @vec_reduce_and_v2i8(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_and_v2i8:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.h $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.h $vr0, $a0, 0
; CHECK-NEXT: vreplvei.b $vr1, $vr0, 1
; CHECK-NEXT: vand.v $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <2 x i8>, ptr %src
%res = call i8 @llvm.vector.reduce.and.v2i8(<2 x i8> %v)
store i8 %res, ptr %dst
ret void
}
define void @vec_reduce_and_v8i16(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_and_v8i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8
; CHECK-NEXT: vand.v $vr0, $vr0, $vr1
; CHECK-NEXT: vshuf4i.h $vr1, $vr0, 14
; CHECK-NEXT: vand.v $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.h $vr1, $vr0, 1
; CHECK-NEXT: vand.v $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <8 x i16>, ptr %src
%res = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> %v)
store i16 %res, ptr %dst
ret void
}
define void @vec_reduce_and_v4i16(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_and_v4i16:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.d $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0
; CHECK-NEXT: vshuf4i.h $vr1, $vr0, 14
; CHECK-NEXT: vand.v $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.h $vr1, $vr0, 1
; CHECK-NEXT: vand.v $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <4 x i16>, ptr %src
%res = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> %v)
store i16 %res, ptr %dst
ret void
}
define void @vec_reduce_and_v2i16(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_and_v2i16:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.w $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 0
; CHECK-NEXT: vreplvei.h $vr1, $vr0, 1
; CHECK-NEXT: vand.v $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <2 x i16>, ptr %src
%res = call i16 @llvm.vector.reduce.and.v2i16(<2 x i16> %v)
store i16 %res, ptr %dst
ret void
}
define void @vec_reduce_and_v4i32(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_and_v4i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vshuf4i.w $vr1, $vr0, 14
; CHECK-NEXT: vand.v $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.w $vr1, $vr0, 1
; CHECK-NEXT: vand.v $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <4 x i32>, ptr %src
%res = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %v)
store i32 %res, ptr %dst
ret void
}
define void @vec_reduce_and_v2i32(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_and_v2i32:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.d $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0
; CHECK-NEXT: vreplvei.w $vr1, $vr0, 1
; CHECK-NEXT: vand.v $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <2 x i32>, ptr %src
%res = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> %v)
store i32 %res, ptr %dst
ret void
}
define void @vec_reduce_and_v2i64(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_and_v2i64:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vreplvei.d $vr1, $vr0, 1
; CHECK-NEXT: vand.v $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.d $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <2 x i64>, ptr %src
%res = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> %v)
store i64 %res, ptr %dst
ret void
}

View File

@ -1,168 +0,0 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s
define void @vec_reduce_or_v16i8(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_or_v16i8:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8
; CHECK-NEXT: vor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vsrli.d $vr1, $vr0, 32
; CHECK-NEXT: vor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vshuf4i.b $vr1, $vr0, 14
; CHECK-NEXT: vor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.b $vr1, $vr0, 1
; CHECK-NEXT: vor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <16 x i8>, ptr %src
%res = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> %v)
store i8 %res, ptr %dst
ret void
}
define void @vec_reduce_or_v8i8(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_or_v8i8:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.d $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0
; CHECK-NEXT: vsrli.d $vr1, $vr0, 32
; CHECK-NEXT: vor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vshuf4i.b $vr1, $vr0, 14
; CHECK-NEXT: vor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.b $vr1, $vr0, 1
; CHECK-NEXT: vor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <8 x i8>, ptr %src
%res = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> %v)
store i8 %res, ptr %dst
ret void
}
define void @vec_reduce_or_v4i8(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_or_v4i8:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.w $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 0
; CHECK-NEXT: vshuf4i.b $vr1, $vr0, 14
; CHECK-NEXT: vor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.b $vr1, $vr0, 1
; CHECK-NEXT: vor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <4 x i8>, ptr %src
%res = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> %v)
store i8 %res, ptr %dst
ret void
}
define void @vec_reduce_or_v2i8(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_or_v2i8:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.h $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.h $vr0, $a0, 0
; CHECK-NEXT: vreplvei.b $vr1, $vr0, 1
; CHECK-NEXT: vor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <2 x i8>, ptr %src
%res = call i8 @llvm.vector.reduce.or.v2i8(<2 x i8> %v)
store i8 %res, ptr %dst
ret void
}
define void @vec_reduce_or_v8i16(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_or_v8i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8
; CHECK-NEXT: vor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vshuf4i.h $vr1, $vr0, 14
; CHECK-NEXT: vor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.h $vr1, $vr0, 1
; CHECK-NEXT: vor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <8 x i16>, ptr %src
%res = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> %v)
store i16 %res, ptr %dst
ret void
}
define void @vec_reduce_or_v4i16(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_or_v4i16:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.d $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0
; CHECK-NEXT: vshuf4i.h $vr1, $vr0, 14
; CHECK-NEXT: vor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.h $vr1, $vr0, 1
; CHECK-NEXT: vor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <4 x i16>, ptr %src
%res = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> %v)
store i16 %res, ptr %dst
ret void
}
define void @vec_reduce_or_v2i16(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_or_v2i16:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.w $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 0
; CHECK-NEXT: vreplvei.h $vr1, $vr0, 1
; CHECK-NEXT: vor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <2 x i16>, ptr %src
%res = call i16 @llvm.vector.reduce.or.v2i16(<2 x i16> %v)
store i16 %res, ptr %dst
ret void
}
define void @vec_reduce_or_v4i32(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_or_v4i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vshuf4i.w $vr1, $vr0, 14
; CHECK-NEXT: vor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.w $vr1, $vr0, 1
; CHECK-NEXT: vor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <4 x i32>, ptr %src
%res = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %v)
store i32 %res, ptr %dst
ret void
}
define void @vec_reduce_or_v2i32(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_or_v2i32:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.d $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0
; CHECK-NEXT: vreplvei.w $vr1, $vr0, 1
; CHECK-NEXT: vor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <2 x i32>, ptr %src
%res = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> %v)
store i32 %res, ptr %dst
ret void
}
define void @vec_reduce_or_v2i64(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_or_v2i64:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vreplvei.d $vr1, $vr0, 1
; CHECK-NEXT: vor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.d $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <2 x i64>, ptr %src
%res = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> %v)
store i64 %res, ptr %dst
ret void
}

View File

@ -1,168 +0,0 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s
define void @vec_reduce_smax_v16i8(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_smax_v16i8:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8
; CHECK-NEXT: vmax.b $vr0, $vr0, $vr1
; CHECK-NEXT: vsrli.d $vr1, $vr0, 32
; CHECK-NEXT: vmax.b $vr0, $vr0, $vr1
; CHECK-NEXT: vshuf4i.b $vr1, $vr0, 14
; CHECK-NEXT: vmax.b $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.b $vr1, $vr0, 1
; CHECK-NEXT: vmax.b $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <16 x i8>, ptr %src
%res = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %v)
store i8 %res, ptr %dst
ret void
}
define void @vec_reduce_smax_v8i8(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_smax_v8i8:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.d $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0
; CHECK-NEXT: vsrli.d $vr1, $vr0, 32
; CHECK-NEXT: vmax.b $vr0, $vr0, $vr1
; CHECK-NEXT: vshuf4i.b $vr1, $vr0, 14
; CHECK-NEXT: vmax.b $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.b $vr1, $vr0, 1
; CHECK-NEXT: vmax.b $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <8 x i8>, ptr %src
%res = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> %v)
store i8 %res, ptr %dst
ret void
}
define void @vec_reduce_smax_v4i8(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_smax_v4i8:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.w $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 0
; CHECK-NEXT: vshuf4i.b $vr1, $vr0, 14
; CHECK-NEXT: vmax.b $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.b $vr1, $vr0, 1
; CHECK-NEXT: vmax.b $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <4 x i8>, ptr %src
%res = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> %v)
store i8 %res, ptr %dst
ret void
}
define void @vec_reduce_smax_v2i8(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_smax_v2i8:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.h $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.h $vr0, $a0, 0
; CHECK-NEXT: vreplvei.b $vr1, $vr0, 1
; CHECK-NEXT: vmax.b $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <2 x i8>, ptr %src
%res = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> %v)
store i8 %res, ptr %dst
ret void
}
define void @vec_reduce_smax_v8i16(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_smax_v8i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8
; CHECK-NEXT: vmax.h $vr0, $vr0, $vr1
; CHECK-NEXT: vshuf4i.h $vr1, $vr0, 14
; CHECK-NEXT: vmax.h $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.h $vr1, $vr0, 1
; CHECK-NEXT: vmax.h $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <8 x i16>, ptr %src
%res = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %v)
store i16 %res, ptr %dst
ret void
}
define void @vec_reduce_smax_v4i16(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_smax_v4i16:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.d $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0
; CHECK-NEXT: vshuf4i.h $vr1, $vr0, 14
; CHECK-NEXT: vmax.h $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.h $vr1, $vr0, 1
; CHECK-NEXT: vmax.h $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <4 x i16>, ptr %src
%res = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> %v)
store i16 %res, ptr %dst
ret void
}
define void @vec_reduce_smax_v2i16(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_smax_v2i16:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.w $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 0
; CHECK-NEXT: vreplvei.h $vr1, $vr0, 1
; CHECK-NEXT: vmax.h $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <2 x i16>, ptr %src
%res = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> %v)
store i16 %res, ptr %dst
ret void
}
define void @vec_reduce_smax_v4i32(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_smax_v4i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vshuf4i.w $vr1, $vr0, 14
; CHECK-NEXT: vmax.w $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.w $vr1, $vr0, 1
; CHECK-NEXT: vmax.w $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <4 x i32>, ptr %src
%res = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %v)
store i32 %res, ptr %dst
ret void
}
define void @vec_reduce_smax_v2i32(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_smax_v2i32:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.d $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0
; CHECK-NEXT: vreplvei.w $vr1, $vr0, 1
; CHECK-NEXT: vmax.w $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <2 x i32>, ptr %src
%res = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> %v)
store i32 %res, ptr %dst
ret void
}
define void @vec_reduce_smax_v2i64(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_smax_v2i64:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vreplvei.d $vr1, $vr0, 1
; CHECK-NEXT: vmax.d $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.d $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <2 x i64>, ptr %src
%res = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> %v)
store i64 %res, ptr %dst
ret void
}

View File

@ -1,168 +0,0 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s
define void @vec_reduce_smin_v16i8(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_smin_v16i8:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8
; CHECK-NEXT: vmin.b $vr0, $vr0, $vr1
; CHECK-NEXT: vsrli.d $vr1, $vr0, 32
; CHECK-NEXT: vmin.b $vr0, $vr0, $vr1
; CHECK-NEXT: vshuf4i.b $vr1, $vr0, 14
; CHECK-NEXT: vmin.b $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.b $vr1, $vr0, 1
; CHECK-NEXT: vmin.b $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <16 x i8>, ptr %src
%res = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %v)
store i8 %res, ptr %dst
ret void
}
define void @vec_reduce_smin_v8i8(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_smin_v8i8:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.d $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0
; CHECK-NEXT: vsrli.d $vr1, $vr0, 32
; CHECK-NEXT: vmin.b $vr0, $vr0, $vr1
; CHECK-NEXT: vshuf4i.b $vr1, $vr0, 14
; CHECK-NEXT: vmin.b $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.b $vr1, $vr0, 1
; CHECK-NEXT: vmin.b $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <8 x i8>, ptr %src
%res = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> %v)
store i8 %res, ptr %dst
ret void
}
define void @vec_reduce_smin_v4i8(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_smin_v4i8:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.w $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 0
; CHECK-NEXT: vshuf4i.b $vr1, $vr0, 14
; CHECK-NEXT: vmin.b $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.b $vr1, $vr0, 1
; CHECK-NEXT: vmin.b $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <4 x i8>, ptr %src
%res = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> %v)
store i8 %res, ptr %dst
ret void
}
define void @vec_reduce_smin_v2i8(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_smin_v2i8:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.h $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.h $vr0, $a0, 0
; CHECK-NEXT: vreplvei.b $vr1, $vr0, 1
; CHECK-NEXT: vmin.b $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <2 x i8>, ptr %src
%res = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> %v)
store i8 %res, ptr %dst
ret void
}
define void @vec_reduce_smin_v8i16(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_smin_v8i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8
; CHECK-NEXT: vmin.h $vr0, $vr0, $vr1
; CHECK-NEXT: vshuf4i.h $vr1, $vr0, 14
; CHECK-NEXT: vmin.h $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.h $vr1, $vr0, 1
; CHECK-NEXT: vmin.h $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <8 x i16>, ptr %src
%res = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %v)
store i16 %res, ptr %dst
ret void
}
define void @vec_reduce_smin_v4i16(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_smin_v4i16:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.d $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0
; CHECK-NEXT: vshuf4i.h $vr1, $vr0, 14
; CHECK-NEXT: vmin.h $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.h $vr1, $vr0, 1
; CHECK-NEXT: vmin.h $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <4 x i16>, ptr %src
%res = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> %v)
store i16 %res, ptr %dst
ret void
}
define void @vec_reduce_smin_v2i16(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_smin_v2i16:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.w $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 0
; CHECK-NEXT: vreplvei.h $vr1, $vr0, 1
; CHECK-NEXT: vmin.h $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <2 x i16>, ptr %src
%res = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> %v)
store i16 %res, ptr %dst
ret void
}
define void @vec_reduce_smin_v4i32(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_smin_v4i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vshuf4i.w $vr1, $vr0, 14
; CHECK-NEXT: vmin.w $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.w $vr1, $vr0, 1
; CHECK-NEXT: vmin.w $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <4 x i32>, ptr %src
%res = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %v)
store i32 %res, ptr %dst
ret void
}
define void @vec_reduce_smin_v2i32(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_smin_v2i32:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.d $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0
; CHECK-NEXT: vreplvei.w $vr1, $vr0, 1
; CHECK-NEXT: vmin.w $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <2 x i32>, ptr %src
%res = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> %v)
store i32 %res, ptr %dst
ret void
}
define void @vec_reduce_smin_v2i64(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_smin_v2i64:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vreplvei.d $vr1, $vr0, 1
; CHECK-NEXT: vmin.d $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.d $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <2 x i64>, ptr %src
%res = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> %v)
store i64 %res, ptr %dst
ret void
}

View File

@ -1,168 +0,0 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc --mtriple=loongarch64 --mattr=+lsx %s -o - | FileCheck %s
define void @vec_reduce_umax_v16i8(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_umax_v16i8:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8
; CHECK-NEXT: vmax.bu $vr0, $vr0, $vr1
; CHECK-NEXT: vsrli.d $vr1, $vr0, 32
; CHECK-NEXT: vmax.bu $vr0, $vr0, $vr1
; CHECK-NEXT: vshuf4i.b $vr1, $vr0, 14
; CHECK-NEXT: vmax.bu $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.b $vr1, $vr0, 1
; CHECK-NEXT: vmax.bu $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <16 x i8>, ptr %src
%res = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %v)
store i8 %res, ptr %dst
ret void
}
define void @vec_reduce_umax_v8i8(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_umax_v8i8:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.d $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0
; CHECK-NEXT: vsrli.d $vr1, $vr0, 32
; CHECK-NEXT: vmax.bu $vr0, $vr0, $vr1
; CHECK-NEXT: vshuf4i.b $vr1, $vr0, 14
; CHECK-NEXT: vmax.bu $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.b $vr1, $vr0, 1
; CHECK-NEXT: vmax.bu $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <8 x i8>, ptr %src
%res = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> %v)
store i8 %res, ptr %dst
ret void
}
define void @vec_reduce_umax_v4i8(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_umax_v4i8:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.w $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 0
; CHECK-NEXT: vshuf4i.b $vr1, $vr0, 14
; CHECK-NEXT: vmax.bu $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.b $vr1, $vr0, 1
; CHECK-NEXT: vmax.bu $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <4 x i8>, ptr %src
%res = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> %v)
store i8 %res, ptr %dst
ret void
}
define void @vec_reduce_umax_v2i8(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_umax_v2i8:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.h $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.h $vr0, $a0, 0
; CHECK-NEXT: vreplvei.b $vr1, $vr0, 1
; CHECK-NEXT: vmax.bu $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <2 x i8>, ptr %src
%res = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> %v)
store i8 %res, ptr %dst
ret void
}
define void @vec_reduce_umax_v8i16(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_umax_v8i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8
; CHECK-NEXT: vmax.hu $vr0, $vr0, $vr1
; CHECK-NEXT: vshuf4i.h $vr1, $vr0, 14
; CHECK-NEXT: vmax.hu $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.h $vr1, $vr0, 1
; CHECK-NEXT: vmax.hu $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <8 x i16>, ptr %src
%res = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %v)
store i16 %res, ptr %dst
ret void
}
define void @vec_reduce_umax_v4i16(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_umax_v4i16:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.d $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0
; CHECK-NEXT: vshuf4i.h $vr1, $vr0, 14
; CHECK-NEXT: vmax.hu $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.h $vr1, $vr0, 1
; CHECK-NEXT: vmax.hu $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <4 x i16>, ptr %src
%res = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> %v)
store i16 %res, ptr %dst
ret void
}
define void @vec_reduce_umax_v2i16(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_umax_v2i16:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.w $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.w $vr0, $a0, 0
; CHECK-NEXT: vreplvei.h $vr1, $vr0, 1
; CHECK-NEXT: vmax.hu $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <2 x i16>, ptr %src
%res = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> %v)
store i16 %res, ptr %dst
ret void
}
define void @vec_reduce_umax_v4i32(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_umax_v4i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vshuf4i.w $vr1, $vr0, 14
; CHECK-NEXT: vmax.wu $vr0, $vr0, $vr1
; CHECK-NEXT: vreplvei.w $vr1, $vr0, 1
; CHECK-NEXT: vmax.wu $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <4 x i32>, ptr %src
%res = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %v)
store i32 %res, ptr %dst
ret void
}
define void @vec_reduce_umax_v2i32(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_umax_v2i32:
; CHECK: # %bb.0:
; CHECK-NEXT: ld.d $a0, $a0, 0
; CHECK-NEXT: vinsgr2vr.d $vr0, $a0, 0
; CHECK-NEXT: vreplvei.w $vr1, $vr0, 1
; CHECK-NEXT: vmax.wu $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <2 x i32>, ptr %src
%res = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> %v)
store i32 %res, ptr %dst
ret void
}
define void @vec_reduce_umax_v2i64(ptr %src, ptr %dst) nounwind {
; CHECK-LABEL: vec_reduce_umax_v2i64:
; CHECK: # %bb.0:
; CHECK-NEXT: vld $vr0, $a0, 0
; CHECK-NEXT: vreplvei.d $vr1, $vr0, 1
; CHECK-NEXT: vmax.du $vr0, $vr0, $vr1
; CHECK-NEXT: vstelm.d $vr0, $a1, 0, 0
; CHECK-NEXT: ret
%v = load <2 x i64>, ptr %src
%res = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> %v)
store i64 %res, ptr %dst
ret void
}

Some files were not shown because too many files have changed in this diff.