diff --git a/.ci/utils.sh b/.ci/utils.sh index 97724444b96f..2a3d2426b630 100644 --- a/.ci/utils.sh +++ b/.ci/utils.sh @@ -24,6 +24,7 @@ function at-exit { retcode=$? mkdir -p artifacts + sccache --show-stats sccache --show-stats >> artifacts/sccache_stats.txt cp "${BUILD_DIR}"/.ninja_log artifacts/.ninja_log cp "${MONOREPO_ROOT}"/*.log artifacts/ || : diff --git a/.github/new-prs-labeler.yml b/.github/new-prs-labeler.yml index 8e0fa8d42d73..dab3db2616f5 100644 --- a/.github/new-prs-labeler.yml +++ b/.github/new-prs-labeler.yml @@ -90,9 +90,6 @@ LTO: - llvm/lib/Transforms/*/FunctionImport* - llvm/tools/gold/** -mc: - - llvm/*/MC/** - clang:driver: - clang/*/Driver/** @@ -621,6 +618,12 @@ llvm:adt: llvm:support: - llvm/**/Support/** +# Skip llvm/test/MC and llvm/unittests/MC, which includes target-specific directories. +llvm:mc: + - llvm/include/llvm/MC/** + - llvm/lib/MC/** + - llvm/tools/llvm-mc/** + llvm:transforms: - llvm/lib/Transforms/** - llvm/include/llvm/Transforms/** diff --git a/.github/workflows/containers/github-action-ci/Dockerfile b/.github/workflows/containers/github-action-ci/Dockerfile index 227496051af5..8a888f3a411c 100644 --- a/.github/workflows/containers/github-action-ci/Dockerfile +++ b/.github/workflows/containers/github-action-ci/Dockerfile @@ -81,6 +81,8 @@ RUN curl -L 'https://github.com/mozilla/sccache/releases/download/v0.10.0/sccach ENV LLVM_SYSROOT=$LLVM_SYSROOT ENV PATH=${LLVM_SYSROOT}/bin:${PATH} +ENV CC=clang +ENV CXX=clang++ # Create a new user to avoid test failures related to a lack of expected # permissions issues in some tests. 
Set the user id to 1001 as that is the diff --git a/.github/workflows/libclang-python-tests.yml b/.github/workflows/libclang-python-tests.yml index 50ef4acf2feb..e16892832556 100644 --- a/.github/workflows/libclang-python-tests.yml +++ b/.github/workflows/libclang-python-tests.yml @@ -4,7 +4,6 @@ permissions: contents: read on: - workflow_dispatch: push: branches: - 'main' @@ -13,29 +12,46 @@ on: - 'clang/tools/libclang/**' - 'clang/CMakeList.txt' - '.github/workflows/libclang-python-tests.yml' - - '.github/workflows/llvm-project-tests.yml' pull_request: paths: - 'clang/bindings/python/**' - 'clang/tools/libclang/**' - 'clang/CMakeList.txt' - '.github/workflows/libclang-python-tests.yml' - - '.github/workflows/llvm-project-tests.yml' jobs: check-clang-python: # Build libclang and then run the libclang Python binding's unit tests. + # There is an issue running on "windows-2019". + # See https://github.com/llvm/llvm-project/issues/76601#issuecomment-1873049082. name: Build and run Python unit tests if: github.repository == 'llvm/llvm-project' + runs-on: ubuntu-24.04 strategy: fail-fast: false matrix: python-version: ["3.8", "3.13"] - uses: ./.github/workflows/llvm-project-tests.yml - with: - build_target: check-clang-python - projects: clang - # There is an issue running on "windows-2019". - # See https://github.com/llvm/llvm-project/issues/76601#issuecomment-1873049082. 
- os_list: '["ubuntu-24.04"]' - python_version: ${{ matrix.python-version }} + steps: + - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + - name: Setup Python + uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 + with: + python-version: ${{ matrix.python-version }} + - name: Setup ccache + uses: hendrikmuhs/ccache-action@a1209f81afb8c005c13b4296c32e363431bffea5 # v1.2.17 + with: + max-size: 2G + key: spirv-ubuntu-24.04 + variant: sccache + - name: Build and Test + run: | + mkdir build + cmake -GNinja \ + -S llvm \ + -B build \ + -DCMAKE_BUILD_TYPE=Release \ + -DLLVM_ENABLE_ASSERTIONS=ON \ + -DCMAKE_C_COMPILER_LAUNCHER=sccache \ + -DCMAKE_CXX_COMPILER_LAUNCHER=sccache \ + -DLLVM_ENABLE_PROJECTS=clang + ninja -C build check-clang-python diff --git a/.github/workflows/mlir-spirv-tests.yml b/.github/workflows/mlir-spirv-tests.yml index 48b6c69a61f5..78952ccad264 100644 --- a/.github/workflows/mlir-spirv-tests.yml +++ b/.github/workflows/mlir-spirv-tests.yml @@ -24,9 +24,28 @@ jobs: check_spirv: if: github.repository_owner == 'llvm' name: Test MLIR SPIR-V - uses: ./.github/workflows/llvm-project-tests.yml - with: - build_target: check-mlir - projects: mlir - extra_cmake_args: '-DLLVM_TARGETS_TO_BUILD="host" -DLLVM_INCLUDE_SPIRV_TOOLS_TESTS=ON' - os_list: '["ubuntu-24.04"]' + runs-on: ubuntu-24.04 + container: + image: ghcr.io/llvm/ci-ubuntu-24.04:latest + steps: + - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + - name: Setup ccache + uses: hendrikmuhs/ccache-action@a1209f81afb8c005c13b4296c32e363431bffea5 # v1.2.17 + with: + max-size: 2G + key: spirv-mlir-ubuntu-24.04 + variant: sccache + - name: Build and Test + run: | + mkdir build + cmake -GNinja \ + -S llvm \ + -B build \ + -DCMAKE_BUILD_TYPE=Release \ + -DLLVM_ENABLE_ASSERTIONS=ON \ + -DCMAKE_C_COMPILER_LAUNCHER=sccache \ + -DCMAKE_CXX_COMPILER_LAUNCHER=sccache \ + -DLLVM_TARGETS_TO_BUILD="host" \ + 
-DLLVM_INCLUDE_SPIRV_TOOLS_TESTS=ON \ + -DLLVM_ENABLE_PROJECTS=mlir + ninja -C build check-mlir diff --git a/.github/workflows/spirv-tests.yml b/.github/workflows/spirv-tests.yml index f15ca1cb64ba..8708fb06d9eb 100644 --- a/.github/workflows/spirv-tests.yml +++ b/.github/workflows/spirv-tests.yml @@ -4,7 +4,6 @@ permissions: contents: read on: - workflow_dispatch: pull_request: paths: - 'llvm/lib/Target/SPIRV/**' @@ -21,9 +20,27 @@ jobs: check_spirv: if: github.repository_owner == 'llvm' name: Test SPIR-V - uses: ./.github/workflows/llvm-project-tests.yml - with: - build_target: check-llvm-codegen-spirv - projects: - extra_cmake_args: '-DLLVM_TARGETS_TO_BUILD="SPIRV" -DLLVM_INCLUDE_SPIRV_TOOLS_TESTS=ON' - os_list: '["ubuntu-24.04"]' + runs-on: ubuntu-24.04 + container: + image: ghcr.io/llvm/ci-ubuntu-24.04:latest + steps: + - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + - name: Setup ccache + uses: hendrikmuhs/ccache-action@a1209f81afb8c005c13b4296c32e363431bffea5 # v1.2.17 + with: + max-size: 2G + key: spirv-ubuntu-24.04 + variant: sccache + - name: Build and Test + run: | + mkdir build + cmake -GNinja \ + -S llvm \ + -B build \ + -DCMAKE_BUILD_TYPE=Release \ + -DLLVM_ENABLE_ASSERTIONS=ON \ + -DCMAKE_C_COMPILER_LAUNCHER=sccache \ + -DCMAKE_CXX_COMPILER_LAUNCHER=sccache \ + -DLLVM_TARGETS_TO_BUILD="SPIRV" \ + -DLLVM_INCLUDE_SPIRV_TOOLS_TESTS=ON + ninja -C build check-llvm-codegen-spirv diff --git a/.gitignore b/.gitignore index a84268a7f686..860b8ea12abd 100644 --- a/.gitignore +++ b/.gitignore @@ -52,6 +52,11 @@ autoconf/autom4te.cache # CLion project configuration /.idea /cmake-build* +# Coding assistants' stuff +/CLAUDE.md +/.claude/ +/GEMINI.md +/.gemini/ #==============================================================================# # Directories to ignore (do not add trailing '/'s, they skip symlinks). 
diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h index f902a8c43cd1..e773250ce873 100644 --- a/bolt/include/bolt/Core/MCPlusBuilder.h +++ b/bolt/include/bolt/Core/MCPlusBuilder.h @@ -740,6 +740,10 @@ public: return false; } + /// Return true if the hlt instruction under the x86, otherwise, default to + /// false. + virtual bool isX86HLT(const MCInst &Inst) const { return false; } + /// Return the width, in bytes, of the memory access performed by \p Inst, if /// this is a pop instruction. Return zero otherwise. virtual int getPopSize(const MCInst &Inst) const { diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp index 84f185346970..da59a188c6b6 100644 --- a/bolt/lib/Core/BinaryContext.cpp +++ b/bolt/lib/Core/BinaryContext.cpp @@ -2517,7 +2517,7 @@ BinaryContext::calculateEmittedSize(BinaryFunction &BF, bool FixBranches) { // Clean-up the effect of the code emission. for (const MCSymbol &Symbol : Assembler.symbols()) { MCSymbol *MutableSymbol = const_cast(&Symbol); - MutableSymbol->setUndefined(); + MutableSymbol->setFragment(nullptr); MutableSymbol->setIsRegistered(false); } diff --git a/bolt/lib/Core/MCPlusBuilder.cpp b/bolt/lib/Core/MCPlusBuilder.cpp index fa8f4d1df308..d8a2ac6f6837 100644 --- a/bolt/lib/Core/MCPlusBuilder.cpp +++ b/bolt/lib/Core/MCPlusBuilder.cpp @@ -132,8 +132,10 @@ bool MCPlusBuilder::equals(const MCSpecifierExpr &A, const MCSpecifierExpr &B, } bool MCPlusBuilder::isTerminator(const MCInst &Inst) const { - return Analysis->isTerminator(Inst) || - (opts::TerminalTrap && Info->get(Inst.getOpcode()).isTrap()); + return (opts::TerminalTrap && Info->get(Inst.getOpcode()).isTrap()) || + Analysis->isTerminator(Inst) + ? 
!isX86HLT(Inst) + : false; } void MCPlusBuilder::setTailCall(MCInst &Inst) const { diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp index 5d44e1a1a490..d7f02b947003 100644 --- a/bolt/lib/Passes/BinaryPasses.cpp +++ b/bolt/lib/Passes/BinaryPasses.cpp @@ -662,7 +662,7 @@ Error CleanMCState::runOnFunctions(BinaryContext &BC) { if (S->isDefined()) { LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName() << "\" is already defined\n"); - const_cast(S)->setUndefined(); + const_cast(S)->setFragment(nullptr); } if (S->isRegistered()) { LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName() diff --git a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp index a60c1a6bf156..1842509dcc5e 100644 --- a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp +++ b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp @@ -223,6 +223,10 @@ public: return Inst.getOpcode() == X86::ENDBR32 || Inst.getOpcode() == X86::ENDBR64; } + bool isX86HLT(const MCInst &Inst) const override { + return Inst.getOpcode() == X86::HLT; + } + int getPopSize(const MCInst &Inst) const override { switch (Inst.getOpcode()) { case X86::POP16r: diff --git a/bolt/test/X86/cfg_build_hlt.s b/bolt/test/X86/cfg_build_hlt.s new file mode 100644 index 000000000000..a78134df3401 --- /dev/null +++ b/bolt/test/X86/cfg_build_hlt.s @@ -0,0 +1,17 @@ +## Check CFG for halt instruction + +# RUN: %clang %cflags %s -static -o %t.exe -nostdlib +# RUN: llvm-bolt %t.exe --print-cfg --print-only=main -o %t 2>&1 | FileCheck %s --check-prefix=CHECK-CFG +# RUN: llvm-objdump -d %t --print-imm-hex | FileCheck %s --check-prefix=CHECK-BIN + +# CHECK-CFG: BB Count : 1 +# CHECK-BIN:
: +# CHECK-BIN-NEXT: f4 hlt +# CHECK-BIN-NEXT: c3 retq + +.global main + .type main, %function +main: + hlt + retq +.size main, .-main diff --git a/clang-tools-extra/clang-tidy/ClangTidy.cpp b/clang-tools-extra/clang-tidy/ClangTidy.cpp index 4ae2864d310d..b612d4f18acc 100644 --- a/clang-tools-extra/clang-tidy/ClangTidy.cpp +++ b/clang-tools-extra/clang-tidy/ClangTidy.cpp @@ -424,6 +424,10 @@ ClangTidyASTConsumerFactory::createASTConsumer( FinderOptions.CheckProfiling.emplace(Profiling->Records); } + // Avoid processing system headers, unless the user explicitly requests it + if (!Context.getOptions().SystemHeaders.value_or(false)) + FinderOptions.IgnoreSystemHeaders = true; + std::unique_ptr Finder( new ast_matchers::MatchFinder(std::move(FinderOptions))); diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/PreferMemberInitializerCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/PreferMemberInitializerCheck.cpp index 593a4f85d130..79cd4bbcc9a6 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/PreferMemberInitializerCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/PreferMemberInitializerCheck.cpp @@ -191,6 +191,9 @@ void PreferMemberInitializerCheck::check( if (!AssignmentToMember) continue; const FieldDecl *Field = AssignmentToMember->Field; + // Skip if the field is inherited from a base class. 
+ if (Field->getParent() != Class) + continue; const Expr *InitValue = AssignmentToMember->Init; updateAssignmentLevel(Field, InitValue, Ctor, AssignedFields); if (!canAdvanceAssignment(AssignedFields[Field])) diff --git a/clang-tools-extra/clang-tidy/misc/CMakeLists.txt b/clang-tools-extra/clang-tidy/misc/CMakeLists.txt index fd7affd22a46..2cfee5fd1071 100644 --- a/clang-tools-extra/clang-tidy/misc/CMakeLists.txt +++ b/clang-tools-extra/clang-tidy/misc/CMakeLists.txt @@ -32,6 +32,7 @@ add_clang_library(clangTidyMiscModule STATIC NoRecursionCheck.cpp NonCopyableObjects.cpp NonPrivateMemberVariablesInClassesCheck.cpp + OverrideWithDifferentVisibilityCheck.cpp RedundantExpressionCheck.cpp StaticAssertCheck.cpp ThrowByValueCatchByReferenceCheck.cpp diff --git a/clang-tools-extra/clang-tidy/misc/MiscTidyModule.cpp b/clang-tools-extra/clang-tidy/misc/MiscTidyModule.cpp index 6ddebcbc0e15..f675ca70deb9 100644 --- a/clang-tools-extra/clang-tidy/misc/MiscTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/misc/MiscTidyModule.cpp @@ -22,6 +22,7 @@ #include "NoRecursionCheck.h" #include "NonCopyableObjects.h" #include "NonPrivateMemberVariablesInClassesCheck.h" +#include "OverrideWithDifferentVisibilityCheck.h" #include "RedundantExpressionCheck.h" #include "StaticAssertCheck.h" #include "ThrowByValueCatchByReferenceCheck.h" @@ -81,6 +82,8 @@ public: "misc-use-anonymous-namespace"); CheckFactories.registerCheck( "misc-use-internal-linkage"); + CheckFactories.registerCheck( + "misc-override-with-different-visibility"); } }; diff --git a/clang-tools-extra/clang-tidy/misc/OverrideWithDifferentVisibilityCheck.cpp b/clang-tools-extra/clang-tidy/misc/OverrideWithDifferentVisibilityCheck.cpp new file mode 100644 index 000000000000..12f78affe463 --- /dev/null +++ b/clang-tools-extra/clang-tidy/misc/OverrideWithDifferentVisibilityCheck.cpp @@ -0,0 +1,150 @@ +//===--- OverrideWithDifferentVisibilityCheck.cpp - clang-tidy ------------===// +// +// Part of the LLVM Project, under the 
Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "OverrideWithDifferentVisibilityCheck.h" +#include "../utils/Matchers.h" +#include "../utils/OptionsUtils.h" +#include "clang/ASTMatchers/ASTMatchFinder.h" + +using namespace clang::ast_matchers; +using namespace clang; + +namespace { + +AST_MATCHER(NamedDecl, isOperatorDecl) { + DeclarationName::NameKind const NK = Node.getDeclName().getNameKind(); + return NK != DeclarationName::Identifier && + NK != DeclarationName::CXXConstructorName && + NK != DeclarationName::CXXDestructorName; +} + +} // namespace + +namespace clang::tidy { + +template <> +struct OptionEnumMapping< + misc::OverrideWithDifferentVisibilityCheck::ChangeKind> { + static llvm::ArrayRef> + getEnumMapping() { + static constexpr std::pair< + misc::OverrideWithDifferentVisibilityCheck::ChangeKind, StringRef> + Mapping[] = { + {misc::OverrideWithDifferentVisibilityCheck::ChangeKind::Any, + "any"}, + {misc::OverrideWithDifferentVisibilityCheck::ChangeKind::Widening, + "widening"}, + {misc::OverrideWithDifferentVisibilityCheck::ChangeKind::Narrowing, + "narrowing"}, + }; + return {Mapping}; + } +}; + +namespace misc { + +OverrideWithDifferentVisibilityCheck::OverrideWithDifferentVisibilityCheck( + StringRef Name, ClangTidyContext *Context) + : ClangTidyCheck(Name, Context), + DetectVisibilityChange( + Options.get("DisallowedVisibilityChange", ChangeKind::Any)), + CheckDestructors(Options.get("CheckDestructors", false)), + CheckOperators(Options.get("CheckOperators", false)), + IgnoredFunctions(utils::options::parseStringList( + Options.get("IgnoredFunctions", ""))) {} + +void OverrideWithDifferentVisibilityCheck::storeOptions( + ClangTidyOptions::OptionMap &Opts) { + Options.store(Opts, "DisallowedVisibilityChange", DetectVisibilityChange); + 
Options.store(Opts, "CheckDestructors", CheckDestructors); + Options.store(Opts, "CheckOperators", CheckOperators); + Options.store(Opts, "IgnoredFunctions", + utils::options::serializeStringList(IgnoredFunctions)); +} + +void OverrideWithDifferentVisibilityCheck::registerMatchers( + MatchFinder *Finder) { + const auto IgnoredDecl = + namedDecl(matchers::matchesAnyListedName(IgnoredFunctions)); + const auto FilterDestructors = + CheckDestructors ? decl() : decl(unless(cxxDestructorDecl())); + const auto FilterOperators = + CheckOperators ? namedDecl() : namedDecl(unless(isOperatorDecl())); + Finder->addMatcher( + cxxMethodDecl( + isVirtual(), FilterDestructors, FilterOperators, + ofClass( + cxxRecordDecl(unless(isExpansionInSystemHeader())).bind("class")), + forEachOverridden(cxxMethodDecl(ofClass(cxxRecordDecl().bind("base")), + unless(IgnoredDecl)) + .bind("base_func"))) + .bind("func"), + this); +} + +void OverrideWithDifferentVisibilityCheck::check( + const MatchFinder::MatchResult &Result) { + const auto *const MatchedFunction = + Result.Nodes.getNodeAs("func"); + if (!MatchedFunction->isCanonicalDecl()) + return; + + const auto *const ParentClass = + Result.Nodes.getNodeAs("class"); + const auto *const BaseClass = Result.Nodes.getNodeAs("base"); + CXXBasePaths Paths; + if (!ParentClass->isDerivedFrom(BaseClass, Paths)) + return; + + const auto *const OverriddenFunction = + Result.Nodes.getNodeAs("base_func"); + AccessSpecifier const ActualAccess = MatchedFunction->getAccess(); + AccessSpecifier OverriddenAccess = OverriddenFunction->getAccess(); + + const CXXBaseSpecifier *InheritanceWithStrictVisibility = nullptr; + for (const CXXBasePath &Path : Paths) { + for (const CXXBasePathElement &Elem : Path) { + if (Elem.Base->getAccessSpecifier() > OverriddenAccess) { + OverriddenAccess = Elem.Base->getAccessSpecifier(); + InheritanceWithStrictVisibility = Elem.Base; + } + } + } + + if (ActualAccess != OverriddenAccess) { + if (DetectVisibilityChange == 
ChangeKind::Widening && + ActualAccess > OverriddenAccess) + return; + if (DetectVisibilityChange == ChangeKind::Narrowing && + ActualAccess < OverriddenAccess) + return; + + if (InheritanceWithStrictVisibility) { + diag(MatchedFunction->getLocation(), + "visibility of function %0 is changed from %1 (through %1 " + "inheritance of class %2) to %3") + << MatchedFunction << OverriddenAccess + << InheritanceWithStrictVisibility->getType() << ActualAccess; + diag(InheritanceWithStrictVisibility->getBeginLoc(), + "%0 is inherited as %1 here", DiagnosticIDs::Note) + << InheritanceWithStrictVisibility->getType() << OverriddenAccess; + } else { + diag(MatchedFunction->getLocation(), + "visibility of function %0 is changed from %1 in class %2 to %3") + << MatchedFunction << OverriddenAccess << BaseClass << ActualAccess; + } + diag(OverriddenFunction->getLocation(), "function declared here as %0", + DiagnosticIDs::Note) + << OverriddenFunction->getAccess(); + } +} + +} // namespace misc + +} // namespace clang::tidy diff --git a/clang-tools-extra/clang-tidy/misc/OverrideWithDifferentVisibilityCheck.h b/clang-tools-extra/clang-tidy/misc/OverrideWithDifferentVisibilityCheck.h new file mode 100644 index 000000000000..1f5222d99196 --- /dev/null +++ b/clang-tools-extra/clang-tidy/misc/OverrideWithDifferentVisibilityCheck.h @@ -0,0 +1,43 @@ +//===--- OverrideWithDifferentVisibilityCheck.h - clang-tidy --*- C++ -*---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_OVERRIDEWITHDIFFERENTVISIBILITYCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_OVERRIDEWITHDIFFERENTVISIBILITYCHECK_H + +#include "../ClangTidyCheck.h" + +namespace clang::tidy::misc { + +/// Finds virtual function overrides with different visibility than the function +/// in the base class. +/// +/// For the user-facing documentation see: +/// http://clang.llvm.org/extra/clang-tidy/checks/misc/override-with-different-visibility.html +class OverrideWithDifferentVisibilityCheck : public ClangTidyCheck { +public: + enum class ChangeKind { Any, Widening, Narrowing }; + + OverrideWithDifferentVisibilityCheck(StringRef Name, + ClangTidyContext *Context); + void storeOptions(ClangTidyOptions::OptionMap &Opts) override; + void registerMatchers(ast_matchers::MatchFinder *Finder) override; + void check(const ast_matchers::MatchFinder::MatchResult &Result) override; + bool isLanguageVersionSupported(const LangOptions &LangOpts) const override { + return LangOpts.CPlusPlus; + } + +private: + ChangeKind DetectVisibilityChange; + bool CheckDestructors; + bool CheckOperators; + std::vector IgnoredFunctions; +}; + +} // namespace clang::tidy::misc + +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_MISC_OVERRIDEWITHDIFFERENTVISIBILITYCHECK_H diff --git a/clang-tools-extra/clang-tidy/tool/clang-tidy-diff.py b/clang-tools-extra/clang-tidy/tool/clang-tidy-diff.py index 7cd21afd70f7..d7899e0a18d0 100755 --- a/clang-tools-extra/clang-tidy/tool/clang-tidy-diff.py +++ b/clang-tools-extra/clang-tidy/tool/clang-tidy-diff.py @@ -28,6 +28,7 @@ import glob import json import multiprocessing import os +import queue import re import shutil import subprocess @@ -42,13 +43,6 @@ try: except ImportError: yaml = None -is_py2 = sys.version[0] == "2" - -if is_py2: - import Queue as queue -else: - import 
queue as queue - def run_tidy(task_queue, lock, timeout, failed_files): watchdog = None diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 1553f461634d..aab76ac24bc0 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -104,6 +104,10 @@ Improvements to clang-query Improvements to clang-tidy -------------------------- +- :program:`clang-tidy` no longer attempts to analyze code from system headers + by default, greatly improving performance. This behavior is disabled if the + `SystemHeaders` option is enabled. + - The :program:`run-clang-tidy.py` and :program:`clang-tidy-diff.py` scripts now run checks in parallel by default using all available hardware threads. Both scripts display the number of threads being used in their output. @@ -130,6 +134,12 @@ New checks Checks for uses of MLIR's old/to be deprecated ``OpBuilder::create`` form and suggests using ``T::create`` instead. +- New :doc:`misc-override-with-different-visibility + ` check. + + Finds virtual function overrides with different visibility than the function + in the base class. + New check aliases ^^^^^^^^^^^^^^^^^ @@ -163,6 +173,10 @@ Changes in existing checks an additional matcher that generalizes the copy-and-swap idiom pattern detection. +- Improved :doc:`cppcoreguidelines-prefer-member-initializer + ` check to + avoid false positives on inherited members in class templates. + - Improved :doc:`misc-header-include-cycle ` check performance. 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/list.rst b/clang-tools-extra/docs/clang-tidy/checks/list.rst index b6444eb3c9ae..b0961265345c 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/list.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/list.rst @@ -271,6 +271,7 @@ Clang-Tidy Checks :doc:`misc-no-recursion `, :doc:`misc-non-copyable-objects `, :doc:`misc-non-private-member-variables-in-classes `, + :doc:`misc-override-with-different-visibility `, :doc:`misc-redundant-expression `, "Yes" :doc:`misc-static-assert `, "Yes" :doc:`misc-throw-by-value-catch-by-reference `, diff --git a/clang-tools-extra/docs/clang-tidy/checks/misc/override-with-different-visibility.rst b/clang-tools-extra/docs/clang-tidy/checks/misc/override-with-different-visibility.rst new file mode 100644 index 000000000000..310bfe2b0108 --- /dev/null +++ b/clang-tools-extra/docs/clang-tidy/checks/misc/override-with-different-visibility.rst @@ -0,0 +1,87 @@ +.. title:: clang-tidy - misc-override-with-different-visibility + +misc-override-with-different-visibility +======================================= + +Finds virtual function overrides with different visibility than the function +in the base class. This includes for example if a virtual function declared as +``private`` is overridden and declared as ``public`` in a subclass. The detected +change is the modification of visibility resulting from keywords ``public``, +``protected``, ``private`` at overridden virtual functions. The check applies to +any normal virtual function and optionally to destructors or operators. Use of +the ``using`` keyword is not considered as visibility change by this check. + + +.. 
code-block:: c++ + + class A { + public: + virtual void f_pub(); + private: + virtual void f_priv(); + }; + + class B: public A { + public: + void f_priv(); // warning: changed visibility from private to public + private: + void f_pub(); // warning: changed visibility from public to private + }; + + class C: private A { + // no warning: f_pub becomes private in this case but this is from the + // private inheritance + }; + + class D: private A { + public: + void f_pub(); // warning: changed visibility from private to public + // 'f_pub' would have private access but is forced to be + // public + }; + +If the visibility is changed in this way, it can indicate bad design or a +programming error. + +If a virtual function is private in a subclass but public in the base class, it +can still be accessed from a pointer to the subclass if the pointer is converted +to the base type. Probably private inheritance can be used instead. + +A protected virtual function that is made public in a subclass may have valid +use cases but a similar (not exactly the same) effect can be achieved with the +``using`` keyword. + +Options +------- + +.. option:: DisallowedVisibilityChange + + Controls what kind of change to the visibility will be detected by the check. + Possible values are `any`, `widening`, `narrowing`. For example the + `widening` option will produce a warning only if the visibility is changed + from more restrictive (``private``) to less restrictive (``public``). + Default value is `any`. + +.. option:: CheckDestructors + + If `true`, the check does apply to destructors too. Otherwise destructors + are ignored by the check. + Default value is `false`. + +.. option:: CheckOperators + + If `true`, the check does apply to overloaded C++ operators (as virtual + member functions) too. This includes other special member functions (like + conversions) too. This option is probably useful only in rare cases because + operators and conversions are not often virtual functions. 
+ Default value is `false`. + +.. option:: IgnoredFunctions + + This option can be used to ignore the check at specific functions. + To configure this option, a semicolon-separated list of function names + should be provided. The list can contain regular expressions, in this way it + is possible to select all functions of a specific class (like `MyClass::.*`) + or a specific function of any class (like `my_function` or + `::.*::my_function`). The function names are matched at the base class. + Default value is empty string. diff --git a/clang-tools-extra/docs/clang-tidy/index.rst b/clang-tools-extra/docs/clang-tidy/index.rst index e8ce903fcb07..e0cf5ef720b0 100644 --- a/clang-tools-extra/docs/clang-tidy/index.rst +++ b/clang-tools-extra/docs/clang-tidy/index.rst @@ -111,6 +111,13 @@ Diagnostics which have a corresponding warning option, are named ``-Wliteral-conversion`` will be reported with check name ``clang-diagnostic-literal-conversion``. +Clang compiler errors (such as syntax errors, semantic errors, or other failures +that prevent Clang from compiling the code) are reported with the check name +``clang-diagnostic-error``. These represent fundamental compilation failures that +must be fixed before :program:`clang-tidy` can perform its analysis. Unlike other +diagnostics, ``clang-diagnostic-error`` cannot be disabled, as :program:`clang-tidy` +requires valid code to function. + The ``-fix`` flag instructs :program:`clang-tidy` to fix found errors if supported by corresponding checks. 
diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/prefer-member-initializer.cpp b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/prefer-member-initializer.cpp index 7d6164946fc3..e8d7db17f3c6 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/prefer-member-initializer.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/prefer-member-initializer.cpp @@ -650,3 +650,16 @@ struct InitFromBindingDecl { } }; } // namespace GH82970 + +struct A { + int m; +}; + +struct B : A { + B() { m = 0; } +}; + +template +struct C : A { + C() { m = 0; } +}; diff --git a/clang-tools-extra/test/clang-tidy/checkers/misc/Inputs/override-with-different-visibility/test-system-header.h b/clang-tools-extra/test/clang-tidy/checkers/misc/Inputs/override-with-different-visibility/test-system-header.h new file mode 100644 index 000000000000..e64e1924a170 --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/misc/Inputs/override-with-different-visibility/test-system-header.h @@ -0,0 +1,14 @@ +#pragma clang system_header + +namespace sys { + +struct Base { + virtual void publicF(); +}; + +struct Derived: public Base { +private: + void publicF() override; +}; + +} diff --git a/clang-tools-extra/test/clang-tidy/checkers/misc/override-with-different-visibility-ignore.cpp b/clang-tools-extra/test/clang-tidy/checkers/misc/override-with-different-visibility-ignore.cpp new file mode 100644 index 000000000000..934cfb7bc708 --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/misc/override-with-different-visibility-ignore.cpp @@ -0,0 +1,60 @@ +// RUN: %check_clang_tidy %s misc-override-with-different-visibility %t -- \ +// RUN: -config="{CheckOptions: {misc-override-with-different-visibility.IgnoredFunctions: 'IgnoreAlways::.*;::a::IgnoreSelected::.*;IgnoreFunctions::f1;ignored_f'}}" + +class IgnoreAlways { + virtual void f(); +}; + +class IgnoreSelected { + virtual void f(); +}; + +namespace a { +class 
IgnoreAlways { + virtual void f(); +}; +class IgnoreSelected { + virtual void f(); +}; +} + +namespace ignore_always { +class Test1: public IgnoreAlways { +public: + void f(); + void ignored_f(int); +}; +class Test2: public a::IgnoreAlways { +public: + void f(); +}; +} + +namespace ignore_selected { +class Test1: public IgnoreSelected { +public: + void f(); + // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: visibility of function 'f' + // CHECK-MESSAGES: :9:16: note: function declared here + void ignored_f(int); +}; +class Test2: public a::IgnoreSelected { +public: + void f(); +}; +} + +class IgnoreFunctions { + virtual void f1(); + virtual void f2(); + virtual void ignored_f(); +}; + +class IgnoreFunctionsTest: public IgnoreFunctions { +public: + void f1(); + void f2(); + // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: visibility of function 'f2' + // CHECK-MESSAGES: :[[@LINE-9]]:16: note: function declared here + void ignored_f(); +}; diff --git a/clang-tools-extra/test/clang-tidy/checkers/misc/override-with-different-visibility-options.cpp b/clang-tools-extra/test/clang-tidy/checkers/misc/override-with-different-visibility-options.cpp new file mode 100644 index 000000000000..0a363ddee380 --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/misc/override-with-different-visibility-options.cpp @@ -0,0 +1,75 @@ +// RUN: %check_clang_tidy -check-suffixes=DTORS,WIDENING,NARROWING %s misc-override-with-different-visibility %t -- \ +// RUN: -config="{CheckOptions: {misc-override-with-different-visibility.CheckDestructors: true}}" + +// RUN: %check_clang_tidy -check-suffixes=OPS,WIDENING,NARROWING %s misc-override-with-different-visibility %t -- \ +// RUN: -config="{CheckOptions: {misc-override-with-different-visibility.CheckOperators: true}}" + +// RUN: %check_clang_tidy -check-suffixes=WIDENING %s misc-override-with-different-visibility %t -- \ +// RUN: -config="{CheckOptions: {misc-override-with-different-visibility.DisallowedVisibilityChange: 'widening'}}" + +// 
RUN: %check_clang_tidy -check-suffixes=NARROWING %s misc-override-with-different-visibility %t -- \ +// RUN: -config="{CheckOptions: {misc-override-with-different-visibility.DisallowedVisibilityChange: 'narrowing'}}" + +namespace test_change { + +class A { +protected: + virtual void f1(); + virtual void f2(); +}; + +class B: public A { +public: + void f1(); + // CHECK-MESSAGES-WIDENING: :[[@LINE-1]]:8: warning: visibility of function 'f1' + // CHECK-MESSAGES-WIDENING: :[[@LINE-8]]:16: note: function declared here +private: + void f2(); + // CHECK-MESSAGES-NARROWING: :[[@LINE-1]]:8: warning: visibility of function 'f2' + // CHECK-MESSAGES-NARROWING: :[[@LINE-11]]:16: note: function declared here +}; + +} + +namespace test_destructor { + +class A { +public: + virtual ~A(); +}; + +class B: public A { +protected: + ~B(); + // CHECK-MESSAGES-DTORS: :[[@LINE-1]]:3: warning: visibility of function '~B' + // CHECK-MESSAGES-DTORS: :[[@LINE-7]]:11: note: function declared here +}; + +} + +namespace test_operator { + +class A { + virtual A& operator=(const A&); + virtual A& operator++(); + virtual int operator()(int); + virtual operator double() const; +}; + +class B: public A { +protected: + A& operator=(const A&); + // CHECK-MESSAGES-OPS: :[[@LINE-1]]:6: warning: visibility of function 'operator=' + // CHECK-MESSAGES-OPS: :[[@LINE-10]]:14: note: function declared here + A& operator++(); + // CHECK-MESSAGES-OPS: :[[@LINE-1]]:6: warning: visibility of function 'operator++' + // CHECK-MESSAGES-OPS: :[[@LINE-12]]:14: note: function declared here + int operator()(int); + // CHECK-MESSAGES-OPS: :[[@LINE-1]]:7: warning: visibility of function 'operator()' + // CHECK-MESSAGES-OPS: :[[@LINE-14]]:15: note: function declared here + operator double() const; + // CHECK-MESSAGES-OPS: :[[@LINE-1]]:3: warning: visibility of function 'operator double' + // CHECK-MESSAGES-OPS: :[[@LINE-16]]:11: note: function declared here +}; + +} diff --git 
a/clang-tools-extra/test/clang-tidy/checkers/misc/override-with-different-visibility.cpp b/clang-tools-extra/test/clang-tidy/checkers/misc/override-with-different-visibility.cpp new file mode 100644 index 000000000000..fd541a44dc25 --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/misc/override-with-different-visibility.cpp @@ -0,0 +1,289 @@ +// RUN: %check_clang_tidy %s misc-override-with-different-visibility %t -- -config="{CheckOptions: {misc-override-with-different-visibility.CheckDestructors: true,misc-override-with-different-visibility.CheckOperators: true}}" -- -I %S/Inputs/override-with-different-visibility +#include +class A { +public: + virtual void pub_foo1() {} + virtual void pub_foo2() {} + virtual void pub_foo3() {} +protected: + virtual void prot_foo1(); + virtual void prot_foo2(); + virtual void prot_foo3(); +private: + virtual void priv_foo1() {} + virtual void priv_foo2() {} + virtual void priv_foo3() {} +}; + +void A::prot_foo1() {} +void A::prot_foo2() {} +void A::prot_foo3() {} + +namespace test1 { + +class B: public A { +public: + void pub_foo1() override {} + void prot_foo1() override {} + // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: visibility of function 'prot_foo1' is changed from protected in class 'A' to public [misc-override-with-different-visibility] + // CHECK-MESSAGES: :9:16: note: function declared here as protected + void priv_foo1() override {} + // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: visibility of function 'priv_foo1' is changed from private in class 'A' to public [misc-override-with-different-visibility] + // CHECK-MESSAGES: :13:16: note: function declared here as private +protected: + void pub_foo2() override {} + // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: visibility of function 'pub_foo2' is changed from public in class 'A' to protected [misc-override-with-different-visibility] + // CHECK-MESSAGES: :6:16: note: function declared here as public + void prot_foo2() override {} + void priv_foo2() override {} + // 
CHECK-MESSAGES: :[[@LINE-1]]:8: warning: visibility of function 'priv_foo2' is changed from private in class 'A' to protected [misc-override-with-different-visibility] + // CHECK-MESSAGES: :14:16: note: function declared here as private +private: + void pub_foo3() override {} + // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: visibility of function 'pub_foo3' is changed from public in class 'A' to private [misc-override-with-different-visibility] + // CHECK-MESSAGES: :7:16: note: function declared here as public + void prot_foo3() override {} + // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: visibility of function 'prot_foo3' is changed from protected in class 'A' to private [misc-override-with-different-visibility] + // CHECK-MESSAGES: :11:16: note: function declared here as protected + void priv_foo3() override {} +}; + +class C: public B { +public: + void pub_foo1() override; +protected: + void prot_foo1() override; + // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: visibility of function 'prot_foo1' is changed from public in class 'B' to protected [misc-override-with-different-visibility] + // CHECK-MESSAGES: :27:8: note: function declared here as public +private: + void priv_foo1() override; + // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: visibility of function 'priv_foo1' is changed from public in class 'B' to private [misc-override-with-different-visibility] + // CHECK-MESSAGES: :30:8: note: function declared here as public +}; + +void C::prot_foo1() {} +void C::priv_foo1() {} + +} + +namespace test2 { + +class B: public A { +public: + void pub_foo1() override; +protected: + void prot_foo1() override; +private: + void priv_foo1() override; +}; + +class C: public B { +public: + void pub_foo1() override; + void prot_foo1() override; + // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: visibility of function 'prot_foo1' is changed from protected in class 'B' to public + // CHECK-MESSAGES: :75:8: note: function declared here as protected + void priv_foo1() override; + // 
CHECK-MESSAGES: :[[@LINE-1]]:8: warning: visibility of function 'priv_foo1' is changed from private in class 'B' to public + // CHECK-MESSAGES: :77:8: note: function declared here as private + + void pub_foo2() override; + void prot_foo2() override; + // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: visibility of function 'prot_foo2' is changed from protected in class 'A' to public + // CHECK-MESSAGES: :10:16: note: function declared here as protected + void priv_foo2() override; + // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: visibility of function 'priv_foo2' is changed from private in class 'A' to public + // CHECK-MESSAGES: :14:16: note: function declared here as private +}; + +} + +namespace test3 { + +class B: private A { +public: + void pub_foo1() override; + // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: visibility of function 'pub_foo1' is changed from private (through private inheritance of class 'A') to public + // CHECK-MESSAGES: :103:10: note: 'A' is inherited as private here + // CHECK-MESSAGES: :5:16: note: function declared here as public +protected: + void prot_foo1() override; + // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: visibility of function 'prot_foo1' is changed from private (through private inheritance of class 'A') to protected + // CHECK-MESSAGES: :103:10: note: 'A' is inherited as private here + // CHECK-MESSAGES: :9:16: note: function declared here as protected +private: + void priv_foo1() override; + +public: + void prot_foo2() override; + // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: visibility of function 'prot_foo2' is changed from private (through private inheritance of class 'A') to public + // CHECK-MESSAGES: :103:10: note: 'A' is inherited as private here + // CHECK-MESSAGES: :10:16: note: function declared here as protected + void priv_foo2() override; + // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: visibility of function 'priv_foo2' is changed from private in class 'A' to public + // CHECK-MESSAGES: :14:16: note: function declared here as 
private + +private: + void pub_foo3() override; + void prot_foo3() override; +}; + +class C: private A { +}; + +class D: public C { +public: + void pub_foo1() override; + // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: visibility of function 'pub_foo1' is changed from private (through private inheritance of class 'A') to public + // CHECK-MESSAGES: :131:10: note: 'A' is inherited as private here + // CHECK-MESSAGES: :5:16: note: function declared here as public +}; + + +} + +namespace test4 { + +struct Base1 { +public: + virtual void foo1(); +private: + virtual void foo2(); +}; + +struct Base2 { +public: + virtual void foo2(); +private: + virtual void foo1(); +}; + +struct A : public Base1, public Base2 { +protected: + void foo1() override; + // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: visibility of function 'foo1' is changed from private in class 'Base2' to protected + // CHECK-MESSAGES: :158:16: note: function declared here as private + // CHECK-MESSAGES: :[[@LINE-3]]:8: warning: visibility of function 'foo1' is changed from public in class 'Base1' to protected + // CHECK-MESSAGES: :149:16: note: function declared here as public +private: + void foo2() override; + // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: visibility of function 'foo2' is changed from public in class 'Base2' to private + // CHECK-MESSAGES: :156:16: note: function declared here as public +}; + +} + +namespace test5 { + +struct B1: virtual public A {}; +struct B2: virtual private A {}; +struct B: public B1, public B2 { +public: + void pub_foo1() override; + // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: visibility of function 'pub_foo1' is changed from private (through private inheritance of class 'A') to public + // CHECK-MESSAGES: :179:12: note: 'A' is inherited as private here + // CHECK-MESSAGES: :5:16: note: function declared here as public +}; + +} + +namespace test_using { + +class A { +private: + A(int); +protected: + virtual void f(); +}; + +class B: public A { +public: + using A::A; + using 
A::f; +}; + +} + +namespace test_template { + +template +class A { +protected: + virtual T foo(); +}; + +template +class B: public A { +private: + T foo() override; + // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: visibility of function 'foo' is changed from protected in class 'A' to private + // CHECK-MESSAGES: :[[@LINE-8]]:13: note: function declared here as protected +}; + +template +class C: private A { +public: + T foo() override; + // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: visibility of function 'foo' is changed from private (through private inheritance of class 'A') to public + // CHECK-MESSAGES: :[[@LINE-4]]:10: note: 'A' is inherited as private here + // CHECK-MESSAGES: :[[@LINE-17]]:13: note: function declared here as protected +}; + +B fB() { + return B{}; +} + +C fC() { + return C{}; +} + +} + +namespace test_system_header { + +struct SysDerived: public sys::Base { +private: + void publicF(); + // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: visibility of function 'publicF' is changed from public in class 'Base' to private +}; + +} + +namespace test_destructor { + +class A { +public: + virtual ~A(); +}; + +class B: public A { +protected: + ~B(); + // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: visibility of function '~B' + // CHECK-MESSAGES: :[[@LINE-7]]:11: note: function declared here +}; + +} + +namespace test_operator { + +class A { + virtual int operator()(int); + virtual A& operator++(); + virtual operator double() const; +}; + +class B: public A { +protected: + int operator()(int); + // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: visibility of function 'operator()' + // CHECK-MESSAGES: :[[@LINE-9]]:15: note: function declared here + A& operator++(); + // CHECK-MESSAGES: :[[@LINE-1]]:6: warning: visibility of function 'operator++' + // CHECK-MESSAGES: :[[@LINE-11]]:14: note: function declared here + operator double() const; + // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: visibility of function 'operator double' + // CHECK-MESSAGES: :[[@LINE-13]]:11: note: 
function declared here +}; + +} diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/type-traits-GH153649.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/type-traits-GH153649.cpp new file mode 100644 index 000000000000..142eb5847ae1 --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/type-traits-GH153649.cpp @@ -0,0 +1,15 @@ +// RUN: %check_clang_tidy -std=c++20 %s modernize-type-traits %t + +namespace std { +template struct tuple_size { + static const int value = 1; +}; +template struct tuple_element { + using type = int; +}; +} + +struct A {}; +template int get(const A&); + +auto [a] = A(); diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/file-filter.cpp b/clang-tools-extra/test/clang-tidy/infrastructure/file-filter.cpp index 448ef9ddf166..d9ec1049963b 100644 --- a/clang-tools-extra/test/clang-tidy/infrastructure/file-filter.cpp +++ b/clang-tools-extra/test/clang-tidy/infrastructure/file-filter.cpp @@ -66,19 +66,14 @@ class A { A(int); }; // CHECK4-NOT: warning: // CHECK4-QUIET-NOT: warning: -// CHECK: Suppressed 3 warnings (3 in non-user code) // CHECK: Use -header-filter=.* to display errors from all non-system headers. 
// CHECK-QUIET-NOT: Suppressed -// CHECK2: Suppressed 1 warnings (1 in non-user code) -// CHECK2: Use -header-filter=.* {{.*}} // CHECK2-QUIET-NOT: Suppressed -// CHECK3: Suppressed 2 warnings (2 in non-user code) // CHECK3: Use -header-filter=.* {{.*}} // CHECK3-QUIET-NOT: Suppressed // CHECK4-NOT: Suppressed {{.*}} warnings // CHECK4-NOT: Use -header-filter=.* {{.*}} // CHECK4-QUIET-NOT: Suppressed -// CHECK6: Suppressed 2 warnings (2 in non-user code) // CHECK6: Use -header-filter=.* {{.*}} int x = 123; diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/system-headers.cpp b/clang-tools-extra/test/clang-tidy/infrastructure/system-headers.cpp index 9fa990b6aac8..a25480e9aa39 100644 --- a/clang-tools-extra/test/clang-tidy/infrastructure/system-headers.cpp +++ b/clang-tools-extra/test/clang-tidy/infrastructure/system-headers.cpp @@ -11,9 +11,9 @@ // RUN: clang-tidy -help | FileCheck -check-prefix=CHECK-OPT-PRESENT %s // RUN: clang-tidy -checks='-*,google-explicit-constructor' -header-filter='.*' -system-headers=true %s -- -isystem %S/Inputs/system-headers 2>&1 | FileCheck -check-prefix=CHECK-SYSTEM-HEADERS %s -// RUN: clang-tidy -checks='-*,google-explicit-constructor' -header-filter='.*' -system-headers=false %s -- -isystem %S/Inputs/system-headers 2>&1 | FileCheck -check-prefix=CHECK-NO-SYSTEM-HEADERS %s +// RUN: clang-tidy -checks='-*,google-explicit-constructor' -header-filter='.*' -system-headers=false %s -- -isystem %S/Inputs/system-headers 2>&1 | FileCheck -check-prefix=CHECK-NO-SYSTEM-HEADERS --allow-empty %s // RUN: clang-tidy -checks='-*,google-explicit-constructor' -header-filter='.*' -config='SystemHeaders: true' %s -- -isystem %S/Inputs/system-headers 2>&1 | FileCheck -check-prefix=CHECK-SYSTEM-HEADERS %s -// RUN: clang-tidy -checks='-*,google-explicit-constructor' -header-filter='.*' -config='SystemHeaders: false' %s -- -isystem %S/Inputs/system-headers 2>&1 | FileCheck -check-prefix=CHECK-NO-SYSTEM-HEADERS %s +// RUN: clang-tidy 
-checks='-*,google-explicit-constructor' -header-filter='.*' -config='SystemHeaders: false' %s -- -isystem %S/Inputs/system-headers 2>&1 | FileCheck -check-prefix=CHECK-NO-SYSTEM-HEADERS --allow-empty %s #include // CHECK-SYSTEM-HEADERS: system_header.h:1:13: warning: single-argument constructors must be marked explicit diff --git a/clang/docs/ClangFormatStyleOptions.rst b/clang/docs/ClangFormatStyleOptions.rst index 02986a94a656..3ac9e3795cae 100644 --- a/clang/docs/ClangFormatStyleOptions.rst +++ b/clang/docs/ClangFormatStyleOptions.rst @@ -126,6 +126,9 @@ clang-format is turned off or back on. // clang-format on void formatted_code_again; +In addition, the ``OneLineFormatOffRegex`` option gives you a concise way to +disable formatting for all of the lines that match the regular expression. + Configuring Style in Code ========================= @@ -6483,13 +6486,51 @@ the configuration (without a prefix: ``Auto``). .. _SpaceInEmptyBlock: **SpaceInEmptyBlock** (``Boolean``) :versionbadge:`clang-format 10` :ref:`¶ ` - If ``true``, spaces will be inserted into ``{}``. + This option is **deprecated**. See ``Block`` of ``SpaceInEmptyBraces``. + +.. _SpaceInEmptyBraces: + +**SpaceInEmptyBraces** (``SpaceInEmptyBracesStyle``) :versionbadge:`clang-format 22` :ref:`¶ ` + Specifies when to insert a space in empty braces. + + .. note:: + + This option doesn't apply to initializer braces if + ``Cpp11BracedListStyle`` is set to ``true``. + + Possible values: + + * ``SIEB_Always`` (in configuration: ``Always``) + Always insert a space in empty braces. + + .. code-block:: c++ + + void f() { } + class Unit { }; + auto a = [] { }; + int x{ }; + + * ``SIEB_Block`` (in configuration: ``Block``) + Only insert a space in empty blocks. + + .. code-block:: c++ + + void f() { } + class Unit { }; + auto a = [] { }; + int x{}; + + * ``SIEB_Never`` (in configuration: ``Never``) + Never insert a space in empty braces. + + .. 
code-block:: c++ + + void f() {} + class Unit {}; + auto a = [] {}; + int x{}; - .. code-block:: c++ - true: false: - void f() { } vs. void f() {} - while (true) { } while (true) {} .. _SpaceInEmptyParentheses: diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 604b4c3f714b..9ea9fcdf889d 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -193,6 +193,8 @@ Bug Fixes in This Version targets that treat ``_Float16``/``__fp16`` as native scalar types. Previously the warning was silently lost because the operands differed only by an implicit cast chain. (#GH149967). +- Fixed a crash with incompatible pointer to integer conversions in designated + initializers involving string literals. (#GH154046) Bug Fixes to Compiler Builtins ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -230,6 +232,7 @@ Bug Fixes to AST Handling - Fix incorrect name qualifiers applied to alias CTAD. (#GH136624) - Fixed ElaboratedTypes appearing within NestedNameSpecifier, which was not a legal representation. This is fixed because ElaboratedTypes don't exist anymore. (#GH43179) (#GH68670) (#GH92757) +- Fix unrecognized html tag causing undesirable comment lexing (#GH152944) - Fix comment lexing of special command names (#GH152943) Miscellaneous Bug Fixes @@ -308,8 +311,12 @@ AST Matchers - Ensure ``hasBitWidth`` doesn't crash on bit widths that are dependent on template parameters. +- Add a boolean member ``IgnoreSystemHeaders`` to ``MatchFinderOptions``. This + allows it to ignore nodes in system headers when traversing the AST. + clang-format ------------ +- Add ``SpaceInEmptyBraces`` option and set it to ``Always`` for WebKit style. libclang -------- diff --git a/clang/docs/SourceBasedCodeCoverage.rst b/clang/docs/SourceBasedCodeCoverage.rst index 3e8642479a56..2f114070a8fb 100644 --- a/clang/docs/SourceBasedCodeCoverage.rst +++ b/clang/docs/SourceBasedCodeCoverage.rst @@ -66,17 +66,17 @@ supported. 
Uninstrumented code simply won't be accounted for in reports. To compile code with Modified Condition/Decision Coverage (MC/DC) enabled, pass ``-fcoverage-mcdc`` in addition to the clang options specified above. -MC/DC is an advanced form of code coverage most applicable in the embedded +MC/DC is an advanced form of code coverage most applicable to the embedded space. Running the instrumented program ================================ -The next step is to run the instrumented program. When the program exits it +The next step is to run the instrumented program. When the program exits, it will write a **raw profile** to the path specified by the ``LLVM_PROFILE_FILE`` environment variable. If that variable does not exist, the profile is written to ``default.profraw`` in the current directory of the program. If -``LLVM_PROFILE_FILE`` contains a path to a non-existent directory, the missing +``LLVM_PROFILE_FILE`` specifies a path to a non-existent directory, the missing directory structure will be created. Additionally, the following special **pattern strings** are rewritten: @@ -97,7 +97,7 @@ directory structure will be created. Additionally, the following special * "%b" expands out to the binary ID (build ID). It can be used with "%Nm" to avoid binary signature collisions. To use it, the program should be compiled with the build ID linker option (``--build-id`` for GNU ld or LLD, - ``/build-id`` for lld-link on Windows). Linux, Windows and AIX are supported. + ``/build-id`` for lld-link on Windows). Linux, Windows, and AIX are supported. * "%c" expands out to nothing, but enables a mode in which profile counter updates are continuously synced to a file. This means that if the @@ -128,7 +128,7 @@ and set bias to the offset between the original and the new counter location, at which point every subsequent counter access will be to the new location, which allows updating profile directly akin to the continuous mode. 
-The advantage of this approach is that doesn't require any special OS support. +The advantage of this approach is that it doesn't require any special OS support. The disadvantage is the extra overhead due to additional instructions required for each counter access (overhead both in terms of binary size and performance) plus duplication of counters (i.e. one copy in the binary itself and another @@ -137,7 +137,7 @@ other platforms by passing the ``-runtime-counter-relocation`` option to the backend during compilation. For a program such as the `Lit `_ -testing tool which invokes other programs, it may be necessary to set +testing tool, which invokes other programs, it may be necessary to set ``LLVM_PROFILE_FILE`` for each invocation. The pattern strings "%p" or "%Nm" may help to avoid corruption due to concurrency. Note that "%p" is also a Lit token and needs to be escaped as "%%p". @@ -149,7 +149,7 @@ token and needs to be escaped as "%%p". Creating coverage reports ========================= -Raw profiles have to be **indexed** before they can be used to generate +Raw profiles must be **indexed** before they can be used to generate coverage reports. This is done using the "merge" tool in ``llvm-profdata`` (which can combine multiple raw profiles and index them at the same time): @@ -240,13 +240,13 @@ line-oriented report, try: TOTAL 13 0 100.00% 3 0 100.00% 13 0 100.00% 12 2 83.33% The ``llvm-cov`` tool supports specifying a custom demangler, writing out -reports in a directory structure, and generating html reports. For the full +reports in a directory structure, and generating HTML reports. For the full list of options, please refer to the `command guide `_. A few final notes: -* The ``-sparse`` flag is optional but can result in dramatically smaller +* The ``-sparse`` flag is optional but can produce dramatically smaller indexed profiles. This option should not be used if the indexed profile will be reused for PGO. 
@@ -255,7 +255,7 @@ A few final notes: information directly into an existing raw profile on disk. The details are out of scope. -* The ``llvm-profdata`` tool can be used to merge together multiple raw or +* The ``llvm-profdata`` tool can be used to merge multiple raw or indexed profiles. To combine profiling data from multiple runs of a program, try e.g: @@ -299,7 +299,7 @@ There are six statistics tracked in a coverage summary: source code that may each evaluate to either "true" or "false". These conditions may comprise larger boolean expressions linked by boolean logical operators. For example, "x = (y == 2) || (z < 10)" is a boolean expression - that is comprised of two individual conditions, each of which evaluates to + comprised of two individual conditions, each of which evaluates to either true or false, producing four total branch outcomes. * Modified Condition/Decision Coverage (MC/DC) is the percentage of individual @@ -316,7 +316,7 @@ There are six statistics tracked in a coverage summary: ``-show-mcdc-summary`` option as long as code was also compiled using the clang option ``-fcoverage-mcdc``. - * Boolean expressions that are only comprised of one condition (and therefore + * Boolean expressions comprised of only one condition (and therefore have no logical operators) are not included in MC/DC analysis and are trivially deducible using branch coverage. @@ -366,7 +366,7 @@ By default the compiler runtime uses a static initializer to determine the profile output path and to register a writer function. To collect profiles without using static initializers, do this manually: -* Export a ``int __llvm_profile_runtime`` symbol from each instrumented shared +* Export an ``int __llvm_profile_runtime`` symbol from each instrumented shared library and executable. When the linker finds a definition of this symbol, it knows to skip loading the object which contains the profiling runtime's static initializer. 
@@ -380,7 +380,7 @@ without using static initializers, do this manually: to ``__llvm_profile_write_file``. * Forward-declare ``int __llvm_profile_write_file(void)`` and call it to write - out a profile. This function returns 0 when it succeeds, and a non-zero value + out a profile. This function returns 0 on success, and a non-zero value otherwise. Calling this function multiple times appends profile data to an existing on-disk raw profile. @@ -418,7 +418,7 @@ Collecting coverage reports for the llvm project ================================================ To prepare a coverage report for llvm (and any of its sub-projects), add -``-DLLVM_BUILD_INSTRUMENTED_COVERAGE=On`` to the cmake configuration. Raw +``-DLLVM_BUILD_INSTRUMENTED_COVERAGE=On`` to the CMake configuration. Raw profiles will be written to ``$BUILD_DIR/profiles/``. To prepare an html report, run ``llvm/utils/prepare-code-coverage-artifact.py``. @@ -429,7 +429,7 @@ To specify an alternate directory for raw profiles, use Drawbacks and limitations ========================= -* Prior to version 2.26, the GNU binutils BFD linker is not able link programs +* Prior to version 2.26, the GNU binutils BFD linker cannot link programs compiled with ``-fcoverage-mapping`` in its ``--gc-sections`` mode. Possible workarounds include disabling ``--gc-sections``, upgrading to a newer version of BFD, or using the Gold linker. 
diff --git a/clang/include/clang/AST/CommentHTMLTags.td b/clang/include/clang/AST/CommentHTMLTags.td index a1ce8c6da96c..9b89bc0c811f 100644 --- a/clang/include/clang/AST/CommentHTMLTags.td +++ b/clang/include/clang/AST/CommentHTMLTags.td @@ -51,6 +51,11 @@ def Col : Tag<"col"> { let EndTagForbidden = 1; } def Tr : Tag<"tr"> { let EndTagOptional = 1; } def Th : Tag<"th"> { let EndTagOptional = 1; } def Td : Tag<"td"> { let EndTagOptional = 1; } +def Summary : Tag<"summary">; +def Details : Tag<"details">; +def Mark : Tag<"mark">; +def Figure : Tag<"figure">; +def FigCaption : Tag<"figcaption">; // Define a list of attributes that are not safe to pass through to HTML // output if the input is untrusted. diff --git a/clang/include/clang/ASTMatchers/ASTMatchFinder.h b/clang/include/clang/ASTMatchers/ASTMatchFinder.h index 73cbcf1f2502..2d36e8c4fae1 100644 --- a/clang/include/clang/ASTMatchers/ASTMatchFinder.h +++ b/clang/include/clang/ASTMatchers/ASTMatchFinder.h @@ -135,10 +135,15 @@ public: llvm::StringMap &Records; }; + MatchFinderOptions() {} + /// Enables per-check timers. /// /// It prints a report after match. std::optional CheckProfiling; + + /// Avoids matching declarations in system headers. 
+ bool IgnoreSystemHeaders{false}; }; MatchFinder(MatchFinderOptions Options = MatchFinderOptions()); diff --git a/clang/include/clang/Analysis/Analyses/LifetimeSafety.h b/clang/include/clang/Analysis/Analyses/LifetimeSafety.h index 1c00558d32f6..7e1bfc903083 100644 --- a/clang/include/clang/Analysis/Analyses/LifetimeSafety.h +++ b/clang/include/clang/Analysis/Analyses/LifetimeSafety.h @@ -19,14 +19,35 @@ #define LLVM_CLANG_ANALYSIS_ANALYSES_LIFETIMESAFETY_H #include "clang/Analysis/AnalysisDeclContext.h" #include "clang/Analysis/CFG.h" +#include "clang/Basic/SourceLocation.h" +#include "llvm/ADT/DenseMapInfo.h" +#include "llvm/ADT/ImmutableMap.h" #include "llvm/ADT/ImmutableSet.h" #include "llvm/ADT/StringMap.h" #include namespace clang::lifetimes { +/// Enum to track the confidence level of a potential error. +enum class Confidence { + None, + Maybe, // Reported as a potential error (-Wlifetime-safety-strict) + Definite // Reported as a definite error (-Wlifetime-safety-permissive) +}; + +class LifetimeSafetyReporter { +public: + LifetimeSafetyReporter() = default; + virtual ~LifetimeSafetyReporter() = default; + + virtual void reportUseAfterFree(const Expr *IssueExpr, const Expr *UseExpr, + SourceLocation FreeLoc, + Confidence Confidence) {} +}; + /// The main entry point for the analysis. -void runLifetimeSafetyAnalysis(AnalysisDeclContext &AC); +void runLifetimeSafetyAnalysis(AnalysisDeclContext &AC, + LifetimeSafetyReporter *Reporter); namespace internal { // Forward declarations of internal types. @@ -53,6 +74,7 @@ template struct ID { IDBuilder.AddInteger(Value); } }; + template inline llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, ID ID) { return OS << ID.Value; @@ -78,7 +100,8 @@ using ProgramPoint = const Fact *; /// encapsulates the various dataflow analyses. 
class LifetimeSafetyAnalysis { public: - LifetimeSafetyAnalysis(AnalysisDeclContext &AC); + LifetimeSafetyAnalysis(AnalysisDeclContext &AC, + LifetimeSafetyReporter *Reporter); ~LifetimeSafetyAnalysis(); void run(); @@ -87,7 +110,7 @@ public: LoanSet getLoansAtPoint(OriginID OID, ProgramPoint PP) const; /// Returns the set of loans that have expired at a specific program point. - LoanSet getExpiredLoansAtPoint(ProgramPoint PP) const; + std::vector getExpiredLoansAtPoint(ProgramPoint PP) const; /// Finds the OriginID for a given declaration. /// Returns a null optional if not found. @@ -110,6 +133,7 @@ public: private: AnalysisDeclContext &AC; + LifetimeSafetyReporter *Reporter; std::unique_ptr Factory; std::unique_ptr FactMgr; std::unique_ptr LoanPropagation; @@ -118,4 +142,25 @@ private: } // namespace internal } // namespace clang::lifetimes +namespace llvm { +template +struct DenseMapInfo> { + using ID = clang::lifetimes::internal::ID; + + static inline ID getEmptyKey() { + return {DenseMapInfo::getEmptyKey()}; + } + + static inline ID getTombstoneKey() { + return {DenseMapInfo::getTombstoneKey()}; + } + + static unsigned getHashValue(const ID &Val) { + return DenseMapInfo::getHashValue(Val.Value); + } + + static bool isEqual(const ID &LHS, const ID &RHS) { return LHS == RHS; } +}; +} // namespace llvm + #endif // LLVM_CLANG_ANALYSIS_ANALYSES_LIFETIMESAFETY_H diff --git a/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysisContext.h b/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysisContext.h index 5be4a1145f40..11042e865c4e 100644 --- a/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysisContext.h +++ b/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysisContext.h @@ -42,6 +42,18 @@ struct ContextSensitiveOptions { unsigned Depth = 2; }; +/// A simple representation of essential elements of the logical context used in +/// environments. Designed for import/export for applications requiring +/// serialization support. 
+struct SimpleLogicalContext { + // Global invariant that applies for all definitions in the context. + const Formula *Invariant; + // Flow-condition tokens in the context. + llvm::DenseMap TokenDefs; + // Dependencies between flow-condition definitions. + llvm::DenseMap> TokenDeps; +}; + /// Owns objects that encompass the state of a program and stores context that /// is used during dataflow analysis. class DataflowAnalysisContext { @@ -140,6 +152,15 @@ public: /// Adds `Constraint` to the flow condition identified by `Token`. void addFlowConditionConstraint(Atom Token, const Formula &Constraint); + /// Adds `Deps` to the dependencies of the flow condition identified by + /// `Token`. Intended for use in deserializing contexts. The formula alone + /// doesn't have enough information to indicate its deps. + void addFlowConditionDeps(Atom Token, const llvm::DenseSet &Deps) { + // Avoid creating an entry for `Token` with an empty set. + if (!Deps.empty()) + FlowConditionDeps[Token].insert(Deps.begin(), Deps.end()); + } + /// Creates a new flow condition with the same constraints as the flow /// condition identified by `Token` and returns its token. Atom forkFlowCondition(Atom Token); @@ -207,6 +228,14 @@ public: return {}; } + /// Export the logical-context portions of `AC`, limited to the given target + /// flow-condition tokens. + SimpleLogicalContext + exportLogicalContext(llvm::DenseSet TargetTokens) const; + + /// Initializes this context's "logical" components with `LC`. + void initLogicalContext(SimpleLogicalContext LC); + private: friend class Environment; @@ -228,6 +257,11 @@ private: DataflowAnalysisContext(Solver &S, std::unique_ptr &&OwnedSolver, Options Opts); + /// Computes the transitive closure of dependencies of (flow-condition) + /// `Tokens`. That is, the set of flow-condition tokens reachable from + /// `Tokens` in the dependency graph. 
+ llvm::DenseSet collectDependencies(llvm::DenseSet Tokens) const; + // Extends the set of modeled field declarations. void addModeledFields(const FieldSet &Fields); diff --git a/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h b/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h index 097ff2bdfe7a..076714462bb2 100644 --- a/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h +++ b/clang/include/clang/Analysis/FlowSensitive/DataflowEnvironment.h @@ -157,10 +157,18 @@ public: }; /// Creates an environment that uses `DACtx` to store objects that encompass - /// the state of a program. + /// the state of a program. `FlowConditionToken` sets the flow condition + /// associated with the environment. Generally, new environments should be + /// initialized with a fresh token, by using one of the other + /// constructors. This constructor is for specialized use, including + /// deserialization and delegation from other constructors. + Environment(DataflowAnalysisContext &DACtx, Atom FlowConditionToken) + : DACtx(&DACtx), FlowConditionToken(FlowConditionToken) {} + + /// Creates an environment that uses `DACtx` to store objects that encompass + /// the state of a program. Populates a fresh atom as flow condition token. explicit Environment(DataflowAnalysisContext &DACtx) - : DACtx(&DACtx), - FlowConditionToken(DACtx.arena().makeFlowConditionToken()) {} + : Environment(DACtx, DACtx.arena().makeFlowConditionToken()) {} /// Creates an environment that uses `DACtx` to store objects that encompass /// the state of a program, with `S` as the statement to analyze. 
diff --git a/clang/include/clang/Analysis/FlowSensitive/Formula.h b/clang/include/clang/Analysis/FlowSensitive/Formula.h index 0e6352403a83..3959bc98619b 100644 --- a/clang/include/clang/Analysis/FlowSensitive/Formula.h +++ b/clang/include/clang/Analysis/FlowSensitive/Formula.h @@ -85,21 +85,17 @@ public: } using AtomNames = llvm::DenseMap; - // Produce a stable human-readable representation of this formula. - // For example: (V3 | !(V1 & V2)) - // If AtomNames is provided, these override the default V0, V1... names. + /// Produces a stable human-readable representation of this formula. + /// For example: (V3 | !(V1 & V2)) + /// If AtomNames is provided, these override the default V0, V1... names. void print(llvm::raw_ostream &OS, const AtomNames * = nullptr) const; - // Allocate Formulas using Arena rather than calling this function directly. + /// Allocates Formulas using Arena rather than calling this function directly. static const Formula &create(llvm::BumpPtrAllocator &Alloc, Kind K, ArrayRef Operands, unsigned Value = 0); -private: - Formula() = default; - Formula(const Formula &) = delete; - Formula &operator=(const Formula &) = delete; - + /// Count of operands (sub-formulas) associated with Formulas of kind `K`. static unsigned numOperands(Kind K) { switch (K) { case AtomRef: @@ -116,6 +112,11 @@ private: llvm_unreachable("Unhandled Formula::Kind enum"); } +private: + Formula() = default; + Formula(const Formula &) = delete; + Formula &operator=(const Formula &) = delete; + Kind FormulaKind; // Some kinds of formula have scalar values, e.g. AtomRef's atom number. 
unsigned Value; diff --git a/clang/include/clang/Analysis/FlowSensitive/FormulaSerialization.h b/clang/include/clang/Analysis/FlowSensitive/FormulaSerialization.h new file mode 100644 index 000000000000..119f93e5d73f --- /dev/null +++ b/clang/include/clang/Analysis/FlowSensitive/FormulaSerialization.h @@ -0,0 +1,40 @@ +//=== FormulaSerialization.h - Formula De/Serialization support -*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_ANALYSIS_FLOWSENSITIVE_FORMULA_SERIALIZATION_H +#define LLVM_CLANG_ANALYSIS_FLOWSENSITIVE_FORMULA_SERIALIZATION_H + +#include "clang/Analysis/FlowSensitive/Arena.h" +#include "clang/Analysis/FlowSensitive/Formula.h" +#include "clang/Basic/LLVM.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseMapInfo.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/raw_ostream.h" +#include +#include + +namespace clang::dataflow { + +/// Prints `F` to `OS` in a compact format, optimized for easy parsing +/// (deserialization) rather than human use. +void serializeFormula(const Formula &F, llvm::raw_ostream &OS); + +/// Parses `Str` to build a serialized Formula. +/// @returns error on parse failure or if parsing does not fully consume `Str`. +/// @param A used to construct the formula components. +/// @param AtomMap maps serialized Atom identifiers (unsigned ints) to Atoms. +/// This map is provided by the caller to enable consistency across +/// multiple formulas in a single file. 
+llvm::Expected +parseFormula(llvm::StringRef Str, Arena &A, + llvm::DenseMap &AtomMap); + +} // namespace clang::dataflow +#endif diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td index 4262bdaa7cdd..cc1da937455a 100644 --- a/clang/include/clang/Basic/BuiltinsX86.td +++ b/clang/include/clang/Basic/BuiltinsX86.td @@ -268,7 +268,6 @@ let Header = "emmintrin.h", Attributes = [NoThrow, RequireDeclaration] in { } let Features = "sse2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { - def pmuludq128 : X86Builtin<"_Vector<2, long long int>(_Vector<4, int>, _Vector<4, int>)">; def psraw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">; def psrad128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">; def psrlw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">; @@ -290,6 +289,10 @@ let Features = "sse2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] i def psrldqi128_byteshift : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Constant int)">; } +let Features = "sse2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { + def pmuludq128 : X86Builtin<"_Vector<2, long long int>(_Vector<4, int>, _Vector<4, int>)">; +} + let Features = "sse3", Attributes = [NoThrow] in { def monitor : X86Builtin<"void(void const *, unsigned int, unsigned int)">; def mwait : X86Builtin<"void(unsigned int, unsigned int)">; @@ -312,7 +315,6 @@ let Features = "sse4.1", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] def blendvpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>)">; def blendvps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>)">; def packusdw128 : X86Builtin<"_Vector<8, short>(_Vector<4, int>, _Vector<4, int>)">; - def pmuldq128 : X86Builtin<"_Vector<2, long long int>(_Vector<4, int>, _Vector<4, int>)">; def roundps : 
X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Constant int)">; def roundss : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int)">; def roundsd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int)">; @@ -329,6 +331,10 @@ let Features = "sse4.1", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] def vec_set_v4si : X86Builtin<"_Vector<4, int>(_Vector<4, int>, int, _Constant int)">; } +let Features = "sse4.1", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { + def pmuldq128 : X86Builtin<"_Vector<2, long long int>(_Vector<4, int>, _Vector<4, int>)">; +} + let Features = "sse4.2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { def pcmpistrm128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Constant char)">; def pcmpistri128 : X86Builtin<"int(_Vector<16, char>, _Vector<16, char>, _Constant char)">; @@ -580,9 +586,7 @@ let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] i def pmaddubsw256 : X86Builtin<"_Vector<16, short>(_Vector<32, char>, _Vector<32, char>)">; def pmaddwd256 : X86Builtin<"_Vector<8, int>(_Vector<16, short>, _Vector<16, short>)">; def pmovmskb256 : X86Builtin<"int(_Vector<32, char>)">; - def pmuldq256 : X86Builtin<"_Vector<4, long long int>(_Vector<8, int>, _Vector<8, int>)">; def pmulhrsw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">; - def pmuludq256 : X86Builtin<"_Vector<4, long long int>(_Vector<8, int>, _Vector<8, int>)">; def psadbw256 : X86Builtin<"_Vector<4, long long int>(_Vector<32, char>, _Vector<32, char>)">; def pshufb256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>)">; def pshufd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Constant int)">; @@ -620,6 +624,11 @@ let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] i def insert128i256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long 
int>, _Vector<2, long long int>, _Constant int)">; } +let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { + def pmuldq256 : X86Builtin<"_Vector<4, long long int>(_Vector<8, int>, _Vector<8, int>)">; + def pmuludq256 : X86Builtin<"_Vector<4, long long int>(_Vector<8, int>, _Vector<8, int>)">; +} + let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { def pmulhuw256 : X86Builtin<"_Vector<16, unsigned short>(_Vector<16, unsigned short>, _Vector<16, unsigned short>)">; def pmulhw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">; @@ -1078,6 +1087,9 @@ let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWi def cvtpd2ps512_mask : X86Builtin<"_Vector<8, float>(_Vector<8, double>, _Vector<8, float>, unsigned char, _Constant int)">; def vcvtps2ph512_mask : X86Builtin<"_Vector<16, short>(_Vector<16, float>, _Constant int, _Vector<16, short>, unsigned short)">; def vcvtph2ps512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, short>, _Vector<16, float>, unsigned short, _Constant int)">; +} + +let Features = "avx512f,evex512", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def pmuldq512 : X86Builtin<"_Vector<8, long long int>(_Vector<16, int>, _Vector<16, int>)">; def pmuludq512 : X86Builtin<"_Vector<8, long long int>(_Vector<16, int>, _Vector<16, int>)">; } @@ -4118,99 +4130,99 @@ let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVecto def vfcmulcph512_mask : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Vector<16, float>, unsigned short, _Constant int)">; } -let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { +let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { def selectb_128 : X86Builtin<"_Vector<16, char>(unsigned short, _Vector<16, char>, _Vector<16, char>)">; 
} -let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { +let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { def selectb_256 : X86Builtin<"_Vector<32, char>(unsigned int, _Vector<32, char>, _Vector<32, char>)">; } -let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { +let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def selectb_512 : X86Builtin<"_Vector<64, char>(unsigned long long int, _Vector<64, char>, _Vector<64, char>)">; } -let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { +let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { def selectw_128 : X86Builtin<"_Vector<8, short>(unsigned char, _Vector<8, short>, _Vector<8, short>)">; } -let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { +let Features = "avx512bw,avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { def selectw_256 : X86Builtin<"_Vector<16, short>(unsigned short, _Vector<16, short>, _Vector<16, short>)">; } -let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { +let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def selectw_512 : X86Builtin<"_Vector<32, short>(unsigned int, _Vector<32, short>, _Vector<32, short>)">; } -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { +let Features = "avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { def selectd_128 : X86Builtin<"_Vector<4, int>(unsigned char, _Vector<4, int>, _Vector<4, int>)">; } -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { +let Features = "avx512vl", Attributes = 
[NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { def selectd_256 : X86Builtin<"_Vector<8, int>(unsigned char, _Vector<8, int>, _Vector<8, int>)">; } -let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { +let Features = "avx512f,evex512", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def selectd_512 : X86Builtin<"_Vector<16, int>(unsigned short, _Vector<16, int>, _Vector<16, int>)">; } -let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { +let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { def selectph_128 : X86Builtin<"_Vector<8, _Float16>(unsigned char, _Vector<8, _Float16>, _Vector<8, _Float16>)">; } -let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { +let Features = "avx512fp16,avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { def selectph_256 : X86Builtin<"_Vector<16, _Float16>(unsigned short, _Vector<16, _Float16>, _Vector<16, _Float16>)">; } -let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { +let Features = "avx512fp16,evex512", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def selectph_512 : X86Builtin<"_Vector<32, _Float16>(unsigned int, _Vector<32, _Float16>, _Vector<32, _Float16>)">; } -let Features = "avx512bf16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { +let Features = "avx512bf16,avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { def selectpbf_128 : X86Builtin<"_Vector<8, __bf16>(unsigned char, _Vector<8, __bf16>, _Vector<8, __bf16>)">; } -let Features = "avx512bf16,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { +let Features = "avx512bf16,avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { def 
selectpbf_256 : X86Builtin<"_Vector<16, __bf16>(unsigned short, _Vector<16, __bf16>, _Vector<16, __bf16>)">; } -let Features = "avx512bf16,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { +let Features = "avx512bf16,evex512", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def selectpbf_512 : X86Builtin<"_Vector<32, __bf16>(unsigned int, _Vector<32, __bf16>, _Vector<32, __bf16>)">; } -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { +let Features = "avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { def selectq_128 : X86Builtin<"_Vector<2, long long int>(unsigned char, _Vector<2, long long int>, _Vector<2, long long int>)">; } -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { +let Features = "avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { def selectq_256 : X86Builtin<"_Vector<4, long long int>(unsigned char, _Vector<4, long long int>, _Vector<4, long long int>)">; } -let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { +let Features = "avx512f,evex512", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def selectq_512 : X86Builtin<"_Vector<8, long long int>(unsigned char, _Vector<8, long long int>, _Vector<8, long long int>)">; } -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { +let Features = "avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { def selectps_128 : X86Builtin<"_Vector<4, float>(unsigned char, _Vector<4, float>, _Vector<4, float>)">; } -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { +let Features = "avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { def selectps_256 : X86Builtin<"_Vector<8, float>(unsigned char, _Vector<8, float>, _Vector<8, float>)">; } 
-let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { +let Features = "avx512f,evex512", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def selectps_512 : X86Builtin<"_Vector<16, float>(unsigned short, _Vector<16, float>, _Vector<16, float>)">; } -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { +let Features = "avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { def selectpd_128 : X86Builtin<"_Vector<2, double>(unsigned char, _Vector<2, double>, _Vector<2, double>)">; } -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { +let Features = "avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { def selectpd_256 : X86Builtin<"_Vector<4, double>(unsigned char, _Vector<4, double>, _Vector<4, double>)">; } -let Features = "avx512f,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { +let Features = "avx512f,evex512", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def selectpd_512 : X86Builtin<"_Vector<8, double>(unsigned char, _Vector<8, double>, _Vector<8, double>)">; } diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td index ccb18aa37447..2edf4da43536 100644 --- a/clang/include/clang/Basic/DiagnosticGroups.td +++ b/clang/include/clang/Basic/DiagnosticGroups.td @@ -533,7 +533,14 @@ def Dangling : DiagGroup<"dangling", [DanglingAssignment, DanglingGsl, ReturnStackAddress]>; -def LifetimeSafety : DiagGroup<"experimental-lifetime-safety">; +def LifetimeSafetyPermissive : DiagGroup<"experimental-lifetime-safety-permissive">; +def LifetimeSafetyStrict : DiagGroup<"experimental-lifetime-safety-strict">; +def LifetimeSafety : DiagGroup<"experimental-lifetime-safety", + [LifetimeSafetyPermissive, LifetimeSafetyStrict]> { + code Documentation = [{ + Experimental warnings to detect 
use-after-free and related temporal safety bugs based on lifetime safety analysis. + }]; +} def DistributedObjectModifiers : DiagGroup<"distributed-object-modifiers">; def DllexportExplicitInstantiationDecl : DiagGroup<"dllexport-explicit-instantiation-decl">; diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index a7f3d3782307..c733e8823cea 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -10671,9 +10671,15 @@ def warn_dangling_reference_captured_by_unknown : Warning< "object whose reference is captured will be destroyed at the end of " "the full-expression">, InGroup; -def warn_experimental_lifetime_safety_dummy_warning : Warning< - "todo: remove this warning after we have atleast one warning based on the lifetime analysis">, - InGroup, DefaultIgnore; +// Diagnostics based on the Lifetime safety analysis. +def warn_lifetime_safety_loan_expires_permissive : Warning< + "object whose reference is captured does not live long enough">, + InGroup, DefaultIgnore; +def warn_lifetime_safety_loan_expires_strict : Warning< + "object whose reference is captured may not live long enough">, + InGroup, DefaultIgnore; +def note_lifetime_safety_used_here : Note<"later used here">; +def note_lifetime_safety_destroyed_here : Note<"destroyed here">; // For non-floating point, expressions of the form x == x or x != x // should result in a warning, since these always evaluate to a constant. diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h index ce4677e54022..25b68622656f 100644 --- a/clang/include/clang/Basic/TargetInfo.h +++ b/clang/include/clang/Basic/TargetInfo.h @@ -233,8 +233,9 @@ protected: bool TLSSupported; bool VLASupported; bool NoAsmVariants; // True if {|} are normal characters. - bool HasLegalHalfType; // True if the backend supports operations on the half - // LLVM IR type. 
+ bool HasFastHalfType; // True if the backend has native half float support, + // and performing calculations in float instead does + // not have a performance advantage. bool HalfArgsAndReturns; // OpenCL 6.1.1.1, NEON (IEEE 754-2008 half) type. bool HasFloat128; bool HasFloat16; @@ -700,8 +701,9 @@ public: return 128; } - /// Determine whether _Float16 is supported on this target. - virtual bool hasLegalHalfType() const { return HasLegalHalfType; } + /// Determine whether the target has fast native support for operations + /// on half types. + virtual bool hasFastHalfType() const { return HasFastHalfType; } /// Whether half args and returns are supported. virtual bool allowHalfArgsAndReturns() const { return HalfArgsAndReturns; } diff --git a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h index 0bf3cb26be85..6244d3430026 100644 --- a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h +++ b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h @@ -504,8 +504,7 @@ public: static OpBuilder::InsertPoint getBestAllocaInsertPoint(mlir::Block *block) { auto last = std::find_if(block->rbegin(), block->rend(), [](mlir::Operation &op) { - // TODO: Add LabelOp missing feature here - return mlir::isa(&op); + return mlir::isa(&op); }); if (last != block->rend()) diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td index a77e9199cdc9..129a6760c935 100644 --- a/clang/include/clang/CIR/Dialect/IR/CIROps.td +++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td @@ -1060,6 +1060,62 @@ def CIR_BrOp : CIR_Op<"br",[ }]; } +//===----------------------------------------------------------------------===// +// GotoOp +//===----------------------------------------------------------------------===// + +def CIR_GotoOp : CIR_Op<"goto", [Terminator]> { + let description = [{ + + Transfers control to the specified `label`. 
This requires a corresponding + `cir.label` to exist and is used to represent source level `goto`s + that jump across region boundaries. Alternatively, `cir.br` is used to + construct goto's that don't violate such boundaries. + + `cir.goto` is completely symbolic (i.e. it "jumps" on a label that isn't + yet materialized) and should be taken into account by passes and analysis + when deciding if it's safe to make some assumptions about a given region + or basic block. + + Example: + ```C++ + int test(int x) { + if (x) + goto label; + { + x = 10; + label: + return x; + } + } + ``` + + ```mlir + cir.scope { // REGION #1 + %2 = cir.load %0 : !cir.ptr, !s32i + %3 = cir.cast(int_to_bool, %2 : !s32i), !cir.bool + cir.if %3 { + cir.goto "label" + } + } + cir.scope { // REGION #2 + %2 = cir.const #cir.int<10> : !s32i + cir.store %2, %0 : !s32i, !cir.ptr + cir.br ^bb1 + ^bb1: // pred: ^bb0 + cir.label "label" + %3 = cir.load %0 : !cir.ptr, !s32i + cir.store %3, %1 : !s32i, !cir.ptr + %4 = cir.load %1 : !cir.ptr, !s32i + cir.return %4 : !s32i + } + cir.unreachable + ``` + }]; + let arguments = (ins StrAttr:$label); + let assemblyFormat = [{ $label attr-dict }]; +} + //===----------------------------------------------------------------------===// // LabelOp //===----------------------------------------------------------------------===// @@ -1749,6 +1805,39 @@ def CIR_VTableAddrPointOp : CIR_Op<"vtable.address_point", [ }]; } +//===----------------------------------------------------------------------===// +// VTableGetVPtr +//===----------------------------------------------------------------------===// + +def CIR_VTableGetVPtrOp : CIR_Op<"vtable.get_vptr", [Pure]> { + let summary = "Get the address of the vtable pointer for an object"; + let description = [{ + The `vtable.get_vptr` operation retrieves the address of the vptr for a + C++ object. This operation requires that the object pointer points to + the start of a complete object. (TODO: Describe how we get that).
+ The vptr will always be at offset zero in the object, but this operation + is more explicit about what is being retrieved than a direct bitcast. + + The return type is always `!cir.ptr`. + + Example: + ```mlir + %2 = cir.load %0 : !cir.ptr>, !cir.ptr + %3 = cir.vtable.get_vptr %2 : !cir.ptr -> !cir.ptr + ``` + }]; + + let arguments = (ins + Arg:$src + ); + + let results = (outs CIR_PtrToVPtr:$result); + + let assemblyFormat = [{ + $src `:` qualified(type($src)) `->` qualified(type($result)) attr-dict + }]; +} + //===----------------------------------------------------------------------===// // SetBitfieldOp //===----------------------------------------------------------------------===// @@ -2210,6 +2299,68 @@ def CIR_CallOp : CIR_CallOpBase<"call", [NoRegionArguments]> { ]; } +//===----------------------------------------------------------------------===// +// ReturnAddrOp and FrameAddrOp +//===----------------------------------------------------------------------===// + +class CIR_FuncAddrBuiltinOp : CIR_Op { + let arguments = (ins CIR_UInt32:$level); + let results = (outs CIR_VoidPtrType:$result); + let assemblyFormat = [{ + `(` $level `)` attr-dict + }]; +} + +def CIR_ReturnAddrOp : CIR_FuncAddrBuiltinOp<"return_address"> { + let summary = + "The return address of the current function, or of one of its callers"; + + let description = [{ + Represents a call to builtin function `__builtin_return_address` in CIR. + This builtin function returns the return address of the current function, + or of one of its callers. + + The `level` argument is the number of frames to scan up the call stack. + For instance, value of 0 yields the return address of the current function, + value of 1 yields the return address of the caller of the current function, + and so forth.
+ + Examples: + + ```mlir + %p = return_address(%level) -> !cir.ptr + ``` + }]; +} + +def CIR_FrameAddrOp : CIR_FuncAddrBuiltinOp<"frame_address"> { + let summary = + "The frame address of the current function, or of one of its callers"; + + let description = [{ + Represents a call to builtin function `__builtin_frame_address` in CIR. + This builtin function returns the frame address of the current function, + or of one of its callers. The frame is the area on the stack that holds + local variables and saved registers. The frame address is normally the + address of the first word pushed on to the stack by the function. + However, the exact definition depends upon the processor and the calling + convention. If the processor has a dedicated frame pointer register, and + the function has a frame, then __builtin_frame_address returns the value of + the frame pointer register. + + The `level` argument is the number of frames to scan up the call stack. + For instance, value of 0 yields the frame address of the current function, + value of 1 yields the frame address of the caller of the current function, + and so forth.
+ + Examples: + + ```mlir + %p = frame_address(%level) -> !cir.ptr + ``` + }]; +} + //===----------------------------------------------------------------------===// // StackSaveOp & StackRestoreOp //===----------------------------------------------------------------------===// diff --git a/clang/include/clang/CIR/Dialect/IR/CIRTypeConstraints.td b/clang/include/clang/CIR/Dialect/IR/CIRTypeConstraints.td index d7d55dfbc065..82f6e1d33043 100644 --- a/clang/include/clang/CIR/Dialect/IR/CIRTypeConstraints.td +++ b/clang/include/clang/CIR/Dialect/IR/CIRTypeConstraints.td @@ -289,6 +289,14 @@ def CIR_AnyFloatOrVecOfFloatType let cppFunctionName = "isFPOrVectorOfFPType"; } +//===----------------------------------------------------------------------===// +// VPtr type predicates +//===----------------------------------------------------------------------===// + +def CIR_AnyVPtrType : CIR_TypeBase<"::cir::VPtrType", "vptr type">; + +def CIR_PtrToVPtr : CIR_PtrToType; + //===----------------------------------------------------------------------===// // Scalar Type predicates //===----------------------------------------------------------------------===// diff --git a/clang/include/clang/CIR/Dialect/IR/CIRTypes.td b/clang/include/clang/CIR/Dialect/IR/CIRTypes.td index a258df79a618..312d0a942267 100644 --- a/clang/include/clang/CIR/Dialect/IR/CIRTypes.td +++ b/clang/include/clang/CIR/Dialect/IR/CIRTypes.td @@ -296,10 +296,10 @@ def CIR_VPtrType : CIR_Type<"VPtr", "vptr", [ access to the vptr. This type will be the element type of the 'vptr' member of structures that - require a vtable pointer. A pointer to this type is returned by the - `cir.vtable.address_point` and `cir.vtable.get_vptr` operations, and this - pointer may be passed to the `cir.vtable.get_virtual_fn_addr` operation to - get the address of a virtual function pointer. + require a vtable pointer. The `cir.vtable.address_point` operation returns + this type. 
The `cir.vtable.get_vptr` operations returns a pointer to this + type. This pointer may be passed to the `cir.vtable.get_virtual_fn_addr` + operation to get the address of a virtual function pointer. The pointer may also be cast to other pointer types in order to perform pointer arithmetic based on information encoded in the AST layout to get diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 858f37c39210..6a2f4575459b 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -6987,7 +6987,6 @@ def static_libgfortran : Flag<["-"], "static-libgfortran">, Group, Group; def finit_character_EQ : Joined<["-"], "finit-character=">, Group; @@ -8695,6 +8694,15 @@ def fopenmp_host_ir_file_path : Separate<["-"], "fopenmp-host-ir-file-path">, } // let Visibility = [CC1Option, FC1Option] +//===----------------------------------------------------------------------===// +// Coarray Options +//===----------------------------------------------------------------------===// + +def fcoarray : Flag<["-"], "fcoarray">, + Group, + Visibility<[FlangOption, FC1Option]>, + HelpText<"Enable Coarray features">; + //===----------------------------------------------------------------------===// // SYCL Options //===----------------------------------------------------------------------===// diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h index 31582a40de86..5dfdb2359461 100644 --- a/clang/include/clang/Format/Format.h +++ b/clang/include/clang/Format/Format.h @@ -4813,14 +4813,45 @@ struct FormatStyle { /// \version 7 bool SpaceBeforeRangeBasedForLoopColon; - /// If ``true``, spaces will be inserted into ``{}``. - /// \code - /// true: false: - /// void f() { } vs. void f() {} - /// while (true) { } while (true) {} - /// \endcode + /// This option is **deprecated**. See ``Block`` of ``SpaceInEmptyBraces``. 
/// \version 10 - bool SpaceInEmptyBlock; + // bool SpaceInEmptyBlock; + + /// Style of when to insert a space in empty braces. + enum SpaceInEmptyBracesStyle : int8_t { + /// Always insert a space in empty braces. + /// \code + /// void f() { } + /// class Unit { }; + /// auto a = [] { }; + /// int x{ }; + /// \endcode + SIEB_Always, + /// Only insert a space in empty blocks. + /// \code + /// void f() { } + /// class Unit { }; + /// auto a = [] { }; + /// int x{}; + /// \endcode + SIEB_Block, + /// Never insert a space in empty braces. + /// \code + /// void f() {} + /// class Unit {}; + /// auto a = [] {}; + /// int x{}; + /// \endcode + SIEB_Never + }; + + /// Specifies when to insert a space in empty braces. + /// \note + /// This option doesn't apply to initializer braces if + /// ``Cpp11BracedListStyle`` is set to ``true``. + /// \endnote + /// \version 22 + SpaceInEmptyBracesStyle SpaceInEmptyBraces; /// If ``true``, spaces may be inserted into ``()``. /// This option is **deprecated**. See ``InEmptyParentheses`` of @@ -5494,7 +5525,7 @@ struct FormatStyle { SpaceBeforeRangeBasedForLoopColon == R.SpaceBeforeRangeBasedForLoopColon && SpaceBeforeSquareBrackets == R.SpaceBeforeSquareBrackets && - SpaceInEmptyBlock == R.SpaceInEmptyBlock && + SpaceInEmptyBraces == R.SpaceInEmptyBraces && SpacesBeforeTrailingComments == R.SpacesBeforeTrailingComments && SpacesInAngles == R.SpacesInAngles && SpacesInContainerLiterals == R.SpacesInContainerLiterals && diff --git a/clang/include/clang/Lex/Lexer.h b/clang/include/clang/Lex/Lexer.h index 06971ff87ab9..423f2ffe2f85 100644 --- a/clang/include/clang/Lex/Lexer.h +++ b/clang/include/clang/Lex/Lexer.h @@ -143,9 +143,6 @@ class Lexer : public PreprocessorLexer { /// True if this is the first time we're lexing the input file. bool IsFirstTimeLexingFile; - /// True if current lexing token is the first pp-token. - bool IsFirstPPToken; - // NewLinePtr - A pointer to new line character '\n' being lexed. 
For '\r\n', // it also points to '\n.' const char *NewLinePtr; diff --git a/clang/include/clang/Lex/NoTrivialPPDirectiveTracer.h b/clang/include/clang/Lex/NoTrivialPPDirectiveTracer.h new file mode 100644 index 000000000000..9ab3c6a528a1 --- /dev/null +++ b/clang/include/clang/Lex/NoTrivialPPDirectiveTracer.h @@ -0,0 +1,310 @@ +//===--- NoTrivialPPDirectiveTracer.h ---------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the NoTrivialPPDirectiveTracer interface. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_LEX_NO_TRIVIAL_PPDIRECTIVE_TRACER_H +#define LLVM_CLANG_LEX_NO_TRIVIAL_PPDIRECTIVE_TRACER_H + +#include "clang/Lex/PPCallbacks.h" + +namespace clang { +class Preprocessor; + +/// Consider the following code: +/// +/// # 1 __FILE__ 1 3 +/// export module a; +/// +/// According to the wording in +/// [P1857R3](https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p1857r3.html): +/// +/// A module directive may only appear as the first preprocessing tokens in a +/// file (excluding the global module fragment.) +/// +/// and the wording in +/// [[cpp.pre]](https://eel.is/c++draft/cpp.pre#nt:module-file): +/// module-file: +/// pp-global-module-fragment[opt] pp-module group[opt] +/// pp-private-module-fragment[opt] +/// +/// `#` is the first pp-token in the translation unit, and it was rejected by +/// clang, but they really should be exempted from this rule. The goal is to not +/// allow any preprocessor conditionals or most state changes, but these don't +/// fit that. +/// +/// State change would mean most semantically observable preprocessor state, +/// particularly anything that is order dependent. 
Global flags like being a +/// system header/module shouldn't matter. +/// +/// We should exempt a bunch of directives, even though it violates the current +/// standard wording. +/// +/// This class is used to trace 'no-trivial' pp-directives in the main file, +/// which may change the preprocessing state. +/// +/// FIXME: Once the wording of the standard is revised, we need to follow the +/// wording of the standard. Currently this is just a workaround. +class NoTrivialPPDirectiveTracer : public PPCallbacks { + Preprocessor &PP; + + /// Whether we are preprocessing the main file. We only focus on the main file. + bool InMainFile = true; + + /// Whether one or more conditional, include or other 'no-trivial' + /// pp-directives have been seen before. + bool SeenNoTrivialPPDirective = false; + + void setSeenNoTrivialPPDirective(); + +public: + NoTrivialPPDirectiveTracer(Preprocessor &P) : PP(P) {} + + bool hasSeenNoTrivialPPDirective() const; + + /// Callback invoked whenever the \p Lexer moves to a different file for + /// lexing. Unlike \p FileChanged line number directives and other related + /// pragmas do not trigger callbacks to \p LexedFileChanged. + /// + /// \param FID The \p FileID that the \p Lexer moved to. + /// + /// \param Reason Whether the \p Lexer entered a new file or exited one. + /// + /// \param FileType The \p CharacteristicKind of the file the \p Lexer moved + /// to. + /// + /// \param PrevFID The \p FileID the \p Lexer was using before the change. + /// + /// \param Loc The location where the \p Lexer entered a new file from or the + /// location that the \p Lexer moved into after exiting a file. + void LexedFileChanged(FileID FID, LexedFileChangeReason Reason, + SrcMgr::CharacteristicKind FileType, FileID PrevFID, + SourceLocation Loc) override; + + /// Callback invoked whenever an embed directive has been processed, + /// regardless of whether the embed will actually find a file. + /// + /// \param HashLoc The location of the '#' that starts the embed directive.
+ /// + /// \param FileName The name of the file being included, as written in the + /// source code. + /// + /// \param IsAngled Whether the file name was enclosed in angle brackets; + /// otherwise, it was enclosed in quotes. + /// + /// \param File The actual file that may be included by this embed directive. + /// + /// \param Params The parameters used by the directive. + void EmbedDirective(SourceLocation HashLoc, StringRef FileName, bool IsAngled, + OptionalFileEntryRef File, + const LexEmbedParametersResult &Params) override { + setSeenNoTrivialPPDirective(); + } + + /// Callback invoked whenever an inclusion directive of + /// any kind (\c \#include, \c \#import, etc.) has been processed, regardless + /// of whether the inclusion will actually result in an inclusion. + /// + /// \param HashLoc The location of the '#' that starts the inclusion + /// directive. + /// + /// \param IncludeTok The token that indicates the kind of inclusion + /// directive, e.g., 'include' or 'import'. + /// + /// \param FileName The name of the file being included, as written in the + /// source code. + /// + /// \param IsAngled Whether the file name was enclosed in angle brackets; + /// otherwise, it was enclosed in quotes. + /// + /// \param FilenameRange The character range of the quotes or angle brackets + /// for the written file name. + /// + /// \param File The actual file that may be included by this inclusion + /// directive. + /// + /// \param SearchPath Contains the search path which was used to find the file + /// in the file system. If the file was found via an absolute include path, + /// SearchPath will be empty. For framework includes, the SearchPath and + /// RelativePath will be split up. For example, if an include of "Some/Some.h" + /// is found via the framework path + /// "path/to/Frameworks/Some.framework/Headers/Some.h", SearchPath will be + /// "path/to/Frameworks/Some.framework/Headers" and RelativePath will be + /// "Some.h". 
+ /// + /// \param RelativePath The path relative to SearchPath, at which the include + /// file was found. This is equal to FileName except for framework includes. + /// + /// \param SuggestedModule The module suggested for this header, if any. + /// + /// \param ModuleImported Whether this include was translated into import of + /// \p SuggestedModule. + /// + /// \param FileType The characteristic kind, indicates whether a file or + /// directory holds normal user code, system code, or system code which is + /// implicitly 'extern "C"' in C++ mode. + /// + void InclusionDirective(SourceLocation HashLoc, const Token &IncludeTok, + StringRef FileName, bool IsAngled, + CharSourceRange FilenameRange, + OptionalFileEntryRef File, StringRef SearchPath, + StringRef RelativePath, const Module *SuggestedModule, + bool ModuleImported, + SrcMgr::CharacteristicKind FileType) override { + setSeenNoTrivialPPDirective(); + } + + /// Callback invoked whenever there was an explicit module-import + /// syntax. + /// + /// \param ImportLoc The location of import directive token. + /// + /// \param Path The identifiers (and their locations) of the module + /// "path", e.g., "std.vector" would be split into "std" and "vector". + /// + /// \param Imported The imported module; can be null if importing failed. + /// + void moduleImport(SourceLocation ImportLoc, ModuleIdPath Path, + const Module *Imported) override { + setSeenNoTrivialPPDirective(); + } + + /// Callback invoked when the end of the main file is reached. + /// + /// No subsequent callbacks will be made. + void EndOfMainFile() override { setSeenNoTrivialPPDirective(); } + + /// Callback invoked when start reading any pragma directive. + void PragmaDirective(SourceLocation Loc, + PragmaIntroducerKind Introducer) override {} + + /// Called by Preprocessor::HandleMacroExpandedIdentifier when a + /// macro invocation is found. 
+ void MacroExpands(const Token &MacroNameTok, const MacroDefinition &MD, + SourceRange Range, const MacroArgs *Args) override; + + /// Hook called whenever a macro definition is seen. + void MacroDefined(const Token &MacroNameTok, + const MacroDirective *MD) override { + setSeenNoTrivialPPDirective(); + } + + /// Hook called whenever a macro \#undef is seen. + /// \param MacroNameTok The active Token + /// \param MD A MacroDefinition for the named macro. + /// \param Undef New MacroDirective if the macro was defined, null otherwise. + /// + /// MD is released immediately following this callback. + void MacroUndefined(const Token &MacroNameTok, const MacroDefinition &MD, + const MacroDirective *Undef) override { + setSeenNoTrivialPPDirective(); + } + + /// Hook called whenever the 'defined' operator is seen. + /// \param MD The MacroDirective if the name was a macro, null otherwise. + void Defined(const Token &MacroNameTok, const MacroDefinition &MD, + SourceRange Range) override { + setSeenNoTrivialPPDirective(); + } + + /// Hook called whenever an \#if is seen. + /// \param Loc the source location of the directive. + /// \param ConditionRange The SourceRange of the expression being tested. + /// \param ConditionValue The evaluated value of the condition. + /// + // FIXME: better to pass in a list (or tree!) of Tokens. + void If(SourceLocation Loc, SourceRange ConditionRange, + ConditionValueKind ConditionValue) override { + setSeenNoTrivialPPDirective(); + } + + /// Hook called whenever an \#elif is seen. + /// \param Loc the source location of the directive. + /// \param ConditionRange The SourceRange of the expression being tested. + /// \param ConditionValue The evaluated value of the condition. + /// \param IfLoc the source location of the \#if/\#ifdef/\#ifndef directive. + // FIXME: better to pass in a list (or tree!) of Tokens. 
+ void Elif(SourceLocation Loc, SourceRange ConditionRange, + ConditionValueKind ConditionValue, SourceLocation IfLoc) override { + setSeenNoTrivialPPDirective(); + } + + /// Hook called whenever an \#ifdef is seen. + /// \param Loc the source location of the directive. + /// \param MacroNameTok Information on the token being tested. + /// \param MD The MacroDefinition if the name was a macro, null otherwise. + void Ifdef(SourceLocation Loc, const Token &MacroNameTok, + const MacroDefinition &MD) override { + setSeenNoTrivialPPDirective(); + } + + /// Hook called whenever an \#elifdef branch is taken. + /// \param Loc the source location of the directive. + /// \param MacroNameTok Information on the token being tested. + /// \param MD The MacroDefinition if the name was a macro, null otherwise. + void Elifdef(SourceLocation Loc, const Token &MacroNameTok, + const MacroDefinition &MD) override { + setSeenNoTrivialPPDirective(); + } + /// Hook called whenever an \#elifdef is skipped. + /// \param Loc the source location of the directive. + /// \param ConditionRange The SourceRange of the expression being tested. + /// \param IfLoc the source location of the \#if/\#ifdef/\#ifndef directive. + // FIXME: better to pass in a list (or tree!) of Tokens. + void Elifdef(SourceLocation Loc, SourceRange ConditionRange, + SourceLocation IfLoc) override { + setSeenNoTrivialPPDirective(); + } + + /// Hook called whenever an \#ifndef is seen. + /// \param Loc the source location of the directive. + /// \param MacroNameTok Information on the token being tested. + /// \param MD The MacroDefinition if the name was a macro, null otherwise. + void Ifndef(SourceLocation Loc, const Token &MacroNameTok, + const MacroDefinition &MD) override { + setSeenNoTrivialPPDirective(); + } + + /// Hook called whenever an \#elifndef branch is taken. + /// \param Loc the source location of the directive. + /// \param MacroNameTok Information on the token being tested.
+ /// \param MD The MacroDefinition if the name was a macro, null otherwise. + void Elifndef(SourceLocation Loc, const Token &MacroNameTok, + const MacroDefinition &MD) override { + setSeenNoTrivialPPDirective(); + } + /// Hook called whenever an \#elifndef is skipped. + /// \param Loc the source location of the directive. + /// \param ConditionRange The SourceRange of the expression being tested. + /// \param IfLoc the source location of the \#if/\#ifdef/\#ifndef directive. + // FIXME: better to pass in a list (or tree!) of Tokens. + void Elifndef(SourceLocation Loc, SourceRange ConditionRange, + SourceLocation IfLoc) override { + setSeenNoTrivialPPDirective(); + } + + /// Hook called whenever an \#else is seen. + /// \param Loc the source location of the directive. + /// \param IfLoc the source location of the \#if/\#ifdef/\#ifndef directive. + void Else(SourceLocation Loc, SourceLocation IfLoc) override { + setSeenNoTrivialPPDirective(); + } + + /// Hook called whenever an \#endif is seen. + /// \param Loc the source location of the directive. + /// \param IfLoc the source location of the \#if/\#ifdef/\#ifndef directive. + void Endif(SourceLocation Loc, SourceLocation IfLoc) override { + setSeenNoTrivialPPDirective(); + } +}; + +} // namespace clang + +#endif // LLVM_CLANG_LEX_NO_TRIVIAL_PPDIRECTIVE_TRACER_H diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h index 71b0f8eab3bf..39754847a93e 100644 --- a/clang/include/clang/Lex/Preprocessor.h +++ b/clang/include/clang/Lex/Preprocessor.h @@ -82,6 +82,7 @@ class PreprocessorLexer; class PreprocessorOptions; class ScratchBuffer; class TargetInfo; +class NoTrivialPPDirectiveTracer; namespace Builtin { class Context; @@ -353,6 +354,11 @@ private: /// First pp-token source location in current translation unit. SourceLocation FirstPPTokenLoc; + /// A preprocessor directive tracer to trace whether the preprocessing + /// state changed. 
These changes would mean most semantically observable + /// preprocessor state, particularly anything that is order dependent. + NoTrivialPPDirectiveTracer *DirTracer = nullptr; + /// A position within a C++20 import-seq. class StdCXXImportSeq { public: @@ -609,6 +615,8 @@ private: return State == NamedModuleImplementation && !getName().contains(':'); } + bool isNotAModuleDecl() const { return State == NotAModuleDecl; } + StringRef getName() const { assert(isNamedModule() && "Can't get name from a non named module"); return Name; @@ -3091,6 +3099,10 @@ public: bool setDeserializedSafeBufferOptOutMap( const SmallVectorImpl &SrcLocSeqs); + /// Whether we've seen pp-directives which may have changed the preprocessing + /// state. + bool hasSeenNoTrivialPPDirective() const; + private: /// Helper functions to forward lexing to the actual lexer. They all share the /// same signature. diff --git a/clang/include/clang/Lex/Token.h b/clang/include/clang/Lex/Token.h index fc43e72593b9..d9dc5a562d80 100644 --- a/clang/include/clang/Lex/Token.h +++ b/clang/include/clang/Lex/Token.h @@ -86,12 +86,12 @@ public: // macro stringizing or charizing operator. CommaAfterElided = 0x200, // The comma following this token was elided (MS). IsEditorPlaceholder = 0x400, // This identifier is a placeholder. - - IsReinjected = 0x800, // A phase 4 token that was produced before and - // re-added, e.g. via EnterTokenStream. Annotation - // tokens are *not* reinjected. - FirstPPToken = 0x1000, // This token is the first pp token in the - // translation unit. + IsReinjected = 0x800, // A phase 4 token that was produced before and + // re-added, e.g. via EnterTokenStream. Annotation + // tokens are *not* reinjected. + HasSeenNoTrivialPPDirective = + 0x1000, // Whether we've seen any 'no-trivial' pp-directives before + // current position. }; tok::TokenKind getKind() const { return Kind; } @@ -321,8 +321,9 @@ public: /// lexer uses identifier tokens to represent placeholders. 
bool isEditorPlaceholder() const { return getFlag(IsEditorPlaceholder); } - /// Returns true if this token is the first pp-token. - bool isFirstPPToken() const { return getFlag(FirstPPToken); } + bool hasSeenNoTrivialPPDirective() const { + return getFlag(HasSeenNoTrivialPPDirective); + } }; /// Information about the conditional stack (\#if directives) diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index f93380345060..da9070842694 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -9837,7 +9837,7 @@ public: SourceLocation ModuleLoc, ModuleDeclKind MDK, ModuleIdPath Path, ModuleIdPath Partition, ModuleImportState &ImportState, - bool IntroducerIsFirstPPToken); + bool SeenNoTrivialPPDirective); /// The parser has processed a global-module-fragment declaration that begins /// the definition of the global module fragment of the current module unit. diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index 8e651cf06062..f2ce69a62838 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -60,16 +60,18 @@ template class OptionScope final { public: /// Root constructor, compiling or discarding primitives. OptionScope(Compiler *Ctx, bool NewDiscardResult, - bool NewInitializing) + bool NewInitializing, bool NewToLValue) : Ctx(Ctx), OldDiscardResult(Ctx->DiscardResult), - OldInitializing(Ctx->Initializing) { + OldInitializing(Ctx->Initializing), OldToLValue(NewToLValue) { Ctx->DiscardResult = NewDiscardResult; Ctx->Initializing = NewInitializing; + Ctx->ToLValue = NewToLValue; } ~OptionScope() { Ctx->DiscardResult = OldDiscardResult; Ctx->Initializing = OldInitializing; + Ctx->ToLValue = OldToLValue; } private: @@ -78,6 +80,7 @@ private: /// Old discard flag to restore. 
bool OldDiscardResult; bool OldInitializing; + bool OldToLValue; }; template @@ -222,6 +225,9 @@ bool Compiler::VisitCastExpr(const CastExpr *CE) { switch (CE->getCastKind()) { case CK_LValueToRValue: { + if (ToLValue && CE->getType()->isPointerType()) + return this->delegate(SubExpr); + if (SubExpr->getType().isVolatileQualified()) return this->emitInvalidCast(CastKind::Volatile, /*Fatal=*/true, CE); @@ -2974,20 +2980,25 @@ bool Compiler::VisitCompoundLiteralExpr(const CompoundLiteralExpr *E) { if (T && !E->isLValue()) return this->delegate(Init); - if (std::optional GlobalIndex = P.createGlobal(E)) { - if (!this->emitGetPtrGlobal(*GlobalIndex, E)) + std::optional GlobalIndex = P.createGlobal(E); + if (!GlobalIndex) + return false; + + if (!this->emitGetPtrGlobal(*GlobalIndex, E)) + return false; + + // Since this is a global variable, we might've already seen, + // don't do it again. + if (P.isGlobalInitialized(*GlobalIndex)) + return true; + + if (T) { + if (!this->visit(Init)) return false; - - if (T) { - if (!this->visit(Init)) - return false; - return this->emitInitGlobal(*T, *GlobalIndex, E); - } - - return this->visitInitializer(Init) && this->emitFinishInit(E); + return this->emitInitGlobal(*T, *GlobalIndex, E); } - return false; + return this->visitInitializer(Init) && this->emitFinishInit(E); } // Otherwise, use a local variable. @@ -4140,13 +4151,13 @@ bool Compiler::VisitStmtExpr(const StmtExpr *E) { template bool Compiler::discard(const Expr *E) { OptionScope Scope(this, /*NewDiscardResult=*/true, - /*NewInitializing=*/false); + /*NewInitializing=*/false, /*ToLValue=*/false); return this->Visit(E); } template bool Compiler::delegate(const Expr *E) { // We're basically doing: - // OptionScope Scope(this, DicardResult, Initializing); + // OptionScope Scope(this, DicardResult, Initializing, ToLValue); // but that's unnecessary of course. 
return this->Visit(E); } @@ -4174,7 +4185,7 @@ template bool Compiler::visit(const Expr *E) { // Otherwise,we have a primitive return value, produce the value directly // and push it on the stack. OptionScope Scope(this, /*NewDiscardResult=*/false, - /*NewInitializing=*/false); + /*NewInitializing=*/false, /*ToLValue=*/ToLValue); return this->Visit(E); } @@ -4183,7 +4194,13 @@ bool Compiler::visitInitializer(const Expr *E) { assert(!canClassify(E->getType())); OptionScope Scope(this, /*NewDiscardResult=*/false, - /*NewInitializing=*/true); + /*NewInitializing=*/true, /*ToLValue=*/false); + return this->Visit(E); +} + +template bool Compiler::visitAsLValue(const Expr *E) { + OptionScope Scope(this, /*NewDiscardResult=*/false, + /*NewInitializing=*/false, /*ToLValue=*/true); return this->Visit(E); } @@ -4944,7 +4961,6 @@ bool Compiler::visitAPValueInitializer(const APValue &Val, template bool Compiler::VisitBuiltinCallExpr(const CallExpr *E, unsigned BuiltinID) { - if (BuiltinID == Builtin::BI__builtin_constant_p) { // Void argument is always invalid and harder to handle later. if (E->getArg(0)->getType()->isVoidType()) { @@ -4989,11 +5005,31 @@ bool Compiler::VisitBuiltinCallExpr(const CallExpr *E, return false; } - if (!Context::isUnevaluatedBuiltin(BuiltinID)) { - // Put arguments on the stack. - for (const auto *Arg : E->arguments()) { - if (!this->visit(Arg)) + // Prepare function arguments including special cases. + switch (BuiltinID) { + case Builtin::BI__builtin_object_size: + case Builtin::BI__builtin_dynamic_object_size: { + assert(E->getNumArgs() == 2); + const Expr *Arg0 = E->getArg(0); + if (Arg0->isGLValue()) { + if (!this->visit(Arg0)) return false; + + } else { + if (!this->visitAsLValue(Arg0)) + return false; + } + if (!this->visit(E->getArg(1))) + return false; + + } break; + default: + if (!Context::isUnevaluatedBuiltin(BuiltinID)) { + // Put arguments on the stack. 
+ for (const auto *Arg : E->arguments()) { + if (!this->visit(Arg)) + return false; + } } } @@ -5146,7 +5182,8 @@ bool Compiler::VisitCallExpr(const CallExpr *E) { if (!this->emitCheckPseudoDtor(E)) return false; const Expr *Base = PD->getBase(); - if (!Base->isGLValue()) + // E.g. `using T = int; 0.~T();`. + if (OptPrimType BaseT = classify(Base); !BaseT || BaseT != PT_Ptr) return this->discard(Base); if (!this->visit(Base)) return false; @@ -6745,6 +6782,22 @@ bool Compiler::visitDeclRef(const ValueDecl *D, const Expr *E) { // value. bool IsReference = D->getType()->isReferenceType(); + // Function parameters. + // Note that it's important to check them first since we might have a local + // variable created for a ParmVarDecl as well. + if (const auto *PVD = dyn_cast(D)) { + if (Ctx.getLangOpts().CPlusPlus && !Ctx.getLangOpts().CPlusPlus11 && + !D->getType()->isIntegralOrEnumerationType()) { + return this->emitInvalidDeclRef(cast(E), + /*InitializerFailed=*/false, E); + } + if (auto It = this->Params.find(PVD); It != this->Params.end()) { + if (IsReference || !It->second.IsPtr) + return this->emitGetParam(classifyPrim(E), It->second.Offset, E); + + return this->emitGetPtrParam(It->second.Offset, E); + } + } // Local variables. if (auto It = Locals.find(D); It != Locals.end()) { const unsigned Offset = It->second.Offset; @@ -6762,20 +6815,6 @@ bool Compiler::visitDeclRef(const ValueDecl *D, const Expr *E) { return this->emitGetPtrGlobal(*GlobalIndex, E); } - // Function parameters. 
- if (const auto *PVD = dyn_cast(D)) { - if (Ctx.getLangOpts().CPlusPlus && !Ctx.getLangOpts().CPlusPlus11 && - !D->getType()->isIntegralOrEnumerationType()) { - return this->emitInvalidDeclRef(cast(E), - /*InitializerFailed=*/false, E); - } - if (auto It = this->Params.find(PVD); It != this->Params.end()) { - if (IsReference || !It->second.IsPtr) - return this->emitGetParam(classifyPrim(E), It->second.Offset, E); - - return this->emitGetPtrParam(It->second.Offset, E); - } - } // In case we need to re-visit a declaration. auto revisit = [&](const VarDecl *VD) -> bool { diff --git a/clang/lib/AST/ByteCode/Compiler.h b/clang/lib/AST/ByteCode/Compiler.h index 901934e530ad..20571df0432f 100644 --- a/clang/lib/AST/ByteCode/Compiler.h +++ b/clang/lib/AST/ByteCode/Compiler.h @@ -282,6 +282,7 @@ protected: /// been created. visitInitializer() then relies on a pointer to this /// variable being on top of the stack. bool visitInitializer(const Expr *E); + bool visitAsLValue(const Expr *E); /// Evaluates an expression for side effects and discards the result. bool discard(const Expr *E); /// Just pass evaluation on to \p E. This leaves all the parsing flags @@ -426,6 +427,7 @@ protected: bool DiscardResult = false; bool InStmtExpr = false; + bool ToLValue = false; /// Flag inidicating if we're initializing an already created /// variable. This is set in visitInitializer(). 
diff --git a/clang/lib/AST/ByteCode/Context.cpp b/clang/lib/AST/ByteCode/Context.cpp index 6343b2af313f..36eb7607e70b 100644 --- a/clang/lib/AST/ByteCode/Context.cpp +++ b/clang/lib/AST/ByteCode/Context.cpp @@ -398,17 +398,11 @@ const llvm::fltSemantics &Context::getFloatSemantics(QualType T) const { } bool Context::Run(State &Parent, const Function *Func) { - - { - InterpState State(Parent, *P, Stk, *this, Func); - if (Interpret(State)) { - assert(Stk.empty()); - return true; - } - // State gets destroyed here, so the Stk.clear() below doesn't accidentally - // remove values the State's destructor might access. + InterpState State(Parent, *P, Stk, *this, Func); + if (Interpret(State)) { + assert(Stk.empty()); + return true; } - Stk.clear(); return false; } diff --git a/clang/lib/AST/ByteCode/Context.h b/clang/lib/AST/ByteCode/Context.h index a6d90bb38506..fa98498dbe8f 100644 --- a/clang/lib/AST/ByteCode/Context.h +++ b/clang/lib/AST/ByteCode/Context.h @@ -30,7 +30,7 @@ namespace interp { class Function; class Program; class State; -enum PrimType : unsigned; +enum PrimType : uint8_t; struct ParamOffset { unsigned Offset; diff --git a/clang/lib/AST/ByteCode/Descriptor.h b/clang/lib/AST/ByteCode/Descriptor.h index 4a808c0a2d21..90dc2b4aa311 100644 --- a/clang/lib/AST/ByteCode/Descriptor.h +++ b/clang/lib/AST/ByteCode/Descriptor.h @@ -24,7 +24,7 @@ class Record; class SourceInfo; struct InitMap; struct Descriptor; -enum PrimType : unsigned; +enum PrimType : uint8_t; using DeclTy = llvm::PointerUnion; using InitMapPtr = std::optional>>; diff --git a/clang/lib/AST/ByteCode/Function.h b/clang/lib/AST/ByteCode/Function.h index 92363b62c85d..af429b7849e8 100644 --- a/clang/lib/AST/ByteCode/Function.h +++ b/clang/lib/AST/ByteCode/Function.h @@ -28,7 +28,7 @@ namespace interp { class Program; class ByteCodeEmitter; class Pointer; -enum PrimType : uint32_t; +enum PrimType : uint8_t; /// Describes a scope block. 
/// diff --git a/clang/lib/AST/ByteCode/Interp.cpp b/clang/lib/AST/ByteCode/Interp.cpp index 931d3879f0ff..aeab9ff38171 100644 --- a/clang/lib/AST/ByteCode/Interp.cpp +++ b/clang/lib/AST/ByteCode/Interp.cpp @@ -1852,6 +1852,11 @@ bool EndLifetime(InterpState &S, CodePtr OpPC) { const auto &Ptr = S.Stk.peek(); if (Ptr.isBlockPointer() && !CheckDummy(S, OpPC, Ptr.block(), AK_Destroy)) return false; + + // FIXME: We need per-element lifetime information for primitive arrays. + if (Ptr.isArrayElement()) + return true; + endLifetimeRecurse(Ptr.narrow()); return true; } @@ -1861,6 +1866,11 @@ bool EndLifetimePop(InterpState &S, CodePtr OpPC) { const auto &Ptr = S.Stk.pop(); if (Ptr.isBlockPointer() && !CheckDummy(S, OpPC, Ptr.block(), AK_Destroy)) return false; + + // FIXME: We need per-element lifetime information for primitive arrays. + if (Ptr.isArrayElement()) + return true; + endLifetimeRecurse(Ptr.narrow()); return true; } diff --git a/clang/lib/AST/ByteCode/InterpBlock.cpp b/clang/lib/AST/ByteCode/InterpBlock.cpp index 69221d85d671..b7fd324594c8 100644 --- a/clang/lib/AST/ByteCode/InterpBlock.cpp +++ b/clang/lib/AST/ByteCode/InterpBlock.cpp @@ -18,10 +18,6 @@ using namespace clang::interp; void Block::addPointer(Pointer *P) { assert(P); - if (IsStatic) { - assert(!Pointers); - return; - } #ifndef NDEBUG assert(!hasPointer(P)); @@ -39,10 +35,6 @@ void Block::addPointer(Pointer *P) { void Block::removePointer(Pointer *P) { assert(P->isBlockPointer()); assert(P); - if (IsStatic) { - assert(!Pointers); - return; - } #ifndef NDEBUG assert(hasPointer(P)); @@ -74,10 +66,6 @@ void Block::replacePointer(Pointer *Old, Pointer *New) { assert(New); assert(New->isBlockPointer()); assert(Old != New); - if (IsStatic) { - assert(!Pointers); - return; - } #ifndef NDEBUG assert(hasPointer(Old)); #endif diff --git a/clang/lib/AST/ByteCode/InterpBlock.h b/clang/lib/AST/ByteCode/InterpBlock.h index 7ded1e8649fd..778ac8fdb085 100644 --- a/clang/lib/AST/ByteCode/InterpBlock.h +++ 
b/clang/lib/AST/ByteCode/InterpBlock.h @@ -22,7 +22,7 @@ class Block; class DeadBlock; class InterpState; class Pointer; -enum PrimType : unsigned; +enum PrimType : uint8_t; /// A memory block, either on the stack or in the heap. /// @@ -50,9 +50,9 @@ private: public: /// Creates a new block. - Block(unsigned EvalID, const std::optional &DeclID, - const Descriptor *Desc, bool IsStatic = false, bool IsExtern = false, - bool IsWeak = false, bool IsDummy = false) + Block(unsigned EvalID, UnsignedOrNone DeclID, const Descriptor *Desc, + bool IsStatic = false, bool IsExtern = false, bool IsWeak = false, + bool IsDummy = false) : Desc(Desc), DeclID(DeclID), EvalID(EvalID), IsStatic(IsStatic) { assert(Desc); AccessFlags |= (ExternFlag * IsExtern); @@ -62,8 +62,7 @@ public: Block(unsigned EvalID, const Descriptor *Desc, bool IsStatic = false, bool IsExtern = false, bool IsWeak = false, bool IsDummy = false) - : Desc(Desc), DeclID((unsigned)-1), EvalID(EvalID), IsStatic(IsStatic), - IsDynamic(false) { + : Desc(Desc), EvalID(EvalID), IsStatic(IsStatic), IsDynamic(false) { assert(Desc); AccessFlags |= (ExternFlag * IsExtern); AccessFlags |= (WeakFlag * IsWeak); @@ -87,7 +86,7 @@ public: /// Returns the size of the block. unsigned getSize() const { return Desc->getAllocSize(); } /// Returns the declaration ID. - std::optional getDeclID() const { return DeclID; } + UnsignedOrNone getDeclID() const { return DeclID; } /// Returns whether the data of this block has been initialized via /// invoking the Ctor func. bool isInitialized() const { return IsInitialized; } @@ -177,7 +176,7 @@ private: /// Start of the chain of pointers. Pointer *Pointers = nullptr; /// Unique identifier of the declaration. - std::optional DeclID; + UnsignedOrNone DeclID = std::nullopt; const unsigned EvalID = ~0u; /// Flag indicating if the block has static storage duration. 
bool IsStatic = false; diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index d5dcb1f58119..0d49059460c2 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -2170,29 +2170,32 @@ static bool interp__builtin_memchr(InterpState &S, CodePtr OpPC, return true; } -static unsigned computeFullDescSize(const ASTContext &ASTCtx, - const Descriptor *Desc) { - +static std::optional computeFullDescSize(const ASTContext &ASTCtx, + const Descriptor *Desc) { if (Desc->isPrimitive()) return ASTCtx.getTypeSizeInChars(Desc->getType()).getQuantity(); - if (Desc->isArray()) return ASTCtx.getTypeSizeInChars(Desc->getElemQualType()).getQuantity() * Desc->getNumElems(); + if (Desc->isRecord()) { + // Can't use Descriptor::getType() as that may return a pointer type. Look + // at the decl directly. + return ASTCtx + .getTypeSizeInChars( + ASTCtx.getCanonicalTagType(Desc->ElemRecord->getDecl())) + .getQuantity(); + } - if (Desc->isRecord()) - return ASTCtx.getTypeSizeInChars(Desc->getType()).getQuantity(); - - llvm_unreachable("Unhandled descriptor type"); - return 0; + return std::nullopt; } +/// Compute the byte offset of \p Ptr in the full declaration. static unsigned computePointerOffset(const ASTContext &ASTCtx, const Pointer &Ptr) { unsigned Result = 0; Pointer P = Ptr; - while (P.isArrayElement() || P.isField()) { + while (P.isField() || P.isArrayElement()) { P = P.expand(); const Descriptor *D = P.getFieldDesc(); @@ -2205,7 +2208,6 @@ static unsigned computePointerOffset(const ASTContext &ASTCtx, Result += ElemSize * P.getIndex(); P = P.expand().getArray(); } else if (P.isBaseClass()) { - const auto *RD = cast(D->asDecl()); bool IsVirtual = Ptr.isVirtualBaseClass(); P = P.getBase(); @@ -2234,30 +2236,136 @@ static unsigned computePointerOffset(const ASTContext &ASTCtx, return Result; } +/// Does Ptr point to the last subobject? 
+static bool pointsToLastObject(const Pointer &Ptr) { + Pointer P = Ptr; + while (!P.isRoot()) { + + if (P.isArrayElement()) { + P = P.expand().getArray(); + continue; + } + if (P.isBaseClass()) { + if (P.getRecord()->getNumFields() > 0) + return false; + P = P.getBase(); + continue; + } + + Pointer Base = P.getBase(); + if (const Record *R = Base.getRecord()) { + assert(P.getField()); + if (P.getField()->getFieldIndex() != R->getNumFields() - 1) + return false; + } + P = Base; + } + + return true; +} + +/// Does Ptr point to the last object AND to a flexible array member? +static bool isUserWritingOffTheEnd(const ASTContext &Ctx, const Pointer &Ptr) { + auto isFlexibleArrayMember = [&](const Descriptor *FieldDesc) { + using FAMKind = LangOptions::StrictFlexArraysLevelKind; + FAMKind StrictFlexArraysLevel = + Ctx.getLangOpts().getStrictFlexArraysLevel(); + + if (StrictFlexArraysLevel == FAMKind::Default) + return true; + + unsigned NumElems = FieldDesc->getNumElems(); + if (NumElems == 0 && StrictFlexArraysLevel != FAMKind::IncompleteOnly) + return true; + + if (NumElems == 1 && StrictFlexArraysLevel == FAMKind::OneZeroOrIncomplete) + return true; + return false; + }; + + const Descriptor *FieldDesc = Ptr.getFieldDesc(); + if (!FieldDesc->isArray()) + return false; + + return Ptr.isDummy() && pointsToLastObject(Ptr) && + isFlexibleArrayMember(FieldDesc); +} + static bool interp__builtin_object_size(InterpState &S, CodePtr OpPC, const InterpFrame *Frame, const CallExpr *Call) { + const ASTContext &ASTCtx = S.getASTContext(); PrimType KindT = *S.getContext().classify(Call->getArg(1)); - [[maybe_unused]] unsigned Kind = popToAPSInt(S.Stk, KindT).getZExtValue(); - + // From the GCC docs: + // Kind is an integer constant from 0 to 3. If the least significant bit is + // clear, objects are whole variables. If it is set, a closest surrounding + // subobject is considered the object a pointer points to. 
The second bit + // determines if maximum or minimum of remaining bytes is computed. + unsigned Kind = popToAPSInt(S.Stk, KindT).getZExtValue(); assert(Kind <= 3 && "unexpected kind"); - + bool UseFieldDesc = (Kind & 1u); + bool ReportMinimum = (Kind & 2u); const Pointer &Ptr = S.Stk.pop(); - if (Ptr.isZero()) + if (Call->getArg(0)->HasSideEffects(ASTCtx)) { + // "If there are any side effects in them, it returns (size_t) -1 + // for type 0 or 1 and (size_t) 0 for type 2 or 3." + pushInteger(S, Kind <= 1 ? -1 : 0, Call->getType()); + return true; + } + + if (Ptr.isZero() || !Ptr.isBlockPointer()) return false; + // We can't load through pointers. + if (Ptr.isDummy() && Ptr.getType()->isPointerType()) + return false; + + bool DetermineForCompleteObject = Ptr.getFieldDesc() == Ptr.getDeclDesc(); const Descriptor *DeclDesc = Ptr.getDeclDesc(); - if (!DeclDesc) + assert(DeclDesc); + + if (!UseFieldDesc || DetermineForCompleteObject) { + // Lower bound, so we can't fall back to this. + if (ReportMinimum && !DetermineForCompleteObject) + return false; + + // Can't read beyond the pointer decl desc. + if (!UseFieldDesc && !ReportMinimum && DeclDesc->getType()->isPointerType()) + return false; + } else { + if (isUserWritingOffTheEnd(ASTCtx, Ptr.expand())) { + // If we cannot determine the size of the initial allocation, then we + // can't given an accurate upper-bound. However, we are still able to give + // conservative lower-bounds for Type=3. + if (Kind == 1) + return false; + } + } + + const Descriptor *Desc = UseFieldDesc ? 
Ptr.getFieldDesc() : DeclDesc; + assert(Desc); + + std::optional FullSize = computeFullDescSize(ASTCtx, Desc); + if (!FullSize) return false; - const ASTContext &ASTCtx = S.getASTContext(); + unsigned ByteOffset; + if (UseFieldDesc) { + if (Ptr.isBaseClass()) + ByteOffset = computePointerOffset(ASTCtx, Ptr.getBase()) - + computePointerOffset(ASTCtx, Ptr); + else + ByteOffset = + computePointerOffset(ASTCtx, Ptr) - + computePointerOffset(ASTCtx, Ptr.expand().atIndex(0).narrow()); + } else + ByteOffset = computePointerOffset(ASTCtx, Ptr); - unsigned ByteOffset = computePointerOffset(ASTCtx, Ptr); - unsigned FullSize = computeFullDescSize(ASTCtx, DeclDesc); - - pushInteger(S, FullSize - ByteOffset, Call->getType()); + assert(ByteOffset <= *FullSize); + unsigned Result = *FullSize - ByteOffset; + pushInteger(S, Result, Call->getType()); return true; } diff --git a/clang/lib/AST/ByteCode/InterpStack.cpp b/clang/lib/AST/ByteCode/InterpStack.cpp index 6b748d62b83b..7920378f365f 100644 --- a/clang/lib/AST/ByteCode/InterpStack.cpp +++ b/clang/lib/AST/ByteCode/InterpStack.cpp @@ -26,33 +26,33 @@ InterpStack::~InterpStack() { std::free(Chunk); Chunk = nullptr; StackSize = 0; -#ifndef NDEBUG ItemTypes.clear(); -#endif } // We keep the last chunk around to reuse. 
void InterpStack::clear() { - if (!Chunk) - return; - - if (Chunk->Next) - std::free(Chunk->Next); - - assert(Chunk); - StackSize = 0; -#ifndef NDEBUG - ItemTypes.clear(); -#endif + for (PrimType Item : llvm::reverse(ItemTypes)) { + TYPE_SWITCH(Item, { this->discard(); }); + } + assert(ItemTypes.empty()); + assert(empty()); } void InterpStack::clearTo(size_t NewSize) { - assert(NewSize <= size()); - size_t ToShrink = size() - NewSize; - if (ToShrink == 0) + if (NewSize == 0) + return clear(); + if (NewSize == size()) return; - shrink(ToShrink); + assert(NewSize <= size()); + for (PrimType Item : llvm::reverse(ItemTypes)) { + TYPE_SWITCH(Item, { this->discard(); }); + + if (size() == NewSize) + break; + } + + // Note: discard() above already removed the types from ItemTypes. assert(size() == NewSize); } @@ -105,25 +105,9 @@ void InterpStack::shrink(size_t Size) { Chunk->End -= Size; StackSize -= Size; - -#ifndef NDEBUG - size_t TypesSize = 0; - for (PrimType T : ItemTypes) - TYPE_SWITCH(T, { TypesSize += aligned_size(); }); - - size_t StackSize = size(); - while (TypesSize > StackSize) { - TYPE_SWITCH(ItemTypes.back(), { - TypesSize -= aligned_size(); - ItemTypes.pop_back(); - }); - } - assert(TypesSize == StackSize); -#endif } void InterpStack::dump() const { -#ifndef NDEBUG llvm::errs() << "Items: " << ItemTypes.size() << ". Size: " << size() << '\n'; if (ItemTypes.empty()) return; @@ -133,11 +117,11 @@ void InterpStack::dump() const { // The type of the item on the top of the stack is inserted to the back // of the vector, so the iteration has to happen backwards. 
- for (auto TyIt = ItemTypes.rbegin(); TyIt != ItemTypes.rend(); ++TyIt) { - Offset += align(primSize(*TyIt)); + for (PrimType Item : llvm::reverse(ItemTypes)) { + Offset += align(primSize(Item)); llvm::errs() << Index << '/' << Offset << ": "; - TYPE_SWITCH(*TyIt, { + TYPE_SWITCH(Item, { const T &V = peek(Offset); llvm::errs() << V; }); @@ -145,5 +129,4 @@ void InterpStack::dump() const { ++Index; } -#endif } diff --git a/clang/lib/AST/ByteCode/InterpStack.h b/clang/lib/AST/ByteCode/InterpStack.h index 580494eb2347..b0f9f6e22568 100644 --- a/clang/lib/AST/ByteCode/InterpStack.h +++ b/clang/lib/AST/ByteCode/InterpStack.h @@ -17,7 +17,6 @@ #include "IntegralAP.h" #include "MemberPointer.h" #include "PrimType.h" -#include namespace clang { namespace interp { @@ -33,18 +32,14 @@ public: /// Constructs a value in place on the top of the stack. template void push(Tys &&...Args) { new (grow(aligned_size())) T(std::forward(Args)...); -#ifndef NDEBUG ItemTypes.push_back(toPrimType()); -#endif } /// Returns the value from the top of the stack and removes it. template T pop() { -#ifndef NDEBUG assert(!ItemTypes.empty()); assert(ItemTypes.back() == toPrimType()); ItemTypes.pop_back(); -#endif T *Ptr = &peekInternal(); T Value = std::move(*Ptr); shrink(aligned_size()); @@ -53,22 +48,20 @@ public: /// Discards the top value from the stack. template void discard() { -#ifndef NDEBUG assert(!ItemTypes.empty()); assert(ItemTypes.back() == toPrimType()); ItemTypes.pop_back(); -#endif T *Ptr = &peekInternal(); - Ptr->~T(); + if constexpr (!std::is_trivially_destructible_v) { + Ptr->~T(); + } shrink(aligned_size()); } /// Returns a reference to the value on the top of the stack. template T &peek() const { -#ifndef NDEBUG assert(!ItemTypes.empty()); assert(ItemTypes.back() == toPrimType()); -#endif return peekInternal(); } @@ -83,7 +76,7 @@ public: /// Returns the size of the stack in bytes. 
size_t size() const { return StackSize; } - /// Clears the stack without calling any destructors. + /// Clears the stack. void clear(); void clearTo(size_t NewSize); @@ -146,9 +139,11 @@ private: /// Total size of the stack. size_t StackSize = 0; -#ifndef NDEBUG - /// vector recording the type of data we pushed into the stack. - std::vector ItemTypes; + /// SmallVector recording the type of data we pushed into the stack. + /// We don't usually need this during normal code interpretation but + /// when aborting, we need type information to call the destructors + /// for what's left on the stack. + llvm::SmallVector ItemTypes; template static constexpr PrimType toPrimType() { if constexpr (std::is_same_v) @@ -192,7 +187,6 @@ private: llvm_unreachable("unknown type push()'ed into InterpStack"); } -#endif }; } // namespace interp diff --git a/clang/lib/AST/ByteCode/InterpState.cpp b/clang/lib/AST/ByteCode/InterpState.cpp index b5f0f9a44f34..f89967759ff9 100644 --- a/clang/lib/AST/ByteCode/InterpState.cpp +++ b/clang/lib/AST/ByteCode/InterpState.cpp @@ -45,6 +45,12 @@ InterpState::~InterpState() { while (DeadBlocks) { DeadBlock *Next = DeadBlocks->Next; + + // There might be a pointer in a global structure pointing to the dead + // block. + for (Pointer *P = DeadBlocks->B.Pointers; P; P = P->asBlockPointer().Next) + DeadBlocks->B.removePointer(P); + std::free(DeadBlocks); DeadBlocks = Next; } @@ -53,12 +59,6 @@ InterpState::~InterpState() { void InterpState::cleanup() { // As a last resort, make sure all pointers still pointing to a dead block // don't point to it anymore. 
- for (DeadBlock *DB = DeadBlocks; DB; DB = DB->Next) { - for (Pointer *P = DB->B.Pointers; P; P = P->asBlockPointer().Next) { - P->PointeeStorage.BS.Pointee = nullptr; - } - } - Alloc.cleanup(); } diff --git a/clang/lib/AST/ByteCode/Pointer.h b/clang/lib/AST/ByteCode/Pointer.h index 94c83a0d87bc..1dcdc0424801 100644 --- a/clang/lib/AST/ByteCode/Pointer.h +++ b/clang/lib/AST/ByteCode/Pointer.h @@ -29,7 +29,7 @@ class DeadBlock; class Pointer; class Context; template class Integral; -enum PrimType : unsigned; +enum PrimType : uint8_t; class Pointer; inline llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Pointer &P); @@ -593,7 +593,7 @@ public: } /// Returns the declaration ID. - std::optional getDeclID() const { + UnsignedOrNone getDeclID() const { if (isBlockPointer()) { assert(asBlockPointer().Pointee); return asBlockPointer().Pointee->getDeclID(); diff --git a/clang/lib/AST/ByteCode/PrimType.h b/clang/lib/AST/ByteCode/PrimType.h index 724da93ca1ef..093084a8aad7 100644 --- a/clang/lib/AST/ByteCode/PrimType.h +++ b/clang/lib/AST/ByteCode/PrimType.h @@ -31,7 +31,7 @@ template class IntegralAP; template class Integral; /// Enumeration of the primitive types of the VM. -enum PrimType : unsigned { +enum PrimType : uint8_t { PT_Sint8 = 0, PT_Uint8 = 1, PT_Sint16 = 2, @@ -51,14 +51,15 @@ enum PrimType : unsigned { // Like std::optional, but only sizeof(PrimType). 
class OptPrimType final { - unsigned V = ~0u; + static constexpr uint8_t None = 0xFF; + uint8_t V = None; public: OptPrimType() = default; OptPrimType(std::nullopt_t) {} OptPrimType(PrimType T) : V(static_cast(T)) {} - explicit constexpr operator bool() const { return V != ~0u; } + explicit constexpr operator bool() const { return V != None; } PrimType operator*() const { assert(operator bool()); return static_cast(V); diff --git a/clang/lib/AST/ByteCode/Program.cpp b/clang/lib/AST/ByteCode/Program.cpp index 2843b325fe02..749ae2510612 100644 --- a/clang/lib/AST/ByteCode/Program.cpp +++ b/clang/lib/AST/ByteCode/Program.cpp @@ -164,8 +164,8 @@ unsigned Program::getOrCreateDummy(const DeclTy &D) { const auto *VD = cast(cast(D)); IsWeak = VD->isWeak(); QT = VD->getType(); - if (const auto *RT = QT->getAs()) - QT = RT->getPointeeType(); + if (QT->isPointerOrReferenceType()) + QT = QT->getPointeeType(); } assert(!QT.isNull()); diff --git a/clang/lib/AST/ByteCode/Program.h b/clang/lib/AST/ByteCode/Program.h index 207ceef91da4..9c4e63a14d44 100644 --- a/clang/lib/AST/ByteCode/Program.h +++ b/clang/lib/AST/ByteCode/Program.h @@ -73,6 +73,10 @@ public: return Globals[Idx]->block(); } + bool isGlobalInitialized(unsigned Index) const { + return getPtrGlobal(Index).isInitialized(); + } + /// Finds a global's index. std::optional getGlobal(const ValueDecl *VD); std::optional getGlobal(const Expr *E); @@ -152,7 +156,7 @@ public: }; /// Returns the current declaration ID. 
- std::optional getCurrentDecl() const { + UnsignedOrNone getCurrentDecl() const { if (CurrentDeclaration == NoDeclaration) return std::nullopt; return CurrentDeclaration; diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp index 7cac655ef151..e14cff552c92 100644 --- a/clang/lib/AST/Expr.cpp +++ b/clang/lib/AST/Expr.cpp @@ -2805,32 +2805,20 @@ bool Expr::isUnusedResultAWarning(const Expr *&WarnE, SourceLocation &Loc, case CXXTemporaryObjectExprClass: case CXXConstructExprClass: { - if (const CXXRecordDecl *Type = getType()->getAsCXXRecordDecl()) { - const auto *WarnURAttr = Type->getAttr(); - if (Type->hasAttr() || - (WarnURAttr && WarnURAttr->IsCXX11NoDiscard())) { - WarnE = this; - Loc = getBeginLoc(); - R1 = getSourceRange(); - return true; - } - } - const auto *CE = cast(this); - if (const CXXConstructorDecl *Ctor = CE->getConstructor()) { - const auto *WarnURAttr = Ctor->getAttr(); - if (WarnURAttr && WarnURAttr->IsCXX11NoDiscard()) { - WarnE = this; - Loc = getBeginLoc(); - R1 = getSourceRange(); + const CXXRecordDecl *Type = getType()->getAsCXXRecordDecl(); - if (unsigned NumArgs = CE->getNumArgs()) - R2 = SourceRange(CE->getArg(0)->getBeginLoc(), - CE->getArg(NumArgs - 1)->getEndLoc()); - return true; - } + if ((Type && Type->hasAttr()) || + CE->hasUnusedResultAttr(Ctx)) { + WarnE = this; + Loc = getBeginLoc(); + R1 = getSourceRange(); + + if (unsigned NumArgs = CE->getNumArgs()) + R2 = SourceRange(CE->getArg(0)->getBeginLoc(), + CE->getArg(NumArgs - 1)->getEndLoc()); + return true; } - return false; } diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 7d4542288921..40c56501b0c1 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -11711,6 +11711,43 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { return Success(APValue(ResultElements.data(), ResultElements.size()), E); } + case clang::X86::BI__builtin_ia32_pmuldq128: + case clang::X86::BI__builtin_ia32_pmuldq256: + case 
clang::X86::BI__builtin_ia32_pmuldq512: + case clang::X86::BI__builtin_ia32_pmuludq128: + case clang::X86::BI__builtin_ia32_pmuludq256: + case clang::X86::BI__builtin_ia32_pmuludq512: { + APValue SourceLHS, SourceRHS; + if (!EvaluateAsRValue(Info, E->getArg(0), SourceLHS) || + !EvaluateAsRValue(Info, E->getArg(1), SourceRHS)) + return false; + + unsigned SourceLen = SourceLHS.getVectorLength(); + SmallVector ResultElements; + ResultElements.reserve(SourceLen / 2); + + for (unsigned EltNum = 0; EltNum < SourceLen; EltNum += 2) { + APSInt LHS = SourceLHS.getVectorElt(EltNum).getInt(); + APSInt RHS = SourceRHS.getVectorElt(EltNum).getInt(); + + switch (E->getBuiltinCallee()) { + case clang::X86::BI__builtin_ia32_pmuludq128: + case clang::X86::BI__builtin_ia32_pmuludq256: + case clang::X86::BI__builtin_ia32_pmuludq512: + ResultElements.push_back( + APValue(APSInt(llvm::APIntOps::muluExtended(LHS, RHS), true))); + break; + case clang::X86::BI__builtin_ia32_pmuldq128: + case clang::X86::BI__builtin_ia32_pmuldq256: + case clang::X86::BI__builtin_ia32_pmuldq512: + ResultElements.push_back( + APValue(APSInt(llvm::APIntOps::mulsExtended(LHS, RHS), false))); + break; + } + } + + return Success(APValue(ResultElements.data(), ResultElements.size()), E); + } case Builtin::BI__builtin_elementwise_max: case Builtin::BI__builtin_elementwise_min: { APValue SourceLHS, SourceRHS; @@ -11746,6 +11783,50 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { return Success(APValue(ResultElements.data(), ResultElements.size()), E); } + case X86::BI__builtin_ia32_selectb_128: + case X86::BI__builtin_ia32_selectb_256: + case X86::BI__builtin_ia32_selectb_512: + case X86::BI__builtin_ia32_selectw_128: + case X86::BI__builtin_ia32_selectw_256: + case X86::BI__builtin_ia32_selectw_512: + case X86::BI__builtin_ia32_selectd_128: + case X86::BI__builtin_ia32_selectd_256: + case X86::BI__builtin_ia32_selectd_512: + case X86::BI__builtin_ia32_selectq_128: + case 
X86::BI__builtin_ia32_selectq_256: + case X86::BI__builtin_ia32_selectq_512: + case X86::BI__builtin_ia32_selectph_128: + case X86::BI__builtin_ia32_selectph_256: + case X86::BI__builtin_ia32_selectph_512: + case X86::BI__builtin_ia32_selectpbf_128: + case X86::BI__builtin_ia32_selectpbf_256: + case X86::BI__builtin_ia32_selectpbf_512: + case X86::BI__builtin_ia32_selectps_128: + case X86::BI__builtin_ia32_selectps_256: + case X86::BI__builtin_ia32_selectps_512: + case X86::BI__builtin_ia32_selectpd_128: + case X86::BI__builtin_ia32_selectpd_256: + case X86::BI__builtin_ia32_selectpd_512: { + // AVX512 predicated move: "Result = Mask[] ? LHS[] : RHS[]". + APValue SourceMask, SourceLHS, SourceRHS; + if (!EvaluateAsRValue(Info, E->getArg(0), SourceMask) || + !EvaluateAsRValue(Info, E->getArg(1), SourceLHS) || + !EvaluateAsRValue(Info, E->getArg(2), SourceRHS)) + return false; + + APSInt Mask = SourceMask.getInt(); + unsigned SourceLen = SourceLHS.getVectorLength(); + SmallVector ResultElements; + ResultElements.reserve(SourceLen); + + for (unsigned EltNum = 0; EltNum < SourceLen; ++EltNum) { + const APValue &LHS = SourceLHS.getVectorElt(EltNum); + const APValue &RHS = SourceRHS.getVectorElt(EltNum); + ResultElements.push_back(Mask[EltNum] ? 
LHS : RHS); + } + + return Success(APValue(ResultElements.data(), ResultElements.size()), E); + } } } diff --git a/clang/lib/AST/StmtProfile.cpp b/clang/lib/AST/StmtProfile.cpp index 5fee88458527..7998d2369460 100644 --- a/clang/lib/AST/StmtProfile.cpp +++ b/clang/lib/AST/StmtProfile.cpp @@ -440,37 +440,37 @@ public: #define GEN_CLANG_CLAUSE_CLASS #define CLAUSE_CLASS(Enum, Str, Class) void Visit##Class(const Class *C); #include "llvm/Frontend/OpenMP/OMP.inc" - void VistOMPClauseWithPreInit(const OMPClauseWithPreInit *C); - void VistOMPClauseWithPostUpdate(const OMPClauseWithPostUpdate *C); + void VisitOMPClauseWithPreInit(const OMPClauseWithPreInit *C); + void VisitOMPClauseWithPostUpdate(const OMPClauseWithPostUpdate *C); }; -void OMPClauseProfiler::VistOMPClauseWithPreInit( +void OMPClauseProfiler::VisitOMPClauseWithPreInit( const OMPClauseWithPreInit *C) { if (auto *S = C->getPreInitStmt()) Profiler->VisitStmt(S); } -void OMPClauseProfiler::VistOMPClauseWithPostUpdate( +void OMPClauseProfiler::VisitOMPClauseWithPostUpdate( const OMPClauseWithPostUpdate *C) { - VistOMPClauseWithPreInit(C); + VisitOMPClauseWithPreInit(C); if (auto *E = C->getPostUpdateExpr()) Profiler->VisitStmt(E); } void OMPClauseProfiler::VisitOMPIfClause(const OMPIfClause *C) { - VistOMPClauseWithPreInit(C); + VisitOMPClauseWithPreInit(C); if (C->getCondition()) Profiler->VisitStmt(C->getCondition()); } void OMPClauseProfiler::VisitOMPFinalClause(const OMPFinalClause *C) { - VistOMPClauseWithPreInit(C); + VisitOMPClauseWithPreInit(C); if (C->getCondition()) Profiler->VisitStmt(C->getCondition()); } void OMPClauseProfiler::VisitOMPNumThreadsClause(const OMPNumThreadsClause *C) { - VistOMPClauseWithPreInit(C); + VisitOMPClauseWithPreInit(C); if (C->getNumThreads()) Profiler->VisitStmt(C->getNumThreads()); } @@ -526,13 +526,13 @@ void OMPClauseProfiler::VisitOMPDetachClause(const OMPDetachClause *C) { } void OMPClauseProfiler::VisitOMPNovariantsClause(const OMPNovariantsClause *C) { - 
VistOMPClauseWithPreInit(C); + VisitOMPClauseWithPreInit(C); if (C->getCondition()) Profiler->VisitStmt(C->getCondition()); } void OMPClauseProfiler::VisitOMPNocontextClause(const OMPNocontextClause *C) { - VistOMPClauseWithPreInit(C); + VisitOMPClauseWithPreInit(C); if (C->getCondition()) Profiler->VisitStmt(C->getCondition()); } @@ -568,7 +568,7 @@ void OMPClauseProfiler::VisitOMPMessageClause(const OMPMessageClause *C) { } void OMPClauseProfiler::VisitOMPScheduleClause(const OMPScheduleClause *C) { - VistOMPClauseWithPreInit(C); + VisitOMPClauseWithPreInit(C); if (auto *S = C->getChunkSize()) Profiler->VisitStmt(S); } @@ -646,7 +646,7 @@ void OMPClauseProfiler::VisitOMPDestroyClause(const OMPDestroyClause *C) { } void OMPClauseProfiler::VisitOMPFilterClause(const OMPFilterClause *C) { - VistOMPClauseWithPreInit(C); + VisitOMPClauseWithPreInit(C); if (C->getThreadID()) Profiler->VisitStmt(C->getThreadID()); } @@ -669,7 +669,7 @@ void OMPClauseProfiler::VisitOMPPrivateClause(const OMPPrivateClause *C) { void OMPClauseProfiler::VisitOMPFirstprivateClause(const OMPFirstprivateClause *C) { VisitOMPClauseList(C); - VistOMPClauseWithPreInit(C); + VisitOMPClauseWithPreInit(C); for (auto *E : C->private_copies()) { if (E) Profiler->VisitStmt(E); @@ -682,7 +682,7 @@ OMPClauseProfiler::VisitOMPFirstprivateClause(const OMPFirstprivateClause *C) { void OMPClauseProfiler::VisitOMPLastprivateClause(const OMPLastprivateClause *C) { VisitOMPClauseList(C); - VistOMPClauseWithPostUpdate(C); + VisitOMPClauseWithPostUpdate(C); for (auto *E : C->source_exprs()) { if (E) Profiler->VisitStmt(E); @@ -705,7 +705,7 @@ void OMPClauseProfiler::VisitOMPReductionClause( C->getQualifierLoc().getNestedNameSpecifier()); Profiler->VisitName(C->getNameInfo().getName()); VisitOMPClauseList(C); - VistOMPClauseWithPostUpdate(C); + VisitOMPClauseWithPostUpdate(C); for (auto *E : C->privates()) { if (E) Profiler->VisitStmt(E); @@ -743,7 +743,7 @@ void OMPClauseProfiler::VisitOMPTaskReductionClause( 
C->getQualifierLoc().getNestedNameSpecifier()); Profiler->VisitName(C->getNameInfo().getName()); VisitOMPClauseList(C); - VistOMPClauseWithPostUpdate(C); + VisitOMPClauseWithPostUpdate(C); for (auto *E : C->privates()) { if (E) Profiler->VisitStmt(E); @@ -767,7 +767,7 @@ void OMPClauseProfiler::VisitOMPInReductionClause( C->getQualifierLoc().getNestedNameSpecifier()); Profiler->VisitName(C->getNameInfo().getName()); VisitOMPClauseList(C); - VistOMPClauseWithPostUpdate(C); + VisitOMPClauseWithPostUpdate(C); for (auto *E : C->privates()) { if (E) Profiler->VisitStmt(E); @@ -791,7 +791,7 @@ void OMPClauseProfiler::VisitOMPInReductionClause( } void OMPClauseProfiler::VisitOMPLinearClause(const OMPLinearClause *C) { VisitOMPClauseList(C); - VistOMPClauseWithPostUpdate(C); + VisitOMPClauseWithPostUpdate(C); for (auto *E : C->privates()) { if (E) Profiler->VisitStmt(E); @@ -873,25 +873,25 @@ void OMPClauseProfiler::VisitOMPAllocateClause(const OMPAllocateClause *C) { } void OMPClauseProfiler::VisitOMPNumTeamsClause(const OMPNumTeamsClause *C) { VisitOMPClauseList(C); - VistOMPClauseWithPreInit(C); + VisitOMPClauseWithPreInit(C); } void OMPClauseProfiler::VisitOMPThreadLimitClause( const OMPThreadLimitClause *C) { VisitOMPClauseList(C); - VistOMPClauseWithPreInit(C); + VisitOMPClauseWithPreInit(C); } void OMPClauseProfiler::VisitOMPPriorityClause(const OMPPriorityClause *C) { - VistOMPClauseWithPreInit(C); + VisitOMPClauseWithPreInit(C); if (C->getPriority()) Profiler->VisitStmt(C->getPriority()); } void OMPClauseProfiler::VisitOMPGrainsizeClause(const OMPGrainsizeClause *C) { - VistOMPClauseWithPreInit(C); + VisitOMPClauseWithPreInit(C); if (C->getGrainsize()) Profiler->VisitStmt(C->getGrainsize()); } void OMPClauseProfiler::VisitOMPNumTasksClause(const OMPNumTasksClause *C) { - VistOMPClauseWithPreInit(C); + VisitOMPClauseWithPreInit(C); if (C->getNumTasks()) Profiler->VisitStmt(C->getNumTasks()); } @@ -952,7 +952,7 @@ void OMPClauseProfiler::VisitOMPOrderClause(const 
OMPOrderClause *C) {} void OMPClauseProfiler::VisitOMPBindClause(const OMPBindClause *C) {} void OMPClauseProfiler::VisitOMPXDynCGroupMemClause( const OMPXDynCGroupMemClause *C) { - VistOMPClauseWithPreInit(C); + VisitOMPClauseWithPreInit(C); if (Expr *Size = C->getSize()) Profiler->VisitStmt(Size); } @@ -1229,7 +1229,7 @@ void StmtProfiler::VisitOMPDistributeDirective( void OMPClauseProfiler::VisitOMPDistScheduleClause( const OMPDistScheduleClause *C) { - VistOMPClauseWithPreInit(C); + VisitOMPClauseWithPreInit(C); if (auto *S = C->getChunkSize()) Profiler->VisitStmt(S); } diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp index f7949e94d227..5fbf1999ed72 100644 --- a/clang/lib/AST/Type.cpp +++ b/clang/lib/AST/Type.cpp @@ -1627,7 +1627,7 @@ bool QualType::UseExcessPrecision(const ASTContext &Ctx) { switch (BT->getKind()) { case BuiltinType::Kind::Float16: { const TargetInfo &TI = Ctx.getTargetInfo(); - if (TI.hasFloat16Type() && !TI.hasLegalHalfType() && + if (TI.hasFloat16Type() && !TI.hasFastHalfType() && Ctx.getLangOpts().getFloat16ExcessPrecision() != Ctx.getLangOpts().ExcessPrecisionKind::FPP_None) return true; diff --git a/clang/lib/ASTMatchers/ASTMatchFinder.cpp b/clang/lib/ASTMatchers/ASTMatchFinder.cpp index d43d1aec71b2..e8a0004c2e18 100644 --- a/clang/lib/ASTMatchers/ASTMatchFinder.cpp +++ b/clang/lib/ASTMatchers/ASTMatchFinder.cpp @@ -1344,6 +1344,41 @@ private: return false; } + template static SourceLocation getNodeLocation(const T &Node) { + return Node.getBeginLoc(); + } + + static SourceLocation getNodeLocation(const CXXCtorInitializer &Node) { + return Node.getSourceLocation(); + } + + static SourceLocation getNodeLocation(const TemplateArgumentLoc &Node) { + return Node.getLocation(); + } + + static SourceLocation getNodeLocation(const Attr &Node) { + return Node.getLocation(); + } + + bool isInSystemHeader(SourceLocation Loc) { + const SourceManager &SM = getASTContext().getSourceManager(); + return SM.isInSystemHeader(Loc); + } + + 
template bool shouldSkipNode(T &Node) { + if (Options.IgnoreSystemHeaders && isInSystemHeader(getNodeLocation(Node))) + return true; + return false; + } + + template bool shouldSkipNode(T *Node) { + return (Node == nullptr) || shouldSkipNode(*Node); + } + + bool shouldSkipNode(QualType &) { return false; } + + bool shouldSkipNode(NestedNameSpecifier &) { return false; } + /// Bucket to record map. /// /// Used to get the appropriate bucket for each matcher. @@ -1473,9 +1508,8 @@ bool MatchASTVisitor::objcClassIsDerivedFrom( } bool MatchASTVisitor::TraverseDecl(Decl *DeclNode) { - if (!DeclNode) { + if (shouldSkipNode(DeclNode)) return true; - } bool ScopedTraversal = TraversingASTNodeNotSpelledInSource || DeclNode->isImplicit(); @@ -1503,9 +1537,9 @@ bool MatchASTVisitor::TraverseDecl(Decl *DeclNode) { } bool MatchASTVisitor::TraverseStmt(Stmt *StmtNode, DataRecursionQueue *Queue) { - if (!StmtNode) { + if (shouldSkipNode(StmtNode)) return true; - } + bool ScopedTraversal = TraversingASTNodeNotSpelledInSource || TraversingASTChildrenNotSpelledInSource; @@ -1515,6 +1549,9 @@ bool MatchASTVisitor::TraverseStmt(Stmt *StmtNode, DataRecursionQueue *Queue) { } bool MatchASTVisitor::TraverseType(QualType TypeNode, bool TraverseQualifier) { + if (shouldSkipNode(TypeNode)) + return true; + match(TypeNode); return RecursiveASTVisitor::TraverseType(TypeNode, TraverseQualifier); @@ -1522,6 +1559,8 @@ bool MatchASTVisitor::TraverseType(QualType TypeNode, bool TraverseQualifier) { bool MatchASTVisitor::TraverseTypeLoc(TypeLoc TypeLocNode, bool TraverseQualifier) { + if (shouldSkipNode(TypeLocNode)) + return true; // The RecursiveASTVisitor only visits types if they're not within TypeLocs. // We still want to find those types via matchers, so we match them here. 
Note // that the TypeLocs are structurally a shadow-hierarchy to the expressed @@ -1534,6 +1573,9 @@ bool MatchASTVisitor::TraverseTypeLoc(TypeLoc TypeLocNode, } bool MatchASTVisitor::TraverseNestedNameSpecifier(NestedNameSpecifier NNS) { + if (shouldSkipNode(NNS)) + return true; + match(NNS); return RecursiveASTVisitor::TraverseNestedNameSpecifier(NNS); } @@ -1543,6 +1585,9 @@ bool MatchASTVisitor::TraverseNestedNameSpecifierLoc( if (!NNS) return true; + if (shouldSkipNode(NNS)) + return true; + match(NNS); // We only match the nested name specifier here (as opposed to traversing it) @@ -1555,7 +1600,7 @@ bool MatchASTVisitor::TraverseNestedNameSpecifierLoc( bool MatchASTVisitor::TraverseConstructorInitializer( CXXCtorInitializer *CtorInit) { - if (!CtorInit) + if (shouldSkipNode(CtorInit)) return true; bool ScopedTraversal = TraversingASTNodeNotSpelledInSource || @@ -1573,11 +1618,17 @@ bool MatchASTVisitor::TraverseConstructorInitializer( } bool MatchASTVisitor::TraverseTemplateArgumentLoc(TemplateArgumentLoc Loc) { + if (shouldSkipNode(Loc)) + return true; + match(Loc); return RecursiveASTVisitor::TraverseTemplateArgumentLoc(Loc); } bool MatchASTVisitor::TraverseAttr(Attr *AttrNode) { + if (shouldSkipNode(AttrNode)) + return true; + match(*AttrNode); return RecursiveASTVisitor::TraverseAttr(AttrNode); } diff --git a/clang/lib/Analysis/FlowSensitive/CMakeLists.txt b/clang/lib/Analysis/FlowSensitive/CMakeLists.txt index 0c30df8b4b19..97e09c9bce95 100644 --- a/clang/lib/Analysis/FlowSensitive/CMakeLists.txt +++ b/clang/lib/Analysis/FlowSensitive/CMakeLists.txt @@ -6,6 +6,7 @@ add_clang_library(clangAnalysisFlowSensitive DataflowAnalysisContext.cpp DataflowEnvironment.cpp Formula.cpp + FormulaSerialization.cpp HTMLLogger.cpp Logger.cpp RecordOps.cpp diff --git a/clang/lib/Analysis/FlowSensitive/DataflowAnalysisContext.cpp b/clang/lib/Analysis/FlowSensitive/DataflowAnalysisContext.cpp index 6421ad3883d1..06a88784a6f9 100644 --- 
a/clang/lib/Analysis/FlowSensitive/DataflowAnalysisContext.cpp +++ b/clang/lib/Analysis/FlowSensitive/DataflowAnalysisContext.cpp @@ -208,6 +208,24 @@ bool DataflowAnalysisContext::equivalentFormulas(const Formula &Val1, return isUnsatisfiable(std::move(Constraints)); } +llvm::DenseSet DataflowAnalysisContext::collectDependencies( + llvm::DenseSet Tokens) const { + // Use a worklist algorithm, with `Remaining` holding the worklist and + // `Tokens` tracking which atoms have already been added to the worklist. + std::vector Remaining(Tokens.begin(), Tokens.end()); + while (!Remaining.empty()) { + Atom CurrentToken = Remaining.back(); + Remaining.pop_back(); + if (auto DepsIt = FlowConditionDeps.find(CurrentToken); + DepsIt != FlowConditionDeps.end()) + for (Atom A : DepsIt->second) + if (Tokens.insert(A).second) + Remaining.push_back(A); + } + + return Tokens; +} + void DataflowAnalysisContext::addTransitiveFlowConditionConstraints( Atom Token, llvm::SetVector &Constraints) { llvm::DenseSet AddedTokens; @@ -224,6 +242,8 @@ void DataflowAnalysisContext::addTransitiveFlowConditionConstraints( auto ConstraintsIt = FlowConditionConstraints.find(Token); if (ConstraintsIt == FlowConditionConstraints.end()) { + // The flow condition is unconstrained. Just add the atom directly, which + // is equivalent to asserting it is true. 
Constraints.insert(&arena().makeAtomRef(Token)); } else { // Bind flow condition token via `iff` to its set of constraints: @@ -239,6 +259,65 @@ void DataflowAnalysisContext::addTransitiveFlowConditionConstraints( } } +static void getReferencedAtoms(const Formula &F, + llvm::DenseSet &Refs) { + switch (F.kind()) { + case Formula::AtomRef: + Refs.insert(F.getAtom()); + break; + case Formula::Literal: + break; + case Formula::Not: + getReferencedAtoms(*F.operands()[0], Refs); + break; + case Formula::And: + case Formula::Or: + case Formula::Implies: + case Formula::Equal: + ArrayRef Operands = F.operands(); + getReferencedAtoms(*Operands[0], Refs); + getReferencedAtoms(*Operands[1], Refs); + break; + } +} + +SimpleLogicalContext DataflowAnalysisContext::exportLogicalContext( + llvm::DenseSet TargetTokens) const { + SimpleLogicalContext LC; + + if (Invariant != nullptr) { + LC.Invariant = Invariant; + getReferencedAtoms(*Invariant, TargetTokens); + } + + llvm::DenseSet Dependencies = + collectDependencies(std::move(TargetTokens)); + + for (dataflow::Atom Token : Dependencies) { + // Only process the token if it is constrained. Unconstrained tokens don't + // have dependencies. + const Formula *Constraints = FlowConditionConstraints.lookup(Token); + if (Constraints == nullptr) + continue; + LC.TokenDefs[Token] = Constraints; + + if (auto DepsIt = FlowConditionDeps.find(Token); + DepsIt != FlowConditionDeps.end()) + LC.TokenDeps[Token] = DepsIt->second; + } + + return LC; +} + +void DataflowAnalysisContext::initLogicalContext(SimpleLogicalContext LC) { + Invariant = LC.Invariant; + FlowConditionConstraints = std::move(LC.TokenDefs); + // TODO: The dependencies in `LC.TokenDeps` can be reconstructed from + // `LC.TokenDefs`. Give the caller the option to reconstruct, rather than + // providing them directly, to save caller space (memory/disk). 
+ FlowConditionDeps = std::move(LC.TokenDeps); +} + static void printAtomList(const llvm::SmallVector &Atoms, llvm::raw_ostream &OS) { OS << "("; diff --git a/clang/lib/Analysis/FlowSensitive/FormulaSerialization.cpp b/clang/lib/Analysis/FlowSensitive/FormulaSerialization.cpp new file mode 100644 index 000000000000..df15a1d6eaad --- /dev/null +++ b/clang/lib/Analysis/FlowSensitive/FormulaSerialization.cpp @@ -0,0 +1,153 @@ +//===- FormulaSerialization.cpp ---------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang/Analysis/FlowSensitive/FormulaSerialization.h" +#include "clang/Analysis/FlowSensitive/Arena.h" +#include "clang/Analysis/FlowSensitive/Formula.h" +#include "clang/Basic/LLVM.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/ErrorHandling.h" +#include + +namespace clang::dataflow { + +// Returns the leading indicator of operation formulas. `AtomRef` and `Literal` +// are handled differently. +static char compactSigil(Formula::Kind K) { + switch (K) { + case Formula::AtomRef: + case Formula::Literal: + // No sigil. + return '\0'; + case Formula::Not: + return '!'; + case Formula::And: + return '&'; + case Formula::Or: + return '|'; + case Formula::Implies: + return '>'; + case Formula::Equal: + return '='; + } + llvm_unreachable("unhandled formula kind"); +} + +void serializeFormula(const Formula &F, llvm::raw_ostream &OS) { + switch (Formula::numOperands(F.kind())) { + case 0: + switch (F.kind()) { + case Formula::AtomRef: + OS << F.getAtom(); + break; + case Formula::Literal: + OS << (F.literal() ? 
'T' : 'F'); + break; + default: + llvm_unreachable("unhandled formula kind"); + } + break; + case 1: + OS << compactSigil(F.kind()); + serializeFormula(*F.operands()[0], OS); + break; + case 2: + OS << compactSigil(F.kind()); + serializeFormula(*F.operands()[0], OS); + serializeFormula(*F.operands()[1], OS); + break; + default: + llvm_unreachable("unhandled formula arity"); + } +} + +static llvm::Expected +parsePrefix(llvm::StringRef &Str, Arena &A, + llvm::DenseMap &AtomMap) { + if (Str.empty()) + return llvm::createStringError(llvm::inconvertibleErrorCode(), + "unexpected end of input"); + + char Prefix = Str[0]; + Str = Str.drop_front(); + + switch (Prefix) { + case 'T': + return &A.makeLiteral(true); + case 'F': + return &A.makeLiteral(false); + case 'V': { + unsigned AtomID; + if (Str.consumeInteger(10, AtomID)) + return llvm::createStringError(llvm::inconvertibleErrorCode(), + "expected atom id"); + auto [It, Inserted] = AtomMap.try_emplace(AtomID, Atom()); + if (Inserted) + It->second = A.makeAtom(); + return &A.makeAtomRef(It->second); + } + case '!': { + auto OperandOrErr = parsePrefix(Str, A, AtomMap); + if (!OperandOrErr) + return OperandOrErr.takeError(); + return &A.makeNot(**OperandOrErr); + } + case '&': + case '|': + case '>': + case '=': { + auto LeftOrErr = parsePrefix(Str, A, AtomMap); + if (!LeftOrErr) + return LeftOrErr.takeError(); + + auto RightOrErr = parsePrefix(Str, A, AtomMap); + if (!RightOrErr) + return RightOrErr.takeError(); + + const Formula &LHS = **LeftOrErr; + const Formula &RHS = **RightOrErr; + + switch (Prefix) { + case '&': + return &A.makeAnd(LHS, RHS); + case '|': + return &A.makeOr(LHS, RHS); + case '>': + return &A.makeImplies(LHS, RHS); + case '=': + return &A.makeEquals(LHS, RHS); + default: + llvm_unreachable("unexpected binary op"); + } + } + default: + return llvm::createStringError(llvm::inconvertibleErrorCode(), + "unexpected prefix character: %c", Prefix); + } +} + +llvm::Expected +parseFormula(llvm::StringRef Str, 
Arena &A, + llvm::DenseMap &AtomMap) { + size_t OriginalSize = Str.size(); + llvm::Expected F = parsePrefix(Str, A, AtomMap); + if (!F) + return F.takeError(); + if (!Str.empty()) + return llvm::createStringError(llvm::inconvertibleErrorCode(), + ("unexpected suffix of length: " + + llvm::Twine(Str.size() - OriginalSize)) + .str()); + return F; +} + +} // namespace clang::dataflow diff --git a/clang/lib/Analysis/LifetimeSafety.cpp b/clang/lib/Analysis/LifetimeSafety.cpp index f39998cca56f..ba9f7d0f6ee3 100644 --- a/clang/lib/Analysis/LifetimeSafety.cpp +++ b/clang/lib/Analysis/LifetimeSafety.cpp @@ -45,10 +45,11 @@ struct Loan { /// is represented as empty LoanSet LoanID ID; AccessPath Path; - SourceLocation IssueLoc; + /// The expression that creates the loan, e.g., &x. + const Expr *IssueExpr; - Loan(LoanID id, AccessPath path, SourceLocation loc) - : ID(id), Path(path), IssueLoc(loc) {} + Loan(LoanID id, AccessPath path, const Expr *IssueExpr) + : ID(id), Path(path), IssueExpr(IssueExpr) {} }; /// An Origin is a symbolic identifier that represents the set of possible @@ -82,8 +83,8 @@ class LoanManager { public: LoanManager() = default; - Loan &addLoan(AccessPath Path, SourceLocation Loc) { - AllLoans.emplace_back(getNextLoanID(), Path, Loc); + Loan &addLoan(AccessPath Path, const Expr *IssueExpr) { + AllLoans.emplace_back(getNextLoanID(), Path, IssueExpr); return AllLoans.back(); } @@ -199,6 +200,8 @@ public: AssignOrigin, /// An origin escapes the function by flowing into the return value. ReturnOfOrigin, + /// An origin is used (eg. dereferencing a pointer). + Use, /// A marker for a specific point in the code, for testing. 
TestPoint, }; @@ -242,12 +245,17 @@ public: class ExpireFact : public Fact { LoanID LID; + SourceLocation ExpiryLoc; public: static bool classof(const Fact *F) { return F->getKind() == Kind::Expire; } - ExpireFact(LoanID LID) : Fact(Kind::Expire), LID(LID) {} + ExpireFact(LoanID LID, SourceLocation ExpiryLoc) + : Fact(Kind::Expire), LID(LID), ExpiryLoc(ExpiryLoc) {} + LoanID getLoanID() const { return LID; } + SourceLocation getExpiryLoc() const { return ExpiryLoc; } + void dump(llvm::raw_ostream &OS) const override { OS << "Expire (LoanID: " << getLoanID() << ")\n"; } @@ -287,6 +295,24 @@ public: } }; +class UseFact : public Fact { + OriginID UsedOrigin; + const Expr *UseExpr; + +public: + static bool classof(const Fact *F) { return F->getKind() == Kind::Use; } + + UseFact(OriginID UsedOrigin, const Expr *UseExpr) + : Fact(Kind::Use), UsedOrigin(UsedOrigin), UseExpr(UseExpr) {} + + OriginID getUsedOrigin() const { return UsedOrigin; } + const Expr *getUseExpr() const { return UseExpr; } + + void dump(llvm::raw_ostream &OS) const override { + OS << "Use (OriginID: " << UsedOrigin << ")\n"; + } +}; + /// A dummy-fact used to mark a specific point in the code for testing. /// It is generated by recognizing a `void("__lifetime_test_point_...")` cast. class TestPointFact : public Fact { @@ -417,13 +443,17 @@ public: if (VD->hasLocalStorage()) { OriginID OID = FactMgr.getOriginMgr().getOrCreate(*UO); AccessPath AddrOfLocalVarPath(VD); - const Loan &L = FactMgr.getLoanMgr().addLoan(AddrOfLocalVarPath, - UO->getOperatorLoc()); + const Loan &L = + FactMgr.getLoanMgr().addLoan(AddrOfLocalVarPath, UO); CurrentBlockFacts.push_back( FactMgr.createFact(L.ID, OID)); } } } + } else if (UO->getOpcode() == UO_Deref) { + // This is a pointer use, like '*p'. 
+ OriginID OID = FactMgr.getOriginMgr().get(*UO->getSubExpr()); + CurrentBlockFacts.push_back(FactMgr.createFact(OID, UO)); } } @@ -492,7 +522,8 @@ private: // Check if the loan is for a stack variable and if that variable // is the one being destructed. if (LoanPath.D == DestructedVD) - CurrentBlockFacts.push_back(FactMgr.createFact(L.ID)); + CurrentBlockFacts.push_back(FactMgr.createFact( + L.ID, DtorOpt.getTriggerStmt()->getEndLoc())); } } @@ -618,6 +649,7 @@ public: } } +protected: Lattice getState(ProgramPoint P) const { return PerPointStates.lookup(P); } Lattice getInState(const CFGBlock *B) const { return InStates.lookup(B); } @@ -665,6 +697,8 @@ private: return D->transfer(In, *F->getAs()); case Fact::Kind::ReturnOfOrigin: return D->transfer(In, *F->getAs()); + case Fact::Kind::Use: + return D->transfer(In, *F->getAs()); case Fact::Kind::TestPoint: return D->transfer(In, *F->getAs()); } @@ -676,6 +710,7 @@ public: Lattice transfer(Lattice In, const ExpireFact &) { return In; } Lattice transfer(Lattice In, const AssignOriginFact &) { return In; } Lattice transfer(Lattice In, const ReturnOfOriginFact &) { return In; } + Lattice transfer(Lattice In, const UseFact &) { return In; } Lattice transfer(Lattice In, const TestPointFact &) { return In; } }; @@ -693,6 +728,20 @@ static llvm::ImmutableSet join(llvm::ImmutableSet A, return A; } +/// Checks if set A is a subset of set B. +template +static bool isSubsetOf(const llvm::ImmutableSet &A, + const llvm::ImmutableSet &B) { + // Empty set is a subset of all sets. + if (A.isEmpty()) + return true; + + for (const T &Elem : A) + if (!B.contains(Elem)) + return false; + return true; +} + /// Computes the key-wise union of two ImmutableMaps. // TODO(opt): This key-wise join is a performance bottleneck. 
A more // efficient merge could be implemented using a Patricia Trie or HAMT @@ -700,7 +749,7 @@ static llvm::ImmutableSet join(llvm::ImmutableSet A, template static llvm::ImmutableMap join(llvm::ImmutableMap A, llvm::ImmutableMap B, - typename llvm::ImmutableMap::Factory &F, Joiner joinValues) { + typename llvm::ImmutableMap::Factory &F, Joiner JoinValues) { if (A.getHeight() < B.getHeight()) std::swap(A, B); @@ -710,7 +759,7 @@ join(llvm::ImmutableMap A, llvm::ImmutableMap B, const K &Key = Entry.first; const V &ValB = Entry.second; if (const V *ValA = A.lookup(Key)) - A = F.add(A, Key, joinValues(*ValA, ValB)); + A = F.add(A, Key, JoinValues(*ValA, ValB)); else A = F.add(A, Key, ValB); } @@ -723,17 +772,14 @@ join(llvm::ImmutableMap A, llvm::ImmutableMap B, // ========================================================================= // using OriginLoanMap = llvm::ImmutableMap; +using ExpiredLoanMap = llvm::ImmutableMap; /// An object to hold the factories for immutable collections, ensuring /// that all created states share the same underlying memory management. struct LifetimeFactory { OriginLoanMap::Factory OriginMapFactory; LoanSet::Factory LoanSetFactory; - - /// Creates a singleton set containing only the given loan ID. - LoanSet createLoanSet(LoanID LID) { - return LoanSetFactory.add(LoanSetFactory.getEmptySet(), LID); - } + ExpiredLoanMap::Factory ExpiredLoanMapFactory; }; /// Represents the dataflow lattice for loan propagation. 
@@ -774,13 +820,15 @@ struct LoanPropagationLattice { class LoanPropagationAnalysis : public DataflowAnalysis { - - LifetimeFactory &Factory; + OriginLoanMap::Factory &OriginLoanMapFactory; + LoanSet::Factory &LoanSetFactory; public: LoanPropagationAnalysis(const CFG &C, AnalysisDeclContext &AC, FactManager &F, - LifetimeFactory &Factory) - : DataflowAnalysis(C, AC, F), Factory(Factory) {} + LifetimeFactory &LFactory) + : DataflowAnalysis(C, AC, F), + OriginLoanMapFactory(LFactory.OriginMapFactory), + LoanSetFactory(LFactory.LoanSetFactory) {} using Base::transfer; @@ -792,9 +840,9 @@ public: // TODO(opt): Keep the state small by removing origins which become dead. Lattice join(Lattice A, Lattice B) { OriginLoanMap JoinedOrigins = - utils::join(A.Origins, B.Origins, Factory.OriginMapFactory, - [this](LoanSet S1, LoanSet S2) { - return utils::join(S1, S2, Factory.LoanSetFactory); + utils::join(A.Origins, B.Origins, OriginLoanMapFactory, + [&](LoanSet S1, LoanSet S2) { + return utils::join(S1, S2, LoanSetFactory); }); return Lattice(JoinedOrigins); } @@ -803,8 +851,9 @@ public: Lattice transfer(Lattice In, const IssueFact &F) { OriginID OID = F.getOriginID(); LoanID LID = F.getLoanID(); - return LoanPropagationLattice(Factory.OriginMapFactory.add( - In.Origins, OID, Factory.createLoanSet(LID))); + return LoanPropagationLattice(OriginLoanMapFactory.add( + In.Origins, OID, + LoanSetFactory.add(LoanSetFactory.getEmptySet(), LID))); } /// The destination origin's loan set is replaced by the source's. 
@@ -814,7 +863,7 @@ public: OriginID SrcOID = F.getSrcOriginID(); LoanSet SrcLoans = getLoans(In, SrcOID); return LoanPropagationLattice( - Factory.OriginMapFactory.add(In.Origins, DestOID, SrcLoans)); + OriginLoanMapFactory.add(In.Origins, DestOID, SrcLoans)); } LoanSet getLoans(OriginID OID, ProgramPoint P) { @@ -825,7 +874,7 @@ private: LoanSet getLoans(Lattice L, OriginID OID) { if (auto *Loans = L.Origins.lookup(OID)) return *Loans; - return Factory.LoanSetFactory.getEmptySet(); + return LoanSetFactory.getEmptySet(); } }; @@ -835,10 +884,11 @@ private: /// The dataflow lattice for tracking the set of expired loans. struct ExpiredLattice { - LoanSet Expired; + /// Map from an expired `LoanID` to the `ExpireFact` that made it expire. + ExpiredLoanMap Expired; ExpiredLattice() : Expired(nullptr) {}; - explicit ExpiredLattice(LoanSet S) : Expired(S) {} + explicit ExpiredLattice(ExpiredLoanMap M) : Expired(M) {} bool operator==(const ExpiredLattice &Other) const { return Expired == Other.Expired; @@ -851,8 +901,8 @@ struct ExpiredLattice { OS << "ExpiredLattice State:\n"; if (Expired.isEmpty()) OS << " \n"; - for (const LoanID &LID : Expired) - OS << " Loan " << LID << " is expired\n"; + for (const auto &[ID, _] : Expired) + OS << " Loan " << ID << " is expired\n"; } }; @@ -861,26 +911,31 @@ class ExpiredLoansAnalysis : public DataflowAnalysis { - LoanSet::Factory &Factory; + ExpiredLoanMap::Factory &Factory; public: ExpiredLoansAnalysis(const CFG &C, AnalysisDeclContext &AC, FactManager &F, LifetimeFactory &Factory) - : DataflowAnalysis(C, AC, F), Factory(Factory.LoanSetFactory) {} + : DataflowAnalysis(C, AC, F), Factory(Factory.ExpiredLoanMapFactory) {} using Base::transfer; StringRef getAnalysisName() const { return "ExpiredLoans"; } - Lattice getInitialState() { return Lattice(Factory.getEmptySet()); } + Lattice getInitialState() { return Lattice(Factory.getEmptyMap()); } - /// Merges two lattices by taking the union of the expired loan sets. 
- Lattice join(Lattice L1, Lattice L2) const { - return Lattice(utils::join(L1.Expired, L2.Expired, Factory)); + /// Merges two lattices by taking the union of the two expired loans. + Lattice join(Lattice L1, Lattice L2) { + return Lattice( + utils::join(L1.Expired, L2.Expired, Factory, + // Take the last expiry fact to make this hermetic. + [](const ExpireFact *F1, const ExpireFact *F2) { + return F1->getExpiryLoc() > F2->getExpiryLoc() ? F1 : F2; + })); } Lattice transfer(Lattice In, const ExpireFact &F) { - return Lattice(Factory.add(In.Expired, F.getLoanID())); + return Lattice(Factory.add(In.Expired, F.getLoanID(), &F)); } // Removes the loan from the set of expired loans. @@ -912,15 +967,116 @@ public: Lattice transfer(Lattice In, const IssueFact &F) { return Lattice(Factory.remove(In.Expired, F.getLoanID())); } + + ExpiredLoanMap getExpiredLoans(ProgramPoint P) { return getState(P).Expired; } }; // ========================================================================= // -// TODO: -// - Modify loan expiry analysis to answer `bool isExpired(Loan L, Point P)` -// - Modify origin liveness analysis to answer `bool isLive(Origin O, Point P)` -// - Using the above three to perform the final error reporting. +// Lifetime checker and Error reporter // ========================================================================= // +/// Struct to store the complete context for a potential lifetime violation. +struct PendingWarning { + SourceLocation ExpiryLoc; // Where the loan expired. + const Expr *UseExpr; // Where the origin holding this loan was used. 
+ Confidence ConfidenceLevel; +}; + +class LifetimeChecker { +private: + llvm::DenseMap FinalWarningsMap; + LoanPropagationAnalysis &LoanPropagation; + ExpiredLoansAnalysis &ExpiredLoans; + FactManager &FactMgr; + AnalysisDeclContext &ADC; + LifetimeSafetyReporter *Reporter; + +public: + LifetimeChecker(LoanPropagationAnalysis &LPA, ExpiredLoansAnalysis &ELA, + FactManager &FM, AnalysisDeclContext &ADC, + LifetimeSafetyReporter *Reporter) + : LoanPropagation(LPA), ExpiredLoans(ELA), FactMgr(FM), ADC(ADC), + Reporter(Reporter) {} + + void run() { + llvm::TimeTraceScope TimeProfile("LifetimeChecker"); + for (const CFGBlock *B : *ADC.getAnalysis()) + for (const Fact *F : FactMgr.getFacts(B)) + if (const auto *UF = F->getAs()) + checkUse(UF); + issuePendingWarnings(); + } + + /// Checks for use-after-free errors for a given use of an Origin. + /// + /// This method is called for each 'UseFact' identified in the control flow + /// graph. It determines if the loans held by the used origin have expired + /// at the point of use. + void checkUse(const UseFact *UF) { + + OriginID O = UF->getUsedOrigin(); + + // Get the set of loans that the origin might hold at this program point. + LoanSet HeldLoans = LoanPropagation.getLoans(O, UF); + + // Get the set of all loans that have expired at this program point. + ExpiredLoanMap AllExpiredLoans = ExpiredLoans.getExpiredLoans(UF); + + // If the pointer holds no loans or no loans have expired, there's nothing + // to check. + if (HeldLoans.isEmpty() || AllExpiredLoans.isEmpty()) + return; + + // Identify loans that which have expired but are held by the pointer. Using + // them is a use-after-free. + llvm::SmallVector DefaultedLoans; + // A definite UaF error occurs if all loans the origin might hold have + // expired. + bool IsDefiniteError = true; + for (LoanID L : HeldLoans) { + if (AllExpiredLoans.contains(L)) + DefaultedLoans.push_back(L); + else + // If at least one loan is not expired, this use is not a definite UaF. 
+ IsDefiniteError = false; + } + // If there are no defaulted loans, the use is safe. + if (DefaultedLoans.empty()) + return; + + // Determine the confidence level of the error (definite or maybe). + Confidence CurrentConfidence = + IsDefiniteError ? Confidence::Definite : Confidence::Maybe; + + // For each expired loan, create a pending warning. + for (LoanID DefaultedLoan : DefaultedLoans) { + // If we already have a warning for this loan with a higher or equal + // confidence, skip this one. + if (FinalWarningsMap.count(DefaultedLoan) && + CurrentConfidence <= FinalWarningsMap[DefaultedLoan].ConfidenceLevel) + continue; + + auto *EF = AllExpiredLoans.lookup(DefaultedLoan); + assert(EF && "Could not find ExpireFact for an expired loan."); + + FinalWarningsMap[DefaultedLoan] = {/*ExpiryLoc=*/(*EF)->getExpiryLoc(), + /*UseExpr=*/UF->getUseExpr(), + /*ConfidenceLevel=*/CurrentConfidence}; + } + } + + void issuePendingWarnings() { + if (!Reporter) + return; + for (const auto &[LID, Warning] : FinalWarningsMap) { + const Loan &L = FactMgr.getLoanMgr().getLoan(LID); + const Expr *IssueExpr = L.IssueExpr; + Reporter->reportUseAfterFree(IssueExpr, Warning.UseExpr, + Warning.ExpiryLoc, Warning.ConfidenceLevel); + } + } +}; + // ========================================================================= // // LifetimeSafetyAnalysis Class Implementation // ========================================================================= // @@ -928,8 +1084,9 @@ public: // We need this here for unique_ptr with forward declared class. 
LifetimeSafetyAnalysis::~LifetimeSafetyAnalysis() = default; -LifetimeSafetyAnalysis::LifetimeSafetyAnalysis(AnalysisDeclContext &AC) - : AC(AC), Factory(std::make_unique()), +LifetimeSafetyAnalysis::LifetimeSafetyAnalysis(AnalysisDeclContext &AC, + LifetimeSafetyReporter *Reporter) + : AC(AC), Reporter(Reporter), Factory(std::make_unique()), FactMgr(std::make_unique()) {} void LifetimeSafetyAnalysis::run() { @@ -952,6 +1109,8 @@ void LifetimeSafetyAnalysis::run() { /// blocks; only Decls are visible. Therefore, loans in a block that /// never reach an Origin associated with a Decl can be safely dropped by /// the analysis. + /// 3. Collapse ExpireFacts belonging to same source location into a single + /// Fact. LoanPropagation = std::make_unique(Cfg, AC, *FactMgr, *Factory); LoanPropagation->run(); @@ -959,6 +1118,10 @@ void LifetimeSafetyAnalysis::run() { ExpiredLoans = std::make_unique(Cfg, AC, *FactMgr, *Factory); ExpiredLoans->run(); + + LifetimeChecker Checker(*LoanPropagation, *ExpiredLoans, *FactMgr, AC, + Reporter); + Checker.run(); } LoanSet LifetimeSafetyAnalysis::getLoansAtPoint(OriginID OID, @@ -967,9 +1130,13 @@ LoanSet LifetimeSafetyAnalysis::getLoansAtPoint(OriginID OID, return LoanPropagation->getLoans(OID, PP); } -LoanSet LifetimeSafetyAnalysis::getExpiredLoansAtPoint(ProgramPoint PP) const { +std::vector +LifetimeSafetyAnalysis::getExpiredLoansAtPoint(ProgramPoint PP) const { assert(ExpiredLoans && "ExpiredLoansAnalysis has not been run."); - return ExpiredLoans->getState(PP).Expired; + std::vector Result; + for (const auto &pair : ExpiredLoans->getExpiredLoans(PP)) + Result.push_back(pair.first); + return Result; } std::optional @@ -1009,8 +1176,9 @@ llvm::StringMap LifetimeSafetyAnalysis::getTestPoints() const { } } // namespace internal -void runLifetimeSafetyAnalysis(AnalysisDeclContext &AC) { - internal::LifetimeSafetyAnalysis Analysis(AC); +void runLifetimeSafetyAnalysis(AnalysisDeclContext &AC, + LifetimeSafetyReporter *Reporter) { + 
internal::LifetimeSafetyAnalysis Analysis(AC, Reporter); Analysis.run(); } } // namespace clang::lifetimes diff --git a/clang/lib/Basic/TargetInfo.cpp b/clang/lib/Basic/TargetInfo.cpp index 21fc084d1977..2fbf1ee39b78 100644 --- a/clang/lib/Basic/TargetInfo.cpp +++ b/clang/lib/Basic/TargetInfo.cpp @@ -62,7 +62,7 @@ TargetInfo::TargetInfo(const llvm::Triple &T) : Triple(T) { TLSSupported = true; VLASupported = true; NoAsmVariants = false; - HasLegalHalfType = false; + HasFastHalfType = false; HalfArgsAndReturns = false; HasFloat128 = false; HasIbm128 = false; diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp index 2b023e5fdb7d..9e03a0846ffb 100644 --- a/clang/lib/Basic/Targets/AArch64.cpp +++ b/clang/lib/Basic/Targets/AArch64.cpp @@ -142,7 +142,7 @@ AArch64TargetInfo::AArch64TargetInfo(const llvm::Triple &Triple, AddrSpaceMap = &ARM64AddrSpaceMap; // All AArch64 implementations support ARMv8 FP, which makes half a legal type. - HasLegalHalfType = true; + HasFastHalfType = true; HalfArgsAndReturns = true; HasFloat16 = true; HasStrictFP = true; diff --git a/clang/lib/Basic/Targets/AMDGPU.cpp b/clang/lib/Basic/Targets/AMDGPU.cpp index 52cbdbc3719d..639e735202f2 100644 --- a/clang/lib/Basic/Targets/AMDGPU.cpp +++ b/clang/lib/Basic/Targets/AMDGPU.cpp @@ -251,7 +251,7 @@ AMDGPUTargetInfo::AMDGPUTargetInfo(const llvm::Triple &Triple, BFloat16Format = &llvm::APFloat::BFloat(); } - HasLegalHalfType = true; + HasFastHalfType = true; HasFloat16 = true; WavefrontSize = (GPUFeatures & llvm::AMDGPU::FEATURE_WAVE32) ? 
32 : 64; diff --git a/clang/lib/Basic/Targets/ARM.cpp b/clang/lib/Basic/Targets/ARM.cpp index 75fdf38e2104..3de17d2c829f 100644 --- a/clang/lib/Basic/Targets/ARM.cpp +++ b/clang/lib/Basic/Targets/ARM.cpp @@ -585,13 +585,13 @@ bool ARMTargetInfo::handleTargetFeatures(std::vector &Features, } else if (Feature == "+fp16") { HW_FP |= HW_FP_HP; } else if (Feature == "+fullfp16") { - HasLegalHalfType = true; + HasFastHalfType = true; } else if (Feature == "+dotprod") { DotProd = true; } else if (Feature == "+mve") { MVE |= MVE_INT; } else if (Feature == "+mve.fp") { - HasLegalHalfType = true; + HasFastHalfType = true; FPU |= FPARMV8; MVE |= MVE_INT | MVE_FP; HW_FP |= HW_FP_SP | HW_FP_HP; @@ -1014,11 +1014,11 @@ void ARMTargetInfo::getTargetDefines(const LangOptions &Opts, Builder.defineMacro("__ARM_FP_FAST", "1"); // Armv8.2-A FP16 vector intrinsic - if ((FPU & NeonFPU) && HasLegalHalfType) + if ((FPU & NeonFPU) && HasFastHalfType) Builder.defineMacro("__ARM_FEATURE_FP16_VECTOR_ARITHMETIC", "1"); // Armv8.2-A FP16 scalar intrinsics - if (HasLegalHalfType) + if (HasFastHalfType) Builder.defineMacro("__ARM_FEATURE_FP16_SCALAR_ARITHMETIC", "1"); // Armv8.2-A dot product intrinsics diff --git a/clang/lib/Basic/Targets/DirectX.h b/clang/lib/Basic/Targets/DirectX.h index 17240cf35890..bd13c9ee0fd0 100644 --- a/clang/lib/Basic/Targets/DirectX.h +++ b/clang/lib/Basic/Targets/DirectX.h @@ -59,7 +59,7 @@ public: VLASupported = false; AddrSpaceMap = &DirectXAddrSpaceMap; UseAddrSpaceMapMangling = true; - HasLegalHalfType = true; + HasFastHalfType = true; HasFloat16 = true; NoAsmVariants = true; PlatformMinVersion = Triple.getOSVersion(); diff --git a/clang/lib/Basic/Targets/Hexagon.cpp b/clang/lib/Basic/Targets/Hexagon.cpp index 06dcac03baa5..cea64f986003 100644 --- a/clang/lib/Basic/Targets/Hexagon.cpp +++ b/clang/lib/Basic/Targets/Hexagon.cpp @@ -149,7 +149,7 @@ bool HexagonTargetInfo::handleTargetFeatures(std::vector &Features, HasAudio = true; } if (CPU.compare("hexagonv68") >= 
0) { - HasLegalHalfType = true; + HasFastHalfType = true; HasFloat16 = true; } return true; diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp index 79995ccc21b2..5cf2dc187b83 100644 --- a/clang/lib/Basic/Targets/NVPTX.cpp +++ b/clang/lib/Basic/Targets/NVPTX.cpp @@ -65,7 +65,7 @@ NVPTXTargetInfo::NVPTXTargetInfo(const llvm::Triple &Triple, GPU = OffloadArch::UNUSED; // PTX supports f16 as a fundamental type. - HasLegalHalfType = true; + HasFastHalfType = true; HasFloat16 = true; if (TargetPointerWidth == 32) diff --git a/clang/lib/Basic/Targets/RISCV.cpp b/clang/lib/Basic/Targets/RISCV.cpp index a6a5ec4b325b..04da4e637af5 100644 --- a/clang/lib/Basic/Targets/RISCV.cpp +++ b/clang/lib/Basic/Targets/RISCV.cpp @@ -427,7 +427,7 @@ bool RISCVTargetInfo::handleTargetFeatures(std::vector &Features, ABI = ISAInfo->computeDefaultABI().str(); if (ISAInfo->hasExtension("zfh") || ISAInfo->hasExtension("zhinx")) - HasLegalHalfType = true; + HasFastHalfType = true; FastScalarUnalignedAccess = llvm::is_contained(Features, "+unaligned-scalar-mem"); diff --git a/clang/lib/Basic/Targets/SPIR.h b/clang/lib/Basic/Targets/SPIR.h index 9d0ced2afdbc..fb15b7706554 100644 --- a/clang/lib/Basic/Targets/SPIR.h +++ b/clang/lib/Basic/Targets/SPIR.h @@ -106,7 +106,7 @@ protected: LongWidth = LongAlign = 64; AddrSpaceMap = &SPIRDefIsPrivMap; UseAddrSpaceMapMangling = true; - HasLegalHalfType = true; + HasFastHalfType = true; HasFloat16 = true; // Define available target features // These must be defined in sorted order! 
@@ -427,7 +427,7 @@ public: BFloat16Width = BFloat16Align = 16; BFloat16Format = &llvm::APFloat::BFloat(); - HasLegalHalfType = true; + HasFastHalfType = true; HasFloat16 = true; HalfArgsAndReturns = true; diff --git a/clang/lib/Basic/Targets/SystemZ.h b/clang/lib/Basic/Targets/SystemZ.h index 7f7dcf815bd8..dc2185e1b45c 100644 --- a/clang/lib/Basic/Targets/SystemZ.h +++ b/clang/lib/Basic/Targets/SystemZ.h @@ -104,7 +104,7 @@ public: // -ffloat16-excess-precision=none is given, no conversions will be made // and instead the backend will promote each half operation to float // individually. - HasLegalHalfType = false; + HasFastHalfType = false; HasStrictFP = true; } diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp index 24ecec24d2a4..dc6f65540e2f 100644 --- a/clang/lib/Basic/Targets/X86.cpp +++ b/clang/lib/Basic/Targets/X86.cpp @@ -348,7 +348,7 @@ bool X86TargetInfo::handleTargetFeatures(std::vector &Features, HasAVX512BF16 = true; } else if (Feature == "+avx512fp16") { HasAVX512FP16 = true; - HasLegalHalfType = true; + HasFastHalfType = true; } else if (Feature == "+avx512dq") { HasAVX512DQ = true; } else if (Feature == "+avx512bitalg") { diff --git a/clang/lib/CIR/CodeGen/CIRGenBuilder.h b/clang/lib/CIR/CodeGen/CIRGenBuilder.h index 59d2adc15a01..c1088c4cd082 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuilder.h +++ b/clang/lib/CIR/CodeGen/CIRGenBuilder.h @@ -84,6 +84,10 @@ public: llvm_unreachable("Unsupported format for long double"); } + mlir::Type getPtrToVPtrType() { + return getPointerTo(cir::VPtrType::get(getContext())); + } + /// Get a CIR record kind from a AST declaration tag. 
cir::RecordType::RecordKind getRecordKind(const clang::TagTypeKind kind) { switch (kind) { @@ -263,6 +267,9 @@ public: cir::ConstantOp getSInt32(int32_t c, mlir::Location loc) { return getConstantInt(loc, getSInt32Ty(), c); } + cir::ConstantOp getUInt32(uint32_t c, mlir::Location loc) { + return getConstantInt(loc, getUInt32Ty(), c); + } // Creates constant nullptr for pointer type ty. cir::ConstantOp getNullPtr(mlir::Type ty, mlir::Location loc) { diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp index 36aea4c1d39c..dcd00696f335 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp @@ -312,6 +312,20 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID, case Builtin::BI__builtin_rotateright64: return emitRotate(e, /*isRotateLeft=*/false); + case Builtin::BI__builtin_return_address: + case Builtin::BI__builtin_frame_address: { + mlir::Location loc = getLoc(e->getExprLoc()); + llvm::APSInt level = e->getArg(0)->EvaluateKnownConstInt(getContext()); + if (builtinID == Builtin::BI__builtin_return_address) { + return RValue::get(cir::ReturnAddrOp::create( + builder, loc, + builder.getConstAPInt(loc, builder.getUInt32Ty(), level))); + } + return RValue::get(cir::FrameAddrOp::create( + builder, loc, + builder.getConstAPInt(loc, builder.getUInt32Ty(), level))); + } + case Builtin::BI__builtin_trap: emitTrap(loc, /*createNewBlock=*/true); return RValue::get(nullptr); diff --git a/clang/lib/CIR/CodeGen/CIRGenClass.cpp b/clang/lib/CIR/CodeGen/CIRGenClass.cpp index 31c93cd00d08..a3947047de07 100644 --- a/clang/lib/CIR/CodeGen/CIRGenClass.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenClass.cpp @@ -289,7 +289,7 @@ void CIRGenFunction::initializeVTablePointer(mlir::Location loc, } // Apply the offsets. 
- Address vtableField = loadCXXThisAddress(); + Address classAddr = loadCXXThisAddress(); if (!nonVirtualOffset.isZero() || virtualOffset) { cgm.errorNYI(loc, "initializeVTablePointer: non-virtual and virtual offset"); @@ -300,9 +300,9 @@ void CIRGenFunction::initializeVTablePointer(mlir::Location loc, // vtable field is derived from `this` pointer, therefore they should be in // the same addr space. assert(!cir::MissingFeatures::addressSpace()); - // TODO(cir): This should be cir.vtable.get_vptr. - vtableField = builder.createElementBitCast(loc, vtableField, - vtableAddressPoint.getType()); + auto vtablePtr = cir::VTableGetVPtrOp::create( + builder, loc, builder.getPtrToVPtrType(), classAddr.getPointer()); + Address vtableField = Address(vtablePtr, classAddr.getAlignment()); builder.createStore(loc, vtableAddressPoint, vtableField); assert(!cir::MissingFeatures::opTBAA()); assert(!cir::MissingFeatures::createInvariantGroup()); @@ -657,6 +657,23 @@ Address CIRGenFunction::getAddressOfBaseClass( return value; } +mlir::Value CIRGenFunction::getVTablePtr(mlir::Location loc, Address thisAddr, + const CXXRecordDecl *rd) { + auto vtablePtr = cir::VTableGetVPtrOp::create( + builder, loc, builder.getPtrToVPtrType(), thisAddr.getPointer()); + Address vtablePtrAddr = Address(vtablePtr, thisAddr.getAlignment()); + + auto vtable = builder.createLoad(loc, vtablePtrAddr); + assert(!cir::MissingFeatures::opTBAA()); + + if (cgm.getCodeGenOpts().OptimizationLevel > 0 && + cgm.getCodeGenOpts().StrictVTablePointers) { + assert(!cir::MissingFeatures::createInvariantGroup()); + } + + return vtable; +} + void CIRGenFunction::emitCXXConstructorCall(const clang::CXXConstructorDecl *d, clang::CXXCtorType type, bool forVirtualBase, diff --git a/clang/lib/CIR/CodeGen/CIRGenConstantEmitter.h b/clang/lib/CIR/CodeGen/CIRGenConstantEmitter.h index d6dac50bb126..d455f6e28340 100644 --- a/clang/lib/CIR/CodeGen/CIRGenConstantEmitter.h +++ b/clang/lib/CIR/CodeGen/CIRGenConstantEmitter.h @@ -80,7 
+80,7 @@ public: // initializer or to propagate to another context; for example, // side effects, or emitting an initialization that requires a // reference to its current location. - mlir::Attribute emitForMemory(mlir::Attribute c, QualType t); + mlir::Attribute emitForMemory(mlir::Attribute c, QualType destType); /// Try to emit the initializer of the given declaration as an abstract /// constant. @@ -90,8 +90,9 @@ public: /// asserting that it succeeded. This is only safe to do when the /// expression is known to be a constant expression with either a fairly /// simple type or a known simple form. + mlir::Attribute emitAbstract(const Expr *e, QualType destType); mlir::Attribute emitAbstract(SourceLocation loc, const APValue &value, - QualType t); + QualType destType); mlir::Attribute tryEmitConstantExpr(const ConstantExpr *ce); @@ -101,6 +102,7 @@ public: mlir::Attribute tryEmitPrivateForVarInit(const VarDecl &d); + mlir::TypedAttr tryEmitPrivate(const Expr *e, QualType destType); mlir::Attribute tryEmitPrivate(const APValue &value, QualType destType); mlir::Attribute tryEmitPrivateForMemory(const APValue &value, QualType t); diff --git a/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp b/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp index c2b373417392..2fbf69d5d01f 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp @@ -710,6 +710,16 @@ mlir::Attribute ConstantEmitter::tryEmitPrivateForMemory(const APValue &value, return (c ? 
emitForMemory(c, destType) : nullptr); } +mlir::Attribute ConstantEmitter::emitAbstract(const Expr *e, + QualType destType) { + AbstractStateRAII state{*this, true}; + mlir::Attribute c = mlir::cast(tryEmitPrivate(e, destType)); + if (!c) + cgm.errorNYI(e->getSourceRange(), + "emitAbstract failed, emit null constaant"); + return c; +} + mlir::Attribute ConstantEmitter::emitAbstract(SourceLocation loc, const APValue &value, QualType destType) { @@ -731,6 +741,32 @@ mlir::Attribute ConstantEmitter::emitForMemory(mlir::Attribute c, return c; } +mlir::TypedAttr ConstantEmitter::tryEmitPrivate(const Expr *e, + QualType destType) { + assert(!destType->isVoidType() && "can't emit a void constant"); + + if (mlir::Attribute c = + ConstExprEmitter(*this).Visit(const_cast(e), destType)) + return llvm::dyn_cast(c); + + Expr::EvalResult result; + + bool success = false; + + if (destType->isReferenceType()) + success = e->EvaluateAsLValue(result, cgm.getASTContext()); + else + success = + e->EvaluateAsRValue(result, cgm.getASTContext(), inConstantContext); + + if (success && !result.hasSideEffects()) { + mlir::Attribute c = tryEmitPrivate(result.Val, destType); + return llvm::dyn_cast(c); + } + + return nullptr; +} + mlir::Attribute ConstantEmitter::tryEmitPrivate(const APValue &value, QualType destType) { auto &builder = cgm.getBuilder(); diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h index 9a887ec047f8..554e46414c9a 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.h +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h @@ -1120,6 +1120,8 @@ public: mlir::LogicalResult emitFunctionBody(const clang::Stmt *body); + mlir::LogicalResult emitGotoStmt(const clang::GotoStmt &s); + void emitImplicitAssignmentOperatorBody(FunctionArgList &args); void emitInitializerForField(clang::FieldDecl *field, LValue lhs, diff --git a/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp b/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp index 9194b522114b..72e2c533254c 
100644 --- a/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp @@ -387,6 +387,20 @@ class OpenACCClauseCIREmitter final return recipeName; } + void createFirstprivateRecipeCopy( + mlir::Location loc, mlir::Location locEnd, mlir::Value mainOp, + CIRGenFunction::AutoVarEmission tempDeclEmission, + mlir::acc::FirstprivateRecipeOp recipe, const VarDecl *varRecipe, + const VarDecl *temporary) { + builder.createBlock(&recipe.getCopyRegion(), recipe.getCopyRegion().end(), + {mainOp.getType(), mainOp.getType()}, {loc, loc}); + builder.setInsertionPointToEnd(&recipe.getCopyRegion().back()); + + // TODO: OpenACC: Implement this copy to actually do something. + + mlir::acc::YieldOp::create(builder, locEnd); + } + // Create the 'init' section of the recipe, including the 'copy' section for // 'firstprivate'. template @@ -401,12 +415,6 @@ class OpenACCClauseCIREmitter final cgf.cgm.errorNYI(exprRange, "OpenACC Reduction recipe init"); } - if constexpr (std::is_same_v) { - // We haven't implemented the 'init'/copy recipe for firstprivate yet, so - // NYI it. - cgf.cgm.errorNYI(exprRange, "OpenACC firstprivate recipe init"); - } - CIRGenFunction::AutoVarEmission tempDeclEmission{ CIRGenFunction::AutoVarEmission::invalid()}; @@ -442,17 +450,12 @@ class OpenACCClauseCIREmitter final mlir::acc::YieldOp::create(builder, locEnd); if constexpr (std::is_same_v) { - if (!varRecipe->getInit()) { - // If we don't have any initialization recipe, we failed during Sema to - // initialize this correctly. If we disable the - // Sema::TentativeAnalysisScopes in SemaOpenACC::CreateInitRecipe, it'll - // emit an error to tell us. However, emitting those errors during - // production is a violation of the standard, so we cannot do them. 
- cgf.cgm.errorNYI( - exprRange, "firstprivate copy-init recipe not properly generated"); - } - - cgf.cgm.errorNYI(exprRange, "firstprivate copy section generation"); + // TODO: OpenACC: we should have a errorNYI call here if + // !varRecipe->getInit(), but as that generation isn't currently + // implemented, it ends up being too noisy. So when we implement copy-init + // generation both in Sema and here, we should have a diagnostic here. + createFirstprivateRecipeCopy(loc, locEnd, mainOp, tempDeclEmission, + recipe, varRecipe, temporary); } // Make sure we cleanup after ourselves here. @@ -1155,6 +1158,43 @@ public: llvm_unreachable("Unknown construct kind in VisitPrivateClause"); } } + + void VisitFirstPrivateClause(const OpenACCFirstPrivateClause &clause) { + if constexpr (isOneOfTypes) { + for (const auto [varExpr, varRecipe] : + llvm::zip_equal(clause.getVarList(), clause.getInitRecipes())) { + CIRGenFunction::OpenACCDataOperandInfo opInfo = + cgf.getOpenACCDataOperandInfo(varExpr); + auto firstPrivateOp = mlir::acc::FirstprivateOp::create( + builder, opInfo.beginLoc, opInfo.varValue, /*structured=*/true, + /*implicit=*/false, opInfo.name, opInfo.bounds); + + firstPrivateOp.setDataClause(mlir::acc::DataClause::acc_firstprivate); + + { + mlir::OpBuilder::InsertionGuard guardCase(builder); + auto recipe = getOrCreateRecipe( + cgf.getContext(), varExpr, varRecipe.RecipeDecl, + varRecipe.InitFromTemporary, + Decl::castToDeclContext(cgf.curFuncDecl), opInfo.baseType, + firstPrivateOp.getResult()); + + // TODO: OpenACC: The dialect is going to change in the near future to + // have these be on a different operation, so when that changes, we + // probably need to change these here. + operation.addFirstPrivatization(builder.getContext(), firstPrivateOp, + recipe); + } + } + } else if constexpr (isCombinedType) { + // Unlike 'private', 'firstprivate' applies to the compute op, not the + // loop op. 
+ applyToComputeOp(clause); + } else { + llvm_unreachable("Unknown construct kind in VisitFirstPrivateClause"); + } + } }; template diff --git a/clang/lib/CIR/CodeGen/CIRGenStmt.cpp b/clang/lib/CIR/CodeGen/CIRGenStmt.cpp index d1e4a1482401..d83018e51370 100644 --- a/clang/lib/CIR/CodeGen/CIRGenStmt.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenStmt.cpp @@ -252,6 +252,8 @@ mlir::LogicalResult CIRGenFunction::emitSimpleStmt(const Stmt *s, else emitCompoundStmt(cast(*s)); break; + case Stmt::GotoStmtClass: + return emitGotoStmt(cast(*s)); case Stmt::ContinueStmtClass: return emitContinueStmt(cast(*s)); @@ -435,6 +437,24 @@ mlir::LogicalResult CIRGenFunction::emitReturnStmt(const ReturnStmt &s) { return mlir::success(); } +mlir::LogicalResult CIRGenFunction::emitGotoStmt(const clang::GotoStmt &s) { + // FIXME: LLVM codegen inserts emit a stop point here for debug info + // sake when the insertion point is available, but doesn't do + // anything special when there isn't. We haven't implemented debug + // info support just yet, look at this again once we have it. + assert(!cir::MissingFeatures::generateDebugInfo()); + + cir::GotoOp::create(builder, getLoc(s.getSourceRange()), + s.getLabel()->getName()); + + // A goto marks the end of a block, create a new one for codegen after + // emitGotoStmt can resume building in that block. + // Insert the new block to continue codegen after goto. 
+ builder.createBlock(builder.getBlock()->getParent()); + + return mlir::success(); +} + mlir::LogicalResult CIRGenFunction::emitContinueStmt(const clang::ContinueStmt &s) { builder.createContinue(getLoc(s.getContinueLoc())); diff --git a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp index 50246007b107..220927601f74 100644 --- a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp +++ b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp @@ -22,6 +22,8 @@ #include "clang/CIR/Dialect/IR/CIROpsDialect.cpp.inc" #include "clang/CIR/Dialect/IR/CIROpsEnums.cpp.inc" #include "clang/CIR/MissingFeatures.h" +#include "llvm/ADT/SetOperations.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/Support/LogicalResult.h" #include @@ -1647,9 +1649,28 @@ void cir::FuncOp::print(OpAsmPrinter &p) { } } -// TODO(CIR): The properties of functions that require verification haven't -// been implemented yet. -mlir::LogicalResult cir::FuncOp::verify() { return success(); } +mlir::LogicalResult cir::FuncOp::verify() { + + llvm::SmallSet labels; + llvm::SmallSet gotos; + + getOperation()->walk([&](mlir::Operation *op) { + if (auto lab = dyn_cast(op)) { + labels.insert(lab.getLabel()); + } else if (auto goTo = dyn_cast(op)) { + gotos.insert(goTo.getLabel()); + } + }); + + if (!labels.empty() || !gotos.empty()) { + llvm::SmallSet mismatched = + llvm::set_difference(gotos, labels); + + if (!mismatched.empty()) + return emitOpError() << "goto/label mismatch"; + } + return success(); +} //===----------------------------------------------------------------------===// // BinOp diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp index 1ea296a6887e..49784b46d351 100644 --- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp +++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp @@ -267,6 +267,26 @@ void convertSideEffectForCall(mlir::Operation *callOp, bool isNothrow, } } +static mlir::LLVM::CallIntrinsicOp 
+createCallLLVMIntrinsicOp(mlir::ConversionPatternRewriter &rewriter, + mlir::Location loc, const llvm::Twine &intrinsicName, + mlir::Type resultTy, mlir::ValueRange operands) { + auto intrinsicNameAttr = + mlir::StringAttr::get(rewriter.getContext(), intrinsicName); + return mlir::LLVM::CallIntrinsicOp::create(rewriter, loc, resultTy, + intrinsicNameAttr, operands); +} + +static mlir::LLVM::CallIntrinsicOp replaceOpWithCallLLVMIntrinsicOp( + mlir::ConversionPatternRewriter &rewriter, mlir::Operation *op, + const llvm::Twine &intrinsicName, mlir::Type resultTy, + mlir::ValueRange operands) { + mlir::LLVM::CallIntrinsicOp callIntrinOp = createCallLLVMIntrinsicOp( + rewriter, op->getLoc(), intrinsicName, resultTy, operands); + rewriter.replaceOp(op, callIntrinOp.getOperation()); + return callIntrinOp; +} + /// IntAttr visitor. mlir::Value CIRAttrToValue::visitCirAttr(cir::IntAttr intAttr) { mlir::Location loc = parentOp->getLoc(); @@ -1112,6 +1132,24 @@ mlir::LogicalResult CIRToLLVMCallOpLowering::matchAndRewrite( getTypeConverter(), op.getCalleeAttr()); } +mlir::LogicalResult CIRToLLVMReturnAddrOpLowering::matchAndRewrite( + cir::ReturnAddrOp op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const { + auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(rewriter.getContext()); + replaceOpWithCallLLVMIntrinsicOp(rewriter, op, "llvm.returnaddress", + llvmPtrTy, adaptor.getOperands()); + return mlir::success(); +} + +mlir::LogicalResult CIRToLLVMFrameAddrOpLowering::matchAndRewrite( + cir::FrameAddrOp op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const { + auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(rewriter.getContext()); + replaceOpWithCallLLVMIntrinsicOp(rewriter, op, "llvm.frameaddress", llvmPtrTy, + adaptor.getOperands()); + return mlir::success(); +} + mlir::LogicalResult CIRToLLVMLoadOpLowering::matchAndRewrite( cir::LoadOp op, OpAdaptor adaptor, mlir::ConversionPatternRewriter &rewriter) const { @@ -2322,10 +2360,12 @@ 
void ConvertCIRToLLVMPass::runOnOperation() { CIRToLLVMConstantOpLowering, CIRToLLVMExpectOpLowering, CIRToLLVMFAbsOpLowering, + CIRToLLVMFrameAddrOpLowering, CIRToLLVMFuncOpLowering, CIRToLLVMGetBitfieldOpLowering, CIRToLLVMGetGlobalOpLowering, CIRToLLVMGetMemberOpLowering, + CIRToLLVMReturnAddrOpLowering, CIRToLLVMRotateOpLowering, CIRToLLVMSelectOpLowering, CIRToLLVMSetBitfieldOpLowering, @@ -2344,7 +2384,8 @@ void ConvertCIRToLLVMPass::runOnOperation() { CIRToLLVMVecShuffleOpLowering, CIRToLLVMVecSplatOpLowering, CIRToLLVMVecTernaryOpLowering, - CIRToLLVMVTableAddrPointOpLowering + CIRToLLVMVTableAddrPointOpLowering, + CIRToLLVMVTableGetVPtrOpLowering // clang-format on >(converter, patterns.getContext()); @@ -2468,6 +2509,18 @@ mlir::LogicalResult CIRToLLVMVTableAddrPointOpLowering::matchAndRewrite( return mlir::success(); } +mlir::LogicalResult CIRToLLVMVTableGetVPtrOpLowering::matchAndRewrite( + cir::VTableGetVPtrOp op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const { + // cir.vtable.get_vptr is equivalent to a bitcast from the source object + // pointer to the vptr type. Since the LLVM dialect uses opaque pointers + // we can just replace uses of this operation with the original pointer. 
+ mlir::Value srcVal = adaptor.getSrc(); + rewriter.replaceAllUsesWith(op, srcVal); + rewriter.eraseOp(op); + return mlir::success(); +} + mlir::LogicalResult CIRToLLVMStackSaveOpLowering::matchAndRewrite( cir::StackSaveOp op, OpAdaptor adaptor, mlir::ConversionPatternRewriter &rewriter) const { diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h index e32bf2d1bae0..7a480d2d4d77 100644 --- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h +++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h @@ -209,6 +209,26 @@ public: mlir::ConversionPatternRewriter &rewriter) const override; }; +class CIRToLLVMReturnAddrOpLowering + : public mlir::OpConversionPattern { +public: + using mlir::OpConversionPattern::OpConversionPattern; + + mlir::LogicalResult + matchAndRewrite(cir::ReturnAddrOp op, OpAdaptor, + mlir::ConversionPatternRewriter &) const override; +}; + +class CIRToLLVMFrameAddrOpLowering + : public mlir::OpConversionPattern { +public: + using mlir::OpConversionPattern::OpConversionPattern; + + mlir::LogicalResult + matchAndRewrite(cir::FrameAddrOp op, OpAdaptor, + mlir::ConversionPatternRewriter &) const override; +}; + class CIRToLLVMAllocaOpLowering : public mlir::OpConversionPattern { mlir::DataLayout const &dataLayout; @@ -467,6 +487,16 @@ public: mlir::ConversionPatternRewriter &) const override; }; +class CIRToLLVMVTableGetVPtrOpLowering + : public mlir::OpConversionPattern { +public: + using mlir::OpConversionPattern::OpConversionPattern; + + mlir::LogicalResult + matchAndRewrite(cir::VTableGetVPtrOp op, OpAdaptor, + mlir::ConversionPatternRewriter &) const override; +}; + class CIRToLLVMStackSaveOpLowering : public mlir::OpConversionPattern { public: diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp index 980f7eb714bb..60413e7b18e8 100644 --- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp @@ -358,7 +358,7 
@@ static Value *emitCallMaybeConstrainedFPBuiltin(CodeGenFunction &CGF, static llvm::FixedVectorType *GetNeonType(CodeGenFunction *CGF, NeonTypeFlags TypeFlags, - bool HasLegalHalfType = true, + bool HasFastHalfType = true, bool V1Ty = false, bool AllowBFloatArgsAndRet = true) { int IsQuad = TypeFlags.isQuad(); @@ -376,7 +376,7 @@ static llvm::FixedVectorType *GetNeonType(CodeGenFunction *CGF, else return llvm::FixedVectorType::get(CGF->Int16Ty, V1Ty ? 1 : (4 << IsQuad)); case NeonTypeFlags::Float16: - if (HasLegalHalfType) + if (HasFastHalfType) return llvm::FixedVectorType::get(CGF->HalfTy, V1Ty ? 1 : (4 << IsQuad)); else return llvm::FixedVectorType::get(CGF->Int16Ty, V1Ty ? 1 : (4 << IsQuad)); @@ -1754,12 +1754,12 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr( const bool Usgn = Type.isUnsigned(); const bool Quad = Type.isQuad(); const bool Floating = Type.isFloatingPoint(); - const bool HasLegalHalfType = getTarget().hasLegalHalfType(); + const bool HasFastHalfType = getTarget().hasFastHalfType(); const bool AllowBFloatArgsAndRet = getTargetHooks().getABIInfo().allowBFloatArgsAndRet(); llvm::FixedVectorType *VTy = - GetNeonType(this, Type, HasLegalHalfType, false, AllowBFloatArgsAndRet); + GetNeonType(this, Type, HasFastHalfType, false, AllowBFloatArgsAndRet); llvm::Type *Ty = VTy; if (!Ty) return nullptr; @@ -1886,7 +1886,7 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr( case NEON::BI__builtin_neon_vcvtq_f32_v: Ops[0] = Builder.CreateBitCast(Ops[0], Ty); Ty = GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float32, false, Quad), - HasLegalHalfType); + HasFastHalfType); return Usgn ? 
Builder.CreateUIToFP(Ops[0], Ty, "vcvt") : Builder.CreateSIToFP(Ops[0], Ty, "vcvt"); case NEON::BI__builtin_neon_vcvt_f16_s16: @@ -1895,7 +1895,7 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr( case NEON::BI__builtin_neon_vcvtq_f16_u16: Ops[0] = Builder.CreateBitCast(Ops[0], Ty); Ty = GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float16, false, Quad), - HasLegalHalfType); + HasFastHalfType); return Usgn ? Builder.CreateUIToFP(Ops[0], Ty, "vcvt") : Builder.CreateSIToFP(Ops[0], Ty, "vcvt"); case NEON::BI__builtin_neon_vcvt_n_f16_s16: @@ -3211,7 +3211,7 @@ Value *CodeGenFunction::EmitARMBuiltinExpr(unsigned BuiltinID, bool rightShift = false; llvm::FixedVectorType *VTy = - GetNeonType(this, Type, getTarget().hasLegalHalfType(), false, + GetNeonType(this, Type, getTarget().hasFastHalfType(), false, getTarget().hasBFloat16Type()); llvm::Type *Ty = VTy; if (!Ty) diff --git a/clang/lib/CodeGen/Targets/ARM.cpp b/clang/lib/CodeGen/Targets/ARM.cpp index 532ba4cead24..3739e16788c3 100644 --- a/clang/lib/CodeGen/Targets/ARM.cpp +++ b/clang/lib/CodeGen/Targets/ARM.cpp @@ -316,7 +316,7 @@ ABIArgInfo ARMABIInfo::classifyHomogeneousAggregate(QualType Ty, // Base can be a floating-point or a vector. if (const VectorType *VT = Base->getAs()) { // FP16 vectors should be converted to integer vectors - if (!getTarget().hasLegalHalfType() && containsAnyFP16Vectors(Ty)) { + if (!getTarget().hasFastHalfType() && containsAnyFP16Vectors(Ty)) { uint64_t Size = getContext().getTypeSize(VT); auto *NewVecTy = llvm::FixedVectorType::get( llvm::Type::getInt32Ty(getVMContext()), Size / 32); @@ -582,7 +582,7 @@ ABIArgInfo ARMABIInfo::classifyReturnType(QualType RetTy, bool isVariadic, getDataLayout().getAllocaAddrSpace()); // TODO: FP16/BF16 vectors should be converted to integer vectors // This check is similar to isIllegalVectorType - refactor? 
- if ((!getTarget().hasLegalHalfType() && + if ((!getTarget().hasFastHalfType() && (VT->getElementType()->isFloat16Type() || VT->getElementType()->isHalfType())) || (IsFloatABISoftFP && @@ -679,9 +679,9 @@ bool ARMABIInfo::isIllegalVectorType(QualType Ty) const { // into float, and we don't want the ABI to depend on whether or not they // are supported in hardware. Thus return false to coerce vectors of these // types into integer vectors. - // We do not depend on hasLegalHalfType for bfloat as it is a + // We do not depend on hasFastHalfType for bfloat as it is a // separate IR type. - if ((!getTarget().hasLegalHalfType() && + if ((!getTarget().hasFastHalfType() && (VT->getElementType()->isFloat16Type() || VT->getElementType()->isHalfType())) || (IsFloatABISoftFP && diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp index 547e3156f519..65391033c2b9 100644 --- a/clang/lib/Driver/ToolChains/Flang.cpp +++ b/clang/lib/Driver/ToolChains/Flang.cpp @@ -178,6 +178,8 @@ void Flang::addCodegenOptions(const ArgList &Args, options::OPT_fstack_repack_arrays, options::OPT_fno_stack_repack_arrays, options::OPT_ftime_report, options::OPT_ftime_report_EQ, options::OPT_funroll_loops, options::OPT_fno_unroll_loops}); + if (Args.hasArg(clang::driver::options::OPT_fcoarray)) + CmdArgs.push_back("-fcoarray"); } void Flang::addPicOptions(const ArgList &Args, ArgStringList &CmdArgs) const { diff --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp index 9a10403b858f..888d0faf8093 100644 --- a/clang/lib/Format/ContinuationIndenter.cpp +++ b/clang/lib/Format/ContinuationIndenter.cpp @@ -629,9 +629,16 @@ bool ContinuationIndenter::mustBreak(const LineState &State) { // name. 
!Style.isJavaScript() && Previous.isNot(tok::kw_template) && CurrentState.BreakBeforeParameter) { - for (const auto *Tok = &Previous; Tok; Tok = Tok->Previous) - if (Tok->FirstAfterPPLine || Tok->is(TT_LineComment)) + for (const auto *Tok = &Previous; Tok; Tok = Tok->Previous) { + if (Tok->is(TT_LineComment)) return false; + if (Tok->is(TT_TemplateCloser)) { + Tok = Tok->MatchingParen; + assert(Tok); + } + if (Tok->FirstAfterPPLine) + return false; + } return true; } diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp index 063780721423..e3b22cdabacc 100644 --- a/clang/lib/Format/Format.cpp +++ b/clang/lib/Format/Format.cpp @@ -763,6 +763,15 @@ struct ScalarEnumerationTraits { } }; +template <> +struct ScalarEnumerationTraits { + static void enumeration(IO &IO, FormatStyle::SpaceInEmptyBracesStyle &Value) { + IO.enumCase(Value, "Always", FormatStyle::SIEB_Always); + IO.enumCase(Value, "Block", FormatStyle::SIEB_Block); + IO.enumCase(Value, "Never", FormatStyle::SIEB_Never); + } +}; + template <> struct ScalarEnumerationTraits { static void enumeration(IO &IO, FormatStyle::SpacesInAnglesStyle &Value) { IO.enumCase(Value, "Never", FormatStyle::SIAS_Never); @@ -931,6 +940,7 @@ template <> struct MappingTraits { bool DeriveLineEnding = true; bool UseCRLF = false; + bool SpaceInEmptyBlock = false; bool SpaceInEmptyParentheses = false; bool SpacesInConditionalStatement = false; bool SpacesInCStyleCastParentheses = false; @@ -960,6 +970,7 @@ template <> struct MappingTraits { IO.mapOptional("PointerBindsToType", Style.PointerAlignment); IO.mapOptional("SpaceAfterControlStatementKeyword", Style.SpaceBeforeParens); + IO.mapOptional("SpaceInEmptyBlock", SpaceInEmptyBlock); IO.mapOptional("SpaceInEmptyParentheses", SpaceInEmptyParentheses); IO.mapOptional("SpacesInConditionalStatement", SpacesInConditionalStatement); @@ -1193,7 +1204,7 @@ template <> struct MappingTraits { Style.SpaceBeforeRangeBasedForLoopColon); IO.mapOptional("SpaceBeforeSquareBrackets", 
Style.SpaceBeforeSquareBrackets); - IO.mapOptional("SpaceInEmptyBlock", Style.SpaceInEmptyBlock); + IO.mapOptional("SpaceInEmptyBraces", Style.SpaceInEmptyBraces); IO.mapOptional("SpacesBeforeTrailingComments", Style.SpacesBeforeTrailingComments); IO.mapOptional("SpacesInAngles", Style.SpacesInAngles); @@ -1276,6 +1287,13 @@ template <> struct MappingTraits { Style.LineEnding = FormatStyle::LE_DeriveCRLF; } + // If SpaceInEmptyBlock was specified but SpaceInEmptyBraces was not, + // initialize the latter from the former for backward compatibility. + if (SpaceInEmptyBlock && + Style.SpaceInEmptyBraces == FormatStyle::SIEB_Never) { + Style.SpaceInEmptyBraces = FormatStyle::SIEB_Block; + } + if (Style.SpacesInParens != FormatStyle::SIPO_Custom && (SpacesInParentheses || SpaceInEmptyParentheses || SpacesInConditionalStatement || SpacesInCStyleCastParentheses)) { @@ -1677,7 +1695,7 @@ FormatStyle getLLVMStyle(FormatStyle::LanguageKind Language) { LLVMStyle.SpaceBeforeParensOptions.AfterIfMacros = true; LLVMStyle.SpaceBeforeRangeBasedForLoopColon = true; LLVMStyle.SpaceBeforeSquareBrackets = false; - LLVMStyle.SpaceInEmptyBlock = false; + LLVMStyle.SpaceInEmptyBraces = FormatStyle::SIEB_Never; LLVMStyle.SpacesBeforeTrailingComments = 1; LLVMStyle.SpacesInAngles = FormatStyle::SIAS_Never; LLVMStyle.SpacesInContainerLiterals = true; @@ -1984,7 +2002,7 @@ FormatStyle getWebKitStyle() { Style.ObjCSpaceAfterProperty = true; Style.PointerAlignment = FormatStyle::PAS_Left; Style.SpaceBeforeCpp11BracedList = true; - Style.SpaceInEmptyBlock = true; + Style.SpaceInEmptyBraces = FormatStyle::SIEB_Always; return Style; } diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index 4801d27b1395..a220de54f46b 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -2590,6 +2590,9 @@ private: if (!Tok.Previous || Tok.isNot(tok::identifier) || Tok.is(TT_ClassHeadName)) return false; + if 
(Tok.endsSequence(Keywords.kw_final, TT_ClassHeadName)) + return false; + if ((Style.isJavaScript() || Style.isJava()) && Tok.is(Keywords.kw_extends)) return false; @@ -4513,16 +4516,9 @@ bool TokenAnnotator::spaceRequiredBetween(const AnnotatedLine &Line, return Left.is(tok::hash); if (Left.isOneOf(tok::hashhash, tok::hash)) return Right.is(tok::hash); - if (Left.is(BK_Block) && Right.is(tok::r_brace) && - Right.MatchingParen == &Left && Line.Children.empty()) { - return Style.SpaceInEmptyBlock; - } if (Style.SpacesInParens == FormatStyle::SIPO_Custom) { - if ((Left.is(tok::l_paren) && Right.is(tok::r_paren)) || - (Left.is(tok::l_brace) && Left.isNot(BK_Block) && - Right.is(tok::r_brace) && Right.isNot(BK_Block))) { + if (Left.is(tok::l_paren) && Right.is(tok::r_paren)) return Style.SpacesInParensOptions.InEmptyParentheses; - } if (Style.SpacesInParensOptions.ExceptDoubleParentheses && Left.is(tok::r_paren) && Right.is(tok::r_paren)) { auto *InnerLParen = Left.MatchingParen; @@ -4800,8 +4796,6 @@ bool TokenAnnotator::spaceRequiredBetween(const AnnotatedLine &Line, Right.is(TT_ArraySubscriptLSquare))) { return false; } - if (Left.is(tok::l_brace) && Right.is(tok::r_brace)) - return !Left.Children.empty(); // No spaces in "{}". 
if ((Left.is(tok::l_brace) && Left.isNot(BK_Block)) || (Right.is(tok::r_brace) && Right.MatchingParen && Right.MatchingParen->isNot(BK_Block))) { @@ -4983,6 +4977,17 @@ bool TokenAnnotator::spaceRequiredBefore(const AnnotatedLine &Line, if (Left.is(tok::star) && Right.is(tok::comment)) return true; + if (Left.is(tok::l_brace) && Right.is(tok::r_brace) && + Left.Children.empty()) { + if (Left.is(BK_Block)) + return Style.SpaceInEmptyBraces != FormatStyle::SIEB_Never; + if (Style.Cpp11BracedListStyle) { + return Style.SpacesInParens == FormatStyle::SIPO_Custom && + Style.SpacesInParensOptions.InEmptyParentheses; + } + return Style.SpaceInEmptyBraces == FormatStyle::SIEB_Always; + } + const auto *BeforeLeft = Left.Previous; if (IsCpp) { @@ -6269,7 +6274,8 @@ bool TokenAnnotator::canBreakBefore(const AnnotatedLine &Line, } if (Right.is(tok::colon) && - !Right.isOneOf(TT_CtorInitializerColon, TT_InlineASMColon)) { + !Right.isOneOf(TT_CtorInitializerColon, TT_InlineASMColon, + TT_BitFieldColon)) { return false; } if (Left.is(tok::colon) && Left.isOneOf(TT_DictLiteral, TT_ObjCMethodExpr)) { diff --git a/clang/lib/Format/UnwrappedLineFormatter.cpp b/clang/lib/Format/UnwrappedLineFormatter.cpp index 0adf7ee9ed54..c938ff3965f9 100644 --- a/clang/lib/Format/UnwrappedLineFormatter.cpp +++ b/clang/lib/Format/UnwrappedLineFormatter.cpp @@ -864,7 +864,8 @@ private: if (ShouldMerge()) { // We merge empty blocks even if the line exceeds the column limit. Tok->SpacesRequiredBefore = - (Style.SpaceInEmptyBlock || Line.Last->is(tok::comment)) ? 
1 : 0; + Style.SpaceInEmptyBraces != FormatStyle::SIEB_Never || + Line.Last->is(tok::comment); Tok->CanBreakBefore = true; return 1; } else if (Limit != 0 && !Line.startsWithNamespace() && diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp index 008a35d5265e..5980806fba5e 100644 --- a/clang/lib/Frontend/InitPreprocessor.cpp +++ b/clang/lib/Frontend/InitPreprocessor.cpp @@ -1519,6 +1519,13 @@ static void InitializePredefinedMacros(const TargetInfo &TI, if (TI.getTriple().isOSBinFormatELF()) Builder.defineMacro("__ELF__"); + if (LangOpts.Sanitize.has(SanitizerKind::Address)) + Builder.defineMacro("__SANITIZE_ADDRESS__"); + if (LangOpts.Sanitize.has(SanitizerKind::HWAddress)) + Builder.defineMacro("__SANITIZE_HWADDRESS__"); + if (LangOpts.Sanitize.has(SanitizerKind::Thread)) + Builder.defineMacro("__SANITIZE_THREAD__"); + // Target OS macro definitions. if (PPOpts.DefineTargetOSMacros) { const llvm::Triple &Triple = TI.getTriple(); diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h index 0d627488b1a0..c7e1c4446e85 100644 --- a/clang/lib/Headers/avx2intrin.h +++ b/clang/lib/Headers/avx2intrin.h @@ -1667,9 +1667,8 @@ _mm256_cvtepu32_epi64(__m128i __V) { /// \param __b /// A 256-bit vector of [8 x i32] containing one of the source operands. /// \returns A 256-bit vector of [4 x i64] containing the products. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mul_epi32(__m256i __a, __m256i __b) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mul_epi32(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_pmuldq256((__v8si)__a, (__v8si)__b); } @@ -1796,9 +1795,8 @@ _mm256_mullo_epi32 (__m256i __a, __m256i __b) /// \param __b /// A 256-bit vector of [8 x i32] containing one of the source operands. /// \returns A 256-bit vector of [4 x i64] containing the products. 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mul_epu32(__m256i __a, __m256i __b) -{ +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mul_epu32(__m256i __a, __m256i __b) { return __builtin_ia32_pmuludq256((__v8si)__a, (__v8si)__b); } diff --git a/clang/lib/Headers/avx512bitalgintrin.h b/clang/lib/Headers/avx512bitalgintrin.h index 9a1ff8f39734..5cc32077c2c0 100644 --- a/clang/lib/Headers/avx512bitalgintrin.h +++ b/clang/lib/Headers/avx512bitalgintrin.h @@ -27,47 +27,35 @@ #endif static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR -_mm512_popcnt_epi16(__m512i __A) -{ +_mm512_popcnt_epi16(__m512i __A) { return (__m512i)__builtin_elementwise_popcount((__v32hu)__A); } -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_popcnt_epi16(__m512i __A, __mmask32 __U, __m512i __B) -{ - return (__m512i) __builtin_ia32_selectw_512((__mmask32) __U, - (__v32hi) _mm512_popcnt_epi16(__B), - (__v32hi) __A); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_popcnt_epi16(__mmask32 __U, __m512i __B) -{ - return _mm512_mask_popcnt_epi16((__m512i) _mm512_setzero_si512(), - __U, - __B); +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_mask_popcnt_epi16(__m512i __A, __mmask32 __U, __m512i __B) { + return (__m512i)__builtin_ia32_selectw_512( + (__mmask32)__U, (__v32hi)_mm512_popcnt_epi16(__B), (__v32hi)__A); } static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR -_mm512_popcnt_epi8(__m512i __A) -{ +_mm512_maskz_popcnt_epi16(__mmask32 __U, __m512i __B) { + return _mm512_mask_popcnt_epi16((__m512i)_mm512_setzero_si512(), __U, __B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_popcnt_epi8(__m512i __A) { return (__m512i)__builtin_elementwise_popcount((__v64qu)__A); } -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_popcnt_epi8(__m512i __A, __mmask64 __U, __m512i __B) -{ - return (__m512i) __builtin_ia32_selectb_512((__mmask64) __U, - (__v64qi) _mm512_popcnt_epi8(__B), - (__v64qi) __A); +static 
__inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_mask_popcnt_epi8(__m512i __A, __mmask64 __U, __m512i __B) { + return (__m512i)__builtin_ia32_selectb_512( + (__mmask64)__U, (__v64qi)_mm512_popcnt_epi8(__B), (__v64qi)__A); } -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_popcnt_epi8(__mmask64 __U, __m512i __B) -{ - return _mm512_mask_popcnt_epi8((__m512i) _mm512_setzero_si512(), - __U, - __B); +static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm512_maskz_popcnt_epi8(__mmask64 __U, __m512i __B) { + return _mm512_mask_popcnt_epi8((__m512i)_mm512_setzero_si512(), __U, __B); } static __inline__ __mmask64 __DEFAULT_FN_ATTRS diff --git a/clang/lib/Headers/avx512fintrin.h b/clang/lib/Headers/avx512fintrin.h index 05a291ecbc66..81c8e8e93449 100644 --- a/clang/lib/Headers/avx512fintrin.h +++ b/clang/lib/Headers/avx512fintrin.h @@ -1413,9 +1413,8 @@ _mm512_maskz_min_epu64 (__mmask8 __M, __m512i __A, __m512i __B) (__v8di)_mm512_setzero_si512()); } -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_mul_epi32(__m512i __X, __m512i __Y) -{ +static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mul_epi32(__m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_pmuldq512((__v16si)__X, (__v16si) __Y); } @@ -1435,9 +1434,8 @@ _mm512_maskz_mul_epi32(__mmask8 __M, __m512i __X, __m512i __Y) (__v8di)_mm512_setzero_si512 ()); } -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_mul_epu32(__m512i __X, __m512i __Y) -{ +static __inline __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mul_epu32(__m512i __X, __m512i __Y) { return (__m512i)__builtin_ia32_pmuludq512((__v16si)__X, (__v16si)__Y); } @@ -8935,36 +8933,28 @@ _mm512_mask_cvtpslo_pd (__m512d __W, __mmask8 __U, __m512 __A) return (__m512d) _mm512_mask_cvtps_pd(__W, __U, _mm512_castps512_ps256(__A)); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_mov_pd (__m512d __W, __mmask8 __U, __m512d __A) -{ - return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U, - (__v8df) __A, 
- (__v8df) __W); +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_mov_pd(__m512d __W, __mmask8 __U, __m512d __A) { + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, (__v8df)__A, + (__v8df)__W); } -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_maskz_mov_pd (__mmask8 __U, __m512d __A) -{ - return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U, - (__v8df) __A, - (__v8df) _mm512_setzero_pd ()); +static __inline__ __m512d __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_mov_pd(__mmask8 __U, __m512d __A) { + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, (__v8df)__A, + (__v8df)_mm512_setzero_pd()); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_mov_ps (__m512 __W, __mmask16 __U, __m512 __A) -{ - return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U, - (__v16sf) __A, - (__v16sf) __W); +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_mask_mov_ps(__m512 __W, __mmask16 __U, __m512 __A) { + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)__A, + (__v16sf)__W); } -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_maskz_mov_ps (__mmask16 __U, __m512 __A) -{ - return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U, - (__v16sf) __A, - (__v16sf) _mm512_setzero_ps ()); +static __inline__ __m512 __DEFAULT_FN_ATTRS512_CONSTEXPR +_mm512_maskz_mov_ps(__mmask16 __U, __m512 __A) { + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, (__v16sf)__A, + (__v16sf)_mm512_setzero_ps()); } static __inline__ void __DEFAULT_FN_ATTRS512 diff --git a/clang/lib/Headers/avx512vlbitalgintrin.h b/clang/lib/Headers/avx512vlbitalgintrin.h index 739e78aab753..21bf858a20c5 100644 --- a/clang/lib/Headers/avx512vlbitalgintrin.h +++ b/clang/lib/Headers/avx512vlbitalgintrin.h @@ -33,91 +33,67 @@ #endif static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR -_mm256_popcnt_epi16(__m256i __A) -{ +_mm256_popcnt_epi16(__m256i __A) { return 
(__m256i)__builtin_elementwise_popcount((__v16hu)__A); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_popcnt_epi16(__m256i __A, __mmask16 __U, __m256i __B) -{ - return (__m256i) __builtin_ia32_selectw_256((__mmask16) __U, - (__v16hi) _mm256_popcnt_epi16(__B), - (__v16hi) __A); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_popcnt_epi16(__mmask16 __U, __m256i __B) -{ - return _mm256_mask_popcnt_epi16((__m256i) _mm256_setzero_si256(), - __U, - __B); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR -_mm_popcnt_epi16(__m128i __A) -{ - return (__m128i)__builtin_elementwise_popcount((__v8hu)__A); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_popcnt_epi16(__m128i __A, __mmask8 __U, __m128i __B) -{ - return (__m128i) __builtin_ia32_selectw_128((__mmask8) __U, - (__v8hi) _mm_popcnt_epi16(__B), - (__v8hi) __A); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_popcnt_epi16(__mmask8 __U, __m128i __B) -{ - return _mm_mask_popcnt_epi16((__m128i) _mm_setzero_si128(), - __U, - __B); +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_popcnt_epi16(__m256i __A, __mmask16 __U, __m256i __B) { + return (__m256i)__builtin_ia32_selectw_256( + (__mmask16)__U, (__v16hi)_mm256_popcnt_epi16(__B), (__v16hi)__A); } static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR -_mm256_popcnt_epi8(__m256i __A) -{ - return (__m256i)__builtin_elementwise_popcount((__v32qu)__A); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_popcnt_epi8(__m256i __A, __mmask32 __U, __m256i __B) -{ - return (__m256i) __builtin_ia32_selectb_256((__mmask32) __U, - (__v32qi) _mm256_popcnt_epi8(__B), - (__v32qi) __A); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_popcnt_epi8(__mmask32 __U, __m256i __B) -{ - return _mm256_mask_popcnt_epi8((__m256i) _mm256_setzero_si256(), - __U, - __B); +_mm256_maskz_popcnt_epi16(__mmask16 __U, __m256i __B) { + return 
_mm256_mask_popcnt_epi16((__m256i)_mm256_setzero_si256(), __U, __B); } static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR -_mm_popcnt_epi8(__m128i __A) -{ +_mm_popcnt_epi16(__m128i __A) { + return (__m128i)__builtin_elementwise_popcount((__v8hu)__A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_popcnt_epi16(__m128i __A, __mmask8 __U, __m128i __B) { + return (__m128i)__builtin_ia32_selectw_128( + (__mmask8)__U, (__v8hi)_mm_popcnt_epi16(__B), (__v8hi)__A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_popcnt_epi16(__mmask8 __U, __m128i __B) { + return _mm_mask_popcnt_epi16((__m128i)_mm_setzero_si128(), __U, __B); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_popcnt_epi8(__m256i __A) { + return (__m256i)__builtin_elementwise_popcount((__v32qu)__A); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_popcnt_epi8(__m256i __A, __mmask32 __U, __m256i __B) { + return (__m256i)__builtin_ia32_selectb_256( + (__mmask32)__U, (__v32qi)_mm256_popcnt_epi8(__B), (__v32qi)__A); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_popcnt_epi8(__mmask32 __U, __m256i __B) { + return _mm256_mask_popcnt_epi8((__m256i)_mm256_setzero_si256(), __U, __B); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_popcnt_epi8(__m128i __A) { return (__m128i)__builtin_elementwise_popcount((__v16qu)__A); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_popcnt_epi8(__m128i __A, __mmask16 __U, __m128i __B) -{ - return (__m128i) __builtin_ia32_selectb_128((__mmask16) __U, - (__v16qi) _mm_popcnt_epi8(__B), - (__v16qi) __A); +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_popcnt_epi8(__m128i __A, __mmask16 __U, __m128i __B) { + return (__m128i)__builtin_ia32_selectb_128( + (__mmask16)__U, (__v16qi)_mm_popcnt_epi8(__B), (__v16qi)__A); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_popcnt_epi8(__mmask16 
__U, __m128i __B) -{ - return _mm_mask_popcnt_epi8((__m128i) _mm_setzero_si128(), - __U, - __B); +static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_popcnt_epi8(__mmask16 __U, __m128i __B) { + return _mm_mask_popcnt_epi8((__m128i)_mm_setzero_si128(), __U, __B); } static __inline__ __mmask32 __DEFAULT_FN_ATTRS256 diff --git a/clang/lib/Headers/avx512vlintrin.h b/clang/lib/Headers/avx512vlintrin.h index 09b76d46efaf..a1f2a1c92a86 100644 --- a/clang/lib/Headers/avx512vlintrin.h +++ b/clang/lib/Headers/avx512vlintrin.h @@ -8264,68 +8264,52 @@ _mm256_maskz_moveldup_ps (__mmask8 __U, __m256 __A) (__v4si)_mm_shuffle_epi32((A), (I)), \ (__v4si)_mm_setzero_si128())) -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_mov_pd (__m128d __W, __mmask8 __U, __m128d __A) -{ - return (__m128d) __builtin_ia32_selectpd_128 ((__mmask8) __U, - (__v2df) __A, - (__v2df) __W); +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_mov_pd(__m128d __W, __mmask8 __U, __m128d __A) { + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, (__v2df)__A, + (__v2df)__W); } -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_mov_pd (__mmask8 __U, __m128d __A) -{ - return (__m128d) __builtin_ia32_selectpd_128 ((__mmask8) __U, - (__v2df) __A, - (__v2df) _mm_setzero_pd ()); +static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_mov_pd(__mmask8 __U, __m128d __A) { + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, (__v2df)__A, + (__v2df)_mm_setzero_pd()); } -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_mask_mov_pd (__m256d __W, __mmask8 __U, __m256d __A) -{ - return (__m256d) __builtin_ia32_selectpd_256 ((__mmask8) __U, - (__v4df) __A, - (__v4df) __W); +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_mov_pd(__m256d __W, __mmask8 __U, __m256d __A) { + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, (__v4df)__A, + (__v4df)__W); } -static __inline__ __m256d __DEFAULT_FN_ATTRS256 
-_mm256_maskz_mov_pd (__mmask8 __U, __m256d __A) -{ - return (__m256d) __builtin_ia32_selectpd_256 ((__mmask8) __U, - (__v4df) __A, - (__v4df) _mm256_setzero_pd ()); +static __inline__ __m256d __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_mov_pd(__mmask8 __U, __m256d __A) { + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, (__v4df)__A, + (__v4df)_mm256_setzero_pd()); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_mov_ps (__m128 __W, __mmask8 __U, __m128 __A) -{ - return (__m128) __builtin_ia32_selectps_128 ((__mmask8) __U, - (__v4sf) __A, - (__v4sf) __W); +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_mask_mov_ps(__m128 __W, __mmask8 __U, __m128 __A) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)__A, + (__v4sf)__W); } -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_mov_ps (__mmask8 __U, __m128 __A) -{ - return (__m128) __builtin_ia32_selectps_128 ((__mmask8) __U, - (__v4sf) __A, - (__v4sf) _mm_setzero_ps ()); +static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR +_mm_maskz_mov_ps(__mmask8 __U, __m128 __A) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, (__v4sf)__A, + (__v4sf)_mm_setzero_ps()); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_mask_mov_ps (__m256 __W, __mmask8 __U, __m256 __A) -{ - return (__m256) __builtin_ia32_selectps_256 ((__mmask8) __U, - (__v8sf) __A, - (__v8sf) __W); +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_mask_mov_ps(__m256 __W, __mmask8 __U, __m256 __A) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, (__v8sf)__A, + (__v8sf)__W); } -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_maskz_mov_ps (__mmask8 __U, __m256 __A) -{ - return (__m256) __builtin_ia32_selectps_256 ((__mmask8) __U, - (__v8sf) __A, - (__v8sf) _mm256_setzero_ps ()); +static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_maskz_mov_ps(__mmask8 __U, __m256 __A) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, 
(__v8sf)__A, + (__v8sf)_mm256_setzero_ps()); } static __inline__ __m128 __DEFAULT_FN_ATTRS128 @@ -8388,7 +8372,6 @@ _mm256_maskz_cvtph_ps (__mmask8 __U, __m128i __A) #define _mm256_mask_cvtps_ph _mm256_mask_cvt_roundps_ph #define _mm256_maskz_cvtps_ph _mm256_maskz_cvt_roundps_ph - #undef __DEFAULT_FN_ATTRS128 #undef __DEFAULT_FN_ATTRS256 #undef __DEFAULT_FN_ATTRS256_CONSTEXPR diff --git a/clang/lib/Headers/avx512vpopcntdqintrin.h b/clang/lib/Headers/avx512vpopcntdqintrin.h index 79fc6e140c61..ac71808f6980 100644 --- a/clang/lib/Headers/avx512vpopcntdqintrin.h +++ b/clang/lib/Headers/avx512vpopcntdqintrin.h @@ -16,19 +16,19 @@ #define __AVX512VPOPCNTDQINTRIN_H /* Define the default attributes for the functions in this file. */ +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, \ + __target__("avx512vpopcntdq,evex512"), \ + __min_vector_width__(512))) constexpr +#else #define __DEFAULT_FN_ATTRS \ __attribute__((__always_inline__, __nodebug__, \ __target__("avx512vpopcntdq,evex512"), \ __min_vector_width__(512))) - -#if defined(__cplusplus) && (__cplusplus >= 201103L) -#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr -#else -#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS #endif -static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR -_mm512_popcnt_epi64(__m512i __A) { +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_popcnt_epi64(__m512i __A) { return (__m512i)__builtin_elementwise_popcount((__v8du)__A); } @@ -43,8 +43,7 @@ _mm512_maskz_popcnt_epi64(__mmask8 __U, __m512i __A) { return _mm512_mask_popcnt_epi64((__m512i)_mm512_setzero_si512(), __U, __A); } -static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR -_mm512_popcnt_epi32(__m512i __A) { +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_popcnt_epi32(__m512i __A) { return (__m512i)__builtin_elementwise_popcount((__v16su)__A); } @@ -60,6 +59,5 @@ _mm512_maskz_popcnt_epi32(__mmask16 __U, __m512i __A) { 
} #undef __DEFAULT_FN_ATTRS -#undef __DEFAULT_FN_ATTRS_CONSTEXPR #endif diff --git a/clang/lib/Headers/avx512vpopcntdqvlintrin.h b/clang/lib/Headers/avx512vpopcntdqvlintrin.h index d14cb1eb31f1..bed951b764cf 100644 --- a/clang/lib/Headers/avx512vpopcntdqvlintrin.h +++ b/clang/lib/Headers/avx512vpopcntdqvlintrin.h @@ -16,6 +16,17 @@ #define __AVX512VPOPCNTDQVLINTRIN_H /* Define the default attributes for the functions in this file. */ + +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS128 \ + __attribute__((__always_inline__, __nodebug__, \ + __target__("avx512vpopcntdq,avx512vl,no-evex512"), \ + __min_vector_width__(128))) constexpr +#define __DEFAULT_FN_ATTRS256 \ + __attribute__((__always_inline__, __nodebug__, \ + __target__("avx512vpopcntdq,avx512vl,no-evex512"), \ + __min_vector_width__(256))) constexpr +#else #define __DEFAULT_FN_ATTRS128 \ __attribute__((__always_inline__, __nodebug__, \ __target__("avx512vpopcntdq,avx512vl,no-evex512"), \ @@ -24,17 +35,9 @@ __attribute__((__always_inline__, __nodebug__, \ __target__("avx512vpopcntdq,avx512vl,no-evex512"), \ __min_vector_width__(256))) - -#if defined(__cplusplus) && (__cplusplus >= 201103L) -#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr -#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr -#else -#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 -#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 #endif -static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR -_mm_popcnt_epi64(__m128i __A) { +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_popcnt_epi64(__m128i __A) { return (__m128i)__builtin_elementwise_popcount((__v2du)__A); } @@ -49,8 +52,7 @@ _mm_maskz_popcnt_epi64(__mmask8 __U, __m128i __A) { return _mm_mask_popcnt_epi64((__m128i)_mm_setzero_si128(), __U, __A); } -static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR -_mm_popcnt_epi32(__m128i __A) { +static __inline__ __m128i 
__DEFAULT_FN_ATTRS128 _mm_popcnt_epi32(__m128i __A) { return (__m128i)__builtin_elementwise_popcount((__v4su)__A); } @@ -65,7 +67,7 @@ _mm_maskz_popcnt_epi32(__mmask8 __U, __m128i __A) { return _mm_mask_popcnt_epi32((__m128i)_mm_setzero_si128(), __U, __A); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_popcnt_epi64(__m256i __A) { return (__m256i)__builtin_elementwise_popcount((__v4du)__A); } @@ -81,7 +83,7 @@ _mm256_maskz_popcnt_epi64(__mmask8 __U, __m256i __A) { return _mm256_mask_popcnt_epi64((__m256i)_mm256_setzero_si256(), __U, __A); } -static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_popcnt_epi32(__m256i __A) { return (__m256i)__builtin_elementwise_popcount((__v8su)__A); } @@ -99,7 +101,5 @@ _mm256_maskz_popcnt_epi32(__mmask8 __U, __m256i __A) { #undef __DEFAULT_FN_ATTRS128 #undef __DEFAULT_FN_ATTRS256 -#undef __DEFAULT_FN_ATTRS128_CONSTEXPR -#undef __DEFAULT_FN_ATTRS256_CONSTEXPR #endif diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h index 38dd462e650e..8b6b62458dac 100644 --- a/clang/lib/Headers/emmintrin.h +++ b/clang/lib/Headers/emmintrin.h @@ -62,6 +62,9 @@ typedef __bf16 __m128bh __attribute__((__vector_size__(16), __aligned__(16))); #define __trunc64(x) \ (__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0) +#define __zext128(x) \ + (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0, \ + 1, 2, 3) #define __anyext128(x) \ (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0, \ 1, -1, -1) @@ -2445,9 +2448,10 @@ _mm_mullo_epi16(__m128i __a, __m128i __b) { /// \param __b /// A 64-bit integer containing one of the source operands. /// \returns A 64-bit integer vector containing the product of both operands. 
-static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_mul_su32(__m64 __a, __m64 __b) { - return __trunc64(__builtin_ia32_pmuludq128((__v4si)__anyext128(__a), - (__v4si)__anyext128(__b))); +static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_mul_su32(__m64 __a, + __m64 __b) { + return __trunc64(__builtin_ia32_pmuludq128((__v4si)__zext128(__a), + (__v4si)__zext128(__b))); } /// Multiplies 32-bit unsigned integer values contained in the lower @@ -2463,8 +2467,8 @@ static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_mul_su32(__m64 __a, __m64 __b) { /// \param __b /// A [2 x i64] vector containing one of the source operands. /// \returns A [2 x i64] vector containing the product of both operands. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epu32(__m128i __a, - __m128i __b) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_mul_epu32(__m128i __a, __m128i __b) { return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b); } diff --git a/clang/lib/Headers/smmintrin.h b/clang/lib/Headers/smmintrin.h index c5075c419b70..57d0d329312a 100644 --- a/clang/lib/Headers/smmintrin.h +++ b/clang/lib/Headers/smmintrin.h @@ -567,8 +567,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi32(__m128i __V1, /// A 128-bit vector of [4 x i32]. /// \returns A 128-bit vector of [2 x i64] containing the products of both /// operands. 
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epi32(__m128i __V1, - __m128i __V2) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_mul_epi32(__m128i __V1, __m128i __V2) { return (__m128i)__builtin_ia32_pmuldq128((__v4si)__V1, (__v4si)__V2); } diff --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp index 1f695b4a8676..b282a600c0e5 100644 --- a/clang/lib/Lex/Lexer.cpp +++ b/clang/lib/Lex/Lexer.cpp @@ -174,8 +174,6 @@ void Lexer::InitLexer(const char *BufStart, const char *BufPtr, ExtendedTokenMode = 0; NewLinePtr = nullptr; - - IsFirstPPToken = true; } /// Lexer constructor - Create a new lexer object for the specified buffer @@ -3225,7 +3223,6 @@ std::optional Lexer::peekNextPPToken() { bool atStartOfLine = IsAtStartOfLine; bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine; bool leadingSpace = HasLeadingSpace; - bool isFirstPPToken = IsFirstPPToken; Token Tok; Lex(Tok); @@ -3236,7 +3233,6 @@ std::optional Lexer::peekNextPPToken() { HasLeadingSpace = leadingSpace; IsAtStartOfLine = atStartOfLine; IsAtPhysicalStartOfLine = atPhysicalStartOfLine; - IsFirstPPToken = isFirstPPToken; // Restore the lexer back to non-skipping mode. 
LexingRawMode = false; @@ -3726,11 +3722,6 @@ bool Lexer::Lex(Token &Result) { HasLeadingEmptyMacro = false; } - if (IsFirstPPToken) { - Result.setFlag(Token::FirstPPToken); - IsFirstPPToken = false; - } - bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine; IsAtPhysicalStartOfLine = false; bool isRawLex = isLexingRawMode(); diff --git a/clang/lib/Lex/Preprocessor.cpp b/clang/lib/Lex/Preprocessor.cpp index e278846f6f36..e003ad3a9557 100644 --- a/clang/lib/Lex/Preprocessor.cpp +++ b/clang/lib/Lex/Preprocessor.cpp @@ -43,6 +43,7 @@ #include "clang/Lex/MacroArgs.h" #include "clang/Lex/MacroInfo.h" #include "clang/Lex/ModuleLoader.h" +#include "clang/Lex/NoTrivialPPDirectiveTracer.h" #include "clang/Lex/Pragma.h" #include "clang/Lex/PreprocessingRecord.h" #include "clang/Lex/PreprocessorLexer.h" @@ -247,8 +248,6 @@ void Preprocessor::DumpToken(const Token &Tok, bool DumpFlags) const { llvm::errs() << " [LeadingSpace]"; if (Tok.isExpandDisabled()) llvm::errs() << " [ExpandDisabled]"; - if (Tok.isFirstPPToken()) - llvm::errs() << " [First pp-token]"; if (Tok.needsCleaning()) { const char *Start = SourceMgr.getCharacterData(Tok.getLocation()); llvm::errs() << " [UnClean='" << StringRef(Start, Tok.getLength()) @@ -577,8 +576,11 @@ void Preprocessor::EnterMainSourceFile() { // export module M; // error: module declaration must occur // // at the start of the translation unit. 
if (getLangOpts().CPlusPlusModules) { + auto Tracer = std::make_unique(*this); + DirTracer = Tracer.get(); + addPPCallbacks(std::move(Tracer)); std::optional FirstPPTok = CurLexer->peekNextPPToken(); - if (FirstPPTok && FirstPPTok->isFirstPPToken()) + if (FirstPPTok) FirstPPTokenLoc = FirstPPTok->getLocation(); } } @@ -940,6 +942,8 @@ void Preprocessor::Lex(Token &Result) { StdCXXImportSeqState.handleHeaderName(); break; case tok::kw_export: + if (hasSeenNoTrivialPPDirective()) + Result.setFlag(Token::HasSeenNoTrivialPPDirective); TrackGMFState.handleExport(); StdCXXImportSeqState.handleExport(); ModuleDeclState.handleExport(); @@ -968,6 +972,8 @@ void Preprocessor::Lex(Token &Result) { } break; } else if (Result.getIdentifierInfo() == getIdentifierInfo("module")) { + if (hasSeenNoTrivialPPDirective()) + Result.setFlag(Token::HasSeenNoTrivialPPDirective); TrackGMFState.handleModule(StdCXXImportSeqState.afterTopLevelSeq()); ModuleDeclState.handleModule(); break; @@ -1682,3 +1688,31 @@ const char *Preprocessor::getCheckPoint(FileID FID, const char *Start) const { return nullptr; } + +bool Preprocessor::hasSeenNoTrivialPPDirective() const { + return DirTracer && DirTracer->hasSeenNoTrivialPPDirective(); +} + +bool NoTrivialPPDirectiveTracer::hasSeenNoTrivialPPDirective() const { + return SeenNoTrivialPPDirective; +} + +void NoTrivialPPDirectiveTracer::setSeenNoTrivialPPDirective() { + if (InMainFile && !SeenNoTrivialPPDirective) + SeenNoTrivialPPDirective = true; +} + +void NoTrivialPPDirectiveTracer::LexedFileChanged( + FileID FID, LexedFileChangeReason Reason, + SrcMgr::CharacteristicKind FileType, FileID PrevFID, SourceLocation Loc) { + InMainFile = (FID == PP.getSourceManager().getMainFileID()); +} + +void NoTrivialPPDirectiveTracer::MacroExpands(const Token &MacroNameTok, + const MacroDefinition &MD, + SourceRange Range, + const MacroArgs *Args) { + // FIXME: Does only enable builtin macro expansion make sense? 
+ if (!MD.getMacroInfo()->isBuiltinMacro()) + setSeenNoTrivialPPDirective(); +} diff --git a/clang/lib/Parse/ParseHLSLRootSignature.cpp b/clang/lib/Parse/ParseHLSLRootSignature.cpp index 98dc458f7adc..5490c61f5235 100644 --- a/clang/lib/Parse/ParseHLSLRootSignature.cpp +++ b/clang/lib/Parse/ParseHLSLRootSignature.cpp @@ -234,15 +234,15 @@ std::optional RootSignatureParser::parseRootDescriptor() { default: llvm_unreachable("Switch for consumed token was not provided"); case TokenKind::kw_CBV: - Descriptor.Type = DescriptorType::CBuffer; + Descriptor.Type = ResourceClass::CBuffer; ExpectedReg = TokenKind::bReg; break; case TokenKind::kw_SRV: - Descriptor.Type = DescriptorType::SRV; + Descriptor.Type = ResourceClass::SRV; ExpectedReg = TokenKind::tReg; break; case TokenKind::kw_UAV: - Descriptor.Type = DescriptorType::UAV; + Descriptor.Type = ResourceClass::UAV; ExpectedReg = TokenKind::uReg; break; } @@ -360,19 +360,19 @@ RootSignatureParser::parseDescriptorTableClause() { default: llvm_unreachable("Switch for consumed token was not provided"); case TokenKind::kw_CBV: - Clause.Type = ClauseType::CBuffer; + Clause.Type = ResourceClass::CBuffer; ExpectedReg = TokenKind::bReg; break; case TokenKind::kw_SRV: - Clause.Type = ClauseType::SRV; + Clause.Type = ResourceClass::SRV; ExpectedReg = TokenKind::tReg; break; case TokenKind::kw_UAV: - Clause.Type = ClauseType::UAV; + Clause.Type = ResourceClass::UAV; ExpectedReg = TokenKind::uReg; break; case TokenKind::kw_Sampler: - Clause.Type = ClauseType::Sampler; + Clause.Type = ResourceClass::Sampler; ExpectedReg = TokenKind::sReg; break; } diff --git a/clang/lib/Parse/Parser.cpp b/clang/lib/Parse/Parser.cpp index 7d190ea99dba..a17398b84c6a 100644 --- a/clang/lib/Parse/Parser.cpp +++ b/clang/lib/Parse/Parser.cpp @@ -2363,9 +2363,10 @@ Parser::ParseModuleDecl(Sema::ModuleImportState &ImportState) { // Parse a global-module-fragment, if present. 
if (getLangOpts().CPlusPlusModules && Tok.is(tok::semi)) { SourceLocation SemiLoc = ConsumeToken(); - if (!Introducer.isFirstPPToken()) { + if (ImportState != Sema::ModuleImportState::FirstDecl || + Introducer.hasSeenNoTrivialPPDirective()) { Diag(StartLoc, diag::err_global_module_introducer_not_at_start) - << SourceRange(StartLoc, SemiLoc); + << SourceRange(StartLoc, SemiLoc); return nullptr; } if (MDK == Sema::ModuleDeclKind::Interface) { @@ -2420,7 +2421,8 @@ Parser::ParseModuleDecl(Sema::ModuleImportState &ImportState) { ExpectAndConsumeSemi(diag::err_module_expected_semi); return Actions.ActOnModuleDecl(StartLoc, ModuleLoc, MDK, Path, Partition, - ImportState, Introducer.isFirstPPToken()); + ImportState, + Introducer.hasSeenNoTrivialPPDirective()); } Decl *Parser::ParseModuleImport(SourceLocation AtLoc, diff --git a/clang/lib/Sema/AnalysisBasedWarnings.cpp b/clang/lib/Sema/AnalysisBasedWarnings.cpp index dd418f71861d..0b94b1044f07 100644 --- a/clang/lib/Sema/AnalysisBasedWarnings.cpp +++ b/clang/lib/Sema/AnalysisBasedWarnings.cpp @@ -2780,6 +2780,31 @@ public: } }; +namespace clang::lifetimes { +namespace { +class LifetimeSafetyReporterImpl : public LifetimeSafetyReporter { + +public: + LifetimeSafetyReporterImpl(Sema &S) : S(S) {} + + void reportUseAfterFree(const Expr *IssueExpr, const Expr *UseExpr, + SourceLocation FreeLoc, Confidence C) override { + S.Diag(IssueExpr->getExprLoc(), + C == Confidence::Definite + ? 
diag::warn_lifetime_safety_loan_expires_permissive + : diag::warn_lifetime_safety_loan_expires_strict) + << IssueExpr->getEndLoc(); + S.Diag(FreeLoc, diag::note_lifetime_safety_destroyed_here); + S.Diag(UseExpr->getExprLoc(), diag::note_lifetime_safety_used_here) + << UseExpr->getEndLoc(); + } + +private: + Sema &S; +}; +} // namespace +} // namespace clang::lifetimes + void clang::sema::AnalysisBasedWarnings::IssueWarnings( TranslationUnitDecl *TU) { if (!TU) @@ -3029,8 +3054,10 @@ void clang::sema::AnalysisBasedWarnings::IssueWarnings( // TODO: Enable lifetime safety analysis for other languages once it is // stable. if (EnableLifetimeSafetyAnalysis && S.getLangOpts().CPlusPlus) { - if (AC.getCFG()) - lifetimes::runLifetimeSafetyAnalysis(AC); + if (AC.getCFG()) { + lifetimes::LifetimeSafetyReporterImpl LifetimeSafetyReporter(S); + lifetimes::runLifetimeSafetyAnalysis(AC, &LifetimeSafetyReporter); + } } // Check for violations of "called once" parameter properties. if (S.getLangOpts().ObjC && !S.getLangOpts().CPlusPlus && diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 5001e080f946..8ddbaf34a7f4 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -20270,9 +20270,10 @@ Decl *Sema::ActOnEnumConstant(Scope *S, Decl *theEnumDecl, Decl *lastEnumConst, // different from T: // - every enumerator of every member of class T that is an unscoped // enumerated type - if (getLangOpts().CPlusPlus && !TheEnumDecl->isScoped()) - DiagnoseClassNameShadow(TheEnumDecl->getDeclContext(), - DeclarationNameInfo(Id, IdLoc)); + if (getLangOpts().CPlusPlus && !TheEnumDecl->isScoped() && + DiagnoseClassNameShadow(TheEnumDecl->getDeclContext(), + DeclarationNameInfo(Id, IdLoc))) + return nullptr; EnumConstantDecl *New = CheckEnumConstant(TheEnumDecl, LastEnumConst, IdLoc, Id, Val); diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp index dd66a5f15a97..30930d9cf48c 100644 --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ 
b/clang/lib/Sema/SemaDeclCXX.cpp @@ -1373,10 +1373,13 @@ static bool checkTupleLikeDecomposition(Sema &S, S.BuildReferenceType(T, E.get()->isLValue(), Loc, B->getDeclName()); if (RefType.isNull()) return true; - auto *RefVD = VarDecl::Create( - S.Context, Src->getDeclContext(), Loc, Loc, - B->getDeclName().getAsIdentifierInfo(), RefType, - S.Context.getTrivialTypeSourceInfo(T, Loc), Src->getStorageClass()); + + // Don't give this VarDecl a TypeSourceInfo, since this is a synthesized + // entity and this type was never written in source code. + auto *RefVD = + VarDecl::Create(S.Context, Src->getDeclContext(), Loc, Loc, + B->getDeclName().getAsIdentifierInfo(), RefType, + /*TInfo=*/nullptr, Src->getStorageClass()); RefVD->setLexicalDeclContext(Src->getLexicalDeclContext()); RefVD->setTSCSpec(Src->getTSCSpec()); RefVD->setImplicit(); diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index d7cca4bc65d2..60f9d449fc03 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -3294,8 +3294,9 @@ InitListChecker::CheckDesignatedInitializer(const InitializedEntity &Entity, if (StringLiteral *SL = dyn_cast(SubExpr)) { // Get the length of the string. uint64_t StrLen = SL->getLength(); - if (cast(AT)->getSize().ult(StrLen)) - StrLen = cast(AT)->getZExtSize(); + if (const auto *CAT = dyn_cast(AT); + CAT && CAT->getSize().ult(StrLen)) + StrLen = CAT->getZExtSize(); StructuredList->resizeInits(Context, StrLen); // Build a literal for each character in the string, and put them into @@ -3317,8 +3318,9 @@ InitListChecker::CheckDesignatedInitializer(const InitializedEntity &Entity, // Get the length of the string. 
uint64_t StrLen = Str.size(); - if (cast(AT)->getSize().ult(StrLen)) - StrLen = cast(AT)->getZExtSize(); + if (const auto *CAT = dyn_cast(AT); + CAT && CAT->getSize().ult(StrLen)) + StrLen = CAT->getZExtSize(); StructuredList->resizeInits(Context, StrLen); // Build a literal for each character in the string, and put them into diff --git a/clang/lib/Sema/SemaModule.cpp b/clang/lib/Sema/SemaModule.cpp index ff9f85f960d9..1ecc5c747695 100644 --- a/clang/lib/Sema/SemaModule.cpp +++ b/clang/lib/Sema/SemaModule.cpp @@ -265,10 +265,11 @@ Sema::DeclGroupPtrTy Sema::ActOnModuleDecl(SourceLocation StartLoc, SourceLocation ModuleLoc, ModuleDeclKind MDK, ModuleIdPath Path, ModuleIdPath Partition, ModuleImportState &ImportState, - bool IntroducerIsFirstPPToken) { + bool SeenNoTrivialPPDirective) { assert(getLangOpts().CPlusPlusModules && "should only have module decl in standard C++ modules"); + bool IsFirstDecl = ImportState == ModuleImportState::FirstDecl; bool SeenGMF = ImportState == ModuleImportState::GlobalFragment; // If any of the steps here fail, we count that as invalidating C++20 // module state; @@ -336,7 +337,8 @@ Sema::ActOnModuleDecl(SourceLocation StartLoc, SourceLocation ModuleLoc, // In C++20, A module directive may only appear as the first preprocessing // tokens in a file (excluding the global module fragment.). 
- if (getLangOpts().CPlusPlusModules && !IntroducerIsFirstPPToken && !SeenGMF) { + if (getLangOpts().CPlusPlusModules && + (!IsFirstDecl || SeenNoTrivialPPDirective) && !SeenGMF) { Diag(ModuleLoc, diag::err_module_decl_not_at_start); SourceLocation BeginLoc = PP.getMainFileFirstPPTokenLoc(); Diag(BeginLoc, diag::note_global_module_introducer_missing) diff --git a/clang/lib/Sema/SemaOpenACC.cpp b/clang/lib/Sema/SemaOpenACC.cpp index c2af456224be..07713992da35 100644 --- a/clang/lib/Sema/SemaOpenACC.cpp +++ b/clang/lib/Sema/SemaOpenACC.cpp @@ -1921,8 +1921,13 @@ void SemaOpenACC::ActOnVariableDeclarator(VarDecl *VD) { return; // This cast should be safe, since a static-local can only happen in a - // function declaration. - auto *ContextDecl = cast(getCurContext()); + // function declaration. However, in error cases (or perhaps ObjC/C++?), this + // could possibly be something like a 'block' decl, so if this is NOT a + // function decl, just give up. + auto *ContextDecl = dyn_cast(getCurContext()); + + if (!ContextDecl) + return; // OpenACC 3.3 2.15: // In C and C++, function static variables are not supported in functions to @@ -2674,7 +2679,10 @@ SemaOpenACC::CreateInitRecipe(OpenACCClauseKind CK, const Expr *VarExpr) { // DeclRefExpr). 
auto *Idx = IntegerLiteral::Create( - getASTContext(), llvm::APInt(sizeof(std::size_t) * 8, I), + getASTContext(), + llvm::APInt( + getASTContext().getTypeSize(getASTContext().getSizeType()), + I), getASTContext().getSizeType(), VarExpr->getBeginLoc()); Expr *Subscript = new (getASTContext()) ArraySubscriptExpr( diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp index f02a295220ef..6b423ce06523 100644 --- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp @@ -5669,7 +5669,7 @@ void Sema::InstantiateFunctionDefinition(SourceLocation PointOfInstantiation, }; Function->setDeclarationNameLoc(NameLocPointsToPattern()); - EnterExpressionEvaluationContext EvalContext( + EnterExpressionEvaluationContextForFunction EvalContext( *this, Sema::ExpressionEvaluationContext::PotentiallyEvaluated); Qualifiers ThisTypeQuals; diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index 1863e7f97e3f..055d3cd1a860 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -7668,8 +7668,11 @@ QualType TreeTransform::TransformDependentNameType( } else if (isa(Result)) { TLB.push(Result).set(TL.getElaboratedKeywordLoc(), QualifierLoc, TL.getNameLoc()); + } else if (isa(Result)) { + auto NewTL = TLB.push(Result); + NewTL.set(TL.getElaboratedKeywordLoc(), QualifierLoc, TL.getNameLoc()); } else { - DependentNameTypeLoc NewTL = TLB.push(Result); + auto NewTL = TLB.push(Result); NewTL.setElaboratedKeywordLoc(TL.getElaboratedKeywordLoc()); NewTL.setQualifierLoc(QualifierLoc); NewTL.setNameLoc(TL.getNameLoc()); diff --git a/clang/test/AST/ByteCode/builtin-functions.cpp b/clang/test/AST/ByteCode/builtin-functions.cpp index 1223cf8bdc74..3277ef65a880 100644 --- a/clang/test/AST/ByteCode/builtin-functions.cpp +++ b/clang/test/AST/ByteCode/builtin-functions.cpp @@ -21,6 +21,27 @@ #error "huh?" 
#endif + +inline constexpr void* operator new(__SIZE_TYPE__, void* p) noexcept { return p; } +namespace std { + using size_t = decltype(sizeof(0)); + template struct allocator { + constexpr T *allocate(size_t N) { + return (T*)__builtin_operator_new(sizeof(T) * N); // #alloc + } + constexpr void deallocate(void *p, __SIZE_TYPE__) { + __builtin_operator_delete(p); + } + }; +template +constexpr T* construct_at(T* p, Args&&... args) { return ::new((void*)p) T(static_cast(args)...); } + + template + constexpr void destroy_at(T* p) { + p->~T(); + } +} + extern "C" { typedef decltype(sizeof(int)) size_t; extern size_t wcslen(const wchar_t *p); @@ -1767,6 +1788,30 @@ namespace WithinLifetime { } } xstd; // both-error {{is not a constant expression}} \ // both-note {{in call to}} + + /// FIXME: We do not have per-element lifetime information for primitive arrays. + /// See https://github.com/llvm/llvm-project/issues/147528 + consteval bool test_dynamic(bool read_after_deallocate) { + std::allocator a; + int* p = a.allocate(1); // expected-note 2{{allocation performed here was not deallocated}} + // a.allocate starts the lifetime of an array, + // the complete object of *p has started its lifetime + if (__builtin_is_within_lifetime(p)) + return false; + std::construct_at(p); + if (!__builtin_is_within_lifetime(p)) + return false; + std::destroy_at(p); + if (__builtin_is_within_lifetime(p)) + return false; + a.deallocate(p, 1); + if (read_after_deallocate) + __builtin_is_within_lifetime(p); // ref-note {{read of heap allocated object that has been deleted}} + return true; + } + static_assert(test_dynamic(false)); // expected-error {{not an integral constant expression}} + static_assert(test_dynamic(true)); // both-error {{not an integral constant expression}} \ + // ref-note {{in call to}} } #ifdef __SIZEOF_INT128__ diff --git a/clang/test/AST/ByteCode/builtin-object-size-codegen.cpp b/clang/test/AST/ByteCode/builtin-object-size-codegen.cpp new file mode 100644 index 
000000000000..f6ddbeb12b7a --- /dev/null +++ b/clang/test/AST/ByteCode/builtin-object-size-codegen.cpp @@ -0,0 +1,87 @@ +// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -triple x86_64-apple-darwin -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple x86_64-apple-darwin -emit-llvm -o - %s | FileCheck %s + +void foo() { + struct A { char buf[16]; }; + struct B : A {}; + struct C { int i; B bs[1]; } *c; + + int gi; + // CHECK: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 false) + gi = __builtin_object_size(&c->bs[0], 0); + // CHECK: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 false) + gi = __builtin_object_size(&c->bs[0], 1); + // CHECK: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1 false) + gi = __builtin_object_size(&c->bs[0], 2); + // CHECK: store i32 16 + gi = __builtin_object_size(&c->bs[0], 3); +} + + +void foo2() { + struct A { int a; }; + struct B { int b; }; + struct C: public A, public B {}; + + C c; + + int gi; + // CHECK: store i32 8 + gi = __builtin_object_size(&c, 0); + // CHECK: store i32 8 + gi = __builtin_object_size((A*)&c, 0); + // CHECK: store i32 4 + gi = __builtin_object_size((B*)&c, 0); + + // CHECK: store i32 8 + gi = __builtin_object_size((char*)&c, 0); + // CHECK: store i32 8 + gi = __builtin_object_size((char*)(A*)&c, 0); + // CHECK: store i32 4 + gi = __builtin_object_size((char*)(B*)&c, 0); +} + + +typedef struct { + double c[0]; + float f; +} foofoo0_t; + +unsigned babar0(foofoo0_t *f) { + // CHECK: ret i32 0 + return __builtin_object_size(f->c, 1); +} + +void test2() { + struct A { char buf[16]; }; + struct B : A {}; + struct C { int i; B bs[1]; } *c; + + int gi; + // CHECK: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 false) + gi = __builtin_object_size(&c->bs[0], 0); + // CHECK: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 false) + gi = __builtin_object_size(&c->bs[0], 1); + // CHECK: call i64 
@llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1 false) + gi = __builtin_object_size(&c->bs[0], 2); + // CHECK: store i32 16 + gi = __builtin_object_size(&c->bs[0], 3); + + // CHECK: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 false) + gi = __builtin_object_size((A*)&c->bs[0], 0); + // CHECK: store i32 16 + gi = __builtin_object_size((A*)&c->bs[0], 1); + // CHECK: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1 false) + gi = __builtin_object_size(&c->bs[0].buf[0], 2); + // CHECK: store i32 16 + gi = __builtin_object_size(&c->bs[0].buf[0], 3); + + // CHECK: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 false, i1 true, i1 false) + gi = __builtin_object_size(&c->bs[0].buf[0], 0); + // CHECK: store i32 16 + gi = __builtin_object_size(&c->bs[0].buf[0], 1); + // CHECK: call i64 @llvm.objectsize.i64.p0(ptr %{{.*}}, i1 true, i1 true, i1 false) + gi = __builtin_object_size(&c->bs[0].buf[0], 2); + // CHECK: store i32 16 + gi = __builtin_object_size(&c->bs[0].buf[0], 3); +} diff --git a/clang/test/AST/ByteCode/c.c b/clang/test/AST/ByteCode/c.c index a7b1fe07f6d8..654b3da2b7d6 100644 --- a/clang/test/AST/ByteCode/c.c +++ b/clang/test/AST/ByteCode/c.c @@ -329,3 +329,12 @@ void foo3 (void) void* x = 0; void* y = &*x; } + +static void *FooTable[1] = { + [0] = (void *[1]) { // 1 + [0] = (void *[1]) { // 2 + [0] = (void *[1]) {} // pedantic-warning {{use of an empty initializer}} + }, + } +}; + diff --git a/clang/test/AST/ByteCode/functions.cpp b/clang/test/AST/ByteCode/functions.cpp index 3c00de0102e5..4f090842510e 100644 --- a/clang/test/AST/ByteCode/functions.cpp +++ b/clang/test/AST/ByteCode/functions.cpp @@ -713,3 +713,22 @@ namespace EnableIfWithTemporary { struct A { ~A(); }; int &h() __attribute__((enable_if((A(), true), ""))); // both-warning {{clang extension}} } + +namespace LocalVarForParmVarDecl { + struct Iter { + void *p; + }; + constexpr bool bar2(Iter A) { + return true; + } + constexpr bool bar(Iter A, bool b) 
{ + if (b) + return true; + + return bar(A, true); + } + constexpr int foo() { + return bar(Iter(), false); + } + static_assert(foo(), ""); +} diff --git a/clang/test/AST/ByteCode/lifetimes26.cpp b/clang/test/AST/ByteCode/lifetimes26.cpp index a5203ae77bc1..c3163f8a562b 100644 --- a/clang/test/AST/ByteCode/lifetimes26.cpp +++ b/clang/test/AST/ByteCode/lifetimes26.cpp @@ -17,8 +17,8 @@ namespace std { constexpr void *operator new(std::size_t, void *p) { return p; } namespace std { - template constexpr T *construct(T *p) { return new (p) T; } - template constexpr void destroy(T *p) { p->~T(); } + template constexpr T *construct_at(T *p) { return new (p) T; } + template constexpr void destroy_at(T *p) { p->~T(); } } constexpr bool foo() { @@ -43,7 +43,24 @@ constexpr void destroy_pointer() { using T = int*; T p; p.~T(); - std::construct(&p); + std::construct_at(&p); } static_assert((destroy_pointer(), true)); + +namespace DestroyArrayElem { + /// This is proof that std::destroy_at'ing an array element + /// ends the lifetime of the entire array. + /// See https://github.com/llvm/llvm-project/issues/147528 + /// Using destroy_at on array elements is currently a no-op due to this. 
+ constexpr int test() { + int a[4] = {}; + + std::destroy_at(&a[3]); + int r = a[1]; + std::construct_at(&a[3]); + + return r; + } + static_assert(test() == 0); +} diff --git a/clang/test/AST/ast-dump-comment.cpp b/clang/test/AST/ast-dump-comment.cpp index b67f79916d96..b5dbe2e317d8 100644 --- a/clang/test/AST/ast-dump-comment.cpp +++ b/clang/test/AST/ast-dump-comment.cpp @@ -132,8 +132,50 @@ void Test_TemplatedFunctionVariadic(int arg, ...); // CHECK-NEXT: ParagraphComment // CHECK-NEXT: TextComment{{.*}} Text=" More arguments" +/// \param[out] Aaa Short summary +int Test_HTMLSummaryTag(int Aaa); +// CHECK: FunctionDecl{{.*}}Test_HTMLSummaryTag +// CHECK: ParamCommandComment{{.*}} [out] explicitly Param="Aaa" +// CHECK-NEXT: ParagraphComment +// CHECK: HTMLStartTagComment{{.*}} Name="summary" +// CHECK-NEXT: TextComment{{.*}} Text="Short summary" +// CHECK-NEXT: HTMLEndTagComment{{.*}} Name="summary" + /// \thread_safe test for underscore in special command int Test_UnderscoreInSpecialCommand; // CHECK: VarDecl{{.*}}Test_UnderscoreInSpecialCommand 'int' // CHECK: InlineCommandComment{{.*}} Name="thread_safe" RenderNormal // CHECK-NEXT: TextComment{{.*}} Text=" test for underscore in special command" + +///
+/// +/// Summary +/// +///

Details

+///
+/// +/// Some highlighting +/// +///
+/// +///
Figure 1
+///
+int Test_AdditionalHTMLTags(int Aaa); +// CHECK: FunctionDecl{{.*}}Test_AdditionalHTMLTags 'int (int)' +// CHECK: HTMLStartTagComment{{.*}} Name="details" +// CHECK: HTMLStartTagComment{{.*}} Name="summary" +// CHECK-NEXT: TextComment{{.*}} Text=" Summary" +// CHECK: HTMLEndTagComment{{.*}} Name="summary" +// CHECK: HTMLStartTagComment{{.*}} Name="p" +// CHECK-NEXT: TextComment{{.*}} Text="Details" +// CHECK-NEXT: HTMLEndTagComment{{.*}} Name="p" +// CHECK: HTMLEndTagComment{{.*}} Name="details" +// CHECK: HTMLStartTagComment{{.*}} Name="mark" +// CHECK-NEXT: TextComment{{.*}} Text="highlighting" +// CHECK-NEXT: HTMLEndTagComment{{.*}} Name="mark" +// CHECK: HTMLStartTagComment{{.*}} Name="figure" +// CHECK: HTMLStartTagComment{{.*}} Name="img" Attrs: "src="pic.jpg" +// CHECK: HTMLStartTagComment{{.*}} Name="figcaption" +// CHECK-NEXT: TextComment{{.*}} Text="Figure 1" +// CHECK-NEXT: HTMLEndTagComment{{.*}} Name="figcaption" +// CHECK: HTMLEndTagComment{{.*}} Name="figure" diff --git a/clang/test/Analysis/anonymous-decls.cpp b/clang/test/Analysis/anonymous-decls.cpp index 3f972a33aa62..76e5155b61b6 100644 --- a/clang/test/Analysis/anonymous-decls.cpp +++ b/clang/test/Analysis/anonymous-decls.cpp @@ -78,12 +78,12 @@ int main() { // CHECK-NEXT: 8: decomposition-a-b // CHECK-NEXT: 9: [B3.7]([B3.8]) // CHECK-NEXT: 10: [B3.9] -// CHECK-NEXT: 11: std::tuple_element<0UL, std::pair>::type a = get<0UL>(decomposition-a-b); +// CHECK-NEXT: 11: std::tuple_element<0UL, std::pair>::type &&a = get<0UL>(decomposition-a-b); // CHECK-NEXT: 12: get<1UL> // CHECK-NEXT: 13: [B3.12] (ImplicitCastExpr, FunctionToPointerDecay, tuple_element<1L, pair >::type (*)(pair &)) // CHECK-NEXT: 14: decomposition-a-b // CHECK-NEXT: 15: [B3.13]([B3.14]) // CHECK-NEXT: 16: [B3.15] -// CHECK-NEXT: 17: std::tuple_element<1UL, std::pair>::type b = get<1UL>(decomposition-a-b); +// CHECK-NEXT: 17: std::tuple_element<1UL, std::pair>::type &&b = get<1UL>(decomposition-a-b); // CHECK-NEXT: Preds (1): B1 // 
CHECK-NEXT: Succs (1): B2 diff --git a/clang/test/CIR/CodeGen/builtins.cpp b/clang/test/CIR/CodeGen/builtins.cpp index 3d43821af4e5..0e434809fe6b 100644 --- a/clang/test/CIR/CodeGen/builtins.cpp +++ b/clang/test/CIR/CodeGen/builtins.cpp @@ -12,3 +12,31 @@ double fabs(double x) { // CIR: {{.*}} = cir.fabs {{.*}} : !cir.double // LLVM: {{.*}} = call double @llvm.fabs.f64(double {{.*}}) // OGCG: {{.*}} = call double @llvm.fabs.f64(double {{.*}}) + +extern "C" void *test_return_address(void) { + return __builtin_return_address(1); + + // CIR-LABEL: test_return_address + // CIR: [[ARG:%.*]] = cir.const #cir.int<1> : !u32i + // CIR: {{%.*}} = cir.return_address([[ARG]]) + + // LLVM-LABEL: @test_return_address + // LLVM: {{%.*}} = call ptr @llvm.returnaddress(i32 1) + + // OGCG-LABEL: @test_return_address + // OGCG: {{%.*}} = call ptr @llvm.returnaddress(i32 1) +} + +extern "C" void *test_frame_address(void) { + return __builtin_frame_address(1); + + // CIR-LABEL: test_frame_address + // CIR: [[ARG:%.*]] = cir.const #cir.int<1> : !u32i + // CIR: {{%.*}} = cir.frame_address([[ARG]]) + + // LLVM-LABEL: @test_frame_address + // LLVM: {{%.*}} = call ptr @llvm.frameaddress.p0(i32 1) + + // OGCG-LABEL: @test_frame_address + // OGCG: {{%.*}} = call ptr @llvm.frameaddress.p0(i32 1) +} diff --git a/clang/test/CIR/CodeGen/goto.cpp b/clang/test/CIR/CodeGen/goto.cpp new file mode 100644 index 000000000000..13ca65344a15 --- /dev/null +++ b/clang/test/CIR/CodeGen/goto.cpp @@ -0,0 +1,210 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir +// RUN: FileCheck --input-file=%t.cir %s -check-prefix=CIR +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o %t.ll +// RUN: FileCheck --input-file=%t.ll %s --check-prefix=OGCG + +int shouldNotGenBranchRet(int x) { + if (x > 5) + goto err; + return 0; +err: + return -1; +} +// CIR: cir.func dso_local @_Z21shouldNotGenBranchReti +// CIR: cir.if {{.*}} { +// CIR: cir.goto "err" +// CIR: } +// CIR: 
[[ZERO:%.*]] = cir.const #cir.int<0> : !s32i +// CIR: cir.store [[ZERO]], [[RETVAL:%.*]] : !s32i, !cir.ptr +// CIR: cir.br ^bb1 +// CIR: ^bb1: +// CIR: [[RET:%.*]] = cir.load [[RETVAL]] : !cir.ptr, !s32i +// CIR: cir.return [[RET]] : !s32i +// CIR: ^bb2: +// CIR: cir.label "err" +// CIR: [[ONE:%.*]] = cir.const #cir.int<1> : !s32i +// CIR: [[MINUS:%.*]] = cir.unary(minus, [[ONE]]) nsw : !s32i, !s32i +// CIR: cir.store [[MINUS]], [[RETVAL]] : !s32i, !cir.ptr +// CIR: cir.br ^bb1 + +// OGCG: define dso_local noundef i32 @_Z21shouldNotGenBranchReti +// OGCG: if.then: +// OGCG: br label %err +// OGCG: if.end: +// OGCG: br label %return +// OGCG: err: +// OGCG: br label %return +// OGCG: return: + +int shouldGenBranch(int x) { + if (x > 5) + goto err; + x++; +err: + return -1; +} +// CIR: cir.func dso_local @_Z15shouldGenBranchi +// CIR: cir.if {{.*}} { +// CIR: cir.goto "err" +// CIR: } +// CIR: cir.br ^bb1 +// CIR: ^bb1: +// CIR: cir.label "err" + +// OGCG: define dso_local noundef i32 @_Z15shouldGenBranchi +// OGCG: if.then: +// OGCG: br label %err +// OGCG: if.end: +// OGCG: br label %err +// OGCG: err: +// OGCG: ret + +void severalLabelsInARow(int a) { + int b = a; + goto end1; + b = b + 1; + goto end2; +end1: +end2: + b = b + 2; +} +// CIR: cir.func dso_local @_Z19severalLabelsInARowi +// CIR: cir.goto "end1" +// CIR: ^bb[[#BLK1:]] +// CIR: cir.goto "end2" +// CIR: ^bb[[#BLK2:]]: +// CIR: cir.label "end1" +// CIR: cir.br ^bb[[#BLK3:]] +// CIR: ^bb[[#BLK3]]: +// CIR: cir.label "end2" + +// OGCG: define dso_local void @_Z19severalLabelsInARowi +// OGCG: br label %end1 +// OGCG: end1: +// OGCG: br label %end2 +// OGCG: end2: +// OGCG: ret + +void severalGotosInARow(int a) { + int b = a; + goto end; + goto end; +end: + b = b + 2; +} +// CIR: cir.func dso_local @_Z18severalGotosInARowi +// CIR: cir.goto "end" +// CIR: ^bb[[#BLK1:]]: +// CIR: cir.goto "end" +// CIR: ^bb[[#BLK2:]]: +// CIR: cir.label "end" + +// OGCG: define dso_local void @_Z18severalGotosInARowi(i32 
noundef %a) #0 { +// OGCG: br label %end +// OGCG: end: +// OGCG: ret void + +extern "C" void action1(); +extern "C" void action2(); +extern "C" void multiple_non_case(int v) { + switch (v) { + default: + action1(); + l2: + action2(); + break; + } +} + +// CIR: cir.func dso_local @multiple_non_case +// CIR: cir.switch +// CIR: cir.case(default, []) { +// CIR: cir.call @action1() +// CIR: cir.br ^[[BB1:[a-zA-Z0-9]+]] +// CIR: ^[[BB1]]: +// CIR: cir.label +// CIR: cir.call @action2() +// CIR: cir.break + +// OGCG: define dso_local void @multiple_non_case +// OGCG: sw.default: +// OGCG: call void @action1() +// OGCG: br label %l2 +// OGCG: l2: +// OGCG: call void @action2() +// OGCG: br label [[BREAK:%.*]] + +extern "C" void case_follow_label(int v) { + switch (v) { + case 1: + label: + case 2: + action1(); + break; + default: + action2(); + goto label; + } +} + +// CIR: cir.func dso_local @case_follow_label +// CIR: cir.switch +// CIR: cir.case(equal, [#cir.int<1> : !s32i]) { +// CIR: cir.label "label" +// CIR: cir.case(equal, [#cir.int<2> : !s32i]) { +// CIR: cir.call @action1() +// CIR: cir.break +// CIR: cir.case(default, []) { +// CIR: cir.call @action2() +// CIR: cir.goto "label" + +// OGCG: define dso_local void @case_follow_label +// OGCG: sw.bb: +// OGCG: br label %label +// OGCG: label: +// OGCG: br label %sw.bb1 +// OGCG: sw.bb1: +// OGCG: call void @action1() +// OGCG: br label %sw.epilog +// OGCG: sw.default: +// OGCG: call void @action2() +// OGCG: br label %label +// OGCG: sw.epilog: +// OGCG: ret void + +extern "C" void default_follow_label(int v) { + switch (v) { + case 1: + case 2: + action1(); + break; + label: + default: + action2(); + goto label; + } +} + +// CIR: cir.func dso_local @default_follow_label +// CIR: cir.switch +// CIR: cir.case(equal, [#cir.int<1> : !s32i]) { +// CIR: cir.yield +// CIR: cir.case(equal, [#cir.int<2> : !s32i]) { +// CIR: cir.call @action1() +// CIR: cir.break +// CIR: cir.label "label" +// CIR: cir.case(default, []) { 
+// CIR: cir.call @action2() +// CIR: cir.goto "label" + +// OGCG: define dso_local void @default_follow_label +// OGCG: sw.bb: +// OGCG: call void @action1() +// OGCG: br label %sw.epilog +// OGCG: label: +// OGCG: br label %sw.default +// OGCG: sw.default: +// OGCG: call void @action2() +// OGCG: br label %label +// OGCG: sw.epilog: +// OGCG: ret void diff --git a/clang/test/CIR/CodeGen/label.c b/clang/test/CIR/CodeGen/label.c index 2a515fc4046e..797c44475a62 100644 --- a/clang/test/CIR/CodeGen/label.c +++ b/clang/test/CIR/CodeGen/label.c @@ -101,3 +101,39 @@ void after_unreachable() { // OGCG: unreachable // OGCG: label: // OGCG: ret void + +void labelWithoutMatch() { +end: + return; +} +// CIR: cir.func no_proto dso_local @labelWithoutMatch +// CIR: cir.label "end" +// CIR: cir.return +// CIR: } + +// OGCG: define dso_local void @labelWithoutMatch +// OGCG: br label %end +// OGCG: end: +// OGCG: ret void + +struct S {}; +struct S get(); +void bar(struct S); + +void foo() { + { + label: + bar(get()); + } +} + +// CIR: cir.func no_proto dso_local @foo +// CIR: cir.scope { +// CIR: cir.label "label" +// CIR: %0 = cir.alloca !rec_S, !cir.ptr, ["agg.tmp0"] + +// OGCG: define dso_local void @foo() +// OGCG: %agg.tmp = alloca %struct.S, align 1 +// OGCG: %undef.agg.tmp = alloca %struct.S, align 1 +// OGCG: br label %label +// OGCG: label: diff --git a/clang/test/CIR/CodeGen/virtual-function-calls.cpp b/clang/test/CIR/CodeGen/virtual-function-calls.cpp index 004b6dab3056..4787d78aa0e3 100644 --- a/clang/test/CIR/CodeGen/virtual-function-calls.cpp +++ b/clang/test/CIR/CodeGen/virtual-function-calls.cpp @@ -27,8 +27,8 @@ A::A() {} // CIR: cir.store %arg0, %[[THIS_ADDR]] : !cir.ptr, !cir.ptr> // CIR: %[[THIS:.*]] = cir.load %[[THIS_ADDR]] : !cir.ptr>, !cir.ptr // CIR: %[[VPTR:.*]] = cir.vtable.address_point(@_ZTV1A, address_point = ) : !cir.vptr -// CIR: %[[THIS_VPTR_PTR:.*]] = cir.cast(bitcast, %[[THIS]] : !cir.ptr), !cir.ptr -// CIR: cir.store align(8) %[[VPTR]], 
%[[THIS_VPTR_PTR]] : !cir.vptr, !cir.ptr +// CIR: %[[THIS_VPTR_PTR:.*]] = cir.vtable.get_vptr %[[THIS]] : !cir.ptr -> !cir.ptr +// CIR: cir.store{{.*}} align(8) %[[VPTR]], %[[THIS_VPTR_PTR]] : !cir.vptr, !cir.ptr // CIR: cir.return // LLVM: define{{.*}} void @_ZN1AC2Ev(ptr %[[ARG0:.*]]) diff --git a/clang/test/CIR/CodeGenOpenACC/combined-firstprivate-clause.cpp b/clang/test/CIR/CodeGenOpenACC/combined-firstprivate-clause.cpp new file mode 100644 index 000000000000..7571e5e3306f --- /dev/null +++ b/clang/test/CIR/CodeGenOpenACC/combined-firstprivate-clause.cpp @@ -0,0 +1,571 @@ +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s + +struct NoCopyConstruct {}; + +struct CopyConstruct { + CopyConstruct() = default; + CopyConstruct(const CopyConstruct&); +}; + +struct NonDefaultCtor { + NonDefaultCtor(); +}; + +struct HasDtor { + ~HasDtor(); +}; + +// CHECK: acc.firstprivate.recipe @firstprivatization__ZTSA5_7HasDtor : !cir.ptr> init { +// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr> {{.*}}): +// CHECK-NEXT: cir.alloca !cir.array, !cir.ptr>, ["openacc.firstprivate.init"] +// CHECK-NEXT: acc.yield +// CHECK-NEXT: } copy { +// CHECK-NEXT: ^bb0(%[[ARG_FROM:.*]]: !cir.ptr> {{.*}}, %[[ARG_TO:.*]]: !cir.ptr> {{.*}}): +// +// CHECK-NEXT: acc.yield +// +// CHECK-NEXT: } destroy { +// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr> {{.*}}): +// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<4> : !u64i +// CHECK-NEXT: %[[ARRPTR:.*]] = cir.cast(array_to_ptrdecay, %[[ARG]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[ELEM:.*]] = cir.ptr_stride(%[[ARRPTR]] : !cir.ptr, %[[LAST_IDX]] : !u64i), !cir.ptr +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] +// CHECK-NEXT: cir.store %[[ELEM]], %[[ITR]] : !cir.ptr, !cir.ptr> +// CHECK-NEXT: cir.do { +// CHECK-NEXT: %[[ELEM_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr>, !cir.ptr +// CHECK-NEXT: cir.call 
@_ZN7HasDtorD1Ev(%[[ELEM_LOAD]]) nothrow : (!cir.ptr) -> () +// CHECK-NEXT: %[[NEG_ONE:.*]] = cir.const #cir.int<-1> : !s64i +// CHECK-NEXT: %[[PREVELEM:.*]] = cir.ptr_stride(%[[ELEM_LOAD]] : !cir.ptr, %[[NEG_ONE]] : !s64i), !cir.ptr +// CHECK-NEXT: cir.store %[[PREVELEM]], %[[ITR]] : !cir.ptr, !cir.ptr> +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } while { +// CHECK-NEXT: %[[ELEM_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr>, !cir.ptr +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[ELEM_LOAD]], %[[ARRPTR]]) : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } +// CHECK-NEXT: acc.yield +// CHECK-NEXT: } +// +// CHECK-NEXT: acc.firstprivate.recipe @firstprivatization__ZTSA5_14NonDefaultCtor : !cir.ptr> init { +// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr> {{.*}}): +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.firstprivate.init"] +// CHECK-NEXT: acc.yield +// CHECK-NEXT: } copy { +// CHECK-NEXT: ^bb0(%[[ARG_FORM:.*]]: !cir.ptr> {{.*}}, %[[ARG_TO:.*]]: !cir.ptr> {{.*}}): +// +// CHECK-NEXT: acc.yield +// CHECK-NEXT: } +// +// CHECK-NEXT: acc.firstprivate.recipe @firstprivatization__ZTSA5_13CopyConstruct : !cir.ptr> init { +// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr> {{.*}}): +// CHECK-NEXT: cir.alloca !cir.array, !cir.ptr>, ["openacc.firstprivate.init"] +// CHECK-NEXT: acc.yield +// CHECK-NEXT: } copy { +// CHECK-NEXT: ^bb0(%[[ARG_FORM:.*]]: !cir.ptr> {{.*}}, %[[ARG_TO:.*]]: !cir.ptr> {{.*}}): +// +// CHECK-NEXT: acc.yield +// CHECK-NEXT: } +// +// CHECK-NEXT: acc.firstprivate.recipe @firstprivatization__ZTSA5_15NoCopyConstruct : !cir.ptr> init { +// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr> {{.*}}): +// CHECK-NEXT: cir.alloca !cir.array, !cir.ptr>, ["openacc.firstprivate.init"] +// CHECK-NEXT: acc.yield +// CHECK-NEXT: } copy { +// CHECK-NEXT: ^bb0(%[[ARG_FORM:.*]]: !cir.ptr> {{.*}}, %[[ARG_TO:.*]]: !cir.ptr> {{.*}}): +// +// CHECK-NEXT: acc.yield +// CHECK-NEXT: } +// +// CHECK-NEXT: acc.firstprivate.recipe 
@firstprivatization__ZTSA5_f : !cir.ptr> init { +// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr> {{.*}}): +// CHECK-NEXT: cir.alloca !cir.array, !cir.ptr>, ["openacc.firstprivate.init"] +// CHECK-NEXT: acc.yield +// CHECK-NEXT: } copy { +// CHECK-NEXT: ^bb0(%[[ARG_FORM:.*]]: !cir.ptr> {{.*}}, %[[ARG_TO:.*]]: !cir.ptr> {{.*}}): +// +// CHECK-NEXT: acc.yield +// CHECK-NEXT: } +// +// CHECK-NEXT: acc.firstprivate.recipe @firstprivatization__ZTSA5_i : !cir.ptr> init { +// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr> {{.*}}): +// CHECK-NEXT: cir.alloca !cir.array, !cir.ptr>, ["openacc.firstprivate.init"] +// CHECK-NEXT: acc.yield +// CHECK-NEXT: } copy { +// CHECK-NEXT: ^bb0(%[[ARG_FORM:.*]]: !cir.ptr> {{.*}}, %[[ARG_TO:.*]]: !cir.ptr> {{.*}}): +// +// CHECK-NEXT: acc.yield +// CHECK-NEXT: } +// +// CHECK-NEXT: acc.firstprivate.recipe @firstprivatization__ZTS7HasDtor : !cir.ptr init { +// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr {{.*}}): +// CHECK-NEXT: cir.alloca !rec_HasDtor, !cir.ptr, ["openacc.firstprivate.init"] +// CHECK-NEXT: acc.yield +// CHECK-NEXT: } copy { +// CHECK-NEXT: ^bb0(%[[ARG_FROM:.*]]: !cir.ptr {{.*}}, %[[ARG_TO:.*]]: !cir.ptr {{.*}}): +// +// CHECK-NEXT: acc.yield +// CHECK-NEXT: } destroy { +// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr {{.*}}): +// CHECK-NEXT: cir.call @_ZN7HasDtorD1Ev(%[[ARG]]) nothrow : (!cir.ptr) -> () +// CHECK-NEXT: acc.yield +// CHECK-NEXT: } +// +// CHECK-NEXT: acc.firstprivate.recipe @firstprivatization__ZTS14NonDefaultCtor : !cir.ptr init { +// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr {{.*}}): +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_NonDefaultCtor, !cir.ptr, ["openacc.firstprivate.init"] +// CHECK-NEXT: acc.yield +// CHECK-NEXT: } copy { +// CHECK-NEXT: ^bb0(%[[ARG_FROM:.*]]: !cir.ptr {{.*}}, %[[ARG_TO:.*]]: !cir.ptr {{.*}}): +// +// CHECK-NEXT: acc.yield +// CHECK-NEXT: } +// +// CHECK-NEXT: acc.firstprivate.recipe @firstprivatization__ZTS13CopyConstruct : !cir.ptr init { +// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr {{.*}}): +// 
CHECK-NEXT: cir.alloca !rec_CopyConstruct, !cir.ptr, ["openacc.firstprivate.init"] +// CHECK-NEXT: acc.yield +// CHECK-NEXT: } copy { +// CHECK-NEXT: ^bb0(%[[ARG_FROM:.*]]: !cir.ptr {{.*}}, %[[ARG_TO:.*]]: !cir.ptr {{.*}}): +// +// CHECK-NEXT: acc.yield +// CHECK-NEXT: } +// +// CHECK-NEXT: acc.firstprivate.recipe @firstprivatization__ZTS15NoCopyConstruct : !cir.ptr init { +// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr {{.*}}): +// CHECK-NEXT: cir.alloca !rec_NoCopyConstruct, !cir.ptr, ["openacc.firstprivate.init"] +// CHECK-NEXT: acc.yield +// CHECK-NEXT: } copy { +// CHECK-NEXT: ^bb0(%[[ARG_FROM:.*]]: !cir.ptr {{.*}}, %[[ARG_TO:.*]]: !cir.ptr {{.*}}): +// +// CHECK-NEXT: acc.yield +// CHECK-NEXT: } +// +// CHECK-NEXT: acc.firstprivate.recipe @firstprivatization__ZTSf : !cir.ptr init { +// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr {{.*}}): +// CHECK-NEXT: cir.alloca !cir.float, !cir.ptr, ["openacc.firstprivate.init"] +// CHECK-NEXT: acc.yield +// CHECK-NEXT: } copy { +// CHECK-NEXT: ^bb0(%[[ARG_FROM:.*]]: !cir.ptr {{.*}}, %[[ARG_TO:.*]]: !cir.ptr {{.*}}): +// +// CHECK-NEXT: acc.yield +// CHECK-NEXT: } +// +// CHECK-NEXT: acc.firstprivate.recipe @firstprivatization__ZTSi : !cir.ptr init { +// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr {{.*}}): +// CHECK-NEXT: cir.alloca !s32i, !cir.ptr, ["openacc.firstprivate.init"] +// CHECK-NEXT: acc.yield +// CHECK-NEXT: } copy { +// CHECK-NEXT: ^bb0(%[[ARG_FROM:.*]]: !cir.ptr {{.*}}, %[[ARG_TO:.*]]: !cir.ptr {{.*}}): +// +// CHECK-NEXT: acc.yield +// CHECK-NEXT: } + +extern "C" void acc_combined() { + // CHECK: cir.func{{.*}} @acc_combined() { + + int someInt; + // CHECK-NEXT: %[[SOMEINT:.*]] = cir.alloca !s32i, !cir.ptr, ["someInt"] + float someFloat; + // CHECK-NEXT: %[[SOMEFLOAT:.*]] = cir.alloca !cir.float, !cir.ptr, ["someFloat"] + NoCopyConstruct noCopy; + // CHECK-NEXT: %[[NOCOPY:.*]] = cir.alloca !rec_NoCopyConstruct, !cir.ptr, ["noCopy"] + CopyConstruct hasCopy; + // CHECK-NEXT: %[[HASCOPY:.*]] = cir.alloca !rec_CopyConstruct, 
!cir.ptr, ["hasCopy"] + NonDefaultCtor notDefCtor; + // CHECK-NEXT: %[[NOTDEFCTOR:.*]] = cir.alloca !rec_NonDefaultCtor, !cir.ptr, ["notDefCtor", init] + HasDtor dtor; + // CHECK-NEXT: %[[DTOR:.*]] = cir.alloca !rec_HasDtor, !cir.ptr, ["dtor"] + int someIntArr[5]; + // CHECK-NEXT: %[[INTARR:.*]] = cir.alloca !cir.array, !cir.ptr>, ["someIntArr"] + float someFloatArr[5]; + // CHECK-NEXT: %[[FLOATARR:.*]] = cir.alloca !cir.array, !cir.ptr>, ["someFloatArr"] + NoCopyConstruct noCopyArr[5]; + // CHECK-NEXT: %[[NOCOPYARR:.*]] = cir.alloca !cir.array, !cir.ptr>, ["noCopyArr"] + CopyConstruct hasCopyArr[5]; + // CHECK-NEXT: %[[HASCOPYARR:.*]] = cir.alloca !cir.array, !cir.ptr>, ["hasCopyArr"] + NonDefaultCtor notDefCtorArr[5]; + // CHECK-NEXT: %[[NOTDEFCTORARR:.*]] = cir.alloca !cir.array, !cir.ptr>, ["notDefCtorArr", init] + HasDtor dtorArr[5]; + // CHECK-NEXT: %[[DTORARR:.*]] = cir.alloca !cir.array, !cir.ptr>, ["dtorArr"] + // CHECK-NEXT: cir.call @_ZN14NonDefaultCtorC1Ev(%[[NOTDEFCTOR]]) : (!cir.ptr) -> () + +#pragma acc parallel loop firstprivate(someInt) + for(int i = 0; i < 5; ++i); + // CHECK: %[[PRIVATE:.*]] = acc.firstprivate varPtr(%[[SOMEINT]] : !cir.ptr) -> !cir.ptr {name = "someInt"} + // CHECK-NEXT: acc.parallel combined(loop) firstprivate(@firstprivatization__ZTSi -> %[[PRIVATE]] : !cir.ptr) { + // CHECK-NEXT: acc.loop combined(parallel) + // CHECK: acc.yield + // CHECK-NEXT: } loc + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc +#pragma acc serial loop firstprivate(someFloat) + for(int i = 0; i < 5; ++i); + // CHECK-NEXT: %[[PRIVATE:.*]] = acc.firstprivate varPtr(%[[SOMEFLOAT]] : !cir.ptr) -> !cir.ptr {name = "someFloat"} + // CHECK-NEXT: acc.serial combined(loop) firstprivate(@firstprivatization__ZTSf -> %[[PRIVATE]] : !cir.ptr) { + // CHECK-NEXT: acc.loop combined(serial) + // CHECK: acc.yield + // CHECK-NEXT: } loc + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc + +#pragma acc parallel loop firstprivate(noCopy) + for(int i = 0; i < 5; ++i); + // 
CHECK-NEXT: %[[PRIVATE:.*]] = acc.firstprivate varPtr(%[[NOCOPY]] : !cir.ptr) -> !cir.ptr {name = "noCopy"} + // CHECK-NEXT: acc.parallel combined(loop) firstprivate(@firstprivatization__ZTS15NoCopyConstruct -> %[[PRIVATE]] : !cir.ptr) { + // CHECK-NEXT: acc.loop combined(parallel) + // CHECK: acc.yield + // CHECK-NEXT: } loc + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc +#pragma acc serial loop firstprivate(hasCopy) + for(int i = 0; i < 5; ++i); + // CHECK-NEXT: %[[PRIVATE:.*]] = acc.firstprivate varPtr(%[[HASCOPY]] : !cir.ptr) -> !cir.ptr {name = "hasCopy"} + // CHECK-NEXT: acc.serial combined(loop) firstprivate(@firstprivatization__ZTS13CopyConstruct -> %[[PRIVATE]] : !cir.ptr) { + // CHECK-NEXT: acc.loop combined(serial) + // CHECK: acc.yield + // CHECK-NEXT: } loc + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc +#pragma acc serial loop firstprivate(notDefCtor) + for(int i = 0; i < 5; ++i); + // CHECK-NEXT: %[[PRIVATE:.*]] = acc.firstprivate varPtr(%[[NOTDEFCTOR]] : !cir.ptr) -> !cir.ptr {name = "notDefCtor"} + // CHECK-NEXT: acc.serial combined(loop) firstprivate(@firstprivatization__ZTS14NonDefaultCtor -> %[[PRIVATE]] : !cir.ptr) { + // CHECK-NEXT: acc.loop combined(serial) + // CHECK: acc.yield + // CHECK-NEXT: } loc + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc +#pragma acc serial loop firstprivate(dtor) + for(int i = 0; i < 5; ++i); + // CHECK-NEXT: %[[PRIVATE:.*]] = acc.firstprivate varPtr(%[[DTOR]] : !cir.ptr) -> !cir.ptr {name = "dtor"} + // CHECK-NEXT: acc.serial combined(loop) firstprivate(@firstprivatization__ZTS7HasDtor -> %[[PRIVATE]] : !cir.ptr) { + // CHECK-NEXT: acc.loop combined(serial) + // CHECK: acc.yield + // CHECK-NEXT: } loc + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc + +#pragma acc parallel loop firstprivate(someInt, someFloat, noCopy, hasCopy, notDefCtor, dtor) + for(int i = 0; i < 5; ++i); + // CHECK: %[[PRIVATE1:.*]] = acc.firstprivate varPtr(%[[SOMEINT]] : !cir.ptr) -> !cir.ptr {name = "someInt"} + // CHECK-NEXT: 
%[[PRIVATE2:.*]] = acc.firstprivate varPtr(%[[SOMEFLOAT]] : !cir.ptr) -> !cir.ptr {name = "someFloat"} + // CHECK-NEXT: %[[PRIVATE3:.*]] = acc.firstprivate varPtr(%[[NOCOPY]] : !cir.ptr) -> !cir.ptr {name = "noCopy"} + // CHECK-NEXT: %[[PRIVATE4:.*]] = acc.firstprivate varPtr(%[[HASCOPY]] : !cir.ptr) -> !cir.ptr {name = "hasCopy"} + // CHECK-NEXT: %[[PRIVATE5:.*]] = acc.firstprivate varPtr(%[[NOTDEFCTOR]] : !cir.ptr) -> !cir.ptr {name = "notDefCtor"} + // CHECK-NEXT: %[[PRIVATE6:.*]] = acc.firstprivate varPtr(%[[DTOR]] : !cir.ptr) -> !cir.ptr {name = "dtor"} + // CHECK: acc.parallel combined(loop) firstprivate(@firstprivatization__ZTSi -> %[[PRIVATE1]] : !cir.ptr, + // CHECK-SAME: @firstprivatization__ZTSf -> %[[PRIVATE2]] : !cir.ptr, + // CHECK-SAME: @firstprivatization__ZTS15NoCopyConstruct -> %[[PRIVATE3]] : !cir.ptr, + // CHECK-SAME: @firstprivatization__ZTS13CopyConstruct -> %[[PRIVATE4]] : !cir.ptr, + // CHECK-SAME: @firstprivatization__ZTS14NonDefaultCtor -> %[[PRIVATE5]] : !cir.ptr, + // CHECK-SAME: @firstprivatization__ZTS7HasDtor -> %[[PRIVATE6]] : !cir.ptr) + // CHECK-NEXT: acc.loop combined(parallel) + // CHECK: acc.yield + // CHECK-NEXT: } loc + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc + +#pragma acc serial loop firstprivate(someIntArr[1]) + for(int i = 0; i < 5; ++i); + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 + // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 + // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 + // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) extent(%[[ONE_CONST]] : i64) stride(%[[ONE_CONST2]] : i64) startIdx(%[[ZERO_CONST]] : i64) + // CHECK-NEXT: %[[PRIVATE:.*]] = acc.firstprivate varPtr(%[[INTARR]] : !cir.ptr>) bounds(%[[BOUNDS]]) -> !cir.ptr> {name = "someIntArr[1]"} + // CHECK-NEXT: acc.serial combined(loop) 
firstprivate(@firstprivatization__ZTSA5_i -> %[[PRIVATE]] : !cir.ptr>) { + // CHECK-NEXT: acc.loop combined(serial) + // CHECK: acc.yield + // CHECK-NEXT: } loc + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc +#pragma acc parallel loop firstprivate(someFloatArr[1]) + for(int i = 0; i < 5; ++i); + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 + // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 + // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 + // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) extent(%[[ONE_CONST]] : i64) stride(%[[ONE_CONST2]] : i64) startIdx(%[[ZERO_CONST]] : i64) + // CHECK-NEXT: %[[PRIVATE:.*]] = acc.firstprivate varPtr(%[[FLOATARR]] : !cir.ptr>) bounds(%[[BOUNDS]]) -> !cir.ptr> {name = "someFloatArr[1]"} + // CHECK-NEXT: acc.parallel combined(loop) firstprivate(@firstprivatization__ZTSA5_f -> %[[PRIVATE]] : !cir.ptr>) { + // CHECK-NEXT: acc.loop combined(parallel) + // CHECK: acc.yield + // CHECK-NEXT: } loc + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc +#pragma acc serial loop firstprivate(noCopyArr[1]) + for(int i = 0; i < 5; ++i); + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 + // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 + // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 + // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) extent(%[[ONE_CONST]] : i64) stride(%[[ONE_CONST2]] : i64) startIdx(%[[ZERO_CONST]] : i64) + // CHECK-NEXT: %[[PRIVATE:.*]] = acc.firstprivate varPtr(%[[NOCOPYARR]] : !cir.ptr>) bounds(%[[BOUNDS]]) -> !cir.ptr> {name = "noCopyArr[1]"} + // CHECK-NEXT: acc.serial combined(loop) 
firstprivate(@firstprivatization__ZTSA5_15NoCopyConstruct -> %[[PRIVATE]] : !cir.ptr>) { + // CHECK-NEXT: acc.loop combined(serial) + // CHECK: acc.yield + // CHECK-NEXT: } loc + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc +#pragma acc parallel loop firstprivate(hasCopyArr[1]) + for(int i = 0; i < 5; ++i); + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 + // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 + // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 + // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) extent(%[[ONE_CONST]] : i64) stride(%[[ONE_CONST2]] : i64) startIdx(%[[ZERO_CONST]] : i64) + // CHECK-NEXT: %[[PRIVATE:.*]] = acc.firstprivate varPtr(%[[HASCOPYARR]] : !cir.ptr>) bounds(%[[BOUNDS]]) -> !cir.ptr> {name = "hasCopyArr[1]"} + // CHECK-NEXT: acc.parallel combined(loop) firstprivate(@firstprivatization__ZTSA5_13CopyConstruct -> %[[PRIVATE]] : !cir.ptr>) { + // CHECK-NEXT: acc.loop combined(parallel) + // CHECK: acc.yield + // CHECK-NEXT: } loc + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc +#pragma acc parallel loop firstprivate(notDefCtorArr[1]) + for(int i = 0; i < 5; ++i); + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 + // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 + // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 + // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) extent(%[[ONE_CONST]] : i64) stride(%[[ONE_CONST2]] : i64) startIdx(%[[ZERO_CONST]] : i64) + // CHECK-NEXT: %[[PRIVATE:.*]] = acc.firstprivate varPtr(%[[NOTDEFCTORARR]] : !cir.ptr>) bounds(%[[BOUNDS]]) -> !cir.ptr> {name = "notDefCtorArr[1]"} + // CHECK-NEXT: acc.parallel combined(loop) 
firstprivate(@firstprivatization__ZTSA5_14NonDefaultCtor -> %[[PRIVATE]] : !cir.ptr>) { + // CHECK-NEXT: acc.loop combined(parallel) + // CHECK: acc.yield + // CHECK-NEXT: } loc + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc +#pragma acc parallel loop firstprivate(dtorArr[1]) + for(int i = 0; i < 5; ++i); + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 + // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 + // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 + // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) extent(%[[ONE_CONST]] : i64) stride(%[[ONE_CONST2]] : i64) startIdx(%[[ZERO_CONST]] : i64) + // CHECK-NEXT: %[[PRIVATE:.*]] = acc.firstprivate varPtr(%[[DTORARR]] : !cir.ptr>) bounds(%[[BOUNDS]]) -> !cir.ptr> {name = "dtorArr[1]"} + // CHECK-NEXT: acc.parallel combined(loop) firstprivate(@firstprivatization__ZTSA5_7HasDtor -> %[[PRIVATE]] : !cir.ptr>) { + // CHECK-NEXT: acc.loop combined(parallel) + // CHECK: acc.yield + // CHECK-NEXT: } loc + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc +#pragma acc serial loop firstprivate(someIntArr[1], someFloatArr[1], noCopyArr[1], hasCopyArr[1], notDefCtorArr[1], dtorArr[1]) + for(int i = 0; i < 5; ++i); + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 + // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 + // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 + // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) extent(%[[ONE_CONST]] : i64) stride(%[[ONE_CONST2]] : i64) startIdx(%[[ZERO_CONST]] : i64) + // CHECK-NEXT: %[[PRIVATE1:.*]] = acc.firstprivate varPtr(%[[INTARR]] : !cir.ptr>) bounds(%[[BOUNDS]]) -> !cir.ptr> {name = "someIntArr[1]"} + 
// CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 + // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 + // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 + // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) extent(%[[ONE_CONST]] : i64) stride(%[[ONE_CONST2]] : i64) startIdx(%[[ZERO_CONST]] : i64) + // CHECK-NEXT: %[[PRIVATE2:.*]] = acc.firstprivate varPtr(%[[FLOATARR]] : !cir.ptr>) bounds(%[[BOUNDS]]) -> !cir.ptr> {name = "someFloatArr[1]"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 + // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 + // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 + // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) extent(%[[ONE_CONST]] : i64) stride(%[[ONE_CONST2]] : i64) startIdx(%[[ZERO_CONST]] : i64) + // CHECK-NEXT: %[[PRIVATE3:.*]] = acc.firstprivate varPtr(%[[NOCOPYARR]] : !cir.ptr>) bounds(%[[BOUNDS]]) -> !cir.ptr> {name = "noCopyArr[1]"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 + // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 + // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 + // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) extent(%[[ONE_CONST]] : i64) stride(%[[ONE_CONST2]] : i64) startIdx(%[[ZERO_CONST]] : i64) + // CHECK-NEXT: %[[PRIVATE4:.*]] = acc.firstprivate varPtr(%[[HASCOPYARR]] : !cir.ptr>) bounds(%[[BOUNDS]]) -> !cir.ptr> {name = "hasCopyArr[1]"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: 
%[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 + // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 + // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 + // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) extent(%[[ONE_CONST]] : i64) stride(%[[ONE_CONST2]] : i64) startIdx(%[[ZERO_CONST]] : i64) + // CHECK-NEXT: %[[PRIVATE5:.*]] = acc.firstprivate varPtr(%[[NOTDEFCTORARR]] : !cir.ptr>) bounds(%[[BOUNDS]]) -> !cir.ptr> {name = "notDefCtorArr[1]"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 + // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 + // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 + // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) extent(%[[ONE_CONST]] : i64) stride(%[[ONE_CONST2]] : i64) startIdx(%[[ZERO_CONST]] : i64) + // CHECK-NEXT: %[[PRIVATE6:.*]] = acc.firstprivate varPtr(%[[DTORARR]] : !cir.ptr>) bounds(%[[BOUNDS]]) -> !cir.ptr> {name = "dtorArr[1]"} + // CHECK-NEXT: acc.serial combined(loop) firstprivate(@firstprivatization__ZTSA5_i -> %[[PRIVATE1]] : !cir.ptr>, + // CHECK-SAME: @firstprivatization__ZTSA5_f -> %[[PRIVATE2]] : !cir.ptr>, + // CHECK-SAME: @firstprivatization__ZTSA5_15NoCopyConstruct -> %[[PRIVATE3]] : !cir.ptr>, + // CHECK-SAME: @firstprivatization__ZTSA5_13CopyConstruct -> %[[PRIVATE4]] : !cir.ptr>, + // CHECK-SAME: @firstprivatization__ZTSA5_14NonDefaultCtor -> %[[PRIVATE5]] : !cir.ptr>, + // CHECK-SAME: @firstprivatization__ZTSA5_7HasDtor -> %[[PRIVATE6]] : !cir.ptr>) + // CHECK-NEXT: acc.loop combined(serial) + // CHECK: acc.yield + // CHECK-NEXT: } loc + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc + +#pragma acc parallel loop firstprivate(someIntArr[1:1]) + for(int i = 0; i < 5; ++i); + // CHECK-NEXT: %[[ONE:.*]] 
= cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST2:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 + // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 + // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) extent(%[[ONE_CAST2]] : si32) stride(%[[ONE_CONST2]] : i64) startIdx(%[[ZERO_CONST]] : i64) + // CHECK-NEXT: %[[PRIVATE:.*]] = acc.firstprivate varPtr(%[[INTARR]] : !cir.ptr>) bounds(%[[BOUNDS]]) -> !cir.ptr> {name = "someIntArr[1:1]"} + // CHECK-NEXT: acc.parallel combined(loop) firstprivate(@firstprivatization__ZTSA5_i -> %[[PRIVATE]] : !cir.ptr>) { + // CHECK-NEXT: acc.loop combined(parallel) + // CHECK: acc.yield + // CHECK-NEXT: } loc + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc +#pragma acc serial loop firstprivate(someFloatArr[1:1]) + for(int i = 0; i < 5; ++i); + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST2:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 + // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 + // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) extent(%[[ONE_CAST2]] : si32) stride(%[[ONE_CONST2]] : i64) startIdx(%[[ZERO_CONST]] : i64) + // CHECK-NEXT: %[[PRIVATE:.*]] = acc.firstprivate varPtr(%[[FLOATARR]] : !cir.ptr>) bounds(%[[BOUNDS]]) -> !cir.ptr> {name = "someFloatArr[1:1]"} + // CHECK-NEXT: acc.serial combined(loop) firstprivate(@firstprivatization__ZTSA5_f -> %[[PRIVATE]] : !cir.ptr>) { + // CHECK-NEXT: acc.loop combined(serial) + // CHECK: acc.yield + // CHECK-NEXT: } 
loc + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc +#pragma acc parallel loop firstprivate(noCopyArr[1:1]) + for(int i = 0; i < 5; ++i); + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST2:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 + // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 + // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) extent(%[[ONE_CAST2]] : si32) stride(%[[ONE_CONST2]] : i64) startIdx(%[[ZERO_CONST]] : i64) + // CHECK-NEXT: %[[PRIVATE:.*]] = acc.firstprivate varPtr(%[[NOCOPYARR]] : !cir.ptr>) bounds(%[[BOUNDS]]) -> !cir.ptr> {name = "noCopyArr[1:1]"} + // CHECK-NEXT: acc.parallel combined(loop) firstprivate(@firstprivatization__ZTSA5_15NoCopyConstruct -> %[[PRIVATE]] : !cir.ptr>) { + // CHECK-NEXT: acc.loop combined(parallel) + // CHECK: acc.yield + // CHECK-NEXT: } loc + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc +#pragma acc serial loop firstprivate(hasCopyArr[1:1]) + for(int i = 0; i < 5; ++i); + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST2:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 + // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 + // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) extent(%[[ONE_CAST2]] : si32) stride(%[[ONE_CONST2]] : i64) startIdx(%[[ZERO_CONST]] : i64) + // CHECK-NEXT: %[[PRIVATE:.*]] = acc.firstprivate varPtr(%[[HASCOPYARR]] : !cir.ptr>) bounds(%[[BOUNDS]]) -> !cir.ptr> {name = "hasCopyArr[1:1]"} + // CHECK-NEXT: 
acc.serial combined(loop) firstprivate(@firstprivatization__ZTSA5_13CopyConstruct -> %[[PRIVATE]] : !cir.ptr>) { + // CHECK-NEXT: acc.loop combined(serial) + // CHECK: acc.yield + // CHECK-NEXT: } loc + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc +#pragma acc parallel loop firstprivate(notDefCtorArr[1:1]) + for(int i = 0; i < 5; ++i); + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST2:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 + // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 + // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) extent(%[[ONE_CAST2]] : si32) stride(%[[ONE_CONST2]] : i64) startIdx(%[[ZERO_CONST]] : i64) + // CHECK-NEXT: %[[PRIVATE:.*]] = acc.firstprivate varPtr(%[[NOTDEFCTORARR]] : !cir.ptr>) bounds(%[[BOUNDS]]) -> !cir.ptr> {name = "notDefCtorArr[1:1]"} + // CHECK-NEXT: acc.parallel combined(loop) firstprivate(@firstprivatization__ZTSA5_14NonDefaultCtor -> %[[PRIVATE]] : !cir.ptr>) { + // CHECK-NEXT: acc.loop combined(parallel) + // CHECK: acc.yield + // CHECK-NEXT: } loc + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc +#pragma acc parallel loop firstprivate(dtorArr[1:1]) + for(int i = 0; i < 5; ++i); + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST2:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 + // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 + // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) extent(%[[ONE_CAST2]] : si32) 
stride(%[[ONE_CONST2]] : i64) startIdx(%[[ZERO_CONST]] : i64) + // CHECK-NEXT: %[[PRIVATE:.*]] = acc.firstprivate varPtr(%[[DTORARR]] : !cir.ptr>) bounds(%[[BOUNDS]]) -> !cir.ptr> {name = "dtorArr[1:1]"} + // CHECK-NEXT: acc.parallel combined(loop) firstprivate(@firstprivatization__ZTSA5_7HasDtor -> %[[PRIVATE]] : !cir.ptr>) { + // CHECK-NEXT: acc.loop combined(parallel) + // CHECK: acc.yield + // CHECK-NEXT: } loc + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc +#pragma acc parallel loop firstprivate(someIntArr[1:1], someFloatArr[1:1], noCopyArr[1:1], hasCopyArr[1:1], notDefCtorArr[1:1], dtorArr[1:1]) + for(int i = 0; i < 5; ++i); + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST2:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 + // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 + // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) extent(%[[ONE_CAST2]] : si32) stride(%[[ONE_CONST2]] : i64) startIdx(%[[ZERO_CONST]] : i64) + // CHECK-NEXT: %[[PRIVATE1:.*]] = acc.firstprivate varPtr(%[[INTARR]] : !cir.ptr>) bounds(%[[BOUNDS]]) -> !cir.ptr> {name = "someIntArr[1:1]"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST2:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 + // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 + // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) extent(%[[ONE_CAST2]] : si32) stride(%[[ONE_CONST2]] : i64) startIdx(%[[ZERO_CONST]] : i64) + // CHECK-NEXT: 
%[[PRIVATE2:.*]] = acc.firstprivate varPtr(%[[FLOATARR]] : !cir.ptr>) bounds(%[[BOUNDS]]) -> !cir.ptr> {name = "someFloatArr[1:1]"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST2:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 + // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 + // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) extent(%[[ONE_CAST2]] : si32) stride(%[[ONE_CONST2]] : i64) startIdx(%[[ZERO_CONST]] : i64) + // CHECK-NEXT: %[[PRIVATE3:.*]] = acc.firstprivate varPtr(%[[NOCOPYARR]] : !cir.ptr>) bounds(%[[BOUNDS]]) -> !cir.ptr> {name = "noCopyArr[1:1]"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST2:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 + // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 + // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) extent(%[[ONE_CAST2]] : si32) stride(%[[ONE_CONST2]] : i64) startIdx(%[[ZERO_CONST]] : i64) + // CHECK-NEXT: %[[PRIVATE4:.*]] = acc.firstprivate varPtr(%[[HASCOPYARR]] : !cir.ptr>) bounds(%[[BOUNDS]]) -> !cir.ptr> {name = "hasCopyArr[1:1]"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST2:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 + // 
CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 + // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) extent(%[[ONE_CAST2]] : si32) stride(%[[ONE_CONST2]] : i64) startIdx(%[[ZERO_CONST]] : i64) + // CHECK-NEXT: %[[PRIVATE5:.*]] = acc.firstprivate varPtr(%[[NOTDEFCTORARR]] : !cir.ptr>) bounds(%[[BOUNDS]]) -> !cir.ptr> {name = "notDefCtorArr[1:1]"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST2:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 + // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 + // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) extent(%[[ONE_CAST2]] : si32) stride(%[[ONE_CONST2]] : i64) startIdx(%[[ZERO_CONST]] : i64) + // CHECK-NEXT: %[[PRIVATE6:.*]] = acc.firstprivate varPtr(%[[DTORARR]] : !cir.ptr>) bounds(%[[BOUNDS]]) -> !cir.ptr> {name = "dtorArr[1:1]"} + // CHECK-NEXT: acc.parallel combined(loop) firstprivate(@firstprivatization__ZTSA5_i -> %[[PRIVATE1]] : !cir.ptr>, + // CHECK-SAME: @firstprivatization__ZTSA5_f -> %[[PRIVATE2]] : !cir.ptr>, + // CHECK-SAME: @firstprivatization__ZTSA5_15NoCopyConstruct -> %[[PRIVATE3]] : !cir.ptr>, + // CHECK-SAME: @firstprivatization__ZTSA5_13CopyConstruct -> %[[PRIVATE4]] : !cir.ptr>, + // CHECK-SAME: @firstprivatization__ZTSA5_14NonDefaultCtor -> %[[PRIVATE5]] : !cir.ptr>, + // CHECK-SAME: @firstprivatization__ZTSA5_7HasDtor -> %[[PRIVATE6]] : !cir.ptr>) + // CHECK-NEXT: acc.loop combined(parallel) + // CHECK: acc.yield + // CHECK-NEXT: } loc + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc +} diff --git a/clang/test/CIR/CodeGenOpenACC/compute-firstprivate-clause-templates.cpp b/clang/test/CIR/CodeGenOpenACC/compute-firstprivate-clause-templates.cpp new file mode 100644 index 
000000000000..00aaaba3663f --- /dev/null +++ b/clang/test/CIR/CodeGenOpenACC/compute-firstprivate-clause-templates.cpp @@ -0,0 +1,90 @@ +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s + +struct CopyConstruct { + CopyConstruct() = default; + CopyConstruct(const CopyConstruct&); +}; + +struct NonDefaultCtor { + NonDefaultCtor(); +}; + +struct HasDtor { + ~HasDtor(); +}; + +// CHECK: acc.firstprivate.recipe @firstprivatization__ZTSi : !cir.ptr init { +// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr {{.*}}): +// CHECK-NEXT: cir.alloca !s32i, !cir.ptr, ["openacc.firstprivate.init"] +// CHECK-NEXT: acc.yield +// CHECK-NEXT: } copy { +// CHECK-NEXT: ^bb0(%[[ARG_FROM:.*]]: !cir.ptr {{.*}}, %[[ARG_TO:.*]]: !cir.ptr {{.*}}): +// CHECK-NEXT: acc.yield +// CHECK-NEXT: } +// +// CHECK-NEXT: acc.firstprivate.recipe @firstprivatization__ZTS7HasDtor : !cir.ptr init { +// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr {{.*}}): +// CHECK-NEXT: cir.alloca !rec_HasDtor, !cir.ptr, ["openacc.firstprivate.init"] +// CHECK-NEXT: acc.yield +// CHECK-NEXT: } copy { +// CHECK-NEXT: ^bb0(%[[ARG_FROM:.*]]: !cir.ptr {{.*}}, %[[ARG_TO:.*]]: !cir.ptr {{.*}}): +// CHECK-NEXT: acc.yield +// CHECK-NEXT: } destroy { +// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr {{.*}}): +// CHECK-NEXT: cir.call @_ZN7HasDtorD1Ev(%[[ARG]]) nothrow : (!cir.ptr) -> () +// CHECK-NEXT: acc.yield +// CHECK-NEXT: } +// +// CHECK-NEXT: acc.firstprivate.recipe @firstprivatization__ZTS14NonDefaultCtor : !cir.ptr init { +// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr {{.*}}): +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_NonDefaultCtor, !cir.ptr, ["openacc.firstprivate.init"] +// CHECK-NEXT: acc.yield +// CHECK-NEXT: } copy { +// CHECK-NEXT: ^bb0(%[[ARG_FROM:.*]]: !cir.ptr {{.*}}, %[[ARG_TO:.*]]: !cir.ptr {{.*}}): +// CHECK-NEXT: acc.yield +// CHECK-NEXT: } +// +// CHECK-NEXT: acc.firstprivate.recipe 
@firstprivatization__ZTS13CopyConstruct : !cir.ptr init { +// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr {{.*}}): +// CHECK-NEXT: cir.alloca !rec_CopyConstruct, !cir.ptr, ["openacc.firstprivate.init"] +// CHECK-NEXT: acc.yield +// CHECK-NEXT: } copy { +// CHECK-NEXT: ^bb0(%[[ARG_FROM:.*]]: !cir.ptr {{.*}}, %[[ARG_TO:.*]]: !cir.ptr {{.*}}): +// CHECK-NEXT: acc.yield +// CHECK-NEXT: } + +template +void dependent_version(const T &cc, const U &ndc, const V &dtor, const W &someInt) { + // CHECK: cir.func {{.*}}@_Z17dependent_versionI13CopyConstruct14NonDefaultCtor7HasDtoriEvRKT_RKT0_RKT1_RKT2_(%[[ARG0:.*]]: !cir.ptr {{.*}}, %[[ARG1:.*]]: !cir.ptr {{.*}}, %[[ARG2:.*]]: !cir.ptr {{.*}}, %[[ARG3:.*]]: !cir.ptr {{.*}}) { + // CHECK-NEXT: %[[CC:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["cc", init, const] + // CHECK-NEXT: %[[NDC:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["ndc", init, const] + // CHECK-NEXT: %[[DTOR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["dtor", init, const] + // CHECK-NEXT: %[[SOMEINT:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["someInt", init, const] + // % 3 = cir.alloca !cir.ptr, !cir.ptr>, ["someInt", init, const] + +#pragma acc parallel firstprivate(cc, ndc, dtor, someInt) + ; + // CHECK: %[[PRIV_LOAD:.*]] = cir.load %[[CC]] : !cir.ptr>, !cir.ptr + // CHECK-NEXT: %[[PRIVATE1:.*]] = acc.firstprivate varPtr(%[[PRIV_LOAD]] : !cir.ptr) -> !cir.ptr {name = "cc"} + // CHECK-NEXT: %[[PRIV_LOAD:.*]] = cir.load %[[NDC]] : !cir.ptr>, !cir.ptr + // CHECK-NEXT: %[[PRIVATE2:.*]] = acc.firstprivate varPtr(%[[PRIV_LOAD]] : !cir.ptr) -> !cir.ptr {name = "ndc"} + // CHECK-NEXT: %[[PRIV_LOAD:.*]] = cir.load %[[DTOR]] : !cir.ptr>, !cir.ptr + // CHECK-NEXT: %[[PRIVATE3:.*]] = acc.firstprivate varPtr(%[[PRIV_LOAD]] : !cir.ptr) -> !cir.ptr {name = "dtor"} + // CHECK-NEXT: %[[PRIV_LOAD:.*]] = cir.load %[[SOMEINT]] : !cir.ptr>, !cir.ptr + // CHECK-NEXT: %[[PRIVATE4:.*]] = acc.firstprivate varPtr(%[[PRIV_LOAD]] : !cir.ptr) -> !cir.ptr {name = "someInt"} + + // CHECK-NEXT: acc.parallel 
firstprivate(@firstprivatization__ZTS13CopyConstruct -> %[[PRIVATE1]] : !cir.ptr, + // CHECK-SAME: @firstprivatization__ZTS14NonDefaultCtor -> %[[PRIVATE2]] : !cir.ptr, + // CHECK-SAME: @firstprivatization__ZTS7HasDtor -> %[[PRIVATE3]] : !cir.ptr, + // CHECK-SAME: @firstprivatization__ZTSi -> %[[PRIVATE4]] : !cir.ptr) { + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc +} + +void use() { + CopyConstruct cc; + NonDefaultCtor ndc; + HasDtor dtor; + int i; + dependent_version(cc, ndc, dtor, i); +} diff --git a/clang/test/CIR/CodeGenOpenACC/compute-firstprivate-clause.cpp b/clang/test/CIR/CodeGenOpenACC/compute-firstprivate-clause.cpp new file mode 100644 index 000000000000..924dbf6254ee --- /dev/null +++ b/clang/test/CIR/CodeGenOpenACC/compute-firstprivate-clause.cpp @@ -0,0 +1,508 @@ +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s + +struct NoCopyConstruct {}; + +struct CopyConstruct { + CopyConstruct() = default; + CopyConstruct(const CopyConstruct&); +}; + +struct NonDefaultCtor { + NonDefaultCtor(); +}; + +struct HasDtor { + ~HasDtor(); +}; + +// CHECK: acc.firstprivate.recipe @firstprivatization__ZTSA5_7HasDtor : !cir.ptr> init { +// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr> {{.*}}): +// CHECK-NEXT: cir.alloca !cir.array, !cir.ptr>, ["openacc.firstprivate.init"] +// CHECK-NEXT: acc.yield +// CHECK-NEXT: } copy { +// CHECK-NEXT: ^bb0(%[[ARG_FROM:.*]]: !cir.ptr> {{.*}}, %[[ARG_TO:.*]]: !cir.ptr> {{.*}}): +// +// CHECK-NEXT: acc.yield +// +// CHECK-NEXT: } destroy { +// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr> {{.*}}): +// CHECK-NEXT: %[[LAST_IDX:.*]] = cir.const #cir.int<4> : !u64i +// CHECK-NEXT: %[[ARRPTR:.*]] = cir.cast(array_to_ptrdecay, %[[ARG]] : !cir.ptr>), !cir.ptr +// CHECK-NEXT: %[[ELEM:.*]] = cir.ptr_stride(%[[ARRPTR]] : !cir.ptr, %[[LAST_IDX]] : !u64i), !cir.ptr +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__array_idx"] +// 
CHECK-NEXT: cir.store %[[ELEM]], %[[ITR]] : !cir.ptr, !cir.ptr> +// CHECK-NEXT: cir.do { +// CHECK-NEXT: %[[ELEM_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr>, !cir.ptr +// CHECK-NEXT: cir.call @_ZN7HasDtorD1Ev(%[[ELEM_LOAD]]) nothrow : (!cir.ptr) -> () +// CHECK-NEXT: %[[NEG_ONE:.*]] = cir.const #cir.int<-1> : !s64i +// CHECK-NEXT: %[[PREVELEM:.*]] = cir.ptr_stride(%[[ELEM_LOAD]] : !cir.ptr, %[[NEG_ONE]] : !s64i), !cir.ptr +// CHECK-NEXT: cir.store %[[PREVELEM]], %[[ITR]] : !cir.ptr, !cir.ptr> +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } while { +// CHECK-NEXT: %[[ELEM_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr>, !cir.ptr +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(ne, %[[ELEM_LOAD]], %[[ARRPTR]]) : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } +// CHECK-NEXT: acc.yield +// CHECK-NEXT: } +// +// CHECK-NEXT: acc.firstprivate.recipe @firstprivatization__ZTSA5_14NonDefaultCtor : !cir.ptr> init { +// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr> {{.*}}): +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.firstprivate.init"] +// CHECK-NEXT: acc.yield +// CHECK-NEXT: } copy { +// CHECK-NEXT: ^bb0(%[[ARG_FORM:.*]]: !cir.ptr> {{.*}}, %[[ARG_TO:.*]]: !cir.ptr> {{.*}}): +// +// CHECK-NEXT: acc.yield +// CHECK-NEXT: } +// +// CHECK-NEXT: acc.firstprivate.recipe @firstprivatization__ZTSA5_13CopyConstruct : !cir.ptr> init { +// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr> {{.*}}): +// CHECK-NEXT: cir.alloca !cir.array, !cir.ptr>, ["openacc.firstprivate.init"] +// CHECK-NEXT: acc.yield +// CHECK-NEXT: } copy { +// CHECK-NEXT: ^bb0(%[[ARG_FORM:.*]]: !cir.ptr> {{.*}}, %[[ARG_TO:.*]]: !cir.ptr> {{.*}}): +// +// CHECK-NEXT: acc.yield +// CHECK-NEXT: } +// +// CHECK-NEXT: acc.firstprivate.recipe @firstprivatization__ZTSA5_15NoCopyConstruct : !cir.ptr> init { +// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr> {{.*}}): +// CHECK-NEXT: cir.alloca !cir.array, !cir.ptr>, ["openacc.firstprivate.init"] +// CHECK-NEXT: acc.yield +// CHECK-NEXT: } copy { +// 
CHECK-NEXT: ^bb0(%[[ARG_FORM:.*]]: !cir.ptr> {{.*}}, %[[ARG_TO:.*]]: !cir.ptr> {{.*}}): +// +// CHECK-NEXT: acc.yield +// CHECK-NEXT: } +// +// CHECK-NEXT: acc.firstprivate.recipe @firstprivatization__ZTSA5_f : !cir.ptr> init { +// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr> {{.*}}): +// CHECK-NEXT: cir.alloca !cir.array, !cir.ptr>, ["openacc.firstprivate.init"] +// CHECK-NEXT: acc.yield +// CHECK-NEXT: } copy { +// CHECK-NEXT: ^bb0(%[[ARG_FORM:.*]]: !cir.ptr> {{.*}}, %[[ARG_TO:.*]]: !cir.ptr> {{.*}}): +// +// CHECK-NEXT: acc.yield +// CHECK-NEXT: } +// +// CHECK-NEXT: acc.firstprivate.recipe @firstprivatization__ZTSA5_i : !cir.ptr> init { +// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr> {{.*}}): +// CHECK-NEXT: cir.alloca !cir.array, !cir.ptr>, ["openacc.firstprivate.init"] +// CHECK-NEXT: acc.yield +// CHECK-NEXT: } copy { +// CHECK-NEXT: ^bb0(%[[ARG_FORM:.*]]: !cir.ptr> {{.*}}, %[[ARG_TO:.*]]: !cir.ptr> {{.*}}): +// +// CHECK-NEXT: acc.yield +// CHECK-NEXT: } +// +// CHECK-NEXT: acc.firstprivate.recipe @firstprivatization__ZTS7HasDtor : !cir.ptr init { +// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr {{.*}}): +// CHECK-NEXT: cir.alloca !rec_HasDtor, !cir.ptr, ["openacc.firstprivate.init"] +// CHECK-NEXT: acc.yield +// CHECK-NEXT: } copy { +// CHECK-NEXT: ^bb0(%[[ARG_FROM:.*]]: !cir.ptr {{.*}}, %[[ARG_TO:.*]]: !cir.ptr {{.*}}): +// +// CHECK-NEXT: acc.yield +// CHECK-NEXT: } destroy { +// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr {{.*}}): +// CHECK-NEXT: cir.call @_ZN7HasDtorD1Ev(%[[ARG]]) nothrow : (!cir.ptr) -> () +// CHECK-NEXT: acc.yield +// CHECK-NEXT: } +// +// CHECK-NEXT: acc.firstprivate.recipe @firstprivatization__ZTS14NonDefaultCtor : !cir.ptr init { +// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr {{.*}}): +// CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_NonDefaultCtor, !cir.ptr, ["openacc.firstprivate.init"] +// CHECK-NEXT: acc.yield +// CHECK-NEXT: } copy { +// CHECK-NEXT: ^bb0(%[[ARG_FROM:.*]]: !cir.ptr {{.*}}, %[[ARG_TO:.*]]: !cir.ptr {{.*}}): +// +// CHECK-NEXT: 
acc.yield +// CHECK-NEXT: } +// +// CHECK-NEXT: acc.firstprivate.recipe @firstprivatization__ZTS13CopyConstruct : !cir.ptr init { +// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr {{.*}}): +// CHECK-NEXT: cir.alloca !rec_CopyConstruct, !cir.ptr, ["openacc.firstprivate.init"] +// CHECK-NEXT: acc.yield +// CHECK-NEXT: } copy { +// CHECK-NEXT: ^bb0(%[[ARG_FROM:.*]]: !cir.ptr {{.*}}, %[[ARG_TO:.*]]: !cir.ptr {{.*}}): +// +// CHECK-NEXT: acc.yield +// CHECK-NEXT: } +// +// CHECK-NEXT: acc.firstprivate.recipe @firstprivatization__ZTS15NoCopyConstruct : !cir.ptr init { +// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr {{.*}}): +// CHECK-NEXT: cir.alloca !rec_NoCopyConstruct, !cir.ptr, ["openacc.firstprivate.init"] +// CHECK-NEXT: acc.yield +// CHECK-NEXT: } copy { +// CHECK-NEXT: ^bb0(%[[ARG_FROM:.*]]: !cir.ptr {{.*}}, %[[ARG_TO:.*]]: !cir.ptr {{.*}}): +// +// CHECK-NEXT: acc.yield +// CHECK-NEXT: } +// +// CHECK-NEXT: acc.firstprivate.recipe @firstprivatization__ZTSf : !cir.ptr init { +// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr {{.*}}): +// CHECK-NEXT: cir.alloca !cir.float, !cir.ptr, ["openacc.firstprivate.init"] +// CHECK-NEXT: acc.yield +// CHECK-NEXT: } copy { +// CHECK-NEXT: ^bb0(%[[ARG_FROM:.*]]: !cir.ptr {{.*}}, %[[ARG_TO:.*]]: !cir.ptr {{.*}}): +// +// CHECK-NEXT: acc.yield +// CHECK-NEXT: } +// +// CHECK-NEXT: acc.firstprivate.recipe @firstprivatization__ZTSi : !cir.ptr init { +// CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr {{.*}}): +// CHECK-NEXT: cir.alloca !s32i, !cir.ptr, ["openacc.firstprivate.init"] +// CHECK-NEXT: acc.yield +// CHECK-NEXT: } copy { +// CHECK-NEXT: ^bb0(%[[ARG_FROM:.*]]: !cir.ptr {{.*}}, %[[ARG_TO:.*]]: !cir.ptr {{.*}}): +// +// CHECK-NEXT: acc.yield +// CHECK-NEXT: } + +extern "C" void acc_compute() { + // CHECK: cir.func{{.*}} @acc_compute() { + + int someInt; + // CHECK-NEXT: %[[SOMEINT:.*]] = cir.alloca !s32i, !cir.ptr, ["someInt"] + float someFloat; + // CHECK-NEXT: %[[SOMEFLOAT:.*]] = cir.alloca !cir.float, !cir.ptr, ["someFloat"] + NoCopyConstruct 
noCopy; + // CHECK-NEXT: %[[NOCOPY:.*]] = cir.alloca !rec_NoCopyConstruct, !cir.ptr, ["noCopy"] + CopyConstruct hasCopy; + // CHECK-NEXT: %[[HASCOPY:.*]] = cir.alloca !rec_CopyConstruct, !cir.ptr, ["hasCopy"] + NonDefaultCtor notDefCtor; + // CHECK-NEXT: %[[NOTDEFCTOR:.*]] = cir.alloca !rec_NonDefaultCtor, !cir.ptr, ["notDefCtor", init] + HasDtor dtor; + // CHECK-NEXT: %[[DTOR:.*]] = cir.alloca !rec_HasDtor, !cir.ptr, ["dtor"] + int someIntArr[5]; + // CHECK-NEXT: %[[INTARR:.*]] = cir.alloca !cir.array, !cir.ptr>, ["someIntArr"] + float someFloatArr[5]; + // CHECK-NEXT: %[[FLOATARR:.*]] = cir.alloca !cir.array, !cir.ptr>, ["someFloatArr"] + NoCopyConstruct noCopyArr[5]; + // CHECK-NEXT: %[[NOCOPYARR:.*]] = cir.alloca !cir.array, !cir.ptr>, ["noCopyArr"] + CopyConstruct hasCopyArr[5]; + // CHECK-NEXT: %[[HASCOPYARR:.*]] = cir.alloca !cir.array, !cir.ptr>, ["hasCopyArr"] + NonDefaultCtor notDefCtorArr[5]; + // CHECK-NEXT: %[[NOTDEFCTORARR:.*]] = cir.alloca !cir.array, !cir.ptr>, ["notDefCtorArr", init] + HasDtor dtorArr[5]; + // CHECK-NEXT: %[[DTORARR:.*]] = cir.alloca !cir.array, !cir.ptr>, ["dtorArr"] + // CHECK-NEXT: cir.call @_ZN14NonDefaultCtorC1Ev(%[[NOTDEFCTOR]]) : (!cir.ptr) -> () + +#pragma acc parallel firstprivate(someInt) + ; + // CHECK: %[[PRIVATE:.*]] = acc.firstprivate varPtr(%[[SOMEINT]] : !cir.ptr) -> !cir.ptr {name = "someInt"} + // CHECK-NEXT: acc.parallel firstprivate(@firstprivatization__ZTSi -> %[[PRIVATE]] : !cir.ptr) + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc +#pragma acc serial firstprivate(someFloat) + ; + // CHECK-NEXT: %[[PRIVATE:.*]] = acc.firstprivate varPtr(%[[SOMEFLOAT]] : !cir.ptr) -> !cir.ptr {name = "someFloat"} + // CHECK-NEXT: acc.serial firstprivate(@firstprivatization__ZTSf -> %[[PRIVATE]] : !cir.ptr) + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc + +#pragma acc parallel firstprivate(noCopy) + ; + // CHECK-NEXT: %[[PRIVATE:.*]] = acc.firstprivate varPtr(%[[NOCOPY]] : !cir.ptr) -> !cir.ptr {name = "noCopy"} + // 
CHECK-NEXT: acc.parallel firstprivate(@firstprivatization__ZTS15NoCopyConstruct -> %[[PRIVATE]] : !cir.ptr + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc +#pragma acc serial firstprivate(hasCopy) + ; + // CHECK-NEXT: %[[PRIVATE:.*]] = acc.firstprivate varPtr(%[[HASCOPY]] : !cir.ptr) -> !cir.ptr {name = "hasCopy"} + // CHECK-NEXT: acc.serial firstprivate(@firstprivatization__ZTS13CopyConstruct -> %[[PRIVATE]] : !cir.ptr) + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc +#pragma acc serial firstprivate(notDefCtor) + ; + // CHECK-NEXT: %[[PRIVATE:.*]] = acc.firstprivate varPtr(%[[NOTDEFCTOR]] : !cir.ptr) -> !cir.ptr {name = "notDefCtor"} + // CHECK-NEXT: acc.serial firstprivate(@firstprivatization__ZTS14NonDefaultCtor -> %[[PRIVATE]] : !cir.ptr) + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc +#pragma acc serial firstprivate(dtor) + ; + // CHECK-NEXT: %[[PRIVATE:.*]] = acc.firstprivate varPtr(%[[DTOR]] : !cir.ptr) -> !cir.ptr {name = "dtor"} + // CHECK-NEXT: acc.serial firstprivate(@firstprivatization__ZTS7HasDtor -> %[[PRIVATE]] : !cir.ptr) + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc + +#pragma acc parallel firstprivate(someInt, someFloat, noCopy, hasCopy, notDefCtor, dtor) + ; + // CHECK: %[[PRIVATE1:.*]] = acc.firstprivate varPtr(%[[SOMEINT]] : !cir.ptr) -> !cir.ptr {name = "someInt"} + // CHECK-NEXT: %[[PRIVATE2:.*]] = acc.firstprivate varPtr(%[[SOMEFLOAT]] : !cir.ptr) -> !cir.ptr {name = "someFloat"} + // CHECK-NEXT: %[[PRIVATE3:.*]] = acc.firstprivate varPtr(%[[NOCOPY]] : !cir.ptr) -> !cir.ptr {name = "noCopy"} + // CHECK-NEXT: %[[PRIVATE4:.*]] = acc.firstprivate varPtr(%[[HASCOPY]] : !cir.ptr) -> !cir.ptr {name = "hasCopy"} + // CHECK-NEXT: %[[PRIVATE5:.*]] = acc.firstprivate varPtr(%[[NOTDEFCTOR]] : !cir.ptr) -> !cir.ptr {name = "notDefCtor"} + // CHECK-NEXT: %[[PRIVATE6:.*]] = acc.firstprivate varPtr(%[[DTOR]] : !cir.ptr) -> !cir.ptr {name = "dtor"} + // CHECK-NEXT: acc.parallel firstprivate(@firstprivatization__ZTSi -> %[[PRIVATE1]] : !cir.ptr, + 
// CHECK-SAME: @firstprivatization__ZTSf -> %[[PRIVATE2]] : !cir.ptr, + // CHECK-SAME: @firstprivatization__ZTS15NoCopyConstruct -> %[[PRIVATE3]] : !cir.ptr, + // CHECK-SAME: @firstprivatization__ZTS13CopyConstruct -> %[[PRIVATE4]] : !cir.ptr, + // CHECK-SAME: @firstprivatization__ZTS14NonDefaultCtor -> %[[PRIVATE5]] : !cir.ptr, + // CHECK-SAME: @firstprivatization__ZTS7HasDtor -> %[[PRIVATE6]] : !cir.ptr) + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc + +#pragma acc serial firstprivate(someIntArr[1]) + ; + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 + // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 + // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 + // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) extent(%[[ONE_CONST]] : i64) stride(%[[ONE_CONST2]] : i64) startIdx(%[[ZERO_CONST]] : i64) + // CHECK-NEXT: %[[PRIVATE:.*]] = acc.firstprivate varPtr(%[[INTARR]] : !cir.ptr>) bounds(%[[BOUNDS]]) -> !cir.ptr> {name = "someIntArr[1]"} + // CHECK-NEXT: acc.serial firstprivate(@firstprivatization__ZTSA5_i -> %[[PRIVATE]] : !cir.ptr>) + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc +#pragma acc parallel firstprivate(someFloatArr[1]) + ; + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 + // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 + // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 + // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) extent(%[[ONE_CONST]] : i64) stride(%[[ONE_CONST2]] : i64) startIdx(%[[ZERO_CONST]] : i64) + // CHECK-NEXT: %[[PRIVATE:.*]] = acc.firstprivate varPtr(%[[FLOATARR]] : !cir.ptr>) bounds(%[[BOUNDS]]) -> !cir.ptr> {name = "someFloatArr[1]"} + // 
CHECK-NEXT: acc.parallel firstprivate(@firstprivatization__ZTSA5_f -> %[[PRIVATE]] : !cir.ptr>) + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc +#pragma acc serial firstprivate(noCopyArr[1]) + ; + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 + // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 + // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 + // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) extent(%[[ONE_CONST]] : i64) stride(%[[ONE_CONST2]] : i64) startIdx(%[[ZERO_CONST]] : i64) + // CHECK-NEXT: %[[PRIVATE:.*]] = acc.firstprivate varPtr(%[[NOCOPYARR]] : !cir.ptr>) bounds(%[[BOUNDS]]) -> !cir.ptr> {name = "noCopyArr[1]"} + // CHECK-NEXT: acc.serial firstprivate(@firstprivatization__ZTSA5_15NoCopyConstruct -> %[[PRIVATE]] : !cir.ptr>) + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc +#pragma acc parallel firstprivate(hasCopyArr[1]) + ; + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 + // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 + // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 + // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) extent(%[[ONE_CONST]] : i64) stride(%[[ONE_CONST2]] : i64) startIdx(%[[ZERO_CONST]] : i64) + // CHECK-NEXT: %[[PRIVATE:.*]] = acc.firstprivate varPtr(%[[HASCOPYARR]] : !cir.ptr>) bounds(%[[BOUNDS]]) -> !cir.ptr> {name = "hasCopyArr[1]"} + // CHECK-NEXT: acc.parallel firstprivate(@firstprivatization__ZTSA5_13CopyConstruct -> %[[PRIVATE]] : !cir.ptr>) + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc +#pragma acc parallel firstprivate(notDefCtorArr[1]) + ; + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: 
%[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 + // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 + // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 + // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) extent(%[[ONE_CONST]] : i64) stride(%[[ONE_CONST2]] : i64) startIdx(%[[ZERO_CONST]] : i64) + // CHECK-NEXT: %[[PRIVATE:.*]] = acc.firstprivate varPtr(%[[NOTDEFCTORARR]] : !cir.ptr>) bounds(%[[BOUNDS]]) -> !cir.ptr> {name = "notDefCtorArr[1]"} + // CHECK-NEXT: acc.parallel firstprivate(@firstprivatization__ZTSA5_14NonDefaultCtor -> %[[PRIVATE]] : !cir.ptr>) + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc +#pragma acc parallel firstprivate(dtorArr[1]) + ; + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 + // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 + // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 + // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) extent(%[[ONE_CONST]] : i64) stride(%[[ONE_CONST2]] : i64) startIdx(%[[ZERO_CONST]] : i64) + // CHECK-NEXT: %[[PRIVATE:.*]] = acc.firstprivate varPtr(%[[DTORARR]] : !cir.ptr>) bounds(%[[BOUNDS]]) -> !cir.ptr> {name = "dtorArr[1]"} + // CHECK-NEXT: acc.parallel firstprivate(@firstprivatization__ZTSA5_7HasDtor -> %[[PRIVATE]] : !cir.ptr>) + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc +#pragma acc serial firstprivate(someIntArr[1], someFloatArr[1], noCopyArr[1], hasCopyArr[1], notDefCtorArr[1], dtorArr[1]) + ; + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 + // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 + // CHECK-NEXT: %[[ONE_CONST2:.*]] 
= arith.constant 1 + // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) extent(%[[ONE_CONST]] : i64) stride(%[[ONE_CONST2]] : i64) startIdx(%[[ZERO_CONST]] : i64) + // CHECK-NEXT: %[[PRIVATE1:.*]] = acc.firstprivate varPtr(%[[INTARR]] : !cir.ptr>) bounds(%[[BOUNDS]]) -> !cir.ptr> {name = "someIntArr[1]"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 + // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 + // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 + // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) extent(%[[ONE_CONST]] : i64) stride(%[[ONE_CONST2]] : i64) startIdx(%[[ZERO_CONST]] : i64) + // CHECK-NEXT: %[[PRIVATE2:.*]] = acc.firstprivate varPtr(%[[FLOATARR]] : !cir.ptr>) bounds(%[[BOUNDS]]) -> !cir.ptr> {name = "someFloatArr[1]"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 + // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 + // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 + // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) extent(%[[ONE_CONST]] : i64) stride(%[[ONE_CONST2]] : i64) startIdx(%[[ZERO_CONST]] : i64) + // CHECK-NEXT: %[[PRIVATE3:.*]] = acc.firstprivate varPtr(%[[NOCOPYARR]] : !cir.ptr>) bounds(%[[BOUNDS]]) -> !cir.ptr> {name = "noCopyArr[1]"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 + // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 + // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 + // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : 
si32) extent(%[[ONE_CONST]] : i64) stride(%[[ONE_CONST2]] : i64) startIdx(%[[ZERO_CONST]] : i64) + // CHECK-NEXT: %[[PRIVATE4:.*]] = acc.firstprivate varPtr(%[[HASCOPYARR]] : !cir.ptr>) bounds(%[[BOUNDS]]) -> !cir.ptr> {name = "hasCopyArr[1]"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 + // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 + // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 + // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) extent(%[[ONE_CONST]] : i64) stride(%[[ONE_CONST2]] : i64) startIdx(%[[ZERO_CONST]] : i64) + // CHECK-NEXT: %[[PRIVATE5:.*]] = acc.firstprivate varPtr(%[[NOTDEFCTORARR]] : !cir.ptr>) bounds(%[[BOUNDS]]) -> !cir.ptr> {name = "notDefCtorArr[1]"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE_CONST:.*]] = arith.constant 1 + // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 + // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 + // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) extent(%[[ONE_CONST]] : i64) stride(%[[ONE_CONST2]] : i64) startIdx(%[[ZERO_CONST]] : i64) + // CHECK-NEXT: %[[PRIVATE6:.*]] = acc.firstprivate varPtr(%[[DTORARR]] : !cir.ptr>) bounds(%[[BOUNDS]]) -> !cir.ptr> {name = "dtorArr[1]"} + // CHECK-NEXT: acc.serial firstprivate(@firstprivatization__ZTSA5_i -> %[[PRIVATE1]] : !cir.ptr>, + // CHECK-SAME: @firstprivatization__ZTSA5_f -> %[[PRIVATE2]] : !cir.ptr>, + // CHECK-SAME: @firstprivatization__ZTSA5_15NoCopyConstruct -> %[[PRIVATE3]] : !cir.ptr>, + // CHECK-SAME: @firstprivatization__ZTSA5_13CopyConstruct -> %[[PRIVATE4]] : !cir.ptr>, + // CHECK-SAME: @firstprivatization__ZTSA5_14NonDefaultCtor -> %[[PRIVATE5]] : !cir.ptr>, + // CHECK-SAME: 
@firstprivatization__ZTSA5_7HasDtor -> %[[PRIVATE6]] : !cir.ptr>) + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc + +#pragma acc parallel firstprivate(someIntArr[1:1]) + ; + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST2:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 + // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 + // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) extent(%[[ONE_CAST2]] : si32) stride(%[[ONE_CONST2]] : i64) startIdx(%[[ZERO_CONST]] : i64) + // CHECK-NEXT: %[[PRIVATE:.*]] = acc.firstprivate varPtr(%[[INTARR]] : !cir.ptr>) bounds(%[[BOUNDS]]) -> !cir.ptr> {name = "someIntArr[1:1]"} + // CHECK-NEXT: acc.parallel firstprivate(@firstprivatization__ZTSA5_i -> %[[PRIVATE]] : !cir.ptr>) + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc +#pragma acc serial firstprivate(someFloatArr[1:1]) + ; + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST2:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 + // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 + // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) extent(%[[ONE_CAST2]] : si32) stride(%[[ONE_CONST2]] : i64) startIdx(%[[ZERO_CONST]] : i64) + // CHECK-NEXT: %[[PRIVATE:.*]] = acc.firstprivate varPtr(%[[FLOATARR]] : !cir.ptr>) bounds(%[[BOUNDS]]) -> !cir.ptr> {name = "someFloatArr[1:1]"} + // CHECK-NEXT: acc.serial firstprivate(@firstprivatization__ZTSA5_f -> %[[PRIVATE]] : !cir.ptr>) + // CHECK-NEXT: acc.yield + // 
CHECK-NEXT: } loc +#pragma acc parallel firstprivate(noCopyArr[1:1]) + ; + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST2:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 + // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 + // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) extent(%[[ONE_CAST2]] : si32) stride(%[[ONE_CONST2]] : i64) startIdx(%[[ZERO_CONST]] : i64) + // CHECK-NEXT: %[[PRIVATE:.*]] = acc.firstprivate varPtr(%[[NOCOPYARR]] : !cir.ptr>) bounds(%[[BOUNDS]]) -> !cir.ptr> {name = "noCopyArr[1:1]"} + // CHECK-NEXT: acc.parallel firstprivate(@firstprivatization__ZTSA5_15NoCopyConstruct -> %[[PRIVATE]] : !cir.ptr>) + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc +#pragma acc serial firstprivate(hasCopyArr[1:1]) + ; + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST2:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 + // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 + // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) extent(%[[ONE_CAST2]] : si32) stride(%[[ONE_CONST2]] : i64) startIdx(%[[ZERO_CONST]] : i64) + // CHECK-NEXT: %[[PRIVATE:.*]] = acc.firstprivate varPtr(%[[HASCOPYARR]] : !cir.ptr>) bounds(%[[BOUNDS]]) -> !cir.ptr> {name = "hasCopyArr[1:1]"} + // CHECK-NEXT: acc.serial firstprivate(@firstprivatization__ZTSA5_13CopyConstruct -> %[[PRIVATE]] : !cir.ptr>) + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc +#pragma acc parallel firstprivate(notDefCtorArr[1:1]) + ; 
+ // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST2:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 + // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 + // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) extent(%[[ONE_CAST2]] : si32) stride(%[[ONE_CONST2]] : i64) startIdx(%[[ZERO_CONST]] : i64) + // CHECK-NEXT: %[[PRIVATE:.*]] = acc.firstprivate varPtr(%[[NOTDEFCTORARR]] : !cir.ptr>) bounds(%[[BOUNDS]]) -> !cir.ptr> {name = "notDefCtorArr[1:1]"} + // CHECK-NEXT: acc.parallel firstprivate(@firstprivatization__ZTSA5_14NonDefaultCtor -> %[[PRIVATE]] : !cir.ptr>) + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc +#pragma acc parallel firstprivate(dtorArr[1:1]) + ; + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST2:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 + // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 + // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) extent(%[[ONE_CAST2]] : si32) stride(%[[ONE_CONST2]] : i64) startIdx(%[[ZERO_CONST]] : i64) + // CHECK-NEXT: %[[PRIVATE:.*]] = acc.firstprivate varPtr(%[[DTORARR]] : !cir.ptr>) bounds(%[[BOUNDS]]) -> !cir.ptr> {name = "dtorArr[1:1]"} + // CHECK-NEXT: acc.parallel firstprivate(@firstprivatization__ZTSA5_7HasDtor -> %[[PRIVATE]] : !cir.ptr>) + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc +#pragma acc parallel firstprivate(someIntArr[1:1], someFloatArr[1:1], noCopyArr[1:1], hasCopyArr[1:1], notDefCtorArr[1:1], 
dtorArr[1:1]) + ; + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST2:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 + // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 + // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) extent(%[[ONE_CAST2]] : si32) stride(%[[ONE_CONST2]] : i64) startIdx(%[[ZERO_CONST]] : i64) + // CHECK-NEXT: %[[PRIVATE1:.*]] = acc.firstprivate varPtr(%[[INTARR]] : !cir.ptr>) bounds(%[[BOUNDS]]) -> !cir.ptr> {name = "someIntArr[1:1]"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST2:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 + // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 + // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) extent(%[[ONE_CAST2]] : si32) stride(%[[ONE_CONST2]] : i64) startIdx(%[[ZERO_CONST]] : i64) + // CHECK-NEXT: %[[PRIVATE2:.*]] = acc.firstprivate varPtr(%[[FLOATARR]] : !cir.ptr>) bounds(%[[BOUNDS]]) -> !cir.ptr> {name = "someFloatArr[1:1]"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST2:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 + // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 + // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : 
si32) extent(%[[ONE_CAST2]] : si32) stride(%[[ONE_CONST2]] : i64) startIdx(%[[ZERO_CONST]] : i64) + // CHECK-NEXT: %[[PRIVATE3:.*]] = acc.firstprivate varPtr(%[[NOCOPYARR]] : !cir.ptr>) bounds(%[[BOUNDS]]) -> !cir.ptr> {name = "noCopyArr[1:1]"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST2:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 + // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 + // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) extent(%[[ONE_CAST2]] : si32) stride(%[[ONE_CONST2]] : i64) startIdx(%[[ZERO_CONST]] : i64) + // CHECK-NEXT: %[[PRIVATE4:.*]] = acc.firstprivate varPtr(%[[HASCOPYARR]] : !cir.ptr>) bounds(%[[BOUNDS]]) -> !cir.ptr> {name = "hasCopyArr[1:1]"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST2:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 + // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 + // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) extent(%[[ONE_CAST2]] : si32) stride(%[[ONE_CONST2]] : i64) startIdx(%[[ZERO_CONST]] : i64) + // CHECK-NEXT: %[[PRIVATE5:.*]] = acc.firstprivate varPtr(%[[NOTDEFCTORARR]] : !cir.ptr>) bounds(%[[BOUNDS]]) -> !cir.ptr> {name = "notDefCtorArr[1:1]"} + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST:.*]] = builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i + // CHECK-NEXT: %[[ONE_CAST2:.*]] = 
builtin.unrealized_conversion_cast %[[ONE]] : !s32i to si32 + // CHECK-NEXT: %[[ZERO_CONST:.*]] = arith.constant 0 + // CHECK-NEXT: %[[ONE_CONST2:.*]] = arith.constant 1 + // CHECK-NEXT: %[[BOUNDS:.*]] = acc.bounds lowerbound(%[[ONE_CAST]] : si32) extent(%[[ONE_CAST2]] : si32) stride(%[[ONE_CONST2]] : i64) startIdx(%[[ZERO_CONST]] : i64) + // CHECK-NEXT: %[[PRIVATE6:.*]] = acc.firstprivate varPtr(%[[DTORARR]] : !cir.ptr>) bounds(%[[BOUNDS]]) -> !cir.ptr> {name = "dtorArr[1:1]"} + // CHECK-NEXT: acc.parallel firstprivate(@firstprivatization__ZTSA5_i -> %[[PRIVATE1]] : !cir.ptr>, + // CHECK-SAME: @firstprivatization__ZTSA5_f -> %[[PRIVATE2]] : !cir.ptr>, + // CHECK-SAME: @firstprivatization__ZTSA5_15NoCopyConstruct -> %[[PRIVATE3]] : !cir.ptr>, + // CHECK-SAME: @firstprivatization__ZTSA5_13CopyConstruct -> %[[PRIVATE4]] : !cir.ptr>, + // CHECK-SAME: @firstprivatization__ZTSA5_14NonDefaultCtor -> %[[PRIVATE5]] : !cir.ptr>, + // CHECK-SAME: @firstprivatization__ZTSA5_7HasDtor -> %[[PRIVATE6]] : !cir.ptr>) + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc +} diff --git a/clang/test/CIR/CodeGenOpenACC/openacc-not-implemented.cpp b/clang/test/CIR/CodeGenOpenACC/openacc-not-implemented.cpp index 0bf932ea62ce..da45aca13e7f 100644 --- a/clang/test/CIR/CodeGenOpenACC/openacc-not-implemented.cpp +++ b/clang/test/CIR/CodeGenOpenACC/openacc-not-implemented.cpp @@ -10,9 +10,6 @@ void HelloWorld(int *A, int *B, int *C, int N) { // expected-error@+1{{ClangIR code gen Not Yet Implemented: OpenACC Declare Construct}} #pragma acc declare create(A) - // expected-error@+1{{ClangIR code gen Not Yet Implemented: OpenACC Clause: firstprivate}} -#pragma acc parallel loop firstprivate(A) - for(int i = 0; i <5; ++i); // expected-error@+1{{ClangIR code gen Not Yet Implemented: OpenACC Clause: reduction}} #pragma acc parallel loop reduction(+:A) for(int i = 0; i <5; ++i); diff --git a/clang/test/CIR/IR/invalid-goto.cir b/clang/test/CIR/IR/invalid-goto.cir new file mode 100644 index 
000000000000..9f58bac92fa3 --- /dev/null +++ b/clang/test/CIR/IR/invalid-goto.cir @@ -0,0 +1,9 @@ +// RUN: cir-opt %s -verify-diagnostics -split-input-file + +// expected-error@+1 {{goto/label mismatch}} +cir.func @bad_goto() -> () { + cir.goto "somewhere" +^bb1: + cir.label "label" + cir.return +} diff --git a/clang/test/CXX/class/class.mem/p13.cpp b/clang/test/CXX/class/class.mem/p13.cpp index d947586c4194..a30aa5d0b2ee 100644 --- a/clang/test/CXX/class/class.mem/p13.cpp +++ b/clang/test/CXX/class/class.mem/p13.cpp @@ -114,3 +114,12 @@ template struct CtorDtorName : B { CtorDtorName(); ~CtorDtorName(); // expected-error {{identifier 'CtorDtorName' after '~' in destructor name does not name a type}} }; + +struct S { // expected-note {{'S' declared here}} + enum E { + R = 11, + S = 12 // expected-error {{member 'S' has the same name as its class}} + }; + static_assert(E::R == 11, "E::R is not 11"); + static_assert(E::S == 12, "E::S is not 12"); // expected-error {{no member named 'S' in 'S::E'}} +}; diff --git a/clang/test/CXX/dcl.dcl/dcl.attr/dcl.attr.nodiscard/p2.cpp b/clang/test/CXX/dcl.dcl/dcl.attr/dcl.attr.nodiscard/p2.cpp index 0012ab976baa..7f933a4dcc6b 100644 --- a/clang/test/CXX/dcl.dcl/dcl.attr/dcl.attr.nodiscard/p2.cpp +++ b/clang/test/CXX/dcl.dcl/dcl.attr/dcl.attr.nodiscard/p2.cpp @@ -115,7 +115,7 @@ void usage() { S(); // expected-warning {{ignoring temporary created by a constructor declared with 'nodiscard' attribute}} S('A'); // expected-warning {{ignoring temporary created by a constructor declared with 'nodiscard' attribute: Don't let that S-Char go!}} S(1); - S(2.2); + S(2.2); // expected-warning {{ignoring temporary created by a constructor declared with 'gnu::warn_unused_result' attribute}} Y(); // expected-warning {{ignoring temporary of type 'Y' declared with 'nodiscard' attribute: Don't throw me away either!}} S s; ConvertTo{}; // expected-warning {{ignoring return value of type 'ConvertTo' declared with 'nodiscard' attribute: Don't throw me 
away!}} diff --git a/clang/test/CXX/module/cpp.pre/module_decl.cpp b/clang/test/CXX/module/cpp.pre/module_decl.cpp index 6238347c167a..5c29aeff1b63 100644 --- a/clang/test/CXX/module/cpp.pre/module_decl.cpp +++ b/clang/test/CXX/module/cpp.pre/module_decl.cpp @@ -1,8 +1,147 @@ // RUN: rm -rf %t // RUN: mkdir -p %t -// RUN: %clang_cc1 -std=c++20 -emit-module-interface %s -verify -o %t/M.pcm +// RUN: split-file %s %t +// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/line.cpp -verify -o %t/line.pcm +// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/gnu_line_marker.cpp -verify -o %t/gnu_line_marker.pcm +// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/include.cpp -verify -o %t/include.pcm +// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/ident.cpp -verify -o %t/ident.pcm +// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/pragma_comment.cpp -verify -o %t/pragma_comment.pcm +// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/pragma_mark.cpp -verify -o %t/pragma_mark.pcm +// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/pragma_detect_mismatch.cpp -verify -o %t/pragma_detect_mismatch.pcm +// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/pragma_clang_debug.cpp -verify -o %t/pragma_clang_debug.pcm +// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/pragma_message.cpp -verify -o %t/pragma_message.pcm +// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/pragma_gcc_warn.cpp -verify -o %t/pragma_gcc_warn.pcm +// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/pragma_gcc_error.cpp -verify -o %t/pragma_gcc_error.pcm +// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/pragma_diag_push_pop.cpp -verify -o %t/pragma_diag_push_pop.pcm +// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/pragma_diag_ignore.cpp -verify -o %t/pragma_diag_ignore.pcm +// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/pragma_opencl_ext.cpp -verify -o %t/pragma_opencl_ext.pcm +// RUN: %clang_cc1 -std=c++20 -emit-module-interface 
%t/pragma_push_pop.cpp -verify -o %t/pragma_push_pop.pcm +// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/pragma_exec_charset.cpp -verify -o %t/pragma_exec_charset.pcm +// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/pragma_clang_assume_nonnull.cpp -verify -o %t/pragma_clang_assume_nonnull.pcm +// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/marco_expand.cpp -DMACRO="" -verify -o %t/marco_expand.pcm +// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/define.cpp -verify -o %t/define.pcm +// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/undef.cpp -verify -o %t/undef.pcm +// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/defined.cpp -verify -o %t/defined.pcm +// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/has_embed.cpp -verify -o %t/has_embed.pcm +// RUN: %clang_cc1 -std=c++20 -emit-module-interface %t/has_include.cpp -verify -o %t/has_include.pcm +//--- header.h +#ifndef HEADER_H +#define HEADER_H + +#endif // HEADER_H + +//--- line.cpp +// expected-no-diagnostics +#line 3 +export module M; + +//--- gnu_line_marker.cpp +// expected-no-diagnostics +# 1 __FILE__ 1 3 +export module M; + +//--- include.cpp +#include "header.h" // expected-note {{add 'module;' to the start of the file to introduce a global module fragment}} +export module M; // expected-error {{module declaration must occur at the start of the translation unit}} + +//--- ident.cpp +// expected-no-diagnostics +#ident "$Header:$" +export module M; + +//--- pragma_comment.cpp +// expected-no-diagnostics +#pragma comment(lib, "msvcrt.lib") +export module M; + +//--- pragma_mark.cpp +// expected-no-diagnostics +#pragma mark LLVM's world +export module M; + +//--- pragma_detect_mismatch.cpp +// expected-no-diagnostics +#pragma detect_mismatch("test", "1") +export module M; + +//--- pragma_clang_debug.cpp +// expected-no-diagnostics +#pragma clang __debug dump Test +export module M; + +//--- pragma_message.cpp +#pragma message "test" // expected-warning {{test}} 
+export module M; + +//--- pragma_gcc_warn.cpp +#pragma GCC warning "Foo" // expected-warning {{Foo}} +export module M; + +//--- pragma_gcc_error.cpp +#pragma GCC error "Foo" // expected-error {{Foo}} +export module M; + +//--- pragma_diag_push_pop.cpp +// expected-no-diagnostics +#pragma gcc diagnostic push +#pragma gcc diagnostic pop +export module M; + +//--- pragma_diag_ignore.cpp +// expected-no-diagnostics +#pragma GCC diagnostic ignored "-Wframe-larger-than" +export module M; + +//--- pragma_opencl_ext.cpp +// expected-no-diagnostics +#pragma OPENCL EXTENSION __cl_clang_variadic_functions : enable +export module M; + +//--- pragma_push_pop.cpp +// expected-no-diagnostics +#pragma warning(push) +#pragma warning(pop) +export module M; + +//--- pragma_exec_charset.cpp +// expected-no-diagnostics +#pragma execution_character_set(push, "UTF-8") +#pragma execution_character_set(pop) +export module M; + +//--- pragma_clang_assume_nonnull.cpp +// expected-no-diagnostics +#pragma clang assume_nonnull begin +#pragma clang assume_nonnull end +export module M; + +//--- marco_expand.cpp +MACRO // expected-note {{add 'module;' to the start of the file to introduce a global module fragment}} +export module M; // expected-error {{module declaration must occur at the start of the translation unit}} + +//--- define.cpp // This is a comment #define I32 int // expected-note {{add 'module;' to the start of the file to introduce a global module fragment}} export module M; // expected-error {{module declaration must occur at the start of the translation unit}} export I32 i32; + +//--- undef.cpp +#undef FOO // expected-note {{add 'module;' to the start of the file to introduce a global module fragment}} +export module M; // expected-error {{module declaration must occur at the start of the translation unit}} + +//--- defined.cpp +#if defined(FOO) // expected-note {{add 'module;' to the start of the file to introduce a global module fragment}} +#endif +export module M; // 
expected-error {{module declaration must occur at the start of the translation unit}} + +//--- has_embed.cpp +#if __has_embed(__FILE__ ext::token(0xB055)) // expected-note {{add 'module;' to the start of the file to introduce a global module fragment}} +#endif +export module M; // expected-error {{module declaration must occur at the start of the translation unit}} + +//--- has_include.cpp +#if __has_include() || __has_include_next() // expected-note {{add 'module;' to the start of the file to introduce a global module fragment}} \ + // expected-warning {{#include_next in primary source file; will search from start of include path}} +#endif +export module M; // expected-error {{module declaration must occur at the start of the translation unit}} diff --git a/clang/test/CXX/stmt.stmt/stmt.select/stmt.if/p2.cpp b/clang/test/CXX/stmt.stmt/stmt.select/stmt.if/p2.cpp index abb42447d3e0..05830de9891f 100644 --- a/clang/test/CXX/stmt.stmt/stmt.select/stmt.if/p2.cpp +++ b/clang/test/CXX/stmt.stmt/stmt.select/stmt.if/p2.cpp @@ -239,5 +239,21 @@ void f2() { } +namespace GH153884 { + bool f1() { + auto f = [](auto) { return true; }; + if constexpr (0) + return f(1); + return false; + } + bool f2() { + auto f = [](auto x) { if (x) return 1.5; else return "wat"; }; + // expected-error@-1 {{'auto' in return type deduced as 'const char *' here but deduced as 'double' in earlier return statement}} + if constexpr (0) + return f(1); + // expected-note@-1 {{in instantiation of function template specialization 'GH153884::f2()}} + return false; + } +} #endif diff --git a/clang/test/CodeGen/RISCV/riscv-inline-asm-fixed-length-vector.c b/clang/test/CodeGen/RISCV/riscv-inline-asm-fixed-length-vector.c new file mode 100644 index 000000000000..699c588950c6 --- /dev/null +++ b/clang/test/CodeGen/RISCV/riscv-inline-asm-fixed-length-vector.c @@ -0,0 +1,66 @@ +// REQUIRES: riscv-registered-target + +// RUN: %clang_cc1 -triple riscv32 -target-feature +v \ +// RUN: -mvscale-min=2 -mvscale-max=2 
-O2 -emit-llvm %s -o - \ +// RUN: | FileCheck %s +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -mvscale-min=2 -mvscale-max=2 -O2 -emit-llvm %s -o - \ +// RUN: | FileCheck %s + +// Test RISC-V V-extension fixed-length vector inline assembly constraints. +#include +#include + +typedef vbool1_t fixed_bool1_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen))); +typedef vint32m1_t fixed_i32m1_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen))); +typedef vint8mf2_t fixed_i8mf2_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen / 2))); + +typedef bool bx2 __attribute__((ext_vector_type(16))); +typedef int i32x2 __attribute__((ext_vector_type(2))); +typedef char i8x4 __attribute__((ext_vector_type(4))); + +fixed_i32m1_t test_vr(fixed_i32m1_t a) { +// CHECK-LABEL: define{{.*}} @test_vr +// CHECK: %0 = tail call <4 x i32> asm sideeffect "vadd.vv $0, $1, $2", "=^vr,^vr,^vr"(<4 x i32> %a, <4 x i32> %a) + fixed_i32m1_t ret; + asm volatile ("vadd.vv %0, %1, %2" : "=vr"(ret) : "vr"(a), "vr"(a)); + return ret; +} + +i32x2 test_vr2(i32x2 a) { +// CHECK-LABEL: define{{.*}} @test_vr2 +// CHECK: %1 = tail call <2 x i32> asm sideeffect "vadd.vv $0, $1, $2", "=^vr,^vr,^vr"(<2 x i32> %0, <2 x i32> %0) + i32x2 ret; + asm volatile ("vadd.vv %0, %1, %2" : "=vr"(ret) : "vr"(a), "vr"(a)); + return ret; +} + +fixed_i8mf2_t test_vd(fixed_i8mf2_t a) { +// CHECK-LABEL: define{{.*}} @test_vd +// CHECK: %0 = tail call <8 x i8> asm sideeffect "vadd.vv $0, $1, $2", "=^vd,^vr,^vr"(<8 x i8> %a, <8 x i8> %a) + fixed_i8mf2_t ret; + asm volatile ("vadd.vv %0, %1, %2" : "=vd"(ret) : "vr"(a), "vr"(a)); + return ret; +} + +i8x4 test_vd2(i8x4 a) { +// CHECK-LABEL: define{{.*}} @test_vd2 +// CHECK: %1 = tail call <4 x i8> asm sideeffect "vadd.vv $0, $1, $2", "=^vd,^vr,^vr"(<4 x i8> %0, <4 x i8> %0) + i8x4 ret; + asm volatile ("vadd.vv %0, %1, %2" : "=vd"(ret) : "vr"(a), "vr"(a)); + return ret; +} + +fixed_bool1_t test_vm(fixed_bool1_t a) { +// CHECK-LABEL: 
define{{.*}} @test_vm +// CHECK: %1 = tail call <16 x i8> asm sideeffect "vmand.mm $0, $1, $2", "=^vm,^vm,^vm"(<16 x i8> %a, <16 x i8> %a) + fixed_bool1_t ret; + asm volatile ("vmand.mm %0, %1, %2" : "=vm"(ret) : "vm"(a), "vm"(a)); + return ret; +} + +void test_vm2(bx2 a) { +// CHECK-LABEL: define{{.*}} @test_vm2 +// CHECK: tail call void asm sideeffect "dummy $0", "^vm"(<16 x i1> %a1) + asm volatile ("dummy %0" :: "vm"(a)); +} diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c index e7f81068ab4f..5b252fa315ef 100644 --- a/clang/test/CodeGen/X86/avx2-builtins.c +++ b/clang/test/CodeGen/X86/avx2-builtins.c @@ -920,6 +920,7 @@ __m256i test_mm256_mul_epi32(__m256i a, __m256i b) { // CHECK: mul <4 x i64> %{{.*}}, %{{.*}} return _mm256_mul_epi32(a, b); } +TEST_CONSTEXPR(match_m256i(_mm256_mul_epi32((__m256i)(__v8si){+1, -2, +3, -4, +5, -6, +7, -8}, (__m256i)(__v8si){-16, -14, +12, +10, -8, +6, -4, +2}), -16, 36, -40, -28)); __m256i test_mm256_mul_epu32(__m256i a, __m256i b) { // CHECK-LABEL: test_mm256_mul_epu32 @@ -928,6 +929,7 @@ __m256i test_mm256_mul_epu32(__m256i a, __m256i b) { // CHECK: mul <4 x i64> %{{.*}}, %{{.*}} return _mm256_mul_epu32(a, b); } +TEST_CONSTEXPR(match_m256i(_mm256_mul_epu32((__m256i)(__v8si){+1, -2, +3, -4, +5, -6, +7, -8}, (__m256i)(__v8si){-16, -14, +12, +10, -8, +6, -4, +2}), 4294967280, 36, 21474836440, 30064771044)); __m256i test_mm256_mulhi_epu16(__m256i a, __m256i b) { // CHECK-LABEL: test_mm256_mulhi_epu16 diff --git a/clang/test/CodeGen/X86/avx512bitalg-builtins.c b/clang/test/CodeGen/X86/avx512bitalg-builtins.c index 30d364a28364..8092f2d30214 100644 --- a/clang/test/CodeGen/X86/avx512bitalg-builtins.c +++ b/clang/test/CodeGen/X86/avx512bitalg-builtins.c @@ -19,12 +19,15 @@ __m512i test_mm512_mask_popcnt_epi16(__m512i __A, __mmask32 __U, __m512i __B) { // CHECK: select <32 x i1> %{{[0-9]+}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_mask_popcnt_epi16(__A, __U, __B); } 
+TEST_CONSTEXPR(match_v32hi(_mm512_mask_popcnt_epi16(_mm512_set1_epi16(-1), 0xF0F0F0F0, (__m512i)(__v32hi){+5, -3, -10, +8, 0, -256, +256, -128, +3, +9, +15, +33, +63, +129, +511, +1025, +5, -3, -10, +8, 0, -256, +256, -128, +3, +9, +15, +33, +63, +129, +511, +1025}), -1, -1, -1, -1, 0, 8, 1, 9, -1, -1, -1, -1, 6, 2, 9, 2, -1, -1, -1, -1, 0, 8, 1, 9, -1, -1, -1, -1, 6, 2, 9, 2)); + __m512i test_mm512_maskz_popcnt_epi16(__mmask32 __U, __m512i __B) { // CHECK-LABEL: test_mm512_maskz_popcnt_epi16 // CHECK: @llvm.ctpop.v32i16 // CHECK: select <32 x i1> %{{[0-9]+}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_maskz_popcnt_epi16(__U, __B); } +TEST_CONSTEXPR(match_v32hi(_mm512_maskz_popcnt_epi16(0x0F0F0F0F, (__m512i)(__v32hi){+5, -3, -10, +8, 0, -256, +256, -128, +3, +9, +15, +33, +63, +129, +511, +1025, +5, -3, -10, +8, 0, -256, +256, -128, +3, +9, +15, +33, +63, +129, +511, +1025}), 2, 15, 14, 1, 0, 0, 0, 0, 2, 2, 4, 2, 0, 0, 0, 0, 2, 15, 14, 1, 0, 0, 0, 0, 2, 2, 4, 2, 0, 0, 0, 0)); __m512i test_mm512_popcnt_epi8(__m512i __A) { // CHECK-LABEL: test_mm512_popcnt_epi8 @@ -39,12 +42,15 @@ __m512i test_mm512_mask_popcnt_epi8(__m512i __A, __mmask64 __U, __m512i __B) { // CHECK: select <64 x i1> %{{[0-9]+}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} return _mm512_mask_popcnt_epi8(__A, __U, __B); } +TEST_CONSTEXPR(match_v64qi(_mm512_mask_popcnt_epi8(_mm512_set1_epi8(-1), 0xF0F0F0F00F0F0F0FULL, (__m512i)(__v64qi){+5, -3, -10, +8, 0, -16, +16, -16, +3, +9, +15, +33, +63, +33, +53, +73, +5, -3, -10, +8, 0, -16, +16, -16, +3, +9, +15, +33, +63, +33, +53, +73, +5, -3, -10, +8, 0, -16, +16, -16, +3, +9, +15, +33, +63, +33, +53, +73, +5, -3, -10, +8, 0, -16, +16, -16, +3, +9, +15, +33, +63, +33, +53, +73}), 2, 7, 6, 1, -1, -1, -1, -1, 2, 2, 4, 2, -1, -1, -1, -1, 2, 7, 6, 1, -1, -1, -1, -1, 2, 2, 4, 2, -1, -1, -1, -1, -1, -1, -1, -1, 0, 4, 1, 4, -1, -1, -1, -1, 6, 2, 4, 3, -1, -1, -1, -1, 0, 4, 1, 4, -1, -1, -1, -1, 6, 2, 4, 3)); + __m512i test_mm512_maskz_popcnt_epi8(__mmask64 
__U, __m512i __B) { // CHECK-LABEL: test_mm512_maskz_popcnt_epi8 // CHECK: @llvm.ctpop.v64i8 // CHECK: select <64 x i1> %{{[0-9]+}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} return _mm512_maskz_popcnt_epi8(__U, __B); } +TEST_CONSTEXPR(match_v64qi(_mm512_maskz_popcnt_epi8(0x0F0F0F0FF0F0F0F0ULL, (__m512i)(__v64qi){+5, -3, -10, +8, 0, -16, +16, -16, +3, +9, +15, +33, +63, +33, +53, +73, +5, -3, -10, +8, 0, -16, +16, -16, +3, +9, +15, +33, +63, +33, +53, +73, +5, -3, -10, +8, 0, -16, +16, -16, +3, +9, +15, +33, +63, +33, +53, +73, +5, -3, -10, +8, 0, -16, +16, -16, +3, +9, +15, +33, +63, +33, +53, +73}), 0, 0, 0, 0, 0, 4, 1, 4, 0, 0, 0, 0, 6, 2, 4, 3, 0, 0, 0, 0, 0, 4, 1, 4, 0, 0, 0, 0, 6, 2, 4, 3, 2, 7, 6, 1, 0, 0, 0, 0, 2, 2, 4, 2, 0, 0, 0, 0, 2, 7, 6, 1, 0, 0, 0, 0, 2, 2, 4, 2, 0, 0, 0, 0)); __mmask64 test_mm512_mask_bitshuffle_epi64_mask(__mmask64 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_mask_bitshuffle_epi64_mask diff --git a/clang/test/CodeGen/X86/avx512f-builtins.c b/clang/test/CodeGen/X86/avx512f-builtins.c index b8eadc42c78f..0f772e256f86 100644 --- a/clang/test/CodeGen/X86/avx512f-builtins.c +++ b/clang/test/CodeGen/X86/avx512f-builtins.c @@ -3027,6 +3027,7 @@ __m512i test_mm512_mul_epi32(__m512i __A, __m512i __B) { //CHECK: mul <8 x i64> %{{.*}}, %{{.*}} return _mm512_mul_epi32(__A,__B); } +TEST_CONSTEXPR(match_v8di(_mm512_mul_epi32((__m512i)(__v16si){+1, -2, +3, -4, +5, -6, +7, -8, +9, -10, +11, -12, +13, -14, +15, -16}, (__m512i)(__v16si){-32, -30, +28, +26, -24, -22, +20, +18, -16, -14, +12, +10, -8, +6, -4, +2}), -32, 84, -120, 140, -144, 132, -104, -60)); __m512i test_mm512_maskz_mul_epi32 (__mmask8 __k,__m512i __A, __m512i __B) { //CHECK-LABEL: test_mm512_maskz_mul_epi32 @@ -3057,6 +3058,7 @@ __m512i test_mm512_mul_epu32 (__m512i __A, __m512i __B) { //CHECK: mul <8 x i64> %{{.*}}, %{{.*}} return _mm512_mul_epu32(__A,__B); } +TEST_CONSTEXPR(match_m512i(_mm512_mul_epu32((__m512i)(__v16si){+1, -2, +3, -4, +5, -6, +7, -8, +9, -10, +11, -12, 
+13, -14, +15, -16}, (__m512i)(__v16si){-32, -30, +28, +26, -24, -22, +20, +18, -16, -14, +12, +10, -8, +6, -4, +2}), 4294967264, 84, 21474836360, 140, 38654705520, 132, 55834574744, 64424509380)); __m512i test_mm512_maskz_mul_epu32 (__mmask8 __k,__m512i __A, __m512i __B) { //CHECK-LABEL: test_mm512_maskz_mul_epu32 @@ -8953,29 +8955,34 @@ __m512d test_mm512_maskz_cvtps_pd(__mmask8 __U, __m256 __A) { // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_maskz_cvtps_pd(__U, __A); } + __m512d test_mm512_mask_mov_pd(__m512d __W, __mmask8 __U, __m512d __A) { // CHECK-LABEL: test_mm512_mask_mov_pd // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_mask_mov_pd(__W, __U, __A); } +TEST_CONSTEXPR(match_m512d(_mm512_mask_mov_pd((__m512d){-8.0, -7.0, -6.0, -5.0, -4.0, -3.0, -2.0, -1.0}, 0xC3, (__m512d){+1.0, +2.0, +3.0, +4.0, +5.0, +6.0, +7.0, +8.0}), +1.0, +2.0, -6.0, -5.0, -4.0, -3.0, +7.0, +8.0)); __m512d test_mm512_maskz_mov_pd(__mmask8 __U, __m512d __A) { // CHECK-LABEL: test_mm512_maskz_mov_pd // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_maskz_mov_pd(__U, __A); } +TEST_CONSTEXPR(match_m512d(_mm512_maskz_mov_pd(0xC3, (__m512d){+1.0, +2.0, +3.0, +4.0, +5.0, +6.0, +7.0, +8.0}), +1.0, +2.0, +0.0, +0.0, +0.0, +0.0, +7.0, +8.0)); __m512 test_mm512_mask_mov_ps(__m512 __W, __mmask16 __U, __m512 __A) { // CHECK-LABEL: test_mm512_mask_mov_ps // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_mask_mov_ps(__W, __U, __A); } +TEST_CONSTEXPR(match_m512(_mm512_mask_mov_ps((__m512){-16.0f, -15.0f, -14.0f, -13.0f, -12.0f, -11.0f, -10.0f, -9.0f, -8.0f, -7.0f, -6.0f, -5.0f, -4.0f, -3.0f, -2.0f, -1.0f}, 0x0FF0, (__m512){+1.0f, +2.0f, +3.0f, +4.0f, +5.0f, +6.0f, +7.0f, +8.0f, +9.0f, +10.0f, +11.0f, +12.0f, +13.0f, +14.0f, +15.0f, +16.0f}), -16.0f, -15.0f, -14.0f, -13.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, -4.0f, 
-3.0f, -2.0f, -1.0f)); __m512 test_mm512_maskz_mov_ps(__mmask16 __U, __m512 __A) { // CHECK-LABEL: test_mm512_maskz_mov_ps // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_maskz_mov_ps(__U, __A); } +TEST_CONSTEXPR(match_m512(_mm512_maskz_mov_ps(0xF3F3, (__m512){+1.0f, +2.0f, +3.0f, +4.0f, +5.0f, +6.0f, +7.0f, +8.0f, +9.0f, +10.0f, +11.0f, +12.0f, +13.0f, +14.0f, +15.0f, +16.0f}), +1.0f, +2.0f, 0.0f, 0.0f, +5.0f, +6.0f, +7.0f, +8.0f, +9.0f, +10.0f, 0.0f, 0.0f, +13.0f, +14.0f, +15.0f, +16.0f)); void test_mm512_mask_compressstoreu_pd(void *__P, __mmask8 __U, __m512d __A) { // CHECK-LABEL: test_mm512_mask_compressstoreu_pd diff --git a/clang/test/CodeGen/X86/avx512vl-builtins.c b/clang/test/CodeGen/X86/avx512vl-builtins.c index f1ff210b0b1d..a1c267dd51bf 100644 --- a/clang/test/CodeGen/X86/avx512vl-builtins.c +++ b/clang/test/CodeGen/X86/avx512vl-builtins.c @@ -9517,48 +9517,56 @@ __m128d test_mm_mask_mov_pd(__m128d __W, __mmask8 __U, __m128d __A) { // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_mask_mov_pd(__W, __U, __A); } +TEST_CONSTEXPR(match_m128d(_mm_mask_mov_pd((__m128d){-2.0, -1.0}, 0x2, (__m128d){+1.0, +2.0}), -2.0, +2.0)); __m128d test_mm_maskz_mov_pd(__mmask8 __U, __m128d __A) { // CHECK-LABEL: test_mm_maskz_mov_pd // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm_maskz_mov_pd(__U, __A); } +TEST_CONSTEXPR(match_m128d(_mm_maskz_mov_pd(0x1, (__m128d){+1.0, +2.0}), +1.0, +0.0)); __m256d test_mm256_mask_mov_pd(__m256d __W, __mmask8 __U, __m256d __A) { // CHECK-LABEL: test_mm256_mask_mov_pd // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_mask_mov_pd(__W, __U, __A); } +TEST_CONSTEXPR(match_m256d(_mm256_mask_mov_pd((__m256d){-4.0, -3.0, -2.0, -1.0}, 0x3, (__m256d){+1.0, +2.0, +3.0, +4.0}), +1.0, +2.0, -2.0, -1.0)); __m256d test_mm256_maskz_mov_pd(__mmask8 __U, __m256d __A) { // CHECK-LABEL: 
test_mm256_maskz_mov_pd // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_maskz_mov_pd(__U, __A); } +TEST_CONSTEXPR(match_m256d(_mm256_maskz_mov_pd(0xC, (__m256d){+1.0, +2.0, +3.0, +4.0}), 0.0, 0.0, +3.0, +4.0)); __m128 test_mm_mask_mov_ps(__m128 __W, __mmask8 __U, __m128 __A) { // CHECK-LABEL: test_mm_mask_mov_ps // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_mask_mov_ps(__W, __U, __A); } +TEST_CONSTEXPR(match_m128(_mm_mask_mov_ps((__m128){-4.0f, -3.0f, -2.0f, -1.0f}, 0x3, (__m128){+1.0f, +2.0f, +3.0f, +4.0f}), +1.0f, +2.0f, -2.0f, -1.0f)); __m128 test_mm_maskz_mov_ps(__mmask8 __U, __m128 __A) { // CHECK-LABEL: test_mm_maskz_mov_ps // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm_maskz_mov_ps(__U, __A); } +TEST_CONSTEXPR(match_m128(_mm_maskz_mov_ps(0xC, (__m128){+1.0f, +2.0f, +3.0f, +4.0f}), 0.0f, 0.0f, +3.0f, +4.0f)); __m256 test_mm256_mask_mov_ps(__m256 __W, __mmask8 __U, __m256 __A) { // CHECK-LABEL: test_mm256_mask_mov_ps // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm256_mask_mov_ps(__W, __U, __A); } +TEST_CONSTEXPR(match_m256(_mm256_mask_mov_ps((__m256){-8.0f, -7.0f, -6.0f, -5.0f, -4.0f, -3.0f, -2.0f, -1.0f}, 0xC3, (__m256){+1.0f, +2.0f, +3.0f, +4.0f, +5.0f, +6.0f, +7.0f, +8.0f}), +1.0f, +2.0f, -6.0f, -5.0f, -4.0f, -3.0f, +7.0f, +8.0f)); __m256 test_mm256_maskz_mov_ps(__mmask8 __U, __m256 __A) { // CHECK-LABEL: test_mm256_maskz_mov_ps // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm256_maskz_mov_ps(__U, __A); } +TEST_CONSTEXPR(match_m256(_mm256_maskz_mov_ps(0xC3, (__m256){+1.0f, +2.0f, +3.0f, +4.0f, +5.0f, +6.0f, +7.0f, +8.0f}), +1.0f, +2.0f, 0.0f, 0.0f, 0.0f, 0.0f, +7.0f, +8.0f)); __m128 test_mm_mask_cvtph_ps(__m128 __W, __mmask8 __U, __m128i __A) { // CHECK-LABEL: test_mm_mask_cvtph_ps diff --git a/clang/test/CodeGen/X86/avx512vlbitalg-builtins.c 
b/clang/test/CodeGen/X86/avx512vlbitalg-builtins.c index b53410ae4329..e2cfb3a348a9 100644 --- a/clang/test/CodeGen/X86/avx512vlbitalg-builtins.c +++ b/clang/test/CodeGen/X86/avx512vlbitalg-builtins.c @@ -19,12 +19,15 @@ __m256i test_mm256_mask_popcnt_epi16(__m256i __A, __mmask16 __U, __m256i __B) { // CHECK: select <16 x i1> %{{[0-9]+}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_popcnt_epi16(__A, __U, __B); } +TEST_CONSTEXPR(match_v16hi(_mm256_mask_popcnt_epi16(_mm256_set1_epi16(-1), 0xF0F0, (__m256i)(__v16hi){+5, -3, -10, +8, 0, -256, +256, -128, +3, +9, +15, +33, +63, +129, +511, +1025}), -1, -1, -1, -1, 0, 8, 1, 9, -1, -1, -1, -1, 6, 2, 9, 2)); + __m256i test_mm256_maskz_popcnt_epi16(__mmask16 __U, __m256i __B) { // CHECK-LABEL: test_mm256_maskz_popcnt_epi16 // CHECK: @llvm.ctpop.v16i16 // CHECK: select <16 x i1> %{{[0-9]+}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_popcnt_epi16(__U, __B); } +TEST_CONSTEXPR(match_v16hi(_mm256_maskz_popcnt_epi16(0x0F0F, (__m256i)(__v16hi){+5, -3, -10, +8, 0, -256, +256, -128, +3, +9, +15, +33, +63, +129, +511, +1025}), 2, 15, 14, 1, 0, 0, 0, 0, 2, 2, 4, 2, 0, 0, 0, 0)); __m128i test_mm_popcnt_epi16(__m128i __A) { // CHECK-LABEL: test_mm_popcnt_epi16 @@ -39,12 +42,15 @@ __m128i test_mm_mask_popcnt_epi16(__m128i __A, __mmask8 __U, __m128i __B) { // CHECK: select <8 x i1> %{{[0-9]+}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_popcnt_epi16(__A, __U, __B); } +TEST_CONSTEXPR(match_v8hi(_mm_mask_popcnt_epi16(_mm_set1_epi16(-1), 0xF0, (__m128i)(__v8hi){+5, -3, -10, +8, 0, -256, +256, -128}), -1, -1, -1, -1, 0, 8, 1, 9)); + __m128i test_mm_maskz_popcnt_epi16(__mmask8 __U, __m128i __B) { // CHECK-LABEL: test_mm_maskz_popcnt_epi16 // CHECK: @llvm.ctpop.v8i16 // CHECK: select <8 x i1> %{{[0-9]+}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_popcnt_epi16(__U, __B); } +TEST_CONSTEXPR(match_v8hi(_mm_maskz_popcnt_epi16(0x0F, (__m128i)(__v8hi){+5, -3, -10, +8, 0, -256, +256, -128}), 2, 15, 
14, 1, 0, 0, 0, 0)); __m256i test_mm256_popcnt_epi8(__m256i __A) { // CHECK-LABEL: test_mm256_popcnt_epi8 @@ -59,12 +65,15 @@ __m256i test_mm256_mask_popcnt_epi8(__m256i __A, __mmask32 __U, __m256i __B) { // CHECK: select <32 x i1> %{{[0-9]+}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_mask_popcnt_epi8(__A, __U, __B); } +TEST_CONSTEXPR(match_v32qi(_mm256_mask_popcnt_epi8(_mm256_set1_epi8(-1), 0xF00F, (__m256i)(__v32qi){+5, -3, -10, +8, 0, -16, +16, -16, +3, +9, +15, +33, +63, +33, +53, +73, +5, -3, -10, +8, 0, -16, +16, -16, +3, +9, +15, +33, +63, +33, +53, +73}), 2, 7, 6, 1, -1, -1, -1, -1, -1, -1, -1, -1, 6, 2, 4, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1)); + __m256i test_mm256_maskz_popcnt_epi8(__mmask32 __U, __m256i __B) { // CHECK-LABEL: test_mm256_maskz_popcnt_epi8 // CHECK: @llvm.ctpop.v32i8 // CHECK: select <32 x i1> %{{[0-9]+}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_maskz_popcnt_epi8(__U, __B); } +TEST_CONSTEXPR(match_v32qi(_mm256_maskz_popcnt_epi8(0x0FF0, (__m256i)(__v32qi){+5, -3, -10, +8, 0, -16, +16, -16, +3, +9, +15, +33, +63, +33, +53, +73, +5, -3, -10, +8, 0, -16, +16, -16, +3, +9, +15, +33, +63, +33, +53, +73}), 0, 0, 0, 0, 0, 4, 1, 4, 2, 2, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); __m128i test_mm_popcnt_epi8(__m128i __A) { // CHECK-LABEL: test_mm_popcnt_epi8 @@ -79,12 +88,15 @@ __m128i test_mm_mask_popcnt_epi8(__m128i __A, __mmask16 __U, __m128i __B) { // CHECK: select <16 x i1> %{{[0-9]+}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_mask_popcnt_epi8(__A, __U, __B); } +TEST_CONSTEXPR(match_v16qi(_mm_mask_popcnt_epi8(_mm_set1_epi8(-1), 0xF00F, (__m128i)(__v16qi){+5, -3, -10, +8, 0, -16, +16, -16, +3, +9, +15, +33, +63, +33, +53, +73}), 2, 7, 6, 1, -1, -1, -1, -1, -1, -1, -1, -1, 6, 2, 4, 3)); + __m128i test_mm_maskz_popcnt_epi8(__mmask16 __U, __m128i __B) { // CHECK-LABEL: test_mm_maskz_popcnt_epi8 // CHECK: @llvm.ctpop.v16i8 // CHECK: select <16 x i1> %{{[0-9]+}}, <16 x 
i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_maskz_popcnt_epi8(__U, __B); } +TEST_CONSTEXPR(match_v16qi(_mm_maskz_popcnt_epi8(0x0FF0, (__m128i)(__v16qi){+5, -3, -10, +8, 0, -16, +16, -16, +3, +9, +15, +33, +63, +33, +53, +73}), 0, 0, 0, 0, 0, 4, 1, 4, 2, 2, 4, 2, 0, 0, 0, 0)); __mmask32 test_mm256_mask_bitshuffle_epi64_mask(__mmask32 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_mask_bitshuffle_epi64_mask diff --git a/clang/test/CodeGen/X86/avx512vlbw-builtins.c b/clang/test/CodeGen/X86/avx512vlbw-builtins.c index 4ec499caabf0..e792c15ba543 100644 --- a/clang/test/CodeGen/X86/avx512vlbw-builtins.c +++ b/clang/test/CodeGen/X86/avx512vlbw-builtins.c @@ -1,732 +1,735 @@ -// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512bw -target-feature +avx512vl -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s -// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512bw -target-feature +avx512vl -fno-signed-char -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s -// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx10.1-512 -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512bw -target-feature +avx512vl -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512bw -target-feature +avx512vl -fno-signed-char -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx10.1-512 -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s +// RUN: %clang_cc1 -x c++ 
-flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512bw -target-feature +avx512vl -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx512bw -target-feature +avx512vl -fno-signed-char -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avx10.1-512 -emit-llvm -o - -Wall -Werror -Wsign-conversion | FileCheck %s #include __mmask32 test_mm256_cmpeq_epi8_mask(__m256i __a, __m256i __b) { - // CHECK-LABEL: @test_mm256_cmpeq_epi8_mask + // CHECK-LABEL: test_mm256_cmpeq_epi8_mask // CHECK: icmp eq <32 x i8> %{{.*}}, %{{.*}} return (__mmask32)_mm256_cmpeq_epi8_mask(__a, __b); } __mmask32 test_mm256_mask_cmpeq_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) { - // CHECK-LABEL: @test_mm256_mask_cmpeq_epi8_mask + // CHECK-LABEL: test_mm256_mask_cmpeq_epi8_mask // CHECK: icmp eq <32 x i8> %{{.*}}, %{{.*}} // CHECK: and <32 x i1> %{{.*}}, %{{.*}} return (__mmask32)_mm256_mask_cmpeq_epi8_mask(__u, __a, __b); } __mmask16 test_mm_cmpeq_epi8_mask(__m128i __a, __m128i __b) { - // CHECK-LABEL: @test_mm_cmpeq_epi8_mask + // CHECK-LABEL: test_mm_cmpeq_epi8_mask // CHECK: icmp eq <16 x i8> %{{.*}}, %{{.*}} return (__mmask16)_mm_cmpeq_epi8_mask(__a, __b); } __mmask16 test_mm_mask_cmpeq_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) { - // CHECK-LABEL: @test_mm_mask_cmpeq_epi8_mask + // CHECK-LABEL: test_mm_mask_cmpeq_epi8_mask // CHECK: icmp eq <16 x i8> %{{.*}}, %{{.*}} // CHECK: and <16 x i1> %{{.*}}, %{{.*}} return (__mmask16)_mm_mask_cmpeq_epi8_mask(__u, __a, __b); } __mmask16 test_mm256_cmpeq_epi16_mask(__m256i __a, __m256i __b) { - // CHECK-LABEL: @test_mm256_cmpeq_epi16_mask + // CHECK-LABEL: test_mm256_cmpeq_epi16_mask // CHECK: icmp eq <16 x i16> %{{.*}}, %{{.*}} return 
(__mmask16)_mm256_cmpeq_epi16_mask(__a, __b); } __mmask16 test_mm256_mask_cmpeq_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) { - // CHECK-LABEL: @test_mm256_mask_cmpeq_epi16_mask + // CHECK-LABEL: test_mm256_mask_cmpeq_epi16_mask // CHECK: icmp eq <16 x i16> %{{.*}}, %{{.*}} // CHECK: and <16 x i1> %{{.*}}, %{{.*}} return (__mmask16)_mm256_mask_cmpeq_epi16_mask(__u, __a, __b); } __mmask8 test_mm_cmpeq_epi16_mask(__m128i __a, __m128i __b) { - // CHECK-LABEL: @test_mm_cmpeq_epi16_mask + // CHECK-LABEL: test_mm_cmpeq_epi16_mask // CHECK: icmp eq <8 x i16> %{{.*}}, %{{.*}} return (__mmask8)_mm_cmpeq_epi16_mask(__a, __b); } __mmask8 test_mm_mask_cmpeq_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) { - // CHECK-LABEL: @test_mm_mask_cmpeq_epi16_mask + // CHECK-LABEL: test_mm_mask_cmpeq_epi16_mask // CHECK: icmp eq <8 x i16> %{{.*}}, %{{.*}} // CHECK: and <8 x i1> %{{.*}}, %{{.*}} return (__mmask8)_mm_mask_cmpeq_epi16_mask(__u, __a, __b); } __mmask32 test_mm256_cmpgt_epi8_mask(__m256i __a, __m256i __b) { - // CHECK-LABEL: @test_mm256_cmpgt_epi8_mask + // CHECK-LABEL: test_mm256_cmpgt_epi8_mask // CHECK: icmp sgt <32 x i8> %{{.*}}, %{{.*}} return (__mmask32)_mm256_cmpgt_epi8_mask(__a, __b); } __mmask32 test_mm256_mask_cmpgt_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) { - // CHECK-LABEL: @test_mm256_mask_cmpgt_epi8_mask + // CHECK-LABEL: test_mm256_mask_cmpgt_epi8_mask // CHECK: icmp sgt <32 x i8> %{{.*}}, %{{.*}} // CHECK: and <32 x i1> %{{.*}}, %{{.*}} return (__mmask32)_mm256_mask_cmpgt_epi8_mask(__u, __a, __b); } __mmask16 test_mm_cmpgt_epi8_mask(__m128i __a, __m128i __b) { - // CHECK-LABEL: @test_mm_cmpgt_epi8_mask + // CHECK-LABEL: test_mm_cmpgt_epi8_mask // CHECK: icmp sgt <16 x i8> %{{.*}}, %{{.*}} return (__mmask16)_mm_cmpgt_epi8_mask(__a, __b); } __mmask16 test_mm_mask_cmpgt_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) { - // CHECK-LABEL: @test_mm_mask_cmpgt_epi8_mask + // CHECK-LABEL: test_mm_mask_cmpgt_epi8_mask // CHECK: icmp sgt <16 x 
i8> %{{.*}}, %{{.*}} // CHECK: and <16 x i1> %{{.*}}, %{{.*}} return (__mmask16)_mm_mask_cmpgt_epi8_mask(__u, __a, __b); } __mmask16 test_mm256_cmpgt_epi16_mask(__m256i __a, __m256i __b) { - // CHECK-LABEL: @test_mm256_cmpgt_epi16_mask + // CHECK-LABEL: test_mm256_cmpgt_epi16_mask // CHECK: icmp sgt <16 x i16> %{{.*}}, %{{.*}} return (__mmask16)_mm256_cmpgt_epi16_mask(__a, __b); } __mmask16 test_mm256_mask_cmpgt_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) { - // CHECK-LABEL: @test_mm256_mask_cmpgt_epi16_mask + // CHECK-LABEL: test_mm256_mask_cmpgt_epi16_mask // CHECK: icmp sgt <16 x i16> %{{.*}}, %{{.*}} // CHECK: and <16 x i1> %{{.*}}, %{{.*}} return (__mmask16)_mm256_mask_cmpgt_epi16_mask(__u, __a, __b); } __mmask8 test_mm_cmpgt_epi16_mask(__m128i __a, __m128i __b) { - // CHECK-LABEL: @test_mm_cmpgt_epi16_mask + // CHECK-LABEL: test_mm_cmpgt_epi16_mask // CHECK: icmp sgt <8 x i16> %{{.*}}, %{{.*}} return (__mmask8)_mm_cmpgt_epi16_mask(__a, __b); } __mmask8 test_mm_mask_cmpgt_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) { - // CHECK-LABEL: @test_mm_mask_cmpgt_epi16_mask + // CHECK-LABEL: test_mm_mask_cmpgt_epi16_mask // CHECK: icmp sgt <8 x i16> %{{.*}}, %{{.*}} // CHECK: and <8 x i1> %{{.*}}, %{{.*}} return (__mmask8)_mm_mask_cmpgt_epi16_mask(__u, __a, __b); } __mmask16 test_mm_cmpeq_epu8_mask(__m128i __a, __m128i __b) { - // CHECK-LABEL: @test_mm_cmpeq_epu8_mask + // CHECK-LABEL: test_mm_cmpeq_epu8_mask // CHECK: icmp eq <16 x i8> %{{.*}}, %{{.*}} return (__mmask16)_mm_cmpeq_epu8_mask(__a, __b); } __mmask16 test_mm_mask_cmpeq_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) { - // CHECK-LABEL: @test_mm_mask_cmpeq_epu8_mask + // CHECK-LABEL: test_mm_mask_cmpeq_epu8_mask // CHECK: icmp eq <16 x i8> %{{.*}}, %{{.*}} // CHECK: and <16 x i1> %{{.*}}, %{{.*}} return (__mmask16)_mm_mask_cmpeq_epu8_mask(__u, __a, __b); } __mmask8 test_mm_cmpeq_epu16_mask(__m128i __a, __m128i __b) { - // CHECK-LABEL: @test_mm_cmpeq_epu16_mask + // CHECK-LABEL: 
test_mm_cmpeq_epu16_mask // CHECK: icmp eq <8 x i16> %{{.*}}, %{{.*}} return (__mmask8)_mm_cmpeq_epu16_mask(__a, __b); } __mmask8 test_mm_mask_cmpeq_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) { - // CHECK-LABEL: @test_mm_mask_cmpeq_epu16_mask + // CHECK-LABEL: test_mm_mask_cmpeq_epu16_mask // CHECK: icmp eq <8 x i16> %{{.*}}, %{{.*}} // CHECK: and <8 x i1> %{{.*}}, %{{.*}} return (__mmask8)_mm_mask_cmpeq_epu16_mask(__u, __a, __b); } __mmask32 test_mm256_cmpeq_epu8_mask(__m256i __a, __m256i __b) { - // CHECK-LABEL: @test_mm256_cmpeq_epu8_mask + // CHECK-LABEL: test_mm256_cmpeq_epu8_mask // CHECK: icmp eq <32 x i8> %{{.*}}, %{{.*}} return (__mmask32)_mm256_cmpeq_epu8_mask(__a, __b); } __mmask32 test_mm256_mask_cmpeq_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) { - // CHECK-LABEL: @test_mm256_mask_cmpeq_epu8_mask + // CHECK-LABEL: test_mm256_mask_cmpeq_epu8_mask // CHECK: icmp eq <32 x i8> %{{.*}}, %{{.*}} // CHECK: and <32 x i1> %{{.*}}, %{{.*}} return (__mmask32)_mm256_mask_cmpeq_epu8_mask(__u, __a, __b); } __mmask16 test_mm256_cmpeq_epu16_mask(__m256i __a, __m256i __b) { - // CHECK-LABEL: @test_mm256_cmpeq_epu16_mask + // CHECK-LABEL: test_mm256_cmpeq_epu16_mask // CHECK: icmp eq <16 x i16> %{{.*}}, %{{.*}} return (__mmask16)_mm256_cmpeq_epu16_mask(__a, __b); } __mmask16 test_mm256_mask_cmpeq_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) { - // CHECK-LABEL: @test_mm256_mask_cmpeq_epu16_mask + // CHECK-LABEL: test_mm256_mask_cmpeq_epu16_mask // CHECK: icmp eq <16 x i16> %{{.*}}, %{{.*}} // CHECK: and <16 x i1> %{{.*}}, %{{.*}} return (__mmask16)_mm256_mask_cmpeq_epu16_mask(__u, __a, __b); } __mmask16 test_mm_cmpgt_epu8_mask(__m128i __a, __m128i __b) { - // CHECK-LABEL: @test_mm_cmpgt_epu8_mask + // CHECK-LABEL: test_mm_cmpgt_epu8_mask // CHECK: icmp ugt <16 x i8> %{{.*}}, %{{.*}} return (__mmask16)_mm_cmpgt_epu8_mask(__a, __b); } __mmask16 test_mm_mask_cmpgt_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) { - // CHECK-LABEL: 
@test_mm_mask_cmpgt_epu8_mask + // CHECK-LABEL: test_mm_mask_cmpgt_epu8_mask // CHECK: icmp ugt <16 x i8> %{{.*}}, %{{.*}} // CHECK: and <16 x i1> %{{.*}}, %{{.*}} return (__mmask16)_mm_mask_cmpgt_epu8_mask(__u, __a, __b); } __mmask8 test_mm_cmpgt_epu16_mask(__m128i __a, __m128i __b) { - // CHECK-LABEL: @test_mm_cmpgt_epu16_mask + // CHECK-LABEL: test_mm_cmpgt_epu16_mask // CHECK: icmp ugt <8 x i16> %{{.*}}, %{{.*}} return (__mmask8)_mm_cmpgt_epu16_mask(__a, __b); } __mmask8 test_mm_mask_cmpgt_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) { - // CHECK-LABEL: @test_mm_mask_cmpgt_epu16_mask + // CHECK-LABEL: test_mm_mask_cmpgt_epu16_mask // CHECK: icmp ugt <8 x i16> %{{.*}}, %{{.*}} // CHECK: and <8 x i1> %{{.*}}, %{{.*}} return (__mmask8)_mm_mask_cmpgt_epu16_mask(__u, __a, __b); } __mmask32 test_mm256_cmpgt_epu8_mask(__m256i __a, __m256i __b) { - // CHECK-LABEL: @test_mm256_cmpgt_epu8_mask + // CHECK-LABEL: test_mm256_cmpgt_epu8_mask // CHECK: icmp ugt <32 x i8> %{{.*}}, %{{.*}} return (__mmask32)_mm256_cmpgt_epu8_mask(__a, __b); } __mmask32 test_mm256_mask_cmpgt_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) { - // CHECK-LABEL: @test_mm256_mask_cmpgt_epu8_mask + // CHECK-LABEL: test_mm256_mask_cmpgt_epu8_mask // CHECK: icmp ugt <32 x i8> %{{.*}}, %{{.*}} // CHECK: and <32 x i1> %{{.*}}, %{{.*}} return (__mmask32)_mm256_mask_cmpgt_epu8_mask(__u, __a, __b); } __mmask16 test_mm256_cmpgt_epu16_mask(__m256i __a, __m256i __b) { - // CHECK-LABEL: @test_mm256_cmpgt_epu16_mask + // CHECK-LABEL: test_mm256_cmpgt_epu16_mask // CHECK: icmp ugt <16 x i16> %{{.*}}, %{{.*}} return (__mmask16)_mm256_cmpgt_epu16_mask(__a, __b); } __mmask16 test_mm256_mask_cmpgt_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) { - // CHECK-LABEL: @test_mm256_mask_cmpgt_epu16_mask + // CHECK-LABEL: test_mm256_mask_cmpgt_epu16_mask // CHECK: icmp ugt <16 x i16> %{{.*}}, %{{.*}} // CHECK: and <16 x i1> %{{.*}}, %{{.*}} return (__mmask16)_mm256_mask_cmpgt_epu16_mask(__u, __a, __b); } 
__mmask16 test_mm_cmpge_epi8_mask(__m128i __a, __m128i __b) { - // CHECK-LABEL: @test_mm_cmpge_epi8_mask + // CHECK-LABEL: test_mm_cmpge_epi8_mask // CHECK: icmp sge <16 x i8> %{{.*}}, %{{.*}} return (__mmask16)_mm_cmpge_epi8_mask(__a, __b); } __mmask16 test_mm_mask_cmpge_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) { - // CHECK-LABEL: @test_mm_mask_cmpge_epi8_mask + // CHECK-LABEL: test_mm_mask_cmpge_epi8_mask // CHECK: icmp sge <16 x i8> %{{.*}}, %{{.*}} // CHECK: and <16 x i1> %{{.*}}, %{{.*}} return (__mmask16)_mm_mask_cmpge_epi8_mask(__u, __a, __b); } __mmask16 test_mm_cmpge_epu8_mask(__m128i __a, __m128i __b) { - // CHECK-LABEL: @test_mm_cmpge_epu8_mask + // CHECK-LABEL: test_mm_cmpge_epu8_mask // CHECK: icmp uge <16 x i8> %{{.*}}, %{{.*}} return (__mmask16)_mm_cmpge_epu8_mask(__a, __b); } __mmask16 test_mm_mask_cmpge_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) { - // CHECK-LABEL: @test_mm_mask_cmpge_epu8_mask + // CHECK-LABEL: test_mm_mask_cmpge_epu8_mask // CHECK: icmp uge <16 x i8> %{{.*}}, %{{.*}} // CHECK: and <16 x i1> %{{.*}}, %{{.*}} return (__mmask16)_mm_mask_cmpge_epu8_mask(__u, __a, __b); } __mmask8 test_mm_cmpge_epi16_mask(__m128i __a, __m128i __b) { - // CHECK-LABEL: @test_mm_cmpge_epi16_mask + // CHECK-LABEL: test_mm_cmpge_epi16_mask // CHECK: icmp sge <8 x i16> %{{.*}}, %{{.*}} return (__mmask8)_mm_cmpge_epi16_mask(__a, __b); } __mmask8 test_mm_mask_cmpge_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) { - // CHECK-LABEL: @test_mm_mask_cmpge_epi16_mask + // CHECK-LABEL: test_mm_mask_cmpge_epi16_mask // CHECK: icmp sge <8 x i16> %{{.*}}, %{{.*}} // CHECK: and <8 x i1> %{{.*}}, %{{.*}} return (__mmask8)_mm_mask_cmpge_epi16_mask(__u, __a, __b); } __mmask8 test_mm_cmpge_epu16_mask(__m128i __a, __m128i __b) { - // CHECK-LABEL: @test_mm_cmpge_epu16_mask + // CHECK-LABEL: test_mm_cmpge_epu16_mask // CHECK: icmp uge <8 x i16> %{{.*}}, %{{.*}} return (__mmask8)_mm_cmpge_epu16_mask(__a, __b); } __mmask8 
test_mm_mask_cmpge_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) { - // CHECK-LABEL: @test_mm_mask_cmpge_epu16_mask + // CHECK-LABEL: test_mm_mask_cmpge_epu16_mask // CHECK: icmp uge <8 x i16> %{{.*}}, %{{.*}} // CHECK: and <8 x i1> %{{.*}}, %{{.*}} return (__mmask8)_mm_mask_cmpge_epu16_mask(__u, __a, __b); } __mmask32 test_mm256_cmpge_epi8_mask(__m256i __a, __m256i __b) { - // CHECK-LABEL: @test_mm256_cmpge_epi8_mask + // CHECK-LABEL: test_mm256_cmpge_epi8_mask // CHECK: icmp sge <32 x i8> %{{.*}}, %{{.*}} return (__mmask32)_mm256_cmpge_epi8_mask(__a, __b); } __mmask32 test_mm256_mask_cmpge_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) { - // CHECK-LABEL: @test_mm256_mask_cmpge_epi8_mask + // CHECK-LABEL: test_mm256_mask_cmpge_epi8_mask // CHECK: icmp sge <32 x i8> %{{.*}}, %{{.*}} // CHECK: and <32 x i1> %{{.*}}, %{{.*}} return (__mmask32)_mm256_mask_cmpge_epi8_mask(__u, __a, __b); } __mmask32 test_mm256_cmpge_epu8_mask(__m256i __a, __m256i __b) { - // CHECK-LABEL: @test_mm256_cmpge_epu8_mask + // CHECK-LABEL: test_mm256_cmpge_epu8_mask // CHECK: icmp uge <32 x i8> %{{.*}}, %{{.*}} return (__mmask32)_mm256_cmpge_epu8_mask(__a, __b); } __mmask32 test_mm256_mask_cmpge_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) { - // CHECK-LABEL: @test_mm256_mask_cmpge_epu8_mask + // CHECK-LABEL: test_mm256_mask_cmpge_epu8_mask // CHECK: icmp uge <32 x i8> %{{.*}}, %{{.*}} // CHECK: and <32 x i1> %{{.*}}, %{{.*}} return (__mmask32)_mm256_mask_cmpge_epu8_mask(__u, __a, __b); } __mmask16 test_mm256_cmpge_epi16_mask(__m256i __a, __m256i __b) { - // CHECK-LABEL: @test_mm256_cmpge_epi16_mask + // CHECK-LABEL: test_mm256_cmpge_epi16_mask // CHECK: icmp sge <16 x i16> %{{.*}}, %{{.*}} return (__mmask16)_mm256_cmpge_epi16_mask(__a, __b); } __mmask16 test_mm256_mask_cmpge_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) { - // CHECK-LABEL: @test_mm256_mask_cmpge_epi16_mask + // CHECK-LABEL: test_mm256_mask_cmpge_epi16_mask // CHECK: icmp sge <16 x i16> %{{.*}}, %{{.*}} 
// CHECK: and <16 x i1> %{{.*}}, %{{.*}} return (__mmask16)_mm256_mask_cmpge_epi16_mask(__u, __a, __b); } __mmask16 test_mm256_cmpge_epu16_mask(__m256i __a, __m256i __b) { - // CHECK-LABEL: @test_mm256_cmpge_epu16_mask + // CHECK-LABEL: test_mm256_cmpge_epu16_mask // CHECK: icmp uge <16 x i16> %{{.*}}, %{{.*}} return (__mmask16)_mm256_cmpge_epu16_mask(__a, __b); } __mmask16 test_mm256_mask_cmpge_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) { - // CHECK-LABEL: @test_mm256_mask_cmpge_epu16_mask + // CHECK-LABEL: test_mm256_mask_cmpge_epu16_mask // CHECK: icmp uge <16 x i16> %{{.*}}, %{{.*}} // CHECK: and <16 x i1> %{{.*}}, %{{.*}} return (__mmask16)_mm256_mask_cmpge_epu16_mask(__u, __a, __b); } __mmask16 test_mm_cmple_epi8_mask(__m128i __a, __m128i __b) { - // CHECK-LABEL: @test_mm_cmple_epi8_mask + // CHECK-LABEL: test_mm_cmple_epi8_mask // CHECK: icmp sle <16 x i8> %{{.*}}, %{{.*}} return (__mmask16)_mm_cmple_epi8_mask(__a, __b); } __mmask16 test_mm_mask_cmple_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) { - // CHECK-LABEL: @test_mm_mask_cmple_epi8_mask + // CHECK-LABEL: test_mm_mask_cmple_epi8_mask // CHECK: icmp sle <16 x i8> %{{.*}}, %{{.*}} // CHECK: and <16 x i1> %{{.*}}, %{{.*}} return (__mmask16)_mm_mask_cmple_epi8_mask(__u, __a, __b); } __mmask16 test_mm_cmple_epu8_mask(__m128i __a, __m128i __b) { - // CHECK-LABEL: @test_mm_cmple_epu8_mask + // CHECK-LABEL: test_mm_cmple_epu8_mask // CHECK: icmp ule <16 x i8> %{{.*}}, %{{.*}} return (__mmask16)_mm_cmple_epu8_mask(__a, __b); } __mmask16 test_mm_mask_cmple_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) { - // CHECK-LABEL: @test_mm_mask_cmple_epu8_mask + // CHECK-LABEL: test_mm_mask_cmple_epu8_mask // CHECK: icmp ule <16 x i8> %{{.*}}, %{{.*}} // CHECK: and <16 x i1> %{{.*}}, %{{.*}} return (__mmask16)_mm_mask_cmple_epu8_mask(__u, __a, __b); } __mmask8 test_mm_cmple_epi16_mask(__m128i __a, __m128i __b) { - // CHECK-LABEL: @test_mm_cmple_epi16_mask + // CHECK-LABEL: test_mm_cmple_epi16_mask // 
CHECK: icmp sle <8 x i16> %{{.*}}, %{{.*}} return (__mmask8)_mm_cmple_epi16_mask(__a, __b); } __mmask8 test_mm_mask_cmple_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) { - // CHECK-LABEL: @test_mm_mask_cmple_epi16_mask + // CHECK-LABEL: test_mm_mask_cmple_epi16_mask // CHECK: icmp sle <8 x i16> %{{.*}}, %{{.*}} // CHECK: and <8 x i1> %{{.*}}, %{{.*}} return (__mmask8)_mm_mask_cmple_epi16_mask(__u, __a, __b); } __mmask8 test_mm_cmple_epu16_mask(__m128i __a, __m128i __b) { - // CHECK-LABEL: @test_mm_cmple_epu16_mask + // CHECK-LABEL: test_mm_cmple_epu16_mask // CHECK: icmp ule <8 x i16> %{{.*}}, %{{.*}} return (__mmask8)_mm_cmple_epu16_mask(__a, __b); } __mmask8 test_mm_mask_cmple_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) { - // CHECK-LABEL: @test_mm_mask_cmple_epu16_mask + // CHECK-LABEL: test_mm_mask_cmple_epu16_mask // CHECK: icmp ule <8 x i16> %{{.*}}, %{{.*}} // CHECK: and <8 x i1> %{{.*}}, %{{.*}} return (__mmask8)_mm_mask_cmple_epu16_mask(__u, __a, __b); } __mmask32 test_mm256_cmple_epi8_mask(__m256i __a, __m256i __b) { - // CHECK-LABEL: @test_mm256_cmple_epi8_mask + // CHECK-LABEL: test_mm256_cmple_epi8_mask // CHECK: icmp sle <32 x i8> %{{.*}}, %{{.*}} return (__mmask32)_mm256_cmple_epi8_mask(__a, __b); } __mmask32 test_mm256_mask_cmple_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) { - // CHECK-LABEL: @test_mm256_mask_cmple_epi8_mask + // CHECK-LABEL: test_mm256_mask_cmple_epi8_mask // CHECK: icmp sle <32 x i8> %{{.*}}, %{{.*}} // CHECK: and <32 x i1> %{{.*}}, %{{.*}} return (__mmask32)_mm256_mask_cmple_epi8_mask(__u, __a, __b); } __mmask32 test_mm256_cmple_epu8_mask(__m256i __a, __m256i __b) { - // CHECK-LABEL: @test_mm256_cmple_epu8_mask + // CHECK-LABEL: test_mm256_cmple_epu8_mask // CHECK: icmp ule <32 x i8> %{{.*}}, %{{.*}} return (__mmask32)_mm256_cmple_epu8_mask(__a, __b); } __mmask32 test_mm256_mask_cmple_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) { - // CHECK-LABEL: @test_mm256_mask_cmple_epu8_mask + // CHECK-LABEL: 
test_mm256_mask_cmple_epu8_mask // CHECK: icmp ule <32 x i8> %{{.*}}, %{{.*}} // CHECK: and <32 x i1> %{{.*}}, %{{.*}} return (__mmask32)_mm256_mask_cmple_epu8_mask(__u, __a, __b); } __mmask16 test_mm256_cmple_epi16_mask(__m256i __a, __m256i __b) { - // CHECK-LABEL: @test_mm256_cmple_epi16_mask + // CHECK-LABEL: test_mm256_cmple_epi16_mask // CHECK: icmp sle <16 x i16> %{{.*}}, %{{.*}} return (__mmask16)_mm256_cmple_epi16_mask(__a, __b); } __mmask16 test_mm256_mask_cmple_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) { - // CHECK-LABEL: @test_mm256_mask_cmple_epi16_mask + // CHECK-LABEL: test_mm256_mask_cmple_epi16_mask // CHECK: icmp sle <16 x i16> %{{.*}}, %{{.*}} // CHECK: and <16 x i1> %{{.*}}, %{{.*}} return (__mmask16)_mm256_mask_cmple_epi16_mask(__u, __a, __b); } __mmask16 test_mm256_cmple_epu16_mask(__m256i __a, __m256i __b) { - // CHECK-LABEL: @test_mm256_cmple_epu16_mask + // CHECK-LABEL: test_mm256_cmple_epu16_mask // CHECK: icmp ule <16 x i16> %{{.*}}, %{{.*}} return (__mmask16)_mm256_cmple_epu16_mask(__a, __b); } __mmask16 test_mm256_mask_cmple_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) { - // CHECK-LABEL: @test_mm256_mask_cmple_epu16_mask + // CHECK-LABEL: test_mm256_mask_cmple_epu16_mask // CHECK: icmp ule <16 x i16> %{{.*}}, %{{.*}} // CHECK: and <16 x i1> %{{.*}}, %{{.*}} return (__mmask16)_mm256_mask_cmple_epu16_mask(__u, __a, __b); } __mmask16 test_mm_cmplt_epi8_mask(__m128i __a, __m128i __b) { - // CHECK-LABEL: @test_mm_cmplt_epi8_mask + // CHECK-LABEL: test_mm_cmplt_epi8_mask // CHECK: icmp slt <16 x i8> %{{.*}}, %{{.*}} return (__mmask16)_mm_cmplt_epi8_mask(__a, __b); } __mmask16 test_mm_mask_cmplt_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) { - // CHECK-LABEL: @test_mm_mask_cmplt_epi8_mask + // CHECK-LABEL: test_mm_mask_cmplt_epi8_mask // CHECK: icmp slt <16 x i8> %{{.*}}, %{{.*}} // CHECK: and <16 x i1> %{{.*}}, %{{.*}} return (__mmask16)_mm_mask_cmplt_epi8_mask(__u, __a, __b); } __mmask16 test_mm_cmplt_epu8_mask(__m128i 
__a, __m128i __b) { - // CHECK-LABEL: @test_mm_cmplt_epu8_mask + // CHECK-LABEL: test_mm_cmplt_epu8_mask // CHECK: icmp ult <16 x i8> %{{.*}}, %{{.*}} return (__mmask16)_mm_cmplt_epu8_mask(__a, __b); } __mmask16 test_mm_mask_cmplt_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) { - // CHECK-LABEL: @test_mm_mask_cmplt_epu8_mask + // CHECK-LABEL: test_mm_mask_cmplt_epu8_mask // CHECK: icmp ult <16 x i8> %{{.*}}, %{{.*}} // CHECK: and <16 x i1> %{{.*}}, %{{.*}} return (__mmask16)_mm_mask_cmplt_epu8_mask(__u, __a, __b); } __mmask8 test_mm_cmplt_epi16_mask(__m128i __a, __m128i __b) { - // CHECK-LABEL: @test_mm_cmplt_epi16_mask + // CHECK-LABEL: test_mm_cmplt_epi16_mask // CHECK: icmp slt <8 x i16> %{{.*}}, %{{.*}} return (__mmask8)_mm_cmplt_epi16_mask(__a, __b); } __mmask8 test_mm_mask_cmplt_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) { - // CHECK-LABEL: @test_mm_mask_cmplt_epi16_mask + // CHECK-LABEL: test_mm_mask_cmplt_epi16_mask // CHECK: icmp slt <8 x i16> %{{.*}}, %{{.*}} // CHECK: and <8 x i1> %{{.*}}, %{{.*}} return (__mmask8)_mm_mask_cmplt_epi16_mask(__u, __a, __b); } __mmask8 test_mm_cmplt_epu16_mask(__m128i __a, __m128i __b) { - // CHECK-LABEL: @test_mm_cmplt_epu16_mask + // CHECK-LABEL: test_mm_cmplt_epu16_mask // CHECK: icmp ult <8 x i16> %{{.*}}, %{{.*}} return (__mmask8)_mm_cmplt_epu16_mask(__a, __b); } __mmask8 test_mm_mask_cmplt_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) { - // CHECK-LABEL: @test_mm_mask_cmplt_epu16_mask + // CHECK-LABEL: test_mm_mask_cmplt_epu16_mask // CHECK: icmp ult <8 x i16> %{{.*}}, %{{.*}} // CHECK: and <8 x i1> %{{.*}}, %{{.*}} return (__mmask8)_mm_mask_cmplt_epu16_mask(__u, __a, __b); } __mmask32 test_mm256_cmplt_epi8_mask(__m256i __a, __m256i __b) { - // CHECK-LABEL: @test_mm256_cmplt_epi8_mask + // CHECK-LABEL: test_mm256_cmplt_epi8_mask // CHECK: icmp slt <32 x i8> %{{.*}}, %{{.*}} return (__mmask32)_mm256_cmplt_epi8_mask(__a, __b); } __mmask32 test_mm256_mask_cmplt_epi8_mask(__mmask32 __u, __m256i __a, 
__m256i __b) { - // CHECK-LABEL: @test_mm256_mask_cmplt_epi8_mask + // CHECK-LABEL: test_mm256_mask_cmplt_epi8_mask // CHECK: icmp slt <32 x i8> %{{.*}}, %{{.*}} // CHECK: and <32 x i1> %{{.*}}, %{{.*}} return (__mmask32)_mm256_mask_cmplt_epi8_mask(__u, __a, __b); } __mmask32 test_mm256_cmplt_epu8_mask(__m256i __a, __m256i __b) { - // CHECK-LABEL: @test_mm256_cmplt_epu8_mask + // CHECK-LABEL: test_mm256_cmplt_epu8_mask // CHECK: icmp ult <32 x i8> %{{.*}}, %{{.*}} return (__mmask32)_mm256_cmplt_epu8_mask(__a, __b); } __mmask32 test_mm256_mask_cmplt_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) { - // CHECK-LABEL: @test_mm256_mask_cmplt_epu8_mask + // CHECK-LABEL: test_mm256_mask_cmplt_epu8_mask // CHECK: icmp ult <32 x i8> %{{.*}}, %{{.*}} // CHECK: and <32 x i1> %{{.*}}, %{{.*}} return (__mmask32)_mm256_mask_cmplt_epu8_mask(__u, __a, __b); } __mmask16 test_mm256_cmplt_epi16_mask(__m256i __a, __m256i __b) { - // CHECK-LABEL: @test_mm256_cmplt_epi16_mask + // CHECK-LABEL: test_mm256_cmplt_epi16_mask // CHECK: icmp slt <16 x i16> %{{.*}}, %{{.*}} return (__mmask16)_mm256_cmplt_epi16_mask(__a, __b); } __mmask16 test_mm256_mask_cmplt_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) { - // CHECK-LABEL: @test_mm256_mask_cmplt_epi16_mask + // CHECK-LABEL: test_mm256_mask_cmplt_epi16_mask // CHECK: icmp slt <16 x i16> %{{.*}}, %{{.*}} // CHECK: and <16 x i1> %{{.*}}, %{{.*}} return (__mmask16)_mm256_mask_cmplt_epi16_mask(__u, __a, __b); } __mmask16 test_mm256_cmplt_epu16_mask(__m256i __a, __m256i __b) { - // CHECK-LABEL: @test_mm256_cmplt_epu16_mask + // CHECK-LABEL: test_mm256_cmplt_epu16_mask // CHECK: icmp ult <16 x i16> %{{.*}}, %{{.*}} return (__mmask16)_mm256_cmplt_epu16_mask(__a, __b); } __mmask16 test_mm256_mask_cmplt_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) { - // CHECK-LABEL: @test_mm256_mask_cmplt_epu16_mask + // CHECK-LABEL: test_mm256_mask_cmplt_epu16_mask // CHECK: icmp ult <16 x i16> %{{.*}}, %{{.*}} // CHECK: and <16 x i1> %{{.*}}, 
%{{.*}} return (__mmask16)_mm256_mask_cmplt_epu16_mask(__u, __a, __b); } __mmask16 test_mm_cmpneq_epi8_mask(__m128i __a, __m128i __b) { - // CHECK-LABEL: @test_mm_cmpneq_epi8_mask + // CHECK-LABEL: test_mm_cmpneq_epi8_mask // CHECK: icmp ne <16 x i8> %{{.*}}, %{{.*}} return (__mmask16)_mm_cmpneq_epi8_mask(__a, __b); } __mmask16 test_mm_mask_cmpneq_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) { - // CHECK-LABEL: @test_mm_mask_cmpneq_epi8_mask + // CHECK-LABEL: test_mm_mask_cmpneq_epi8_mask // CHECK: icmp ne <16 x i8> %{{.*}}, %{{.*}} // CHECK: and <16 x i1> %{{.*}}, %{{.*}} return (__mmask16)_mm_mask_cmpneq_epi8_mask(__u, __a, __b); } __mmask16 test_mm_cmpneq_epu8_mask(__m128i __a, __m128i __b) { - // CHECK-LABEL: @test_mm_cmpneq_epu8_mask + // CHECK-LABEL: test_mm_cmpneq_epu8_mask // CHECK: icmp ne <16 x i8> %{{.*}}, %{{.*}} return (__mmask16)_mm_cmpneq_epu8_mask(__a, __b); } __mmask16 test_mm_mask_cmpneq_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) { - // CHECK-LABEL: @test_mm_mask_cmpneq_epu8_mask + // CHECK-LABEL: test_mm_mask_cmpneq_epu8_mask // CHECK: icmp ne <16 x i8> %{{.*}}, %{{.*}} // CHECK: and <16 x i1> %{{.*}}, %{{.*}} return (__mmask16)_mm_mask_cmpneq_epu8_mask(__u, __a, __b); } __mmask8 test_mm_cmpneq_epi16_mask(__m128i __a, __m128i __b) { - // CHECK-LABEL: @test_mm_cmpneq_epi16_mask + // CHECK-LABEL: test_mm_cmpneq_epi16_mask // CHECK: icmp ne <8 x i16> %{{.*}}, %{{.*}} return (__mmask8)_mm_cmpneq_epi16_mask(__a, __b); } __mmask8 test_mm_mask_cmpneq_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) { - // CHECK-LABEL: @test_mm_mask_cmpneq_epi16_mask + // CHECK-LABEL: test_mm_mask_cmpneq_epi16_mask // CHECK: icmp ne <8 x i16> %{{.*}}, %{{.*}} // CHECK: and <8 x i1> %{{.*}}, %{{.*}} return (__mmask8)_mm_mask_cmpneq_epi16_mask(__u, __a, __b); } __mmask8 test_mm_cmpneq_epu16_mask(__m128i __a, __m128i __b) { - // CHECK-LABEL: @test_mm_cmpneq_epu16_mask + // CHECK-LABEL: test_mm_cmpneq_epu16_mask // CHECK: icmp ne <8 x i16> %{{.*}}, %{{.*}} 
return (__mmask8)_mm_cmpneq_epu16_mask(__a, __b); } __mmask8 test_mm_mask_cmpneq_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) { - // CHECK-LABEL: @test_mm_mask_cmpneq_epu16_mask + // CHECK-LABEL: test_mm_mask_cmpneq_epu16_mask // CHECK: icmp ne <8 x i16> %{{.*}}, %{{.*}} // CHECK: and <8 x i1> %{{.*}}, %{{.*}} return (__mmask8)_mm_mask_cmpneq_epu16_mask(__u, __a, __b); } __mmask32 test_mm256_cmpneq_epi8_mask(__m256i __a, __m256i __b) { - // CHECK-LABEL: @test_mm256_cmpneq_epi8_mask + // CHECK-LABEL: test_mm256_cmpneq_epi8_mask // CHECK: icmp ne <32 x i8> %{{.*}}, %{{.*}} return (__mmask32)_mm256_cmpneq_epi8_mask(__a, __b); } __mmask32 test_mm256_mask_cmpneq_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) { - // CHECK-LABEL: @test_mm256_mask_cmpneq_epi8_mask + // CHECK-LABEL: test_mm256_mask_cmpneq_epi8_mask // CHECK: icmp ne <32 x i8> %{{.*}}, %{{.*}} // CHECK: and <32 x i1> %{{.*}}, %{{.*}} return (__mmask32)_mm256_mask_cmpneq_epi8_mask(__u, __a, __b); } __mmask32 test_mm256_cmpneq_epu8_mask(__m256i __a, __m256i __b) { - // CHECK-LABEL: @test_mm256_cmpneq_epu8_mask + // CHECK-LABEL: test_mm256_cmpneq_epu8_mask // CHECK: icmp ne <32 x i8> %{{.*}}, %{{.*}} return (__mmask32)_mm256_cmpneq_epu8_mask(__a, __b); } __mmask32 test_mm256_mask_cmpneq_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) { - // CHECK-LABEL: @test_mm256_mask_cmpneq_epu8_mask + // CHECK-LABEL: test_mm256_mask_cmpneq_epu8_mask // CHECK: icmp ne <32 x i8> %{{.*}}, %{{.*}} // CHECK: and <32 x i1> %{{.*}}, %{{.*}} return (__mmask32)_mm256_mask_cmpneq_epu8_mask(__u, __a, __b); } __mmask16 test_mm256_cmpneq_epi16_mask(__m256i __a, __m256i __b) { - // CHECK-LABEL: @test_mm256_cmpneq_epi16_mask + // CHECK-LABEL: test_mm256_cmpneq_epi16_mask // CHECK: icmp ne <16 x i16> %{{.*}}, %{{.*}} return (__mmask16)_mm256_cmpneq_epi16_mask(__a, __b); } __mmask16 test_mm256_mask_cmpneq_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) { - // CHECK-LABEL: @test_mm256_mask_cmpneq_epi16_mask + // CHECK-LABEL: 
test_mm256_mask_cmpneq_epi16_mask // CHECK: icmp ne <16 x i16> %{{.*}}, %{{.*}} // CHECK: and <16 x i1> %{{.*}}, %{{.*}} return (__mmask16)_mm256_mask_cmpneq_epi16_mask(__u, __a, __b); } __mmask16 test_mm256_cmpneq_epu16_mask(__m256i __a, __m256i __b) { - // CHECK-LABEL: @test_mm256_cmpneq_epu16_mask + // CHECK-LABEL: test_mm256_cmpneq_epu16_mask // CHECK: icmp ne <16 x i16> %{{.*}}, %{{.*}} return (__mmask16)_mm256_cmpneq_epu16_mask(__a, __b); } __mmask16 test_mm256_mask_cmpneq_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) { - // CHECK-LABEL: @test_mm256_mask_cmpneq_epu16_mask + // CHECK-LABEL: test_mm256_mask_cmpneq_epu16_mask // CHECK: icmp ne <16 x i16> %{{.*}}, %{{.*}} // CHECK: and <16 x i1> %{{.*}}, %{{.*}} return (__mmask16)_mm256_mask_cmpneq_epu16_mask(__u, __a, __b); } __mmask16 test_mm_cmp_epi8_mask(__m128i __a, __m128i __b) { - // CHECK-LABEL: @test_mm_cmp_epi8_mask + // CHECK-LABEL: test_mm_cmp_epi8_mask // CHECK: icmp eq <16 x i8> %{{.*}}, %{{.*}} return (__mmask16)_mm_cmp_epi8_mask(__a, __b, 0); } __mmask16 test_mm_mask_cmp_epi8_mask(__mmask16 __u, __m128i __a, __m128i __b) { - // CHECK-LABEL: @test_mm_mask_cmp_epi8_mask + // CHECK-LABEL: test_mm_mask_cmp_epi8_mask // CHECK: icmp eq <16 x i8> %{{.*}}, %{{.*}} // CHECK: and <16 x i1> %{{.*}}, %{{.*}} return (__mmask16)_mm_mask_cmp_epi8_mask(__u, __a, __b, 0); } __mmask16 test_mm_cmp_epu8_mask(__m128i __a, __m128i __b) { - // CHECK-LABEL: @test_mm_cmp_epu8_mask + // CHECK-LABEL: test_mm_cmp_epu8_mask // CHECK: icmp eq <16 x i8> %{{.*}}, %{{.*}} return (__mmask16)_mm_cmp_epu8_mask(__a, __b, 0); } __mmask16 test_mm_mask_cmp_epu8_mask(__mmask16 __u, __m128i __a, __m128i __b) { - // CHECK-LABEL: @test_mm_mask_cmp_epu8_mask + // CHECK-LABEL: test_mm_mask_cmp_epu8_mask // CHECK: icmp eq <16 x i8> %{{.*}}, %{{.*}} // CHECK: and <16 x i1> %{{.*}}, %{{.*}} return (__mmask16)_mm_mask_cmp_epu8_mask(__u, __a, __b, 0); } __mmask8 test_mm_cmp_epi16_mask(__m128i __a, __m128i __b) { - // CHECK-LABEL: 
@test_mm_cmp_epi16_mask + // CHECK-LABEL: test_mm_cmp_epi16_mask // CHECK: icmp eq <8 x i16> %{{.*}}, %{{.*}} return (__mmask8)_mm_cmp_epi16_mask(__a, __b, 0); } __mmask8 test_mm_mask_cmp_epi16_mask(__mmask8 __u, __m128i __a, __m128i __b) { - // CHECK-LABEL: @test_mm_mask_cmp_epi16_mask + // CHECK-LABEL: test_mm_mask_cmp_epi16_mask // CHECK: icmp eq <8 x i16> %{{.*}}, %{{.*}} // CHECK: and <8 x i1> %{{.*}}, %{{.*}} return (__mmask8)_mm_mask_cmp_epi16_mask(__u, __a, __b, 0); } __mmask8 test_mm_cmp_epu16_mask(__m128i __a, __m128i __b) { - // CHECK-LABEL: @test_mm_cmp_epu16_mask + // CHECK-LABEL: test_mm_cmp_epu16_mask // CHECK: icmp eq <8 x i16> %{{.*}}, %{{.*}} return (__mmask8)_mm_cmp_epu16_mask(__a, __b, 0); } __mmask8 test_mm_mask_cmp_epu16_mask(__mmask8 __u, __m128i __a, __m128i __b) { - // CHECK-LABEL: @test_mm_mask_cmp_epu16_mask + // CHECK-LABEL: test_mm_mask_cmp_epu16_mask // CHECK: icmp eq <8 x i16> %{{.*}}, %{{.*}} // CHECK: and <8 x i1> %{{.*}}, %{{.*}} return (__mmask8)_mm_mask_cmp_epu16_mask(__u, __a, __b, 0); } __mmask32 test_mm256_cmp_epi8_mask(__m256i __a, __m256i __b) { - // CHECK-LABEL: @test_mm256_cmp_epi8_mask + // CHECK-LABEL: test_mm256_cmp_epi8_mask // CHECK: icmp eq <32 x i8> %{{.*}}, %{{.*}} return (__mmask32)_mm256_cmp_epi8_mask(__a, __b, 0); } __mmask32 test_mm256_mask_cmp_epi8_mask(__mmask32 __u, __m256i __a, __m256i __b) { - // CHECK-LABEL: @test_mm256_mask_cmp_epi8_mask + // CHECK-LABEL: test_mm256_mask_cmp_epi8_mask // CHECK: icmp eq <32 x i8> %{{.*}}, %{{.*}} // CHECK: and <32 x i1> %{{.*}}, %{{.*}} return (__mmask32)_mm256_mask_cmp_epi8_mask(__u, __a, __b, 0); } __mmask32 test_mm256_cmp_epu8_mask(__m256i __a, __m256i __b) { - // CHECK-LABEL: @test_mm256_cmp_epu8_mask + // CHECK-LABEL: test_mm256_cmp_epu8_mask // CHECK: icmp eq <32 x i8> %{{.*}}, %{{.*}} return (__mmask32)_mm256_cmp_epu8_mask(__a, __b, 0); } __mmask32 test_mm256_mask_cmp_epu8_mask(__mmask32 __u, __m256i __a, __m256i __b) { - // CHECK-LABEL: 
@test_mm256_mask_cmp_epu8_mask + // CHECK-LABEL: test_mm256_mask_cmp_epu8_mask // CHECK: icmp eq <32 x i8> %{{.*}}, %{{.*}} // CHECK: and <32 x i1> %{{.*}}, %{{.*}} return (__mmask32)_mm256_mask_cmp_epu8_mask(__u, __a, __b, 0); } __mmask16 test_mm256_cmp_epi16_mask(__m256i __a, __m256i __b) { - // CHECK-LABEL: @test_mm256_cmp_epi16_mask + // CHECK-LABEL: test_mm256_cmp_epi16_mask // CHECK: icmp eq <16 x i16> %{{.*}}, %{{.*}} return (__mmask16)_mm256_cmp_epi16_mask(__a, __b, 0); } __mmask16 test_mm256_mask_cmp_epi16_mask(__mmask16 __u, __m256i __a, __m256i __b) { - // CHECK-LABEL: @test_mm256_mask_cmp_epi16_mask + // CHECK-LABEL: test_mm256_mask_cmp_epi16_mask // CHECK: icmp eq <16 x i16> %{{.*}}, %{{.*}} // CHECK: and <16 x i1> %{{.*}}, %{{.*}} return (__mmask16)_mm256_mask_cmp_epi16_mask(__u, __a, __b, 0); } __mmask16 test_mm256_cmp_epu16_mask(__m256i __a, __m256i __b) { - // CHECK-LABEL: @test_mm256_cmp_epu16_mask + // CHECK-LABEL: test_mm256_cmp_epu16_mask // CHECK: icmp eq <16 x i16> %{{.*}}, %{{.*}} return (__mmask16)_mm256_cmp_epu16_mask(__a, __b, 0); } __mmask16 test_mm256_mask_cmp_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b) { - // CHECK-LABEL: @test_mm256_mask_cmp_epu16_mask + // CHECK-LABEL: test_mm256_mask_cmp_epu16_mask // CHECK: icmp eq <16 x i16> %{{.*}}, %{{.*}} // CHECK: and <16 x i1> %{{.*}}, %{{.*}} return (__mmask16)_mm256_mask_cmp_epu16_mask(__u, __a, __b, 0); @@ -734,139 +737,139 @@ __mmask16 test_mm256_mask_cmp_epu16_mask(__mmask16 __u, __m256i __a, __m256i __b __m256i test_mm256_mask_add_epi8 (__m256i __W, __mmask32 __U, __m256i __A, __m256i __B){ - //CHECK-LABEL: @test_mm256_mask_add_epi8 + //CHECK-LABEL: test_mm256_mask_add_epi8 //CHECK: add <32 x i8> %{{.*}}, %{{.*}} //CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_mask_add_epi8(__W, __U , __A, __B); } __m256i test_mm256_maskz_add_epi8 (__mmask32 __U, __m256i __A, __m256i __B) { - //CHECK-LABEL: @test_mm256_maskz_add_epi8 + //CHECK-LABEL: 
test_mm256_maskz_add_epi8 //CHECK: add <32 x i8> %{{.*}}, %{{.*}} //CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_maskz_add_epi8(__U , __A, __B); } __m256i test_mm256_mask_add_epi16 (__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { - //CHECK-LABEL: @test_mm256_mask_add_epi16 + //CHECK-LABEL: test_mm256_mask_add_epi16 //CHECK: add <16 x i16> %{{.*}}, %{{.*}} //CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_add_epi16(__W, __U , __A, __B); } __m256i test_mm256_maskz_add_epi16 (__mmask16 __U, __m256i __A, __m256i __B) { - //CHECK-LABEL: @test_mm256_maskz_add_epi16 + //CHECK-LABEL: test_mm256_maskz_add_epi16 //CHECK: add <16 x i16> %{{.*}}, %{{.*}} //CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_add_epi16(__U , __A, __B); } __m256i test_mm256_mask_sub_epi8 (__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { - //CHECK-LABEL: @test_mm256_mask_sub_epi8 + //CHECK-LABEL: test_mm256_mask_sub_epi8 //CHECK: sub <32 x i8> %{{.*}}, %{{.*}} //CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_mask_sub_epi8(__W, __U , __A, __B); } __m256i test_mm256_maskz_sub_epi8 (__mmask32 __U, __m256i __A, __m256i __B) { - //CHECK-LABEL: @test_mm256_maskz_sub_epi8 + //CHECK-LABEL: test_mm256_maskz_sub_epi8 //CHECK: sub <32 x i8> %{{.*}}, %{{.*}} //CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_maskz_sub_epi8(__U , __A, __B); } __m256i test_mm256_mask_sub_epi16 (__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { - //CHECK-LABEL: @test_mm256_mask_sub_epi16 + //CHECK-LABEL: test_mm256_mask_sub_epi16 //CHECK: sub <16 x i16> %{{.*}}, %{{.*}} //CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_sub_epi16(__W, __U , __A, __B); } __m256i test_mm256_maskz_sub_epi16 (__mmask16 __U, __m256i __A, __m256i __B) { - //CHECK-LABEL: @test_mm256_maskz_sub_epi16 + 
//CHECK-LABEL: test_mm256_maskz_sub_epi16 //CHECK: sub <16 x i16> %{{.*}}, %{{.*}} //CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_sub_epi16(__U , __A, __B); } __m128i test_mm_mask_add_epi8 (__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { - //CHECK-LABEL: @test_mm_mask_add_epi8 + //CHECK-LABEL: test_mm_mask_add_epi8 //CHECK: add <16 x i8> %{{.*}}, %{{.*}} //CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_mask_add_epi8(__W, __U , __A, __B); } __m128i test_mm_maskz_add_epi8 (__mmask16 __U, __m128i __A, __m128i __B) { - //CHECK-LABEL: @test_mm_maskz_add_epi8 + //CHECK-LABEL: test_mm_maskz_add_epi8 //CHECK: add <16 x i8> %{{.*}}, %{{.*}} //CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_maskz_add_epi8(__U , __A, __B); } __m128i test_mm_mask_add_epi16 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - //CHECK-LABEL: @test_mm_mask_add_epi16 + //CHECK-LABEL: test_mm_mask_add_epi16 //CHECK: add <8 x i16> %{{.*}}, %{{.*}} //CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_add_epi16(__W, __U , __A, __B); } __m128i test_mm_maskz_add_epi16 (__mmask8 __U, __m128i __A, __m128i __B) { - //CHECK-LABEL: @test_mm_maskz_add_epi16 + //CHECK-LABEL: test_mm_maskz_add_epi16 //CHECK: add <8 x i16> %{{.*}}, %{{.*}} //CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_add_epi16(__U , __A, __B); } __m128i test_mm_mask_sub_epi8 (__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { - //CHECK-LABEL: @test_mm_mask_sub_epi8 + //CHECK-LABEL: test_mm_mask_sub_epi8 //CHECK: sub <16 x i8> %{{.*}}, %{{.*}} //CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_mask_sub_epi8(__W, __U , __A, __B); } __m128i test_mm_maskz_sub_epi8 (__mmask16 __U, __m128i __A, __m128i __B) { - //CHECK-LABEL: @test_mm_maskz_sub_epi8 + //CHECK-LABEL: test_mm_maskz_sub_epi8 //CHECK: sub <16 x i8> 
%{{.*}}, %{{.*}} //CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_maskz_sub_epi8(__U , __A, __B); } __m128i test_mm_mask_sub_epi16 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - //CHECK-LABEL: @test_mm_mask_sub_epi16 + //CHECK-LABEL: test_mm_mask_sub_epi16 //CHECK: sub <8 x i16> %{{.*}}, %{{.*}} //CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_sub_epi16(__W, __U , __A, __B); } __m128i test_mm_maskz_sub_epi16 (__mmask8 __U, __m128i __A, __m128i __B) { - //CHECK-LABEL: @test_mm_maskz_sub_epi16 + //CHECK-LABEL: test_mm_maskz_sub_epi16 //CHECK: sub <8 x i16> %{{.*}}, %{{.*}} //CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_sub_epi16(__U , __A, __B); } __m256i test_mm256_mask_mullo_epi16 (__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { - //CHECK-LABEL: @test_mm256_mask_mullo_epi16 + //CHECK-LABEL: test_mm256_mask_mullo_epi16 //CHECK: mul <16 x i16> %{{.*}}, %{{.*}} //CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_mullo_epi16(__W, __U , __A, __B); } __m256i test_mm256_maskz_mullo_epi16 (__mmask16 __U, __m256i __A, __m256i __B) { - //CHECK-LABEL: @test_mm256_maskz_mullo_epi16 + //CHECK-LABEL: test_mm256_maskz_mullo_epi16 //CHECK: mul <16 x i16> %{{.*}}, %{{.*}} //CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_mullo_epi16(__U , __A, __B); } __m128i test_mm_mask_mullo_epi16 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - //CHECK-LABEL: @test_mm_mask_mullo_epi16 + //CHECK-LABEL: test_mm_mask_mullo_epi16 //CHECK: mul <8 x i16> %{{.*}}, %{{.*}} //CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_mullo_epi16(__W, __U , __A, __B); } __m128i test_mm_maskz_mullo_epi16 (__mmask8 __U, __m128i __A, __m128i __B) { - //CHECK-LABEL: @test_mm_maskz_mullo_epi16 + //CHECK-LABEL: test_mm_maskz_mullo_epi16 //CHECK: mul <8 x i16> 
%{{.*}}, %{{.*}} //CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_mullo_epi16(__U , __A, __B); @@ -874,30 +877,30 @@ __m128i test_mm_maskz_mullo_epi16 (__mmask8 __U, __m128i __A, __m128i __B) { __m128i test_mm_mask_blend_epi8(__mmask16 __U, __m128i __A, __m128i __W) { - // CHECK-LABEL: @test_mm_mask_blend_epi8 + // CHECK-LABEL: test_mm_mask_blend_epi8 // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_mask_blend_epi8(__U,__A,__W); } __m256i test_mm256_mask_blend_epi8(__mmask32 __U, __m256i __A, __m256i __W) { - // CHECK-LABEL: @test_mm256_mask_blend_epi8 + // CHECK-LABEL: test_mm256_mask_blend_epi8 // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_mask_blend_epi8(__U,__A,__W); } __m128i test_mm_mask_blend_epi16(__mmask8 __U, __m128i __A, __m128i __W) { - // CHECK-LABEL: @test_mm_mask_blend_epi16 + // CHECK-LABEL: test_mm_mask_blend_epi16 // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_blend_epi16(__U,__A,__W); } __m256i test_mm256_mask_blend_epi16(__mmask16 __U, __m256i __A, __m256i __W) { - // CHECK-LABEL: @test_mm256_mask_blend_epi16 + // CHECK-LABEL: test_mm256_mask_blend_epi16 // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_blend_epi16(__U,__A,__W); } __m128i test_mm_mask_abs_epi8(__m128i __W, __mmask16 __U, __m128i __A) { - // CHECK-LABEL: @test_mm_mask_abs_epi8 + // CHECK-LABEL: test_mm_mask_abs_epi8 // CHECK: [[ABS:%.*]] = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %{{.*}}, i1 false) // CHECK: [[TMP:%.*]] = bitcast <16 x i8> [[ABS]] to <2 x i64> // CHECK: [[ABS:%.*]] = bitcast <2 x i64> [[TMP]] to <16 x i8> @@ -906,7 +909,7 @@ __m128i test_mm_mask_abs_epi8(__m128i __W, __mmask16 __U, __m128i __A) { } __m128i test_mm_maskz_abs_epi8(__mmask16 __U, __m128i __A) { - // CHECK-LABEL: @test_mm_maskz_abs_epi8 + // CHECK-LABEL: test_mm_maskz_abs_epi8 // CHECK: [[ABS:%.*]] = 
call <16 x i8> @llvm.abs.v16i8(<16 x i8> %{{.*}}, i1 false) // CHECK: [[TMP:%.*]] = bitcast <16 x i8> [[ABS]] to <2 x i64> // CHECK: [[ABS:%.*]] = bitcast <2 x i64> [[TMP]] to <16 x i8> @@ -915,7 +918,7 @@ __m128i test_mm_maskz_abs_epi8(__mmask16 __U, __m128i __A) { } __m256i test_mm256_mask_abs_epi8(__m256i __W, __mmask32 __U, __m256i __A) { - // CHECK-LABEL: @test_mm256_mask_abs_epi8 + // CHECK-LABEL: test_mm256_mask_abs_epi8 // CHECK: [[ABS:%.*]] = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %{{.*}}, i1 false) // CHECK: [[TMP:%.*]] = bitcast <32 x i8> [[ABS]] to <4 x i64> // CHECK: [[ABS:%.*]] = bitcast <4 x i64> [[TMP]] to <32 x i8> @@ -924,7 +927,7 @@ __m256i test_mm256_mask_abs_epi8(__m256i __W, __mmask32 __U, __m256i __A) { } __m256i test_mm256_maskz_abs_epi8(__mmask32 __U, __m256i __A) { - // CHECK-LABEL: @test_mm256_maskz_abs_epi8 + // CHECK-LABEL: test_mm256_maskz_abs_epi8 // CHECK: [[ABS:%.*]] = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %{{.*}}, i1 false) // CHECK: [[TMP:%.*]] = bitcast <32 x i8> [[ABS]] to <4 x i64> // CHECK: [[ABS:%.*]] = bitcast <4 x i64> [[TMP]] to <32 x i8> @@ -933,7 +936,7 @@ __m256i test_mm256_maskz_abs_epi8(__mmask32 __U, __m256i __A) { } __m128i test_mm_mask_abs_epi16(__m128i __W, __mmask8 __U, __m128i __A) { - // CHECK-LABEL: @test_mm_mask_abs_epi16 + // CHECK-LABEL: test_mm_mask_abs_epi16 // CHECK: [[ABS:%.*]] = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %{{.*}}, i1 false) // CHECK: [[TMP:%.*]] = bitcast <8 x i16> [[ABS]] to <2 x i64> // CHECK: [[ABS:%.*]] = bitcast <2 x i64> [[TMP]] to <8 x i16> @@ -942,7 +945,7 @@ __m128i test_mm_mask_abs_epi16(__m128i __W, __mmask8 __U, __m128i __A) { } __m128i test_mm_maskz_abs_epi16(__mmask8 __U, __m128i __A) { - // CHECK-LABEL: @test_mm_maskz_abs_epi16 + // CHECK-LABEL: test_mm_maskz_abs_epi16 // CHECK: [[ABS:%.*]] = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %{{.*}}, i1 false) // CHECK: [[TMP:%.*]] = bitcast <8 x i16> [[ABS]] to <2 x i64> // CHECK: [[ABS:%.*]] = bitcast <2 x i64> [[TMP]] to <8 
x i16> @@ -951,7 +954,7 @@ __m128i test_mm_maskz_abs_epi16(__mmask8 __U, __m128i __A) { } __m256i test_mm256_mask_abs_epi16(__m256i __W, __mmask16 __U, __m256i __A) { - // CHECK-LABEL: @test_mm256_mask_abs_epi16 + // CHECK-LABEL: test_mm256_mask_abs_epi16 // CHECK: [[ABS:%.*]] = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %{{.*}}, i1 false) // CHECK: [[TMP:%.*]] = bitcast <16 x i16> [[ABS]] to <4 x i64> // CHECK: [[ABS:%.*]] = bitcast <4 x i64> [[TMP]] to <16 x i16> @@ -960,7 +963,7 @@ __m256i test_mm256_mask_abs_epi16(__m256i __W, __mmask16 __U, __m256i __A) { } __m256i test_mm256_maskz_abs_epi16(__mmask16 __U, __m256i __A) { - // CHECK-LABEL: @test_mm256_maskz_abs_epi16 + // CHECK-LABEL: test_mm256_maskz_abs_epi16 // CHECK: [[ABS:%.*]] = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %{{.*}}, i1 false) // CHECK: [[TMP:%.*]] = bitcast <16 x i16> [[ABS]] to <4 x i64> // CHECK: [[ABS:%.*]] = bitcast <4 x i64> [[TMP]] to <16 x i16> @@ -969,264 +972,264 @@ __m256i test_mm256_maskz_abs_epi16(__mmask16 __U, __m256i __A) { } __m128i test_mm_maskz_packs_epi32(__mmask8 __M, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_maskz_packs_epi32 + // CHECK-LABEL: test_mm_maskz_packs_epi32 // CHECK: @llvm.x86.sse2.packssdw // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_packs_epi32(__M,__A,__B); } __m128i test_mm_mask_packs_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_mask_packs_epi32 + // CHECK-LABEL: test_mm_mask_packs_epi32 // CHECK: @llvm.x86.sse2.packssdw // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_packs_epi32(__W,__M,__A,__B); } __m256i test_mm256_maskz_packs_epi32(__mmask16 __M, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_maskz_packs_epi32 + // CHECK-LABEL: test_mm256_maskz_packs_epi32 // CHECK: @llvm.x86.avx2.packssdw // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return 
_mm256_maskz_packs_epi32(__M,__A,__B); } __m256i test_mm256_mask_packs_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_mask_packs_epi32 + // CHECK-LABEL: test_mm256_mask_packs_epi32 // CHECK: @llvm.x86.avx2.packssdw // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_packs_epi32(__W,__M,__A,__B); } __m128i test_mm_maskz_packs_epi16(__mmask16 __M, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_maskz_packs_epi16 + // CHECK-LABEL: test_mm_maskz_packs_epi16 // CHECK: @llvm.x86.sse2.packsswb // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_maskz_packs_epi16(__M,__A,__B); } __m128i test_mm_mask_packs_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_mask_packs_epi16 + // CHECK-LABEL: test_mm_mask_packs_epi16 // CHECK: @llvm.x86.sse2.packsswb // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_mask_packs_epi16(__W,__M,__A,__B); } __m256i test_mm256_maskz_packs_epi16(__mmask32 __M, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_maskz_packs_epi16 + // CHECK-LABEL: test_mm256_maskz_packs_epi16 // CHECK: @llvm.x86.avx2.packsswb // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_maskz_packs_epi16(__M,__A,__B); } __m256i test_mm256_mask_packs_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_mask_packs_epi16 + // CHECK-LABEL: test_mm256_mask_packs_epi16 // CHECK: @llvm.x86.avx2.packsswb // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_mask_packs_epi16(__W,__M,__A,__B); } __m128i test_mm_mask_packus_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_mask_packus_epi32 + // CHECK-LABEL: test_mm_mask_packus_epi32 // CHECK: @llvm.x86.sse41.packusdw // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return 
_mm_mask_packus_epi32(__W,__M,__A,__B); } __m128i test_mm_maskz_packus_epi32(__mmask8 __M, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_maskz_packus_epi32 + // CHECK-LABEL: test_mm_maskz_packus_epi32 // CHECK: @llvm.x86.sse41.packusdw // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_packus_epi32(__M,__A,__B); } __m256i test_mm256_maskz_packus_epi32(__mmask16 __M, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_maskz_packus_epi32 + // CHECK-LABEL: test_mm256_maskz_packus_epi32 // CHECK: @llvm.x86.avx2.packusdw // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_packus_epi32(__M,__A,__B); } __m256i test_mm256_mask_packus_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_mask_packus_epi32 + // CHECK-LABEL: test_mm256_mask_packus_epi32 // CHECK: @llvm.x86.avx2.packusdw // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_packus_epi32(__W,__M,__A,__B); } __m128i test_mm_maskz_packus_epi16(__mmask16 __M, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_maskz_packus_epi16 + // CHECK-LABEL: test_mm_maskz_packus_epi16 // CHECK: @llvm.x86.sse2.packuswb // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_maskz_packus_epi16(__M,__A,__B); } __m128i test_mm_mask_packus_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_mask_packus_epi16 + // CHECK-LABEL: test_mm_mask_packus_epi16 // CHECK: @llvm.x86.sse2.packuswb // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_mask_packus_epi16(__W,__M,__A,__B); } __m256i test_mm256_maskz_packus_epi16(__mmask32 __M, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_maskz_packus_epi16 + // CHECK-LABEL: test_mm256_maskz_packus_epi16 // CHECK: @llvm.x86.avx2.packuswb // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return 
_mm256_maskz_packus_epi16(__M,__A,__B); } __m256i test_mm256_mask_packus_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_mask_packus_epi16 + // CHECK-LABEL: test_mm256_mask_packus_epi16 // CHECK: @llvm.x86.avx2.packuswb // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_mask_packus_epi16(__W,__M,__A,__B); } __m128i test_mm_mask_adds_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_mask_adds_epi8 + // CHECK-LABEL: test_mm_mask_adds_epi8 // CHECK: @llvm.sadd.sat.v16i8 // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_mask_adds_epi8(__W,__U,__A,__B); } __m128i test_mm_maskz_adds_epi8(__mmask16 __U, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_maskz_adds_epi8 + // CHECK-LABEL: test_mm_maskz_adds_epi8 // CHECK: @llvm.sadd.sat.v16i8 // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_maskz_adds_epi8(__U,__A,__B); } __m256i test_mm256_mask_adds_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_mask_adds_epi8 + // CHECK-LABEL: test_mm256_mask_adds_epi8 // CHECK: @llvm.sadd.sat.v32i8 // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_mask_adds_epi8(__W,__U,__A,__B); } __m256i test_mm256_maskz_adds_epi8(__mmask32 __U, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_maskz_adds_epi8 + // CHECK-LABEL: test_mm256_maskz_adds_epi8 // CHECK: @llvm.sadd.sat.v32i8 // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_maskz_adds_epi8(__U,__A,__B); } __m128i test_mm_mask_adds_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_mask_adds_epi16 + // CHECK-LABEL: test_mm_mask_adds_epi16 // CHECK: @llvm.sadd.sat.v8i16 // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_adds_epi16(__W,__U,__A,__B); } __m128i 
test_mm_maskz_adds_epi16(__mmask8 __U, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_maskz_adds_epi16 + // CHECK-LABEL: test_mm_maskz_adds_epi16 // CHECK: @llvm.sadd.sat.v8i16 // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_adds_epi16(__U,__A,__B); } __m256i test_mm256_mask_adds_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_mask_adds_epi16 + // CHECK-LABEL: test_mm256_mask_adds_epi16 // CHECK: @llvm.sadd.sat.v16i16 // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_adds_epi16(__W,__U,__A,__B); } __m256i test_mm256_maskz_adds_epi16(__mmask16 __U, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_maskz_adds_epi16 + // CHECK-LABEL: test_mm256_maskz_adds_epi16 // CHECK: @llvm.sadd.sat.v16i16 // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_adds_epi16(__U,__A,__B); } __m128i test_mm_mask_adds_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_mask_adds_epu8 + // CHECK-LABEL: test_mm_mask_adds_epu8 // CHECK-NOT: @llvm.x86.sse2.paddus.b // CHECK: call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_mask_adds_epu8(__W,__U,__A,__B); } __m128i test_mm_maskz_adds_epu8(__mmask16 __U, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_maskz_adds_epu8 + // CHECK-LABEL: test_mm_maskz_adds_epu8 // CHECK-NOT: @llvm.x86.sse2.paddus.b // CHECK: call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_maskz_adds_epu8(__U,__A,__B); } __m256i test_mm256_mask_adds_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_mask_adds_epu8 + // CHECK-LABEL: test_mm256_mask_adds_epu8 // CHECK-NOT: @llvm.x86.avx2.paddus.b // 
CHECK: call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> %{{.*}}, <32 x i8> %{{.*}}) // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_mask_adds_epu8(__W,__U,__A,__B); } __m256i test_mm256_maskz_adds_epu8(__mmask32 __U, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_maskz_adds_epu8 + // CHECK-LABEL: test_mm256_maskz_adds_epu8 // CHECK-NOT: @llvm.x86.avx2.paddus.b // CHECK: call <32 x i8> @llvm.uadd.sat.v32i8(<32 x i8> %{{.*}}, <32 x i8> %{{.*}}) // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_maskz_adds_epu8(__U,__A,__B); } __m128i test_mm_mask_adds_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_mask_adds_epu16 + // CHECK-LABEL: test_mm_mask_adds_epu16 // CHECK-NOT: @llvm.x86.sse2.paddus.w // CHECK: call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}) // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_adds_epu16(__W,__U,__A,__B); } __m128i test_mm_maskz_adds_epu16(__mmask8 __U, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_maskz_adds_epu16 + // CHECK-LABEL: test_mm_maskz_adds_epu16 // CHECK-NOT: @llvm.x86.sse2.paddus.w // CHECK: call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}) // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_adds_epu16(__U,__A,__B); } __m256i test_mm256_mask_adds_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_mask_adds_epu16 + // CHECK-LABEL: test_mm256_mask_adds_epu16 // CHECK-NOT: @llvm.x86.avx2.paddus.w // CHECK: call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}) // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_adds_epu16(__W,__U,__A,__B); } __m256i test_mm256_maskz_adds_epu16(__mmask16 __U, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_maskz_adds_epu16 + // 
CHECK-LABEL: test_mm256_maskz_adds_epu16 // CHECK-NOT: @llvm.x86.avx2.paddus.w // CHECK: call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}) // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_adds_epu16(__U,__A,__B); } __m128i test_mm_mask_avg_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_mask_avg_epu8 + // CHECK-LABEL: test_mm_mask_avg_epu8 // CHECK: @llvm.x86.sse2.pavg.b // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_mask_avg_epu8(__W,__U,__A,__B); } __m128i test_mm_maskz_avg_epu8(__mmask16 __U, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_maskz_avg_epu8 + // CHECK-LABEL: test_mm_maskz_avg_epu8 // CHECK: @llvm.x86.sse2.pavg.b // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_maskz_avg_epu8(__U,__A,__B); } __m256i test_mm256_mask_avg_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_mask_avg_epu8 + // CHECK-LABEL: test_mm256_mask_avg_epu8 // CHECK: @llvm.x86.avx2.pavg.b // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_mask_avg_epu8(__W,__U,__A,__B); } __m256i test_mm256_maskz_avg_epu8(__mmask32 __U, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_maskz_avg_epu8 + // CHECK-LABEL: test_mm256_maskz_avg_epu8 // CHECK: @llvm.x86.avx2.pavg.b // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_maskz_avg_epu8(__U,__A,__B); } __m128i test_mm_mask_avg_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_mask_avg_epu16 + // CHECK-LABEL: test_mm_mask_avg_epu16 // CHECK: @llvm.x86.sse2.pavg.w // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_avg_epu16(__W,__U,__A,__B); } __m128i test_mm_maskz_avg_epu16(__mmask8 __U, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_maskz_avg_epu16 + // 
CHECK-LABEL: test_mm_maskz_avg_epu16 // CHECK: @llvm.x86.sse2.pavg.w // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_avg_epu16(__U,__A,__B); } __m256i test_mm256_mask_avg_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_mask_avg_epu16 + // CHECK-LABEL: test_mm256_mask_avg_epu16 // CHECK: @llvm.x86.avx2.pavg.w // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_avg_epu16(__W,__U,__A,__B); } __m256i test_mm256_maskz_avg_epu16(__mmask16 __U, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_maskz_avg_epu16 + // CHECK-LABEL: test_mm256_maskz_avg_epu16 // CHECK: @llvm.x86.avx2.pavg.w // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_avg_epu16(__U,__A,__B); } __m128i test_mm_maskz_max_epi8(__mmask16 __M, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_maskz_max_epi8 + // CHECK-LABEL: test_mm_maskz_max_epi8 // CHECK: [[RES:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<16 x i8>]] [[RES]] to [[DSTTY:<2 x i64>]] // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to [[SRCTY]] @@ -1234,7 +1237,7 @@ __m128i test_mm_maskz_max_epi8(__mmask16 __M, __m128i __A, __m128i __B) { return _mm_maskz_max_epi8(__M,__A,__B); } __m128i test_mm_mask_max_epi8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_mask_max_epi8 + // CHECK-LABEL: test_mm_mask_max_epi8 // CHECK: [[RES:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<16 x i8>]] [[RES]] to [[DSTTY:<2 x i64>]] // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to [[SRCTY]] @@ -1242,7 +1245,7 @@ __m128i test_mm_mask_max_epi8(__m128i __W, __mmask16 __M, __m128i __A, __m128i _ return _mm_mask_max_epi8(__W,__M,__A,__B); } __m256i test_mm256_maskz_max_epi8(__mmask32 __M, __m256i __A, 
__m256i __B) { - // CHECK-LABEL: @test_mm256_maskz_max_epi8 + // CHECK-LABEL: test_mm256_maskz_max_epi8 // CHECK: [[RES:%.*]] = call <32 x i8> @llvm.smax.v32i8(<32 x i8> %{{.*}}, <32 x i8> %{{.*}}) // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<32 x i8>]] [[RES]] to [[DSTTY:<4 x i64>]] // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to [[SRCTY]] @@ -1250,7 +1253,7 @@ __m256i test_mm256_maskz_max_epi8(__mmask32 __M, __m256i __A, __m256i __B) { return _mm256_maskz_max_epi8(__M,__A,__B); } __m256i test_mm256_mask_max_epi8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_mask_max_epi8 + // CHECK-LABEL: test_mm256_mask_max_epi8 // CHECK: [[RES:%.*]] = call <32 x i8> @llvm.smax.v32i8(<32 x i8> %{{.*}}, <32 x i8> %{{.*}}) // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<32 x i8>]] [[RES]] to [[DSTTY:<4 x i64>]] // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to [[SRCTY]] @@ -1258,7 +1261,7 @@ __m256i test_mm256_mask_max_epi8(__m256i __W, __mmask32 __M, __m256i __A, __m256 return _mm256_mask_max_epi8(__W,__M,__A,__B); } __m128i test_mm_maskz_max_epi16(__mmask8 __M, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_maskz_max_epi16 + // CHECK-LABEL: test_mm_maskz_max_epi16 // CHECK: [[RES:%.*]] = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}) // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<8 x i16>]] [[RES]] to [[DSTTY:<2 x i64>]] // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to [[SRCTY]] @@ -1266,7 +1269,7 @@ __m128i test_mm_maskz_max_epi16(__mmask8 __M, __m128i __A, __m128i __B) { return _mm_maskz_max_epi16(__M,__A,__B); } __m128i test_mm_mask_max_epi16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_mask_max_epi16 + // CHECK-LABEL: test_mm_mask_max_epi16 // CHECK: [[RES:%.*]] = call <8 x i16> @llvm.smax.v8i16(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}) // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<8 x i16>]] [[RES]] to [[DSTTY:<2 x i64>]] // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to 
[[SRCTY]] @@ -1274,7 +1277,7 @@ __m128i test_mm_mask_max_epi16(__m128i __W, __mmask8 __M, __m128i __A, __m128i _ return _mm_mask_max_epi16(__W,__M,__A,__B); } __m256i test_mm256_maskz_max_epi16(__mmask16 __M, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_maskz_max_epi16 + // CHECK-LABEL: test_mm256_maskz_max_epi16 // CHECK: [[RES:%.*]] = call <16 x i16> @llvm.smax.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}) // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<16 x i16>]] [[RES]] to [[DSTTY:<4 x i64>]] // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to [[SRCTY]] @@ -1282,7 +1285,7 @@ __m256i test_mm256_maskz_max_epi16(__mmask16 __M, __m256i __A, __m256i __B) { return _mm256_maskz_max_epi16(__M,__A,__B); } __m256i test_mm256_mask_max_epi16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_mask_max_epi16 + // CHECK-LABEL: test_mm256_mask_max_epi16 // CHECK: [[RES:%.*]] = call <16 x i16> @llvm.smax.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}) // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<16 x i16>]] [[RES]] to [[DSTTY:<4 x i64>]] // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to [[SRCTY]] @@ -1290,7 +1293,7 @@ __m256i test_mm256_mask_max_epi16(__m256i __W, __mmask16 __M, __m256i __A, __m25 return _mm256_mask_max_epi16(__W,__M,__A,__B); } __m128i test_mm_maskz_max_epu8(__mmask16 __M, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_maskz_max_epu8 + // CHECK-LABEL: test_mm_maskz_max_epu8 // CHECK: [[RES:%.*]] = call <16 x i8> @llvm.umax.v16i8(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<16 x i8>]] [[RES]] to [[DSTTY:<2 x i64>]] // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to [[SRCTY]] @@ -1298,7 +1301,7 @@ __m128i test_mm_maskz_max_epu8(__mmask16 __M, __m128i __A, __m128i __B) { return _mm_maskz_max_epu8(__M,__A,__B); } __m128i test_mm_mask_max_epu8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_mask_max_epu8 + // CHECK-LABEL: test_mm_mask_max_epu8 // 
CHECK: [[RES:%.*]] = call <16 x i8> @llvm.umax.v16i8(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<16 x i8>]] [[RES]] to [[DSTTY:<2 x i64>]] // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to [[SRCTY]] @@ -1306,7 +1309,7 @@ __m128i test_mm_mask_max_epu8(__m128i __W, __mmask16 __M, __m128i __A, __m128i _ return _mm_mask_max_epu8(__W,__M,__A,__B); } __m256i test_mm256_maskz_max_epu8(__mmask32 __M, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_maskz_max_epu8 + // CHECK-LABEL: test_mm256_maskz_max_epu8 // CHECK: [[RES:%.*]] = call <32 x i8> @llvm.umax.v32i8(<32 x i8> %{{.*}}, <32 x i8> %{{.*}}) // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<32 x i8>]] [[RES]] to [[DSTTY:<4 x i64>]] // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to [[SRCTY]] @@ -1314,7 +1317,7 @@ __m256i test_mm256_maskz_max_epu8(__mmask32 __M, __m256i __A, __m256i __B) { return _mm256_maskz_max_epu8(__M,__A,__B); } __m256i test_mm256_mask_max_epu8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_mask_max_epu8 + // CHECK-LABEL: test_mm256_mask_max_epu8 // CHECK: [[RES:%.*]] = call <32 x i8> @llvm.umax.v32i8(<32 x i8> %{{.*}}, <32 x i8> %{{.*}}) // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<32 x i8>]] [[RES]] to [[DSTTY:<4 x i64>]] // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to [[SRCTY]] @@ -1322,7 +1325,7 @@ __m256i test_mm256_mask_max_epu8(__m256i __W, __mmask32 __M, __m256i __A, __m256 return _mm256_mask_max_epu8(__W,__M,__A,__B); } __m128i test_mm_maskz_max_epu16(__mmask8 __M, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_maskz_max_epu16 + // CHECK-LABEL: test_mm_maskz_max_epu16 // CHECK: [[RES:%.*]] = call <8 x i16> @llvm.umax.v8i16(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}) // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<8 x i16>]] [[RES]] to [[DSTTY:<2 x i64>]] // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to [[SRCTY]] @@ -1330,7 +1333,7 @@ __m128i test_mm_maskz_max_epu16(__mmask8 __M, __m128i __A, __m128i __B) { 
return _mm_maskz_max_epu16(__M,__A,__B); } __m128i test_mm_mask_max_epu16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_mask_max_epu16 + // CHECK-LABEL: test_mm_mask_max_epu16 // CHECK: [[RES:%.*]] = call <8 x i16> @llvm.umax.v8i16(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}) // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<8 x i16>]] [[RES]] to [[DSTTY:<2 x i64>]] // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to [[SRCTY]] @@ -1338,7 +1341,7 @@ __m128i test_mm_mask_max_epu16(__m128i __W, __mmask8 __M, __m128i __A, __m128i _ return _mm_mask_max_epu16(__W,__M,__A,__B); } __m256i test_mm256_maskz_max_epu16(__mmask16 __M, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_maskz_max_epu16 + // CHECK-LABEL: test_mm256_maskz_max_epu16 // CHECK: [[RES:%.*]] = call <16 x i16> @llvm.umax.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}) // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<16 x i16>]] [[RES]] to [[DSTTY:<4 x i64>]] // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to [[SRCTY]] @@ -1346,7 +1349,7 @@ __m256i test_mm256_maskz_max_epu16(__mmask16 __M, __m256i __A, __m256i __B) { return _mm256_maskz_max_epu16(__M,__A,__B); } __m256i test_mm256_mask_max_epu16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_mask_max_epu16 + // CHECK-LABEL: test_mm256_mask_max_epu16 // CHECK: [[RES:%.*]] = call <16 x i16> @llvm.umax.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}) // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<16 x i16>]] [[RES]] to [[DSTTY:<4 x i64>]] // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to [[SRCTY]] @@ -1354,7 +1357,7 @@ __m256i test_mm256_mask_max_epu16(__m256i __W, __mmask16 __M, __m256i __A, __m25 return _mm256_mask_max_epu16(__W,__M,__A,__B); } __m128i test_mm_maskz_min_epi8(__mmask16 __M, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_maskz_min_epi8 + // CHECK-LABEL: test_mm_maskz_min_epi8 // CHECK: [[RES:%.*]] = call <16 x i8> @llvm.smin.v16i8(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK: 
[[TMP:%.*]] = bitcast [[SRCTY:<16 x i8>]] [[RES]] to [[DSTTY:<2 x i64>]] // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to [[SRCTY]] @@ -1362,7 +1365,7 @@ __m128i test_mm_maskz_min_epi8(__mmask16 __M, __m128i __A, __m128i __B) { return _mm_maskz_min_epi8(__M,__A,__B); } __m128i test_mm_mask_min_epi8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_mask_min_epi8 + // CHECK-LABEL: test_mm_mask_min_epi8 // CHECK: [[RES:%.*]] = call <16 x i8> @llvm.smin.v16i8(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<16 x i8>]] [[RES]] to [[DSTTY:<2 x i64>]] // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to [[SRCTY]] @@ -1370,7 +1373,7 @@ __m128i test_mm_mask_min_epi8(__m128i __W, __mmask16 __M, __m128i __A, __m128i _ return _mm_mask_min_epi8(__W,__M,__A,__B); } __m256i test_mm256_maskz_min_epi8(__mmask32 __M, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_maskz_min_epi8 + // CHECK-LABEL: test_mm256_maskz_min_epi8 // CHECK: [[RES:%.*]] = call <32 x i8> @llvm.smin.v32i8(<32 x i8> %{{.*}}, <32 x i8> %{{.*}}) // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<32 x i8>]] [[RES]] to [[DSTTY:<4 x i64>]] // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to [[SRCTY]] @@ -1378,7 +1381,7 @@ __m256i test_mm256_maskz_min_epi8(__mmask32 __M, __m256i __A, __m256i __B) { return _mm256_maskz_min_epi8(__M,__A,__B); } __m256i test_mm256_mask_min_epi8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_mask_min_epi8 + // CHECK-LABEL: test_mm256_mask_min_epi8 // CHECK: [[RES:%.*]] = call <32 x i8> @llvm.smin.v32i8(<32 x i8> %{{.*}}, <32 x i8> %{{.*}}) // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<32 x i8>]] [[RES]] to [[DSTTY:<4 x i64>]] // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to [[SRCTY]] @@ -1386,7 +1389,7 @@ __m256i test_mm256_mask_min_epi8(__m256i __W, __mmask32 __M, __m256i __A, __m256 return _mm256_mask_min_epi8(__W,__M,__A,__B); } __m128i test_mm_maskz_min_epi16(__mmask8 __M, __m128i 
__A, __m128i __B) { - // CHECK-LABEL: @test_mm_maskz_min_epi16 + // CHECK-LABEL: test_mm_maskz_min_epi16 // CHECK: [[RES:%.*]] = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}) // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<8 x i16>]] [[RES]] to [[DSTTY:<2 x i64>]] // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to [[SRCTY]] @@ -1394,7 +1397,7 @@ __m128i test_mm_maskz_min_epi16(__mmask8 __M, __m128i __A, __m128i __B) { return _mm_maskz_min_epi16(__M,__A,__B); } __m128i test_mm_mask_min_epi16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_mask_min_epi16 + // CHECK-LABEL: test_mm_mask_min_epi16 // CHECK: [[RES:%.*]] = call <8 x i16> @llvm.smin.v8i16(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}) // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<8 x i16>]] [[RES]] to [[DSTTY:<2 x i64>]] // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to [[SRCTY]] @@ -1402,7 +1405,7 @@ __m128i test_mm_mask_min_epi16(__m128i __W, __mmask8 __M, __m128i __A, __m128i _ return _mm_mask_min_epi16(__W,__M,__A,__B); } __m256i test_mm256_maskz_min_epi16(__mmask16 __M, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_maskz_min_epi16 + // CHECK-LABEL: test_mm256_maskz_min_epi16 // CHECK: [[RES:%.*]] = call <16 x i16> @llvm.smin.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}) // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<16 x i16>]] [[RES]] to [[DSTTY:<4 x i64>]] // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to [[SRCTY]] @@ -1410,7 +1413,7 @@ __m256i test_mm256_maskz_min_epi16(__mmask16 __M, __m256i __A, __m256i __B) { return _mm256_maskz_min_epi16(__M,__A,__B); } __m256i test_mm256_mask_min_epi16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_mask_min_epi16 + // CHECK-LABEL: test_mm256_mask_min_epi16 // CHECK: [[RES:%.*]] = call <16 x i16> @llvm.smin.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}) // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<16 x i16>]] [[RES]] to [[DSTTY:<4 x i64>]] // CHECK: [[RES:%.*]] = bitcast 
[[DSTTY]] [[TMP]] to [[SRCTY]] @@ -1418,7 +1421,7 @@ __m256i test_mm256_mask_min_epi16(__m256i __W, __mmask16 __M, __m256i __A, __m25 return _mm256_mask_min_epi16(__W,__M,__A,__B); } __m128i test_mm_maskz_min_epu8(__mmask16 __M, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_maskz_min_epu8 + // CHECK-LABEL: test_mm_maskz_min_epu8 // CHECK: [[RES:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<16 x i8>]] [[RES]] to [[DSTTY:<2 x i64>]] // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to [[SRCTY]] @@ -1426,7 +1429,7 @@ __m128i test_mm_maskz_min_epu8(__mmask16 __M, __m128i __A, __m128i __B) { return _mm_maskz_min_epu8(__M,__A,__B); } __m128i test_mm_mask_min_epu8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_mask_min_epu8 + // CHECK-LABEL: test_mm_mask_min_epu8 // CHECK: [[RES:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<16 x i8>]] [[RES]] to [[DSTTY:<2 x i64>]] // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to [[SRCTY]] @@ -1434,7 +1437,7 @@ __m128i test_mm_mask_min_epu8(__m128i __W, __mmask16 __M, __m128i __A, __m128i _ return _mm_mask_min_epu8(__W,__M,__A,__B); } __m256i test_mm256_maskz_min_epu8(__mmask32 __M, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_maskz_min_epu8 + // CHECK-LABEL: test_mm256_maskz_min_epu8 // CHECK: [[RES:%.*]] = call <32 x i8> @llvm.umin.v32i8(<32 x i8> %{{.*}}, <32 x i8> %{{.*}}) // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<32 x i8>]] [[RES]] to [[DSTTY:<4 x i64>]] // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to [[SRCTY]] @@ -1442,7 +1445,7 @@ __m256i test_mm256_maskz_min_epu8(__mmask32 __M, __m256i __A, __m256i __B) { return _mm256_maskz_min_epu8(__M,__A,__B); } __m256i test_mm256_mask_min_epu8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_mask_min_epu8 + // CHECK-LABEL: test_mm256_mask_min_epu8 
// CHECK: [[RES:%.*]] = call <32 x i8> @llvm.umin.v32i8(<32 x i8> %{{.*}}, <32 x i8> %{{.*}}) // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<32 x i8>]] [[RES]] to [[DSTTY:<4 x i64>]] // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to [[SRCTY]] @@ -1450,7 +1453,7 @@ __m256i test_mm256_mask_min_epu8(__m256i __W, __mmask32 __M, __m256i __A, __m256 return _mm256_mask_min_epu8(__W,__M,__A,__B); } __m128i test_mm_maskz_min_epu16(__mmask8 __M, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_maskz_min_epu16 + // CHECK-LABEL: test_mm_maskz_min_epu16 // CHECK: [[RES:%.*]] = call <8 x i16> @llvm.umin.v8i16(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}) // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<8 x i16>]] [[RES]] to [[DSTTY:<2 x i64>]] // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to [[SRCTY]] @@ -1458,7 +1461,7 @@ __m128i test_mm_maskz_min_epu16(__mmask8 __M, __m128i __A, __m128i __B) { return _mm_maskz_min_epu16(__M,__A,__B); } __m128i test_mm_mask_min_epu16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_mask_min_epu16 + // CHECK-LABEL: test_mm_mask_min_epu16 // CHECK: [[RES:%.*]] = call <8 x i16> @llvm.umin.v8i16(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}) // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<8 x i16>]] [[RES]] to [[DSTTY:<2 x i64>]] // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to [[SRCTY]] @@ -1466,7 +1469,7 @@ __m128i test_mm_mask_min_epu16(__m128i __W, __mmask8 __M, __m128i __A, __m128i _ return _mm_mask_min_epu16(__W,__M,__A,__B); } __m256i test_mm256_maskz_min_epu16(__mmask16 __M, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_maskz_min_epu16 + // CHECK-LABEL: test_mm256_maskz_min_epu16 // CHECK: [[RES:%.*]] = call <16 x i16> @llvm.umin.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}) // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<16 x i16>]] [[RES]] to [[DSTTY:<4 x i64>]] // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to [[SRCTY]] @@ -1474,7 +1477,7 @@ __m256i test_mm256_maskz_min_epu16(__mmask16 __M, __m256i __A, __m256i __B) 
{ return _mm256_maskz_min_epu16(__M,__A,__B); } __m256i test_mm256_mask_min_epu16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_mask_min_epu16 + // CHECK-LABEL: test_mm256_mask_min_epu16 // CHECK: [[RES:%.*]] = call <16 x i16> @llvm.umin.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}) // CHECK: [[TMP:%.*]] = bitcast [[SRCTY:<16 x i16>]] [[RES]] to [[DSTTY:<4 x i64>]] // CHECK: [[RES:%.*]] = bitcast [[DSTTY]] [[TMP]] to [[SRCTY]] @@ -1482,128 +1485,128 @@ __m256i test_mm256_mask_min_epu16(__m256i __W, __mmask16 __M, __m256i __A, __m25 return _mm256_mask_min_epu16(__W,__M,__A,__B); } __m128i test_mm_mask_shuffle_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_mask_shuffle_epi8 + // CHECK-LABEL: test_mm_mask_shuffle_epi8 // CHECK: @llvm.x86.ssse3.pshuf.b // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_mask_shuffle_epi8(__W,__U,__A,__B); } __m128i test_mm_maskz_shuffle_epi8(__mmask16 __U, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_maskz_shuffle_epi8 + // CHECK-LABEL: test_mm_maskz_shuffle_epi8 // CHECK: @llvm.x86.ssse3.pshuf.b // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_maskz_shuffle_epi8(__U,__A,__B); } __m256i test_mm256_mask_shuffle_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_mask_shuffle_epi8 + // CHECK-LABEL: test_mm256_mask_shuffle_epi8 // CHECK: @llvm.x86.avx2.pshuf.b // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_mask_shuffle_epi8(__W,__U,__A,__B); } __m256i test_mm256_maskz_shuffle_epi8(__mmask32 __U, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_maskz_shuffle_epi8 + // CHECK-LABEL: test_mm256_maskz_shuffle_epi8 // CHECK: @llvm.x86.avx2.pshuf.b // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_maskz_shuffle_epi8(__U,__A,__B); } __m128i 
test_mm_mask_subs_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_mask_subs_epi8 + // CHECK-LABEL: test_mm_mask_subs_epi8 // CHECK: @llvm.ssub.sat.v16i8 // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_mask_subs_epi8(__W,__U,__A,__B); } __m128i test_mm_maskz_subs_epi8(__mmask16 __U, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_maskz_subs_epi8 + // CHECK-LABEL: test_mm_maskz_subs_epi8 // CHECK: @llvm.ssub.sat.v16i8 // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_maskz_subs_epi8(__U,__A,__B); } __m256i test_mm256_mask_subs_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_mask_subs_epi8 + // CHECK-LABEL: test_mm256_mask_subs_epi8 // CHECK: @llvm.ssub.sat.v32i8 // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_mask_subs_epi8(__W,__U,__A,__B); } __m256i test_mm256_maskz_subs_epi8(__mmask32 __U, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_maskz_subs_epi8 + // CHECK-LABEL: test_mm256_maskz_subs_epi8 // CHECK: @llvm.ssub.sat.v32i8 // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_maskz_subs_epi8(__U,__A,__B); } __m128i test_mm_mask_subs_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_mask_subs_epi16 + // CHECK-LABEL: test_mm_mask_subs_epi16 // CHECK: @llvm.ssub.sat.v8i16 // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_subs_epi16(__W,__U,__A,__B); } __m128i test_mm_maskz_subs_epi16(__mmask8 __U, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_maskz_subs_epi16 + // CHECK-LABEL: test_mm_maskz_subs_epi16 // CHECK: @llvm.ssub.sat.v8i16 // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_subs_epi16(__U,__A,__B); } __m256i test_mm256_mask_subs_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { - // 
CHECK-LABEL: @test_mm256_mask_subs_epi16 + // CHECK-LABEL: test_mm256_mask_subs_epi16 // CHECK: @llvm.ssub.sat.v16i16 // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_subs_epi16(__W,__U,__A,__B); } __m256i test_mm256_maskz_subs_epi16(__mmask16 __U, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_maskz_subs_epi16 + // CHECK-LABEL: test_mm256_maskz_subs_epi16 // CHECK: @llvm.ssub.sat.v16i16 // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_subs_epi16(__U,__A,__B); } __m128i test_mm_mask_subs_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_mask_subs_epu8 + // CHECK-LABEL: test_mm_mask_subs_epu8 // CHECK-NOT: @llvm.x86.sse2.psubus.b // CHECK: call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_mask_subs_epu8(__W,__U,__A,__B); } __m128i test_mm_maskz_subs_epu8(__mmask16 __U, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_maskz_subs_epu8 + // CHECK-LABEL: test_mm_maskz_subs_epu8 // CHECK-NOT: @llvm.x86.sse2.psubus.b // CHECK: call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_maskz_subs_epu8(__U,__A,__B); } __m256i test_mm256_mask_subs_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_mask_subs_epu8 + // CHECK-LABEL: test_mm256_mask_subs_epu8 // CHECK-NOT: @llvm.x86.avx2.psubus.b // CHECK: call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> %{{.*}}, <32 x i8> %{{.*}}) // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_mask_subs_epu8(__W,__U,__A,__B); } __m256i test_mm256_maskz_subs_epu8(__mmask32 __U, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_maskz_subs_epu8 + // CHECK-LABEL: test_mm256_maskz_subs_epu8 // CHECK-NOT: 
@llvm.x86.avx2.psubus.b // CHECK: call <32 x i8> @llvm.usub.sat.v32i8(<32 x i8> %{{.*}}, <32 x i8> %{{.*}}) // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_maskz_subs_epu8(__U,__A,__B); } __m128i test_mm_mask_subs_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_mask_subs_epu16 + // CHECK-LABEL: test_mm_mask_subs_epu16 // CHECK-NOT: @llvm.x86.sse2.psubus.w // CHECK: call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}) // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_subs_epu16(__W,__U,__A,__B); } __m128i test_mm_maskz_subs_epu16(__mmask8 __U, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_maskz_subs_epu16 + // CHECK-LABEL: test_mm_maskz_subs_epu16 // CHECK-NOT: @llvm.x86.sse2.psubus.w // CHECK: call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}) // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_subs_epu16(__U,__A,__B); } __m256i test_mm256_mask_subs_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_mask_subs_epu16 + // CHECK-LABEL: test_mm256_mask_subs_epu16 // CHECK-NOT: @llvm.x86.avx2.psubus.w // CHECK: call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}) // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_subs_epu16(__W,__U,__A,__B); } __m256i test_mm256_maskz_subs_epu16(__mmask16 __U, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_maskz_subs_epu16 + // CHECK-LABEL: test_mm256_maskz_subs_epu16 // CHECK-NOT: @llvm.x86.avx2.psubus.w // CHECK: call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}) // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} @@ -1612,1019 +1615,1019 @@ __m256i test_mm256_maskz_subs_epu16(__mmask16 __U, __m256i __A, __m256i __B) { __m128i 
test_mm_mask2_permutex2var_epi16(__m128i __A, __m128i __I, __mmask8 __U, __m128i __B) { - // CHECK-LABEL: @test_mm_mask2_permutex2var_epi16 + // CHECK-LABEL: test_mm_mask2_permutex2var_epi16 // CHECK: @llvm.x86.avx512.vpermi2var.hi.128 // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask2_permutex2var_epi16(__A,__I,__U,__B); } __m256i test_mm256_mask2_permutex2var_epi16(__m256i __A, __m256i __I, __mmask16 __U, __m256i __B) { - // CHECK-LABEL: @test_mm256_mask2_permutex2var_epi16 + // CHECK-LABEL: test_mm256_mask2_permutex2var_epi16 // CHECK: @llvm.x86.avx512.vpermi2var.hi.256 // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask2_permutex2var_epi16(__A,__I,__U,__B); } __m128i test_mm_permutex2var_epi16(__m128i __A, __m128i __I, __m128i __B) { - // CHECK-LABEL: @test_mm_permutex2var_epi16 + // CHECK-LABEL: test_mm_permutex2var_epi16 // CHECK: @llvm.x86.avx512.vpermi2var.hi.128 return _mm_permutex2var_epi16(__A,__I,__B); } __m128i test_mm_mask_permutex2var_epi16(__m128i __A, __mmask8 __U, __m128i __I, __m128i __B) { - // CHECK-LABEL: @test_mm_mask_permutex2var_epi16 + // CHECK-LABEL: test_mm_mask_permutex2var_epi16 // CHECK: @llvm.x86.avx512.vpermi2var.hi.128 // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_permutex2var_epi16(__A,__U,__I,__B); } __m128i test_mm_maskz_permutex2var_epi16(__mmask8 __U, __m128i __A, __m128i __I, __m128i __B) { - // CHECK-LABEL: @test_mm_maskz_permutex2var_epi16 + // CHECK-LABEL: test_mm_maskz_permutex2var_epi16 // CHECK: @llvm.x86.avx512.vpermi2var.hi.128 // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_permutex2var_epi16(__U,__A,__I,__B); } __m256i test_mm256_permutex2var_epi16(__m256i __A, __m256i __I, __m256i __B) { - // CHECK-LABEL: @test_mm256_permutex2var_epi16 + // CHECK-LABEL: test_mm256_permutex2var_epi16 // CHECK: @llvm.x86.avx512.vpermi2var.hi.256 return 
_mm256_permutex2var_epi16(__A,__I,__B); } __m256i test_mm256_mask_permutex2var_epi16(__m256i __A, __mmask16 __U, __m256i __I, __m256i __B) { - // CHECK-LABEL: @test_mm256_mask_permutex2var_epi16 + // CHECK-LABEL: test_mm256_mask_permutex2var_epi16 // CHECK: @llvm.x86.avx512.vpermi2var.hi.256 // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_permutex2var_epi16(__A,__U,__I,__B); } __m256i test_mm256_maskz_permutex2var_epi16(__mmask16 __U, __m256i __A, __m256i __I, __m256i __B) { - // CHECK-LABEL: @test_mm256_maskz_permutex2var_epi16 + // CHECK-LABEL: test_mm256_maskz_permutex2var_epi16 // CHECK: @llvm.x86.avx512.vpermi2var.hi.256 // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_permutex2var_epi16(__U,__A,__I,__B); } __m128i test_mm_mask_maddubs_epi16(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) { - // CHECK-LABEL: @test_mm_mask_maddubs_epi16 + // CHECK-LABEL: test_mm_mask_maddubs_epi16 // CHECK: @llvm.x86.ssse3.pmadd.ub.sw // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_maddubs_epi16(__W, __U, __X, __Y); } __m128i test_mm_maskz_maddubs_epi16(__mmask8 __U, __m128i __X, __m128i __Y) { - // CHECK-LABEL: @test_mm_maskz_maddubs_epi16 + // CHECK-LABEL: test_mm_maskz_maddubs_epi16 // CHECK: @llvm.x86.ssse3.pmadd.ub.sw // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_maddubs_epi16(__U, __X, __Y); } __m256i test_mm256_mask_maddubs_epi16(__m256i __W, __mmask16 __U, __m256i __X, __m256i __Y) { - // CHECK-LABEL: @test_mm256_mask_maddubs_epi16 + // CHECK-LABEL: test_mm256_mask_maddubs_epi16 // CHECK: @llvm.x86.avx2.pmadd.ub.sw // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_maddubs_epi16(__W, __U, __X, __Y); } __m256i test_mm256_maskz_maddubs_epi16(__mmask16 __U, __m256i __X, __m256i __Y) { - // CHECK-LABEL: @test_mm256_maskz_maddubs_epi16 + // CHECK-LABEL: 
test_mm256_maskz_maddubs_epi16 // CHECK: @llvm.x86.avx2.pmadd.ub.sw // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_maddubs_epi16(__U, __X, __Y); } __m128i test_mm_mask_madd_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_mask_madd_epi16 + // CHECK-LABEL: test_mm_mask_madd_epi16 // CHECK: @llvm.x86.sse2.pmadd.wd // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_mask_madd_epi16(__W, __U, __A, __B); } __m128i test_mm_maskz_madd_epi16(__mmask8 __U, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_maskz_madd_epi16 + // CHECK-LABEL: test_mm_maskz_madd_epi16 // CHECK: @llvm.x86.sse2.pmadd.wd // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_maskz_madd_epi16(__U, __A, __B); } __m256i test_mm256_mask_madd_epi16(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_mask_madd_epi16 + // CHECK-LABEL: test_mm256_mask_madd_epi16 // CHECK: @llvm.x86.avx2.pmadd.wd // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_mask_madd_epi16(__W, __U, __A, __B); } __m256i test_mm256_maskz_madd_epi16(__mmask8 __U, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_maskz_madd_epi16 + // CHECK-LABEL: test_mm256_maskz_madd_epi16 // CHECK: @llvm.x86.avx2.pmadd.wd // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_maskz_madd_epi16(__U, __A, __B); } __m128i test_mm_cvtsepi16_epi8(__m128i __A) { - // CHECK-LABEL: @test_mm_cvtsepi16_epi8 + // CHECK-LABEL: test_mm_cvtsepi16_epi8 // CHECK: @llvm.x86.avx512.mask.pmovs.wb.128 return _mm_cvtsepi16_epi8(__A); } __m128i test_mm_mask_cvtsepi16_epi8(__m128i __O, __mmask8 __M, __m128i __A) { - // CHECK-LABEL: @test_mm_mask_cvtsepi16_epi8 + // CHECK-LABEL: test_mm_mask_cvtsepi16_epi8 // CHECK: @llvm.x86.avx512.mask.pmovs.wb.128 return _mm_mask_cvtsepi16_epi8(__O, __M, __A); } __m128i 
test_mm_maskz_cvtsepi16_epi8(__mmask8 __M, __m128i __A) { - // CHECK-LABEL: @test_mm_maskz_cvtsepi16_epi8 + // CHECK-LABEL: test_mm_maskz_cvtsepi16_epi8 // CHECK: @llvm.x86.avx512.mask.pmovs.wb.128 return _mm_maskz_cvtsepi16_epi8(__M, __A); } __m128i test_mm256_cvtsepi16_epi8(__m256i __A) { - // CHECK-LABEL: @test_mm256_cvtsepi16_epi8 + // CHECK-LABEL: test_mm256_cvtsepi16_epi8 // CHECK: @llvm.x86.avx512.mask.pmovs.wb.256 return _mm256_cvtsepi16_epi8(__A); } __m128i test_mm256_mask_cvtsepi16_epi8(__m128i __O, __mmask16 __M, __m256i __A) { - // CHECK-LABEL: @test_mm256_mask_cvtsepi16_epi8 + // CHECK-LABEL: test_mm256_mask_cvtsepi16_epi8 // CHECK: @llvm.x86.avx512.mask.pmovs.wb.256 return _mm256_mask_cvtsepi16_epi8(__O, __M, __A); } __m128i test_mm256_maskz_cvtsepi16_epi8(__mmask16 __M, __m256i __A) { - // CHECK-LABEL: @test_mm256_maskz_cvtsepi16_epi8 + // CHECK-LABEL: test_mm256_maskz_cvtsepi16_epi8 // CHECK: @llvm.x86.avx512.mask.pmovs.wb.256 return _mm256_maskz_cvtsepi16_epi8(__M, __A); } __m128i test_mm_cvtusepi16_epi8(__m128i __A) { - // CHECK-LABEL: @test_mm_cvtusepi16_epi8 + // CHECK-LABEL: test_mm_cvtusepi16_epi8 // CHECK: @llvm.x86.avx512.mask.pmovus.wb.128 return _mm_cvtusepi16_epi8(__A); } __m128i test_mm_mask_cvtusepi16_epi8(__m128i __O, __mmask8 __M, __m128i __A) { - // CHECK-LABEL: @test_mm_mask_cvtusepi16_epi8 + // CHECK-LABEL: test_mm_mask_cvtusepi16_epi8 // CHECK: @llvm.x86.avx512.mask.pmovus.wb.128 return _mm_mask_cvtusepi16_epi8(__O, __M, __A); } __m128i test_mm_maskz_cvtusepi16_epi8(__mmask8 __M, __m128i __A) { - // CHECK-LABEL: @test_mm_maskz_cvtusepi16_epi8 + // CHECK-LABEL: test_mm_maskz_cvtusepi16_epi8 // CHECK: @llvm.x86.avx512.mask.pmovus.wb.128 return _mm_maskz_cvtusepi16_epi8(__M, __A); } __m128i test_mm256_cvtusepi16_epi8(__m256i __A) { - // CHECK-LABEL: @test_mm256_cvtusepi16_epi8 + // CHECK-LABEL: test_mm256_cvtusepi16_epi8 // CHECK: @llvm.x86.avx512.mask.pmovus.wb.256 return _mm256_cvtusepi16_epi8(__A); } __m128i 
test_mm256_mask_cvtusepi16_epi8(__m128i __O, __mmask16 __M, __m256i __A) { - // CHECK-LABEL: @test_mm256_mask_cvtusepi16_epi8 + // CHECK-LABEL: test_mm256_mask_cvtusepi16_epi8 // CHECK: @llvm.x86.avx512.mask.pmovus.wb.256 return _mm256_mask_cvtusepi16_epi8(__O, __M, __A); } __m128i test_mm256_maskz_cvtusepi16_epi8(__mmask16 __M, __m256i __A) { - // CHECK-LABEL: @test_mm256_maskz_cvtusepi16_epi8 + // CHECK-LABEL: test_mm256_maskz_cvtusepi16_epi8 // CHECK: @llvm.x86.avx512.mask.pmovus.wb.256 return _mm256_maskz_cvtusepi16_epi8(__M, __A); } __m128i test_mm_cvtepi16_epi8(__m128i __A) { - // CHECK-LABEL: @test_mm_cvtepi16_epi8 + // CHECK-LABEL: test_mm_cvtepi16_epi8 // CHECK: trunc <8 x i16> %{{.*}} to <8 x i8> // CHECK: shufflevector <8 x i8> %{{.*}}, <8 x i8> %{{.*}}, <16 x i32> return _mm_cvtepi16_epi8(__A); } __m128i test_mm_mask_cvtepi16_epi8(__m128i __O, __mmask8 __M, __m128i __A) { - // CHECK-LABEL: @test_mm_mask_cvtepi16_epi8 + // CHECK-LABEL: test_mm_mask_cvtepi16_epi8 // CHECK: @llvm.x86.avx512.mask.pmov.wb.128 return _mm_mask_cvtepi16_epi8(__O, __M, __A); } __m128i test_mm_maskz_cvtepi16_epi8(__mmask8 __M, __m128i __A) { - // CHECK-LABEL: @test_mm_maskz_cvtepi16_epi8 + // CHECK-LABEL: test_mm_maskz_cvtepi16_epi8 // CHECK: @llvm.x86.avx512.mask.pmov.wb.128 return _mm_maskz_cvtepi16_epi8(__M, __A); } __m128i test_mm256_cvtepi16_epi8(__m256i __A) { - // CHECK-LABEL: @test_mm256_cvtepi16_epi8 + // CHECK-LABEL: test_mm256_cvtepi16_epi8 // CHECK: trunc <16 x i16> %{{.*}} to <16 x i8> return _mm256_cvtepi16_epi8(__A); } __m128i test_mm256_mask_cvtepi16_epi8(__m128i __O, __mmask16 __M, __m256i __A) { - // CHECK-LABEL: @test_mm256_mask_cvtepi16_epi8 + // CHECK-LABEL: test_mm256_mask_cvtepi16_epi8 // CHECK: trunc <16 x i16> %{{.*}} to <16 x i8> // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm256_mask_cvtepi16_epi8(__O, __M, __A); } __m128i test_mm256_maskz_cvtepi16_epi8(__mmask16 __M, __m256i __A) { - // CHECK-LABEL: 
@test_mm256_maskz_cvtepi16_epi8 + // CHECK-LABEL: test_mm256_maskz_cvtepi16_epi8 // CHECK: trunc <16 x i16> %{{.*}} to <16 x i8> // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm256_maskz_cvtepi16_epi8(__M, __A); } __m128i test_mm_mask_mulhrs_epi16(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) { - // CHECK-LABEL: @test_mm_mask_mulhrs_epi16 + // CHECK-LABEL: test_mm_mask_mulhrs_epi16 // CHECK: @llvm.x86.ssse3.pmul.hr.sw // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_mulhrs_epi16(__W, __U, __X, __Y); } __m128i test_mm_maskz_mulhrs_epi16(__mmask8 __U, __m128i __X, __m128i __Y) { - // CHECK-LABEL: @test_mm_maskz_mulhrs_epi16 + // CHECK-LABEL: test_mm_maskz_mulhrs_epi16 // CHECK: @llvm.x86.ssse3.pmul.hr.sw // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_mulhrs_epi16(__U, __X, __Y); } __m256i test_mm256_mask_mulhrs_epi16(__m256i __W, __mmask16 __U, __m256i __X, __m256i __Y) { - // CHECK-LABEL: @test_mm256_mask_mulhrs_epi16 + // CHECK-LABEL: test_mm256_mask_mulhrs_epi16 // CHECK: @llvm.x86.avx2.pmul.hr.sw // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_mulhrs_epi16(__W, __U, __X, __Y); } __m256i test_mm256_maskz_mulhrs_epi16(__mmask16 __U, __m256i __X, __m256i __Y) { - // CHECK-LABEL: @test_mm256_maskz_mulhrs_epi16 + // CHECK-LABEL: test_mm256_maskz_mulhrs_epi16 // CHECK: @llvm.x86.avx2.pmul.hr.sw // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_mulhrs_epi16(__U, __X, __Y); } __m128i test_mm_mask_mulhi_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_mask_mulhi_epu16 + // CHECK-LABEL: test_mm_mask_mulhi_epu16 // CHECK: @llvm.x86.sse2.pmulhu.w // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_mulhi_epu16(__W, __U, __A, __B); } __m128i test_mm_maskz_mulhi_epu16(__mmask8 __U, __m128i __A, 
__m128i __B) { - // CHECK-LABEL: @test_mm_maskz_mulhi_epu16 + // CHECK-LABEL: test_mm_maskz_mulhi_epu16 // CHECK: @llvm.x86.sse2.pmulhu.w // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_mulhi_epu16(__U, __A, __B); } __m256i test_mm256_mask_mulhi_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_mask_mulhi_epu16 + // CHECK-LABEL: test_mm256_mask_mulhi_epu16 // CHECK: @llvm.x86.avx2.pmulhu.w // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_mulhi_epu16(__W, __U, __A, __B); } __m256i test_mm256_maskz_mulhi_epu16(__mmask16 __U, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_maskz_mulhi_epu16 + // CHECK-LABEL: test_mm256_maskz_mulhi_epu16 // CHECK: @llvm.x86.avx2.pmulhu.w // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_mulhi_epu16(__U, __A, __B); } __m128i test_mm_mask_mulhi_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_mask_mulhi_epi16 + // CHECK-LABEL: test_mm_mask_mulhi_epi16 // CHECK: @llvm.x86.sse2.pmulh.w // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_mulhi_epi16(__W, __U, __A, __B); } __m128i test_mm_maskz_mulhi_epi16(__mmask8 __U, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_maskz_mulhi_epi16 + // CHECK-LABEL: test_mm_maskz_mulhi_epi16 // CHECK: @llvm.x86.sse2.pmulh.w // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_mulhi_epi16(__U, __A, __B); } __m256i test_mm256_mask_mulhi_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_mask_mulhi_epi16 + // CHECK-LABEL: test_mm256_mask_mulhi_epi16 // CHECK: @llvm.x86.avx2.pmulh.w // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_mulhi_epi16(__W, __U, __A, __B); } __m256i test_mm256_maskz_mulhi_epi16(__mmask16 __U, __m256i __A, 
__m256i __B) { - // CHECK-LABEL: @test_mm256_maskz_mulhi_epi16 + // CHECK-LABEL: test_mm256_maskz_mulhi_epi16 // CHECK: @llvm.x86.avx2.pmulh.w // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_mulhi_epi16(__U, __A, __B); } __m128i test_mm_mask_unpackhi_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_mask_unpackhi_epi8 + // CHECK-LABEL: test_mm_mask_unpackhi_epi8 // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_mask_unpackhi_epi8(__W, __U, __A, __B); } __m128i test_mm_maskz_unpackhi_epi8(__mmask16 __U, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_maskz_unpackhi_epi8 + // CHECK-LABEL: test_mm_maskz_unpackhi_epi8 // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_maskz_unpackhi_epi8(__U, __A, __B); } __m256i test_mm256_mask_unpackhi_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_mask_unpackhi_epi8 + // CHECK-LABEL: test_mm256_mask_unpackhi_epi8 // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i32> // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_mask_unpackhi_epi8(__W, __U, __A, __B); } __m256i test_mm256_maskz_unpackhi_epi8(__mmask32 __U, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_maskz_unpackhi_epi8 + // CHECK-LABEL: test_mm256_maskz_unpackhi_epi8 // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i32> // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_maskz_unpackhi_epi8(__U, __A, __B); } __m128i test_mm_mask_unpackhi_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_mask_unpackhi_epi16 + // CHECK-LABEL: test_mm_mask_unpackhi_epi16 // CHECK: shufflevector <8 
x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_unpackhi_epi16(__W, __U, __A, __B); } __m128i test_mm_maskz_unpackhi_epi16(__mmask8 __U, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_maskz_unpackhi_epi16 + // CHECK-LABEL: test_mm_maskz_unpackhi_epi16 // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_unpackhi_epi16(__U, __A, __B); } __m256i test_mm256_mask_unpackhi_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_mask_unpackhi_epi16 + // CHECK-LABEL: test_mm256_mask_unpackhi_epi16 // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i32> // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_unpackhi_epi16(__W, __U, __A, __B); } __m256i test_mm256_maskz_unpackhi_epi16(__mmask16 __U, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_maskz_unpackhi_epi16 + // CHECK-LABEL: test_mm256_maskz_unpackhi_epi16 // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i32> // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_unpackhi_epi16(__U, __A, __B); } __m128i test_mm_mask_unpacklo_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_mask_unpacklo_epi8 + // CHECK-LABEL: test_mm_mask_unpacklo_epi8 // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_mask_unpacklo_epi8(__W, __U, __A, __B); } __m128i test_mm_maskz_unpacklo_epi8(__mmask16 __U, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_maskz_unpacklo_epi8 + // CHECK-LABEL: test_mm_maskz_unpacklo_epi8 // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> // CHECK: select <16 x i1> %{{.*}}, <16 x i8> 
%{{.*}}, <16 x i8> %{{.*}} return _mm_maskz_unpacklo_epi8(__U, __A, __B); } __m256i test_mm256_mask_unpacklo_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_mask_unpacklo_epi8 + // CHECK-LABEL: test_mm256_mask_unpacklo_epi8 // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i32> // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_mask_unpacklo_epi8(__W, __U, __A, __B); } __m256i test_mm256_maskz_unpacklo_epi8(__mmask32 __U, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_maskz_unpacklo_epi8 + // CHECK-LABEL: test_mm256_maskz_unpacklo_epi8 // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i32> // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_maskz_unpacklo_epi8(__U, __A, __B); } __m128i test_mm_mask_unpacklo_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_mask_unpacklo_epi16 + // CHECK-LABEL: test_mm_mask_unpacklo_epi16 // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_unpacklo_epi16(__W, __U, __A, __B); } __m128i test_mm_maskz_unpacklo_epi16(__mmask8 __U, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_maskz_unpacklo_epi16 + // CHECK-LABEL: test_mm_maskz_unpacklo_epi16 // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_unpacklo_epi16(__U, __A, __B); } __m256i test_mm256_mask_unpacklo_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_mask_unpacklo_epi16 + // CHECK-LABEL: test_mm256_mask_unpacklo_epi16 // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i32> // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_unpacklo_epi16(__W, __U, __A, __B); } 
__m256i test_mm256_maskz_unpacklo_epi16(__mmask16 __U, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_maskz_unpacklo_epi16 + // CHECK-LABEL: test_mm256_maskz_unpacklo_epi16 // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i32> // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_unpacklo_epi16(__U, __A, __B); } __m128i test_mm_mask_cvtepi8_epi16(__m128i __W, __mmask8 __U, __m128i __A) { - // CHECK-LABEL: @test_mm_mask_cvtepi8_epi16 + // CHECK-LABEL: test_mm_mask_cvtepi8_epi16 // CHECK: sext <8 x i8> %{{.*}} to <8 x i16> // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_cvtepi8_epi16(__W, __U, __A); } __m128i test_mm_maskz_cvtepi8_epi16(__mmask8 __U, __m128i __A) { - // CHECK-LABEL: @test_mm_maskz_cvtepi8_epi16 + // CHECK-LABEL: test_mm_maskz_cvtepi8_epi16 // CHECK: sext <8 x i8> %{{.*}} to <8 x i16> // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_cvtepi8_epi16(__U, __A); } __m256i test_mm256_mask_cvtepi8_epi16(__m256i __W, __mmask16 __U, __m128i __A) { - // CHECK-LABEL: @test_mm256_mask_cvtepi8_epi16 + // CHECK-LABEL: test_mm256_mask_cvtepi8_epi16 // CHECK: sext <16 x i8> %{{.*}} to <16 x i16> // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_cvtepi8_epi16(__W, __U, __A); } __m256i test_mm256_maskz_cvtepi8_epi16(__mmask16 __U, __m128i __A) { - // CHECK-LABEL: @test_mm256_maskz_cvtepi8_epi16 + // CHECK-LABEL: test_mm256_maskz_cvtepi8_epi16 // CHECK: sext <16 x i8> %{{.*}} to <16 x i16> // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_cvtepi8_epi16(__U, __A); } __m128i test_mm_mask_cvtepu8_epi16(__m128i __W, __mmask8 __U, __m128i __A) { - // CHECK-LABEL: @test_mm_mask_cvtepu8_epi16 + // CHECK-LABEL: test_mm_mask_cvtepu8_epi16 // CHECK: zext <8 x i8> %{{.*}} to <8 x i16> // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x 
i16> %{{.*}} return _mm_mask_cvtepu8_epi16(__W, __U, __A); } __m128i test_mm_maskz_cvtepu8_epi16(__mmask8 __U, __m128i __A) { - // CHECK-LABEL: @test_mm_maskz_cvtepu8_epi16 + // CHECK-LABEL: test_mm_maskz_cvtepu8_epi16 // CHECK: zext <8 x i8> %{{.*}} to <8 x i16> // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_cvtepu8_epi16(__U, __A); } __m256i test_mm256_mask_cvtepu8_epi16(__m256i __W, __mmask16 __U, __m128i __A) { - // CHECK-LABEL: @test_mm256_mask_cvtepu8_epi16 + // CHECK-LABEL: test_mm256_mask_cvtepu8_epi16 // CHECK: zext <16 x i8> %{{.*}} to <16 x i16> // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_cvtepu8_epi16(__W, __U, __A); } __m256i test_mm256_maskz_cvtepu8_epi16(__mmask16 __U, __m128i __A) { - // CHECK-LABEL: @test_mm256_maskz_cvtepu8_epi16 + // CHECK-LABEL: test_mm256_maskz_cvtepu8_epi16 // CHECK: zext <16 x i8> %{{.*}} to <16 x i16> // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_cvtepu8_epi16(__U, __A); } __m256i test_mm256_sllv_epi16(__m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_sllv_epi16 + // CHECK-LABEL: test_mm256_sllv_epi16 // CHECK: @llvm.x86.avx512.psllv.w.256( return _mm256_sllv_epi16(__A, __B); } __m256i test_mm256_mask_sllv_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_mask_sllv_epi16 + // CHECK-LABEL: test_mm256_mask_sllv_epi16 // CHECK: @llvm.x86.avx512.psllv.w.256( // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_sllv_epi16(__W, __U, __A, __B); } __m256i test_mm256_maskz_sllv_epi16(__mmask16 __U, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_maskz_sllv_epi16 + // CHECK-LABEL: test_mm256_maskz_sllv_epi16 // CHECK: @llvm.x86.avx512.psllv.w.256( // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_sllv_epi16(__U, __A, __B); } __m128i 
test_mm_sllv_epi16(__m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_sllv_epi16 + // CHECK-LABEL: test_mm_sllv_epi16 // CHECK: @llvm.x86.avx512.psllv.w.128( return _mm_sllv_epi16(__A, __B); } __m128i test_mm_mask_sllv_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_mask_sllv_epi16 + // CHECK-LABEL: test_mm_mask_sllv_epi16 // CHECK: @llvm.x86.avx512.psllv.w.128( // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_sllv_epi16(__W, __U, __A, __B); } __m128i test_mm_maskz_sllv_epi16(__mmask8 __U, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_maskz_sllv_epi16 + // CHECK-LABEL: test_mm_maskz_sllv_epi16 // CHECK: @llvm.x86.avx512.psllv.w.128( // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_sllv_epi16(__U, __A, __B); } __m128i test_mm_mask_sll_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_mask_sll_epi16 + // CHECK-LABEL: test_mm_mask_sll_epi16 // CHECK: @llvm.x86.sse2.psll.w // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_sll_epi16(__W, __U, __A, __B); } __m128i test_mm_maskz_sll_epi16(__mmask8 __U, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_maskz_sll_epi16 + // CHECK-LABEL: test_mm_maskz_sll_epi16 // CHECK: @llvm.x86.sse2.psll.w // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_sll_epi16(__U, __A, __B); } __m256i test_mm256_mask_sll_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m128i __B) { - // CHECK-LABEL: @test_mm256_mask_sll_epi16 + // CHECK-LABEL: test_mm256_mask_sll_epi16 // CHECK: @llvm.x86.avx2.psll.w // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_sll_epi16(__W, __U, __A, __B); } __m256i test_mm256_maskz_sll_epi16(__mmask16 __U, __m256i __A, __m128i __B) { - // CHECK-LABEL: @test_mm256_maskz_sll_epi16 + // CHECK-LABEL: test_mm256_maskz_sll_epi16 // CHECK: 
@llvm.x86.avx2.psll.w // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_sll_epi16(__U, __A, __B); } __m128i test_mm_mask_slli_epi16(__m128i __W, __mmask8 __U, __m128i __A) { - // CHECK-LABEL: @test_mm_mask_slli_epi16 + // CHECK-LABEL: test_mm_mask_slli_epi16 // CHECK: @llvm.x86.sse2.pslli.w // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_slli_epi16(__W, __U, __A, 5); } __m128i test_mm_mask_slli_epi16_2(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B) { - // CHECK-LABEL: @test_mm_mask_slli_epi16_2 + // CHECK-LABEL: test_mm_mask_slli_epi16_2 // CHECK: @llvm.x86.sse2.pslli.w // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_slli_epi16(__W, __U, __A, __B); } __m128i test_mm_maskz_slli_epi16(__mmask8 __U, __m128i __A) { - // CHECK-LABEL: @test_mm_maskz_slli_epi16 + // CHECK-LABEL: test_mm_maskz_slli_epi16 // CHECK: @llvm.x86.sse2.pslli.w // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_slli_epi16(__U, __A, 5); } __m128i test_mm_maskz_slli_epi16_2(__mmask8 __U, __m128i __A, unsigned int __B) { - // CHECK-LABEL: @test_mm_maskz_slli_epi16_2 + // CHECK-LABEL: test_mm_maskz_slli_epi16_2 // CHECK: @llvm.x86.sse2.pslli.w // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_slli_epi16(__U, __A, __B); } __m256i test_mm256_mask_slli_epi16(__m256i __W, __mmask16 __U, __m256i __A) { - // CHECK-LABEL: @test_mm256_mask_slli_epi16 + // CHECK-LABEL: test_mm256_mask_slli_epi16 // CHECK: @llvm.x86.avx2.pslli.w // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_slli_epi16(__W, __U, __A, 5); } __m256i test_mm256_mask_slli_epi16_2(__m256i __W, __mmask16 __U, __m256i __A, unsigned int __B) { - // CHECK-LABEL: @test_mm256_mask_slli_epi16_2 + // CHECK-LABEL: test_mm256_mask_slli_epi16_2 // CHECK: @llvm.x86.avx2.pslli.w // CHECK: select <16 x i1> 
%{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_slli_epi16(__W, __U, __A, __B); } __m256i test_mm256_maskz_slli_epi16(__mmask16 __U, __m256i __A) { - // CHECK-LABEL: @test_mm256_maskz_slli_epi16 + // CHECK-LABEL: test_mm256_maskz_slli_epi16 // CHECK: @llvm.x86.avx2.pslli.w // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_slli_epi16(__U, __A, 5); } __m256i test_mm256_maskz_slli_epi16_2(__mmask16 __U, __m256i __A, unsigned int __B) { - // CHECK-LABEL: @test_mm256_maskz_slli_epi16_2 + // CHECK-LABEL: test_mm256_maskz_slli_epi16_2 // CHECK: @llvm.x86.avx2.pslli.w // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_slli_epi16(__U, __A, __B); } __m256i test_mm256_srlv_epi16(__m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_srlv_epi16 + // CHECK-LABEL: test_mm256_srlv_epi16 // CHECK: @llvm.x86.avx512.psrlv.w.256( return _mm256_srlv_epi16(__A, __B); } __m256i test_mm256_mask_srlv_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_mask_srlv_epi16 + // CHECK-LABEL: test_mm256_mask_srlv_epi16 // CHECK: @llvm.x86.avx512.psrlv.w.256( // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_srlv_epi16(__W, __U, __A, __B); } __m256i test_mm256_maskz_srlv_epi16(__mmask16 __U, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_maskz_srlv_epi16 + // CHECK-LABEL: test_mm256_maskz_srlv_epi16 // CHECK: @llvm.x86.avx512.psrlv.w.256( // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_srlv_epi16(__U, __A, __B); } __m128i test_mm_srlv_epi16(__m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_srlv_epi16 + // CHECK-LABEL: test_mm_srlv_epi16 // CHECK: @llvm.x86.avx512.psrlv.w.128( return _mm_srlv_epi16(__A, __B); } __m128i test_mm_mask_srlv_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_mask_srlv_epi16 + // 
CHECK-LABEL: test_mm_mask_srlv_epi16 // CHECK: @llvm.x86.avx512.psrlv.w.128( // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_srlv_epi16(__W, __U, __A, __B); } __m128i test_mm_maskz_srlv_epi16(__mmask8 __U, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_maskz_srlv_epi16 + // CHECK-LABEL: test_mm_maskz_srlv_epi16 // CHECK: @llvm.x86.avx512.psrlv.w.128( // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_srlv_epi16(__U, __A, __B); } __m128i test_mm_mask_srl_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_mask_srl_epi16 + // CHECK-LABEL: test_mm_mask_srl_epi16 // CHECK: @llvm.x86.sse2.psrl.w // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_srl_epi16(__W, __U, __A, __B); } __m128i test_mm_maskz_srl_epi16(__mmask8 __U, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_maskz_srl_epi16 + // CHECK-LABEL: test_mm_maskz_srl_epi16 // CHECK: @llvm.x86.sse2.psrl.w // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_srl_epi16(__U, __A, __B); } __m256i test_mm256_mask_srl_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m128i __B) { - // CHECK-LABEL: @test_mm256_mask_srl_epi16 + // CHECK-LABEL: test_mm256_mask_srl_epi16 // CHECK: @llvm.x86.avx2.psrl.w // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_srl_epi16(__W, __U, __A, __B); } __m256i test_mm256_maskz_srl_epi16(__mmask16 __U, __m256i __A, __m128i __B) { - // CHECK-LABEL: @test_mm256_maskz_srl_epi16 + // CHECK-LABEL: test_mm256_maskz_srl_epi16 // CHECK: @llvm.x86.avx2.psrl.w // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_srl_epi16(__U, __A, __B); } __m128i test_mm_mask_srli_epi16(__m128i __W, __mmask8 __U, __m128i __A) { - // CHECK-LABEL: @test_mm_mask_srli_epi16 + // CHECK-LABEL: test_mm_mask_srli_epi16 // CHECK: 
@llvm.x86.sse2.psrli.w // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_srli_epi16(__W, __U, __A, 5); } __m128i test_mm_mask_srli_epi16_2(__m128i __W, __mmask8 __U, __m128i __A, int __B) { - // CHECK-LABEL: @test_mm_mask_srli_epi16_2 + // CHECK-LABEL: test_mm_mask_srli_epi16_2 // CHECK: @llvm.x86.sse2.psrli.w // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_srli_epi16(__W, __U, __A, __B); } __m128i test_mm_maskz_srli_epi16(__mmask8 __U, __m128i __A) { - // CHECK-LABEL: @test_mm_maskz_srli_epi16 + // CHECK-LABEL: test_mm_maskz_srli_epi16 // CHECK: @llvm.x86.sse2.psrli.w // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_srli_epi16(__U, __A, 5); } __m128i test_mm_maskz_srli_epi16_2(__mmask8 __U, __m128i __A, int __B) { - // CHECK-LABEL: @test_mm_maskz_srli_epi16_2 + // CHECK-LABEL: test_mm_maskz_srli_epi16_2 // CHECK: @llvm.x86.sse2.psrli.w // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_srli_epi16(__U, __A, __B); } __m256i test_mm256_mask_srli_epi16(__m256i __W, __mmask16 __U, __m256i __A) { - // CHECK-LABEL: @test_mm256_mask_srli_epi16 + // CHECK-LABEL: test_mm256_mask_srli_epi16 // CHECK: @llvm.x86.avx2.psrli.w // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_srli_epi16(__W, __U, __A, 5); } __m256i test_mm256_mask_srli_epi16_2(__m256i __W, __mmask16 __U, __m256i __A, int __B) { - // CHECK-LABEL: @test_mm256_mask_srli_epi16_2 + // CHECK-LABEL: test_mm256_mask_srli_epi16_2 // CHECK: @llvm.x86.avx2.psrli.w // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_srli_epi16(__W, __U, __A, __B); } __m256i test_mm256_maskz_srli_epi16(__mmask16 __U, __m256i __A) { - // CHECK-LABEL: @test_mm256_maskz_srli_epi16 + // CHECK-LABEL: test_mm256_maskz_srli_epi16 // CHECK: @llvm.x86.avx2.psrli.w // CHECK: select <16 x i1> %{{.*}}, <16 x i16> 
%{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_srli_epi16(__U, __A, 5); } __m256i test_mm256_maskz_srli_epi16_2(__mmask16 __U, __m256i __A, int __B) { - // CHECK-LABEL: @test_mm256_maskz_srli_epi16_2 + // CHECK-LABEL: test_mm256_maskz_srli_epi16_2 // CHECK: @llvm.x86.avx2.psrli.w // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_srli_epi16(__U, __A, __B); } __m256i test_mm256_srav_epi16(__m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_srav_epi16 + // CHECK-LABEL: test_mm256_srav_epi16 // CHECK: @llvm.x86.avx512.psrav.w.256( return _mm256_srav_epi16(__A, __B); } __m256i test_mm256_mask_srav_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_mask_srav_epi16 + // CHECK-LABEL: test_mm256_mask_srav_epi16 // CHECK: @llvm.x86.avx512.psrav.w.256( // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_srav_epi16(__W, __U, __A, __B); } __m256i test_mm256_maskz_srav_epi16(__mmask16 __U, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_maskz_srav_epi16 + // CHECK-LABEL: test_mm256_maskz_srav_epi16 // CHECK: @llvm.x86.avx512.psrav.w.256( // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_srav_epi16(__U, __A, __B); } __m128i test_mm_srav_epi16(__m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_srav_epi16 + // CHECK-LABEL: test_mm_srav_epi16 // CHECK: @llvm.x86.avx512.psrav.w.128( return _mm_srav_epi16(__A, __B); } __m128i test_mm_mask_srav_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_mask_srav_epi16 + // CHECK-LABEL: test_mm_mask_srav_epi16 // CHECK: @llvm.x86.avx512.psrav.w.128( // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_srav_epi16(__W, __U, __A, __B); } __m128i test_mm_maskz_srav_epi16(__mmask8 __U, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_maskz_srav_epi16 + // CHECK-LABEL: 
test_mm_maskz_srav_epi16 // CHECK: @llvm.x86.avx512.psrav.w.128( // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_srav_epi16(__U, __A, __B); } __m128i test_mm_mask_sra_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_mask_sra_epi16 + // CHECK-LABEL: test_mm_mask_sra_epi16 // CHECK: @llvm.x86.sse2.psra.w // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_sra_epi16(__W, __U, __A, __B); } __m128i test_mm_maskz_sra_epi16(__mmask8 __U, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_maskz_sra_epi16 + // CHECK-LABEL: test_mm_maskz_sra_epi16 // CHECK: @llvm.x86.sse2.psra.w // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_sra_epi16(__U, __A, __B); } __m256i test_mm256_mask_sra_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m128i __B) { - // CHECK-LABEL: @test_mm256_mask_sra_epi16 + // CHECK-LABEL: test_mm256_mask_sra_epi16 // CHECK: @llvm.x86.avx2.psra.w // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_sra_epi16(__W, __U, __A, __B); } __m256i test_mm256_maskz_sra_epi16(__mmask16 __U, __m256i __A, __m128i __B) { - // CHECK-LABEL: @test_mm256_maskz_sra_epi16 + // CHECK-LABEL: test_mm256_maskz_sra_epi16 // CHECK: @llvm.x86.avx2.psra.w // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_sra_epi16(__U, __A, __B); } __m128i test_mm_mask_srai_epi16(__m128i __W, __mmask8 __U, __m128i __A) { - // CHECK-LABEL: @test_mm_mask_srai_epi16 + // CHECK-LABEL: test_mm_mask_srai_epi16 // CHECK: @llvm.x86.sse2.psrai.w // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_srai_epi16(__W, __U, __A, 5); } __m128i test_mm_mask_srai_epi16_2(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B) { - // CHECK-LABEL: @test_mm_mask_srai_epi16_2 + // CHECK-LABEL: test_mm_mask_srai_epi16_2 // CHECK: 
@llvm.x86.sse2.psrai.w // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_srai_epi16(__W, __U, __A, __B); } __m128i test_mm_maskz_srai_epi16(__mmask8 __U, __m128i __A) { - // CHECK-LABEL: @test_mm_maskz_srai_epi16 + // CHECK-LABEL: test_mm_maskz_srai_epi16 // CHECK: @llvm.x86.sse2.psrai.w // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_srai_epi16(__U, __A, 5); } __m128i test_mm_maskz_srai_epi16_2(__mmask8 __U, __m128i __A, unsigned int __B) { - // CHECK-LABEL: @test_mm_maskz_srai_epi16_2 + // CHECK-LABEL: test_mm_maskz_srai_epi16_2 // CHECK: @llvm.x86.sse2.psrai.w // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_srai_epi16(__U, __A, __B); } __m256i test_mm256_mask_srai_epi16(__m256i __W, __mmask16 __U, __m256i __A) { - // CHECK-LABEL: @test_mm256_mask_srai_epi16 + // CHECK-LABEL: test_mm256_mask_srai_epi16 // CHECK: @llvm.x86.avx2.psrai.w // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_srai_epi16(__W, __U, __A, 5); } __m256i test_mm256_mask_srai_epi16_2(__m256i __W, __mmask16 __U, __m256i __A, unsigned int __B) { - // CHECK-LABEL: @test_mm256_mask_srai_epi16_2 + // CHECK-LABEL: test_mm256_mask_srai_epi16_2 // CHECK: @llvm.x86.avx2.psrai.w // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_srai_epi16(__W, __U, __A, __B); } __m256i test_mm256_maskz_srai_epi16(__mmask16 __U, __m256i __A) { - // CHECK-LABEL: @test_mm256_maskz_srai_epi16 + // CHECK-LABEL: test_mm256_maskz_srai_epi16 // CHECK: @llvm.x86.avx2.psrai.w // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_srai_epi16(__U, __A, 5); } __m256i test_mm256_maskz_srai_epi16_2(__mmask16 __U, __m256i __A, unsigned int __B) { - // CHECK-LABEL: @test_mm256_maskz_srai_epi16_2 + // CHECK-LABEL: test_mm256_maskz_srai_epi16_2 // CHECK: @llvm.x86.avx2.psrai.w // CHECK: select <16 
x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_srai_epi16(__U, __A, __B); } __m128i test_mm_mask_mov_epi16(__m128i __W, __mmask8 __U, __m128i __A) { - // CHECK-LABEL: @test_mm_mask_mov_epi16 + // CHECK-LABEL: test_mm_mask_mov_epi16 // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_mov_epi16(__W, __U, __A); } __m128i test_mm_maskz_mov_epi16(__mmask8 __U, __m128i __A) { - // CHECK-LABEL: @test_mm_maskz_mov_epi16 + // CHECK-LABEL: test_mm_maskz_mov_epi16 // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_mov_epi16(__U, __A); } __m256i test_mm256_mask_mov_epi16(__m256i __W, __mmask16 __U, __m256i __A) { - // CHECK-LABEL: @test_mm256_mask_mov_epi16 + // CHECK-LABEL: test_mm256_mask_mov_epi16 // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_mov_epi16(__W, __U, __A); } __m256i test_mm256_maskz_mov_epi16(__mmask16 __U, __m256i __A) { - // CHECK-LABEL: @test_mm256_maskz_mov_epi16 + // CHECK-LABEL: test_mm256_maskz_mov_epi16 // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_mov_epi16(__U, __A); } __m128i test_mm_mask_mov_epi8(__m128i __W, __mmask16 __U, __m128i __A) { - // CHECK-LABEL: @test_mm_mask_mov_epi8 + // CHECK-LABEL: test_mm_mask_mov_epi8 // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_mask_mov_epi8(__W, __U, __A); } __m128i test_mm_maskz_mov_epi8(__mmask16 __U, __m128i __A) { - // CHECK-LABEL: @test_mm_maskz_mov_epi8 + // CHECK-LABEL: test_mm_maskz_mov_epi8 // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_maskz_mov_epi8(__U, __A); } __m256i test_mm256_mask_mov_epi8(__m256i __W, __mmask32 __U, __m256i __A) { - // CHECK-LABEL: @test_mm256_mask_mov_epi8 + // CHECK-LABEL: test_mm256_mask_mov_epi8 // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_mask_mov_epi8(__W, __U, 
__A); } __m256i test_mm256_maskz_mov_epi8(__mmask32 __U, __m256i __A) { - // CHECK-LABEL: @test_mm256_maskz_mov_epi8 + // CHECK-LABEL: test_mm256_maskz_mov_epi8 // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_maskz_mov_epi8(__U, __A); } __m128i test_mm_loadu_epi16(void const *__P) { - // CHECK-LABEL: @test_mm_loadu_epi16 + // CHECK-LABEL: test_mm_loadu_epi16 // CHECK: load <2 x i64>, ptr %{{.*}}, align 1{{$}} return _mm_loadu_epi16(__P); } __m128i test_mm_mask_loadu_epi16(__m128i __W, __mmask8 __U, void const *__P) { - // CHECK-LABEL: @test_mm_mask_loadu_epi16 + // CHECK-LABEL: test_mm_mask_loadu_epi16 // CHECK: @llvm.masked.load.v8i16.p0(ptr %{{.*}}, i32 1, <8 x i1> %{{.*}}, <8 x i16> %{{.*}}) return _mm_mask_loadu_epi16(__W, __U, __P); } __m128i test_mm_maskz_loadu_epi16(__mmask8 __U, void const *__P) { - // CHECK-LABEL: @test_mm_maskz_loadu_epi16 + // CHECK-LABEL: test_mm_maskz_loadu_epi16 // CHECK: @llvm.masked.load.v8i16.p0(ptr %{{.*}}, i32 1, <8 x i1> %{{.*}}, <8 x i16> %{{.*}}) return _mm_maskz_loadu_epi16(__U, __P); } __m256i test_mm256_loadu_epi16(void const *__P) { - // CHECK-LABEL: @test_mm256_loadu_epi16 + // CHECK-LABEL: test_mm256_loadu_epi16 // CHECK: load <4 x i64>, ptr %{{.*}}, align 1{{$}} return _mm256_loadu_epi16(__P); } __m256i test_mm256_mask_loadu_epi16(__m256i __W, __mmask16 __U, void const *__P) { - // CHECK-LABEL: @test_mm256_mask_loadu_epi16 + // CHECK-LABEL: test_mm256_mask_loadu_epi16 // CHECK: @llvm.masked.load.v16i16.p0(ptr %{{.*}}, i32 1, <16 x i1> %{{.*}}, <16 x i16> %{{.*}}) return _mm256_mask_loadu_epi16(__W, __U, __P); } __m256i test_mm256_maskz_loadu_epi16(__mmask16 __U, void const *__P) { - // CHECK-LABEL: @test_mm256_maskz_loadu_epi16 + // CHECK-LABEL: test_mm256_maskz_loadu_epi16 // CHECK: @llvm.masked.load.v16i16.p0(ptr %{{.*}}, i32 1, <16 x i1> %{{.*}}, <16 x i16> %{{.*}}) return _mm256_maskz_loadu_epi16(__U, __P); } __m128i test_mm_loadu_epi8(void const *__P) { - // CHECK-LABEL: 
@test_mm_loadu_epi8 + // CHECK-LABEL: test_mm_loadu_epi8 // CHECK: load <2 x i64>, ptr %{{.*}}, align 1{{$}} return _mm_loadu_epi8(__P); } __m128i test_mm_mask_loadu_epi8(__m128i __W, __mmask16 __U, void const *__P) { - // CHECK-LABEL: @test_mm_mask_loadu_epi8 + // CHECK-LABEL: test_mm_mask_loadu_epi8 // CHECK: @llvm.masked.load.v16i8.p0(ptr %{{.*}}, i32 1, <16 x i1> %{{.*}}, <16 x i8> %{{.*}}) return _mm_mask_loadu_epi8(__W, __U, __P); } __m128i test_mm_maskz_loadu_epi8(__mmask16 __U, void const *__P) { - // CHECK-LABEL: @test_mm_maskz_loadu_epi8 + // CHECK-LABEL: test_mm_maskz_loadu_epi8 // CHECK: @llvm.masked.load.v16i8.p0(ptr %{{.*}}, i32 1, <16 x i1> %{{.*}}, <16 x i8> %{{.*}}) return _mm_maskz_loadu_epi8(__U, __P); } __m256i test_mm256_loadu_epi8(void const *__P) { - // CHECK-LABEL: @test_mm256_loadu_epi8 + // CHECK-LABEL: test_mm256_loadu_epi8 // CHECK: load <4 x i64>, ptr %{{.*}}, align 1{{$}} return _mm256_loadu_epi8(__P); } __m256i test_mm256_mask_loadu_epi8(__m256i __W, __mmask32 __U, void const *__P) { - // CHECK-LABEL: @test_mm256_mask_loadu_epi8 + // CHECK-LABEL: test_mm256_mask_loadu_epi8 // CHECK: @llvm.masked.load.v32i8.p0(ptr %{{.*}}, i32 1, <32 x i1> %{{.*}}, <32 x i8> %{{.*}}) return _mm256_mask_loadu_epi8(__W, __U, __P); } __m256i test_mm256_maskz_loadu_epi8(__mmask32 __U, void const *__P) { - // CHECK-LABEL: @test_mm256_maskz_loadu_epi8 + // CHECK-LABEL: test_mm256_maskz_loadu_epi8 // CHECK: @llvm.masked.load.v32i8.p0(ptr %{{.*}}, i32 1, <32 x i1> %{{.*}}, <32 x i8> %{{.*}}) return _mm256_maskz_loadu_epi8(__U, __P); } void test_mm_storeu_epi16(void *__p, __m128i __a) { - // check-label: @test_mm_storeu_epi16 + // CHECK-LABEL: test_mm_storeu_epi16 // check: store <2 x i64> %{{.*}}, ptr %{{.*}}, align 1{{$}} return _mm_storeu_epi16(__p, __a); } void test_mm_mask_storeu_epi16(void *__P, __mmask8 __U, __m128i __A) { - // CHECK-LABEL: @test_mm_mask_storeu_epi16 + // CHECK-LABEL: test_mm_mask_storeu_epi16 // CHECK: @llvm.masked.store.v8i16.p0(<8 x 
i16> %{{.*}}, ptr %{{.*}}, i32 1, <8 x i1> %{{.*}}) return _mm_mask_storeu_epi16(__P, __U, __A); } void test_mm256_storeu_epi16(void *__P, __m256i __A) { - // CHECK-LABEL: @test_mm256_storeu_epi16 + // CHECK-LABEL: test_mm256_storeu_epi16 // CHECK: store <4 x i64> %{{.*}}, ptr %{{.*}}, align 1{{$}} return _mm256_storeu_epi16(__P, __A); } void test_mm256_mask_storeu_epi16(void *__P, __mmask16 __U, __m256i __A) { - // CHECK-LABEL: @test_mm256_mask_storeu_epi16 + // CHECK-LABEL: test_mm256_mask_storeu_epi16 // CHECK: @llvm.masked.store.v16i16.p0(<16 x i16> %{{.*}}, ptr %{{.*}}, i32 1, <16 x i1> %{{.*}}) return _mm256_mask_storeu_epi16(__P, __U, __A); } void test_mm_storeu_epi8(void *__p, __m128i __a) { - // check-label: @test_mm_storeu_epi8 + // CHECK-LABEL: test_mm_storeu_epi8 // check: store <2 x i64> %{{.*}}, ptr %{{.*}}, align 1{{$}} return _mm_storeu_epi8(__p, __a); } void test_mm_mask_storeu_epi8(void *__P, __mmask16 __U, __m128i __A) { - // CHECK-LABEL: @test_mm_mask_storeu_epi8 + // CHECK-LABEL: test_mm_mask_storeu_epi8 // CHECK: @llvm.masked.store.v16i8.p0(<16 x i8> %{{.*}}, ptr %{{.*}}, i32 1, <16 x i1> %{{.*}}) return _mm_mask_storeu_epi8(__P, __U, __A); } void test_mm256_storeu_epi8(void *__P, __m256i __A) { - // CHECK-LABEL: @test_mm256_storeu_epi8 + // CHECK-LABEL: test_mm256_storeu_epi8 // CHECK: store <4 x i64> %{{.*}}, ptr %{{.*}}, align 1{{$}} return _mm256_storeu_epi8(__P, __A); } void test_mm256_mask_storeu_epi8(void *__P, __mmask32 __U, __m256i __A) { - // CHECK-LABEL: @test_mm256_mask_storeu_epi8 + // CHECK-LABEL: test_mm256_mask_storeu_epi8 // CHECK: @llvm.masked.store.v32i8.p0(<32 x i8> %{{.*}}, ptr %{{.*}}, i32 1, <32 x i1> %{{.*}}) return _mm256_mask_storeu_epi8(__P, __U, __A); } __mmask16 test_mm_test_epi8_mask(__m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_test_epi8_mask + // CHECK-LABEL: test_mm_test_epi8_mask // CHECK: and <2 x i64> %{{.*}}, %{{.*}} // CHECK: icmp ne <16 x i8> %{{.*}}, %{{.*}} return _mm_test_epi8_mask(__A, __B); 
} __mmask16 test_mm_mask_test_epi8_mask(__mmask16 __U, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_mask_test_epi8_mask + // CHECK-LABEL: test_mm_mask_test_epi8_mask // CHECK: and <2 x i64> %{{.*}}, %{{.*}} // CHECK: icmp ne <16 x i8> %{{.*}}, %{{.*}} // CHECK: and <16 x i1> %{{.*}}, %{{.*}} @@ -2632,14 +2635,14 @@ __mmask16 test_mm_mask_test_epi8_mask(__mmask16 __U, __m128i __A, __m128i __B) { } __mmask32 test_mm256_test_epi8_mask(__m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_test_epi8_mask + // CHECK-LABEL: test_mm256_test_epi8_mask // CHECK: and <4 x i64> %{{.*}}, %{{.*}} // CHECK: icmp ne <32 x i8> %{{.*}}, %{{.*}} return _mm256_test_epi8_mask(__A, __B); } __mmask32 test_mm256_mask_test_epi8_mask(__mmask32 __U, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_mask_test_epi8_mask + // CHECK-LABEL: test_mm256_mask_test_epi8_mask // CHECK: and <4 x i64> %{{.*}}, %{{.*}} // CHECK: icmp ne <32 x i8> %{{.*}}, %{{.*}} // CHECK: and <32 x i1> %{{.*}}, %{{.*}} @@ -2647,14 +2650,14 @@ __mmask32 test_mm256_mask_test_epi8_mask(__mmask32 __U, __m256i __A, __m256i __B } __mmask8 test_mm_test_epi16_mask(__m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_test_epi16_mask + // CHECK-LABEL: test_mm_test_epi16_mask // CHECK: and <2 x i64> %{{.*}}, %{{.*}} // CHECK: icmp ne <8 x i16> %{{.*}}, %{{.*}} return _mm_test_epi16_mask(__A, __B); } __mmask8 test_mm_mask_test_epi16_mask(__mmask8 __U, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_mask_test_epi16_mask + // CHECK-LABEL: test_mm_mask_test_epi16_mask // CHECK: and <2 x i64> %{{.*}}, %{{.*}} // CHECK: icmp ne <8 x i16> %{{.*}}, %{{.*}} // CHECK: and <8 x i1> %{{.*}}, %{{.*}} @@ -2662,14 +2665,14 @@ __mmask8 test_mm_mask_test_epi16_mask(__mmask8 __U, __m128i __A, __m128i __B) { } __mmask16 test_mm256_test_epi16_mask(__m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_test_epi16_mask + // CHECK-LABEL: test_mm256_test_epi16_mask // CHECK: and <4 x i64> %{{.*}}, %{{.*}} // CHECK: icmp 
ne <16 x i16> %{{.*}}, %{{.*}} return _mm256_test_epi16_mask(__A, __B); } __mmask16 test_mm256_mask_test_epi16_mask(__mmask16 __U, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_mask_test_epi16_mask + // CHECK-LABEL: test_mm256_mask_test_epi16_mask // CHECK: and <4 x i64> %{{.*}}, %{{.*}} // CHECK: icmp ne <16 x i16> %{{.*}}, %{{.*}} // CHECK: and <16 x i1> %{{.*}}, %{{.*}} @@ -2677,14 +2680,14 @@ __mmask16 test_mm256_mask_test_epi16_mask(__mmask16 __U, __m256i __A, __m256i __ } __mmask16 test_mm_testn_epi8_mask(__m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_testn_epi8_mask + // CHECK-LABEL: test_mm_testn_epi8_mask // CHECK: and <2 x i64> %{{.*}}, %{{.*}} // CHECK: icmp eq <16 x i8> %{{.*}}, %{{.*}} return _mm_testn_epi8_mask(__A, __B); } __mmask16 test_mm_mask_testn_epi8_mask(__mmask16 __U, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_mask_testn_epi8_mask + // CHECK-LABEL: test_mm_mask_testn_epi8_mask // CHECK: and <2 x i64> %{{.*}}, %{{.*}} // CHECK: icmp eq <16 x i8> %{{.*}}, %{{.*}} // CHECK: and <16 x i1> %{{.*}}, %{{.*}} @@ -2692,14 +2695,14 @@ __mmask16 test_mm_mask_testn_epi8_mask(__mmask16 __U, __m128i __A, __m128i __B) } __mmask32 test_mm256_testn_epi8_mask(__m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_testn_epi8_mask + // CHECK-LABEL: test_mm256_testn_epi8_mask // CHECK: and <4 x i64> %{{.*}}, %{{.*}} // CHECK: icmp eq <32 x i8> %{{.*}}, %{{.*}} return _mm256_testn_epi8_mask(__A, __B); } __mmask32 test_mm256_mask_testn_epi8_mask(__mmask32 __U, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_mask_testn_epi8_mask + // CHECK-LABEL: test_mm256_mask_testn_epi8_mask // CHECK: and <4 x i64> %{{.*}}, %{{.*}} // CHECK: icmp eq <32 x i8> %{{.*}}, %{{.*}} // CHECK: and <32 x i1> %{{.*}}, %{{.*}} @@ -2707,14 +2710,14 @@ __mmask32 test_mm256_mask_testn_epi8_mask(__mmask32 __U, __m256i __A, __m256i __ } __mmask8 test_mm_testn_epi16_mask(__m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_testn_epi16_mask + // 
CHECK-LABEL: test_mm_testn_epi16_mask // CHECK: and <2 x i64> %{{.*}}, %{{.*}} // CHECK: icmp eq <8 x i16> %{{.*}}, %{{.*}} return _mm_testn_epi16_mask(__A, __B); } __mmask8 test_mm_mask_testn_epi16_mask(__mmask8 __U, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_mask_testn_epi16_mask + // CHECK-LABEL: test_mm_mask_testn_epi16_mask // CHECK: and <2 x i64> %{{.*}}, %{{.*}} // CHECK: icmp eq <8 x i16> %{{.*}}, %{{.*}} // CHECK: and <8 x i1> %{{.*}}, %{{.*}} @@ -2722,14 +2725,14 @@ __mmask8 test_mm_mask_testn_epi16_mask(__mmask8 __U, __m128i __A, __m128i __B) { } __mmask16 test_mm256_testn_epi16_mask(__m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_testn_epi16_mask + // CHECK-LABEL: test_mm256_testn_epi16_mask // CHECK: and <4 x i64> %{{.*}}, %{{.*}} // CHECK: icmp eq <16 x i16> %{{.*}}, %{{.*}} return _mm256_testn_epi16_mask(__A, __B); } __mmask16 test_mm256_mask_testn_epi16_mask(__mmask16 __U, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_mask_testn_epi16_mask + // CHECK-LABEL: test_mm256_mask_testn_epi16_mask // CHECK: and <4 x i64> %{{.*}}, %{{.*}} // CHECK: icmp eq <16 x i16> %{{.*}}, %{{.*}} // CHECK: and <16 x i1> %{{.*}}, %{{.*}} @@ -2737,102 +2740,102 @@ __mmask16 test_mm256_mask_testn_epi16_mask(__mmask16 __U, __m256i __A, __m256i _ } __mmask16 test_mm_movepi8_mask(__m128i __A) { - // CHECK-LABEL: @test_mm_movepi8_mask + // CHECK-LABEL: test_mm_movepi8_mask // CHECK: [[CMP:%.*]] = icmp slt <16 x i8> %{{.*}}, zeroinitializer return _mm_movepi8_mask(__A); } __mmask32 test_mm256_movepi8_mask(__m256i __A) { - // CHECK-LABEL: @test_mm256_movepi8_mask + // CHECK-LABEL: test_mm256_movepi8_mask // CHECK: [[CMP:%.*]] = icmp slt <32 x i8> %{{.*}}, zeroinitializer return _mm256_movepi8_mask(__A); } __m128i test_mm_movm_epi8(__mmask16 __A) { - // CHECK-LABEL: @test_mm_movm_epi8 + // CHECK-LABEL: test_mm_movm_epi8 // CHECK: %{{.*}} = bitcast i16 %{{.*}} to <16 x i1> // CHECK: %vpmovm2.i = sext <16 x i1> %{{.*}} to <16 x i8> return 
_mm_movm_epi8(__A); } __m256i test_mm256_movm_epi8(__mmask32 __A) { - // CHECK-LABEL: @test_mm256_movm_epi8 + // CHECK-LABEL: test_mm256_movm_epi8 // CHECK: %{{.*}} = bitcast i32 %{{.*}} to <32 x i1> // CHECK: %vpmovm2.i = sext <32 x i1> %{{.*}} to <32 x i8> return _mm256_movm_epi8(__A); } __m128i test_mm_movm_epi16(__mmask8 __A) { - // CHECK-LABEL: @test_mm_movm_epi16 + // CHECK-LABEL: test_mm_movm_epi16 // CHECK: %{{.*}} = bitcast i8 %{{.*}} to <8 x i1> // CHECK: %vpmovm2.i = sext <8 x i1> %{{.*}} to <8 x i16> return _mm_movm_epi16(__A); } __m256i test_mm256_movm_epi16(__mmask16 __A) { - // CHECK-LABEL: @test_mm256_movm_epi16 + // CHECK-LABEL: test_mm256_movm_epi16 // CHECK: %{{.*}} = bitcast i16 %{{.*}} to <16 x i1> // CHECK: %vpmovm2.i = sext <16 x i1> %{{.*}} to <16 x i16> return _mm256_movm_epi16(__A); } __m128i test_mm_mask_broadcastb_epi8(__m128i __O, __mmask16 __M, __m128i __A) { - // CHECK-LABEL: @test_mm_mask_broadcastb_epi8 + // CHECK-LABEL: test_mm_mask_broadcastb_epi8 // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> zeroinitializer // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_mask_broadcastb_epi8(__O, __M, __A); } __m128i test_mm_maskz_broadcastb_epi8(__mmask16 __M, __m128i __A) { - // CHECK-LABEL: @test_mm_maskz_broadcastb_epi8 + // CHECK-LABEL: test_mm_maskz_broadcastb_epi8 // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> zeroinitializer // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_maskz_broadcastb_epi8(__M, __A); } __m256i test_mm256_mask_broadcastb_epi8(__m256i __O, __mmask32 __M, __m128i __A) { - // CHECK-LABEL: @test_mm256_mask_broadcastb_epi8 + // CHECK-LABEL: test_mm256_mask_broadcastb_epi8 // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <32 x i32> zeroinitializer // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_mask_broadcastb_epi8(__O, __M, __A); } __m256i 
test_mm256_maskz_broadcastb_epi8(__mmask32 __M, __m128i __A) { - // CHECK-LABEL: @test_mm256_maskz_broadcastb_epi8 + // CHECK-LABEL: test_mm256_maskz_broadcastb_epi8 // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <32 x i32> zeroinitializer // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_maskz_broadcastb_epi8(__M, __A); } __m128i test_mm_mask_broadcastw_epi16(__m128i __O, __mmask8 __M, __m128i __A) { - // CHECK-LABEL: @test_mm_mask_broadcastw_epi16 + // CHECK-LABEL: test_mm_mask_broadcastw_epi16 // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> zeroinitializer // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_broadcastw_epi16(__O, __M, __A); } __m128i test_mm_maskz_broadcastw_epi16(__mmask8 __M, __m128i __A) { - // CHECK-LABEL: @test_mm_maskz_broadcastw_epi16 + // CHECK-LABEL: test_mm_maskz_broadcastw_epi16 // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> zeroinitializer // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_broadcastw_epi16(__M, __A); } __m256i test_mm256_mask_broadcastw_epi16(__m256i __O, __mmask16 __M, __m128i __A) { - // CHECK-LABEL: @test_mm256_mask_broadcastw_epi16 + // CHECK-LABEL: test_mm256_mask_broadcastw_epi16 // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <16 x i32> zeroinitializer // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_broadcastw_epi16(__O, __M, __A); } __m256i test_mm256_maskz_broadcastw_epi16(__mmask16 __M, __m128i __A) { - // CHECK-LABEL: @test_mm256_maskz_broadcastw_epi16 + // CHECK-LABEL: test_mm256_maskz_broadcastw_epi16 // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <16 x i32> zeroinitializer // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_broadcastw_epi16(__M, __A); } __m128i test_mm_mask_set1_epi8 (__m128i __O, __mmask16 __M, char 
__A){ - // CHECK-LABEL: @test_mm_mask_set1_epi8 + // CHECK-LABEL: test_mm_mask_set1_epi8 // CHECK: insertelement <16 x i8> poison, i8 %{{.*}}, i32 0 // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 1 // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 2 @@ -2853,7 +2856,7 @@ __m128i test_mm_mask_set1_epi8 (__m128i __O, __mmask16 __M, char __A){ return _mm_mask_set1_epi8(__O, __M, __A); } __m128i test_mm_maskz_set1_epi8 ( __mmask16 __M, char __A){ - // CHECK-LABEL: @test_mm_maskz_set1_epi8 + // CHECK-LABEL: test_mm_maskz_set1_epi8 // CHECK: insertelement <16 x i8> poison, i8 %{{.*}}, i32 0 // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 1 // CHECK: insertelement <16 x i8> %{{.*}}, i8 %{{.*}}, i32 2 @@ -2875,7 +2878,7 @@ __m128i test_mm_maskz_set1_epi8 ( __mmask16 __M, char __A){ } __m256i test_mm256_mask_set1_epi8(__m256i __O, __mmask32 __M, char __A) { - // CHECK-LABEL: @test_mm256_mask_set1_epi8 + // CHECK-LABEL: test_mm256_mask_set1_epi8 // CHECK: insertelement <32 x i8> poison, i8 %{{.*}}, i32 0 // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 1 // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 2 @@ -2913,7 +2916,7 @@ __m256i test_mm256_mask_set1_epi8(__m256i __O, __mmask32 __M, char __A) { } __m256i test_mm256_maskz_set1_epi8( __mmask32 __M, char __A) { - // CHECK-LABEL: @test_mm256_maskz_set1_epi8 + // CHECK-LABEL: test_mm256_maskz_set1_epi8 // CHECK: insertelement <32 x i8> poison, i8 %{{.*}}, i32 0 // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 1 // CHECK: insertelement <32 x i8> %{{.*}}, i8 %{{.*}}, i32 2 @@ -2952,7 +2955,7 @@ __m256i test_mm256_maskz_set1_epi8( __mmask32 __M, char __A) { __m256i test_mm256_mask_set1_epi16(__m256i __O, __mmask16 __M, short __A) { - // CHECK-LABEL: @test_mm256_mask_set1_epi16 + // CHECK-LABEL: test_mm256_mask_set1_epi16 // CHECK: insertelement <16 x i16> poison, i16 %{{.*}}, i32 0 // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 1 // CHECK: insertelement 
<16 x i16> %{{.*}}, i16 %{{.*}}, i32 2 @@ -2974,7 +2977,7 @@ __m256i test_mm256_mask_set1_epi16(__m256i __O, __mmask16 __M, short __A) { } __m256i test_mm256_maskz_set1_epi16(__mmask16 __M, short __A) { - // CHECK-LABEL: @test_mm256_maskz_set1_epi16 + // CHECK-LABEL: test_mm256_maskz_set1_epi16 // CHECK: insertelement <16 x i16> poison, i16 %{{.*}}, i32 0 // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 1 // CHECK: insertelement <16 x i16> %{{.*}}, i16 %{{.*}}, i32 2 @@ -2996,7 +2999,7 @@ __m256i test_mm256_maskz_set1_epi16(__mmask16 __M, short __A) { } __m128i test_mm_mask_set1_epi16(__m128i __O, __mmask8 __M, short __A) { - // CHECK-LABEL: @test_mm_mask_set1_epi16 + // CHECK-LABEL: test_mm_mask_set1_epi16 // CHECK: insertelement <8 x i16> poison, i16 %{{.*}}, i32 0 // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 1 // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 2 @@ -3010,7 +3013,7 @@ __m128i test_mm_mask_set1_epi16(__m128i __O, __mmask8 __M, short __A) { } __m128i test_mm_maskz_set1_epi16(__mmask8 __M, short __A) { - // CHECK-LABEL: @test_mm_maskz_set1_epi16 + // CHECK-LABEL: test_mm_maskz_set1_epi16 // CHECK: insertelement <8 x i16> poison, i16 %{{.*}}, i32 0 // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 1 // CHECK: insertelement <8 x i16> %{{.*}}, i16 %{{.*}}, i32 2 @@ -3023,174 +3026,174 @@ __m128i test_mm_maskz_set1_epi16(__mmask8 __M, short __A) { return _mm_maskz_set1_epi16(__M, __A); } __m128i test_mm_permutexvar_epi16(__m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_permutexvar_epi16 + // CHECK-LABEL: test_mm_permutexvar_epi16 // CHECK: @llvm.x86.avx512.permvar.hi.128 return _mm_permutexvar_epi16(__A, __B); } __m128i test_mm_maskz_permutexvar_epi16(__mmask8 __M, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_maskz_permutexvar_epi16 + // CHECK-LABEL: test_mm_maskz_permutexvar_epi16 // CHECK: @llvm.x86.avx512.permvar.hi.128 // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> 
%{{.*}} return _mm_maskz_permutexvar_epi16(__M, __A, __B); } __m128i test_mm_mask_permutexvar_epi16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_mask_permutexvar_epi16 + // CHECK-LABEL: test_mm_mask_permutexvar_epi16 // CHECK: @llvm.x86.avx512.permvar.hi.128 // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_permutexvar_epi16(__W, __M, __A, __B); } __m256i test_mm256_permutexvar_epi16(__m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_permutexvar_epi16 + // CHECK-LABEL: test_mm256_permutexvar_epi16 // CHECK: @llvm.x86.avx512.permvar.hi.256 return _mm256_permutexvar_epi16(__A, __B); } __m256i test_mm256_maskz_permutexvar_epi16(__mmask16 __M, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_maskz_permutexvar_epi16 + // CHECK-LABEL: test_mm256_maskz_permutexvar_epi16 // CHECK: @llvm.x86.avx512.permvar.hi.256 // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_permutexvar_epi16(__M, __A, __B); } __m256i test_mm256_mask_permutexvar_epi16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_mask_permutexvar_epi16 + // CHECK-LABEL: test_mm256_mask_permutexvar_epi16 // CHECK: @llvm.x86.avx512.permvar.hi.256 // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_permutexvar_epi16(__W, __M, __A, __B); } __m128i test_mm_mask_alignr_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_mask_alignr_epi8 + // CHECK-LABEL: test_mm_mask_alignr_epi8 // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x i8> %{{.*}}, <16 x i32> // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_mask_alignr_epi8(__W, __U, __A, __B, 2); } __m128i test_mm_maskz_alignr_epi8(__mmask16 __U, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_maskz_alignr_epi8 + // CHECK-LABEL: test_mm_maskz_alignr_epi8 // CHECK: shufflevector <16 x i8> 
%{{.*}}, <16 x i8> %{{.*}}, <16 x i32> // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_maskz_alignr_epi8(__U, __A, __B, 2); } __m256i test_mm256_mask_alignr_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_mask_alignr_epi8 + // CHECK-LABEL: test_mm256_mask_alignr_epi8 // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i32> // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_mask_alignr_epi8(__W, __U, __A, __B, 2); } __m256i test_mm256_maskz_alignr_epi8(__mmask32 __U, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_maskz_alignr_epi8 + // CHECK-LABEL: test_mm256_maskz_alignr_epi8 // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i32> // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_maskz_alignr_epi8(__U, __A, __B, 2); } __m128i test_mm_dbsad_epu8(__m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_dbsad_epu8 + // CHECK-LABEL: test_mm_dbsad_epu8 // CHECK: @llvm.x86.avx512.dbpsadbw.128 return _mm_dbsad_epu8(__A, __B, 170); } __m128i test_mm_mask_dbsad_epu8(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_mask_dbsad_epu8 + // CHECK-LABEL: test_mm_mask_dbsad_epu8 // CHECK: @llvm.x86.avx512.dbpsadbw.128 // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_dbsad_epu8(__W, __U, __A, __B, 170); } __m128i test_mm_maskz_dbsad_epu8(__mmask8 __U, __m128i __A, __m128i __B) { - // CHECK-LABEL: @test_mm_maskz_dbsad_epu8 + // CHECK-LABEL: test_mm_maskz_dbsad_epu8 // CHECK: @llvm.x86.avx512.dbpsadbw.128 // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_dbsad_epu8(__U, __A, __B, 170); } __m256i test_mm256_dbsad_epu8(__m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_dbsad_epu8 + // CHECK-LABEL: test_mm256_dbsad_epu8 // CHECK: @llvm.x86.avx512.dbpsadbw.256 return 
_mm256_dbsad_epu8(__A, __B, 170); } __m256i test_mm256_mask_dbsad_epu8(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_mask_dbsad_epu8 + // CHECK-LABEL: test_mm256_mask_dbsad_epu8 // CHECK: @llvm.x86.avx512.dbpsadbw.256 // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_dbsad_epu8(__W, __U, __A, __B, 170); } __m256i test_mm256_maskz_dbsad_epu8(__mmask16 __U, __m256i __A, __m256i __B) { - // CHECK-LABEL: @test_mm256_maskz_dbsad_epu8 + // CHECK-LABEL: test_mm256_maskz_dbsad_epu8 // CHECK: @llvm.x86.avx512.dbpsadbw.256 // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_dbsad_epu8(__U, __A, __B, 170); } __mmask8 test_mm_movepi16_mask(__m128i __A) { - // CHECK-LABEL: @test_mm_movepi16_mask + // CHECK-LABEL: test_mm_movepi16_mask // CHECK: [[CMP:%.*]] = icmp slt <8 x i16> %{{.*}}, zeroinitializer return _mm_movepi16_mask(__A); } __mmask16 test_mm256_movepi16_mask(__m256i __A) { - // CHECK-LABEL: @test_mm256_movepi16_mask + // CHECK-LABEL: test_mm256_movepi16_mask // CHECK: [[CMP:%.*]] = icmp slt <16 x i16> %{{.*}}, zeroinitializer return _mm256_movepi16_mask(__A); } __m128i test_mm_mask_shufflehi_epi16(__m128i __W, __mmask8 __U, __m128i __A) { - // CHECK-LABEL: @test_mm_mask_shufflehi_epi16 + // CHECK-LABEL: test_mm_mask_shufflehi_epi16 // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_shufflehi_epi16(__W, __U, __A, 5); } __m128i test_mm_maskz_shufflehi_epi16(__mmask8 __U, __m128i __A) { - // CHECK-LABEL: @test_mm_maskz_shufflehi_epi16 + // CHECK-LABEL: test_mm_maskz_shufflehi_epi16 // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_shufflehi_epi16(__U, __A, 5); } __m128i test_mm_mask_shufflelo_epi16(__m128i __W, __mmask8 __U, __m128i 
__A) { - // CHECK-LABEL: @test_mm_mask_shufflelo_epi16 + // CHECK-LABEL: test_mm_mask_shufflelo_epi16 // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_shufflelo_epi16(__W, __U, __A, 5); } __m128i test_mm_maskz_shufflelo_epi16(__mmask8 __U, __m128i __A) { - // CHECK-LABEL: @test_mm_maskz_shufflelo_epi16 + // CHECK-LABEL: test_mm_maskz_shufflelo_epi16 // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_shufflelo_epi16(__U, __A, 5); } __m256i test_mm256_mask_shufflehi_epi16(__m256i __W, __mmask16 __U, __m256i __A) { - // CHECK-LABEL: @test_mm256_mask_shufflehi_epi16 + // CHECK-LABEL: test_mm256_mask_shufflehi_epi16 // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> poison, <16 x i32> // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_shufflehi_epi16(__W, __U, __A, 5); } __m256i test_mm256_maskz_shufflehi_epi16(__mmask16 __U, __m256i __A) { - // CHECK-LABEL: @test_mm256_maskz_shufflehi_epi16 + // CHECK-LABEL: test_mm256_maskz_shufflehi_epi16 // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> poison, <16 x i32> // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_shufflehi_epi16(__U, __A, 5); } __m256i test_mm256_mask_shufflelo_epi16(__m256i __W, __mmask16 __U, __m256i __A) { - // CHECK-LABEL: @test_mm256_mask_shufflelo_epi16 + // CHECK-LABEL: test_mm256_mask_shufflelo_epi16 // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> poison, <16 x i32> // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_shufflelo_epi16(__W, __U, __A, 5); } __m256i test_mm256_maskz_shufflelo_epi16(__mmask16 __U, __m256i __A) { - // CHECK-LABEL: @test_mm256_maskz_shufflelo_epi16 + // CHECK-LABEL: test_mm256_maskz_shufflelo_epi16 // CHECK: 
shufflevector <16 x i16> %{{.*}}, <16 x i16> poison, <16 x i32> // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_shufflelo_epi16(__U, __A, 5); @@ -3198,42 +3201,42 @@ __m256i test_mm256_maskz_shufflelo_epi16(__mmask16 __U, __m256i __A) { void test_mm_mask_cvtepi16_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A) { - // CHECK-LABEL:@test_mm_mask_cvtepi16_storeu_epi8 + // CHECK-LABEL: test_mm_mask_cvtepi16_storeu_epi8 // CHECK: @llvm.x86.avx512.mask.pmov.wb.mem.128 _mm_mask_cvtepi16_storeu_epi8 (__P, __M, __A); } void test_mm_mask_cvtsepi16_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A) { - // CHECK-LABEL:@test_mm_mask_cvtsepi16_storeu_epi8 + // CHECK-LABEL: test_mm_mask_cvtsepi16_storeu_epi8 // CHECK: @llvm.x86.avx512.mask.pmovs.wb.mem.128 _mm_mask_cvtsepi16_storeu_epi8 ( __P, __M, __A); } void test_mm_mask_cvtusepi16_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A) { - // CHECK-LABEL:@test_mm_mask_cvtusepi16_storeu_epi8 + // CHECK-LABEL: test_mm_mask_cvtusepi16_storeu_epi8 // CHECK: @llvm.x86.avx512.mask.pmovus.wb.mem.128 _mm_mask_cvtusepi16_storeu_epi8 (__P, __M, __A); } void test_mm256_mask_cvtusepi16_storeu_epi8 (void * __P, __mmask16 __M, __m256i __A) { - // CHECK-LABEL:@test_mm256_mask_cvtusepi16_storeu_epi8 + // CHECK-LABEL: test_mm256_mask_cvtusepi16_storeu_epi8 // CHECK: @llvm.x86.avx512.mask.pmovus.wb.mem.256 _mm256_mask_cvtusepi16_storeu_epi8 ( __P, __M, __A); } void test_mm256_mask_cvtepi16_storeu_epi8 (void * __P, __mmask16 __M, __m256i __A) { - // CHECK-LABEL:@test_mm256_mask_cvtepi16_storeu_epi8 + // CHECK-LABEL: test_mm256_mask_cvtepi16_storeu_epi8 // CHECK: @llvm.x86.avx512.mask.pmov.wb.mem.256 _mm256_mask_cvtepi16_storeu_epi8 ( __P, __M, __A); } void test_mm256_mask_cvtsepi16_storeu_epi8 (void * __P, __mmask16 __M, __m256i __A) { - // CHECK-LABEL:@test_mm256_mask_cvtsepi16_storeu_epi8 + // CHECK-LABEL: test_mm256_mask_cvtsepi16_storeu_epi8 // CHECK: @llvm.x86.avx512.mask.pmovs.wb.mem.256 
_mm256_mask_cvtsepi16_storeu_epi8 ( __P, __M, __A); } diff --git a/clang/test/CodeGen/X86/avx512vlbw-reduceIntrin.c b/clang/test/CodeGen/X86/avx512vlbw-reduceIntrin.c index 0a1692001efa..faa3b54624a7 100644 --- a/clang/test/CodeGen/X86/avx512vlbw-reduceIntrin.c +++ b/clang/test/CodeGen/X86/avx512vlbw-reduceIntrin.c @@ -1,420 +1,426 @@ -// RUN: %clang_cc1 -ffreestanding %s -O0 -triple=x86_64 -target-feature +avx512bw -target-feature +avx512vl -emit-llvm -o - -Wall -Werror | FileCheck %s -// RUN: %clang_cc1 -ffreestanding %s -O0 -triple=i386 -target-feature +avx512bw -target-feature +avx512vl -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -x c -ffreestanding %s -O0 -triple=x86_64 -target-feature +avx512bw -target-feature +avx512vl -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -x c -ffreestanding %s -O0 -triple=x86_64 -target-feature +avx512bw -target-feature +avx512vl -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -x c -ffreestanding %s -O0 -triple=i386 -target-feature +avx512bw -target-feature +avx512vl -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -x c -ffreestanding %s -O0 -triple=i386 -target-feature +avx512bw -target-feature +avx512vl -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -x c++ -ffreestanding %s -O0 -triple=x86_64 -target-feature +avx512bw -target-feature +avx512vl -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -x c++ -ffreestanding %s -O0 -triple=x86_64 -target-feature +avx512bw -target-feature +avx512vl -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -x c++ -ffreestanding %s -O0 -triple=i386 -target-feature +avx512bw -target-feature +avx512vl -emit-llvm -o - -Wall -Werror | FileCheck %s +// RUN: %clang_cc1 -x c++ -ffreestanding %s -O0 -triple=i386 -target-feature +avx512bw -target-feature +avx512vl -fno-signed-char -emit-llvm -o - -Wall -Werror | FileCheck %s #include <immintrin.h> short
test_mm_reduce_add_epi16(__m128i __W){ -// CHECK-LABEL: @test_mm_reduce_add_epi16( -// CHECK: call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %{{.*}}) +// CHECK-LABEL: test_mm_reduce_add_epi16 +// CHECK: call {{.*}}i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %{{.*}}) return _mm_reduce_add_epi16(__W); } short test_mm_reduce_mul_epi16(__m128i __W){ -// CHECK-LABEL: @test_mm_reduce_mul_epi16( -// CHECK: call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> %{{.*}}) +// CHECK-LABEL: test_mm_reduce_mul_epi16 +// CHECK: call {{.*}}i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> %{{.*}}) return _mm_reduce_mul_epi16(__W); } short test_mm_reduce_or_epi16(__m128i __W){ -// CHECK-LABEL: @test_mm_reduce_or_epi16( -// CHECK: call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> %{{.*}}) +// CHECK-LABEL: test_mm_reduce_or_epi16 +// CHECK: call {{.*}}i16 @llvm.vector.reduce.or.v8i16(<8 x i16> %{{.*}}) return _mm_reduce_or_epi16(__W); } short test_mm_reduce_and_epi16(__m128i __W){ -// CHECK-LABEL: @test_mm_reduce_and_epi16( -// CHECK: call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> %{{.*}}) +// CHECK-LABEL: test_mm_reduce_and_epi16 +// CHECK: call {{.*}}i16 @llvm.vector.reduce.and.v8i16(<8 x i16> %{{.*}}) return _mm_reduce_and_epi16(__W); } short test_mm_mask_reduce_add_epi16(__mmask8 __M, __m128i __W){ -// CHECK-LABEL: @test_mm_mask_reduce_add_epi16( +// CHECK-LABEL: test_mm_mask_reduce_add_epi16 // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} -// CHECK: call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %{{.*}}) +// CHECK: call {{.*}}i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %{{.*}}) return _mm_mask_reduce_add_epi16(__M, __W); } short test_mm_mask_reduce_mul_epi16(__mmask8 __M, __m128i __W){ -// CHECK-LABEL: @test_mm_mask_reduce_mul_epi16( +// CHECK-LABEL: test_mm_mask_reduce_mul_epi16 // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} -// CHECK: call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> %{{.*}}) +// CHECK: call {{.*}}i16 
@llvm.vector.reduce.mul.v8i16(<8 x i16> %{{.*}}) return _mm_mask_reduce_mul_epi16(__M, __W); } short test_mm_mask_reduce_and_epi16(__mmask8 __M, __m128i __W){ -// CHECK-LABEL: @test_mm_mask_reduce_and_epi16( +// CHECK-LABEL: test_mm_mask_reduce_and_epi16 // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} -// CHECK: call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> %{{.*}} +// CHECK: call {{.*}}i16 @llvm.vector.reduce.and.v8i16(<8 x i16> %{{.*}} return _mm_mask_reduce_and_epi16(__M, __W); } short test_mm_mask_reduce_or_epi16(__mmask8 __M, __m128i __W){ -// CHECK-LABEL: @test_mm_mask_reduce_or_epi16( +// CHECK-LABEL: test_mm_mask_reduce_or_epi16 // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} -// CHECK: call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> %{{.*}}) +// CHECK: call {{.*}}i16 @llvm.vector.reduce.or.v8i16(<8 x i16> %{{.*}}) return _mm_mask_reduce_or_epi16(__M, __W); } short test_mm256_reduce_add_epi16(__m256i __W){ -// CHECK-LABEL: @test_mm256_reduce_add_epi16( -// CHECK: call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %{{.*}}) +// CHECK-LABEL: test_mm256_reduce_add_epi16 +// CHECK: call {{.*}}i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %{{.*}}) return _mm256_reduce_add_epi16(__W); } short test_mm256_reduce_mul_epi16(__m256i __W){ -// CHECK-LABEL: @test_mm256_reduce_mul_epi16( -// CHECK: call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> %{{.*}}) +// CHECK-LABEL: test_mm256_reduce_mul_epi16 +// CHECK: call {{.*}}i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> %{{.*}}) return _mm256_reduce_mul_epi16(__W); } short test_mm256_reduce_or_epi16(__m256i __W){ -// CHECK-LABEL: @test_mm256_reduce_or_epi16( -// CHECK: call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> %{{.*}}) +// CHECK-LABEL: test_mm256_reduce_or_epi16 +// CHECK: call {{.*}}i16 @llvm.vector.reduce.or.v16i16(<16 x i16> %{{.*}}) return _mm256_reduce_or_epi16(__W); } short test_mm256_reduce_and_epi16(__m256i __W){ -// CHECK-LABEL: 
@test_mm256_reduce_and_epi16( -// CHECK: call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> %{{.*}}) +// CHECK-LABEL: test_mm256_reduce_and_epi16 +// CHECK: call {{.*}}i16 @llvm.vector.reduce.and.v16i16(<16 x i16> %{{.*}}) return _mm256_reduce_and_epi16(__W); } short test_mm256_mask_reduce_add_epi16(__mmask16 __M, __m256i __W){ -// CHECK-LABEL: @test_mm256_mask_reduce_add_epi16( +// CHECK-LABEL: test_mm256_mask_reduce_add_epi16 // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} -// CHECK: call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %{{.*}}) +// CHECK: call {{.*}}i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %{{.*}}) return _mm256_mask_reduce_add_epi16(__M, __W); } short test_mm256_mask_reduce_mul_epi16(__mmask16 __M, __m256i __W){ -// CHECK-LABEL: @test_mm256_mask_reduce_mul_epi16( +// CHECK-LABEL: test_mm256_mask_reduce_mul_epi16 // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} -// CHECK: call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> %{{.*}}) +// CHECK: call {{.*}}i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> %{{.*}}) return _mm256_mask_reduce_mul_epi16(__M, __W); } short test_mm256_mask_reduce_and_epi16(__mmask16 __M, __m256i __W){ -// CHECK-LABEL: @test_mm256_mask_reduce_and_epi16( +// CHECK-LABEL: test_mm256_mask_reduce_and_epi16 // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} -// CHECK: call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> %{{.*}}) +// CHECK: call {{.*}}i16 @llvm.vector.reduce.and.v16i16(<16 x i16> %{{.*}}) return _mm256_mask_reduce_and_epi16(__M, __W); } short test_mm256_mask_reduce_or_epi16(__mmask16 __M, __m256i __W){ -// CHECK-LABEL: @test_mm256_mask_reduce_or_epi16( +// CHECK-LABEL: test_mm256_mask_reduce_or_epi16 // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} -// CHECK: call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> %{{.*}}) +// CHECK: call {{.*}}i16 @llvm.vector.reduce.or.v16i16(<16 x i16> %{{.*}}) return 
_mm256_mask_reduce_or_epi16(__M, __W); } signed char test_mm_reduce_add_epi8(__m128i __W){ -// CHECK-LABEL: @test_mm_reduce_add_epi8( -// CHECK: call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %{{.*}}) +// CHECK-LABEL: test_mm_reduce_add_epi8 +// CHECK: call {{.*}}i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %{{.*}}) return _mm_reduce_add_epi8(__W); } signed char test_mm_reduce_mul_epi8(__m128i __W){ -// CHECK-LABEL: @test_mm_reduce_mul_epi8( -// CHECK: call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> %{{.*}}) +// CHECK-LABEL: test_mm_reduce_mul_epi8 +// CHECK: call {{.*}}i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> %{{.*}}) return _mm_reduce_mul_epi8(__W); } signed char test_mm_reduce_and_epi8(__m128i __W){ -// CHECK-LABEL: @test_mm_reduce_and_epi8( -// CHECK: call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> %{{.*}}) +// CHECK-LABEL: test_mm_reduce_and_epi8 +// CHECK: call {{.*}}i8 @llvm.vector.reduce.and.v16i8(<16 x i8> %{{.*}}) return _mm_reduce_and_epi8(__W); } signed char test_mm_reduce_or_epi8(__m128i __W){ -// CHECK-LABEL: @test_mm_reduce_or_epi8( -// CHECK: call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> %{{.*}}) +// CHECK-LABEL: test_mm_reduce_or_epi8 +// CHECK: call {{.*}}i8 @llvm.vector.reduce.or.v16i8(<16 x i8> %{{.*}}) return _mm_reduce_or_epi8(__W); } signed char test_mm_mask_reduce_add_epi8(__mmask16 __M, __m128i __W){ -// CHECK-LABEL: @test_mm_mask_reduce_add_epi8( +// CHECK-LABEL: test_mm_mask_reduce_add_epi8 // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} -// CHECK: call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %{{.*}}) +// CHECK: call {{.*}}i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %{{.*}}) return _mm_mask_reduce_add_epi8(__M, __W); } signed char test_mm_mask_reduce_mul_epi8(__mmask16 __M, __m128i __W){ -// CHECK-LABEL: @test_mm_mask_reduce_mul_epi8( +// CHECK-LABEL: test_mm_mask_reduce_mul_epi8 // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} -// CHECK: call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> 
%{{.*}}) +// CHECK: call {{.*}}i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> %{{.*}}) return _mm_mask_reduce_mul_epi8(__M, __W); } signed char test_mm_mask_reduce_and_epi8(__mmask16 __M, __m128i __W){ -// CHECK-LABEL: @test_mm_mask_reduce_and_epi8( +// CHECK-LABEL: test_mm_mask_reduce_and_epi8 // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} -// CHECK: call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> %{{.*}}) +// CHECK: call {{.*}}i8 @llvm.vector.reduce.and.v16i8(<16 x i8> %{{.*}}) return _mm_mask_reduce_and_epi8(__M, __W); } signed char test_mm_mask_reduce_or_epi8(__mmask16 __M, __m128i __W){ -// CHECK-LABEL: @test_mm_mask_reduce_or_epi8( +// CHECK-LABEL: test_mm_mask_reduce_or_epi8 // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} -// CHECK: call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> %{{.*}}) +// CHECK: call {{.*}}i8 @llvm.vector.reduce.or.v16i8(<16 x i8> %{{.*}}) return _mm_mask_reduce_or_epi8(__M, __W); } signed char test_mm256_reduce_add_epi8(__m256i __W){ -// CHECK-LABEL: @test_mm256_reduce_add_epi8( -// CHECK: call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %{{.*}}) +// CHECK-LABEL: test_mm256_reduce_add_epi8 +// CHECK: call {{.*}}i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %{{.*}}) return _mm256_reduce_add_epi8(__W); } signed char test_mm256_reduce_mul_epi8(__m256i __W){ -// CHECK-LABEL: @test_mm256_reduce_mul_epi8( -// CHECK: call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> %{{.*}}) +// CHECK-LABEL: test_mm256_reduce_mul_epi8 +// CHECK: call {{.*}}i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> %{{.*}}) return _mm256_reduce_mul_epi8(__W); } signed char test_mm256_reduce_and_epi8(__m256i __W){ -// CHECK-LABEL: @test_mm256_reduce_and_epi8( -// CHECK: call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> %{{.*}}) +// CHECK-LABEL: test_mm256_reduce_and_epi8 +// CHECK: call {{.*}}i8 @llvm.vector.reduce.and.v32i8(<32 x i8> %{{.*}}) return _mm256_reduce_and_epi8(__W); } signed char test_mm256_reduce_or_epi8(__m256i __W){ -// 
CHECK-LABEL: @test_mm256_reduce_or_epi8( -// CHECK: call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> %{{.*}}) +// CHECK-LABEL: test_mm256_reduce_or_epi8 +// CHECK: call {{.*}}i8 @llvm.vector.reduce.or.v32i8(<32 x i8> %{{.*}}) return _mm256_reduce_or_epi8(__W); } signed char test_mm256_mask_reduce_add_epi8(__mmask32 __M, __m256i __W){ -// CHECK-LABEL: @test_mm256_mask_reduce_add_epi8( +// CHECK-LABEL: test_mm256_mask_reduce_add_epi8 // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} -// CHECK: call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %{{.*}}) +// CHECK: call {{.*}}i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %{{.*}}) return _mm256_mask_reduce_add_epi8(__M, __W); } signed char test_mm256_mask_reduce_mul_epi8(__mmask32 __M, __m256i __W){ -// CHECK-LABEL: @test_mm256_mask_reduce_mul_epi8( +// CHECK-LABEL: test_mm256_mask_reduce_mul_epi8 // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} -// CHECK: call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> %{{.*}}) +// CHECK: call {{.*}}i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> %{{.*}}) return _mm256_mask_reduce_mul_epi8(__M, __W); } signed char test_mm256_mask_reduce_and_epi8(__mmask32 __M, __m256i __W){ -// CHECK-LABEL: @test_mm256_mask_reduce_and_epi8( +// CHECK-LABEL: test_mm256_mask_reduce_and_epi8 // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} -// CHECK: call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> %{{.*}}) +// CHECK: call {{.*}}i8 @llvm.vector.reduce.and.v32i8(<32 x i8> %{{.*}}) return _mm256_mask_reduce_and_epi8(__M, __W); } signed char test_mm256_mask_reduce_or_epi8(__mmask32 __M, __m256i __W){ -// CHECK-LABEL: @test_mm256_mask_reduce_or_epi8( +// CHECK-LABEL: test_mm256_mask_reduce_or_epi8 // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} -// CHECK: call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> %{{.*}}) +// CHECK: call {{.*}}i8 @llvm.vector.reduce.or.v32i8(<32 x i8> %{{.*}}) return _mm256_mask_reduce_or_epi8(__M, __W); } 
short test_mm_reduce_max_epi16(__m128i __W){ // CHECK-LABEL: test_mm_reduce_max_epi16 -// CHECK: call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %{{.*}}) +// CHECK: call {{.*}}i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %{{.*}}) return _mm_reduce_max_epi16(__W); } short test_mm_reduce_min_epi16(__m128i __W){ // CHECK-LABEL: test_mm_reduce_min_epi16 -// CHECK: call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %{{.*}}) +// CHECK: call {{.*}}i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %{{.*}}) return _mm_reduce_min_epi16(__W); } unsigned short test_mm_reduce_max_epu16(__m128i __W){ // CHECK-LABEL: test_mm_reduce_max_epu16 -// CHECK: call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %{{.*}}) +// CHECK: call {{.*}}i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %{{.*}}) return _mm_reduce_max_epu16(__W); } unsigned short test_mm_reduce_min_epu16(__m128i __W){ // CHECK-LABEL: test_mm_reduce_min_epu16 -// CHECK: call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %{{.*}}) +// CHECK: call {{.*}}i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %{{.*}}) return _mm_reduce_min_epu16(__W); } short test_mm_mask_reduce_max_epi16(__mmask8 __M, __m128i __W){ // CHECK-LABEL: test_mm_mask_reduce_max_epi16 // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} -// CHECK: call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %{{.*}}) +// CHECK: call {{.*}}i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %{{.*}}) return _mm_mask_reduce_max_epi16(__M, __W); } short test_mm_mask_reduce_min_epi16(__mmask8 __M, __m128i __W){ // CHECK-LABEL: test_mm_mask_reduce_min_epi16 // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} -// CHECK: call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %{{.*}}) +// CHECK: call {{.*}}i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %{{.*}}) return _mm_mask_reduce_min_epi16(__M, __W); } unsigned short test_mm_mask_reduce_max_epu16(__mmask8 __M, __m128i __W){ // CHECK-LABEL: test_mm_mask_reduce_max_epu16 // CHECK: select <8 x i1> %{{.*}}, <8 x 
i16> %{{.*}}, <8 x i16> %{{.*}} -// CHECK: call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %{{.*}}) +// CHECK: call {{.*}}i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %{{.*}}) return _mm_mask_reduce_max_epu16(__M, __W); } unsigned short test_mm_mask_reduce_min_epu16(__mmask8 __M, __m128i __W){ // CHECK-LABEL: test_mm_mask_reduce_min_epu16 // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} -// CHECK: call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %{{.*}}) +// CHECK: call {{.*}}i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %{{.*}}) return _mm_mask_reduce_min_epu16(__M, __W); } short test_mm256_reduce_max_epi16(__m256i __W){ // CHECK-LABEL: test_mm256_reduce_max_epi16 -// CHECK: call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> %{{.*}}) +// CHECK: call {{.*}}i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> %{{.*}}) return _mm256_reduce_max_epi16(__W); } short test_mm256_reduce_min_epi16(__m256i __W){ // CHECK-LABEL: test_mm256_reduce_min_epi16 -// CHECK: call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> %{{.*}}) +// CHECK: call {{.*}}i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> %{{.*}}) return _mm256_reduce_min_epi16(__W); } unsigned short test_mm256_reduce_max_epu16(__m256i __W){ // CHECK-LABEL: test_mm256_reduce_max_epu16 -// CHECK: call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> %{{.*}}) +// CHECK: call {{.*}}i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> %{{.*}}) return _mm256_reduce_max_epu16(__W); } unsigned short test_mm256_reduce_min_epu16(__m256i __W){ // CHECK-LABEL: test_mm256_reduce_min_epu16 -// CHECK: call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> %{{.*}}) +// CHECK: call {{.*}}i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> %{{.*}}) return _mm256_reduce_min_epu16(__W); } short test_mm256_mask_reduce_max_epi16(__mmask16 __M, __m256i __W){ // CHECK-LABEL: test_mm256_mask_reduce_max_epi16 // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} -// CHECK: call i16 @llvm.vector.reduce.smax.v16i16(<16 x 
i16> %{{.*}}) +// CHECK: call {{.*}}i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> %{{.*}}) return _mm256_mask_reduce_max_epi16(__M, __W); } short test_mm256_mask_reduce_min_epi16(__mmask16 __M, __m256i __W){ // CHECK-LABEL: test_mm256_mask_reduce_min_epi16 // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} -// CHECK: call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> %{{.*}}) +// CHECK: call {{.*}}i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> %{{.*}}) return _mm256_mask_reduce_min_epi16(__M, __W); } unsigned short test_mm256_mask_reduce_max_epu16(__mmask16 __M, __m256i __W){ // CHECK-LABEL: test_mm256_mask_reduce_max_epu16 // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} -// CHECK: call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> %{{.*}}) +// CHECK: call {{.*}}i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> %{{.*}}) return _mm256_mask_reduce_max_epu16(__M, __W); } unsigned short test_mm256_mask_reduce_min_epu16(__mmask16 __M, __m256i __W){ // CHECK-LABEL: test_mm256_mask_reduce_min_epu16 // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} -// CHECK: call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> %{{.*}}) +// CHECK: call {{.*}}i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> %{{.*}}) return _mm256_mask_reduce_min_epu16(__M, __W); } signed char test_mm_reduce_max_epi8(__m128i __W){ // CHECK-LABEL: test_mm_reduce_max_epi8 -// CHECK: call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %{{.*}}) +// CHECK: call {{.*}}i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %{{.*}}) return _mm_reduce_max_epi8(__W); } signed char test_mm_reduce_min_epi8(__m128i __W){ // CHECK-LABEL: test_mm_reduce_min_epi8 -// CHECK: call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %{{.*}}) +// CHECK: call {{.*}}i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %{{.*}}) return _mm_reduce_min_epi8(__W); } unsigned char test_mm_reduce_max_epu8(__m128i __W){ // CHECK-LABEL: test_mm_reduce_max_epu8 -// CHECK: call i8 
@llvm.vector.reduce.umax.v16i8(<16 x i8> %{{.*}}) +// CHECK: call {{.*}}i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %{{.*}}) return _mm_reduce_max_epu8(__W); } unsigned char test_mm_reduce_min_epu8(__m128i __W){ // CHECK-LABEL: test_mm_reduce_min_epu8 -// CHECK: call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %{{.*}}) +// CHECK: call {{.*}}i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %{{.*}}) return _mm_reduce_min_epu8(__W); } signed char test_mm_mask_reduce_max_epi8(__mmask16 __M, __m128i __W){ // CHECK-LABEL: test_mm_mask_reduce_max_epi8 // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} -// CHECK: call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %{{.*}}) +// CHECK: call {{.*}}i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %{{.*}}) return _mm_mask_reduce_max_epi8(__M, __W); } signed char test_mm_mask_reduce_min_epi8(__mmask16 __M, __m128i __W){ // CHECK-LABEL: test_mm_mask_reduce_min_epi8 // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} -// CHECK: call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %{{.*}}) +// CHECK: call {{.*}}i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %{{.*}}) return _mm_mask_reduce_min_epi8(__M, __W); } unsigned char test_mm_mask_reduce_max_epu8(__mmask16 __M, __m128i __W){ // CHECK-LABEL: test_mm_mask_reduce_max_epu8 // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} -// CHECK: call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %{{.*}}) +// CHECK: call {{.*}}i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %{{.*}}) return _mm_mask_reduce_max_epu8(__M, __W); } unsigned char test_mm_mask_reduce_min_epu8(__mmask16 __M, __m128i __W){ // CHECK-LABEL: test_mm_mask_reduce_min_epu8 // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} -// CHECK: call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %{{.*}}) +// CHECK: call {{.*}}i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %{{.*}}) return _mm_mask_reduce_min_epu8(__M, __W); } signed char test_mm256_reduce_max_epi8(__m256i __W){ // 
CHECK-LABEL: test_mm256_reduce_max_epi8 -// CHECK: call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> %{{.*}}) +// CHECK: call {{.*}}i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> %{{.*}}) return _mm256_reduce_max_epi8(__W); } signed char test_mm256_reduce_min_epi8(__m256i __W){ // CHECK-LABEL: test_mm256_reduce_min_epi8 -// CHECK: call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> %{{.*}}) +// CHECK: call {{.*}}i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> %{{.*}}) return _mm256_reduce_min_epi8(__W); } unsigned char test_mm256_reduce_max_epu8(__m256i __W){ // CHECK-LABEL: test_mm256_reduce_max_epu8 -// CHECK: call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> %{{.*}}) +// CHECK: call {{.*}}i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> %{{.*}}) return _mm256_reduce_max_epu8(__W); } unsigned char test_mm256_reduce_min_epu8(__m256i __W){ // CHECK-LABEL: test_mm256_reduce_min_epu8 -// CHECK: call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> %{{.*}}) +// CHECK: call {{.*}}i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> %{{.*}}) return _mm256_reduce_min_epu8(__W); } signed char test_mm256_mask_reduce_max_epi8(__mmask32 __M, __m256i __W){ // CHECK-LABEL: test_mm256_mask_reduce_max_epi8 // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} -// CHECK: call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> %{{.*}}) +// CHECK: call {{.*}}i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> %{{.*}}) return _mm256_mask_reduce_max_epi8(__M, __W); } signed char test_mm256_mask_reduce_min_epi8(__mmask32 __M, __m256i __W){ // CHECK-LABEL: test_mm256_mask_reduce_min_epi8 // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} -// CHECK: call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> %{{.*}}) +// CHECK: call {{.*}}i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> %{{.*}}) return _mm256_mask_reduce_min_epi8(__M, __W); } unsigned char test_mm256_mask_reduce_max_epu8(__mmask32 __M, __m256i __W){ // CHECK-LABEL: test_mm256_mask_reduce_max_epu8 // CHECK: select <32 x i1> %{{.*}}, <32 x 
i8> %{{.*}}, <32 x i8> %{{.*}} -// CHECK: call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> %{{.*}}) +// CHECK: call {{.*}}i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> %{{.*}}) return _mm256_mask_reduce_max_epu8(__M, __W); } unsigned char test_mm256_mask_reduce_min_epu8(__mmask32 __M, __m256i __W){ // CHECK-LABEL: test_mm256_mask_reduce_min_epu8 // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} -// CHECK: call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> %{{.*}}) +// CHECK: call {{.*}}i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> %{{.*}}) return _mm256_mask_reduce_min_epu8(__M, __W); } diff --git a/clang/test/CodeGen/X86/avx512vpopcntdq-builtins.c b/clang/test/CodeGen/X86/avx512vpopcntdq-builtins.c index 8927ae273d29..b80ffdf0dcc7 100644 --- a/clang/test/CodeGen/X86/avx512vpopcntdq-builtins.c +++ b/clang/test/CodeGen/X86/avx512vpopcntdq-builtins.c @@ -19,6 +19,7 @@ __m512i test_mm512_mask_popcnt_epi64(__m512i __W, __mmask8 __U, __m512i __A) { // CHECK: select <8 x i1> %{{[0-9]+}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return _mm512_mask_popcnt_epi64(__W, __U, __A); } +TEST_CONSTEXPR(match_v8di(_mm512_mask_popcnt_epi64(_mm512_set1_epi64(-1), 0x81, (__m512i)(__v8di){+5, -3, -10, +8, 0, -256, +256, -128}), 2, -1, -1, -1, -1, -1, -1, 57)); __m512i test_mm512_maskz_popcnt_epi64(__mmask8 __U, __m512i __A) { // CHECK-LABEL: test_mm512_maskz_popcnt_epi64 @@ -26,6 +27,7 @@ __m512i test_mm512_maskz_popcnt_epi64(__mmask8 __U, __m512i __A) { // CHECK: select <8 x i1> %{{[0-9]+}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return _mm512_maskz_popcnt_epi64(__U, __A); } +TEST_CONSTEXPR(match_v8di(_mm512_maskz_popcnt_epi64(0x42, (__m512i)(__v8di){+5, -3, -10, +8, 0, -256, +256, -128}), 0, 63, 0, 0, 0, 0, 1, 0)); __m512i test_mm512_popcnt_epi32(__m512i __A) { // CHECK-LABEL: test_mm512_popcnt_epi32 @@ -40,6 +42,7 @@ __m512i test_mm512_mask_popcnt_epi32(__m512i __W, __mmask16 __U, __m512i __A) { // CHECK: select <16 x i1> %{{[0-9]+}}, <16 x i32> %{{.*}}, <16 x i32> 
%{{.*}} return _mm512_mask_popcnt_epi32(__W, __U, __A); } +TEST_CONSTEXPR(match_v16si(_mm512_mask_popcnt_epi32(_mm512_set1_epi32(-1), 0x0F81, (__m512i)(__v16si){+5, -3, -10, +8, 0, -256, +256, -128, +3, +9, +15, +33, +63, +129, +511, +1025}), 2, -1, -1, -1, -1, -1, -1, 25, 2, 2, 4, 2, -1, -1, -1, -1)); __m512i test_mm512_maskz_popcnt_epi32(__mmask16 __U, __m512i __A) { // CHECK-LABEL: test_mm512_maskz_popcnt_epi32 @@ -47,3 +50,4 @@ __m512i test_mm512_maskz_popcnt_epi32(__mmask16 __U, __m512i __A) { // CHECK: select <16 x i1> %{{[0-9]+}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_maskz_popcnt_epi32(__U, __A); } +TEST_CONSTEXPR(match_v16si(_mm512_maskz_popcnt_epi32(0xF042, (__m512i)(__v16si){+5, -3, -10, +8, 0, -256, +256, -128, +3, +9, +15, +33, +63, +129, +511, +1025}), 0, 31, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 6, 2, 9, 2)); diff --git a/clang/test/CodeGen/X86/avx512vpopcntdqvl-builtins.c b/clang/test/CodeGen/X86/avx512vpopcntdqvl-builtins.c index d9fbd7628142..7258034c57ad 100644 --- a/clang/test/CodeGen/X86/avx512vpopcntdqvl-builtins.c +++ b/clang/test/CodeGen/X86/avx512vpopcntdqvl-builtins.c @@ -19,6 +19,7 @@ __m128i test_mm_mask_popcnt_epi64(__m128i __W, __mmask8 __U, __m128i __A) { // CHECK: select <2 x i1> %{{.+}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} return _mm_mask_popcnt_epi64(__W, __U, __A); } +TEST_CONSTEXPR(match_v2di(_mm_mask_popcnt_epi64(_mm_set1_epi64x(-1), 0x2, (__m128i)(__v2di){+5, -3}), -1, 63)); __m128i test_mm_maskz_popcnt_epi64(__mmask8 __U, __m128i __A) { // CHECK-LABEL: test_mm_maskz_popcnt_epi64 @@ -26,6 +27,7 @@ __m128i test_mm_maskz_popcnt_epi64(__mmask8 __U, __m128i __A) { // CHECK: select <2 x i1> %{{.+}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} return _mm_maskz_popcnt_epi64(__U, __A); } +TEST_CONSTEXPR(match_v2di(_mm_maskz_popcnt_epi64(0x1, (__m128i)(__v2di){+5, -3}), 2, 0)); __m128i test_mm_popcnt_epi32(__m128i __A) { // CHECK-LABEL: test_mm_popcnt_epi32 @@ -40,6 +42,7 @@ __m128i test_mm_mask_popcnt_epi32(__m128i __W, __mmask8 __U, 
__m128i __A) { // CHECK: select <4 x i1> %{{.+}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_mask_popcnt_epi32(__W, __U, __A); } +TEST_CONSTEXPR(match_v4si(_mm_mask_popcnt_epi32(_mm_set1_epi32(-1), 0x3, (__m128i)(__v4si){+5, -3, -10, +8}), 2, 31, -1, -1)); __m128i test_mm_maskz_popcnt_epi32(__mmask8 __U, __m128i __A) { // CHECK-LABEL: test_mm_maskz_popcnt_epi32 @@ -47,6 +50,7 @@ __m128i test_mm_maskz_popcnt_epi32(__mmask8 __U, __m128i __A) { // CHECK: select <4 x i1> %{{.+}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_maskz_popcnt_epi32(__U, __A); } +TEST_CONSTEXPR(match_v4si(_mm_maskz_popcnt_epi32(0x5, (__m128i)(__v4si){+5, -3, -10, +8}), 2, 0, 30, 0)); __m256i test_mm256_popcnt_epi64(__m256i __A) { // CHECK-LABEL: test_mm256_popcnt_epi64 @@ -61,6 +65,7 @@ __m256i test_mm256_mask_popcnt_epi64(__m256i __W, __mmask8 __U, __m256i __A) { // CHECK: select <4 x i1> %{{.+}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} return _mm256_mask_popcnt_epi64(__W, __U, __A); } +TEST_CONSTEXPR(match_v4di(_mm256_mask_popcnt_epi64(_mm256_set1_epi64x(-1), 0x3, (__m256i)(__v4di){+5, -3, -10, +8}), 2, 63, -1, -1)); __m256i test_mm256_maskz_popcnt_epi64(__mmask8 __U, __m256i __A) { // CHECK-LABEL: test_mm256_maskz_popcnt_epi64 @@ -68,6 +73,7 @@ __m256i test_mm256_maskz_popcnt_epi64(__mmask8 __U, __m256i __A) { // CHECK: select <4 x i1> %{{.+}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} return _mm256_maskz_popcnt_epi64(__U, __A); } +TEST_CONSTEXPR(match_v4di(_mm256_maskz_popcnt_epi64(0x5, (__m256i)(__v4di){+5, -3, -10, +8}), 2, 0, 62, 0)); __m256i test_mm256_popcnt_epi32(__m256i __A) { // CHECK-LABEL: test_mm256_popcnt_epi32 @@ -82,6 +88,7 @@ __m256i test_mm256_mask_popcnt_epi32(__m256i __W, __mmask8 __U, __m256i __A) { // CHECK: select <8 x i1> %{{.+}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_mask_popcnt_epi32(__W, __U, __A); } +TEST_CONSTEXPR(match_v8si(_mm256_mask_popcnt_epi32(_mm256_set1_epi32(-1), 0x37, (__m256i)(__v8si){+5, -3, -10, +8, 0, -256, +256, -128}), 2, 31, 30, -1, 
0, 24, -1, -1)); __m256i test_mm256_maskz_popcnt_epi32(__mmask8 __U, __m256i __A) { // CHECK-LABEL: test_mm256_maskz_popcnt_epi32 @@ -89,3 +96,4 @@ __m256i test_mm256_maskz_popcnt_epi32(__mmask8 __U, __m256i __A) { // CHECK: select <8 x i1> %{{.+}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_maskz_popcnt_epi32(__U, __A); } +TEST_CONSTEXPR(match_v8si(_mm256_maskz_popcnt_epi32(0x8C, (__m256i)(__v8si){+5, -3, -10, +8, 0, -256, +256, -128}), 0, 0, 30, 1, 0, 0, 0, 25)); diff --git a/clang/test/CodeGen/X86/mmx-builtins.c b/clang/test/CodeGen/X86/mmx-builtins.c index a4098c8db4f3..b9682dade0c9 100644 --- a/clang/test/CodeGen/X86/mmx-builtins.c +++ b/clang/test/CodeGen/X86/mmx-builtins.c @@ -389,6 +389,7 @@ __m64 test_mm_mul_su32(__m64 a, __m64 b) { // CHECK: mul <2 x i64> %{{.*}}, %{{.*}} return _mm_mul_su32(a, b); } +TEST_CONSTEXPR(match_m64(_mm_mul_su32((__m64)(__v2si){+1, -2}, (__m64)(__v2si){-10, +8}), 4294967286)); __m64 test_mm_mulhi_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_mulhi_pi16 diff --git a/clang/test/CodeGen/X86/sse2-builtins.c b/clang/test/CodeGen/X86/sse2-builtins.c index 49d8b395a0d5..38d5e877a503 100644 --- a/clang/test/CodeGen/X86/sse2-builtins.c +++ b/clang/test/CodeGen/X86/sse2-builtins.c @@ -925,6 +925,7 @@ __m128i test_mm_mul_epu32(__m128i A, __m128i B) { // CHECK: mul <2 x i64> %{{.*}}, %{{.*}} return _mm_mul_epu32(A, B); } +TEST_CONSTEXPR(match_m128i(_mm_mul_epu32((__m128i)(__v4si){+1, -2, +3, -4}, (__m128i)(__v4si){-16, -14, +12, +10}), 4294967280, 36)); __m128d test_mm_mul_pd(__m128d A, __m128d B) { // CHECK-LABEL: test_mm_mul_pd diff --git a/clang/test/CodeGen/X86/sse41-builtins.c b/clang/test/CodeGen/X86/sse41-builtins.c index 10deb386d82a..500b780d4905 100644 --- a/clang/test/CodeGen/X86/sse41-builtins.c +++ b/clang/test/CodeGen/X86/sse41-builtins.c @@ -344,6 +344,7 @@ __m128i test_mm_mul_epi32(__m128i x, __m128i y) { // CHECK: mul <2 x i64> %{{.*}}, %{{.*}} return _mm_mul_epi32(x, y); } 
+TEST_CONSTEXPR(match_m128i(_mm_mul_epi32((__m128i)(__v4si){+1, -2, +3, -4}, (__m128i)(__v4si){-16, -14, +12, +10}), -16, 36)); __m128i test_mm_mullo_epi32(__m128i x, __m128i y) { // CHECK-LABEL: test_mm_mullo_epi32 diff --git a/clang/test/CodeGen/object-size.cpp b/clang/test/CodeGen/object-size.cpp index e6ae3aefac2f..39c0f35e79e2 100644 --- a/clang/test/CodeGen/object-size.cpp +++ b/clang/test/CodeGen/object-size.cpp @@ -1,4 +1,5 @@ // RUN: %clang_cc1 -triple x86_64-apple-darwin -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple x86_64-apple-darwin -emit-llvm -o - %s -fexperimental-new-constant-interpreter | FileCheck %s // C++-specific tests for __builtin_object_size diff --git a/clang/test/Driver/print-supported-extensions-riscv.c b/clang/test/Driver/print-supported-extensions-riscv.c index 2503f2473d64..f1a1359b5f55 100644 --- a/clang/test/Driver/print-supported-extensions-riscv.c +++ b/clang/test/Driver/print-supported-extensions-riscv.c @@ -128,6 +128,7 @@ // CHECK-NEXT: smcdeleg 1.0 'Smcdeleg' (Counter Delegation Machine Level) // CHECK-NEXT: smcntrpmf 1.0 'Smcntrpmf' (Cycle and Instret Privilege Mode Filtering) // CHECK-NEXT: smcsrind 1.0 'Smcsrind' (Indirect CSR Access Machine Level) +// CHECK-NEXT: smctr 1.0 'Smctr' (Control Transfer Records Machine Level) // CHECK-NEXT: smdbltrp 1.0 'Smdbltrp' (Double Trap Machine Level) // CHECK-NEXT: smepmp 1.0 'Smepmp' (Enhanced Physical Memory Protection) // CHECK-NEXT: smmpm 1.0 'Smmpm' (Machine-level Pointer Masking for M-mode) @@ -140,6 +141,7 @@ // CHECK-NEXT: sscofpmf 1.0 'Sscofpmf' (Count Overflow and Mode-Based Filtering) // CHECK-NEXT: sscounterenw 1.0 'Sscounterenw' (Support writeable scounteren enable bit for any hpmcounter that is not read-only zero) // CHECK-NEXT: sscsrind 1.0 'Sscsrind' (Indirect CSR Access Supervisor Level) +// CHECK-NEXT: ssctr 1.0 'Ssctr' (Control Transfer Records Supervisor Level) // CHECK-NEXT: ssdbltrp 1.0 'Ssdbltrp' (Double Trap Supervisor Level) // CHECK-NEXT: ssnpm 
1.0 'Ssnpm' (Supervisor-level Pointer Masking for next lower privilege mode) // CHECK-NEXT: sspm 1.0 'Sspm' (Indicates Supervisor-mode Pointer Masking) @@ -192,6 +194,7 @@ // CHECK-NEXT: xsfvqmaccqoq 1.0 'XSfvqmaccqoq' (SiFive Int8 Matrix Multiplication Instructions (4-by-8 and 8-by-4)) // CHECK-NEXT: xsifivecdiscarddlone 1.0 'XSiFivecdiscarddlone' (SiFive sf.cdiscard.d.l1 Instruction) // CHECK-NEXT: xsifivecflushdlone 1.0 'XSiFivecflushdlone' (SiFive sf.cflush.d.l1 Instruction) +// CHECK-NEXT: xsmtvdot 1.0 'XSMTVDot' (SpacemiT Vector Dot Product Extension) // CHECK-NEXT: xtheadba 1.0 'XTHeadBa' (T-Head address calculation instructions) // CHECK-NEXT: xtheadbb 1.0 'XTHeadBb' (T-Head basic bit-manipulation instructions) // CHECK-NEXT: xtheadbs 1.0 'XTHeadBs' (T-Head single-bit instructions) @@ -214,8 +217,6 @@ // CHECK-NEXT: zvbc32e 0.7 'Zvbc32e' (Vector Carryless Multiplication with 32-bits elements) // CHECK-NEXT: zvkgs 0.7 'Zvkgs' (Vector-Scalar GCM instructions for Cryptography) // CHECK-NEXT: zvqdotq 0.0 'Zvqdotq' (Vector quad widening 4D Dot Product) -// CHECK-NEXT: smctr 1.0 'Smctr' (Control Transfer Records Machine Level) -// CHECK-NEXT: ssctr 1.0 'Ssctr' (Control Transfer Records Supervisor Level) // CHECK-NEXT: svukte 0.3 'Svukte' (Address-Independent Latency of User-Mode Faults to Supervisor Addresses) // CHECK-NEXT: xqccmp 0.3 'Xqccmp' (Qualcomm 16-bit Push/Pop and Double Moves) // CHECK-NEXT: xqcia 0.7 'Xqcia' (Qualcomm uC Arithmetic Extension) diff --git a/clang/test/Driver/riscv-cpus.c b/clang/test/Driver/riscv-cpus.c index 2a9c4e7021fe..ea0821cc39c4 100644 --- a/clang/test/Driver/riscv-cpus.c +++ b/clang/test/Driver/riscv-cpus.c @@ -157,6 +157,7 @@ // MCPU-SPACEMIT-X60-SAME: "-target-feature" "+svinval" // MCPU-SPACEMIT-X60-SAME: "-target-feature" "+svnapot" // MCPU-SPACEMIT-X60-SAME: "-target-feature" "+svpbmt" +// MCPU-SPACEMIT-X60-SAME: "-target-feature" "+xsmtvdot" // MCPU-SPACEMIT-X60-SAME: "-target-abi" "lp64d" // We cannot check much for 
-mcpu=native, but it should be replaced by a valid CPU string. diff --git a/clang/test/Interpreter/assignment-with-implicit-ctor.cpp b/clang/test/Interpreter/assignment-with-implicit-ctor.cpp index 24cea8ec1a4b..cef568c78663 100644 --- a/clang/test/Interpreter/assignment-with-implicit-ctor.cpp +++ b/clang/test/Interpreter/assignment-with-implicit-ctor.cpp @@ -1,5 +1,4 @@ // REQUIRES: host-supports-jit -// UNSUPPORTED: system-aix // // RUN: cat %s | clang-repl | FileCheck %s // RUN: cat %s | clang-repl -Xcc -O2 | FileCheck %s diff --git a/clang/test/Interpreter/bad_percent_command.cpp b/clang/test/Interpreter/bad_percent_command.cpp index 95bebeab58d3..207570cac187 100644 --- a/clang/test/Interpreter/bad_percent_command.cpp +++ b/clang/test/Interpreter/bad_percent_command.cpp @@ -1,4 +1,3 @@ -// UNSUPPORTED: system-aix // RUN: cat %s | clang-repl 2>&1 | FileCheck %s %foobar // CHECK: Invalid % command "%foobar", use "%help" to list commands diff --git a/clang/test/Interpreter/code-undo.cpp b/clang/test/Interpreter/code-undo.cpp index 83ade0ec9158..4516910ca3b4 100644 --- a/clang/test/Interpreter/code-undo.cpp +++ b/clang/test/Interpreter/code-undo.cpp @@ -1,4 +1,3 @@ -// UNSUPPORTED: system-aix // RUN: cat %s | clang-repl | FileCheck %s extern "C" int printf(const char *, ...); int x1 = 0; diff --git a/clang/test/Interpreter/const.cpp b/clang/test/Interpreter/const.cpp index 52be75e09ade..cadd446b7504 100644 --- a/clang/test/Interpreter/const.cpp +++ b/clang/test/Interpreter/const.cpp @@ -1,4 +1,3 @@ -// UNSUPPORTED: system-aix, system-zos // see https://github.com/llvm/llvm-project/issues/68092 // XFAIL: host={{.*}}-windows-msvc diff --git a/clang/test/Interpreter/cxx20-modules.cppm b/clang/test/Interpreter/cxx20-modules.cppm index 4e56e2fc1528..97744e3b25f7 100644 --- a/clang/test/Interpreter/cxx20-modules.cppm +++ b/clang/test/Interpreter/cxx20-modules.cppm @@ -1,5 +1,4 @@ // REQUIRES: host-supports-jit, x86_64-linux -// UNSUPPORTED: system-aix // // RUN: rm -rf 
%t // RUN: mkdir -p %t diff --git a/clang/test/Interpreter/dynamic-library-bad-args.cpp b/clang/test/Interpreter/dynamic-library-bad-args.cpp index 7684a8b746c9..f48a2657beac 100644 --- a/clang/test/Interpreter/dynamic-library-bad-args.cpp +++ b/clang/test/Interpreter/dynamic-library-bad-args.cpp @@ -1,4 +1,3 @@ -// UNSUPPORTED: system-aix // RUN: cat %s | clang-repl 2>&1 | FileCheck %s %lib // CHECK: %lib expects 1 argument: the path to a dynamic library diff --git a/clang/test/Interpreter/execute-stmts.cpp b/clang/test/Interpreter/execute-stmts.cpp index 433c6811777d..cc27fa615f71 100644 --- a/clang/test/Interpreter/execute-stmts.cpp +++ b/clang/test/Interpreter/execute-stmts.cpp @@ -1,5 +1,4 @@ // REQUIRES: host-supports-jit -// UNSUPPORTED: system-aix // RUN: cat %s | clang-repl -Xcc -Xclang -Xcc -verify | FileCheck %s // RUN: %clang_cc1 -verify -fincremental-extensions -emit-llvm -o - %s \ // RUN: | FileCheck --check-prefix=CODEGEN-CHECK %s diff --git a/clang/test/Interpreter/execute-weak.cpp b/clang/test/Interpreter/execute-weak.cpp index 85fa5d276f5f..f469451f5e5e 100644 --- a/clang/test/Interpreter/execute-weak.cpp +++ b/clang/test/Interpreter/execute-weak.cpp @@ -1,4 +1,4 @@ -// UNSUPPORTED: system-aix, system-windows +// UNSUPPORTED: system-windows // RUN: cat %s | clang-repl | FileCheck %s extern "C" int printf(const char *, ...); diff --git a/clang/test/Interpreter/execute.c b/clang/test/Interpreter/execute.c index 44a3a32c9301..ca8f83cf6e37 100644 --- a/clang/test/Interpreter/execute.c +++ b/clang/test/Interpreter/execute.c @@ -1,5 +1,4 @@ // REQUIRES: host-supports-jit -// UNSUPPORTED: system-aix // RUN: cat %s | clang-repl -Xcc -xc -Xcc -Xclang -Xcc -verify | FileCheck %s // RUN: cat %s | clang-repl -Xcc -xc -Xcc -O2 -Xcc -Xclang -Xcc -verify| FileCheck %s diff --git a/clang/test/Interpreter/execute.cpp b/clang/test/Interpreter/execute.cpp index 534a54ed94fb..82cd70a93980 100644 --- a/clang/test/Interpreter/execute.cpp +++ 
b/clang/test/Interpreter/execute.cpp @@ -1,5 +1,3 @@ -// UNSUPPORTED: system-aix - // clang-format off // RUN: clang-repl "int i = 10;" 'extern "C" int printf(const char*,...);' \ // RUN: 'auto r1 = printf("i = %d\n", i);' | FileCheck --check-prefix=CHECK-DRIVER %s diff --git a/clang/test/Interpreter/fail.cpp b/clang/test/Interpreter/fail.cpp index 4963df8c54a4..d92debc25354 100644 --- a/clang/test/Interpreter/fail.cpp +++ b/clang/test/Interpreter/fail.cpp @@ -1,5 +1,4 @@ // REQUIRES: host-supports-jit -// UNSUPPORTED: system-aix // clang-repl can be called from the prompt in non-interactive mode as a // calculator in shell scripts, for example. In that case if there is an error // we should set the exit code as failure. diff --git a/clang/test/Interpreter/global-dtor.cpp b/clang/test/Interpreter/global-dtor.cpp index 1f241d9f1931..9cb454b06c17 100644 --- a/clang/test/Interpreter/global-dtor.cpp +++ b/clang/test/Interpreter/global-dtor.cpp @@ -1,5 +1,4 @@ // clang-format off -// UNSUPPORTED: system-aix // // Tests that a global destructor is ran on platforms with gnu exception support. 
// @@ -10,4 +9,4 @@ extern "C" int printf(const char *, ...); struct D { float f = 1.0; D *m = nullptr; D(){} ~D() { printf("D[f=%f, m=0x%llx]\n", f, reinterpret_cast(m)); }} d; // CHECK: D[f=1.000000, m=0x0] -%quit \ No newline at end of file +%quit diff --git a/clang/test/Interpreter/help.cpp b/clang/test/Interpreter/help.cpp index 5573fb4284c6..70f114f4644e 100644 --- a/clang/test/Interpreter/help.cpp +++ b/clang/test/Interpreter/help.cpp @@ -1,4 +1,3 @@ -// UNSUPPORTED: system-aix // RUN: cat %s | clang-repl | FileCheck %s %help // CHECK: %help list clang-repl %commands diff --git a/clang/test/Interpreter/incremental-mode.cpp b/clang/test/Interpreter/incremental-mode.cpp index 71ff794872b2..d63cee0dd6d1 100644 --- a/clang/test/Interpreter/incremental-mode.cpp +++ b/clang/test/Interpreter/incremental-mode.cpp @@ -1,5 +1,3 @@ -// UNSUPPORTED: system-aix -// // RUN: clang-repl -Xcc -E // RUN: clang-repl -Xcc -emit-llvm // RUN: clang-repl -Xcc -xc diff --git a/clang/test/Interpreter/inline-asm.cpp b/clang/test/Interpreter/inline-asm.cpp index f94f14df72f8..6d071b1ef207 100644 --- a/clang/test/Interpreter/inline-asm.cpp +++ b/clang/test/Interpreter/inline-asm.cpp @@ -1,5 +1,4 @@ // REQUIRES: host-supports-jit, x86_64-linux -// UNSUPPORTED: system-aix // // RUN: rm -rf %t // RUN: mkdir -p %t diff --git a/clang/test/Interpreter/inline-virtual.cpp b/clang/test/Interpreter/inline-virtual.cpp index c9e85683d3cd..3790c110af09 100644 --- a/clang/test/Interpreter/inline-virtual.cpp +++ b/clang/test/Interpreter/inline-virtual.cpp @@ -1,5 +1,4 @@ // REQUIRES: host-supports-jit -// UNSUPPORTED: system-aix // // This test is flaky with ASan: https://github.com/llvm/llvm-project/issues/135401 // UNSUPPORTED: asan diff --git a/clang/test/Interpreter/lambda.cpp b/clang/test/Interpreter/lambda.cpp index db8c9db9b344..e6941087e947 100644 --- a/clang/test/Interpreter/lambda.cpp +++ b/clang/test/Interpreter/lambda.cpp @@ -1,5 +1,4 @@ // REQUIRES: host-supports-jit -// UNSUPPORTED: 
system-aix // RUN: cat %s | clang-repl | FileCheck %s // At -O2, somehow "x = 42" appears first when piped into FileCheck, // see https://github.com/llvm/llvm-project/issues/143547. @@ -27,4 +26,4 @@ auto capture = [&]() { return x * 2; }; printf("x = %d\n", x); // CHECK: x = 42 -%quit \ No newline at end of file +%quit diff --git a/clang/test/Interpreter/lit.local.cfg b/clang/test/Interpreter/lit.local.cfg index ac6d2205e9fc..37af5129d4ee 100644 --- a/clang/test/Interpreter/lit.local.cfg +++ b/clang/test/Interpreter/lit.local.cfg @@ -1,2 +1,6 @@ -if "host-supports-jit" not in config.available_features: +# clang-repl is not supported on AIX and zOS +unsupported_platforms = [ "system-aix", "system-zos" ] + +if "host-supports-jit" not in config.available_features or \ + any(up in config.available_features for up in unsupported_platforms): config.unsupported = True diff --git a/clang/test/Interpreter/multiline.cpp b/clang/test/Interpreter/multiline.cpp index 0f5ef48417f1..a9f1455fc94a 100644 --- a/clang/test/Interpreter/multiline.cpp +++ b/clang/test/Interpreter/multiline.cpp @@ -1,5 +1,4 @@ // REQUIRES: host-supports-jit -// UNSUPPORTED: system-aix // RUN: cat %s | clang-repl -Xcc -Xclang -Xcc -verify | FileCheck %s // expected-no-diagnostics diff --git a/clang/test/Interpreter/pretty-print.c b/clang/test/Interpreter/pretty-print.c index e1408c035a16..588df70e33e8 100644 --- a/clang/test/Interpreter/pretty-print.c +++ b/clang/test/Interpreter/pretty-print.c @@ -1,5 +1,4 @@ // REQUIRES: host-supports-jit -// UNSUPPORTED: system-aix // RUN: cat %s | clang-repl -Xcc -xc | FileCheck %s // RUN: cat %s | clang-repl -Xcc -std=c++11 | FileCheck %s diff --git a/clang/test/Interpreter/pretty-print.cpp b/clang/test/Interpreter/pretty-print.cpp index e1036ab87df9..bad71cdd48f0 100644 --- a/clang/test/Interpreter/pretty-print.cpp +++ b/clang/test/Interpreter/pretty-print.cpp @@ -1,7 +1,7 @@ // RUN: clang-repl "int i = 10;" 'extern "C" int printf(const char*,...);' \ // RUN: 'auto 
r1 = printf("i = %d\n", i);' | FileCheck --check-prefix=CHECK-DRIVER %s // The test is flaky with asan https://github.com/llvm/llvm-project/pull/148701. -// UNSUPPORTED: system-aix, asan +// UNSUPPORTED: asan // CHECK-DRIVER: i = 10 // RUN: cat %s | clang-repl -Xcc -std=c++11 -Xcc -fno-delayed-template-parsing | FileCheck %s extern "C" int printf(const char*,...); diff --git a/clang/test/Interpreter/simple-exception.cpp b/clang/test/Interpreter/simple-exception.cpp index 8f7b515c142b..2d43f807d7b9 100644 --- a/clang/test/Interpreter/simple-exception.cpp +++ b/clang/test/Interpreter/simple-exception.cpp @@ -1,5 +1,4 @@ // clang-format off -// UNSUPPORTED: system-aix // XFAIL for arm, or running on Windows. // XFAIL: target=arm-{{.*}}, target=armv{{.*}}, system-windows, system-cygwin // RUN: cat %s | clang-repl | FileCheck %s diff --git a/clang/test/Preprocessor/sanitizer-predefines.c b/clang/test/Preprocessor/sanitizer-predefines.c new file mode 100644 index 000000000000..9d2f6bf2517a --- /dev/null +++ b/clang/test/Preprocessor/sanitizer-predefines.c @@ -0,0 +1,8 @@ +// RUN: %clang_cc1 -E -dM -triple aarch64-unknown-linux -fsanitize=address %s | FileCheck %s --check-prefix=ASAN +// ASAN: #define __SANITIZE_ADDRESS__ 1 + +// RUN: %clang_cc1 -E -dM -triple aarch64-unknown-linux -fsanitize=hwaddress %s | FileCheck %s --check-prefix=HWASAN +// HWASAN: #define __SANITIZE_HWADDRESS__ 1 + +// RUN: %clang_cc1 -E -dM -triple aarch64-unknown-linux -fsanitize=thread %s | FileCheck %s --check-prefix=TSAN +// TSAN: #define __SANITIZE_THREAD__ 1 diff --git a/clang/test/Sema/builtin-object-size.c b/clang/test/Sema/builtin-object-size.c index 20d4e2ab6da7..a763c24fd662 100644 --- a/clang/test/Sema/builtin-object-size.c +++ b/clang/test/Sema/builtin-object-size.c @@ -2,6 +2,10 @@ // RUN: %clang_cc1 -fsyntax-only -triple x86_64-apple-darwin9 -verify %s // RUN: %clang_cc1 -DDYNAMIC -fsyntax-only -triple x86_64-apple-darwin9 -verify %s +// RUN: %clang_cc1 -fsyntax-only -verify %s 
-fexperimental-new-constant-interpreter +// RUN: %clang_cc1 -fsyntax-only -triple x86_64-apple-darwin9 -verify %s -fexperimental-new-constant-interpreter +// RUN: %clang_cc1 -DDYNAMIC -fsyntax-only -triple x86_64-apple-darwin9 -verify %s -fexperimental-new-constant-interpreter + #ifndef DYNAMIC #define OBJECT_SIZE_BUILTIN __builtin_object_size #else diff --git a/clang/test/Sema/designated-initializers.c b/clang/test/Sema/designated-initializers.c index 31a3380b5db7..11dc3a2308de 100644 --- a/clang/test/Sema/designated-initializers.c +++ b/clang/test/Sema/designated-initializers.c @@ -368,3 +368,10 @@ struct { .b = 0, // expected-warning {{initializer overrides prior initialization of this subobject}} }, }; + +void gh154046(void) { + (void)(const char[]) { + [0] = "", // expected-error {{incompatible pointer to integer conversion initializing 'const char' with an expression of type 'char[1]'}} + [1] = "" // expected-error {{incompatible pointer to integer conversion initializing 'const char' with an expression of type 'char[1]'}} + }[1]; +} diff --git a/clang/test/Sema/warn-lifetime-safety.cpp b/clang/test/Sema/warn-lifetime-safety.cpp new file mode 100644 index 000000000000..660b9c9d5e24 --- /dev/null +++ b/clang/test/Sema/warn-lifetime-safety.cpp @@ -0,0 +1,273 @@ +// RUN: %clang_cc1 -fsyntax-only -fexperimental-lifetime-safety -Wexperimental-lifetime-safety -verify %s + +struct MyObj { + int id; + ~MyObj() {} // Non-trivial destructor + MyObj operator+(MyObj); +}; + +//===----------------------------------------------------------------------===// +// Basic Definite Use-After-Free (-W...permissive) +// These are cases where the pointer is guaranteed to be dangling at the use site. 
+//===----------------------------------------------------------------------===// + +void definite_simple_case() { + MyObj* p; + { + MyObj s; + p = &s; // expected-warning {{object whose reference is captured does not live long enough}} + } // expected-note {{destroyed here}} + (void)*p; // expected-note {{later used here}} +} + +void no_use_no_error() { + MyObj* p; + { + MyObj s; + p = &s; + } +} + +void definite_pointer_chain() { + MyObj* p; + MyObj* q; + { + MyObj s; + p = &s; // expected-warning {{does not live long enough}} + q = p; + } // expected-note {{destroyed here}} + (void)*q; // expected-note {{later used here}} +} + +void definite_multiple_uses_one_warning() { + MyObj* p; + { + MyObj s; + p = &s; // expected-warning {{does not live long enough}} + } // expected-note {{destroyed here}} + (void)*p; // expected-note {{later used here}} + // No second warning for the same loan. + p->id = 1; + MyObj* q = p; + (void)*q; +} + +void definite_multiple_pointers() { + MyObj *p, *q, *r; + { + MyObj s; + p = &s; // expected-warning {{does not live long enough}} + q = &s; // expected-warning {{does not live long enough}} + r = &s; // expected-warning {{does not live long enough}} + } // expected-note 3 {{destroyed here}} + (void)*p; // expected-note {{later used here}} + (void)*q; // expected-note {{later used here}} + (void)*r; // expected-note {{later used here}} +} + +void definite_single_pointer_multiple_loans(bool cond) { + MyObj *p; + if (cond){ + MyObj s; + p = &s; // expected-warning {{does not live long enough}} + } // expected-note {{destroyed here}} + else { + MyObj t; + p = &t; // expected-warning {{does not live long enough}} + } // expected-note {{destroyed here}} + (void)*p; // expected-note 2 {{later used here}} +} + + +//===----------------------------------------------------------------------===// +// Potential (Maybe) Use-After-Free (-W...strict) +// These are cases where the pointer *may* become dangling, depending on the path taken. 
+//===----------------------------------------------------------------------===// + +void potential_if_branch(bool cond) { + MyObj safe; + MyObj* p = &safe; + if (cond) { + MyObj temp; + p = &temp; // expected-warning {{object whose reference is captured may not live long enough}} + } // expected-note {{destroyed here}} + (void)*p; // expected-note {{later used here}} +} + +// If all paths lead to a dangle, it becomes a definite error. +void potential_becomes_definite(bool cond) { + MyObj* p; + if (cond) { + MyObj temp1; + p = &temp1; // expected-warning {{does not live long enough}} + } // expected-note {{destroyed here}} + else { + MyObj temp2; + p = &temp2; // expected-warning {{does not live long enough}} + } // expected-note {{destroyed here}} + (void)*p; // expected-note 2 {{later used here}} +} + +void definite_potential_together(bool cond) { + MyObj safe; + MyObj* p_maybe = &safe; + MyObj* p_definite = nullptr; + + { + MyObj s; + p_definite = &s; // expected-warning {{does not live long enough}} + if (cond) { + p_maybe = &s; // expected-warning {{may not live long enough}} + } + } // expected-note 2 {{destroyed here}} + (void)*p_definite; // expected-note {{later used here}} + (void)*p_maybe; // expected-note {{later used here}} +} + +void definite_overrides_potential(bool cond) { + MyObj safe; + MyObj* p; + MyObj* q; + { + MyObj s; + q = &s; // expected-warning {{does not live long enough}} + p = q; + } // expected-note {{destroyed here}} + + if (cond) { + // 'q' is conditionally "rescued". 'p' is not. + q = &safe; + } + + // The use of 'p' is a definite error because it was never rescued. 
+ (void)*q; + (void)*p; // expected-note {{later used here}} + (void)*q; +} + + +//===----------------------------------------------------------------------===// +// Control Flow Tests +//===----------------------------------------------------------------------===// + +void potential_for_loop_use_after_loop_body(MyObj safe) { + MyObj* p = &safe; + for (int i = 0; i < 1; ++i) { + MyObj s; + p = &s; // expected-warning {{may not live long enough}} + } // expected-note {{destroyed here}} + (void)*p; // expected-note {{later used here}} +} + +void potential_for_loop_use_before_loop_body(MyObj safe) { + MyObj* p = &safe; + for (int i = 0; i < 1; ++i) { + (void)*p; // expected-note {{later used here}} + MyObj s; + p = &s; // expected-warning {{may not live long enough}} + } // expected-note {{destroyed here}} + (void)*p; +} + +void potential_loop_with_break(bool cond) { + MyObj safe; + MyObj* p = &safe; + for (int i = 0; i < 10; ++i) { + if (cond) { + MyObj temp; + p = &temp; // expected-warning {{may not live long enough}} + break; // expected-note {{destroyed here}} + } + } + (void)*p; // expected-note {{later used here}} +} + +void potential_multiple_expiry_of_same_loan(bool cond) { + // Choose the last expiry location for the loan. 
+ MyObj safe; + MyObj* p = &safe; + for (int i = 0; i < 10; ++i) { + MyObj unsafe; + if (cond) { + p = &unsafe; // expected-warning {{may not live long enough}} + break; + } + } // expected-note {{destroyed here}} + (void)*p; // expected-note {{later used here}} + + p = &safe; + for (int i = 0; i < 10; ++i) { + MyObj unsafe; + if (cond) { + p = &unsafe; // expected-warning {{may not live long enough}} + if (cond) + break; + } + } // expected-note {{destroyed here}} + (void)*p; // expected-note {{later used here}} + + p = &safe; + for (int i = 0; i < 10; ++i) { + if (cond) { + MyObj unsafe2; + p = &unsafe2; // expected-warning {{may not live long enough}} + break; // expected-note {{destroyed here}} + } + } + (void)*p; // expected-note {{later used here}} +} + +void potential_switch(int mode) { + MyObj safe; + MyObj* p = &safe; + switch (mode) { + case 1: { + MyObj temp; + p = &temp; // expected-warning {{object whose reference is captured may not live long enough}} + break; // expected-note {{destroyed here}} + } + case 2: { + p = &safe; // This path is okay. + break; + } + } + (void)*p; // expected-note {{later used here}} +} + +void definite_switch(int mode) { + MyObj safe; + MyObj* p = &safe; + // All cases are UaF --> Definite error. 
+ switch (mode) { + case 1: { + MyObj temp1; + p = &temp1; // expected-warning {{does not live long enough}} + break; // expected-note {{destroyed here}} + } + case 2: { + MyObj temp2; + p = &temp2; // expected-warning {{does not live long enough}} + break; // expected-note {{destroyed here}} + } + default: { + MyObj temp2; + p = &temp2; // expected-warning {{does not live long enough}} + break; // expected-note {{destroyed here}} + } + } + (void)*p; // expected-note 3 {{later used here}} +} + +//===----------------------------------------------------------------------===// +// No-Error Cases +//===----------------------------------------------------------------------===// +void no_error_if_dangle_then_rescue() { + MyObj safe; + MyObj* p; + { + MyObj temp; + p = &temp; // p is temporarily dangling. + } + p = &safe; // p is "rescued" before use. + (void)*p; // This is safe. +} diff --git a/clang/test/SemaCXX/new-delete.cpp b/clang/test/SemaCXX/new-delete.cpp index c05130bb3072..1adb993f46c8 100644 --- a/clang/test/SemaCXX/new-delete.cpp +++ b/clang/test/SemaCXX/new-delete.cpp @@ -721,19 +721,7 @@ int (*const_fold)[12] = new int[3][&const_fold + 12 - &const_fold]; #if __cplusplus >= 201402L && !defined(NEW_INTERP) // expected-error@-2 {{array size is not a constant expression}} // expected-note@-3 {{cannot refer to element 12 of non-array}} -#elif __cplusplus < 201103L && !defined(NEW_INTERP) +#elif __cplusplus < 201103L // expected-error@-5 {{cannot allocate object of variably modified type}} // expected-warning@-6 {{variable length arrays in C++ are a Clang extension}} #endif -#ifdef NEW_INTERP -#if __cplusplus >= 201402L -// expected-error@-10 {{array size is not a constant expression}} -// expected-note@-11 {{cannot refer to element 12 of non-array}} -#elif __cplusplus >= 201103L -// expected-error@-13 {{only the first dimension of an allocated array may have dynamic size}} -// expected-note@-14 {{cannot refer to element 12 of non-array}} -#else -// 
expected-error@-16 {{only the first dimension of an allocated array may have dynamic size}} -// expected-note@-17 {{cannot refer to element 12 of non-array}} -#endif -#endif diff --git a/clang/test/SemaCXX/using-decl-templates.cpp b/clang/test/SemaCXX/using-decl-templates.cpp index c96c4879cb68..58b30595b148 100644 --- a/clang/test/SemaCXX/using-decl-templates.cpp +++ b/clang/test/SemaCXX/using-decl-templates.cpp @@ -153,3 +153,11 @@ T foo(T t) { // OK } } // namespace sss } // namespace func_templ + +namespace DependentName { + template struct S { + using typename T::Ty; + static Ty Val; + }; + template typename S::Ty S::Val; +} // DependentName diff --git a/clang/test/SemaCXX/warn-unused-result.cpp b/clang/test/SemaCXX/warn-unused-result.cpp index 447654eccd56..1f7913f1aa99 100644 --- a/clang/test/SemaCXX/warn-unused-result.cpp +++ b/clang/test/SemaCXX/warn-unused-result.cpp @@ -309,7 +309,7 @@ void use() { S(2); // no warning S(2); // expected-warning {{ignoring temporary of type 'S' declared with 'nodiscard'}} - S(2); // no warning (warn_unused_result does not diagnose constructor temporaries) + S(2); // expected-warning {{ignoring temporary of type 'S' declared with 'clang::warn_unused_result' attribute}} // function should take precedence over type obtain2(1.0); // expected-warning {{ignoring return value of function declared with 'nodiscard'}} @@ -336,7 +336,7 @@ struct [[nodiscard]] G { void use2() { H{2}; // no warning H(2.0); // expected-warning {{ignoring temporary created by a constructor declared with 'nodiscard'}} - H("Hello"); // no warning (warn_unused_result does not diagnose constructor temporaries) + H("Hello"); // expected-warning {{ignoring temporary created by a constructor declared with 'warn_unused_result' attribute}} // no warning for explicit cast to void (void)H(2); diff --git a/clang/test/SemaObjC/exprs.m b/clang/test/SemaObjC/exprs.m index dcf46d3cdbfb..c42d270657c1 100644 --- a/clang/test/SemaObjC/exprs.m +++ 
b/clang/test/SemaObjC/exprs.m @@ -36,3 +36,10 @@ void test_encode(void) { (void)@encode(Incomplete_ObjC_class*); (void)@encode(id); } + +void gh154046(void) { + (void)(const char[]) { + [0] = @encode(int), // expected-error {{incompatible pointer to integer conversion initializing 'const char' with an expression of type 'char[2]'}} + [1] = @encode(float) // expected-error {{incompatible pointer to integer conversion initializing 'const char' with an expression of type 'char[2]'}} + }[1]; +} diff --git a/clang/test/SemaOpenACC/gh154008.cpp b/clang/test/SemaOpenACC/gh154008.cpp new file mode 100644 index 000000000000..1ec114c000b3 --- /dev/null +++ b/clang/test/SemaOpenACC/gh154008.cpp @@ -0,0 +1,5 @@ +// RUN: %clang_cc1 %s -fopenacc -verify + +// expected-error@+2{{expected ';'}} +// expected-error@+1{{blocks support disabled}} +void *a = ^ { static int b }; diff --git a/clang/unittests/Analysis/FlowSensitive/CMakeLists.txt b/clang/unittests/Analysis/FlowSensitive/CMakeLists.txt index 4ac563143cd6..3bd4a6e21bee 100644 --- a/clang/unittests/Analysis/FlowSensitive/CMakeLists.txt +++ b/clang/unittests/Analysis/FlowSensitive/CMakeLists.txt @@ -8,6 +8,7 @@ add_clang_unittest(ClangAnalysisFlowSensitiveTests DataflowEnvironmentTest.cpp DebugSupportTest.cpp DeterminismTest.cpp + FormulaTest.cpp LoggerTest.cpp MapLatticeTest.cpp MatchSwitchTest.cpp diff --git a/clang/unittests/Analysis/FlowSensitive/DataflowAnalysisContextTest.cpp b/clang/unittests/Analysis/FlowSensitive/DataflowAnalysisContextTest.cpp index 4f7a72c502cc..92b687a5a18a 100644 --- a/clang/unittests/Analysis/FlowSensitive/DataflowAnalysisContextTest.cpp +++ b/clang/unittests/Analysis/FlowSensitive/DataflowAnalysisContextTest.cpp @@ -17,6 +17,9 @@ namespace { using namespace clang; using namespace dataflow; +using ::testing::IsEmpty; +using ::testing::UnorderedElementsAre; + class DataflowAnalysisContextTest : public ::testing::Test { protected: DataflowAnalysisContextTest() @@ -171,4 +174,97 @@ 
TEST_F(DataflowAnalysisContextTest, EquivBoolVals) { A.makeAnd(X, A.makeAnd(Y, Z)))); } +using ExportLogicalContextTest = DataflowAnalysisContextTest; + +TEST_F(ExportLogicalContextTest, EmptySet) { + EXPECT_THAT(Context.exportLogicalContext({}).TokenDefs, IsEmpty()); +} + +// Only constrained tokens are included in the output. +TEST_F(ExportLogicalContextTest, UnconstrainedIgnored) { + Atom FC1 = A.makeFlowConditionToken(); + EXPECT_THAT(Context.exportLogicalContext({FC1}).TokenDefs, IsEmpty()); +} + +TEST_F(ExportLogicalContextTest, SingletonSet) { + Atom FC1 = A.makeFlowConditionToken(); + auto &C1 = A.makeAtomRef(A.makeAtom()); + Context.addFlowConditionConstraint(FC1, C1); + EXPECT_THAT(Context.exportLogicalContext({FC1}).TokenDefs.keys(), + UnorderedElementsAre(FC1)); +} + +TEST_F(ExportLogicalContextTest, NoDependency) { + Atom FC1 = A.makeFlowConditionToken(); + Atom FC2 = A.makeFlowConditionToken(); + Atom FC3 = A.makeFlowConditionToken(); + auto &C1 = A.makeAtomRef(A.makeAtom()); + auto &C2 = A.makeAtomRef(A.makeAtom()); + auto &C3 = A.makeAtomRef(A.makeAtom()); + + Context.addFlowConditionConstraint(FC1, C1); + Context.addFlowConditionConstraint(FC2, C2); + Context.addFlowConditionConstraint(FC3, C3); + + // FCs are independent.
+ EXPECT_THAT(Context.exportLogicalContext({FC1}).TokenDefs.keys(), + UnorderedElementsAre(FC1)); + EXPECT_THAT(Context.exportLogicalContext({FC2}).TokenDefs.keys(), + UnorderedElementsAre(FC2)); + EXPECT_THAT(Context.exportLogicalContext({FC3}).TokenDefs.keys(), + UnorderedElementsAre(FC3)); +} + +TEST_F(ExportLogicalContextTest, SimpleDependencyChain) { + Atom FC1 = A.makeFlowConditionToken(); + const Formula &C = A.makeAtomRef(A.makeAtom()); + Context.addFlowConditionConstraint(FC1, C); + Atom FC2 = Context.forkFlowCondition(FC1); + Atom FC3 = Context.forkFlowCondition(FC2); + + EXPECT_THAT(Context.exportLogicalContext({FC3}).TokenDefs.keys(), + UnorderedElementsAre(FC1, FC2, FC3)); +} + +TEST_F(ExportLogicalContextTest, DependencyTree) { + Atom FC1 = A.makeFlowConditionToken(); + const Formula &C = A.makeAtomRef(A.makeAtom()); + Context.addFlowConditionConstraint(FC1, C); + Atom FC2 = Context.forkFlowCondition(FC1); + Atom FC3 = A.makeFlowConditionToken(); + Context.addFlowConditionConstraint(FC3, C); + Atom FC4 = Context.joinFlowConditions(FC2, FC3); + + EXPECT_THAT(Context.exportLogicalContext({FC4}).TokenDefs.keys(), + UnorderedElementsAre(FC1, FC2, FC3, FC4)); +} + +TEST_F(ExportLogicalContextTest, DependencyDAG) { + Atom FC1 = A.makeFlowConditionToken(); + const Formula &C = A.makeAtomRef(A.makeAtom()); + Context.addFlowConditionConstraint(FC1, C); + + Atom FC2 = Context.forkFlowCondition(FC1); + Atom FC3 = Context.forkFlowCondition(FC1); + Atom FC4 = Context.joinFlowConditions(FC2, FC3); + + EXPECT_THAT(Context.exportLogicalContext({FC4}).TokenDefs.keys(), + UnorderedElementsAre(FC1, FC2, FC3, FC4)); +} + +TEST_F(ExportLogicalContextTest, MixedDependencies) { + Atom FC1 = A.makeFlowConditionToken(); + const Formula &C = A.makeAtomRef(A.makeAtom()); + Context.addFlowConditionConstraint(FC1, C); + + Atom FC2 = Context.forkFlowCondition(FC1); + Atom FC3 = Context.forkFlowCondition(FC2); + (void)FC3; // unused, and we test below that it is not included. 
+ + Atom FC4 = A.makeFlowConditionToken(); + Context.addFlowConditionConstraint(FC4, C); + + EXPECT_THAT(Context.exportLogicalContext({FC2, FC4}).TokenDefs.keys(), + UnorderedElementsAre(FC1, FC2, FC4)); +} } // namespace diff --git a/clang/unittests/Analysis/FlowSensitive/FormulaTest.cpp b/clang/unittests/Analysis/FlowSensitive/FormulaTest.cpp new file mode 100644 index 000000000000..cabcd59fffed --- /dev/null +++ b/clang/unittests/Analysis/FlowSensitive/FormulaTest.cpp @@ -0,0 +1,201 @@ +//===- unittests/Analysis/FlowSensitive/FormulaTest.cpp -------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang/Analysis/FlowSensitive/Formula.h" +#include "clang/Analysis/FlowSensitive/Arena.h" +#include "clang/Analysis/FlowSensitive/FormulaSerialization.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Testing/Support/Error.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" + +namespace { + +using namespace clang; +using namespace dataflow; + +using ::llvm::Failed; +using ::llvm::HasValue; +using ::llvm::Succeeded; +using ::testing::ElementsAre; +using ::testing::IsEmpty; + +class SerializeFormulaTest : public ::testing::Test { +protected: + Arena A; + std::string Out; + llvm::raw_string_ostream OS{Out}; + + const Formula &A1 = A.makeAtomRef(A.makeAtom()); + const Formula &A2 = A.makeAtomRef(A.makeAtom()); +}; + +TEST_F(SerializeFormulaTest, Atom) { + serializeFormula(A1, OS); + EXPECT_EQ(Out, "V0"); + Out = ""; + + serializeFormula(A2, OS); + EXPECT_EQ(Out, "V1"); +} + +TEST_F(SerializeFormulaTest, LiteralTrue) { + serializeFormula(A.makeLiteral(true), OS); + EXPECT_EQ(Out, "T"); +} + +TEST_F(SerializeFormulaTest, LiteralFalse) { + serializeFormula(A.makeLiteral(false), OS); + 
EXPECT_EQ(Out, "F"); +} + +TEST_F(SerializeFormulaTest, Not) { + serializeFormula(A.makeNot(A1), OS); + EXPECT_EQ(Out, "!V0"); +} + +TEST_F(SerializeFormulaTest, Or) { + serializeFormula(A.makeOr(A1, A2), OS); + EXPECT_EQ(Out, "|V0V1"); +} + +TEST_F(SerializeFormulaTest, And) { + serializeFormula(A.makeAnd(A1, A2), OS); + EXPECT_EQ(Out, "&V0V1"); +} + +TEST_F(SerializeFormulaTest, Implies) { + serializeFormula(A.makeImplies(A1, A2), OS); + EXPECT_EQ(Out, ">V0V1"); +} + +TEST_F(SerializeFormulaTest, Equal) { + serializeFormula(A.makeEquals(A1, A2), OS); + EXPECT_EQ(Out, "=V0V1"); +} + +TEST_F(SerializeFormulaTest, NestedBinaryUnary) { + serializeFormula(A.makeEquals(A.makeOr(A1, A2), A2), OS); + EXPECT_EQ(Out, "=|V0V1V1"); +} + +TEST_F(SerializeFormulaTest, NestedBinaryBinary) { + serializeFormula(A.makeEquals(A.makeOr(A1, A2), A.makeAnd(A1, A2)), OS); + EXPECT_EQ(Out, "=|V0V1&V0V1"); +} + +class ParseFormulaTest : public ::testing::Test { +protected: + void SetUp() override { + AtomMap[0] = Atom1; + AtomMap[1] = Atom2; + } + + // Convenience wrapper for `testParseFormula`. 
+ llvm::Expected testParseFormula(llvm::StringRef Str) { + return parseFormula(Str, A, AtomMap); + } + + Arena A; + std::string Out; + llvm::raw_string_ostream OS{Out}; + + Atom Atom1 = A.makeAtom(); + Atom Atom2 = A.makeAtom(); + const Formula &A1 = A.makeAtomRef(Atom1); + const Formula &A2 = A.makeAtomRef(Atom2); + llvm::DenseMap AtomMap; +}; + +TEST_F(ParseFormulaTest, Atom) { + EXPECT_THAT_EXPECTED(testParseFormula("V0"), HasValue(&A1)); + EXPECT_THAT_EXPECTED(testParseFormula("V1"), HasValue(&A2)); +} + +TEST_F(ParseFormulaTest, LiteralTrue) { + EXPECT_THAT_EXPECTED(testParseFormula("T"), HasValue(&A.makeLiteral(true))); +} + +TEST_F(ParseFormulaTest, LiteralFalse) { + EXPECT_THAT_EXPECTED(testParseFormula("F"), HasValue(&A.makeLiteral(false))); +} + +TEST_F(ParseFormulaTest, Not) { + EXPECT_THAT_EXPECTED(testParseFormula("!V0"), HasValue(&A.makeNot(A1))); +} + +TEST_F(ParseFormulaTest, Or) { + EXPECT_THAT_EXPECTED(testParseFormula("|V0V1"), HasValue(&A.makeOr(A1, A2))); +} + +TEST_F(ParseFormulaTest, And) { + EXPECT_THAT_EXPECTED(testParseFormula("&V0V1"), HasValue(&A.makeAnd(A1, A2))); +} + +TEST_F(ParseFormulaTest, OutOfNumericOrder) { + EXPECT_THAT_EXPECTED(testParseFormula("&V1V0"), HasValue(&A.makeAnd(A2, A1))); +} + +TEST_F(ParseFormulaTest, Implies) { + EXPECT_THAT_EXPECTED(testParseFormula(">V0V1"), + HasValue(&A.makeImplies(A1, A2))); +} + +TEST_F(ParseFormulaTest, Equal) { + EXPECT_THAT_EXPECTED(testParseFormula("=V0V1"), + HasValue(&A.makeEquals(A1, A2))); +} + +TEST_F(ParseFormulaTest, NestedBinaryUnary) { + EXPECT_THAT_EXPECTED(testParseFormula("=|V0V1V1"), + HasValue(&A.makeEquals(A.makeOr(A1, A2), A2))); +} + +TEST_F(ParseFormulaTest, NestedBinaryBinary) { + EXPECT_THAT_EXPECTED( + testParseFormula("=|V0V1&V0V1"), + HasValue(&A.makeEquals(A.makeOr(A1, A2), A.makeAnd(A1, A2)))); +} + +// Verifies that parsing generates fresh atoms, if they are not already in the +// map. 
+TEST_F(ParseFormulaTest, GeneratesAtoms) { + llvm::DenseMap FreshAtomMap; + ASSERT_THAT_EXPECTED(parseFormula("=V0V1", A, FreshAtomMap), Succeeded()); + // The map contains two, unique elements. + ASSERT_EQ(FreshAtomMap.size(), 2U); + EXPECT_NE(FreshAtomMap[0], FreshAtomMap[1]); +} + +TEST_F(ParseFormulaTest, MalformedFormulaFails) { + // Arbitrary string. + EXPECT_THAT_EXPECTED(testParseFormula("Hello"), Failed()); + // Empty string. + EXPECT_THAT_EXPECTED(testParseFormula(""), Failed()); + // Malformed atom. + EXPECT_THAT_EXPECTED(testParseFormula("Vabc"), Failed()); + // Irrelevant suffix. + EXPECT_THAT_EXPECTED(testParseFormula("V0Hello"), Failed()); + EXPECT_THAT_EXPECTED(testParseFormula("=V0V1Hello"), Failed()); + // Sequence without operator. + EXPECT_THAT_EXPECTED(testParseFormula("TF"), Failed()); + // Bad subformula. + EXPECT_THAT_EXPECTED(testParseFormula("!G"), Failed()); + // Incomplete formulas. + EXPECT_THAT_EXPECTED(testParseFormula("V"), Failed()); + EXPECT_THAT_EXPECTED(testParseFormula("&"), Failed()); + EXPECT_THAT_EXPECTED(testParseFormula("|"), Failed()); + EXPECT_THAT_EXPECTED(testParseFormula(">"), Failed()); + EXPECT_THAT_EXPECTED(testParseFormula("="), Failed()); + EXPECT_THAT_EXPECTED(testParseFormula("&V0"), Failed()); + EXPECT_THAT_EXPECTED(testParseFormula("|V0"), Failed()); + EXPECT_THAT_EXPECTED(testParseFormula(">V0"), Failed()); + EXPECT_THAT_EXPECTED(testParseFormula("=V0"), Failed()); +} + +} // namespace diff --git a/clang/unittests/Analysis/LifetimeSafetyTest.cpp b/clang/unittests/Analysis/LifetimeSafetyTest.cpp index 7cd679e184f6..c8d88b4ea227 100644 --- a/clang/unittests/Analysis/LifetimeSafetyTest.cpp +++ b/clang/unittests/Analysis/LifetimeSafetyTest.cpp @@ -33,7 +33,9 @@ public: )"; FullCode += Code.str(); - AST = std::make_unique(FullCode); + Inputs = TestInputs(FullCode); + Inputs.Language = TestLanguage::Lang_CXX20; + AST = std::make_unique(Inputs); ASTCtx = &AST->context(); // Find the target function using AST 
matchers. @@ -51,7 +53,7 @@ public: BuildOptions.AddTemporaryDtors = true; // Run the main analysis. - Analysis = std::make_unique(*AnalysisCtx); + Analysis = std::make_unique(*AnalysisCtx, nullptr); Analysis->run(); AnnotationToPointMap = Analysis->getTestPoints(); @@ -70,6 +72,7 @@ public: } private: + TestInputs Inputs; std::unique_ptr AST; ASTContext *ASTCtx = nullptr; std::unique_ptr AnalysisCtx; @@ -118,11 +121,13 @@ public: return Analysis.getLoansAtPoint(OID, PP); } - std::optional getExpiredLoansAtPoint(llvm::StringRef Annotation) { + std::optional> + getExpiredLoansAtPoint(llvm::StringRef Annotation) { ProgramPoint PP = Runner.getProgramPoint(Annotation); if (!PP) return std::nullopt; - return Analysis.getExpiredLoansAtPoint(PP); + auto Expired = Analysis.getExpiredLoansAtPoint(PP); + return llvm::DenseSet{Expired.begin(), Expired.end()}; } private: diff --git a/clang/unittests/Format/ConfigParseTest.cpp b/clang/unittests/Format/ConfigParseTest.cpp index 9de3cca71630..7c993c0f8fd3 100644 --- a/clang/unittests/Format/ConfigParseTest.cpp +++ b/clang/unittests/Format/ConfigParseTest.cpp @@ -200,7 +200,6 @@ TEST(ConfigParseTest, ParsesConfigurationBools) { CHECK_PARSE_BOOL(RemoveSemicolon); CHECK_PARSE_BOOL(SkipMacroDefinitionBody); CHECK_PARSE_BOOL(SpacesInSquareBrackets); - CHECK_PARSE_BOOL(SpaceInEmptyBlock); CHECK_PARSE_BOOL(SpacesInContainerLiterals); CHECK_PARSE_BOOL(SpaceAfterCStyleCast); CHECK_PARSE_BOOL(SpaceAfterTemplateKeyword); @@ -688,6 +687,17 @@ TEST(ConfigParseTest, ParsesConfiguration) { SpaceBeforeParens, FormatStyle::SBPO_ControlStatementsExceptControlMacros); + Style.SpaceInEmptyBraces = FormatStyle::SIEB_Never; + CHECK_PARSE("SpaceInEmptyBraces: Always", SpaceInEmptyBraces, + FormatStyle::SIEB_Always); + CHECK_PARSE("SpaceInEmptyBraces: Block", SpaceInEmptyBraces, + FormatStyle::SIEB_Block); + CHECK_PARSE("SpaceInEmptyBraces: Never", SpaceInEmptyBraces, + FormatStyle::SIEB_Never); + // For backward compatibility: + 
CHECK_PARSE("SpaceInEmptyBlock: true", SpaceInEmptyBraces, + FormatStyle::SIEB_Block); + // For backward compatibility: Style.SpacesInParens = FormatStyle::SIPO_Never; Style.SpacesInParensOptions = {}; diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index 96cc650f52a5..83c664c3b81f 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -4050,6 +4050,10 @@ TEST_F(FormatTest, FormatsBitfields) { " uchar : 8;\n" " uchar other;\n" "};"); + verifyFormat("struct foo {\n" + " uint8_t i_am_a_bit_field_this_long\n" + " : struct_with_constexpr::i_am_a_constexpr_lengthhhhh;\n" + "};"); FormatStyle Style = getLLVMStyle(); Style.BitFieldColonSpacing = FormatStyle::BFCS_None; verifyFormat("struct Bitfields {\n" @@ -7055,7 +7059,7 @@ TEST_F(FormatTest, PutEmptyBlocksIntoOneLine) { verifyFormat("enum E {};"); verifyFormat("enum E {}"); FormatStyle Style = getLLVMStyle(); - Style.SpaceInEmptyBlock = true; + Style.SpaceInEmptyBraces = FormatStyle::SIEB_Block; verifyFormat("void f() { }", "void f() {}", Style); Style.AllowShortBlocksOnASingleLine = FormatStyle::SBS_Empty; verifyFormat("{ }", Style); @@ -7083,7 +7087,7 @@ TEST_F(FormatTest, PutEmptyBlocksIntoOneLine) { Style); Style = getLLVMStyle(FormatStyle::LK_CSharp); - Style.SpaceInEmptyBlock = true; + Style.SpaceInEmptyBraces = FormatStyle::SIEB_Block; verifyFormat("Event += () => { };", Style); } @@ -8614,7 +8618,7 @@ TEST_F(FormatTest, BreaksFunctionDeclarations) { verifyFormat("extern \"C\" //\n" " void f();"); - FormatStyle Style = getLLVMStyle(); + auto Style = getLLVMStyle(); Style.PointerAlignment = FormatStyle::PAS_Left; verifyFormat("void aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa(\n" " aaaaaaaaaaaaaaaaaaaaaaaaa* const aaaaaaaaaaaa) {}", @@ -8622,6 +8626,14 @@ TEST_F(FormatTest, BreaksFunctionDeclarations) { verifyFormat("void aaaaaaa(aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa*\n" " aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa) {}", Style); + + Style = 
getLLVMStyleWithColumns(45); + Style.PenaltyReturnTypeOnItsOwnLine = 400; + verifyFormat("template \n" + "static inline std::pair\n" + "myfunc(const char *buf, const char *&err);", + Style); } TEST_F(FormatTest, DontBreakBeforeQualifiedOperator) { @@ -25584,6 +25596,30 @@ TEST_F(FormatTest, SpacesInConditionalStatement) { verifyFormat("MYIF( a )\n return;\nelse\n return;", Spaces); } +TEST_F(FormatTest, SpaceInEmptyBraces) { + constexpr StringRef Code("void f() {}\n" + "class Unit {};\n" + "auto a = [] {};\n" + "int x{};"); + verifyFormat(Code); + + auto Style = getWebKitStyle(); + EXPECT_EQ(Style.SpaceInEmptyBraces, FormatStyle::SIEB_Always); + + verifyFormat("void f() { }\n" + "class Unit { };\n" + "auto a = [] { };\n" + "int x { };", + Code, Style); + + Style.SpaceInEmptyBraces = FormatStyle::SIEB_Block; + verifyFormat("void f() { }\n" + "class Unit { };\n" + "auto a = [] { };\n" + "int x {};", + Code, Style); +} + TEST_F(FormatTest, AlternativeOperators) { // Test case for ensuring alternate operators are not // combined with their right most neighbour. 
diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp index 7f99655b1fa4..85ccba38ac8c 100644 --- a/clang/unittests/Format/TokenAnnotatorTest.cpp +++ b/clang/unittests/Format/TokenAnnotatorTest.cpp @@ -618,6 +618,13 @@ TEST_F(TokenAnnotatorTest, UnderstandsStructs) { EXPECT_TOKEN(Tokens[19], tok::l_brace, TT_StructLBrace); EXPECT_TOKEN(Tokens[20], tok::r_brace, TT_StructRBrace); + Tokens = annotate("class Outer {\n" + " struct Inner final : Base {};\n" + "};"); + ASSERT_EQ(Tokens.size(), 14u) << Tokens; + EXPECT_TOKEN(Tokens[5], tok::identifier, TT_Unknown); // Not TT_StartOfName + EXPECT_TOKEN(Tokens[6], tok::colon, TT_InheritanceColon); + constexpr StringRef Code("struct EXPORT StructName {};"); Tokens = annotate(Code); diff --git a/clang/unittests/Lex/CMakeLists.txt b/clang/unittests/Lex/CMakeLists.txt index 96ca6dda9cd8..fa5e58f5a893 100644 --- a/clang/unittests/Lex/CMakeLists.txt +++ b/clang/unittests/Lex/CMakeLists.txt @@ -5,6 +5,7 @@ add_clang_unittest(LexTests LexerTest.cpp LexHLSLRootSignatureTest.cpp ModuleDeclStateTest.cpp + NoTrivialPPDirectiveTracerTest.cpp PPCallbacksTest.cpp PPConditionalDirectiveRecordTest.cpp PPDependencyDirectivesTest.cpp diff --git a/clang/unittests/Lex/LexerTest.cpp b/clang/unittests/Lex/LexerTest.cpp index 56d73cec1363..c51cd0d2bfda 100644 --- a/clang/unittests/Lex/LexerTest.cpp +++ b/clang/unittests/Lex/LexerTest.cpp @@ -795,7 +795,7 @@ TEST_F(LexerTest, CheckFirstPPToken) { EXPECT_FALSE(Lexer::getRawToken(PP->getMainFileFirstPPTokenLoc(), Tok, PP->getSourceManager(), PP->getLangOpts(), /*IgnoreWhiteSpace=*/false)); - EXPECT_TRUE(Tok.isFirstPPToken()); + EXPECT_TRUE(PP->getMainFileFirstPPTokenLoc() == Tok.getLocation()); EXPECT_TRUE(Tok.is(tok::hash)); } @@ -811,7 +811,7 @@ TEST_F(LexerTest, CheckFirstPPToken) { EXPECT_FALSE(Lexer::getRawToken(PP->getMainFileFirstPPTokenLoc(), Tok, PP->getSourceManager(), PP->getLangOpts(), /*IgnoreWhiteSpace=*/false)); - 
EXPECT_TRUE(Tok.isFirstPPToken()); + EXPECT_TRUE(PP->getMainFileFirstPPTokenLoc() == Tok.getLocation()); EXPECT_TRUE(Tok.is(tok::raw_identifier)); EXPECT_TRUE(Tok.getRawIdentifier() == "FOO"); } diff --git a/clang/unittests/Lex/ModuleDeclStateTest.cpp b/clang/unittests/Lex/ModuleDeclStateTest.cpp index adc6cf1d2e59..ac2ddfaf52cd 100644 --- a/clang/unittests/Lex/ModuleDeclStateTest.cpp +++ b/clang/unittests/Lex/ModuleDeclStateTest.cpp @@ -61,14 +61,15 @@ protected: Target = TargetInfo::CreateTargetInfo(Diags, *TargetOpts); } - std::unique_ptr - getPreprocessor(const char *source, Language Lang) { + std::unique_ptr getPreprocessor(const char *source, + Language Lang) { std::unique_ptr Buf = llvm::MemoryBuffer::getMemBuffer(source); SourceMgr.setMainFileID(SourceMgr.createFileID(std::move(Buf))); std::vector Includes; - LangOptions::setLangDefaults(LangOpts, Lang, Target->getTriple(), Includes, LangStandard::lang_cxx20); + LangOptions::setLangDefaults(LangOpts, Lang, Target->getTriple(), Includes, + LangStandard::lang_cxx20); LangOpts.CPlusPlusModules = true; if (Lang != Language::CXX) { LangOpts.Modules = true; @@ -112,12 +113,11 @@ export module foo; std::unique_ptr PP = getPreprocessor(source, Language::CXX); std::initializer_list ImportKinds = {}; - preprocess(*PP, - std::make_unique(*PP, ImportKinds)); - - auto *Callback = - static_cast(PP->getPPCallbacks()); - EXPECT_EQ(Callback->importNamedModuleNum(), (size_t)0); + auto Callback = + std::make_unique(*PP, ImportKinds); + CheckNamedModuleImportingCB *CallbackPtr = Callback.get(); + preprocess(*PP, std::move(Callback)); + EXPECT_EQ(CallbackPtr->importNamedModuleNum(), (size_t)0); EXPECT_TRUE(PP->isInNamedModule()); EXPECT_TRUE(PP->isInNamedInterfaceUnit()); EXPECT_FALSE(PP->isInImplementationUnit()); @@ -131,12 +131,11 @@ module foo; std::unique_ptr PP = getPreprocessor(source, Language::CXX); std::initializer_list ImportKinds = {}; - preprocess(*PP, - std::make_unique(*PP, ImportKinds)); - - auto *Callback = - 
static_cast(PP->getPPCallbacks()); - EXPECT_EQ(Callback->importNamedModuleNum(), (size_t)0); + auto Callback = + std::make_unique(*PP, ImportKinds); + CheckNamedModuleImportingCB *CallbackPtr = Callback.get(); + preprocess(*PP, std::move(Callback)); + EXPECT_EQ(CallbackPtr->importNamedModuleNum(), (size_t)0); EXPECT_TRUE(PP->isInNamedModule()); EXPECT_FALSE(PP->isInNamedInterfaceUnit()); EXPECT_TRUE(PP->isInImplementationUnit()); @@ -150,12 +149,11 @@ module foo:part; std::unique_ptr PP = getPreprocessor(source, Language::CXX); std::initializer_list ImportKinds = {}; - preprocess(*PP, - std::make_unique(*PP, ImportKinds)); - - auto *Callback = - static_cast(PP->getPPCallbacks()); - EXPECT_EQ(Callback->importNamedModuleNum(), (size_t)0); + auto Callback = + std::make_unique(*PP, ImportKinds); + CheckNamedModuleImportingCB *CallbackPtr = Callback.get(); + preprocess(*PP, std::move(Callback)); + EXPECT_EQ(CallbackPtr->importNamedModuleNum(), (size_t)0); EXPECT_TRUE(PP->isInNamedModule()); EXPECT_FALSE(PP->isInNamedInterfaceUnit()); EXPECT_FALSE(PP->isInImplementationUnit()); @@ -169,12 +167,11 @@ export module foo:part; std::unique_ptr PP = getPreprocessor(source, Language::CXX); std::initializer_list ImportKinds = {}; - preprocess(*PP, - std::make_unique(*PP, ImportKinds)); - - auto *Callback = - static_cast(PP->getPPCallbacks()); - EXPECT_EQ(Callback->importNamedModuleNum(), (size_t)0); + auto Callback = + std::make_unique(*PP, ImportKinds); + CheckNamedModuleImportingCB *CallbackPtr = Callback.get(); + preprocess(*PP, std::move(Callback)); + EXPECT_EQ(CallbackPtr->importNamedModuleNum(), (size_t)0); EXPECT_TRUE(PP->isInNamedModule()); EXPECT_TRUE(PP->isInNamedInterfaceUnit()); EXPECT_FALSE(PP->isInImplementationUnit()); @@ -188,12 +185,11 @@ export module foo.dot:part.dot; std::unique_ptr PP = getPreprocessor(source, Language::CXX); std::initializer_list ImportKinds = {}; - preprocess(*PP, - std::make_unique(*PP, ImportKinds)); - - auto *Callback = - 
static_cast(PP->getPPCallbacks()); - EXPECT_EQ(Callback->importNamedModuleNum(), (size_t)0); + auto Callback = + std::make_unique(*PP, ImportKinds); + CheckNamedModuleImportingCB *CallbackPtr = Callback.get(); + preprocess(*PP, std::move(Callback)); + EXPECT_EQ(CallbackPtr->importNamedModuleNum(), (size_t)0); EXPECT_TRUE(PP->isInNamedModule()); EXPECT_TRUE(PP->isInNamedInterfaceUnit()); EXPECT_FALSE(PP->isInImplementationUnit()); @@ -207,12 +203,11 @@ TEST_F(ModuleDeclStateTest, NotModule) { std::unique_ptr PP = getPreprocessor(source, Language::CXX); std::initializer_list ImportKinds = {}; - preprocess(*PP, - std::make_unique(*PP, ImportKinds)); - - auto *Callback = - static_cast(PP->getPPCallbacks()); - EXPECT_EQ(Callback->importNamedModuleNum(), (size_t)0); + auto Callback = + std::make_unique(*PP, ImportKinds); + CheckNamedModuleImportingCB *CallbackPtr = Callback.get(); + preprocess(*PP, std::move(Callback)); + EXPECT_EQ(CallbackPtr->importNamedModuleNum(), (size_t)0); EXPECT_FALSE(PP->isInNamedModule()); EXPECT_FALSE(PP->isInNamedInterfaceUnit()); EXPECT_FALSE(PP->isInImplementationUnit()); @@ -233,12 +228,11 @@ import :another; std::unique_ptr PP = getPreprocessor(source, Language::CXX); std::initializer_list ImportKinds = {true, true}; - preprocess(*PP, - std::make_unique(*PP, ImportKinds)); - - auto *Callback = - static_cast(PP->getPPCallbacks()); - EXPECT_EQ(Callback->importNamedModuleNum(), (size_t)2); + auto Callback = + std::make_unique(*PP, ImportKinds); + CheckNamedModuleImportingCB *CallbackPtr = Callback.get(); + preprocess(*PP, std::move(Callback)); + EXPECT_EQ(CallbackPtr->importNamedModuleNum(), (size_t)2); EXPECT_TRUE(PP->isInNamedModule()); EXPECT_TRUE(PP->isInNamedInterfaceUnit()); EXPECT_FALSE(PP->isInImplementationUnit()); @@ -260,12 +254,11 @@ import :another; std::unique_ptr PP = getPreprocessor(source, Language::CXX); std::initializer_list ImportKinds = {true, true}; - preprocess(*PP, - std::make_unique(*PP, ImportKinds)); - - auto 
*Callback = - static_cast(PP->getPPCallbacks()); - EXPECT_EQ(Callback->importNamedModuleNum(), (size_t)2); + auto Callback = + std::make_unique(*PP, ImportKinds); + CheckNamedModuleImportingCB *CallbackPtr = Callback.get(); + preprocess(*PP, std::move(Callback)); + EXPECT_EQ(CallbackPtr->importNamedModuleNum(), (size_t)2); EXPECT_TRUE(PP->isInNamedModule()); EXPECT_TRUE(PP->isInNamedInterfaceUnit()); EXPECT_FALSE(PP->isInImplementationUnit()); @@ -286,12 +279,11 @@ import :another; std::unique_ptr PP = getPreprocessor(source, Language::CXX); std::initializer_list ImportKinds = {true}; - preprocess(*PP, - std::make_unique(*PP, ImportKinds)); - - auto *Callback = - static_cast(PP->getPPCallbacks()); - EXPECT_EQ(Callback->importNamedModuleNum(), (size_t)1); + auto Callback = + std::make_unique(*PP, ImportKinds); + CheckNamedModuleImportingCB *CallbackPtr = Callback.get(); + preprocess(*PP, std::move(Callback)); + EXPECT_EQ(CallbackPtr->importNamedModuleNum(), (size_t)1); EXPECT_FALSE(PP->isInNamedModule()); EXPECT_FALSE(PP->isInNamedInterfaceUnit()); EXPECT_FALSE(PP->isInImplementationUnit()); @@ -304,12 +296,11 @@ TEST_F(ModuleDeclStateTest, ImportAClangNamedModule) { std::unique_ptr PP = getPreprocessor(source, Language::ObjCXX); std::initializer_list ImportKinds = {false}; - preprocess(*PP, - std::make_unique(*PP, ImportKinds)); - - auto *Callback = - static_cast(PP->getPPCallbacks()); - EXPECT_EQ(Callback->importNamedModuleNum(), (size_t)1); + auto Callback = + std::make_unique(*PP, ImportKinds); + CheckNamedModuleImportingCB *CallbackPtr = Callback.get(); + preprocess(*PP, std::move(Callback)); + EXPECT_EQ(CallbackPtr->importNamedModuleNum(), (size_t)1); EXPECT_FALSE(PP->isInNamedModule()); EXPECT_FALSE(PP->isInNamedInterfaceUnit()); EXPECT_FALSE(PP->isInImplementationUnit()); @@ -326,12 +317,11 @@ import M2; std::unique_ptr PP = getPreprocessor(source, Language::ObjCXX); std::initializer_list ImportKinds = {false, true, false, true}; - preprocess(*PP, - 
std::make_unique(*PP, ImportKinds)); - - auto *Callback = - static_cast(PP->getPPCallbacks()); - EXPECT_EQ(Callback->importNamedModuleNum(), (size_t)4); + auto Callback = + std::make_unique(*PP, ImportKinds); + CheckNamedModuleImportingCB *CallbackPtr = Callback.get(); + preprocess(*PP, std::move(Callback)); + EXPECT_EQ(CallbackPtr->importNamedModuleNum(), (size_t)4); EXPECT_FALSE(PP->isInNamedModule()); EXPECT_FALSE(PP->isInNamedInterfaceUnit()); EXPECT_FALSE(PP->isInImplementationUnit()); diff --git a/clang/unittests/Lex/NoTrivialPPDirectiveTracerTest.cpp b/clang/unittests/Lex/NoTrivialPPDirectiveTracerTest.cpp new file mode 100644 index 000000000000..d79c1428e55b --- /dev/null +++ b/clang/unittests/Lex/NoTrivialPPDirectiveTracerTest.cpp @@ -0,0 +1,182 @@ +//===- unittests/Lex/NoTrivialPPDirectiveTracerTest.cpp -------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "clang/Basic/Diagnostic.h" +#include "clang/Basic/DiagnosticOptions.h" +#include "clang/Basic/FileManager.h" +#include "clang/Basic/LangOptions.h" +#include "clang/Basic/SourceManager.h" +#include "clang/Basic/TargetInfo.h" +#include "clang/Basic/TargetOptions.h" +#include "clang/Lex/HeaderSearch.h" +#include "clang/Lex/HeaderSearchOptions.h" +#include "clang/Lex/ModuleLoader.h" +#include "clang/Lex/Preprocessor.h" +#include "clang/Lex/PreprocessorOptions.h" +#include "gtest/gtest.h" +#include +#include + +using namespace clang; + +namespace { +class NoTrivialPPDirectiveTracerTest : public ::testing::Test { +protected: + NoTrivialPPDirectiveTracerTest() + : VFS(llvm::makeIntrusiveRefCnt()), + FileMgr(FileMgrOpts, VFS), + Diags(DiagnosticIDs::create(), DiagOpts, new IgnoringDiagConsumer()), + SourceMgr(Diags, FileMgr), 
TargetOpts(new TargetOptions) { + TargetOpts->Triple = "x86_64-unknown-linux-gnu"; + Target = TargetInfo::CreateTargetInfo(Diags, *TargetOpts); + } + + void addFile(const char *source, StringRef Filename) { + VFS->addFile(Filename, 0, llvm::MemoryBuffer::getMemBuffer(source), + /*User=*/std::nullopt, + /*Group=*/std::nullopt, + llvm::sys::fs::file_type::regular_file); + } + + std::unique_ptr getPreprocessor(const char *source, + Language Lang) { + std::unique_ptr Buf = + llvm::MemoryBuffer::getMemBuffer(source); + SourceMgr.setMainFileID(SourceMgr.createFileID(std::move(Buf))); + + std::vector Includes; + LangOptions::setLangDefaults(LangOpts, Lang, Target->getTriple(), Includes, + LangStandard::lang_cxx20); + LangOpts.CPlusPlusModules = true; + if (Lang != Language::CXX) { + LangOpts.Modules = true; + LangOpts.ImplicitModules = true; + } + + HeaderInfo.emplace(HSOpts, SourceMgr, Diags, LangOpts, Target.get()); + + auto DE = FileMgr.getOptionalDirectoryRef("."); + assert(DE); + auto DL = DirectoryLookup(*DE, SrcMgr::C_User, /*isFramework=*/false); + HeaderInfo->AddSearchPath(DL, /*isAngled=*/false); + + return std::make_unique(PPOpts, Diags, LangOpts, SourceMgr, + *HeaderInfo, ModLoader, + /*IILookup=*/nullptr, + /*OwnsHeaderSearch=*/false); + } + + IntrusiveRefCntPtr VFS; + FileSystemOptions FileMgrOpts; + FileManager FileMgr; + DiagnosticOptions DiagOpts; + DiagnosticsEngine Diags; + SourceManager SourceMgr; + std::shared_ptr TargetOpts; + IntrusiveRefCntPtr Target; + LangOptions LangOpts; + TrivialModuleLoader ModLoader; + HeaderSearchOptions HSOpts; + std::optional HeaderInfo; + PreprocessorOptions PPOpts; +}; + +TEST_F(NoTrivialPPDirectiveTracerTest, TrivialDirective) { + const char *source = R"( + #line 7 + # 1 __FILE__ 1 3 + #ident "$Header:$" + #pragma comment(lib, "msvcrt.lib") + #pragma mark LLVM's world + #pragma detect_mismatch("test", "1") + #pragma clang __debug dump Test + #pragma message "test" + #pragma GCC warning "Foo" + #pragma GCC error "Foo" + 
#pragma gcc diagnostic push + #pragma gcc diagnostic pop + #pragma GCC diagnostic ignored "-Wframe-larger-than" + #pragma OPENCL EXTENSION __cl_clang_variadic_functions : enable + #pragma warning(push) + #pragma warning(pop) + #pragma execution_character_set(push, "UTF-8") + #pragma execution_character_set(pop) + #pragma clang assume_nonnull begin + #pragma clang assume_nonnull end + int foo; + )"; + std::unique_ptr PP = getPreprocessor(source, Language::CXX); + PP->Initialize(*Target); + PP->EnterMainSourceFile(); + Token Tok; + PP->Lex(Tok); + EXPECT_FALSE(PP->hasSeenNoTrivialPPDirective()); +} + +TEST_F(NoTrivialPPDirectiveTracerTest, IncludeDirective) { + const char *source = R"( + #include "header.h" + int foo; + )"; + const char *header = R"( + #ifndef HEADER_H + #define HEADER_H + #endif // HEADER_H + )"; + std::unique_ptr PP = getPreprocessor(source, Language::CXX); + addFile(header, "header.h"); + PP->Initialize(*Target); + PP->EnterMainSourceFile(); + Token Tok; + PP->Lex(Tok); + EXPECT_TRUE(PP->hasSeenNoTrivialPPDirective()); +} + +TEST_F(NoTrivialPPDirectiveTracerTest, DefineDirective) { + const char *source = R"( + #define FOO + int foo; + )"; + std::unique_ptr PP = getPreprocessor(source, Language::CXX); + PP->Initialize(*Target); + PP->EnterMainSourceFile(); + Token Tok; + PP->Lex(Tok); + EXPECT_TRUE(PP->hasSeenNoTrivialPPDirective()); +} + +TEST_F(NoTrivialPPDirectiveTracerTest, UnDefineDirective) { + const char *source = R"( + #undef FOO + int foo; + )"; + std::unique_ptr PP = getPreprocessor(source, Language::CXX); + PP->Initialize(*Target); + PP->setPredefines("#define FOO"); + PP->EnterMainSourceFile(); + Token Tok; + PP->Lex(Tok); + EXPECT_TRUE(PP->hasSeenNoTrivialPPDirective()); +} + +TEST_F(NoTrivialPPDirectiveTracerTest, IfDefinedDirective) { + const char *source = R"( + #if defined(FOO) + #endif + int foo; + )"; + std::unique_ptr PP = getPreprocessor(source, Language::CXX); + PP->Initialize(*Target); + PP->setPredefines("#define FOO"); + 
PP->EnterMainSourceFile(); + Token Tok; + PP->Lex(Tok); + EXPECT_TRUE(PP->hasSeenNoTrivialPPDirective()); +} + +} // namespace diff --git a/clang/unittests/Parse/ParseHLSLRootSignatureTest.cpp b/clang/unittests/Parse/ParseHLSLRootSignatureTest.cpp index 44f6b0469f38..44c0978a243b 100644 --- a/clang/unittests/Parse/ParseHLSLRootSignatureTest.cpp +++ b/clang/unittests/Parse/ParseHLSLRootSignatureTest.cpp @@ -180,7 +180,7 @@ TEST_F(ParseHLSLRootSignatureTest, ValidParseDTClausesTest) { // First Descriptor Table with 4 elements RootElement Elem = Elements[0].getElement(); ASSERT_TRUE(std::holds_alternative(Elem)); - ASSERT_EQ(std::get(Elem).Type, ClauseType::CBuffer); + ASSERT_EQ(std::get(Elem).Type, ResourceClass::CBuffer); ASSERT_EQ(std::get(Elem).Reg.ViewType, RegisterType::BReg); ASSERT_EQ(std::get(Elem).Reg.Number, 0u); @@ -193,7 +193,7 @@ TEST_F(ParseHLSLRootSignatureTest, ValidParseDTClausesTest) { Elem = Elements[1].getElement(); ASSERT_TRUE(std::holds_alternative(Elem)); - ASSERT_EQ(std::get(Elem).Type, ClauseType::SRV); + ASSERT_EQ(std::get(Elem).Type, ResourceClass::SRV); ASSERT_EQ(std::get(Elem).Reg.ViewType, RegisterType::TReg); ASSERT_EQ(std::get(Elem).Reg.Number, 42u); @@ -205,7 +205,7 @@ TEST_F(ParseHLSLRootSignatureTest, ValidParseDTClausesTest) { Elem = Elements[2].getElement(); ASSERT_TRUE(std::holds_alternative(Elem)); - ASSERT_EQ(std::get(Elem).Type, ClauseType::Sampler); + ASSERT_EQ(std::get(Elem).Type, ResourceClass::Sampler); ASSERT_EQ(std::get(Elem).Reg.ViewType, RegisterType::SReg); ASSERT_EQ(std::get(Elem).Reg.Number, 987u); @@ -218,7 +218,7 @@ TEST_F(ParseHLSLRootSignatureTest, ValidParseDTClausesTest) { Elem = Elements[3].getElement(); ASSERT_TRUE(std::holds_alternative(Elem)); - ASSERT_EQ(std::get(Elem).Type, ClauseType::UAV); + ASSERT_EQ(std::get(Elem).Type, ResourceClass::UAV); ASSERT_EQ(std::get(Elem).Reg.ViewType, RegisterType::UReg); ASSERT_EQ(std::get(Elem).Reg.Number, 4294967294u); @@ -445,7 +445,7 @@ 
TEST_F(ParseHLSLRootSignatureTest, ValidSamplerFlagsTest) { auto Elements = Parser.getElements(); RootElement Elem = Elements[0].getElement(); ASSERT_TRUE(std::holds_alternative(Elem)); - ASSERT_EQ(std::get(Elem).Type, ClauseType::Sampler); + ASSERT_EQ(std::get(Elem).Type, ResourceClass::Sampler); auto ValidSamplerFlags = llvm::dxbc::DescriptorRangeFlags::DescriptorsVolatile; ASSERT_EQ(std::get(Elem).Flags, ValidSamplerFlags); @@ -591,7 +591,7 @@ TEST_F(ParseHLSLRootSignatureTest, ValidParseRootDescriptorsTest) { RootElement Elem = Elements[0].getElement(); ASSERT_TRUE(std::holds_alternative(Elem)); - ASSERT_EQ(std::get(Elem).Type, DescriptorType::CBuffer); + ASSERT_EQ(std::get(Elem).Type, ResourceClass::CBuffer); ASSERT_EQ(std::get(Elem).Reg.ViewType, RegisterType::BReg); ASSERT_EQ(std::get(Elem).Reg.Number, 0u); ASSERT_EQ(std::get(Elem).Space, 0u); @@ -602,7 +602,7 @@ TEST_F(ParseHLSLRootSignatureTest, ValidParseRootDescriptorsTest) { Elem = Elements[1].getElement(); ASSERT_TRUE(std::holds_alternative(Elem)); - ASSERT_EQ(std::get(Elem).Type, DescriptorType::SRV); + ASSERT_EQ(std::get(Elem).Type, ResourceClass::SRV); ASSERT_EQ(std::get(Elem).Reg.ViewType, RegisterType::TReg); ASSERT_EQ(std::get(Elem).Reg.Number, 42u); ASSERT_EQ(std::get(Elem).Space, 4u); @@ -616,7 +616,7 @@ TEST_F(ParseHLSLRootSignatureTest, ValidParseRootDescriptorsTest) { Elem = Elements[2].getElement(); ASSERT_TRUE(std::holds_alternative(Elem)); - ASSERT_EQ(std::get(Elem).Type, DescriptorType::UAV); + ASSERT_EQ(std::get(Elem).Type, ResourceClass::UAV); ASSERT_EQ(std::get(Elem).Reg.ViewType, RegisterType::UReg); ASSERT_EQ(std::get(Elem).Reg.Number, 34893247u); ASSERT_EQ(std::get(Elem).Space, 0u); @@ -628,7 +628,7 @@ TEST_F(ParseHLSLRootSignatureTest, ValidParseRootDescriptorsTest) { RootDescriptorFlags::DataVolatile); Elem = Elements[3].getElement(); - ASSERT_EQ(std::get(Elem).Type, DescriptorType::CBuffer); + ASSERT_EQ(std::get(Elem).Type, ResourceClass::CBuffer); 
ASSERT_EQ(std::get(Elem).Reg.ViewType, RegisterType::BReg); ASSERT_EQ(std::get(Elem).Reg.Number, 0u); ASSERT_EQ(std::get(Elem).Space, 0u); @@ -696,17 +696,17 @@ TEST_F(ParseHLSLRootSignatureTest, ValidVersion10Test) { auto DefRootDescriptorFlag = llvm::dxbc::RootDescriptorFlags::DataVolatile; RootElement Elem = Elements[0].getElement(); ASSERT_TRUE(std::holds_alternative(Elem)); - ASSERT_EQ(std::get(Elem).Type, DescriptorType::CBuffer); + ASSERT_EQ(std::get(Elem).Type, ResourceClass::CBuffer); ASSERT_EQ(std::get(Elem).Flags, DefRootDescriptorFlag); Elem = Elements[1].getElement(); ASSERT_TRUE(std::holds_alternative(Elem)); - ASSERT_EQ(std::get(Elem).Type, DescriptorType::SRV); + ASSERT_EQ(std::get(Elem).Type, ResourceClass::SRV); ASSERT_EQ(std::get(Elem).Flags, DefRootDescriptorFlag); Elem = Elements[2].getElement(); ASSERT_TRUE(std::holds_alternative(Elem)); - ASSERT_EQ(std::get(Elem).Type, DescriptorType::UAV); + ASSERT_EQ(std::get(Elem).Type, ResourceClass::UAV); ASSERT_EQ(std::get(Elem).Flags, DefRootDescriptorFlag); auto ValidNonSamplerFlags = @@ -714,22 +714,22 @@ TEST_F(ParseHLSLRootSignatureTest, ValidVersion10Test) { llvm::dxbc::DescriptorRangeFlags::DataVolatile; Elem = Elements[3].getElement(); ASSERT_TRUE(std::holds_alternative(Elem)); - ASSERT_EQ(std::get(Elem).Type, ClauseType::CBuffer); + ASSERT_EQ(std::get(Elem).Type, ResourceClass::CBuffer); ASSERT_EQ(std::get(Elem).Flags, ValidNonSamplerFlags); Elem = Elements[4].getElement(); ASSERT_TRUE(std::holds_alternative(Elem)); - ASSERT_EQ(std::get(Elem).Type, ClauseType::SRV); + ASSERT_EQ(std::get(Elem).Type, ResourceClass::SRV); ASSERT_EQ(std::get(Elem).Flags, ValidNonSamplerFlags); Elem = Elements[5].getElement(); ASSERT_TRUE(std::holds_alternative(Elem)); - ASSERT_EQ(std::get(Elem).Type, ClauseType::UAV); + ASSERT_EQ(std::get(Elem).Type, ResourceClass::UAV); ASSERT_EQ(std::get(Elem).Flags, ValidNonSamplerFlags); Elem = Elements[6].getElement(); ASSERT_TRUE(std::holds_alternative(Elem)); - 
ASSERT_EQ(std::get(Elem).Type, ClauseType::Sampler); + ASSERT_EQ(std::get(Elem).Type, ResourceClass::Sampler); ASSERT_EQ(std::get(Elem).Flags, llvm::dxbc::DescriptorRangeFlags::DescriptorsVolatile); @@ -767,43 +767,43 @@ TEST_F(ParseHLSLRootSignatureTest, ValidVersion11Test) { auto Elements = Parser.getElements(); RootElement Elem = Elements[0].getElement(); ASSERT_TRUE(std::holds_alternative(Elem)); - ASSERT_EQ(std::get(Elem).Type, DescriptorType::CBuffer); + ASSERT_EQ(std::get(Elem).Type, ResourceClass::CBuffer); ASSERT_EQ(std::get(Elem).Flags, llvm::dxbc::RootDescriptorFlags::DataStaticWhileSetAtExecute); Elem = Elements[1].getElement(); ASSERT_TRUE(std::holds_alternative(Elem)); - ASSERT_EQ(std::get(Elem).Type, DescriptorType::SRV); + ASSERT_EQ(std::get(Elem).Type, ResourceClass::SRV); ASSERT_EQ(std::get(Elem).Flags, llvm::dxbc::RootDescriptorFlags::DataStaticWhileSetAtExecute); Elem = Elements[2].getElement(); ASSERT_TRUE(std::holds_alternative(Elem)); - ASSERT_EQ(std::get(Elem).Type, DescriptorType::UAV); + ASSERT_EQ(std::get(Elem).Type, ResourceClass::UAV); ASSERT_EQ(std::get(Elem).Flags, llvm::dxbc::RootDescriptorFlags::DataVolatile); Elem = Elements[3].getElement(); ASSERT_TRUE(std::holds_alternative(Elem)); - ASSERT_EQ(std::get(Elem).Type, ClauseType::CBuffer); + ASSERT_EQ(std::get(Elem).Type, ResourceClass::CBuffer); ASSERT_EQ(std::get(Elem).Flags, llvm::dxbc::DescriptorRangeFlags::DataStaticWhileSetAtExecute); Elem = Elements[4].getElement(); ASSERT_TRUE(std::holds_alternative(Elem)); - ASSERT_EQ(std::get(Elem).Type, ClauseType::SRV); + ASSERT_EQ(std::get(Elem).Type, ResourceClass::SRV); ASSERT_EQ(std::get(Elem).Flags, llvm::dxbc::DescriptorRangeFlags::DataStaticWhileSetAtExecute); Elem = Elements[5].getElement(); ASSERT_TRUE(std::holds_alternative(Elem)); - ASSERT_EQ(std::get(Elem).Type, ClauseType::UAV); + ASSERT_EQ(std::get(Elem).Type, ResourceClass::UAV); ASSERT_EQ(std::get(Elem).Flags, llvm::dxbc::DescriptorRangeFlags::DataVolatile); Elem = 
Elements[6].getElement(); ASSERT_TRUE(std::holds_alternative(Elem)); - ASSERT_EQ(std::get(Elem).Type, ClauseType::Sampler); + ASSERT_EQ(std::get(Elem).Type, ResourceClass::Sampler); ASSERT_EQ(std::get(Elem).Flags, llvm::dxbc::DescriptorRangeFlags::None); diff --git a/compiler-rt/lib/gwp_asan/tests/basic.cpp b/compiler-rt/lib/gwp_asan/tests/basic.cpp index 88e7ed14a5c2..7d36a2ee1f94 100644 --- a/compiler-rt/lib/gwp_asan/tests/basic.cpp +++ b/compiler-rt/lib/gwp_asan/tests/basic.cpp @@ -65,11 +65,12 @@ TEST_F(DefaultGuardedPoolAllocator, NonPowerOfTwoAlignment) { // Added multi-page slots? You'll need to expand this test. TEST_F(DefaultGuardedPoolAllocator, TooBigForSinglePageSlots) { - EXPECT_EQ(nullptr, GPA.allocate(0x1001, 0)); - EXPECT_EQ(nullptr, GPA.allocate(0x1001, 1)); - EXPECT_EQ(nullptr, GPA.allocate(0x1001, 0x1000)); - EXPECT_EQ(nullptr, GPA.allocate(1, 0x2000)); - EXPECT_EQ(nullptr, GPA.allocate(0, 0x2000)); + size_t PageSize = sysconf(_SC_PAGESIZE); + EXPECT_EQ(nullptr, GPA.allocate(PageSize + 1, 0)); + EXPECT_EQ(nullptr, GPA.allocate(PageSize + 1, 1)); + EXPECT_EQ(nullptr, GPA.allocate(PageSize + 1, PageSize)); + EXPECT_EQ(nullptr, GPA.allocate(1, 2 * PageSize)); + EXPECT_EQ(nullptr, GPA.allocate(0, 2 * PageSize)); } TEST_F(CustomGuardedPoolAllocator, AllocAllSlots) { diff --git a/compiler-rt/lib/gwp_asan/tests/never_allocated.cpp b/compiler-rt/lib/gwp_asan/tests/never_allocated.cpp index 2f695b437986..37a4b384e4ac 100644 --- a/compiler-rt/lib/gwp_asan/tests/never_allocated.cpp +++ b/compiler-rt/lib/gwp_asan/tests/never_allocated.cpp @@ -13,8 +13,10 @@ #include "gwp_asan/tests/harness.h" TEST_P(BacktraceGuardedPoolAllocatorDeathTest, NeverAllocated) { + size_t PageSize = sysconf(_SC_PAGESIZE); + SCOPED_TRACE(""); - void *Ptr = GPA.allocate(0x1000); + void *Ptr = GPA.allocate(PageSize); GPA.deallocate(Ptr); std::string DeathNeedle = @@ -23,7 +25,7 @@ TEST_P(BacktraceGuardedPoolAllocatorDeathTest, NeverAllocated) { // Trigger a guard page in a completely 
different slot that's never allocated. // Previously, there was a bug that this would result in nullptr-dereference // in the posix crash handler. - char *volatile NeverAllocatedPtr = static_cast(Ptr) + 0x3000; + char *volatile NeverAllocatedPtr = static_cast(Ptr) + 3 * PageSize; if (!Recoverable) { EXPECT_DEATH(*NeverAllocatedPtr = 0, DeathNeedle); return; @@ -37,8 +39,8 @@ TEST_P(BacktraceGuardedPoolAllocatorDeathTest, NeverAllocated) { GetOutputBuffer().clear(); for (size_t i = 0; i < 100; ++i) { *NeverAllocatedPtr = 0; - *(NeverAllocatedPtr + 0x2000) = 0; - *(NeverAllocatedPtr + 0x3000) = 0; + *(NeverAllocatedPtr + 2 * PageSize) = 0; + *(NeverAllocatedPtr + 3 * PageSize) = 0; ASSERT_TRUE(GetOutputBuffer().empty()); } diff --git a/compiler-rt/lib/msan/tests/msan_test.cpp b/compiler-rt/lib/msan/tests/msan_test.cpp index d1c481483dfa..b0d8409d97ff 100644 --- a/compiler-rt/lib/msan/tests/msan_test.cpp +++ b/compiler-rt/lib/msan/tests/msan_test.cpp @@ -4271,14 +4271,39 @@ TEST(VectorSadTest, sse2_psad_bw) { } TEST(VectorMaddTest, mmx_pmadd_wd) { - V4x16 a = {Poisoned(), 1, 2, 3}; + V4x16 a = {Poisoned(0), 1, 2, 3}; V4x16 b = {100, 101, 102, 103}; V2x32 c = _mm_madd_pi16(a, b); + // Multiply step: + // {Poison * 100, 1 * 101, 2 * 102, 3 * 103} + // == {Poison, 1 * 101, 2 * 102, 3 * 103} + // Notice that for the poisoned value, we ignored the concrete zero value. + // + // Horizontal add step: + // {Poison + 1 * 101, 2 * 102 + 3 * 103} + // == {Poison, 2 * 102 + 3 * 103} EXPECT_POISONED(c[0]); EXPECT_NOT_POISONED(c[1]); EXPECT_EQ((unsigned)(2 * 102 + 3 * 103), c[1]); + + V4x16 d = {Poisoned(0), 1, 0, 3}; + V4x16 e = {100, 101, Poisoned(102), 103}; + V2x32 f = _mm_madd_pi16(d, e); + // Multiply step: + // {Poison * 100, 1 * 101, 0 * Poison, 3 * 103} + // == {Poison, 1 * 101, 0 , 3 * 103} + // Notice that 0 * Poison == 0. 
+ // + // Horizontal add step: + // {Poison + 1 * 101, 0 + 3 * 103} + // == {Poison, 3 * 103} + + EXPECT_POISONED(f[0]); + EXPECT_NOT_POISONED(f[1]); + + EXPECT_EQ((unsigned)(3 * 103), f[1]); } TEST(VectorCmpTest, mm_cmpneq_ps) { diff --git a/compiler-rt/test/asan/TestCases/Linux/release_to_os_test.cpp b/compiler-rt/test/asan/TestCases/Linux/release_to_os_test.cpp index 3e28ffde46ab..dc3ead9e8436 100644 --- a/compiler-rt/test/asan/TestCases/Linux/release_to_os_test.cpp +++ b/compiler-rt/test/asan/TestCases/Linux/release_to_os_test.cpp @@ -6,6 +6,7 @@ // RUN: %env_asan_opts=allocator_release_to_os_interval_ms=-1 %run %t force 2>&1 | FileCheck %s --check-prefix=FORCE_RELEASE // REQUIRES: x86_64-target-arch +// REQUIRES: page-size-4096 #include #include diff --git a/compiler-rt/test/cfi/cross-dso/lit.local.cfg.py b/compiler-rt/test/cfi/cross-dso/lit.local.cfg.py index 2778d8c995fd..bd0fabd1f26d 100644 --- a/compiler-rt/test/cfi/cross-dso/lit.local.cfg.py +++ b/compiler-rt/test/cfi/cross-dso/lit.local.cfg.py @@ -12,3 +12,7 @@ if root.target_os not in ["Linux", "FreeBSD", "NetBSD"]: # Android O (API level 26) has support for cross-dso cfi in libdl.so. if config.android and "android-26" not in config.available_features: config.unsupported = True + +# The runtime library only supports 4K pages. 
+if "page-size-4096" not in config.available_features: + config.unsupported = True diff --git a/compiler-rt/test/dfsan/atomic.cpp b/compiler-rt/test/dfsan/atomic.cpp index 22ee323c752f..73e1cbd17a7c 100644 --- a/compiler-rt/test/dfsan/atomic.cpp +++ b/compiler-rt/test/dfsan/atomic.cpp @@ -1,9 +1,12 @@ -// RUN: %clangxx_dfsan %s -fno-exceptions -o %t && %run %t -// RUN: %clangxx_dfsan -DORIGIN_TRACKING -mllvm -dfsan-track-origins=1 %s -fno-exceptions -o %t && %run %t +// RUN: %clangxx_dfsan %s -fno-exceptions -D_GLIBCXX_NO_ASSERTIONS -o %t && %run %t +// RUN: %clangxx_dfsan -DORIGIN_TRACKING -mllvm -dfsan-track-origins=1 %s -fno-exceptions -D_GLIBCXX_NO_ASSERTIONS -o %t && %run %t // // Use -fno-exceptions to turn off exceptions to avoid instrumenting // __cxa_begin_catch, std::terminate and __gxx_personality_v0. // +// Use -D_GLIBCXX_NO_ASSERTIONS to avoid depending on +// std::__glibcxx_assert_fail with gcc >= 15. +// // TODO: Support builtin atomics. For example, https://gcc.gnu.org/onlinedocs/gcc/_005f_005fatomic-Builtins.html // DFSan instrumentation pass cannot identify builtin callsites yet. 
diff --git a/compiler-rt/test/lit.common.cfg.py b/compiler-rt/test/lit.common.cfg.py index 8328b407dcc3..e2e815444dcf 100644 --- a/compiler-rt/test/lit.common.cfg.py +++ b/compiler-rt/test/lit.common.cfg.py @@ -965,6 +965,23 @@ if config.memprof_shadow_scale: else: config.available_features.add("memprof-shadow-scale-3") + +def target_page_size(): + try: + proc = subprocess.Popen( + f"{emulator or ''} python3", + shell=True, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + ) + out, err = proc.communicate(b'import os; print(os.sysconf("SC_PAGESIZE"))') + return int(out) + except: + return 4096 + + +config.available_features.add(f"page-size-{target_page_size()}") + if config.expensive_checks: config.available_features.add("expensive_checks") diff --git a/compiler-rt/test/msan/dtls_test.c b/compiler-rt/test/msan/dtls_test.c index 3c384256147a..6daaab0ae0b8 100644 --- a/compiler-rt/test/msan/dtls_test.c +++ b/compiler-rt/test/msan/dtls_test.c @@ -11,6 +11,7 @@ // Reports use-of-uninitialized-value, not analyzed XFAIL: target={{.*netbsd.*}} + UNSUPPORTED: aarch64-target-arch */ diff --git a/compiler-rt/test/sanitizer_common/TestCases/Linux/odd_stack_size.cpp b/compiler-rt/test/sanitizer_common/TestCases/Linux/odd_stack_size.cpp index 9d7d46b462a8..cc76804aed21 100644 --- a/compiler-rt/test/sanitizer_common/TestCases/Linux/odd_stack_size.cpp +++ b/compiler-rt/test/sanitizer_common/TestCases/Linux/odd_stack_size.cpp @@ -1,4 +1,5 @@ // RUN: %clangxx -O1 %s -o %t && %run %t +// REQUIRES: page-size-4096 // UNSUPPORTED: android // Fail on powerpc64 bots with: diff --git a/compiler-rt/test/sanitizer_common/TestCases/Linux/release_to_os_test.cpp b/compiler-rt/test/sanitizer_common/TestCases/Linux/release_to_os_test.cpp index 0fa77200bf1c..c7a553469636 100644 --- a/compiler-rt/test/sanitizer_common/TestCases/Linux/release_to_os_test.cpp +++ b/compiler-rt/test/sanitizer_common/TestCases/Linux/release_to_os_test.cpp @@ -11,6 +11,9 @@ // FIXME: This mode uses 32bit allocator 
without purge. // UNSUPPORTED: hwasan-aliasing +// Page size is hardcoded below, but test still fails even if not hardcoded. +// REQUIRES: page-size-4096 + #include #include #include diff --git a/compiler-rt/test/sanitizer_common/TestCases/Linux/resize_tls_dynamic.cpp b/compiler-rt/test/sanitizer_common/TestCases/Linux/resize_tls_dynamic.cpp index c288e1d69baf..3e9ff924a3c4 100644 --- a/compiler-rt/test/sanitizer_common/TestCases/Linux/resize_tls_dynamic.cpp +++ b/compiler-rt/test/sanitizer_common/TestCases/Linux/resize_tls_dynamic.cpp @@ -11,6 +11,9 @@ // FIXME: Investigate // UNSUPPORTED: target=powerpc64{{.*}} +// Fails because AArch64 uses TLSDESC instead of __tls_get_addr. +// UNSUPPORTED: aarch64-target-arch + #include #ifndef BUILD_DSO diff --git a/compiler-rt/test/sanitizer_common/TestCases/Linux/tls_get_addr.c b/compiler-rt/test/sanitizer_common/TestCases/Linux/tls_get_addr.c index 0aff6039ac4e..a4a4f64ed370 100644 --- a/compiler-rt/test/sanitizer_common/TestCases/Linux/tls_get_addr.c +++ b/compiler-rt/test/sanitizer_common/TestCases/Linux/tls_get_addr.c @@ -13,6 +13,9 @@ // FIXME: Fails for unknown reasons. // UNSUPPORTED: powerpc64le-target-arch +// Fails because AArch64 uses TLSDESC instead of __tls_get_addr. +// UNSUPPORTED: aarch64-target-arch + #ifndef BUILD_SO # include # include diff --git a/flang/include/flang/Optimizer/Builder/Runtime/Coarray.h b/flang/include/flang/Optimizer/Builder/Runtime/Coarray.h new file mode 100644 index 000000000000..f2c76c9e8d97 --- /dev/null +++ b/flang/include/flang/Optimizer/Builder/Runtime/Coarray.h @@ -0,0 +1,41 @@ +//===-- Coarray.h -- generate Coarray intrinsics runtime calls --*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef FORTRAN_OPTIMIZER_BUILDER_RUNTIME_COARRAY_H +#define FORTRAN_OPTIMIZER_BUILDER_RUNTIME_COARRAY_H + +#include "flang/Lower/AbstractConverter.h" +#include "flang/Optimizer/Support/InternalNames.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" + +namespace fir { +class ExtendedValue; +class FirOpBuilder; +} // namespace fir + +namespace fir::runtime { + +// Get the function type for a prif subroutine with a variable number of +// arguments +#define PRIF_FUNCTYPE(...) \ + mlir::FunctionType::get(builder.getContext(), /*inputs*/ {__VA_ARGS__}, \ + /*result*/ {}) + +// Default prefix for subroutines of PRIF compiled with LLVM +#define PRIFNAME_SUB(fmt) \ + []() { \ + std::ostringstream oss; \ + oss << "prif_" << fmt; \ + return fir::NameUniquer::doProcedure({"prif"}, {}, oss.str()); \ + }() + +/// Generate Call to runtime prif_init +mlir::Value genInitCoarray(fir::FirOpBuilder &builder, mlir::Location loc); + +} // namespace fir::runtime +#endif // FORTRAN_OPTIMIZER_BUILDER_RUNTIME_COARRAY_H diff --git a/flang/include/flang/Optimizer/Builder/Runtime/Main.h b/flang/include/flang/Optimizer/Builder/Runtime/Main.h index a0586deade42..d4067b367f73 100644 --- a/flang/include/flang/Optimizer/Builder/Runtime/Main.h +++ b/flang/include/flang/Optimizer/Builder/Runtime/Main.h @@ -25,7 +25,7 @@ namespace fir::runtime { void genMain(fir::FirOpBuilder &builder, mlir::Location loc, const std::vector &defs, - bool initCuda = false); + bool initCuda = false, bool initCoarrayEnv = false); } #endif // FORTRAN_OPTIMIZER_BUILDER_RUNTIME_MAIN_H diff --git a/flang/include/flang/Optimizer/HLFIR/HLFIROpBase.td b/flang/include/flang/Optimizer/HLFIR/HLFIROpBase.td index ee0b5aa9760b..0bddfd85d436 100644 --- a/flang/include/flang/Optimizer/HLFIR/HLFIROpBase.td +++ b/flang/include/flang/Optimizer/HLFIR/HLFIROpBase.td @@ -95,9 +95,9 @@ def 
IsFortranValuePred : CPred<"::hlfir::isFortranValueType($_self)">; def AnyFortranValue : TypeConstraint; - -def AnyFortranEntity : TypeConstraint, "any Fortran value or variable type">; +def AnyFortranEntity + : Type, + "any Fortran value or variable type">; def IsFortranScalarCharacterPred : CPred<"::hlfir::isFortranScalarCharacterType($_self)">; diff --git a/flang/include/flang/Optimizer/HLFIR/HLFIROps.td b/flang/include/flang/Optimizer/HLFIR/HLFIROps.td index 2f5da720fbe1..db3fb0b90464 100644 --- a/flang/include/flang/Optimizer/HLFIR/HLFIROps.td +++ b/flang/include/flang/Optimizer/HLFIR/HLFIROps.td @@ -721,6 +721,28 @@ def hlfir_CShiftOp let hasVerifier = 1; } +def hlfir_EOShiftOp + : hlfir_Op< + "eoshift", [AttrSizedOperandSegments, + DeclareOpInterfaceMethods]> { + let summary = "EOSHIFT transformational intrinsic"; + let description = [{ + End-off shift of an array + }]; + + let arguments = (ins AnyFortranArrayObject:$array, + AnyFortranIntegerScalarOrArrayObject:$shift, + Optional:$boundary, Optional:$dim); + + let results = (outs hlfir_ExprType); + + let assemblyFormat = [{ + $array $shift (`boundary` $boundary^)? (`dim` $dim^)? attr-dict `:` functional-type(operands, results) + }]; + + let hasVerifier = 1; +} + def hlfir_ReshapeOp : hlfir_Op< "reshape", [AttrSizedOperandSegments, diff --git a/flang/include/flang/Optimizer/Support/Utils.h b/flang/include/flang/Optimizer/Support/Utils.h index 83c936b7dcad..0b31cfea0430 100644 --- a/flang/include/flang/Optimizer/Support/Utils.h +++ b/flang/include/flang/Optimizer/Support/Utils.h @@ -27,6 +27,8 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/StringRef.h" +#include "flang/Optimizer/CodeGen/TypeConverter.h" + namespace fir { /// Return the integer value of a arith::ConstantOp. 
inline std::int64_t toInt(mlir::arith::ConstantOp cop) { @@ -198,6 +200,37 @@ std::optional> getComponentLowerBoundsIfNonDefault( fir::RecordType recordType, llvm::StringRef component, mlir::ModuleOp module, const mlir::SymbolTable *symbolTable = nullptr); +/// Generate a LLVM constant value of type `ity`, using the provided offset. +mlir::LLVM::ConstantOp +genConstantIndex(mlir::Location loc, mlir::Type ity, + mlir::ConversionPatternRewriter &rewriter, + std::int64_t offset); + +/// Helper function for generating the LLVM IR that computes the distance +/// in bytes between adjacent elements pointed to by a pointer +/// of type \p ptrTy. The result is returned as a value of \p idxTy integer +/// type. +mlir::Value computeElementDistance(mlir::Location loc, + mlir::Type llvmObjectType, mlir::Type idxTy, + mlir::ConversionPatternRewriter &rewriter, + const mlir::DataLayout &dataLayout); + +// Compute the alloc scale size (constant factors encoded in the array type). +// We do this for arrays without a constant interior or arrays of character with +// dynamic length arrays, since those are the only ones that get decayed to a +// pointer to the element type. +mlir::Value genAllocationScaleSize(mlir::Location loc, mlir::Type dataTy, + mlir::Type ity, + mlir::ConversionPatternRewriter &rewriter); + +/// Perform an extension or truncation as needed on an integer value. Lowering +/// to the specific target may involve some sign-extending or truncation of +/// values, particularly to fit them from abstract box types to the +/// appropriate reified structures. 
+mlir::Value integerCast(const fir::LLVMTypeConverter &converter, + mlir::Location loc, + mlir::ConversionPatternRewriter &rewriter, + mlir::Type ty, mlir::Value val, bool fold = false); } // namespace fir #endif // FORTRAN_OPTIMIZER_SUPPORT_UTILS_H diff --git a/flang/include/flang/Support/Fortran-features.h b/flang/include/flang/Support/Fortran-features.h index bd3ff4a70ef0..83a75b0efcb5 100644 --- a/flang/include/flang/Support/Fortran-features.h +++ b/flang/include/flang/Support/Fortran-features.h @@ -56,7 +56,7 @@ ENUM_CLASS(LanguageFeature, BackslashEscapes, OldDebugLines, IgnoreIrrelevantAttributes, Unsigned, AmbiguousStructureConstructor, ContiguousOkForSeqAssociation, ForwardRefExplicitTypeDummy, InaccessibleDeferredOverride, CudaWarpMatchFunction, DoConcurrentOffload, - TransferBOZ) + TransferBOZ, Coarray) // Portability and suspicious usage warnings ENUM_CLASS(UsageWarning, Portability, PointerToUndefinable, diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp index 3811a87aaf46..4719a242035e 100644 --- a/flang/lib/Frontend/CompilerInvocation.cpp +++ b/flang/lib/Frontend/CompilerInvocation.cpp @@ -1152,6 +1152,17 @@ static bool parseDialectArgs(CompilerInvocation &res, llvm::opt::ArgList &args, diags.Report(diagID); } } + // -fcoarray + if (args.hasArg(clang::driver::options::OPT_fcoarray)) { + res.getFrontendOpts().features.Enable( + Fortran::common::LanguageFeature::Coarray); + const unsigned diagID = + diags.getCustomDiagID(clang::DiagnosticsEngine::Warning, + "Support for multi image Fortran features is " + "still experimental and in development."); + diags.Report(diagID); + } + return diags.getNumErrors() == numErrorsBefore; } @@ -1176,6 +1187,7 @@ static bool parseOpenMPArgs(CompilerInvocation &res, llvm::opt::ArgList &args, llvm::Triple t(res.getTargetOpts().triple); constexpr unsigned newestFullySupported = 31; + constexpr unsigned latestFinalized = 60; // By default OpenMP is set to the most recent 
fully supported version res.getLangOpts().OpenMPVersion = newestFullySupported; res.getFrontendOpts().features.Enable( @@ -1198,12 +1210,26 @@ static bool parseOpenMPArgs(CompilerInvocation &res, llvm::opt::ArgList &args, diags.Report(diagID) << value << arg->getAsString(args) << versions.str(); }; + auto reportFutureVersion = [&](llvm::StringRef value) { + const unsigned diagID = diags.getCustomDiagID( + clang::DiagnosticsEngine::Warning, + "The specification for OpenMP version %0 is still under development; " + "the syntax and semantics of new features may be subject to change"); + std::string buffer; + llvm::raw_string_ostream versions(buffer); + llvm::interleaveComma(ompVersions, versions); + + diags.Report(diagID) << value; + }; + llvm::StringRef value = arg->getValue(); if (!value.getAsInteger(/*radix=*/10, version)) { if (llvm::is_contained(ompVersions, version)) { res.getLangOpts().OpenMPVersion = version; - if (version > newestFullySupported) + if (version > latestFinalized) + reportFutureVersion(value); + else if (version > newestFullySupported) diags.Report(clang::diag::warn_openmp_incomplete) << version; } else if (llvm::is_contained(oldVersions, version)) { const unsigned diagID = diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp index 43bdbdb4644e..ab7bf28a9e8b 100644 --- a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -475,7 +475,9 @@ public: fir::runtime::genMain(*builder, toLocation(), bridge.getEnvironmentDefaults(), getFoldingContext().languageFeatures().IsEnabled( - Fortran::common::LanguageFeature::CUDA)); + Fortran::common::LanguageFeature::CUDA), + getFoldingContext().languageFeatures().IsEnabled( + Fortran::common::LanguageFeature::Coarray)); }); finalizeOpenMPLowering(globalOmpRequiresSymbol); diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp index e3f792ee296f..9d1c730b38ed 100644 --- a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp +++ 
b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp @@ -571,7 +571,8 @@ void DataSharingProcessor::collectSymbols( if (collectPreDetermined) { // Similar to implicit symbols, collect pre-determined symbols only if // they are not defined by a nested `DeclarationConstruct` - return !visitor.isSymbolDefineByNestedDeclaration(sym) && + return visitor.isSymbolDefineBy(sym, eval) && + !visitor.isSymbolDefineByNestedDeclaration(sym) && sym->test(semantics::Symbol::Flag::OmpPreDetermined); } diff --git a/flang/lib/Optimizer/Builder/CMakeLists.txt b/flang/lib/Optimizer/Builder/CMakeLists.txt index 31ae395805fa..8fb36a750d43 100644 --- a/flang/lib/Optimizer/Builder/CMakeLists.txt +++ b/flang/lib/Optimizer/Builder/CMakeLists.txt @@ -16,6 +16,7 @@ add_flang_library(FIRBuilder Runtime/Allocatable.cpp Runtime/ArrayConstructor.cpp Runtime/Assign.cpp + Runtime/Coarray.cpp Runtime/Character.cpp Runtime/Command.cpp Runtime/CUDA/Descriptor.cpp diff --git a/flang/lib/Optimizer/Builder/HLFIRTools.cpp b/flang/lib/Optimizer/Builder/HLFIRTools.cpp index b6d692a0226c..086dd6671160 100644 --- a/flang/lib/Optimizer/Builder/HLFIRTools.cpp +++ b/flang/lib/Optimizer/Builder/HLFIRTools.cpp @@ -416,7 +416,10 @@ hlfir::Entity hlfir::loadTrivialScalar(mlir::Location loc, entity = derefPointersAndAllocatables(loc, builder, entity); if (entity.isVariable() && entity.isScalar() && fir::isa_trivial(entity.getFortranElementType())) { - return Entity{fir::LoadOp::create(builder, loc, entity)}; + // Optional entities may be represented with !fir.box. + // We need to take the data pointer before loading the scalar. 
+ mlir::Value base = genVariableRawAddress(loc, builder, entity); + return Entity{fir::LoadOp::create(builder, loc, base)}; } return entity; } diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index 319ab1912cd3..22193f0de88a 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -2672,10 +2672,11 @@ mlir::Value IntrinsicLibrary::genAcosd(mlir::Type resultType, mlir::FunctionType::get(context, {resultType}, {args[0].getType()}); mlir::Value result = getRuntimeCallGenerator("acos", ftype)(builder, loc, {args[0]}); - llvm::APFloat pi = llvm::APFloat(llvm::numbers::pi); - mlir::Value dfactor = builder.createRealConstant( - loc, mlir::Float64Type::get(context), llvm::APFloat(180.0) / pi); - mlir::Value factor = builder.createConvert(loc, args[0].getType(), dfactor); + const llvm::fltSemantics &fltSem = + llvm::cast(resultType).getFloatSemantics(); + llvm::APFloat pi = llvm::APFloat(fltSem, llvm::numbers::pis); + mlir::Value factor = builder.createRealConstant( + loc, resultType, llvm::APFloat(fltSem, "180.0") / pi); return mlir::arith::MulFOp::create(builder, loc, result, factor); } diff --git a/flang/lib/Optimizer/Builder/Runtime/Coarray.cpp b/flang/lib/Optimizer/Builder/Runtime/Coarray.cpp new file mode 100644 index 000000000000..eaff6c37ecdb --- /dev/null +++ b/flang/lib/Optimizer/Builder/Runtime/Coarray.cpp @@ -0,0 +1,29 @@ +//===-- Coarray.cpp -- runtime API for coarray intrinsics -----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "flang/Optimizer/Builder/Runtime/Coarray.h" +#include "flang/Optimizer/Builder/FIRBuilder.h" +#include "flang/Optimizer/Builder/Runtime/RTBuilder.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" + +using namespace Fortran::runtime; +using namespace Fortran::semantics; + +/// Generate Call to runtime prif_init +mlir::Value fir::runtime::genInitCoarray(fir::FirOpBuilder &builder, + mlir::Location loc) { + mlir::Type i32Ty = builder.getI32Type(); + mlir::Value result = builder.createTemporary(loc, i32Ty); + mlir::FunctionType ftype = PRIF_FUNCTYPE(builder.getRefType(i32Ty)); + mlir::func::FuncOp funcOp = + builder.createFunction(loc, PRIFNAME_SUB("init"), ftype); + llvm::SmallVector args = + fir::runtime::createArguments(builder, loc, ftype, result); + builder.create(loc, funcOp, args); + return builder.create(loc, result); +} diff --git a/flang/lib/Optimizer/Builder/Runtime/Main.cpp b/flang/lib/Optimizer/Builder/Runtime/Main.cpp index d35f687167b0..d303e0ad6384 100644 --- a/flang/lib/Optimizer/Builder/Runtime/Main.cpp +++ b/flang/lib/Optimizer/Builder/Runtime/Main.cpp @@ -10,6 +10,7 @@ #include "flang/Lower/EnvironmentDefault.h" #include "flang/Optimizer/Builder/BoxValue.h" #include "flang/Optimizer/Builder/FIRBuilder.h" +#include "flang/Optimizer/Builder/Runtime/Coarray.h" #include "flang/Optimizer/Builder/Runtime/EnvironmentDefaults.h" #include "flang/Optimizer/Builder/Runtime/RTBuilder.h" #include "flang/Optimizer/Dialect/FIROps.h" @@ -23,8 +24,8 @@ using namespace Fortran::runtime; /// Create a `int main(...)` that calls the Fortran entry point void fir::runtime::genMain( fir::FirOpBuilder &builder, mlir::Location loc, - const std::vector &defs, - bool initCuda) { + const std::vector &defs, bool initCuda, + bool initCoarrayEnv) { auto *context = builder.getContext(); auto argcTy = 
builder.getDefaultIntegerType(); auto ptrTy = mlir::LLVM::LLVMPointerType::get(context); @@ -69,6 +70,8 @@ void fir::runtime::genMain( loc, RTNAME_STRING(CUFInit), mlir::FunctionType::get(context, {}, {})); fir::CallOp::create(builder, loc, initFn); } + if (initCoarrayEnv) + fir::runtime::genInitCoarray(builder, loc); fir::CallOp::create(builder, loc, qqMainFn); fir::CallOp::create(builder, loc, stopFn); diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp index ba5fef97c83e..76f3cbd421cb 100644 --- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp +++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp @@ -87,14 +87,6 @@ static inline mlir::Type getI8Type(mlir::MLIRContext *context) { return mlir::IntegerType::get(context, 8); } -static mlir::LLVM::ConstantOp -genConstantIndex(mlir::Location loc, mlir::Type ity, - mlir::ConversionPatternRewriter &rewriter, - std::int64_t offset) { - auto cattr = rewriter.getI64IntegerAttr(offset); - return mlir::LLVM::ConstantOp::create(rewriter, loc, ity, cattr); -} - static mlir::Block *createBlock(mlir::ConversionPatternRewriter &rewriter, mlir::Block *insertBefore) { assert(insertBefore && "expected valid insertion block"); @@ -208,39 +200,6 @@ getDependentTypeMemSizeFn(fir::RecordType recTy, fir::AllocaOp op, TODO(op.getLoc(), "did not find allocation function"); } -// Compute the alloc scale size (constant factors encoded in the array type). -// We do this for arrays without a constant interior or arrays of character with -// dynamic length arrays, since those are the only ones that get decayed to a -// pointer to the element type. 
-template -static mlir::Value -genAllocationScaleSize(OP op, mlir::Type ity, - mlir::ConversionPatternRewriter &rewriter) { - mlir::Location loc = op.getLoc(); - mlir::Type dataTy = op.getInType(); - auto seqTy = mlir::dyn_cast(dataTy); - fir::SequenceType::Extent constSize = 1; - if (seqTy) { - int constRows = seqTy.getConstantRows(); - const fir::SequenceType::ShapeRef &shape = seqTy.getShape(); - if (constRows != static_cast(shape.size())) { - for (auto extent : shape) { - if (constRows-- > 0) - continue; - if (extent != fir::SequenceType::getUnknownExtent()) - constSize *= extent; - } - } - } - - if (constSize != 1) { - mlir::Value constVal{ - genConstantIndex(loc, ity, rewriter, constSize).getResult()}; - return constVal; - } - return nullptr; -} - namespace { struct DeclareOpConversion : public fir::FIROpConversion { public: @@ -275,7 +234,7 @@ struct AllocaOpConversion : public fir::FIROpConversion { auto loc = alloc.getLoc(); mlir::Type ity = lowerTy().indexType(); unsigned i = 0; - mlir::Value size = genConstantIndex(loc, ity, rewriter, 1).getResult(); + mlir::Value size = fir::genConstantIndex(loc, ity, rewriter, 1).getResult(); mlir::Type firObjType = fir::unwrapRefType(alloc.getType()); mlir::Type llvmObjectType = convertObjectType(firObjType); if (alloc.hasLenParams()) { @@ -307,7 +266,8 @@ struct AllocaOpConversion : public fir::FIROpConversion { << scalarType << " with type parameters"; } } - if (auto scaleSize = genAllocationScaleSize(alloc, ity, rewriter)) + if (auto scaleSize = fir::genAllocationScaleSize( + alloc.getLoc(), alloc.getInType(), ity, rewriter)) size = rewriter.createOrFold(loc, ity, size, scaleSize); if (alloc.hasShapeOperands()) { @@ -484,7 +444,7 @@ struct BoxIsArrayOpConversion : public fir::FIROpConversion { auto loc = boxisarray.getLoc(); TypePair boxTyPair = getBoxTypePair(boxisarray.getVal().getType()); mlir::Value rank = getRankFromBox(loc, boxTyPair, a, rewriter); - mlir::Value c0 = genConstantIndex(loc, rank.getType(), 
rewriter, 0); + mlir::Value c0 = fir::genConstantIndex(loc, rank.getType(), rewriter, 0); rewriter.replaceOpWithNewOp( boxisarray, mlir::LLVM::ICmpPredicate::ne, rank, c0); return mlir::success(); @@ -820,7 +780,7 @@ struct ConvertOpConversion : public fir::FIROpConversion { // Do folding for constant inputs. if (auto constVal = fir::getIntIfConstant(op0)) { mlir::Value normVal = - genConstantIndex(loc, toTy, rewriter, *constVal ? 1 : 0); + fir::genConstantIndex(loc, toTy, rewriter, *constVal ? 1 : 0); rewriter.replaceOp(convert, normVal); return mlir::success(); } @@ -833,7 +793,7 @@ struct ConvertOpConversion : public fir::FIROpConversion { } // Compare the input with zero. - mlir::Value zero = genConstantIndex(loc, fromTy, rewriter, 0); + mlir::Value zero = fir::genConstantIndex(loc, fromTy, rewriter, 0); auto isTrue = mlir::LLVM::ICmpOp::create( rewriter, loc, mlir::LLVM::ICmpPredicate::ne, op0, zero); @@ -1082,21 +1042,6 @@ static mlir::SymbolRefAttr getMalloc(fir::AllocMemOp op, return getMallocInModule(mod, op, rewriter, indexType); } -/// Helper function for generating the LLVM IR that computes the distance -/// in bytes between adjacent elements pointed to by a pointer -/// of type \p ptrTy. The result is returned as a value of \p idxTy integer -/// type. -static mlir::Value -computeElementDistance(mlir::Location loc, mlir::Type llvmObjectType, - mlir::Type idxTy, - mlir::ConversionPatternRewriter &rewriter, - const mlir::DataLayout &dataLayout) { - llvm::TypeSize size = dataLayout.getTypeSize(llvmObjectType); - unsigned short alignment = dataLayout.getTypeABIAlignment(llvmObjectType); - std::int64_t distance = llvm::alignTo(size, alignment); - return genConstantIndex(loc, idxTy, rewriter, distance); -} - /// Return value of the stride in bytes between adjacent elements /// of LLVM type \p llTy. The result is returned as a value of /// \p idxTy integer type. 
@@ -1105,7 +1050,7 @@ genTypeStrideInBytes(mlir::Location loc, mlir::Type idxTy, mlir::ConversionPatternRewriter &rewriter, mlir::Type llTy, const mlir::DataLayout &dataLayout) { // Create a pointer type and use computeElementDistance(). - return computeElementDistance(loc, llTy, idxTy, rewriter, dataLayout); + return fir::computeElementDistance(loc, llTy, idxTy, rewriter, dataLayout); } namespace { @@ -1124,8 +1069,9 @@ struct AllocMemOpConversion : public fir::FIROpConversion { if (fir::isRecordWithTypeParameters(fir::unwrapSequenceType(dataTy))) TODO(loc, "fir.allocmem codegen of derived type with length parameters"); mlir::Value size = genTypeSizeInBytes(loc, ity, rewriter, llvmObjectTy); - if (auto scaleSize = genAllocationScaleSize(heap, ity, rewriter)) - size = mlir::LLVM::MulOp::create(rewriter, loc, ity, size, scaleSize); + if (auto scaleSize = + fir::genAllocationScaleSize(loc, heap.getInType(), ity, rewriter)) + size = rewriter.create(loc, ity, size, scaleSize); for (mlir::Value opnd : adaptor.getOperands()) size = mlir::LLVM::MulOp::create(rewriter, loc, ity, size, integerCast(loc, rewriter, ity, opnd)); @@ -1133,8 +1079,8 @@ struct AllocMemOpConversion : public fir::FIROpConversion { // As the return value of malloc(0) is implementation defined, allocate one // byte to ensure the allocation status being true. This behavior aligns to // what the runtime has. 
- mlir::Value zero = genConstantIndex(loc, ity, rewriter, 0); - mlir::Value one = genConstantIndex(loc, ity, rewriter, 1); + mlir::Value zero = fir::genConstantIndex(loc, ity, rewriter, 0); + mlir::Value one = fir::genConstantIndex(loc, ity, rewriter, 1); mlir::Value cmp = mlir::LLVM::ICmpOp::create( rewriter, loc, mlir::LLVM::ICmpPredicate::sgt, size, zero); size = mlir::LLVM::SelectOp::create(rewriter, loc, cmp, size, one); @@ -1157,7 +1103,8 @@ struct AllocMemOpConversion : public fir::FIROpConversion { mlir::Value genTypeSizeInBytes(mlir::Location loc, mlir::Type idxTy, mlir::ConversionPatternRewriter &rewriter, mlir::Type llTy) const { - return computeElementDistance(loc, llTy, idxTy, rewriter, getDataLayout()); + return fir::computeElementDistance(loc, llTy, idxTy, rewriter, + getDataLayout()); } }; } // namespace @@ -1344,7 +1291,7 @@ genCUFAllocDescriptor(mlir::Location loc, mlir::Type structTy = typeConverter.convertBoxTypeAsStruct(boxTy); std::size_t boxSize = dl->getTypeSizeInBits(structTy) / 8; mlir::Value sizeInBytes = - genConstantIndex(loc, llvmIntPtrType, rewriter, boxSize); + fir::genConstantIndex(loc, llvmIntPtrType, rewriter, boxSize); llvm::SmallVector args = {sizeInBytes, sourceFile, sourceLine}; return mlir::LLVM::CallOp::create(rewriter, loc, fctTy, RTNAME_STRING(CUFAllocDescriptor), args) @@ -1599,7 +1546,7 @@ struct EmboxCommonConversion : public fir::FIROpConversion { // representation of derived types with pointer/allocatable components. // This has been seen in hashing algorithms using TRANSFER. 
mlir::Value zero = - genConstantIndex(loc, rewriter.getI64Type(), rewriter, 0); + fir::genConstantIndex(loc, rewriter.getI64Type(), rewriter, 0); descriptor = insertField(rewriter, loc, descriptor, {getLenParamFieldId(boxTy), 0}, zero); } @@ -1944,8 +1891,8 @@ struct XEmboxOpConversion : public EmboxCommonConversion { bool hasSlice = !xbox.getSlice().empty(); unsigned sliceOffset = xbox.getSliceOperandIndex(); mlir::Location loc = xbox.getLoc(); - mlir::Value zero = genConstantIndex(loc, i64Ty, rewriter, 0); - mlir::Value one = genConstantIndex(loc, i64Ty, rewriter, 1); + mlir::Value zero = fir::genConstantIndex(loc, i64Ty, rewriter, 0); + mlir::Value one = fir::genConstantIndex(loc, i64Ty, rewriter, 1); mlir::Value prevPtrOff = one; mlir::Type eleTy = boxTy.getEleTy(); const unsigned rank = xbox.getRank(); @@ -1994,7 +1941,7 @@ struct XEmboxOpConversion : public EmboxCommonConversion { prevDimByteStride = getCharacterByteSize(loc, rewriter, charTy, adaptor.getLenParams()); } else { - prevDimByteStride = genConstantIndex( + prevDimByteStride = fir::genConstantIndex( loc, i64Ty, rewriter, charTy.getLen() * lowerTy().characterBitsize(charTy) / 8); } @@ -2152,7 +2099,7 @@ struct XReboxOpConversion : public EmboxCommonConversion { if (auto charTy = mlir::dyn_cast(inputEleTy)) { if (charTy.hasConstantLen()) { mlir::Value len = - genConstantIndex(loc, idxTy, rewriter, charTy.getLen()); + fir::genConstantIndex(loc, idxTy, rewriter, charTy.getLen()); lenParams.emplace_back(len); } else { mlir::Value len = getElementSizeFromBox(loc, idxTy, inputBoxTyPair, @@ -2161,7 +2108,7 @@ struct XReboxOpConversion : public EmboxCommonConversion { assert(!isInGlobalOp(rewriter) && "character target in global op must have constant length"); mlir::Value width = - genConstantIndex(loc, idxTy, rewriter, charTy.getFKind()); + fir::genConstantIndex(loc, idxTy, rewriter, charTy.getFKind()); len = mlir::LLVM::SDivOp::create(rewriter, loc, idxTy, len, width); } lenParams.emplace_back(len); @@ 
-2215,8 +2162,9 @@ private: mlir::ConversionPatternRewriter &rewriter) const { mlir::Location loc = rebox.getLoc(); mlir::Value zero = - genConstantIndex(loc, lowerTy().indexType(), rewriter, 0); - mlir::Value one = genConstantIndex(loc, lowerTy().indexType(), rewriter, 1); + fir::genConstantIndex(loc, lowerTy().indexType(), rewriter, 0); + mlir::Value one = + fir::genConstantIndex(loc, lowerTy().indexType(), rewriter, 1); for (auto iter : llvm::enumerate(llvm::zip(extents, strides))) { mlir::Value extent = std::get<0>(iter.value()); unsigned dim = iter.index(); @@ -2249,7 +2197,7 @@ private: mlir::Location loc = rebox.getLoc(); mlir::Type byteTy = ::getI8Type(rebox.getContext()); mlir::Type idxTy = lowerTy().indexType(); - mlir::Value zero = genConstantIndex(loc, idxTy, rewriter, 0); + mlir::Value zero = fir::genConstantIndex(loc, idxTy, rewriter, 0); // Apply subcomponent and substring shift on base address. if (!rebox.getSubcomponent().empty() || !rebox.getSubstr().empty()) { // Cast to inputEleTy* so that a GEP can be used. @@ -2277,7 +2225,7 @@ private: // and strides. llvm::SmallVector slicedExtents; llvm::SmallVector slicedStrides; - mlir::Value one = genConstantIndex(loc, idxTy, rewriter, 1); + mlir::Value one = fir::genConstantIndex(loc, idxTy, rewriter, 1); const bool sliceHasOrigins = !rebox.getShift().empty(); unsigned sliceOps = rebox.getSliceOperandIndex(); unsigned shiftOps = rebox.getShiftOperandIndex(); @@ -2350,7 +2298,7 @@ private: // which may be OK if all new extents are ones, the stride does not // matter, use one. mlir::Value stride = inputStrides.empty() - ? genConstantIndex(loc, idxTy, rewriter, 1) + ? 
fir::genConstantIndex(loc, idxTy, rewriter, 1) : inputStrides[0]; for (unsigned i = 0; i < rebox.getShape().size(); ++i) { mlir::Value rawExtent = operands[rebox.getShapeOperandIndex() + i]; @@ -2585,9 +2533,9 @@ struct XArrayCoorOpConversion unsigned shiftOffset = coor.getShiftOperandIndex(); unsigned sliceOffset = coor.getSliceOperandIndex(); auto sliceOps = coor.getSlice().begin(); - mlir::Value one = genConstantIndex(loc, idxTy, rewriter, 1); + mlir::Value one = fir::genConstantIndex(loc, idxTy, rewriter, 1); mlir::Value prevExt = one; - mlir::Value offset = genConstantIndex(loc, idxTy, rewriter, 0); + mlir::Value offset = fir::genConstantIndex(loc, idxTy, rewriter, 0); const bool isShifted = !coor.getShift().empty(); const bool isSliced = !coor.getSlice().empty(); const bool baseIsBoxed = @@ -2918,7 +2866,7 @@ private: // of lower bound aspects. This both accounts for dynamically sized // types and non contiguous arrays. auto idxTy = lowerTy().indexType(); - mlir::Value off = genConstantIndex(loc, idxTy, rewriter, 0); + mlir::Value off = fir::genConstantIndex(loc, idxTy, rewriter, 0); unsigned arrayDim = arrTy.getDimension(); for (unsigned dim = 0; dim < arrayDim && it != end; ++dim, ++it) { mlir::Value stride = @@ -3846,7 +3794,7 @@ struct IsPresentOpConversion : public fir::FIROpConversion { ptr = mlir::LLVM::ExtractValueOp::create(rewriter, loc, ptr, 0); } mlir::LLVM::ConstantOp c0 = - genConstantIndex(isPresent.getLoc(), idxTy, rewriter, 0); + fir::genConstantIndex(isPresent.getLoc(), idxTy, rewriter, 0); auto addr = mlir::LLVM::PtrToIntOp::create(rewriter, loc, idxTy, ptr); rewriter.replaceOpWithNewOp( isPresent, mlir::LLVM::ICmpPredicate::ne, addr, c0); diff --git a/flang/lib/Optimizer/CodeGen/CodeGenOpenMP.cpp b/flang/lib/Optimizer/CodeGen/CodeGenOpenMP.cpp index 37f1c9f97e1c..97912bda79b0 100644 --- a/flang/lib/Optimizer/CodeGen/CodeGenOpenMP.cpp +++ b/flang/lib/Optimizer/CodeGen/CodeGenOpenMP.cpp @@ -21,6 +21,7 @@ #include 
"flang/Optimizer/Dialect/Support/FIRContext.h" #include "flang/Optimizer/Support/FatalError.h" #include "flang/Optimizer/Support/InternalNames.h" +#include "flang/Optimizer/Support/Utils.h" #include "mlir/Conversion/LLVMCommon/ConversionTarget.h" #include "mlir/Conversion/LLVMCommon/Pattern.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" @@ -125,10 +126,58 @@ struct PrivateClauseOpConversion return mlir::success(); } }; + +// Convert FIR type to LLVM without turning fir.box into memory +// reference. +static mlir::Type convertObjectType(const fir::LLVMTypeConverter &converter, + mlir::Type firType) { + if (auto boxTy = mlir::dyn_cast(firType)) + return converter.convertBoxTypeAsStruct(boxTy); + return converter.convertType(firType); +} + +// FIR Op specific conversion for TargetAllocMemOp +struct TargetAllocMemOpConversion + : public OpenMPFIROpConversion { + using OpenMPFIROpConversion::OpenMPFIROpConversion; + + llvm::LogicalResult + matchAndRewrite(mlir::omp::TargetAllocMemOp allocmemOp, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const override { + mlir::Type heapTy = allocmemOp.getAllocatedType(); + mlir::Location loc = allocmemOp.getLoc(); + auto ity = lowerTy().indexType(); + mlir::Type dataTy = fir::unwrapRefType(heapTy); + mlir::Type llvmObjectTy = convertObjectType(lowerTy(), dataTy); + if (fir::isRecordWithTypeParameters(fir::unwrapSequenceType(dataTy))) + TODO(loc, "omp.target_allocmem codegen of derived type with length " + "parameters"); + mlir::Value size = fir::computeElementDistance( + loc, llvmObjectTy, ity, rewriter, lowerTy().getDataLayout()); + if (auto scaleSize = fir::genAllocationScaleSize( + loc, allocmemOp.getInType(), ity, rewriter)) + size = rewriter.create(loc, ity, size, scaleSize); + for (mlir::Value opnd : adaptor.getOperands().drop_front()) + size = rewriter.create( + loc, ity, size, integerCast(lowerTy(), loc, rewriter, ity, opnd)); + auto mallocTyWidth = lowerTy().getIndexTypeBitwidth(); + auto mallocTy = + 
mlir::IntegerType::get(rewriter.getContext(), mallocTyWidth); + if (mallocTyWidth != ity.getIntOrFloatBitWidth()) + size = integerCast(lowerTy(), loc, rewriter, mallocTy, size); + rewriter.modifyOpInPlace(allocmemOp, [&]() { + allocmemOp.setInType(rewriter.getI8Type()); + allocmemOp.getTypeparamsMutable().clear(); + allocmemOp.getTypeparamsMutable().append(size); + }); + return mlir::success(); + } +}; } // namespace void fir::populateOpenMPFIRToLLVMConversionPatterns( const LLVMTypeConverter &converter, mlir::RewritePatternSet &patterns) { patterns.add(converter); patterns.add(converter); + patterns.add(converter); } diff --git a/flang/lib/Optimizer/Dialect/FIROps.cpp b/flang/lib/Optimizer/Dialect/FIROps.cpp index 01975f357a8d..87f9899aa787 100644 --- a/flang/lib/Optimizer/Dialect/FIROps.cpp +++ b/flang/lib/Optimizer/Dialect/FIROps.cpp @@ -107,7 +107,6 @@ static bool verifyTypeParamCount(mlir::Type inType, unsigned numParams) { } /// Parser shared by Alloca and Allocmem -/// /// operation ::= %res = (`fir.alloca` | `fir.allocmem`) $in_type /// ( `(` $typeparams `)` )? ( `,` $shape )? 
/// attr-dict-without-keyword diff --git a/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp b/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp index ed102db69dae..93ee94a120aa 100644 --- a/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp +++ b/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp @@ -1440,44 +1440,46 @@ void hlfir::MatmulTransposeOp::getEffects( } //===----------------------------------------------------------------------===// -// CShiftOp +// Array shifts: CShiftOp/EOShiftOp //===----------------------------------------------------------------------===// -llvm::LogicalResult hlfir::CShiftOp::verify() { - mlir::Value array = getArray(); +template +static llvm::LogicalResult verifyArrayShift(Op op) { + mlir::Value array = op.getArray(); fir::SequenceType arrayTy = mlir::cast( hlfir::getFortranElementOrSequenceType(array.getType())); llvm::ArrayRef inShape = arrayTy.getShape(); std::size_t arrayRank = inShape.size(); mlir::Type eleTy = arrayTy.getEleTy(); - hlfir::ExprType resultTy = mlir::cast(getResult().getType()); + hlfir::ExprType resultTy = + mlir::cast(op.getResult().getType()); llvm::ArrayRef resultShape = resultTy.getShape(); std::size_t resultRank = resultShape.size(); mlir::Type resultEleTy = resultTy.getEleTy(); - mlir::Value shift = getShift(); + mlir::Value shift = op.getShift(); mlir::Type shiftTy = hlfir::getFortranElementOrSequenceType(shift.getType()); - // TODO: turn allowCharacterLenMismatch into true. 
- if (auto match = areMatchingTypes(*this, eleTy, resultEleTy, - /*allowCharacterLenMismatch=*/false); + if (auto match = areMatchingTypes( + op, eleTy, resultEleTy, + /*allowCharacterLenMismatch=*/!useStrictIntrinsicVerifier); match.failed()) - return emitOpError( + return op.emitOpError( "input and output arrays should have the same element type"); if (arrayRank != resultRank) - return emitOpError("input and output arrays should have the same rank"); + return op.emitOpError("input and output arrays should have the same rank"); constexpr int64_t unknownExtent = fir::SequenceType::getUnknownExtent(); for (auto [inDim, resultDim] : llvm::zip(inShape, resultShape)) if (inDim != unknownExtent && resultDim != unknownExtent && inDim != resultDim) - return emitOpError( + return op.emitOpError( "output array's shape conflicts with the input array's shape"); int64_t dimVal = -1; - if (!getDim()) + if (!op.getDim()) dimVal = 1; - else if (auto dim = fir::getIntIfConstant(getDim())) + else if (auto dim = fir::getIntIfConstant(op.getDim())) dimVal = *dim; // The DIM argument may be statically invalid (e.g. exceed the @@ -1485,44 +1487,79 @@ llvm::LogicalResult hlfir::CShiftOp::verify() { // so avoid some checks unless useStrictIntrinsicVerifier is true. if (useStrictIntrinsicVerifier && dimVal != -1) { if (dimVal < 1) - return emitOpError("DIM must be >= 1"); + return op.emitOpError("DIM must be >= 1"); if (dimVal > static_cast(arrayRank)) - return emitOpError("DIM must be <= input array's rank"); + return op.emitOpError("DIM must be <= input array's rank"); } - if (auto shiftSeqTy = mlir::dyn_cast(shiftTy)) { - // SHIFT is an array. Verify the rank and the shape (if DIM is constant). 
- llvm::ArrayRef shiftShape = shiftSeqTy.getShape(); - std::size_t shiftRank = shiftShape.size(); - if (shiftRank != arrayRank - 1) - return emitOpError( - "SHIFT's rank must be 1 less than the input array's rank"); + // A helper lambda to verify the shape of the array types of + // certain operands of the array shift (e.g. the SHIFT and BOUNDARY operands). + auto verifyOperandTypeShape = [&](mlir::Type type, + llvm::Twine name) -> llvm::LogicalResult { + if (auto opndSeqTy = mlir::dyn_cast(type)) { + // The operand is an array. Verify the rank and the shape (if DIM is + // constant). + llvm::ArrayRef opndShape = opndSeqTy.getShape(); + std::size_t opndRank = opndShape.size(); + if (opndRank != arrayRank - 1) + return op.emitOpError( + name + "'s rank must be 1 less than the input array's rank"); - if (useStrictIntrinsicVerifier && dimVal != -1) { - // SHIFT's shape must be [d(1), d(2), ..., d(DIM-1), d(DIM+1), ..., d(n)], - // where [d(1), d(2), ..., d(n)] is the shape of the ARRAY. - int64_t arrayDimIdx = 0; - int64_t shiftDimIdx = 0; - for (auto shiftDim : shiftShape) { - if (arrayDimIdx == dimVal - 1) + if (useStrictIntrinsicVerifier && dimVal != -1) { + // The operand's shape must be + // [d(1), d(2), ..., d(DIM-1), d(DIM+1), ..., d(n)], + // where [d(1), d(2), ..., d(n)] is the shape of the ARRAY. 
+ int64_t arrayDimIdx = 0; + int64_t opndDimIdx = 0; + for (auto opndDim : opndShape) { + if (arrayDimIdx == dimVal - 1) + ++arrayDimIdx; + + if (inShape[arrayDimIdx] != unknownExtent && + opndDim != unknownExtent && inShape[arrayDimIdx] != opndDim) + return op.emitOpError("SHAPE(ARRAY)(" + + llvm::Twine(arrayDimIdx + 1) + + ") must be equal to SHAPE(" + name + ")(" + + llvm::Twine(opndDimIdx + 1) + + "): " + llvm::Twine(inShape[arrayDimIdx]) + + " != " + llvm::Twine(opndDim)); ++arrayDimIdx; - - if (inShape[arrayDimIdx] != unknownExtent && - shiftDim != unknownExtent && inShape[arrayDimIdx] != shiftDim) - return emitOpError("SHAPE(ARRAY)(" + llvm::Twine(arrayDimIdx + 1) + - ") must be equal to SHAPE(SHIFT)(" + - llvm::Twine(shiftDimIdx + 1) + - "): " + llvm::Twine(inShape[arrayDimIdx]) + - " != " + llvm::Twine(shiftDim)); - ++arrayDimIdx; - ++shiftDimIdx; + ++opndDimIdx; + } } } + return mlir::success(); + }; + + if (failed(verifyOperandTypeShape(shiftTy, "SHIFT"))) + return mlir::failure(); + + if constexpr (std::is_same_v) { + if (mlir::Value boundary = op.getBoundary()) { + mlir::Type boundaryTy = + hlfir::getFortranElementOrSequenceType(boundary.getType()); + if (auto match = areMatchingTypes( + op, eleTy, hlfir::getFortranElementType(boundaryTy), + /*allowCharacterLenMismatch=*/!useStrictIntrinsicVerifier); + match.failed()) + return op.emitOpError( + "ARRAY and BOUNDARY operands must have the same element type"); + if (failed(verifyOperandTypeShape(boundaryTy, "BOUNDARY"))) + return mlir::failure(); + } } return mlir::success(); } +//===----------------------------------------------------------------------===// +// CShiftOp +//===----------------------------------------------------------------------===// + +llvm::LogicalResult hlfir::CShiftOp::verify() { + return verifyArrayShift(*this); +} + void hlfir::CShiftOp::getEffects( llvm::SmallVectorImpl< mlir::SideEffects::EffectInstance> @@ -1530,6 +1567,21 @@ void hlfir::CShiftOp::getEffects( 
getIntrinsicEffects(getOperation(), effects); } +//===----------------------------------------------------------------------===// +// EOShiftOp +//===----------------------------------------------------------------------===// + +llvm::LogicalResult hlfir::EOShiftOp::verify() { + return verifyArrayShift(*this); +} + +void hlfir::EOShiftOp::getEffects( + llvm::SmallVectorImpl< + mlir::SideEffects::EffectInstance> + &effects) { + getIntrinsicEffects(getOperation(), effects); +} + //===----------------------------------------------------------------------===// // ReshapeOp //===----------------------------------------------------------------------===// @@ -1543,7 +1595,8 @@ llvm::LogicalResult hlfir::ReshapeOp::verify() { hlfir::getFortranElementOrSequenceType(array.getType())); if (auto match = areMatchingTypes( *this, hlfir::getFortranElementType(resultType), - arrayType.getElementType(), /*allowCharacterLenMismatch=*/true); + arrayType.getElementType(), + /*allowCharacterLenMismatch=*/!useStrictIntrinsicVerifier); match.failed()) return emitOpError("ARRAY and the result must have the same element type"); if (hlfir::isPolymorphicType(resultType) != @@ -1565,9 +1618,9 @@ llvm::LogicalResult hlfir::ReshapeOp::verify() { if (mlir::Value pad = getPad()) { auto padArrayType = mlir::cast( hlfir::getFortranElementOrSequenceType(pad.getType())); - if (auto match = areMatchingTypes(*this, arrayType.getElementType(), - padArrayType.getElementType(), - /*allowCharacterLenMismatch=*/true); + if (auto match = areMatchingTypes( + *this, arrayType.getElementType(), padArrayType.getElementType(), + /*allowCharacterLenMismatch=*/!useStrictIntrinsicVerifier); match.failed()) return emitOpError("ARRAY and PAD must be of the same type"); } diff --git a/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIRIntrinsics.cpp b/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIRIntrinsics.cpp index 3c29d6877e8d..e0167cc12b8a 100644 --- a/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIRIntrinsics.cpp +++ 
b/flang/lib/Optimizer/HLFIR/Transforms/LowerHLFIRIntrinsics.cpp @@ -469,33 +469,49 @@ struct MatmulTransposeOpConversion } }; -class CShiftOpConversion : public HlfirIntrinsicConversion { - using HlfirIntrinsicConversion::HlfirIntrinsicConversion; +// A converter for hlfir.cshift and hlfir.eoshift. +template +class ArrayShiftOpConversion : public HlfirIntrinsicConversion { + using HlfirIntrinsicConversion::HlfirIntrinsicConversion; + using HlfirIntrinsicConversion::lowerArguments; + using HlfirIntrinsicConversion::processReturnValue; + using typename HlfirIntrinsicConversion::IntrinsicArgument; llvm::LogicalResult - matchAndRewrite(hlfir::CShiftOp cshift, - mlir::PatternRewriter &rewriter) const override { - fir::FirOpBuilder builder{rewriter, cshift.getOperation()}; - const mlir::Location &loc = cshift->getLoc(); + matchAndRewrite(T op, mlir::PatternRewriter &rewriter) const override { + fir::FirOpBuilder builder{rewriter, op.getOperation()}; + const mlir::Location &loc = op->getLoc(); - llvm::SmallVector inArgs; - mlir::Value array = cshift.getArray(); + llvm::SmallVector inArgs; + llvm::StringRef intrinsicName{[]() { + if constexpr (std::is_same_v) + return "eoshift"; + else if constexpr (std::is_same_v) + return "cshift"; + else + llvm_unreachable("unsupported array shift"); + }()}; + + mlir::Value array = op.getArray(); inArgs.push_back({array, array.getType()}); - mlir::Value shift = cshift.getShift(); + mlir::Value shift = op.getShift(); inArgs.push_back({shift, shift.getType()}); - inArgs.push_back({cshift.getDim(), builder.getI32Type()}); + if constexpr (std::is_same_v) { + mlir::Value boundary = op.getBoundary(); + inArgs.push_back({boundary, boundary ? 
boundary.getType() : nullptr}); + } + inArgs.push_back({op.getDim(), builder.getI32Type()}); - auto *argLowering = fir::getIntrinsicArgumentLowering("cshift"); + auto *argLowering = fir::getIntrinsicArgumentLowering(intrinsicName); llvm::SmallVector args = - lowerArguments(cshift, inArgs, rewriter, argLowering); + lowerArguments(op, inArgs, rewriter, argLowering); - mlir::Type scalarResultType = - hlfir::getFortranElementType(cshift.getType()); + mlir::Type scalarResultType = hlfir::getFortranElementType(op.getType()); - auto [resultExv, mustBeFreed] = - fir::genIntrinsicCall(builder, loc, "cshift", scalarResultType, args); + auto [resultExv, mustBeFreed] = fir::genIntrinsicCall( + builder, loc, intrinsicName, scalarResultType, args); - processReturnValue(cshift, resultExv, mustBeFreed, builder, rewriter); + processReturnValue(op, resultExv, mustBeFreed, builder, rewriter); return mlir::success(); } }; @@ -547,7 +563,8 @@ public: AnyOpConversion, SumOpConversion, ProductOpConversion, TransposeOpConversion, CountOpConversion, DotProductOpConversion, MaxvalOpConversion, MinvalOpConversion, MinlocOpConversion, - MaxlocOpConversion, CShiftOpConversion, ReshapeOpConversion>(context); + MaxlocOpConversion, ArrayShiftOpConversion, + ArrayShiftOpConversion, ReshapeOpConversion>(context); // While conceptually this pass is performing dialect conversion, we use // pattern rewrites here instead of dialect conversion because this pass diff --git a/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp b/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp index b27c3a852694..fe12f49c655b 100644 --- a/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp +++ b/flang/lib/Optimizer/HLFIR/Transforms/SimplifyHLFIRIntrinsics.cpp @@ -10,6 +10,7 @@ // into the calling function. 
//===----------------------------------------------------------------------===// +#include "flang/Optimizer/Builder/Character.h" #include "flang/Optimizer/Builder/Complex.h" #include "flang/Optimizer/Builder/FIRBuilder.h" #include "flang/Optimizer/Builder/HLFIRTools.h" @@ -1269,64 +1270,91 @@ public: } }; -class CShiftConversion : public mlir::OpRewritePattern { +template +class ArrayShiftConversion : public mlir::OpRewritePattern { public: - using mlir::OpRewritePattern::OpRewritePattern; + // The implementation below only support CShiftOp and EOShiftOp. + static_assert(std::is_same_v || + std::is_same_v); + + using mlir::OpRewritePattern::OpRewritePattern; llvm::LogicalResult - matchAndRewrite(hlfir::CShiftOp cshift, - mlir::PatternRewriter &rewriter) const override { + matchAndRewrite(Op op, mlir::PatternRewriter &rewriter) const override { - hlfir::ExprType expr = mlir::dyn_cast(cshift.getType()); + hlfir::ExprType expr = mlir::dyn_cast(op.getType()); assert(expr && - "expected an expression type for the result of hlfir.cshift"); + "expected an expression type for the result of the array shift"); unsigned arrayRank = expr.getRank(); - // When it is a 1D CSHIFT, we may assume that the DIM argument + // When it is a 1D CSHIFT/EOSHIFT, we may assume that the DIM argument // (whether it is present or absent) is equal to 1, otherwise, // the program is illegal. 
int64_t dimVal = 1; if (arrayRank != 1) - if (mlir::Value dim = cshift.getDim()) { + if (mlir::Value dim = op.getDim()) { auto constDim = fir::getIntIfConstant(dim); if (!constDim) - return rewriter.notifyMatchFailure(cshift, - "Nonconstant DIM for CSHIFT"); + return rewriter.notifyMatchFailure( + op, "Nonconstant DIM for CSHIFT/EOSHIFT"); dimVal = *constDim; } if (dimVal <= 0 || dimVal > arrayRank) - return rewriter.notifyMatchFailure(cshift, "Invalid DIM for CSHIFT"); + return rewriter.notifyMatchFailure(op, "Invalid DIM for CSHIFT/EOSHIFT"); + + if constexpr (std::is_same_v) { + // TODO: the EOSHIFT inlining code is not ready to produce + // fir.if selecting between ARRAY and BOUNDARY (or the default + // boundary value), when they are expressions of type CHARACTER. + // This needs more work. + if (mlir::isa(expr.getEleTy())) { + if (!hlfir::Entity{op.getArray()}.isVariable()) + return rewriter.notifyMatchFailure( + op, "EOSHIFT with ARRAY being CHARACTER expression"); + if (op.getBoundary() && !hlfir::Entity{op.getBoundary()}.isVariable()) + return rewriter.notifyMatchFailure( + op, "EOSHIFT with BOUNDARY being CHARACTER expression"); + } + // TODO: selecting between ARRAY and BOUNDARY values with derived types + // need more work. + if (fir::isa_derived(expr.getEleTy())) + return rewriter.notifyMatchFailure(op, "EOSHIFT of derived type"); + } // When DIM==1 and the contiguity of the input array is not statically // known, try to exploit the fact that the leading dimension might be // contiguous. We can do this now using hlfir.eval_in_mem with // a dynamic check for the leading dimension contiguity. - // Otherwise, convert hlfir.cshift to hlfir.elemental. + // Otherwise, convert hlfir.cshift/eoshift to hlfir.elemental. // // Note that the hlfir.elemental can be inlined into other hlfir.elemental, // while hlfir.eval_in_mem prevents this, and we will end up creating // a temporary array for the result. 
We may need to come up with // a more sophisticated logic for picking the most efficient // representation. - hlfir::Entity array = hlfir::Entity{cshift.getArray()}; + hlfir::Entity array = hlfir::Entity{op.getArray()}; mlir::Type elementType = array.getFortranElementType(); if (dimVal == 1 && fir::isa_trivial(elementType) && - // genInMemCShift() only works for variables currently. + // genInMemArrayShift() only works for variables currently. array.isVariable()) - rewriter.replaceOp(cshift, genInMemCShift(rewriter, cshift, dimVal)); + rewriter.replaceOp(op, genInMemArrayShift(rewriter, op, dimVal)); else - rewriter.replaceOp(cshift, genElementalCShift(rewriter, cshift, dimVal)); + rewriter.replaceOp(op, genElementalArrayShift(rewriter, op, dimVal)); return mlir::success(); } private: - /// Generate MODULO(\p shiftVal, \p extent). + /// For CSHIFT, generate MODULO(\p shiftVal, \p extent). + /// For EOSHIFT, return \p shiftVal casted to \p calcType. static mlir::Value normalizeShiftValue(mlir::Location loc, fir::FirOpBuilder &builder, mlir::Value shiftVal, mlir::Value extent, mlir::Type calcType) { shiftVal = builder.createConvert(loc, calcType, shiftVal); + if constexpr (std::is_same_v) + return shiftVal; + extent = builder.createConvert(loc, calcType, extent); // Make sure that we do not divide by zero. When the dimension // has zero size, turn the extent into 1. Note that the computed @@ -1342,24 +1370,227 @@ private: return builder.createConvert(loc, calcType, shiftVal); } - /// Convert \p cshift into an hlfir.elemental using - /// the pre-computed constant \p dimVal. - static mlir::Operation *genElementalCShift(mlir::PatternRewriter &rewriter, - hlfir::CShiftOp cshift, - int64_t dimVal) { - using Fortran::common::maxRank; - hlfir::Entity shift = hlfir::Entity{cshift.getShift()}; - hlfir::Entity array = hlfir::Entity{cshift.getArray()}; + /// The indices computations for the array shifts are done using I64 type. 
+ /// For CSHIFT, all computations do not overflow signed and unsigned I64. + /// For EOSHIFT, some computations may involve negative shift values, + /// so using no-unsigned wrap flag would be incorrect. + static void setArithOverflowFlags(Op op, fir::FirOpBuilder &builder) { + if constexpr (std::is_same_v) + builder.setIntegerOverflowFlags(mlir::arith::IntegerOverflowFlags::nsw); + else + builder.setIntegerOverflowFlags(mlir::arith::IntegerOverflowFlags::nsw | + mlir::arith::IntegerOverflowFlags::nuw); + } - mlir::Location loc = cshift.getLoc(); - fir::FirOpBuilder builder{rewriter, cshift.getOperation()}; + /// Return the element type of the EOSHIFT boundary that may be omitted + /// statically or dynamically. This element type might be used + /// to generate MLIR where we have to select between the default + /// boundary value and the dynamically absent/present boundary value. + /// If the boundary has a type not defined in Table 16.4 in 16.9.77 + /// of F2023, then the return value is nullptr. + static mlir::Type getDefaultBoundaryValueType(mlir::Type elementType) { + // To be able to generate a "select" between the default boundary value + // and the dynamic boundary value, use BoxCharType for the CHARACTER + // cases. This might be a little bit inefficient, because we may + // create unnecessary tuples, but it simplifies the inlining code. + if (auto charTy = mlir::dyn_cast(elementType)) + return fir::BoxCharType::get(charTy.getContext(), charTy.getFKind()); + + if (mlir::isa(elementType) || + fir::isa_integer(elementType) || fir::isa_real(elementType) || + fir::isa_complex(elementType)) + return elementType; + + return nullptr; + } + + /// Generate the default boundary value as defined in Table 16.4 in 16.9.77 + /// of F2023. 
+ static mlir::Value genDefaultBoundary(mlir::Location loc, + fir::FirOpBuilder &builder, + mlir::Type elementType) { + assert(getDefaultBoundaryValueType(elementType) && + "default boundary value cannot be computed for the given type"); + if (mlir::isa(elementType)) { + // Create an empty CHARACTER of the same kind. The assignment + // of this empty CHARACTER into the result will add the padding + // if necessary. + fir::factory::CharacterExprHelper charHelper{builder, loc}; + mlir::Value zeroLen = builder.createIntegerConstant( + loc, builder.getCharacterLengthType(), 0); + fir::CharBoxValue emptyCharTemp = + charHelper.createCharacterTemp(elementType, zeroLen); + return charHelper.createEmbox(emptyCharTemp); + } + + return fir::factory::createZeroValue(builder, loc, elementType); + } + + /// \p entity represents the boundary operand of hlfir.eoshift. + /// This method generates a scalar boundary value fetched + /// from the boundary entity using \p indices (which may be empty, + /// if the boundary operand is scalar). + static mlir::Value loadEoshiftVal(mlir::Location loc, + fir::FirOpBuilder &builder, + hlfir::Entity entity, + mlir::ValueRange indices = {}) { + hlfir::Entity boundaryVal = + hlfir::loadElementAt(loc, builder, entity, indices); + + mlir::Type boundaryValTy = + getDefaultBoundaryValueType(entity.getFortranElementType()); + + // Boxed !fir.char with known LEN are loaded + // as raw references to !fir.char. + // We need to wrap them into the !fir.boxchar. + if (boundaryVal.isVariable() && boundaryValTy && + mlir::isa(boundaryValTy)) + return hlfir::genVariableBoxChar(loc, builder, boundaryVal); + return boundaryVal; + } + + /// This method generates a scalar boundary value for the given hlfir.eoshift + /// \p op that can be used to initialize cells of the result + /// if the scalar/array boundary operand is statically or dynamically + /// absent. The first result is the scalar boundary value. 
The second result + /// is a dynamic predicate indicating whether the scalar boundary value + /// should actually be used. + [[maybe_unused]] static std::pair + genScalarBoundaryForEOShift(mlir::Location loc, fir::FirOpBuilder &builder, + hlfir::EOShiftOp op) { + hlfir::Entity array{op.getArray()}; + mlir::Type elementType = array.getFortranElementType(); + + if (!op.getBoundary()) { + // Boundary operand is statically absent. + mlir::Value defaultVal = genDefaultBoundary(loc, builder, elementType); + mlir::Value boundaryIsScalarPred = builder.createBool(loc, true); + return {defaultVal, boundaryIsScalarPred}; + } + + hlfir::Entity boundary{op.getBoundary()}; + mlir::Type boundaryValTy = getDefaultBoundaryValueType(elementType); + + if (boundary.isScalar()) { + if (!boundaryValTy || !boundary.mayBeOptional()) { + // The boundary must be present. + mlir::Value boundaryVal = loadEoshiftVal(loc, builder, boundary); + mlir::Value boundaryIsScalarPred = builder.createBool(loc, true); + return {boundaryVal, boundaryIsScalarPred}; + } + + // Boundary is a scalar that may be dynamically absent. + // If boundary is not present dynamically, we must use the default + // value. 
+ assert(mlir::isa(boundary.getType())); + mlir::Value isPresentPred = + fir::IsPresentOp::create(builder, loc, builder.getI1Type(), boundary); + mlir::Value boundaryVal = + builder + .genIfOp(loc, {boundaryValTy}, isPresentPred, + /*withElseRegion=*/true) + .genThen([&]() { + mlir::Value boundaryVal = + loadEoshiftVal(loc, builder, boundary); + fir::ResultOp::create(builder, loc, boundaryVal); + }) + .genElse([&]() { + mlir::Value defaultVal = + genDefaultBoundary(loc, builder, elementType); + fir::ResultOp::create(builder, loc, defaultVal); + }) + .getResults()[0]; + mlir::Value boundaryIsScalarPred = builder.createBool(loc, true); + return {boundaryVal, boundaryIsScalarPred}; + } + if (!boundaryValTy || !boundary.mayBeOptional()) { + // The boundary must be present + mlir::Value boundaryIsScalarPred = builder.createBool(loc, false); + return {nullptr, boundaryIsScalarPred}; + } + + // Boundary is an array that may be dynamically absent. + mlir::Value defaultVal = genDefaultBoundary(loc, builder, elementType); + mlir::Value isPresentPred = + fir::IsPresentOp::create(builder, loc, builder.getI1Type(), boundary); + // If the array is present, then boundaryIsScalarPred must be equal + // to false, otherwise, it should be true. + mlir::Value trueVal = builder.createBool(loc, true); + mlir::Value falseVal = builder.createBool(loc, false); + mlir::Value boundaryIsScalarPred = mlir::arith::SelectOp::create( + builder, loc, isPresentPred, falseVal, trueVal); + return {defaultVal, boundaryIsScalarPred}; + } + + /// Generate code that produces the final boundary value to be assigned + /// to the result of hlfir.eoshift \p op. \p precomputedScalarBoundary + /// specifies the scalar boundary value pre-computed before the elemental + /// or the assignment loop. If it is nullptr, then the boundary operand + /// of \p op must be a present array. \p boundaryIsScalarPred is a dynamic + /// predicate that is true, when the pre-computed scalar value must be used. 
+ /// \p oneBasedIndices specify the indices to address into the boundary + /// array - they may be empty, if the boundary is scalar. + [[maybe_unused]] static mlir::Value selectBoundaryValue( + mlir::Location loc, fir::FirOpBuilder &builder, hlfir::EOShiftOp op, + mlir::Value precomputedScalarBoundary, mlir::Value boundaryIsScalarPred, + mlir::ValueRange oneBasedIndices) { + // Boundary is statically absent: a default value has been precomputed. + if (!op.getBoundary()) + return precomputedScalarBoundary; + + // Boundary is statically present and is a scalar: boundary does not depend + // upon the indices and so it has been precomputed. + hlfir::Entity boundary{op.getBoundary()}; + if (boundary.isScalar()) + return precomputedScalarBoundary; + + // Boundary is statically present and is an array: if the scalar + // boundary has not been precomputed, this means that the data type + // of the shifted values does not provide a way to compute + // the default boundary value, so the array boundary must be dynamically + // present, and we can load the boundary values from it. + bool mustBePresent = !precomputedScalarBoundary; + if (mustBePresent) + return loadEoshiftVal(loc, builder, boundary, oneBasedIndices); + + // The array boundary may be dynamically absent. + // In this case, precomputedScalarBoundary is a pre-computed scalar + // boundary value that has to be used if boundaryIsScalarPred + // is true, otherwise, the boundary value has to be loaded + // from the boundary array. 
+ mlir::Type boundaryValTy = precomputedScalarBoundary.getType(); + mlir::Value newBoundaryVal = + builder + .genIfOp(loc, {boundaryValTy}, boundaryIsScalarPred, + /*withElseRegion=*/true) + .genThen([&]() { + fir::ResultOp::create(builder, loc, precomputedScalarBoundary); + }) + .genElse([&]() { + mlir::Value elem = + loadEoshiftVal(loc, builder, boundary, oneBasedIndices); + fir::ResultOp::create(builder, loc, elem); + }) + .getResults()[0]; + return newBoundaryVal; + } + + /// Convert \p op into an hlfir.elemental using + /// the pre-computed constant \p dimVal. + static mlir::Operation * + genElementalArrayShift(mlir::PatternRewriter &rewriter, Op op, + int64_t dimVal) { + using Fortran::common::maxRank; + hlfir::Entity shift = hlfir::Entity{op.getShift()}; + hlfir::Entity array = hlfir::Entity{op.getArray()}; + + mlir::Location loc = op.getLoc(); + fir::FirOpBuilder builder{rewriter, op.getOperation()}; // The new index computation involves MODULO, which is not implemented // for IndexType, so use I64 instead. mlir::Type calcType = builder.getI64Type(); - // All the indices arithmetic used below does not overflow - // signed and unsigned I64. - builder.setIntegerOverflowFlags(mlir::arith::IntegerOverflowFlags::nsw | - mlir::arith::IntegerOverflowFlags::nuw); + // Set the indices arithmetic overflow flags. + setArithOverflowFlags(op, builder); mlir::Value arrayShape = hlfir::genShape(loc, builder, array); llvm::SmallVector arrayExtents = @@ -1374,6 +1605,17 @@ private: shiftVal = normalizeShiftValue(loc, builder, shiftVal, shiftDimExtent, calcType); } + // The boundary operand of hlfir.eoshift may be statically or + // dynamically absent. + // In both cases, it is assumed to be a scalar with the value + // corresponding to the array element type. + // boundaryIsScalarPred is a dynamic predicate that identifies + // these cases. If boundaryIsScalarPred is dynamicaly false, + // then the boundary operand must be a present array. 
+ mlir::Value boundaryVal, boundaryIsScalarPred; + if constexpr (std::is_same_v) + std::tie(boundaryVal, boundaryIsScalarPred) = + genScalarBoundaryForEOShift(loc, builder, op); auto genKernel = [&](mlir::Location loc, fir::FirOpBuilder &builder, mlir::ValueRange inputIndices) -> hlfir::Entity { @@ -1394,34 +1636,84 @@ private: shiftVal = normalizeShiftValue(loc, builder, shiftVal, shiftDimExtent, calcType); } + if constexpr (std::is_same_v) { + llvm::SmallVector boundaryIndices{indices}; + boundaryIndices.erase(boundaryIndices.begin() + dimVal - 1); + boundaryVal = + selectBoundaryValue(loc, builder, op, boundaryVal, + boundaryIsScalarPred, boundaryIndices); + } - // Element i of the result (1-based) is element - // 'MODULO(i + SH - 1, SIZE(ARRAY,DIM)) + 1' (1-based) of the original - // ARRAY (or its section, when ARRAY is not a vector). + if constexpr (std::is_same_v) { + // EOSHIFT: + // Element i of the result (1-based) is the element of the original + // array (or its section, when ARRAY is not a vector) with index + // (i + SH), if (1 <= i + SH <= SIZE(ARRAY,DIM)), otherwise + // it is the BOUNDARY value. 
+ mlir::Value index = + builder.createConvert(loc, calcType, inputIndices[dimVal - 1]); + mlir::arith::IntegerOverflowFlags savedFlags = + builder.getIntegerOverflowFlags(); + builder.setIntegerOverflowFlags(mlir::arith::IntegerOverflowFlags::nsw); + mlir::Value indexPlusShift = + mlir::arith::AddIOp::create(builder, loc, index, shiftVal); + builder.setIntegerOverflowFlags(savedFlags); + mlir::Value one = builder.createIntegerConstant(loc, calcType, 1); + mlir::Value cmp1 = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::sge, indexPlusShift, one); + mlir::Value cmp2 = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::sle, indexPlusShift, + shiftDimExtent); + mlir::Value loadFromArray = + mlir::arith::AndIOp::create(builder, loc, cmp1, cmp2); + mlir::Type boundaryValTy = boundaryVal.getType(); + mlir::Value result = + builder + .genIfOp(loc, {boundaryValTy}, loadFromArray, + /*withElseRegion=*/true) + .genThen([&]() { + indices[dimVal - 1] = builder.createConvert( + loc, builder.getIndexType(), indexPlusShift); + ; + mlir::Value elem = + loadEoshiftVal(loc, builder, array, indices); + fir::ResultOp::create(builder, loc, elem); + }) + .genElse( + [&]() { fir::ResultOp::create(builder, loc, boundaryVal); }) + .getResults()[0]; + return hlfir::Entity{result}; + } else { + // CSHIFT: + // Element i of the result (1-based) is element + // 'MODULO(i + SH - 1, SIZE(ARRAY,DIM)) + 1' (1-based) of the original + // ARRAY (or its section, when ARRAY is not a vector). - // Compute the index into the original array using the normalized - // shift value, which satisfies (SH >= 0 && SH < SIZE(ARRAY,DIM)): - // newIndex = - // i + ((i <= SIZE(ARRAY,DIM) - SH) ? SH : SH - SIZE(ARRAY,DIM)) - // - // Such index computation allows for further loop vectorization - // in LLVM. 
- mlir::Value wrapBound = - mlir::arith::SubIOp::create(builder, loc, shiftDimExtent, shiftVal); - mlir::Value adjustedShiftVal = - mlir::arith::SubIOp::create(builder, loc, shiftVal, shiftDimExtent); - mlir::Value index = - builder.createConvert(loc, calcType, inputIndices[dimVal - 1]); - mlir::Value wrapCheck = mlir::arith::CmpIOp::create( - builder, loc, mlir::arith::CmpIPredicate::sle, index, wrapBound); - mlir::Value actualShift = mlir::arith::SelectOp::create( - builder, loc, wrapCheck, shiftVal, adjustedShiftVal); - mlir::Value newIndex = - mlir::arith::AddIOp::create(builder, loc, index, actualShift); - newIndex = builder.createConvert(loc, builder.getIndexType(), newIndex); - indices[dimVal - 1] = newIndex; - hlfir::Entity element = hlfir::getElementAt(loc, builder, array, indices); - return hlfir::loadTrivialScalar(loc, builder, element); + // Compute the index into the original array using the normalized + // shift value, which satisfies (SH >= 0 && SH < SIZE(ARRAY,DIM)): + // newIndex = + // i + ((i <= SIZE(ARRAY,DIM) - SH) ? SH : SH - SIZE(ARRAY,DIM)) + // + // Such index computation allows for further loop vectorization + // in LLVM. 
+ mlir::Value wrapBound = + mlir::arith::SubIOp::create(builder, loc, shiftDimExtent, shiftVal); + mlir::Value adjustedShiftVal = + mlir::arith::SubIOp::create(builder, loc, shiftVal, shiftDimExtent); + mlir::Value index = + builder.createConvert(loc, calcType, inputIndices[dimVal - 1]); + mlir::Value wrapCheck = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::sle, index, wrapBound); + mlir::Value actualShift = mlir::arith::SelectOp::create( + builder, loc, wrapCheck, shiftVal, adjustedShiftVal); + mlir::Value newIndex = + mlir::arith::AddIOp::create(builder, loc, index, actualShift); + newIndex = builder.createConvert(loc, builder.getIndexType(), newIndex); + indices[dimVal - 1] = newIndex; + hlfir::Entity element = + hlfir::getElementAt(loc, builder, array, indices); + return hlfir::loadTrivialScalar(loc, builder, element); + } }; mlir::Type elementType = array.getFortranElementType(); @@ -1429,19 +1721,42 @@ private: loc, builder, elementType, arrayShape, typeParams, genKernel, /*isUnordered=*/true, array.isPolymorphic() ? static_cast(array) : nullptr, - cshift.getResult().getType()); + op.getResult().getType()); return elementalOp.getOperation(); } - /// Convert \p cshift into an hlfir.eval_in_mem using the pre-computed + /// Convert \p op into an hlfir.eval_in_mem using the pre-computed /// constant \p dimVal. - /// The converted code looks like this: - /// do i=1,SH - /// result(i + (SIZE(ARRAY,DIM) - SH)) = array(i) + /// The converted code for CSHIFT looks like this: + /// DEST_OFFSET = SIZE(ARRAY,DIM) - SH + /// COPY_END1 = SH + /// do i=1,COPY_END1 + /// result(i + DEST_OFFSET) = array(i) /// end - /// do i=1,SIZE(ARRAY,DIM) - SH - /// result(i) = array(i + SH) + /// SOURCE_OFFSET = SH + /// COPY_END2 = SIZE(ARRAY,DIM) - SH + /// do i=1,COPY_END2 + /// result(i) = array(i + SOURCE_OFFSET) /// end + /// Where SH is the normalized shift value, which satisfies + /// (SH >= 0 && SH < SIZE(ARRAY,DIM)). 
+ /// + /// The converted code for EOSHIFT looks like this: + /// EXTENT = SIZE(ARRAY,DIM) + /// DEST_OFFSET = SH < 0 ? -SH : 0 + /// SOURCE_OFFSET = SH < 0 ? 0 : SH + /// COPY_END = SH < 0 ? + /// (-EXTENT > SH ? 0 : EXTENT + SH) : + /// (EXTENT < SH ? 0 : EXTENT - SH) + /// do i=1,COPY_END + /// result(i + DEST_OFFSET) = array(i + SOURCE_OFFSET) + /// end + /// INIT_END = EXTENT - COPY_END + /// INIT_OFFSET = SH < 0 ? 0 : COPY_END + /// do i=1,INIT_END + /// result(i + INIT_OFFSET) = BOUNDARY + /// end + /// Where SH is the original shift value. /// /// When \p dimVal is 1, we generate the same code twice /// under a dynamic check for the contiguity of the leading @@ -1450,24 +1765,21 @@ private: /// as a contiguous slice of the original array. /// This allows recognizing the above two loops as memcpy /// loop idioms in LLVM. - static mlir::Operation *genInMemCShift(mlir::PatternRewriter &rewriter, - hlfir::CShiftOp cshift, - int64_t dimVal) { + static mlir::Operation *genInMemArrayShift(mlir::PatternRewriter &rewriter, + Op op, int64_t dimVal) { using Fortran::common::maxRank; - hlfir::Entity shift = hlfir::Entity{cshift.getShift()}; - hlfir::Entity array = hlfir::Entity{cshift.getArray()}; + hlfir::Entity shift = hlfir::Entity{op.getShift()}; + hlfir::Entity array = hlfir::Entity{op.getArray()}; assert(array.isVariable() && "array must be a variable"); assert(!array.isPolymorphic() && - "genInMemCShift does not support polymorphic types"); - mlir::Location loc = cshift.getLoc(); - fir::FirOpBuilder builder{rewriter, cshift.getOperation()}; + "genInMemArrayShift does not support polymorphic types"); + mlir::Location loc = op.getLoc(); + fir::FirOpBuilder builder{rewriter, op.getOperation()}; // The new index computation involves MODULO, which is not implemented // for IndexType, so use I64 instead. mlir::Type calcType = builder.getI64Type(); - // All the indices arithmetic used below does not overflow - // signed and unsigned I64. 
- builder.setIntegerOverflowFlags(mlir::arith::IntegerOverflowFlags::nsw | - mlir::arith::IntegerOverflowFlags::nuw); + // Set the indices arithmetic overflow flags. + setArithOverflowFlags(op, builder); mlir::Value arrayShape = hlfir::genShape(loc, builder, array); llvm::SmallVector arrayExtents = @@ -1482,10 +1794,20 @@ private: shiftVal = normalizeShiftValue(loc, builder, shiftVal, shiftDimExtent, calcType); } + // The boundary operand of hlfir.eoshift may be statically or + // dynamically absent. + // In both cases, it is assumed to be a scalar with the value + // corresponding to the array element type. + // boundaryIsScalarPred is a dynamic predicate that identifies + // these cases. If boundaryIsScalarPred is dynamicaly false, + // then the boundary operand must be a present array. + mlir::Value boundaryVal, boundaryIsScalarPred; + if constexpr (std::is_same_v) + std::tie(boundaryVal, boundaryIsScalarPred) = + genScalarBoundaryForEOShift(loc, builder, op); hlfir::EvaluateInMemoryOp evalOp = hlfir::EvaluateInMemoryOp::create( - builder, loc, mlir::cast(cshift.getType()), - arrayShape); + builder, loc, mlir::cast(op.getType()), arrayShape); builder.setInsertionPointToStart(&evalOp.getBody().front()); mlir::Value resultArray = evalOp.getMemory(); @@ -1499,11 +1821,12 @@ private: // (if any). If exposeContiguity is true, the array's section // array(s(1), ..., s(dim-1), :, s(dim+1), ..., s(n)) is represented // as a contiguous 1D array. - // shiftVal is the normalized shift value that satisfies (SH >= 0 && SH < - // SIZE(ARRAY,DIM)). + // For CSHIFT, shiftVal is the normalized shift value that satisfies + // (SH >= 0 && SH < SIZE(ARRAY,DIM)). 
// auto genDimensionShift = [&](mlir::Location loc, fir::FirOpBuilder &builder, - mlir::Value shiftVal, bool exposeContiguity, + mlir::Value shiftVal, mlir::Value boundary, + bool exposeContiguity, mlir::ValueRange oneBasedIndices) -> llvm::SmallVector { // Create a vector of indices (s(1), ..., s(dim-1), nullptr, s(dim+1), @@ -1536,63 +1859,143 @@ private: srcIndices.resize(1); } - // Copy first portion of the array: - // do i=1,SH - // result(i + (SIZE(ARRAY,DIM) - SH)) = array(i) - // end - auto genAssign1 = [&](mlir::Location loc, fir::FirOpBuilder &builder, - mlir::ValueRange index, - mlir::ValueRange reductionArgs) + // genCopy lambda generates the body of a generic copy loop. + // do i=1,COPY_END + // result(i + DEST_OFFSET) = array(i + SOURCE_OFFSET) + // end + // + // It is parameterized by DEST_OFFSET and SOURCE_OFFSET. + mlir::Value dstOffset, srcOffset; + auto genCopy = [&](mlir::Location loc, fir::FirOpBuilder &builder, + mlir::ValueRange index, mlir::ValueRange reductionArgs) -> llvm::SmallVector { assert(index.size() == 1 && "expected single loop"); mlir::Value srcIndex = builder.createConvert(loc, calcType, index[0]); + mlir::Value dstIndex = srcIndex; + if (srcOffset) + srcIndex = + mlir::arith::AddIOp::create(builder, loc, srcIndex, srcOffset); srcIndices[dimVal - 1] = srcIndex; hlfir::Entity srcElementValue = hlfir::loadElementAt(loc, builder, srcArray, srcIndices); - mlir::Value dstIndex = mlir::arith::AddIOp::create( - builder, loc, srcIndex, - mlir::arith::SubIOp::create(builder, loc, shiftDimExtent, - shiftVal)); + if (dstOffset) + dstIndex = + mlir::arith::AddIOp::create(builder, loc, dstIndex, dstOffset); dstIndices[dimVal - 1] = dstIndex; hlfir::Entity dstElement = hlfir::getElementAt( loc, builder, hlfir::Entity{resultArray}, dstIndices); hlfir::AssignOp::create(builder, loc, srcElementValue, dstElement); + // Reset the external parameters' values to make sure + // they are properly updated between the lambda calls. 
+ // WARNING: if genLoopNestWithReductions() calls the lambda + // multiple times, this is going to be a problem. + dstOffset = nullptr; + srcOffset = nullptr; return {}; }; - // Generate the first loop. - hlfir::genLoopNestWithReductions(loc, builder, {shiftVal}, - /*reductionInits=*/{}, genAssign1, - /*isUnordered=*/true); + if constexpr (std::is_same_v) { + // Copy first portion of the array: + // DEST_OFFSET = SIZE(ARRAY,DIM) - SH + // COPY_END1 = SH + // do i=1,COPY_END1 + // result(i + DEST_OFFSET) = array(i) + // end + dstOffset = + mlir::arith::SubIOp::create(builder, loc, shiftDimExtent, shiftVal); + srcOffset = nullptr; + hlfir::genLoopNestWithReductions(loc, builder, {shiftVal}, + /*reductionInits=*/{}, genCopy, + /*isUnordered=*/true); - // Copy second portion of the array: - // do i=1,SIZE(ARRAY,DIM)-SH - // result(i) = array(i + SH) - // end - auto genAssign2 = [&](mlir::Location loc, fir::FirOpBuilder &builder, - mlir::ValueRange index, - mlir::ValueRange reductionArgs) - -> llvm::SmallVector { - assert(index.size() == 1 && "expected single loop"); - mlir::Value dstIndex = builder.createConvert(loc, calcType, index[0]); - mlir::Value srcIndex = - mlir::arith::AddIOp::create(builder, loc, dstIndex, shiftVal); - srcIndices[dimVal - 1] = srcIndex; - hlfir::Entity srcElementValue = - hlfir::loadElementAt(loc, builder, srcArray, srcIndices); - dstIndices[dimVal - 1] = dstIndex; - hlfir::Entity dstElement = hlfir::getElementAt( - loc, builder, hlfir::Entity{resultArray}, dstIndices); - hlfir::AssignOp::create(builder, loc, srcElementValue, dstElement); - return {}; - }; + // Copy second portion of the array: + // SOURCE_OFFSET = SH + // COPY_END2 = SIZE(ARRAY,DIM) - SH + // do i=1,COPY_END2 + // result(i) = array(i + SOURCE_OFFSET) + // end + mlir::Value bound = + mlir::arith::SubIOp::create(builder, loc, shiftDimExtent, shiftVal); + dstOffset = nullptr; + srcOffset = shiftVal; + hlfir::genLoopNestWithReductions(loc, builder, {bound}, + 
/*reductionInits=*/{}, genCopy, + /*isUnordered=*/true); + } else { + // Do the copy: + // EXTENT = SIZE(ARRAY,DIM) + // DEST_OFFSET = SH < 0 ? -SH : 0 + // SOURCE_OFFSET = SH < 0 ? 0 : SH + // COPY_END = SH < 0 ? + // (-EXTENT > SH ? 0 : EXTENT + SH) : + // (EXTENT < SH ? 0 : EXTENT - SH) + // do i=1,COPY_END + // result(i + DEST_OFFSET) = array(i + SOURCE_OFFSET) + // end + mlir::arith::IntegerOverflowFlags savedFlags = + builder.getIntegerOverflowFlags(); + builder.setIntegerOverflowFlags(mlir::arith::IntegerOverflowFlags::nsw); - // Generate the second loop. - mlir::Value bound = - mlir::arith::SubIOp::create(builder, loc, shiftDimExtent, shiftVal); - hlfir::genLoopNestWithReductions(loc, builder, {bound}, - /*reductionInits=*/{}, genAssign2, - /*isUnordered=*/true); + mlir::Value zero = builder.createIntegerConstant(loc, calcType, 0); + mlir::Value isNegativeShift = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::slt, shiftVal, zero); + mlir::Value shiftNeg = + mlir::arith::SubIOp::create(builder, loc, zero, shiftVal); + dstOffset = mlir::arith::SelectOp::create(builder, loc, isNegativeShift, + shiftNeg, zero); + srcOffset = mlir::arith::SelectOp::create(builder, loc, isNegativeShift, + zero, shiftVal); + mlir::Value extentNeg = + mlir::arith::SubIOp::create(builder, loc, zero, shiftDimExtent); + mlir::Value extentPlusShift = + mlir::arith::AddIOp::create(builder, loc, shiftDimExtent, shiftVal); + mlir::Value extentNegShiftCmp = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::sgt, extentNeg, shiftVal); + mlir::Value negativeShiftBound = mlir::arith::SelectOp::create( + builder, loc, extentNegShiftCmp, zero, extentPlusShift); + mlir::Value extentMinusShift = + mlir::arith::SubIOp::create(builder, loc, shiftDimExtent, shiftVal); + mlir::Value extentShiftCmp = mlir::arith::CmpIOp::create( + builder, loc, mlir::arith::CmpIPredicate::slt, shiftDimExtent, + shiftVal); + mlir::Value positiveShiftBound = 
mlir::arith::SelectOp::create( + builder, loc, extentShiftCmp, zero, extentMinusShift); + mlir::Value copyEnd = mlir::arith::SelectOp::create( + builder, loc, isNegativeShift, negativeShiftBound, + positiveShiftBound); + hlfir::genLoopNestWithReductions(loc, builder, {copyEnd}, + /*reductionInits=*/{}, genCopy, + /*isUnordered=*/true); + + // Do the init: + // INIT_END = EXTENT - COPY_END + // INIT_OFFSET = SH < 0 ? 0 : COPY_END + // do i=1,INIT_END + // result(i + INIT_OFFSET) = BOUNDARY + // end + assert(boundary && "boundary cannot be null"); + mlir::Value initEnd = + mlir::arith::SubIOp::create(builder, loc, shiftDimExtent, copyEnd); + mlir::Value initOffset = mlir::arith::SelectOp::create( + builder, loc, isNegativeShift, zero, copyEnd); + auto genInit = [&](mlir::Location loc, fir::FirOpBuilder &builder, + mlir::ValueRange index, + mlir::ValueRange reductionArgs) + -> llvm::SmallVector { + mlir::Value dstIndex = builder.createConvert(loc, calcType, index[0]); + dstIndex = + mlir::arith::AddIOp::create(builder, loc, dstIndex, initOffset); + dstIndices[dimVal - 1] = dstIndex; + hlfir::Entity dstElement = hlfir::getElementAt( + loc, builder, hlfir::Entity{resultArray}, dstIndices); + hlfir::AssignOp::create(builder, loc, boundary, dstElement); + return {}; + }; + hlfir::genLoopNestWithReductions(loc, builder, {initEnd}, + /*reductionInits=*/{}, genInit, + /*isUnordered=*/true); + builder.setIntegerOverflowFlags(savedFlags); + } return {}; }; @@ -1614,6 +2017,10 @@ private: shiftVal = normalizeShiftValue(loc, builder, shiftVal, shiftDimExtent, calcType); } + if constexpr (std::is_same_v) + boundaryVal = + selectBoundaryValue(loc, builder, op, boundaryVal, + boundaryIsScalarPred, oneBasedIndices); // If we can fetch the byte stride of the leading dimension, // and the byte size of the element, then we can generate @@ -1635,8 +2042,8 @@ private: } if (array.isSimplyContiguous() || !elemSize || !stride) { - genDimensionShift(loc, builder, shiftVal, 
/*exposeContiguity=*/false, - oneBasedIndices); + genDimensionShift(loc, builder, shiftVal, boundaryVal, + /*exposeContiguity=*/false, oneBasedIndices); return {}; } @@ -1644,11 +2051,11 @@ private: builder, loc, mlir::arith::CmpIPredicate::eq, elemSize, stride); builder.genIfOp(loc, {}, isContiguous, /*withElseRegion=*/true) .genThen([&]() { - genDimensionShift(loc, builder, shiftVal, /*exposeContiguity=*/true, - oneBasedIndices); + genDimensionShift(loc, builder, shiftVal, boundaryVal, + /*exposeContiguity=*/true, oneBasedIndices); }) .genElse([&]() { - genDimensionShift(loc, builder, shiftVal, + genDimensionShift(loc, builder, shiftVal, boundaryVal, /*exposeContiguity=*/false, oneBasedIndices); }); @@ -2339,7 +2746,8 @@ public: mlir::RewritePatternSet patterns(context); patterns.insert(context); patterns.insert>(context); - patterns.insert(context); + patterns.insert>(context); + patterns.insert>(context); patterns.insert>(context); patterns.insert>(context); diff --git a/flang/lib/Optimizer/Support/Utils.cpp b/flang/lib/Optimizer/Support/Utils.cpp index 5d663e28336c..c71642ce4e80 100644 --- a/flang/lib/Optimizer/Support/Utils.cpp +++ b/flang/lib/Optimizer/Support/Utils.cpp @@ -50,3 +50,74 @@ std::optional> fir::getComponentLowerBoundsIfNonDefault( return componentInfo.getLowerBounds(); return std::nullopt; } + +mlir::LLVM::ConstantOp +fir::genConstantIndex(mlir::Location loc, mlir::Type ity, + mlir::ConversionPatternRewriter &rewriter, + std::int64_t offset) { + auto cattr = rewriter.getI64IntegerAttr(offset); + return rewriter.create(loc, ity, cattr); +} + +mlir::Value +fir::computeElementDistance(mlir::Location loc, mlir::Type llvmObjectType, + mlir::Type idxTy, + mlir::ConversionPatternRewriter &rewriter, + const mlir::DataLayout &dataLayout) { + llvm::TypeSize size = dataLayout.getTypeSize(llvmObjectType); + unsigned short alignment = dataLayout.getTypeABIAlignment(llvmObjectType); + std::int64_t distance = llvm::alignTo(size, alignment); + return 
fir::genConstantIndex(loc, idxTy, rewriter, distance); +} + +mlir::Value +fir::genAllocationScaleSize(mlir::Location loc, mlir::Type dataTy, + mlir::Type ity, + mlir::ConversionPatternRewriter &rewriter) { + auto seqTy = mlir::dyn_cast(dataTy); + fir::SequenceType::Extent constSize = 1; + if (seqTy) { + int constRows = seqTy.getConstantRows(); + const fir::SequenceType::ShapeRef &shape = seqTy.getShape(); + if (constRows != static_cast(shape.size())) { + for (auto extent : shape) { + if (constRows-- > 0) + continue; + if (extent != fir::SequenceType::getUnknownExtent()) + constSize *= extent; + } + } + } + + if (constSize != 1) { + mlir::Value constVal{ + fir::genConstantIndex(loc, ity, rewriter, constSize).getResult()}; + return constVal; + } + return nullptr; +} + +mlir::Value fir::integerCast(const fir::LLVMTypeConverter &converter, + mlir::Location loc, + mlir::ConversionPatternRewriter &rewriter, + mlir::Type ty, mlir::Value val, bool fold) { + auto valTy = val.getType(); + // If the value was not yet lowered, lower its type so that it can + // be used in getPrimitiveTypeSizeInBits. 
+ if (!mlir::isa(valTy)) + valTy = converter.convertType(valTy); + auto toSize = mlir::LLVM::getPrimitiveTypeSizeInBits(ty); + auto fromSize = mlir::LLVM::getPrimitiveTypeSizeInBits(valTy); + if (fold) { + if (toSize < fromSize) + return rewriter.createOrFold(loc, ty, val); + if (toSize > fromSize) + return rewriter.createOrFold(loc, ty, val); + } else { + if (toSize < fromSize) + return rewriter.create(loc, ty, val); + if (toSize > fromSize) + return rewriter.create(loc, ty, val); + } + return val; +} diff --git a/flang/lib/Support/Fortran-features.cpp b/flang/lib/Support/Fortran-features.cpp index 6a61149e9700..4a6fb8d75a13 100644 --- a/flang/lib/Support/Fortran-features.cpp +++ b/flang/lib/Support/Fortran-features.cpp @@ -90,6 +90,7 @@ LanguageFeatureControl::LanguageFeatureControl() { disable_.set(LanguageFeature::OldStyleParameter); // Possibly an accidental "feature" of nvfortran. disable_.set(LanguageFeature::AssumedRankPassedToNonAssumedRank); + disable_.set(LanguageFeature::Coarray); // These warnings are enabled by default, but only because they used // to be unconditional. 
TODO: prune this list warnLanguage_.set(LanguageFeature::ExponentMatchingKindParam); diff --git a/flang/test/Driver/fopenmp-version.F90 b/flang/test/Driver/fopenmp-version.F90 index c2866561461b..59406d3dd32c 100644 --- a/flang/test/Driver/fopenmp-version.F90 +++ b/flang/test/Driver/fopenmp-version.F90 @@ -22,4 +22,8 @@ !RUN: not %flang -c -fopenmp -fopenmp-version=29 %s 2>&1 | FileCheck --check-prefix=ERR-BAD %s -!ERR-BAD: error: '29' is not a valid OpenMP version in '-fopenmp-version=29', valid versions are 31, 40, 45, 50, 51, 52, 60 +!ERR-BAD: error: '29' is not a valid OpenMP version in '-fopenmp-version=29', valid versions are 31, 40, 45, 50, 51, 52, 60, 61 + +!RUN: %flang -c -fopenmp -fopenmp-version=61 %s 2>&1 | FileCheck --check-prefix=FUTURE %s + +!FUTURE: The specification for OpenMP version 61 is still under development; the syntax and semantics of new features may be subject to change diff --git a/flang/test/Fir/omp_target_allocmem_freemem.fir b/flang/test/Fir/omp_target_allocmem_freemem.fir new file mode 100644 index 000000000000..03eb94acb1ac --- /dev/null +++ b/flang/test/Fir/omp_target_allocmem_freemem.fir @@ -0,0 +1,294 @@ +// RUN: %flang_fc1 -emit-llvm %s -o - | FileCheck %s + +// UNSUPPORTED: system-windows +// Disabled on 32-bit targets due to the additional `trunc` opcodes required +// UNSUPPORTED: target-x86 +// UNSUPPORTED: target=sparc-{{.*}} +// UNSUPPORTED: target=sparcel-{{.*}} + +// CHECK-LABEL: define void @omp_target_allocmem_scalar_nonchar() { +// CHECK-NEXT: [[TMP1:%.*]] = call ptr @omp_target_alloc(i64 4, i32 0) +// CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +// CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr +// CHECK-NEXT: call void @omp_target_free(ptr [[TMP3]], i32 0) +// CHECK-NEXT: ret void +func.func @omp_target_allocmem_scalar_nonchar() -> () { + %device = arith.constant 0 : i32 + %1 = omp.target_allocmem %device : i32, i32 + omp.target_freemem %device, %1 : i32, i64 + return +} + +// CHECK-LABEL: define 
void @omp_target_allocmem_scalars_nonchar() { +// CHECK-NEXT: [[TMP1:%.*]] = call ptr @omp_target_alloc(i64 400, i32 0) +// CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +// CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr +// CHECK-NEXT: call void @omp_target_free(ptr [[TMP3]], i32 0) +// CHECK-NEXT: ret void +func.func @omp_target_allocmem_scalars_nonchar() -> () { + %device = arith.constant 0 : i32 + %0 = arith.constant 100 : index + %1 = omp.target_allocmem %device : i32, i32, %0 + omp.target_freemem %device, %1 : i32, i64 + return +} + +// CHECK-LABEL: define void @omp_target_allocmem_scalar_char() { +// CHECK-NEXT: [[TMP1:%.*]] = call ptr @omp_target_alloc(i64 10, i32 0) +// CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +// CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr +// CHECK-NEXT: call void @omp_target_free(ptr [[TMP3]], i32 0) +// CHECK-NEXT: ret void +func.func @omp_target_allocmem_scalar_char() -> () { + %device = arith.constant 0 : i32 + %1 = omp.target_allocmem %device : i32, !fir.char<1,10> + omp.target_freemem %device, %1 : i32, i64 + return +} + +// CHECK-LABEL: define void @omp_target_allocmem_scalar_char_kind() { +// CHECK-NEXT: [[TMP1:%.*]] = call ptr @omp_target_alloc(i64 20, i32 0) +// CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +// CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr +// CHECK-NEXT: call void @omp_target_free(ptr [[TMP3]], i32 0) +// CHECK-NEXT: ret void +func.func @omp_target_allocmem_scalar_char_kind() -> () { + %device = arith.constant 0 : i32 + %1 = omp.target_allocmem %device : i32, !fir.char<2,10> + omp.target_freemem %device, %1 : i32, i64 + return +} + +// CHECK-LABEL: define void @omp_target_allocmem_scalar_dynchar( +// CHECK-SAME: i32 [[TMP0:%.*]]) { +// CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[TMP0]] to i64 +// CHECK-NEXT: [[TMP3:%.*]] = mul i64 1, [[TMP2]] +// CHECK-NEXT: [[TMP4:%.*]] = mul i64 1, [[TMP3]] +// CHECK-NEXT: [[TMP5:%.*]] = call ptr 
@omp_target_alloc(i64 [[TMP4]], i32 0) +// CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[TMP5]] to i64 +// CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +// CHECK-NEXT: call void @omp_target_free(ptr [[TMP7]], i32 0) +// CHECK-NEXT: ret void +func.func @omp_target_allocmem_scalar_dynchar(%l : i32) -> () { + %device = arith.constant 0 : i32 + %1 = omp.target_allocmem %device : i32, !fir.char<1,?>(%l : i32) + omp.target_freemem %device, %1 : i32, i64 + return +} + + +// CHECK-LABEL: define void @omp_target_allocmem_scalar_dynchar_kind( +// CHECK-SAME: i32 [[TMP0:%.*]]) { +// CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[TMP0]] to i64 +// CHECK-NEXT: [[TMP3:%.*]] = mul i64 2, [[TMP2]] +// CHECK-NEXT: [[TMP4:%.*]] = mul i64 1, [[TMP3]] +// CHECK-NEXT: [[TMP5:%.*]] = call ptr @omp_target_alloc(i64 [[TMP4]], i32 0) +// CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[TMP5]] to i64 +// CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +// CHECK-NEXT: call void @omp_target_free(ptr [[TMP7]], i32 0) +// CHECK-NEXT: ret void +func.func @omp_target_allocmem_scalar_dynchar_kind(%l : i32) -> () { + %device = arith.constant 0 : i32 + %1 = omp.target_allocmem %device : i32, !fir.char<2,?>(%l : i32) + omp.target_freemem %device, %1 : i32, i64 + return +} + + +// CHECK-LABEL: define void @omp_target_allocmem_array_of_nonchar() { +// CHECK-NEXT: [[TMP1:%.*]] = call ptr @omp_target_alloc(i64 36, i32 0) +// CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 +// CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr +// CHECK-NEXT: call void @omp_target_free(ptr [[TMP3]], i32 0) +// CHECK-NEXT: ret void +func.func @omp_target_allocmem_array_of_nonchar() -> () { + %device = arith.constant 0 : i32 + %1 = omp.target_allocmem %device : i32, !fir.array<3x3xi32> + omp.target_freemem %device, %1 : i32, i64 + return +} + +// CHECK-LABEL: define void @omp_target_allocmem_array_of_char() { +// CHECK-NEXT: [[TMP1:%.*]] = call ptr @omp_target_alloc(i64 90, i32 0) +// CHECK-NEXT: [[TMP2:%.*]] = 
ptrtoint ptr [[TMP1]] to i64 +// CHECK-NEXT: [[TMP3:%.*]] = inttoptr i64 [[TMP2]] to ptr +// CHECK-NEXT: call void @omp_target_free(ptr [[TMP3]], i32 0) +// CHECK-NEXT: ret void +func.func @omp_target_allocmem_array_of_char() -> () { + %device = arith.constant 0 : i32 + %1 = omp.target_allocmem %device : i32, !fir.array<3x3x!fir.char<1,10>> + omp.target_freemem %device, %1 : i32, i64 + return +} + +// CHECK-LABEL: define void @omp_target_allocmem_array_of_dynchar( +// CHECK-SAME: i32 [[TMP0:%.*]]) { +// CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[TMP0]] to i64 +// CHECK-NEXT: [[TMP3:%.*]] = mul i64 9, [[TMP2]] +// CHECK-NEXT: [[TMP4:%.*]] = mul i64 1, [[TMP3]] +// CHECK-NEXT: [[TMP5:%.*]] = call ptr @omp_target_alloc(i64 [[TMP4]], i32 0) +// CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[TMP5]] to i64 +// CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +// CHECK-NEXT: call void @omp_target_free(ptr [[TMP7]], i32 0) +// CHECK-NEXT: ret void +func.func @omp_target_allocmem_array_of_dynchar(%l: i32) -> () { + %device = arith.constant 0 : i32 + %1 = omp.target_allocmem %device : i32, !fir.array<3x3x!fir.char<1,?>>(%l : i32) + omp.target_freemem %device, %1 : i32, i64 + return +} + + +// CHECK-LABEL: define void @omp_target_allocmem_dynarray_of_nonchar( +// CHECK-SAME: i64 [[TMP0:%.*]]) { +// CHECK-NEXT: [[TMP2:%.*]] = mul i64 12, [[TMP0]] +// CHECK-NEXT: [[TMP3:%.*]] = mul i64 1, [[TMP2]] +// CHECK-NEXT: [[TMP4:%.*]] = call ptr @omp_target_alloc(i64 [[TMP3]], i32 0) +// CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[TMP4]] to i64 +// CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr +// CHECK-NEXT: call void @omp_target_free(ptr [[TMP6]], i32 0) +// CHECK-NEXT: ret void +func.func @omp_target_allocmem_dynarray_of_nonchar(%e: index) -> () { + %device = arith.constant 0 : i32 + %1 = omp.target_allocmem %device : i32, !fir.array<3x?xi32>, %e + omp.target_freemem %device, %1 : i32, i64 + return +} + +// CHECK-LABEL: define void @omp_target_allocmem_dynarray_of_nonchar2( +// 
CHECK-SAME: i64 [[TMP0:%.*]]) { +// CHECK-NEXT: [[TMP2:%.*]] = mul i64 4, [[TMP0]] +// CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], [[TMP0]] +// CHECK-NEXT: [[TMP4:%.*]] = mul i64 1, [[TMP3]] +// CHECK-NEXT: [[TMP5:%.*]] = call ptr @omp_target_alloc(i64 [[TMP4]], i32 0) +// CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[TMP5]] to i64 +// CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +// CHECK-NEXT: call void @omp_target_free(ptr [[TMP7]], i32 0) +// CHECK-NEXT: ret void +func.func @omp_target_allocmem_dynarray_of_nonchar2(%e: index) -> () { + %device = arith.constant 0 : i32 + %1 = omp.target_allocmem %device : i32, !fir.array, %e, %e + omp.target_freemem %device, %1 : i32, i64 + return +} + +// CHECK-LABEL: define void @omp_target_allocmem_dynarray_of_char( +// CHECK-SAME: i64 [[TMP0:%.*]]) { +// CHECK-NEXT: [[TMP2:%.*]] = mul i64 60, [[TMP0]] +// CHECK-NEXT: [[TMP3:%.*]] = mul i64 1, [[TMP2]] +// CHECK-NEXT: [[TMP4:%.*]] = call ptr @omp_target_alloc(i64 [[TMP3]], i32 0) +// CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[TMP4]] to i64 +// CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr +// CHECK-NEXT: call void @omp_target_free(ptr [[TMP6]], i32 0) +// CHECK-NEXT: ret void +func.func @omp_target_allocmem_dynarray_of_char(%e : index) -> () { + %device = arith.constant 0 : i32 + %1 = omp.target_allocmem %device : i32, !fir.array<3x?x!fir.char<2,10>>, %e + omp.target_freemem %device, %1 : i32, i64 + return +} + + +// CHECK-LABEL: define void @omp_target_allocmem_dynarray_of_char2( +// CHECK-SAME: i64 [[TMP0:%.*]]) { +// CHECK-NEXT: [[TMP2:%.*]] = mul i64 20, [[TMP0]] +// CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], [[TMP0]] +// CHECK-NEXT: [[TMP4:%.*]] = mul i64 1, [[TMP3]] +// CHECK-NEXT: [[TMP5:%.*]] = call ptr @omp_target_alloc(i64 [[TMP4]], i32 0) +// CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[TMP5]] to i64 +// CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr +// CHECK-NEXT: call void @omp_target_free(ptr [[TMP7]], i32 0) +// CHECK-NEXT: ret void 
+func.func @omp_target_allocmem_dynarray_of_char2(%e : index) -> () { + %device = arith.constant 0 : i32 + %1 = omp.target_allocmem %device : i32, !fir.array>, %e, %e + omp.target_freemem %device, %1 : i32, i64 + return +} + +// CHECK-LABEL: define void @omp_target_allocmem_dynarray_of_dynchar( +// CHECK-SAME: i32 [[TMP0:%.*]], i64 [[TMP1:%.*]]) { +// CHECK-NEXT: [[TMP3:%.*]] = sext i32 [[TMP0]] to i64 +// CHECK-NEXT: [[TMP4:%.*]] = mul i64 6, [[TMP3]] +// CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], [[TMP1]] +// CHECK-NEXT: [[TMP6:%.*]] = mul i64 1, [[TMP5]] +// CHECK-NEXT: [[TMP7:%.*]] = call ptr @omp_target_alloc(i64 [[TMP6]], i32 0) +// CHECK-NEXT: [[TMP8:%.*]] = ptrtoint ptr [[TMP7]] to i64 +// CHECK-NEXT: [[TMP9:%.*]] = inttoptr i64 [[TMP8]] to ptr +// CHECK-NEXT: call void @omp_target_free(ptr [[TMP9]], i32 0) +// CHECK-NEXT: ret void +func.func @omp_target_allocmem_dynarray_of_dynchar(%l: i32, %e : index) -> () { + %device = arith.constant 0 : i32 + %1 = omp.target_allocmem %device : i32, !fir.array<3x?x!fir.char<2,?>>(%l : i32), %e + omp.target_freemem %device, %1 : i32, i64 + return +} + +// CHECK-LABEL: define void @omp_target_allocmem_dynarray_of_dynchar2( +// CHECK-SAME: i32 [[TMP0:%.*]], i64 [[TMP1:%.*]]) { +// CHECK-NEXT: [[TMP3:%.*]] = sext i32 [[TMP0]] to i64 +// CHECK-NEXT: [[TMP4:%.*]] = mul i64 2, [[TMP3]] +// CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], [[TMP1]] +// CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], [[TMP1]] +// CHECK-NEXT: [[TMP7:%.*]] = mul i64 1, [[TMP6]] +// CHECK-NEXT: [[TMP8:%.*]] = call ptr @omp_target_alloc(i64 [[TMP7]], i32 0) +// CHECK-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[TMP8]] to i64 +// CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr +// CHECK-NEXT: call void @omp_target_free(ptr [[TMP10]], i32 0) +// CHECK-NEXT: ret void +func.func @omp_target_allocmem_dynarray_of_dynchar2(%l: i32, %e : index) -> () { + %device = arith.constant 0 : i32 + %1 = omp.target_allocmem %device : i32, !fir.array>(%l : i32), %e, %e + 
omp.target_freemem %device, %1 : i32, i64 + return +} + +// CHECK-LABEL: define void @omp_target_allocmem_array_with_holes_nonchar( +// CHECK-SAME: i64 [[TMP0:%.*]], i64 [[TMP1:%.*]]) { +// CHECK-NEXT: [[TMP3:%.*]] = mul i64 240, [[TMP0]] +// CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], [[TMP1]] +// CHECK-NEXT: [[TMP5:%.*]] = mul i64 1, [[TMP4]] +// CHECK-NEXT: [[TMP6:%.*]] = call ptr @omp_target_alloc(i64 [[TMP5]], i32 0) +// CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP6]] to i64 +// CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +// CHECK-NEXT: call void @omp_target_free(ptr [[TMP8]], i32 0) +// CHECK-NEXT: ret void +func.func @omp_target_allocmem_array_with_holes_nonchar(%0 : index, %1 : index) -> () { + %device = arith.constant 0 : i32 + %2 = omp.target_allocmem %device : i32, !fir.array<4x?x3x?x5xi32>, %0, %1 + omp.target_freemem %device, %2 : i32, i64 + return +} + +// CHECK-LABEL: define void @omp_target_allocmem_array_with_holes_char( +// CHECK-SAME: i64 [[TMP0:%.*]]) { +// CHECK-NEXT: [[TMP2:%.*]] = mul i64 240, [[TMP0]] +// CHECK-NEXT: [[TMP3:%.*]] = mul i64 1, [[TMP2]] +// CHECK-NEXT: [[TMP4:%.*]] = call ptr @omp_target_alloc(i64 [[TMP3]], i32 0) +// CHECK-NEXT: [[TMP5:%.*]] = ptrtoint ptr [[TMP4]] to i64 +// CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP5]] to ptr +// CHECK-NEXT: call void @omp_target_free(ptr [[TMP6]], i32 0) +// CHECK-NEXT: ret void +func.func @omp_target_allocmem_array_with_holes_char(%e: index) -> () { + %device = arith.constant 0 : i32 + %1 = omp.target_allocmem %device : i32, !fir.array<3x?x4x!fir.char<2,10>>, %e + omp.target_freemem %device, %1 : i32, i64 + return +} + +// CHECK-LABEL: define void @omp_target_allocmem_array_with_holes_dynchar( +// CHECK-SAME: i64 [[TMP0:%.*]], i64 [[TMP1:%.*]]) { +// CHECK-NEXT: [[TMP3:%.*]] = mul i64 24, [[TMP0]] +// CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], [[TMP1]] +// CHECK-NEXT: [[TMP5:%.*]] = mul i64 1, [[TMP4]] +// CHECK-NEXT: [[TMP6:%.*]] = call ptr @omp_target_alloc(i64 
[[TMP5]], i32 0) +// CHECK-NEXT: [[TMP7:%.*]] = ptrtoint ptr [[TMP6]] to i64 +// CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr +// CHECK-NEXT: call void @omp_target_free(ptr [[TMP8]], i32 0) +// CHECK-NEXT: ret void +func.func @omp_target_allocmem_array_with_holes_dynchar(%arg0: index, %arg1: index) -> () { + %device = arith.constant 0 : i32 + %1 = omp.target_allocmem %device : i32, !fir.array<3x?x4x!fir.char<2,?>>(%arg0 : index), %arg1 + omp.target_freemem %device, %1 : i32, i64 + return +} diff --git a/flang/test/HLFIR/eoshift-lowering.fir b/flang/test/HLFIR/eoshift-lowering.fir new file mode 100644 index 000000000000..7bfc3e21f052 --- /dev/null +++ b/flang/test/HLFIR/eoshift-lowering.fir @@ -0,0 +1,294 @@ +// Test hlfir.eoshift operation lowering to fir runtime call +// RUN: fir-opt %s -lower-hlfir-intrinsics | FileCheck %s + +// 1d boxed vector shift by scalar +func.func @eoshift1(%arg0: !fir.box> {fir.bindc_name = "a"}, %arg1: !fir.ref {fir.bindc_name = "sh"}) { + %0:2 = hlfir.declare %arg0 {uniq_name = "a"} : (!fir.box>) -> (!fir.box>, !fir.box>) + %1:2 = hlfir.declare %arg1 {uniq_name = "sh"} : (!fir.ref) -> (!fir.ref, !fir.ref) + %2 = hlfir.eoshift %0#0 %1#0 : (!fir.box>, !fir.ref) -> !hlfir.expr + hlfir.assign %2 to %0#0 : !hlfir.expr, !fir.box> + return +} +// CHECK-LABEL: func.func @eoshift1( +// CHECK-SAME: %[[VAL_0:.*]]: !fir.box> {fir.bindc_name = "a"}, +// CHECK-SAME: %[[VAL_1:.*]]: !fir.ref {fir.bindc_name = "sh"}) { +// CHECK: %[[VAL_2:.*]] = arith.constant true +// CHECK: %[[VAL_4:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_5:.*]] = fir.alloca !fir.box>> +// CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "a"} : (!fir.box>) -> (!fir.box>, !fir.box>) +// CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "sh"} : (!fir.ref) -> (!fir.ref, !fir.ref) +// CHECK: %[[VAL_8:.*]] = fir.zero_bits !fir.heap> +// CHECK: %[[VAL_9:.*]] = fir.shape %[[VAL_4]] : (index) -> !fir.shape<1> +// CHECK: %[[VAL_10:.*]] = 
fir.embox %[[VAL_8]](%[[VAL_9]]) : (!fir.heap>, !fir.shape<1>) -> !fir.box>> +// CHECK: fir.store %[[VAL_10]] to %[[VAL_5]] : !fir.ref>>> +// CHECK: %[[BOUNDARY:.*]] = fir.absent !fir.box +// CHECK: %[[VAL_11:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref +// CHECK: %[[VAL_13:.*]] = fir.convert %[[VAL_5]] : (!fir.ref>>>) -> !fir.ref> +// CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_6]]#1 : (!fir.box>) -> !fir.box +// CHECK: %[[VAL_15:.*]] = fir.convert %[[VAL_11]] : (i32) -> i64 +// CHECK: fir.call @_FortranAEoshiftVector(%[[VAL_13]], %[[VAL_14]], %[[VAL_15]], %[[BOUNDARY]], %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.box, i64, !fir.box, !fir.ref, i32) -> () + +// 2d boxed array shift by scalar +func.func @eoshift2(%arg0: !fir.box> {fir.bindc_name = "a"}, %arg1: i32 {fir.bindc_name = "sh"}) { + %0:2 = hlfir.declare %arg0 {uniq_name = "a"} : (!fir.box>) -> (!fir.box>, !fir.box>) + %2 = hlfir.eoshift %0#0 %arg1 : (!fir.box>, i32) -> !hlfir.expr + hlfir.assign %2 to %0#0 : !hlfir.expr, !fir.box> + return +} +// CHECK-LABEL: func.func @eoshift2( +// CHECK-SAME: %[[VAL_0:.*]]: !fir.box> {fir.bindc_name = "a"}, +// CHECK-SAME: %[[VAL_1:.*]]: i32 {fir.bindc_name = "sh"}) { +// CHECK: %[[VAL_2:.*]] = arith.constant true +// CHECK: %[[VAL_4:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_5:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_6:.*]] = fir.alloca !fir.box>> +// CHECK: %[[VAL_7:.*]] = fir.alloca i32 +// CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "a"} : (!fir.box>) -> (!fir.box>, !fir.box>) +// CHECK: fir.store %[[VAL_1]] to %[[VAL_7]] : !fir.ref +// CHECK: %[[VAL_9:.*]] = fir.zero_bits !fir.heap> +// CHECK: %[[VAL_10:.*]] = fir.shape %[[VAL_5]], %[[VAL_5]] : (index, index) -> !fir.shape<2> +// CHECK: %[[VAL_11:.*]] = fir.embox %[[VAL_9]](%[[VAL_10]]) : (!fir.heap>, !fir.shape<2>) -> !fir.box>> +// CHECK: fir.store %[[VAL_11]] to %[[VAL_6]] : !fir.ref>>> +// CHECK: %[[BOUNDARY:.*]] = fir.absent !fir.box +// CHECK: %[[VAL_12:.*]] = fir.embox %[[VAL_7]] : 
(!fir.ref) -> !fir.box +// CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_6]] : (!fir.ref>>>) -> !fir.ref> +// CHECK: %[[VAL_15:.*]] = fir.convert %[[VAL_8]]#1 : (!fir.box>) -> !fir.box +// CHECK: %[[VAL_16:.*]] = fir.convert %[[VAL_12]] : (!fir.box) -> !fir.box +// CHECK: %[[VAL_17:.*]] = fir.convert %[[VAL_4]] : (index) -> i32 +// CHECK: fir.call @_FortranAEoshift(%[[VAL_14]], %[[VAL_15]], %[[VAL_16]], %[[BOUNDARY]], %[[VAL_17]], %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.box, !fir.box, !fir.box, i32, !fir.ref, i32) -> () + +// 2d boxed array shift by boxed array +func.func @eoshift3(%arg0: !fir.box> {fir.bindc_name = "a"}, %arg1: !fir.box> {fir.bindc_name = "sh"}) { + %0:2 = hlfir.declare %arg0 {uniq_name = "a"} : (!fir.box>) -> (!fir.box>, !fir.box>) + %1:2 = hlfir.declare %arg1 {uniq_name = "sh"} : (!fir.box>) -> (!fir.box>, !fir.box>) + %2 = hlfir.eoshift %0#0 %1#0 : (!fir.box>, !fir.box>) -> !hlfir.expr + hlfir.assign %2 to %0#0 : !hlfir.expr, !fir.box> + return +} +// CHECK-LABEL: func.func @eoshift3( +// CHECK-SAME: %[[VAL_0:.*]]: !fir.box> {fir.bindc_name = "a"}, +// CHECK-SAME: %[[VAL_1:.*]]: !fir.box> {fir.bindc_name = "sh"}) { +// CHECK: %[[VAL_2:.*]] = arith.constant true +// CHECK: %[[VAL_4:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_5:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_6:.*]] = fir.alloca !fir.box>> +// CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "a"} : (!fir.box>) -> (!fir.box>, !fir.box>) +// CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "sh"} : (!fir.box>) -> (!fir.box>, !fir.box>) +// CHECK: %[[VAL_9:.*]] = fir.zero_bits !fir.heap> +// CHECK: %[[VAL_10:.*]] = fir.shape %[[VAL_5]], %[[VAL_5]] : (index, index) -> !fir.shape<2> +// CHECK: %[[VAL_11:.*]] = fir.embox %[[VAL_9]](%[[VAL_10]]) : (!fir.heap>, !fir.shape<2>) -> !fir.box>> +// CHECK: fir.store %[[VAL_11]] to %[[VAL_6]] : !fir.ref>>> +// CHECK: %[[BOUNDARY:.*]] = fir.absent !fir.box +// CHECK: %[[VAL_13:.*]] = fir.convert %[[VAL_6]] : 
(!fir.ref>>>) -> !fir.ref> +// CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_7]]#1 : (!fir.box>) -> !fir.box +// CHECK: %[[VAL_15:.*]] = fir.convert %[[VAL_8]]#1 : (!fir.box>) -> !fir.box +// CHECK: %[[VAL_16:.*]] = fir.convert %[[VAL_4]] : (index) -> i32 +// CHECK: fir.call @_FortranAEoshift(%[[VAL_13]], %[[VAL_14]], %[[VAL_15]], %[[BOUNDARY]], %[[VAL_16]], %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.box, !fir.box, !fir.box, i32, !fir.ref, i32) -> () + +// 2d boxed array shift by array expr +func.func @eoshift4(%arg0: !fir.box> {fir.bindc_name = "a"}, %arg1: !hlfir.expr {fir.bindc_name = "sh"}) { + %0:2 = hlfir.declare %arg0 {uniq_name = "a"} : (!fir.box>) -> (!fir.box>, !fir.box>) + %2 = hlfir.eoshift %0#0 %arg1 : (!fir.box>, !hlfir.expr) -> !hlfir.expr + hlfir.assign %2 to %0#0 : !hlfir.expr, !fir.box> + return +} +// CHECK-LABEL: func.func @eoshift4( +// CHECK-SAME: %[[VAL_0:.*]]: !fir.box> {fir.bindc_name = "a"}, +// CHECK-SAME: %[[VAL_1:.*]]: !hlfir.expr {fir.bindc_name = "sh"}) { +// CHECK: %[[VAL_2:.*]] = arith.constant true +// CHECK: %[[VAL_4:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_5:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_6:.*]] = fir.alloca !fir.box>> +// CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "a"} : (!fir.box>) -> (!fir.box>, !fir.box>) +// CHECK: %[[VAL_8:.*]] = hlfir.shape_of %[[VAL_1]] : (!hlfir.expr) -> !fir.shape<1> +// CHECK: %[[VAL_9:.*]]:3 = hlfir.associate %[[VAL_1]](%[[VAL_8]]) {adapt.valuebyref} : (!hlfir.expr, !fir.shape<1>) -> (!fir.box>, !fir.ref>, i1) +// CHECK: %[[VAL_10:.*]] = hlfir.get_extent %[[VAL_8]] {dim = 0 : index} : (!fir.shape<1>) -> index +// CHECK: %[[VAL_11:.*]] = fir.zero_bits !fir.heap> +// CHECK: %[[VAL_12:.*]] = fir.shape %[[VAL_5]], %[[VAL_5]] : (index, index) -> !fir.shape<2> +// CHECK: %[[VAL_13:.*]] = fir.embox %[[VAL_11]](%[[VAL_12]]) : (!fir.heap>, !fir.shape<2>) -> !fir.box>> +// CHECK: fir.store %[[VAL_13]] to %[[VAL_6]] : !fir.ref>>> +// CHECK: %[[BOUNDARY:.*]] = fir.absent 
!fir.box +// CHECK: %[[VAL_14:.*]] = fir.shape %[[VAL_10]] : (index) -> !fir.shape<1> +// CHECK: %[[VAL_15:.*]] = fir.embox %[[VAL_9]]#1(%[[VAL_14]]) : (!fir.ref>, !fir.shape<1>) -> !fir.box> +// CHECK: %[[VAL_17:.*]] = fir.convert %[[VAL_6]] : (!fir.ref>>>) -> !fir.ref> +// CHECK: %[[VAL_18:.*]] = fir.convert %[[VAL_7]]#1 : (!fir.box>) -> !fir.box +// CHECK: %[[VAL_19:.*]] = fir.convert %[[VAL_15]] : (!fir.box>) -> !fir.box +// CHECK: %[[VAL_20:.*]] = fir.convert %[[VAL_4]] : (index) -> i32 +// CHECK: fir.call @_FortranAEoshift(%[[VAL_17]], %[[VAL_18]], %[[VAL_19]], %[[BOUNDARY]], %[[VAL_20]], %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.box, !fir.box, !fir.box, i32, !fir.ref, i32) -> () + +// 2d array expr shift by array expr +func.func @eoshift5(%arg0: !hlfir.expr {fir.bindc_name = "a"}, %arg1: !hlfir.expr {fir.bindc_name = "sh"}) { + %2 = hlfir.eoshift %arg0 %arg1 : (!hlfir.expr, !hlfir.expr) -> !hlfir.expr + hlfir.destroy %2 : !hlfir.expr + return +} +// CHECK-LABEL: func.func @eoshift5( +// CHECK-SAME: %[[VAL_0:.*]]: !hlfir.expr {fir.bindc_name = "a"}, +// CHECK-SAME: %[[VAL_1:.*]]: !hlfir.expr {fir.bindc_name = "sh"}) { +// CHECK: %[[VAL_2:.*]] = arith.constant true +// CHECK: %[[VAL_4:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_5:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_6:.*]] = fir.alloca !fir.box>> +// CHECK: %[[VAL_7:.*]] = hlfir.shape_of %[[VAL_0]] : (!hlfir.expr) -> !fir.shape<2> +// CHECK: %[[VAL_8:.*]]:3 = hlfir.associate %[[VAL_0]](%[[VAL_7]]) {adapt.valuebyref} : (!hlfir.expr, !fir.shape<2>) -> (!fir.box>, !fir.ref>, i1) +// CHECK: %[[VAL_9:.*]] = hlfir.get_extent %[[VAL_7]] {dim = 0 : index} : (!fir.shape<2>) -> index +// CHECK: %[[VAL_10:.*]] = hlfir.get_extent %[[VAL_7]] {dim = 1 : index} : (!fir.shape<2>) -> index +// CHECK: %[[VAL_14:.*]] = fir.shape %[[VAL_9]], %[[VAL_10]] : (index, index) -> !fir.shape<2> +// CHECK: %[[VAL_15:.*]] = fir.embox %[[VAL_8]]#1(%[[VAL_14]]) : (!fir.ref>, !fir.shape<2>) -> !fir.box> +// CHECK: %[[VAL_11:.*]] 
= hlfir.shape_of %[[VAL_1]] : (!hlfir.expr) -> !fir.shape<1> +// CHECK: %[[VAL_12:.*]]:3 = hlfir.associate %[[VAL_1]](%[[VAL_11]]) {adapt.valuebyref} : (!hlfir.expr, !fir.shape<1>) -> (!fir.box>, !fir.ref>, i1) +// CHECK: %[[VAL_13:.*]] = hlfir.get_extent %[[VAL_11]] {dim = 0 : index} : (!fir.shape<1>) -> index +// CHECK: %[[VAL_16:.*]] = fir.zero_bits !fir.heap> +// CHECK: %[[VAL_17:.*]] = fir.shape %[[VAL_5]], %[[VAL_5]] : (index, index) -> !fir.shape<2> +// CHECK: %[[VAL_18:.*]] = fir.embox %[[VAL_16]](%[[VAL_17]]) : (!fir.heap>, !fir.shape<2>) -> !fir.box>> +// CHECK: fir.store %[[VAL_18]] to %[[VAL_6]] : !fir.ref>>> +// CHECK: %[[BOUNDARY:.*]] = fir.absent !fir.box +// CHECK: %[[VAL_19:.*]] = fir.shape %[[VAL_13]] : (index) -> !fir.shape<1> +// CHECK: %[[VAL_20:.*]] = fir.embox %[[VAL_12]]#1(%[[VAL_19]]) : (!fir.ref>, !fir.shape<1>) -> !fir.box> +// CHECK: %[[VAL_22:.*]] = fir.convert %[[VAL_6]] : (!fir.ref>>>) -> !fir.ref> +// CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_15]] : (!fir.box>) -> !fir.box +// CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_20]] : (!fir.box>) -> !fir.box +// CHECK: %[[VAL_25:.*]] = fir.convert %[[VAL_4]] : (index) -> i32 +// CHECK: fir.call @_FortranAEoshift(%[[VAL_22]], %[[VAL_23]], %[[VAL_24]], %[[BOUNDARY]], %[[VAL_25]], %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.box, !fir.box, !fir.box, i32, !fir.ref, i32) -> () + +// 2d array expr shift by array expr with explicit dim +func.func @eoshift6(%arg0: !hlfir.expr {fir.bindc_name = "a"}, %arg1: !hlfir.expr {fir.bindc_name = "sh"}, %dim : i16) { + %2 = hlfir.eoshift %arg0 %arg1 dim %dim : (!hlfir.expr, !hlfir.expr, i16) -> !hlfir.expr + hlfir.destroy %2 : !hlfir.expr + return +} +// CHECK-LABEL: func.func @eoshift6( +// CHECK-SAME: %[[VAL_0:.*]]: !hlfir.expr {fir.bindc_name = "a"}, +// CHECK-SAME: %[[VAL_1:.*]]: !hlfir.expr {fir.bindc_name = "sh"}, +// CHECK-SAME: %[[VAL_2:.*]]: i16) { +// CHECK: %[[VAL_3:.*]] = arith.constant true +// CHECK: %[[VAL_4:.*]] = arith.constant 1 : index +// CHECK: 
%[[VAL_6:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_7:.*]] = fir.alloca !fir.box>> +// CHECK: %[[VAL_8:.*]] = hlfir.shape_of %[[VAL_0]] : (!hlfir.expr) -> !fir.shape<2> +// CHECK: %[[VAL_9:.*]]:3 = hlfir.associate %[[VAL_0]](%[[VAL_8]]) {adapt.valuebyref} : (!hlfir.expr, !fir.shape<2>) -> (!fir.box>, !fir.ref>, i1) +// CHECK: %[[VAL_10:.*]] = hlfir.get_extent %[[VAL_8]] {dim = 0 : index} : (!fir.shape<2>) -> index +// CHECK: %[[VAL_11:.*]] = hlfir.get_extent %[[VAL_8]] {dim = 1 : index} : (!fir.shape<2>) -> index +// CHECK: %[[VAL_16:.*]] = fir.shape %[[VAL_10]], %[[VAL_11]] : (index, index) -> !fir.shape<2> +// CHECK: %[[VAL_17:.*]] = fir.embox %[[VAL_9]]#1(%[[VAL_16]]) : (!fir.ref>, !fir.shape<2>) -> !fir.box> +// CHECK: %[[VAL_12:.*]] = hlfir.shape_of %[[VAL_1]] : (!hlfir.expr) -> !fir.shape<1> +// CHECK: %[[VAL_13:.*]]:3 = hlfir.associate %[[VAL_1]](%[[VAL_12]]) {adapt.valuebyref} : (!hlfir.expr, !fir.shape<1>) -> (!fir.box>, !fir.ref>, i1) +// CHECK: %[[VAL_14:.*]] = hlfir.get_extent %[[VAL_12]] {dim = 0 : index} : (!fir.shape<1>) -> index +// CHECK: %[[VAL_15:.*]] = fir.convert %[[VAL_2]] : (i16) -> i32 +// CHECK: %[[VAL_18:.*]] = fir.zero_bits !fir.heap> +// CHECK: %[[VAL_19:.*]] = fir.shape %[[VAL_6]], %[[VAL_6]] : (index, index) -> !fir.shape<2> +// CHECK: %[[VAL_20:.*]] = fir.embox %[[VAL_18]](%[[VAL_19]]) : (!fir.heap>, !fir.shape<2>) -> !fir.box>> +// CHECK: fir.store %[[VAL_20]] to %[[VAL_7]] : !fir.ref>>> +// CHECK: %[[BOUNDARY:.*]] = fir.absent !fir.box +// CHECK: %[[VAL_21:.*]] = fir.shape %[[VAL_14]] : (index) -> !fir.shape<1> +// CHECK: %[[VAL_22:.*]] = fir.embox %[[VAL_13]]#1(%[[VAL_21]]) : (!fir.ref>, !fir.shape<1>) -> !fir.box> +// CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_7]] : (!fir.ref>>>) -> !fir.ref> +// CHECK: %[[VAL_25:.*]] = fir.convert %[[VAL_17]] : (!fir.box>) -> !fir.box +// CHECK: %[[VAL_26:.*]] = fir.convert %[[VAL_22]] : (!fir.box>) -> !fir.box +// CHECK: fir.call @_FortranAEoshift(%[[VAL_24]], %[[VAL_25]], %[[VAL_26]], 
%[[BOUNDARY]], %[[VAL_15]], %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.box, !fir.box, !fir.box, i32, !fir.ref, i32) -> () + +// shift of polymorphic array +func.func @eoshift7(%arg0: !fir.ref>>>>, %arg1: !fir.ref) { + %0 = fir.dummy_scope : !fir.dscope + %1:2 = hlfir.declare %arg0 dummy_scope %0 {fortran_attrs = #fir.var_attrs, uniq_name = "a"} : (!fir.ref>>>>, !fir.dscope) -> (!fir.ref>>>>, !fir.ref>>>>) + %2:2 = hlfir.declare %arg1 dummy_scope %0 {uniq_name = "sh"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) + %c2_i32 = arith.constant 2 : i32 + %3 = fir.load %1#0 : !fir.ref>>>> + %4 = hlfir.eoshift %3 %c2_i32 : (!fir.class>>>, i32) -> !hlfir.expr?> + hlfir.assign %4 to %1#0 realloc : !hlfir.expr?>, !fir.ref>>>> + hlfir.destroy %4 : !hlfir.expr?> + return +} +// CHECK-LABEL: func.func @eoshift7( +// CHECK-SAME: %[[VAL_0:.*]]: !fir.ref>>>>, +// CHECK-SAME: %[[VAL_1:.*]]: !fir.ref) { +// CHECK: %[[VAL_2:.*]] = arith.constant true +// CHECK: %[[VAL_4:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_5:.*]] = arith.constant 2 : i32 +// CHECK: %[[VAL_6:.*]] = fir.alloca !fir.class>>> +// CHECK: %[[VAL_7:.*]] = fir.alloca i32 +// CHECK: %[[VAL_8:.*]] = fir.dummy_scope : !fir.dscope +// CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_8]] {fortran_attrs = #fir.var_attrs, uniq_name = "a"} : (!fir.ref>>>>, !fir.dscope) -> (!fir.ref>>>>, !fir.ref>>>>) +// CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_8]] {uniq_name = "sh"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) +// CHECK: %[[VAL_11:.*]] = fir.load %[[VAL_9]]#0 : !fir.ref>>>> +// CHECK: fir.store %[[VAL_5]] to %[[VAL_7]] : !fir.ref +// CHECK: %[[VAL_12:.*]] = fir.zero_bits !fir.heap>> +// CHECK: %[[VAL_13:.*]] = fir.shape %[[VAL_4]] : (index) -> !fir.shape<1> +// CHECK: %[[VAL_14:.*]] = fir.embox %[[VAL_12]](%[[VAL_13]]) source_box %[[VAL_11]] : (!fir.heap>>, !fir.shape<1>, !fir.class>>>) -> !fir.class>>> +// CHECK: fir.store %[[VAL_14]] to %[[VAL_6]] : !fir.ref>>>> +// 
CHECK: %[[BOUNDARY:.*]] = fir.absent !fir.box +// CHECK: %[[VAL_15:.*]] = fir.load %[[VAL_7]] : !fir.ref +// CHECK: %[[VAL_17:.*]] = fir.convert %[[VAL_6]] : (!fir.ref>>>>) -> !fir.ref> +// CHECK: %[[VAL_18:.*]] = fir.convert %[[VAL_11]] : (!fir.class>>>) -> !fir.box +// CHECK: %[[VAL_19:.*]] = fir.convert %[[VAL_15]] : (i32) -> i64 +// CHECK: fir.call @_FortranAEoshiftVector(%[[VAL_17]], %[[VAL_18]], %[[VAL_19]], %[[BOUNDARY]], %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.box, i64, !fir.box, !fir.ref, i32) -> () + +// shift with the present scalar boundary and dim +func.func @_QPeoshift8(%arg0: !fir.box> {fir.bindc_name = "array"}) { + %cst = arith.constant 3.000000e+00 : f32 + %c2_i32 = arith.constant 2 : i32 + %0 = fir.dummy_scope : !fir.dscope + %1:2 = hlfir.declare %arg0 dummy_scope %0 {uniq_name = "_QFeoshift8Earray"} : (!fir.box>, !fir.dscope) -> (!fir.box>, !fir.box>) + %2 = hlfir.eoshift %1#0 %c2_i32 boundary %cst dim %c2_i32 : (!fir.box>, i32, f32, i32) -> !hlfir.expr + hlfir.assign %2 to %1#0 : !hlfir.expr, !fir.box> + hlfir.destroy %2 : !hlfir.expr + return +} +// CHECK-LABEL: func.func @_QPeoshift8( +// CHECK-SAME: %[[ARG0:.*]]: !fir.box> {fir.bindc_name = "array"}) { +// CHECK: %[[VAL_3:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_4:.*]] = arith.constant 2 : i32 +// CHECK: %[[VAL_5:.*]] = arith.constant 3.000000e+00 : f32 +// CHECK: %[[VAL_6:.*]] = fir.alloca !fir.box>> +// CHECK: %[[VAL_7:.*]] = fir.alloca f32 +// CHECK: %[[VAL_8:.*]] = fir.alloca i32 +// CHECK: %[[VAL_9:.*]] = fir.dummy_scope : !fir.dscope +// CHECK: %[[VAL_10:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_9]] {uniq_name = "_QFeoshift8Earray"} : (!fir.box>, !fir.dscope) -> (!fir.box>, !fir.box>) +// CHECK: fir.store %[[VAL_4]] to %[[VAL_8]] : !fir.ref +// CHECK: fir.store %[[VAL_5]] to %[[VAL_7]] : !fir.ref +// CHECK: %[[VAL_11:.*]] = fir.embox %[[VAL_7]] : (!fir.ref) -> !fir.box +// CHECK: %[[VAL_12:.*]] = fir.zero_bits !fir.heap> +// CHECK: %[[VAL_13:.*]] = fir.shape %[[VAL_3]], 
%[[VAL_3]] : (index, index) -> !fir.shape<2> +// CHECK: %[[VAL_14:.*]] = fir.embox %[[VAL_12]](%[[VAL_13]]) : (!fir.heap>, !fir.shape<2>) -> !fir.box>> +// CHECK: fir.store %[[VAL_14]] to %[[VAL_6]] : !fir.ref>>> +// CHECK: %[[VAL_15:.*]] = fir.embox %[[VAL_8]] : (!fir.ref) -> !fir.box +// CHECK: %[[VAL_17:.*]] = fir.convert %[[VAL_6]] : (!fir.ref>>>) -> !fir.ref> +// CHECK: %[[VAL_18:.*]] = fir.convert %[[VAL_10]]#1 : (!fir.box>) -> !fir.box +// CHECK: %[[VAL_19:.*]] = fir.convert %[[VAL_15]] : (!fir.box) -> !fir.box +// CHECK: %[[VAL_20:.*]] = fir.convert %[[VAL_11]] : (!fir.box) -> !fir.box +// CHECK: fir.call @_FortranAEoshift(%[[VAL_17]], %[[VAL_18]], %[[VAL_19]], %[[VAL_20]], %[[VAL_4]], %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.box, !fir.box, !fir.box, i32, !fir.ref, i32) -> () + +// shift with the present array boundary +func.func @_QPeoshift9(%arg0: !fir.box> {fir.bindc_name = "array"}, %arg1: !fir.box> {fir.bindc_name = "boundary"}) { + %c2_i32 = arith.constant 2 : i32 + %0 = fir.dummy_scope : !fir.dscope + %1:2 = hlfir.declare %arg0 dummy_scope %0 {uniq_name = "_QFeoshift9Earray"} : (!fir.box>, !fir.dscope) -> (!fir.box>, !fir.box>) + %2:2 = hlfir.declare %arg1 dummy_scope %0 {uniq_name = "_QFeoshift9Eboundary"} : (!fir.box>, !fir.dscope) -> (!fir.box>, !fir.box>) + %3 = hlfir.eoshift %1#0 %c2_i32 boundary %2#0 : (!fir.box>, i32, !fir.box>) -> !hlfir.expr + hlfir.assign %3 to %1#0 : !hlfir.expr, !fir.box> + hlfir.destroy %3 : !hlfir.expr + return +} +// CHECK-LABEL: func.func @_QPeoshift9( +// CHECK-SAME: %[[ARG0:.*]]: !fir.box> {fir.bindc_name = "array"}, +// CHECK-SAME: %[[ARG1:.*]]: !fir.box> {fir.bindc_name = "boundary"}) { +// CHECK: %[[VAL_2:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_3:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_4:.*]] = arith.constant 2 : i32 +// CHECK: %[[VAL_5:.*]] = fir.alloca !fir.box>> +// CHECK: %[[VAL_6:.*]] = fir.alloca i32 +// CHECK: %[[VAL_7:.*]] = fir.dummy_scope : !fir.dscope +// CHECK: %[[VAL_8:.*]]:2 = 
hlfir.declare %[[ARG0]] dummy_scope %[[VAL_7]] {uniq_name = "_QFeoshift9Earray"} : (!fir.box>, !fir.dscope) -> (!fir.box>, !fir.box>) +// CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[VAL_7]] {uniq_name = "_QFeoshift9Eboundary"} : (!fir.box>, !fir.dscope) -> (!fir.box>, !fir.box>) +// CHECK: fir.store %[[VAL_4]] to %[[VAL_6]] : !fir.ref +// CHECK: %[[VAL_10:.*]] = fir.zero_bits !fir.heap> +// CHECK: %[[VAL_11:.*]] = fir.shape %[[VAL_3]], %[[VAL_3]] : (index, index) -> !fir.shape<2> +// CHECK: %[[VAL_12:.*]] = fir.embox %[[VAL_10]](%[[VAL_11]]) : (!fir.heap>, !fir.shape<2>) -> !fir.box>> +// CHECK: fir.store %[[VAL_12]] to %[[VAL_5]] : !fir.ref>>> +// CHECK: %[[VAL_13:.*]] = fir.embox %[[VAL_6]] : (!fir.ref) -> !fir.box +// CHECK: %[[VAL_15:.*]] = fir.convert %[[VAL_5]] : (!fir.ref>>>) -> !fir.ref> +// CHECK: %[[VAL_16:.*]] = fir.convert %[[VAL_8]]#1 : (!fir.box>) -> !fir.box +// CHECK: %[[VAL_17:.*]] = fir.convert %[[VAL_13]] : (!fir.box) -> !fir.box +// CHECK: %[[VAL_18:.*]] = fir.convert %[[VAL_9]]#1 : (!fir.box>) -> !fir.box +// CHECK: %[[VAL_19:.*]] = fir.convert %[[VAL_2]] : (index) -> i32 +// CHECK: fir.call @_FortranAEoshift(%[[VAL_15]], %[[VAL_16]], %[[VAL_17]], %[[VAL_18]], %[[VAL_19]], %{{.*}}, %{{.*}}) : (!fir.ref>, !fir.box, !fir.box, !fir.box, i32, !fir.ref, i32) -> () diff --git a/flang/test/HLFIR/invalid.fir b/flang/test/HLFIR/invalid.fir index d61efe0062e6..0f54a0250294 100644 --- a/flang/test/HLFIR/invalid.fir +++ b/flang/test/HLFIR/invalid.fir @@ -1555,3 +1555,96 @@ func.func @bad_reshape(%arg0: !hlfir.expr<1x!fir.char<1,2>>, %arg1: !hlfir.expr< %0 = hlfir.reshape %arg0 %arg1 pad %arg2 : (!hlfir.expr<1x!fir.char<1,2>>, !hlfir.expr<1xi32>, !hlfir.expr<1x!fir.char<2,?>>) -> !hlfir.expr> return } + +// ----- + +func.func @bad_eoshift1(%arg0: !hlfir.expr, %arg1: i32) { + // expected-error@+1 {{'hlfir.eoshift' op input and output arrays should have the same element type}} + %0 = hlfir.eoshift %arg0 %arg1 : (!hlfir.expr, i32) -> 
!hlfir.expr + return +} + +// ----- + +func.func @bad_eoshift2(%arg0: !hlfir.expr, %arg1: i32) { + // expected-error@+1 {{'hlfir.eoshift' op input and output arrays should have the same rank}} + %0 = hlfir.eoshift %arg0 %arg1 : (!hlfir.expr, i32) -> !hlfir.expr + return +} + +// ----- + +func.func @bad_eoshift3(%arg0: !hlfir.expr<2x2xi32>, %arg1: i32) { + // expected-error@+1 {{'hlfir.eoshift' op output array's shape conflicts with the input array's shape}} + %0 = hlfir.eoshift %arg0 %arg1 : (!hlfir.expr<2x2xi32>, i32) -> !hlfir.expr<2x3xi32> + return +} + +// ----- + +func.func @bad_eoshift4(%arg0: !hlfir.expr<2x2xi32>, %arg1: i32) { + %c0 = arith.constant 0 : index + // expected-error@+1 {{'hlfir.eoshift' op DIM must be >= 1}} + %0 = hlfir.eoshift %arg0 %arg1 dim %c0 : (!hlfir.expr<2x2xi32>, i32, index) -> !hlfir.expr<2x2xi32> + return +} + +// ----- + +func.func @bad_eoshift5(%arg0: !hlfir.expr<2x2xi32>, %arg1: i32) { + %c10 = arith.constant 10 : index + // expected-error@+1 {{'hlfir.eoshift' op DIM must be <= input array's rank}} + %0 = hlfir.eoshift %arg0 %arg1 dim %c10 : (!hlfir.expr<2x2xi32>, i32, index) -> !hlfir.expr<2x2xi32> + return +} + +// ----- + +func.func @bad_eoshift6(%arg0: !hlfir.expr<2x2xi32>, %arg1: !hlfir.expr<2x2xi32>) { + // expected-error@+1 {{'hlfir.eoshift' op SHIFT's rank must be 1 less than the input array's rank}} + %0 = hlfir.eoshift %arg0 %arg1 : (!hlfir.expr<2x2xi32>, !hlfir.expr<2x2xi32>) -> !hlfir.expr<2x2xi32> + return +} + +// ----- + +func.func @bad_eoshift7(%arg0: !hlfir.expr, %arg1: !hlfir.expr<3xi32>) { + %c1 = arith.constant 1 : index + // expected-error@+1 {{'hlfir.eoshift' op SHAPE(ARRAY)(2) must be equal to SHAPE(SHIFT)(1): 2 != 3}} + %0 = hlfir.eoshift %arg0 %arg1 dim %c1 : (!hlfir.expr, !hlfir.expr<3xi32>, index) -> !hlfir.expr<2x2xi32> + return +} + +// ----- + +func.func @bad_eoshift8(%arg0: !hlfir.expr>, %arg1: i32) { + // expected-error@+2 {{'hlfir.eoshift' op character KIND mismatch}} + // expected-error@+1 
{{'hlfir.eoshift' op input and output arrays should have the same element type}} + %0 = hlfir.eoshift %arg0 %arg1 : (!hlfir.expr>, i32) -> !hlfir.expr> + return +} + +// ----- + +func.func @bad_eoshift9(%arg0: !hlfir.expr>, %arg1: i32) { + // expected-error@+2 {{'hlfir.eoshift' op character LEN mismatch}} + // expected-error@+1 {{'hlfir.eoshift' op input and output arrays should have the same element type}} + %0 = hlfir.eoshift %arg0 %arg1 : (!hlfir.expr>, i32) -> !hlfir.expr> + return +} + +// ----- + +func.func @bad_eoshift10(%arg0: !hlfir.expr<2x2xi32>, %arg1: i32, %arg2: f32) { + // expected-error@+1 {{'hlfir.eoshift' op ARRAY and BOUNDARY operands must have the same element type}} + %0 = hlfir.eoshift %arg0 %arg1 boundary %arg2 : (!hlfir.expr<2x2xi32>, i32, f32) -> !hlfir.expr<2x2xi32> + return +} + +// ----- + +func.func @bad_eoshift11(%arg0: !hlfir.expr<2x2xi32>, %arg1: i32, %arg2: !hlfir.expr<2x2xi32>) { + // expected-error@+1 {{'hlfir.eoshift' op BOUNDARY's rank must be 1 less than the input array's rank}} + %0 = hlfir.eoshift %arg0 %arg1 boundary %arg2 : (!hlfir.expr<2x2xi32>, i32, !hlfir.expr<2x2xi32>) -> !hlfir.expr<2x2xi32> + return +} diff --git a/flang/test/HLFIR/simplify-hlfir-intrinsics-cshift.fir b/flang/test/HLFIR/simplify-hlfir-intrinsics-cshift.fir index 8684a429ea5b..f5af990da194 100644 --- a/flang/test/HLFIR/simplify-hlfir-intrinsics-cshift.fir +++ b/flang/test/HLFIR/simplify-hlfir-intrinsics-cshift.fir @@ -38,12 +38,12 @@ func.func @cshift_vector(%arg0: !fir.box>, %arg1: !fir.ref>, index, index, index, !fir.shape<1>) -> !fir.box> // CHECK: %[[VAL_25:.*]] = fir.box_addr %[[VAL_24]] : (!fir.box>) -> !fir.ref> // CHECK: %[[VAL_26:.*]] = fir.embox %[[VAL_25]](%[[VAL_23]]) : (!fir.ref>, !fir.shape<1>) -> !fir.box> +// CHECK: %[[VAL_36:.*]] = arith.subi %[[VAL_8]], %[[VAL_17]] overflow : i64 // CHECK: %[[VAL_27:.*]] = fir.convert %[[VAL_17]] : (i64) -> index // CHECK: fir.do_loop %[[VAL_28:.*]] = %[[VAL_2]] to %[[VAL_27]] step %[[VAL_2]] unordered 
{ // CHECK: %[[VAL_29:.*]] = fir.convert %[[VAL_28]] : (index) -> i64 // CHECK: %[[VAL_34:.*]] = hlfir.designate %[[VAL_26]] (%[[VAL_29]]) : (!fir.box>, i64) -> !fir.ref // CHECK: %[[VAL_35:.*]] = fir.load %[[VAL_34]] : !fir.ref -// CHECK: %[[VAL_36:.*]] = arith.subi %[[VAL_8]], %[[VAL_17]] overflow : i64 // CHECK: %[[VAL_37:.*]] = arith.addi %[[VAL_29]], %[[VAL_36]] overflow : i64 // CHECK: %[[VAL_42:.*]] = hlfir.designate %[[VAL_20]] (%[[VAL_37]]) : (!fir.box>, i64) -> !fir.ref // CHECK: hlfir.assign %[[VAL_35]] to %[[VAL_42]] : i32, !fir.ref @@ -59,6 +59,7 @@ func.func @cshift_vector(%arg0: !fir.box>, %arg1: !fir.ref // CHECK: } // CHECK: } else { +// CHECK: %[[VAL_68:.*]] = arith.subi %[[VAL_8]], %[[VAL_17]] overflow : i64 // CHECK: %[[VAL_59:.*]] = fir.convert %[[VAL_17]] : (i64) -> index // CHECK: fir.do_loop %[[VAL_60:.*]] = %[[VAL_2]] to %[[VAL_59]] step %[[VAL_2]] unordered { // CHECK: %[[VAL_61:.*]] = fir.convert %[[VAL_60]] : (index) -> i64 @@ -68,7 +69,6 @@ func.func @cshift_vector(%arg0: !fir.box>, %arg1: !fir.ref : index // CHECK: %[[VAL_66:.*]] = hlfir.designate %[[VAL_0]] (%[[VAL_65]]) : (!fir.box>, index) -> !fir.ref // CHECK: %[[VAL_67:.*]] = fir.load %[[VAL_66]] : !fir.ref -// CHECK: %[[VAL_68:.*]] = arith.subi %[[VAL_8]], %[[VAL_17]] overflow : i64 // CHECK: %[[VAL_69:.*]] = arith.addi %[[VAL_61]], %[[VAL_68]] overflow : i64 // CHECK: %[[VAL_74:.*]] = hlfir.designate %[[VAL_20]] (%[[VAL_69]]) : (!fir.box>, i64) -> !fir.ref // CHECK: hlfir.assign %[[VAL_67]] to %[[VAL_74]] : i32, !fir.ref diff --git a/flang/test/HLFIR/simplify-hlfir-intrinsics-eoshift.fir b/flang/test/HLFIR/simplify-hlfir-intrinsics-eoshift.fir new file mode 100644 index 000000000000..88191d517c2b --- /dev/null +++ b/flang/test/HLFIR/simplify-hlfir-intrinsics-eoshift.fir @@ -0,0 +1,2210 @@ +// Test hlfir.eoshift simplification to hlfir.elemental and hlfir.eval_in_mem: +// RUN: fir-opt --simplify-hlfir-intrinsics %s | FileCheck %s + +// module eoshift_types +// type t +// end type 
t +// end module eoshift_types +// +// ! Test contiguous 1D array with statically absent boundary. +// subroutine eoshift1(n, array) +// integer :: n +// real(2) :: array(n) +// array = EOSHIFT(array, 2) +// end subroutine +func.func @_QPeoshift1(%arg0: !fir.ref {fir.bindc_name = "n"}, %arg1: !fir.ref> {fir.bindc_name = "array"}) { + %c2_i32 = arith.constant 2 : i32 + %c0 = arith.constant 0 : index + %0 = fir.dummy_scope : !fir.dscope + %1:2 = hlfir.declare %arg0 dummy_scope %0 {uniq_name = "_QFeoshift1En"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) + %2 = fir.load %1#0 : !fir.ref + %3 = fir.convert %2 : (i32) -> index + %4 = arith.cmpi sgt, %3, %c0 : index + %5 = arith.select %4, %3, %c0 : index + %6 = fir.shape %5 : (index) -> !fir.shape<1> + %7:2 = hlfir.declare %arg1(%6) dummy_scope %0 {uniq_name = "_QFeoshift1Earray"} : (!fir.ref>, !fir.shape<1>, !fir.dscope) -> (!fir.box>, !fir.ref>) + %8 = hlfir.eoshift %7#0 %c2_i32 : (!fir.box>, i32) -> !hlfir.expr + hlfir.assign %8 to %7#0 : !hlfir.expr, !fir.box> + hlfir.destroy %8 : !hlfir.expr + return +} +// CHECK-LABEL: func.func @_QPeoshift1( +// CHECK-SAME: %[[ARG0:.*]]: !fir.ref {fir.bindc_name = "n"}, +// CHECK-SAME: %[[ARG1:.*]]: !fir.ref> {fir.bindc_name = "array"}) { +// CHECK: %[[VAL_0:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_1:.*]] = arith.constant 0 : i64 +// CHECK: %[[VAL_2:.*]] = arith.constant 0.000000e+00 : f16 +// CHECK: %[[VAL_3:.*]] = arith.constant 2 : i32 +// CHECK: %[[VAL_4:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_5:.*]] = fir.dummy_scope : !fir.dscope +// CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_5]] {uniq_name = "_QFeoshift1En"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) +// CHECK: %[[VAL_7:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref +// CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (i32) -> index +// CHECK: %[[VAL_9:.*]] = arith.cmpi sgt, %[[VAL_8]], %[[VAL_4]] : index +// CHECK: %[[VAL_10:.*]] = arith.select %[[VAL_9]], %[[VAL_8]], 
%[[VAL_4]] : index +// CHECK: %[[VAL_11:.*]] = fir.shape %[[VAL_10]] : (index) -> !fir.shape<1> +// CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[ARG1]](%[[VAL_11]]) dummy_scope %[[VAL_5]] {uniq_name = "_QFeoshift1Earray"} : (!fir.ref>, !fir.shape<1>, !fir.dscope) -> (!fir.box>, !fir.ref>) +// CHECK: %[[VAL_13:.*]] = fir.convert %[[VAL_10]] : (index) -> i64 +// CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_3]] : (i32) -> i64 +// CHECK: %[[VAL_15:.*]] = hlfir.eval_in_mem shape %[[VAL_11]] : (!fir.shape<1>) -> !hlfir.expr { +// CHECK: ^bb0(%[[VAL_16:.*]]: !fir.ref>): +// CHECK: %[[VAL_17:.*]] = fir.embox %[[VAL_16]](%[[VAL_11]]) : (!fir.ref>, !fir.shape<1>) -> !fir.box> +// CHECK: %[[VAL_18:.*]] = arith.cmpi slt, %[[VAL_14]], %[[VAL_1]] : i64 +// CHECK: %[[VAL_19:.*]] = arith.subi %[[VAL_1]], %[[VAL_14]] overflow : i64 +// CHECK: %[[VAL_20:.*]] = arith.select %[[VAL_18]], %[[VAL_19]], %[[VAL_1]] : i64 +// CHECK: %[[VAL_21:.*]] = arith.select %[[VAL_18]], %[[VAL_1]], %[[VAL_14]] : i64 +// CHECK: %[[VAL_22:.*]] = arith.subi %[[VAL_1]], %[[VAL_13]] overflow : i64 +// CHECK: %[[VAL_23:.*]] = arith.addi %[[VAL_13]], %[[VAL_14]] overflow : i64 +// CHECK: %[[VAL_24:.*]] = arith.cmpi sgt, %[[VAL_22]], %[[VAL_14]] : i64 +// CHECK: %[[VAL_25:.*]] = arith.select %[[VAL_24]], %[[VAL_1]], %[[VAL_23]] : i64 +// CHECK: %[[VAL_26:.*]] = arith.subi %[[VAL_13]], %[[VAL_14]] overflow : i64 +// CHECK: %[[VAL_27:.*]] = arith.cmpi slt, %[[VAL_13]], %[[VAL_14]] : i64 +// CHECK: %[[VAL_28:.*]] = arith.select %[[VAL_27]], %[[VAL_1]], %[[VAL_26]] : i64 +// CHECK: %[[VAL_29:.*]] = arith.select %[[VAL_18]], %[[VAL_25]], %[[VAL_28]] : i64 +// CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_29]] : (i64) -> index +// CHECK: fir.do_loop %[[VAL_31:.*]] = %[[VAL_0]] to %[[VAL_30]] step %[[VAL_0]] unordered { +// CHECK: %[[VAL_32:.*]] = fir.convert %[[VAL_31]] : (index) -> i64 +// CHECK: %[[VAL_33:.*]] = arith.addi %[[VAL_32]], %[[VAL_21]] overflow : i64 +// CHECK: %[[VAL_34:.*]] = hlfir.designate %[[VAL_12]]#0 
(%[[VAL_33]]) : (!fir.box>, i64) -> !fir.ref +// CHECK: %[[VAL_35:.*]] = fir.load %[[VAL_34]] : !fir.ref +// CHECK: %[[VAL_36:.*]] = arith.addi %[[VAL_32]], %[[VAL_20]] overflow : i64 +// CHECK: %[[VAL_37:.*]] = hlfir.designate %[[VAL_17]] (%[[VAL_36]]) : (!fir.box>, i64) -> !fir.ref +// CHECK: hlfir.assign %[[VAL_35]] to %[[VAL_37]] : f16, !fir.ref +// CHECK: } +// CHECK: %[[VAL_38:.*]] = arith.subi %[[VAL_13]], %[[VAL_29]] overflow : i64 +// CHECK: %[[VAL_39:.*]] = arith.select %[[VAL_18]], %[[VAL_1]], %[[VAL_29]] : i64 +// CHECK: %[[VAL_40:.*]] = fir.convert %[[VAL_38]] : (i64) -> index +// CHECK: fir.do_loop %[[VAL_41:.*]] = %[[VAL_0]] to %[[VAL_40]] step %[[VAL_0]] unordered { +// CHECK: %[[VAL_42:.*]] = fir.convert %[[VAL_41]] : (index) -> i64 +// CHECK: %[[VAL_43:.*]] = arith.addi %[[VAL_42]], %[[VAL_39]] overflow : i64 +// CHECK: %[[VAL_44:.*]] = hlfir.designate %[[VAL_17]] (%[[VAL_43]]) : (!fir.box>, i64) -> !fir.ref +// CHECK: hlfir.assign %[[VAL_2]] to %[[VAL_44]] : f16, !fir.ref +// CHECK: } +// CHECK: } +// CHECK: hlfir.assign %[[VAL_15]] to %[[VAL_12]]#0 : !hlfir.expr, !fir.box> +// CHECK: hlfir.destroy %[[VAL_15]] : !hlfir.expr +// CHECK: return +// CHECK: } + +// ! Test contiguous 1D array with the scalar constant boundary. 
+// subroutine eoshift2(n, array) +// integer :: n +// logical(2) :: array(n) +// array = EOSHIFT(array, 2, boundary=.true._2, dim=1) +// end subroutine +func.func @_QPeoshift2(%arg0: !fir.ref {fir.bindc_name = "n"}, %arg1: !fir.ref>> {fir.bindc_name = "array"}) { + %c1_i32 = arith.constant 1 : i32 + %true = arith.constant true + %c2_i32 = arith.constant 2 : i32 + %c0 = arith.constant 0 : index + %0 = fir.dummy_scope : !fir.dscope + %1:2 = hlfir.declare %arg0 dummy_scope %0 {uniq_name = "_QFeoshift2En"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) + %2 = fir.load %1#0 : !fir.ref + %3 = fir.convert %2 : (i32) -> index + %4 = arith.cmpi sgt, %3, %c0 : index + %5 = arith.select %4, %3, %c0 : index + %6 = fir.shape %5 : (index) -> !fir.shape<1> + %7:2 = hlfir.declare %arg1(%6) dummy_scope %0 {uniq_name = "_QFeoshift2Earray"} : (!fir.ref>>, !fir.shape<1>, !fir.dscope) -> (!fir.box>>, !fir.ref>>) + %8 = fir.convert %true : (i1) -> !fir.logical<2> + %9 = hlfir.eoshift %7#0 %c2_i32 boundary %8 dim %c1_i32 : (!fir.box>>, i32, !fir.logical<2>, i32) -> !hlfir.expr> + hlfir.assign %9 to %7#0 : !hlfir.expr>, !fir.box>> + hlfir.destroy %9 : !hlfir.expr> + return +} +// CHECK-LABEL: func.func @_QPeoshift2( +// CHECK-SAME: %[[ARG0:.*]]: !fir.ref {fir.bindc_name = "n"}, +// CHECK-SAME: %[[ARG1:.*]]: !fir.ref>> {fir.bindc_name = "array"}) { +// CHECK: %[[VAL_0:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_1:.*]] = arith.constant 0 : i64 +// CHECK: %[[VAL_2:.*]] = arith.constant true +// CHECK: %[[VAL_3:.*]] = arith.constant 2 : i32 +// CHECK: %[[VAL_4:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_5:.*]] = fir.dummy_scope : !fir.dscope +// CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_5]] {uniq_name = "_QFeoshift2En"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) +// CHECK: %[[VAL_7:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref +// CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (i32) -> index +// CHECK: %[[VAL_9:.*]] = arith.cmpi sgt, %[[VAL_8]], 
%[[VAL_4]] : index +// CHECK: %[[VAL_10:.*]] = arith.select %[[VAL_9]], %[[VAL_8]], %[[VAL_4]] : index +// CHECK: %[[VAL_11:.*]] = fir.shape %[[VAL_10]] : (index) -> !fir.shape<1> +// CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[ARG1]](%[[VAL_11]]) dummy_scope %[[VAL_5]] {uniq_name = "_QFeoshift2Earray"} : (!fir.ref>>, !fir.shape<1>, !fir.dscope) -> (!fir.box>>, !fir.ref>>) +// CHECK: %[[VAL_13:.*]] = fir.convert %[[VAL_2]] : (i1) -> !fir.logical<2> +// CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_10]] : (index) -> i64 +// CHECK: %[[VAL_15:.*]] = fir.convert %[[VAL_3]] : (i32) -> i64 +// CHECK: %[[VAL_16:.*]] = hlfir.eval_in_mem shape %[[VAL_11]] : (!fir.shape<1>) -> !hlfir.expr> { +// CHECK: ^bb0(%[[VAL_17:.*]]: !fir.ref>>): +// CHECK: %[[VAL_18:.*]] = fir.embox %[[VAL_17]](%[[VAL_11]]) : (!fir.ref>>, !fir.shape<1>) -> !fir.box>> +// CHECK: %[[VAL_19:.*]] = arith.cmpi slt, %[[VAL_15]], %[[VAL_1]] : i64 +// CHECK: %[[VAL_20:.*]] = arith.subi %[[VAL_1]], %[[VAL_15]] overflow : i64 +// CHECK: %[[VAL_21:.*]] = arith.select %[[VAL_19]], %[[VAL_20]], %[[VAL_1]] : i64 +// CHECK: %[[VAL_22:.*]] = arith.select %[[VAL_19]], %[[VAL_1]], %[[VAL_15]] : i64 +// CHECK: %[[VAL_23:.*]] = arith.subi %[[VAL_1]], %[[VAL_14]] overflow : i64 +// CHECK: %[[VAL_24:.*]] = arith.addi %[[VAL_14]], %[[VAL_15]] overflow : i64 +// CHECK: %[[VAL_25:.*]] = arith.cmpi sgt, %[[VAL_23]], %[[VAL_15]] : i64 +// CHECK: %[[VAL_26:.*]] = arith.select %[[VAL_25]], %[[VAL_1]], %[[VAL_24]] : i64 +// CHECK: %[[VAL_27:.*]] = arith.subi %[[VAL_14]], %[[VAL_15]] overflow : i64 +// CHECK: %[[VAL_28:.*]] = arith.cmpi slt, %[[VAL_14]], %[[VAL_15]] : i64 +// CHECK: %[[VAL_29:.*]] = arith.select %[[VAL_28]], %[[VAL_1]], %[[VAL_27]] : i64 +// CHECK: %[[VAL_30:.*]] = arith.select %[[VAL_19]], %[[VAL_26]], %[[VAL_29]] : i64 +// CHECK: %[[VAL_31:.*]] = fir.convert %[[VAL_30]] : (i64) -> index +// CHECK: fir.do_loop %[[VAL_32:.*]] = %[[VAL_0]] to %[[VAL_31]] step %[[VAL_0]] unordered { +// CHECK: %[[VAL_33:.*]] = 
fir.convert %[[VAL_32]] : (index) -> i64 +// CHECK: %[[VAL_34:.*]] = arith.addi %[[VAL_33]], %[[VAL_22]] overflow : i64 +// CHECK: %[[VAL_35:.*]] = hlfir.designate %[[VAL_12]]#0 (%[[VAL_34]]) : (!fir.box>>, i64) -> !fir.ref> +// CHECK: %[[VAL_36:.*]] = fir.load %[[VAL_35]] : !fir.ref> +// CHECK: %[[VAL_37:.*]] = arith.addi %[[VAL_33]], %[[VAL_21]] overflow : i64 +// CHECK: %[[VAL_38:.*]] = hlfir.designate %[[VAL_18]] (%[[VAL_37]]) : (!fir.box>>, i64) -> !fir.ref> +// CHECK: hlfir.assign %[[VAL_36]] to %[[VAL_38]] : !fir.logical<2>, !fir.ref> +// CHECK: } +// CHECK: %[[VAL_39:.*]] = arith.subi %[[VAL_14]], %[[VAL_30]] overflow : i64 +// CHECK: %[[VAL_40:.*]] = arith.select %[[VAL_19]], %[[VAL_1]], %[[VAL_30]] : i64 +// CHECK: %[[VAL_41:.*]] = fir.convert %[[VAL_39]] : (i64) -> index +// CHECK: fir.do_loop %[[VAL_42:.*]] = %[[VAL_0]] to %[[VAL_41]] step %[[VAL_0]] unordered { +// CHECK: %[[VAL_43:.*]] = fir.convert %[[VAL_42]] : (index) -> i64 +// CHECK: %[[VAL_44:.*]] = arith.addi %[[VAL_43]], %[[VAL_40]] overflow : i64 +// CHECK: %[[VAL_45:.*]] = hlfir.designate %[[VAL_18]] (%[[VAL_44]]) : (!fir.box>>, i64) -> !fir.ref> +// CHECK: hlfir.assign %[[VAL_13]] to %[[VAL_45]] : !fir.logical<2>, !fir.ref> +// CHECK: } +// CHECK: } +// CHECK: hlfir.assign %[[VAL_16]] to %[[VAL_12]]#0 : !hlfir.expr>, !fir.box>> +// CHECK: hlfir.destroy %[[VAL_16]] : !hlfir.expr> +// CHECK: return +// CHECK: } + +// ! Test contiguous 1D array with the scalar always present boundary. 
+// subroutine eoshift3(n, array, boundary) +// integer :: n +// complex(2) :: array(n), boundary +// array = EOSHIFT(array, 2, boundary) +// end subroutine +func.func @_QPeoshift3(%arg0: !fir.ref {fir.bindc_name = "n"}, %arg1: !fir.ref>> {fir.bindc_name = "array"}, %arg2: !fir.ref> {fir.bindc_name = "boundary"}) { + %c2_i32 = arith.constant 2 : i32 + %c0 = arith.constant 0 : index + %0 = fir.dummy_scope : !fir.dscope + %1:2 = hlfir.declare %arg0 dummy_scope %0 {uniq_name = "_QFeoshift3En"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) + %2:2 = hlfir.declare %arg2 dummy_scope %0 {uniq_name = "_QFeoshift3Eboundary"} : (!fir.ref>, !fir.dscope) -> (!fir.ref>, !fir.ref>) + %3 = fir.load %1#0 : !fir.ref + %4 = fir.convert %3 : (i32) -> index + %5 = arith.cmpi sgt, %4, %c0 : index + %6 = arith.select %5, %4, %c0 : index + %7 = fir.shape %6 : (index) -> !fir.shape<1> + %8:2 = hlfir.declare %arg1(%7) dummy_scope %0 {uniq_name = "_QFeoshift3Earray"} : (!fir.ref>>, !fir.shape<1>, !fir.dscope) -> (!fir.box>>, !fir.ref>>) + %9 = hlfir.eoshift %8#0 %c2_i32 boundary %2#0 : (!fir.box>>, i32, !fir.ref>) -> !hlfir.expr> + hlfir.assign %9 to %8#0 : !hlfir.expr>, !fir.box>> + hlfir.destroy %9 : !hlfir.expr> + return +} +// CHECK-LABEL: func.func @_QPeoshift3( +// CHECK-SAME: %[[ARG0:.*]]: !fir.ref {fir.bindc_name = "n"}, +// CHECK-SAME: %[[ARG1:.*]]: !fir.ref>> {fir.bindc_name = "array"}, +// CHECK-SAME: %[[ARG2:.*]]: !fir.ref> {fir.bindc_name = "boundary"}) { +// CHECK: %[[VAL_0:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_1:.*]] = arith.constant 0 : i64 +// CHECK: %[[VAL_2:.*]] = arith.constant 2 : i32 +// CHECK: %[[VAL_3:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_4:.*]] = fir.dummy_scope : !fir.dscope +// CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_4]] {uniq_name = "_QFeoshift3En"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) +// CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %[[VAL_4]] {uniq_name = "_QFeoshift3Eboundary"} 
: (!fir.ref>, !fir.dscope) -> (!fir.ref>, !fir.ref>) +// CHECK: %[[VAL_7:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref +// CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (i32) -> index +// CHECK: %[[VAL_9:.*]] = arith.cmpi sgt, %[[VAL_8]], %[[VAL_3]] : index +// CHECK: %[[VAL_10:.*]] = arith.select %[[VAL_9]], %[[VAL_8]], %[[VAL_3]] : index +// CHECK: %[[VAL_11:.*]] = fir.shape %[[VAL_10]] : (index) -> !fir.shape<1> +// CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[ARG1]](%[[VAL_11]]) dummy_scope %[[VAL_4]] {uniq_name = "_QFeoshift3Earray"} : (!fir.ref>>, !fir.shape<1>, !fir.dscope) -> (!fir.box>>, !fir.ref>>) +// CHECK: %[[VAL_13:.*]] = fir.convert %[[VAL_10]] : (index) -> i64 +// CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_2]] : (i32) -> i64 +// CHECK: %[[VAL_15:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref> +// CHECK: %[[VAL_16:.*]] = hlfir.eval_in_mem shape %[[VAL_11]] : (!fir.shape<1>) -> !hlfir.expr> { +// CHECK: ^bb0(%[[VAL_17:.*]]: !fir.ref>>): +// CHECK: %[[VAL_18:.*]] = fir.embox %[[VAL_17]](%[[VAL_11]]) : (!fir.ref>>, !fir.shape<1>) -> !fir.box>> +// CHECK: %[[VAL_19:.*]] = arith.cmpi slt, %[[VAL_14]], %[[VAL_1]] : i64 +// CHECK: %[[VAL_20:.*]] = arith.subi %[[VAL_1]], %[[VAL_14]] overflow : i64 +// CHECK: %[[VAL_21:.*]] = arith.select %[[VAL_19]], %[[VAL_20]], %[[VAL_1]] : i64 +// CHECK: %[[VAL_22:.*]] = arith.select %[[VAL_19]], %[[VAL_1]], %[[VAL_14]] : i64 +// CHECK: %[[VAL_23:.*]] = arith.subi %[[VAL_1]], %[[VAL_13]] overflow : i64 +// CHECK: %[[VAL_24:.*]] = arith.addi %[[VAL_13]], %[[VAL_14]] overflow : i64 +// CHECK: %[[VAL_25:.*]] = arith.cmpi sgt, %[[VAL_23]], %[[VAL_14]] : i64 +// CHECK: %[[VAL_26:.*]] = arith.select %[[VAL_25]], %[[VAL_1]], %[[VAL_24]] : i64 +// CHECK: %[[VAL_27:.*]] = arith.subi %[[VAL_13]], %[[VAL_14]] overflow : i64 +// CHECK: %[[VAL_28:.*]] = arith.cmpi slt, %[[VAL_13]], %[[VAL_14]] : i64 +// CHECK: %[[VAL_29:.*]] = arith.select %[[VAL_28]], %[[VAL_1]], %[[VAL_27]] : i64 +// CHECK: %[[VAL_30:.*]] = arith.select %[[VAL_19]], %[[VAL_26]], 
%[[VAL_29]] : i64 +// CHECK: %[[VAL_31:.*]] = fir.convert %[[VAL_30]] : (i64) -> index +// CHECK: fir.do_loop %[[VAL_32:.*]] = %[[VAL_0]] to %[[VAL_31]] step %[[VAL_0]] unordered { +// CHECK: %[[VAL_33:.*]] = fir.convert %[[VAL_32]] : (index) -> i64 +// CHECK: %[[VAL_34:.*]] = arith.addi %[[VAL_33]], %[[VAL_22]] overflow : i64 +// CHECK: %[[VAL_35:.*]] = hlfir.designate %[[VAL_12]]#0 (%[[VAL_34]]) : (!fir.box>>, i64) -> !fir.ref> +// CHECK: %[[VAL_36:.*]] = fir.load %[[VAL_35]] : !fir.ref> +// CHECK: %[[VAL_37:.*]] = arith.addi %[[VAL_33]], %[[VAL_21]] overflow : i64 +// CHECK: %[[VAL_38:.*]] = hlfir.designate %[[VAL_18]] (%[[VAL_37]]) : (!fir.box>>, i64) -> !fir.ref> +// CHECK: hlfir.assign %[[VAL_36]] to %[[VAL_38]] : complex, !fir.ref> +// CHECK: } +// CHECK: %[[VAL_39:.*]] = arith.subi %[[VAL_13]], %[[VAL_30]] overflow : i64 +// CHECK: %[[VAL_40:.*]] = arith.select %[[VAL_19]], %[[VAL_1]], %[[VAL_30]] : i64 +// CHECK: %[[VAL_41:.*]] = fir.convert %[[VAL_39]] : (i64) -> index +// CHECK: fir.do_loop %[[VAL_42:.*]] = %[[VAL_0]] to %[[VAL_41]] step %[[VAL_0]] unordered { +// CHECK: %[[VAL_43:.*]] = fir.convert %[[VAL_42]] : (index) -> i64 +// CHECK: %[[VAL_44:.*]] = arith.addi %[[VAL_43]], %[[VAL_40]] overflow : i64 +// CHECK: %[[VAL_45:.*]] = hlfir.designate %[[VAL_18]] (%[[VAL_44]]) : (!fir.box>>, i64) -> !fir.ref> +// CHECK: hlfir.assign %[[VAL_15]] to %[[VAL_45]] : complex, !fir.ref> +// CHECK: } +// CHECK: } +// CHECK: hlfir.assign %[[VAL_16]] to %[[VAL_12]]#0 : !hlfir.expr>, !fir.box>> +// CHECK: hlfir.destroy %[[VAL_16]] : !hlfir.expr> +// CHECK: return +// CHECK: } + +// ! Test contiguous 1D array with the scalar optional boundary. 
+// subroutine eoshift4(n, array, boundary) +// integer :: n +// logical :: array(n) +// logical, optional :: boundary +// array = EOSHIFT(array, 2, boundary) +// end subroutine +func.func @_QPeoshift4(%arg0: !fir.ref {fir.bindc_name = "n"}, %arg1: !fir.ref>> {fir.bindc_name = "array"}, %arg2: !fir.ref> {fir.bindc_name = "boundary", fir.optional}) { + %c2_i32 = arith.constant 2 : i32 + %c0 = arith.constant 0 : index + %0 = fir.dummy_scope : !fir.dscope + %1:2 = hlfir.declare %arg0 dummy_scope %0 {uniq_name = "_QFeoshift4En"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) + %2:2 = hlfir.declare %arg2 dummy_scope %0 {fortran_attrs = #fir.var_attrs, uniq_name = "_QFeoshift4Eboundary"} : (!fir.ref>, !fir.dscope) -> (!fir.ref>, !fir.ref>) + %3 = fir.load %1#0 : !fir.ref + %4 = fir.convert %3 : (i32) -> index + %5 = arith.cmpi sgt, %4, %c0 : index + %6 = arith.select %5, %4, %c0 : index + %7 = fir.shape %6 : (index) -> !fir.shape<1> + %8:2 = hlfir.declare %arg1(%7) dummy_scope %0 {uniq_name = "_QFeoshift4Earray"} : (!fir.ref>>, !fir.shape<1>, !fir.dscope) -> (!fir.box>>, !fir.ref>>) + %9 = fir.is_present %2#0 : (!fir.ref>) -> i1 + %10 = fir.embox %2#0 : (!fir.ref>) -> !fir.box> + %11 = fir.absent !fir.box> + %12 = arith.select %9, %10, %11 : !fir.box> + %13 = hlfir.eoshift %8#0 %c2_i32 boundary %12 : (!fir.box>>, i32, !fir.box>) -> !hlfir.expr> + hlfir.assign %13 to %8#0 : !hlfir.expr>, !fir.box>> + hlfir.destroy %13 : !hlfir.expr> + return +} +// CHECK-LABEL: func.func @_QPeoshift4( +// CHECK-SAME: %[[ARG0:.*]]: !fir.ref {fir.bindc_name = "n"}, +// CHECK-SAME: %[[ARG1:.*]]: !fir.ref>> {fir.bindc_name = "array"}, +// CHECK-SAME: %[[ARG2:.*]]: !fir.ref> {fir.bindc_name = "boundary", fir.optional}) { +// CHECK: %[[VAL_0:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_1:.*]] = arith.constant 0 : i64 +// CHECK: %[[VAL_2:.*]] = arith.constant false +// CHECK: %[[VAL_3:.*]] = arith.constant 2 : i32 +// CHECK: %[[VAL_4:.*]] = arith.constant 0 : index +// CHECK: 
%[[VAL_5:.*]] = fir.dummy_scope : !fir.dscope +// CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_5]] {uniq_name = "_QFeoshift4En"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) +// CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %[[VAL_5]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFeoshift4Eboundary"} : (!fir.ref>, !fir.dscope) -> (!fir.ref>, !fir.ref>) +// CHECK: %[[VAL_8:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref +// CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i32) -> index +// CHECK: %[[VAL_10:.*]] = arith.cmpi sgt, %[[VAL_9]], %[[VAL_4]] : index +// CHECK: %[[VAL_11:.*]] = arith.select %[[VAL_10]], %[[VAL_9]], %[[VAL_4]] : index +// CHECK: %[[VAL_12:.*]] = fir.shape %[[VAL_11]] : (index) -> !fir.shape<1> +// CHECK: %[[VAL_13:.*]]:2 = hlfir.declare %[[ARG1]](%[[VAL_12]]) dummy_scope %[[VAL_5]] {uniq_name = "_QFeoshift4Earray"} : (!fir.ref>>, !fir.shape<1>, !fir.dscope) -> (!fir.box>>, !fir.ref>>) +// CHECK: %[[VAL_14:.*]] = fir.is_present %[[VAL_7]]#0 : (!fir.ref>) -> i1 +// CHECK: %[[VAL_15:.*]] = fir.embox %[[VAL_7]]#0 : (!fir.ref>) -> !fir.box> +// CHECK: %[[VAL_16:.*]] = fir.absent !fir.box> +// CHECK: %[[VAL_17:.*]] = arith.select %[[VAL_14]], %[[VAL_15]], %[[VAL_16]] : !fir.box> +// CHECK: %[[VAL_18:.*]] = fir.convert %[[VAL_11]] : (index) -> i64 +// CHECK: %[[VAL_19:.*]] = fir.convert %[[VAL_3]] : (i32) -> i64 +// CHECK: %[[VAL_20:.*]] = fir.is_present %[[VAL_17]] : (!fir.box>) -> i1 +// CHECK: %[[VAL_21:.*]] = fir.if %[[VAL_20]] -> (!fir.logical<4>) { +// CHECK: %[[VAL_22:.*]] = fir.box_addr %[[VAL_17]] : (!fir.box>) -> !fir.ref> +// CHECK: %[[VAL_23:.*]] = fir.load %[[VAL_22]] : !fir.ref> +// CHECK: fir.result %[[VAL_23]] : !fir.logical<4> +// CHECK: } else { +// CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_2]] : (i1) -> !fir.logical<4> +// CHECK: fir.result %[[VAL_24]] : !fir.logical<4> +// CHECK: } +// CHECK: %[[VAL_25:.*]] = hlfir.eval_in_mem shape %[[VAL_12]] : (!fir.shape<1>) -> !hlfir.expr> { +// CHECK: 
^bb0(%[[VAL_26:.*]]: !fir.ref>>): +// CHECK: %[[VAL_27:.*]] = fir.embox %[[VAL_26]](%[[VAL_12]]) : (!fir.ref>>, !fir.shape<1>) -> !fir.box>> +// CHECK: %[[VAL_28:.*]] = arith.cmpi slt, %[[VAL_19]], %[[VAL_1]] : i64 +// CHECK: %[[VAL_29:.*]] = arith.subi %[[VAL_1]], %[[VAL_19]] overflow : i64 +// CHECK: %[[VAL_30:.*]] = arith.select %[[VAL_28]], %[[VAL_29]], %[[VAL_1]] : i64 +// CHECK: %[[VAL_31:.*]] = arith.select %[[VAL_28]], %[[VAL_1]], %[[VAL_19]] : i64 +// CHECK: %[[VAL_32:.*]] = arith.subi %[[VAL_1]], %[[VAL_18]] overflow : i64 +// CHECK: %[[VAL_33:.*]] = arith.addi %[[VAL_18]], %[[VAL_19]] overflow : i64 +// CHECK: %[[VAL_34:.*]] = arith.cmpi sgt, %[[VAL_32]], %[[VAL_19]] : i64 +// CHECK: %[[VAL_35:.*]] = arith.select %[[VAL_34]], %[[VAL_1]], %[[VAL_33]] : i64 +// CHECK: %[[VAL_36:.*]] = arith.subi %[[VAL_18]], %[[VAL_19]] overflow : i64 +// CHECK: %[[VAL_37:.*]] = arith.cmpi slt, %[[VAL_18]], %[[VAL_19]] : i64 +// CHECK: %[[VAL_38:.*]] = arith.select %[[VAL_37]], %[[VAL_1]], %[[VAL_36]] : i64 +// CHECK: %[[VAL_39:.*]] = arith.select %[[VAL_28]], %[[VAL_35]], %[[VAL_38]] : i64 +// CHECK: %[[VAL_40:.*]] = fir.convert %[[VAL_39]] : (i64) -> index +// CHECK: fir.do_loop %[[VAL_41:.*]] = %[[VAL_0]] to %[[VAL_40]] step %[[VAL_0]] unordered { +// CHECK: %[[VAL_42:.*]] = fir.convert %[[VAL_41]] : (index) -> i64 +// CHECK: %[[VAL_43:.*]] = arith.addi %[[VAL_42]], %[[VAL_31]] overflow : i64 +// CHECK: %[[VAL_44:.*]] = hlfir.designate %[[VAL_13]]#0 (%[[VAL_43]]) : (!fir.box>>, i64) -> !fir.ref> +// CHECK: %[[VAL_45:.*]] = fir.load %[[VAL_44]] : !fir.ref> +// CHECK: %[[VAL_46:.*]] = arith.addi %[[VAL_42]], %[[VAL_30]] overflow : i64 +// CHECK: %[[VAL_47:.*]] = hlfir.designate %[[VAL_27]] (%[[VAL_46]]) : (!fir.box>>, i64) -> !fir.ref> +// CHECK: hlfir.assign %[[VAL_45]] to %[[VAL_47]] : !fir.logical<4>, !fir.ref> +// CHECK: } +// CHECK: %[[VAL_48:.*]] = arith.subi %[[VAL_18]], %[[VAL_39]] overflow : i64 +// CHECK: %[[VAL_49:.*]] = arith.select %[[VAL_28]], %[[VAL_1]], 
%[[VAL_39]] : i64 +// CHECK: %[[VAL_50:.*]] = fir.convert %[[VAL_48]] : (i64) -> index +// CHECK: fir.do_loop %[[VAL_51:.*]] = %[[VAL_0]] to %[[VAL_50]] step %[[VAL_0]] unordered { +// CHECK: %[[VAL_52:.*]] = fir.convert %[[VAL_51]] : (index) -> i64 +// CHECK: %[[VAL_53:.*]] = arith.addi %[[VAL_52]], %[[VAL_49]] overflow : i64 +// CHECK: %[[VAL_54:.*]] = hlfir.designate %[[VAL_27]] (%[[VAL_53]]) : (!fir.box>>, i64) -> !fir.ref> +// CHECK: hlfir.assign %[[VAL_21]] to %[[VAL_54]] : !fir.logical<4>, !fir.ref> +// CHECK: } +// CHECK: } +// CHECK: hlfir.assign %[[VAL_25]] to %[[VAL_13]]#0 : !hlfir.expr>, !fir.box>> +// CHECK: hlfir.destroy %[[VAL_25]] : !hlfir.expr> +// CHECK: return +// CHECK: } + +// ! Test contiguous 1D array with the array always present boundary. +// subroutine eoshift5(n, array, boundary) +// integer :: n +// real :: array(n,n) +// real :: boundary(:) +// array = EOSHIFT(array, 2, boundary) +// end subroutine +func.func @_QPeoshift5(%arg0: !fir.ref {fir.bindc_name = "n"}, %arg1: !fir.ref> {fir.bindc_name = "array"}, %arg2: !fir.box> {fir.bindc_name = "boundary"}) { + %c2_i32 = arith.constant 2 : i32 + %c0 = arith.constant 0 : index + %0 = fir.dummy_scope : !fir.dscope + %1:2 = hlfir.declare %arg0 dummy_scope %0 {uniq_name = "_QFeoshift5En"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) + %2:2 = hlfir.declare %arg2 dummy_scope %0 {uniq_name = "_QFeoshift5Eboundary"} : (!fir.box>, !fir.dscope) -> (!fir.box>, !fir.box>) + %3 = fir.load %1#0 : !fir.ref + %4 = fir.convert %3 : (i32) -> index + %5 = arith.cmpi sgt, %4, %c0 : index + %6 = arith.select %5, %4, %c0 : index + %7 = fir.load %1#0 : !fir.ref + %8 = fir.convert %7 : (i32) -> index + %9 = arith.cmpi sgt, %8, %c0 : index + %10 = arith.select %9, %8, %c0 : index + %11 = fir.shape %6, %10 : (index, index) -> !fir.shape<2> + %12:2 = hlfir.declare %arg1(%11) dummy_scope %0 {uniq_name = "_QFeoshift5Earray"} : (!fir.ref>, !fir.shape<2>, !fir.dscope) -> (!fir.box>, !fir.ref>) + %13 = hlfir.eoshift 
%12#0 %c2_i32 boundary %2#0 : (!fir.box>, i32, !fir.box>) -> !hlfir.expr + hlfir.assign %13 to %12#0 : !hlfir.expr, !fir.box> + hlfir.destroy %13 : !hlfir.expr + return +} +// CHECK-LABEL: func.func @_QPeoshift5( +// CHECK-SAME: %[[ARG0:.*]]: !fir.ref {fir.bindc_name = "n"}, +// CHECK-SAME: %[[ARG1:.*]]: !fir.ref> {fir.bindc_name = "array"}, +// CHECK-SAME: %[[ARG2:.*]]: !fir.box> {fir.bindc_name = "boundary"}) { +// CHECK: %[[VAL_0:.*]] = arith.constant 0 : i64 +// CHECK: %[[VAL_1:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_2:.*]] = arith.constant 2 : i32 +// CHECK: %[[VAL_3:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_4:.*]] = fir.dummy_scope : !fir.dscope +// CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_4]] {uniq_name = "_QFeoshift5En"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) +// CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %[[VAL_4]] {uniq_name = "_QFeoshift5Eboundary"} : (!fir.box>, !fir.dscope) -> (!fir.box>, !fir.box>) +// CHECK: %[[VAL_7:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref +// CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (i32) -> index +// CHECK: %[[VAL_9:.*]] = arith.cmpi sgt, %[[VAL_8]], %[[VAL_3]] : index +// CHECK: %[[VAL_10:.*]] = arith.select %[[VAL_9]], %[[VAL_8]], %[[VAL_3]] : index +// CHECK: %[[VAL_11:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref +// CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_11]] : (i32) -> index +// CHECK: %[[VAL_13:.*]] = arith.cmpi sgt, %[[VAL_12]], %[[VAL_3]] : index +// CHECK: %[[VAL_14:.*]] = arith.select %[[VAL_13]], %[[VAL_12]], %[[VAL_3]] : index +// CHECK: %[[VAL_15:.*]] = fir.shape %[[VAL_10]], %[[VAL_14]] : (index, index) -> !fir.shape<2> +// CHECK: %[[VAL_16:.*]]:2 = hlfir.declare %[[ARG1]](%[[VAL_15]]) dummy_scope %[[VAL_4]] {uniq_name = "_QFeoshift5Earray"} : (!fir.ref>, !fir.shape<2>, !fir.dscope) -> (!fir.box>, !fir.ref>) +// CHECK: %[[VAL_17:.*]] = fir.convert %[[VAL_10]] : (index) -> i64 +// CHECK: %[[VAL_18:.*]] = fir.convert %[[VAL_2]] : (i32) -> 
i64 +// CHECK: %[[VAL_19:.*]] = hlfir.eval_in_mem shape %[[VAL_15]] : (!fir.shape<2>) -> !hlfir.expr { +// CHECK: ^bb0(%[[VAL_20:.*]]: !fir.ref>): +// CHECK: %[[VAL_21:.*]] = fir.embox %[[VAL_20]](%[[VAL_15]]) : (!fir.ref>, !fir.shape<2>) -> !fir.box> +// CHECK: fir.do_loop %[[VAL_22:.*]] = %[[VAL_1]] to %[[VAL_14]] step %[[VAL_1]] unordered { +// CHECK: %[[VAL_23:.*]] = hlfir.designate %[[VAL_6]]#0 (%[[VAL_22]]) : (!fir.box>, index) -> !fir.ref +// CHECK: %[[VAL_24:.*]] = fir.load %[[VAL_23]] : !fir.ref +// CHECK: %[[VAL_25:.*]] = arith.cmpi slt, %[[VAL_18]], %[[VAL_0]] : i64 +// CHECK: %[[VAL_26:.*]] = arith.subi %[[VAL_0]], %[[VAL_18]] overflow : i64 +// CHECK: %[[VAL_27:.*]] = arith.select %[[VAL_25]], %[[VAL_26]], %[[VAL_0]] : i64 +// CHECK: %[[VAL_28:.*]] = arith.select %[[VAL_25]], %[[VAL_0]], %[[VAL_18]] : i64 +// CHECK: %[[VAL_29:.*]] = arith.subi %[[VAL_0]], %[[VAL_17]] overflow : i64 +// CHECK: %[[VAL_30:.*]] = arith.addi %[[VAL_17]], %[[VAL_18]] overflow : i64 +// CHECK: %[[VAL_31:.*]] = arith.cmpi sgt, %[[VAL_29]], %[[VAL_18]] : i64 +// CHECK: %[[VAL_32:.*]] = arith.select %[[VAL_31]], %[[VAL_0]], %[[VAL_30]] : i64 +// CHECK: %[[VAL_33:.*]] = arith.subi %[[VAL_17]], %[[VAL_18]] overflow : i64 +// CHECK: %[[VAL_34:.*]] = arith.cmpi slt, %[[VAL_17]], %[[VAL_18]] : i64 +// CHECK: %[[VAL_35:.*]] = arith.select %[[VAL_34]], %[[VAL_0]], %[[VAL_33]] : i64 +// CHECK: %[[VAL_36:.*]] = arith.select %[[VAL_25]], %[[VAL_32]], %[[VAL_35]] : i64 +// CHECK: %[[VAL_37:.*]] = fir.convert %[[VAL_36]] : (i64) -> index +// CHECK: fir.do_loop %[[VAL_38:.*]] = %[[VAL_1]] to %[[VAL_37]] step %[[VAL_1]] unordered { +// CHECK: %[[VAL_39:.*]] = fir.convert %[[VAL_38]] : (index) -> i64 +// CHECK: %[[VAL_40:.*]] = arith.addi %[[VAL_39]], %[[VAL_28]] overflow : i64 +// CHECK: %[[VAL_41:.*]] = hlfir.designate %[[VAL_16]]#0 (%[[VAL_40]], %[[VAL_22]]) : (!fir.box>, i64, index) -> !fir.ref +// CHECK: %[[VAL_42:.*]] = fir.load %[[VAL_41]] : !fir.ref +// CHECK: %[[VAL_43:.*]] = 
arith.addi %[[VAL_39]], %[[VAL_27]] overflow : i64 +// CHECK: %[[VAL_44:.*]] = hlfir.designate %[[VAL_21]] (%[[VAL_43]], %[[VAL_22]]) : (!fir.box>, i64, index) -> !fir.ref +// CHECK: hlfir.assign %[[VAL_42]] to %[[VAL_44]] : f32, !fir.ref +// CHECK: } +// CHECK: %[[VAL_45:.*]] = arith.subi %[[VAL_17]], %[[VAL_36]] overflow : i64 +// CHECK: %[[VAL_46:.*]] = arith.select %[[VAL_25]], %[[VAL_0]], %[[VAL_36]] : i64 +// CHECK: %[[VAL_47:.*]] = fir.convert %[[VAL_45]] : (i64) -> index +// CHECK: fir.do_loop %[[VAL_48:.*]] = %[[VAL_1]] to %[[VAL_47]] step %[[VAL_1]] unordered { +// CHECK: %[[VAL_49:.*]] = fir.convert %[[VAL_48]] : (index) -> i64 +// CHECK: %[[VAL_50:.*]] = arith.addi %[[VAL_49]], %[[VAL_46]] overflow : i64 +// CHECK: %[[VAL_51:.*]] = hlfir.designate %[[VAL_21]] (%[[VAL_50]], %[[VAL_22]]) : (!fir.box>, i64, index) -> !fir.ref +// CHECK: hlfir.assign %[[VAL_24]] to %[[VAL_51]] : f32, !fir.ref +// CHECK: } +// CHECK: } +// CHECK: } +// CHECK: hlfir.assign %[[VAL_19]] to %[[VAL_16]]#0 : !hlfir.expr, !fir.box> +// CHECK: hlfir.destroy %[[VAL_19]] : !hlfir.expr +// CHECK: return +// CHECK: } + +// ! Test contiguous 1D array with the array optional boundary. 
+// subroutine eoshift6(n, array, boundary) +// integer :: n +// real :: array(n,n) +// real, optional :: boundary(n) +// array = EOSHIFT(array, 2, boundary) +// end subroutine +func.func @_QPeoshift6(%arg0: !fir.ref {fir.bindc_name = "n"}, %arg1: !fir.ref> {fir.bindc_name = "array"}, %arg2: !fir.ref> {fir.bindc_name = "boundary", fir.optional}) { + %c2_i32 = arith.constant 2 : i32 + %c0 = arith.constant 0 : index + %0 = fir.dummy_scope : !fir.dscope + %1:2 = hlfir.declare %arg0 dummy_scope %0 {uniq_name = "_QFeoshift6En"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) + %2 = fir.load %1#0 : !fir.ref + %3 = fir.convert %2 : (i32) -> index + %4 = arith.cmpi sgt, %3, %c0 : index + %5 = arith.select %4, %3, %c0 : index + %6 = fir.load %1#0 : !fir.ref + %7 = fir.convert %6 : (i32) -> index + %8 = arith.cmpi sgt, %7, %c0 : index + %9 = arith.select %8, %7, %c0 : index + %10 = fir.shape %5, %9 : (index, index) -> !fir.shape<2> + %11:2 = hlfir.declare %arg1(%10) dummy_scope %0 {uniq_name = "_QFeoshift6Earray"} : (!fir.ref>, !fir.shape<2>, !fir.dscope) -> (!fir.box>, !fir.ref>) + %12 = fir.load %1#0 : !fir.ref + %13 = fir.convert %12 : (i32) -> index + %14 = arith.cmpi sgt, %13, %c0 : index + %15 = arith.select %14, %13, %c0 : index + %16 = fir.shape %15 : (index) -> !fir.shape<1> + %17:2 = hlfir.declare %arg2(%16) dummy_scope %0 {fortran_attrs = #fir.var_attrs, uniq_name = "_QFeoshift6Eboundary"} : (!fir.ref>, !fir.shape<1>, !fir.dscope) -> (!fir.box>, !fir.ref>) + %18 = fir.is_present %17#0 : (!fir.box>) -> i1 + %19 = fir.shape %15 : (index) -> !fir.shape<1> + %20 = fir.embox %17#1(%19) : (!fir.ref>, !fir.shape<1>) -> !fir.box> + %21 = fir.absent !fir.box> + %22 = arith.select %18, %20, %21 : !fir.box> + %23 = hlfir.eoshift %11#0 %c2_i32 boundary %22 : (!fir.box>, i32, !fir.box>) -> !hlfir.expr + hlfir.assign %23 to %11#0 : !hlfir.expr, !fir.box> + hlfir.destroy %23 : !hlfir.expr + return +} +// CHECK-LABEL: func.func @_QPeoshift6( +// CHECK-SAME: %[[ARG0:.*]]: 
!fir.ref {fir.bindc_name = "n"}, +// CHECK-SAME: %[[ARG1:.*]]: !fir.ref> {fir.bindc_name = "array"}, +// CHECK-SAME: %[[ARG2:.*]]: !fir.ref> {fir.bindc_name = "boundary", fir.optional}) { +// CHECK: %[[VAL_0:.*]] = arith.constant 0 : i64 +// CHECK: %[[VAL_1:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_2:.*]] = arith.constant false +// CHECK: %[[VAL_3:.*]] = arith.constant true +// CHECK: %[[VAL_4:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK: %[[VAL_5:.*]] = arith.constant 2 : i32 +// CHECK: %[[VAL_6:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_7:.*]] = fir.dummy_scope : !fir.dscope +// CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_7]] {uniq_name = "_QFeoshift6En"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) +// CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_8]]#0 : !fir.ref +// CHECK: %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (i32) -> index +// CHECK: %[[VAL_11:.*]] = arith.cmpi sgt, %[[VAL_10]], %[[VAL_6]] : index +// CHECK: %[[VAL_12:.*]] = arith.select %[[VAL_11]], %[[VAL_10]], %[[VAL_6]] : index +// CHECK: %[[VAL_13:.*]] = fir.load %[[VAL_8]]#0 : !fir.ref +// CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_13]] : (i32) -> index +// CHECK: %[[VAL_15:.*]] = arith.cmpi sgt, %[[VAL_14]], %[[VAL_6]] : index +// CHECK: %[[VAL_16:.*]] = arith.select %[[VAL_15]], %[[VAL_14]], %[[VAL_6]] : index +// CHECK: %[[VAL_17:.*]] = fir.shape %[[VAL_12]], %[[VAL_16]] : (index, index) -> !fir.shape<2> +// CHECK: %[[VAL_18:.*]]:2 = hlfir.declare %[[ARG1]](%[[VAL_17]]) dummy_scope %[[VAL_7]] {uniq_name = "_QFeoshift6Earray"} : (!fir.ref>, !fir.shape<2>, !fir.dscope) -> (!fir.box>, !fir.ref>) +// CHECK: %[[VAL_19:.*]] = fir.load %[[VAL_8]]#0 : !fir.ref +// CHECK: %[[VAL_20:.*]] = fir.convert %[[VAL_19]] : (i32) -> index +// CHECK: %[[VAL_21:.*]] = arith.cmpi sgt, %[[VAL_20]], %[[VAL_6]] : index +// CHECK: %[[VAL_22:.*]] = arith.select %[[VAL_21]], %[[VAL_20]], %[[VAL_6]] : index +// CHECK: %[[VAL_23:.*]] = fir.shape %[[VAL_22]] : (index) -> 
!fir.shape<1> +// CHECK: %[[VAL_24:.*]]:2 = hlfir.declare %[[ARG2]](%[[VAL_23]]) dummy_scope %[[VAL_7]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFeoshift6Eboundary"} : (!fir.ref>, !fir.shape<1>, !fir.dscope) -> (!fir.box>, !fir.ref>) +// CHECK: %[[VAL_25:.*]] = fir.is_present %[[VAL_24]]#0 : (!fir.box>) -> i1 +// CHECK: %[[VAL_26:.*]] = fir.shape %[[VAL_22]] : (index) -> !fir.shape<1> +// CHECK: %[[VAL_27:.*]] = fir.embox %[[VAL_24]]#1(%[[VAL_26]]) : (!fir.ref>, !fir.shape<1>) -> !fir.box> +// CHECK: %[[VAL_28:.*]] = fir.absent !fir.box> +// CHECK: %[[VAL_29:.*]] = arith.select %[[VAL_25]], %[[VAL_27]], %[[VAL_28]] : !fir.box> +// CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_12]] : (index) -> i64 +// CHECK: %[[VAL_31:.*]] = fir.convert %[[VAL_5]] : (i32) -> i64 +// CHECK: %[[VAL_32:.*]] = fir.is_present %[[VAL_29]] : (!fir.box>) -> i1 +// CHECK: %[[VAL_33:.*]] = arith.select %[[VAL_32]], %[[VAL_2]], %[[VAL_3]] : i1 +// CHECK: %[[VAL_34:.*]] = hlfir.eval_in_mem shape %[[VAL_17]] : (!fir.shape<2>) -> !hlfir.expr { +// CHECK: ^bb0(%[[VAL_35:.*]]: !fir.ref>): +// CHECK: %[[VAL_36:.*]] = fir.embox %[[VAL_35]](%[[VAL_17]]) : (!fir.ref>, !fir.shape<2>) -> !fir.box> +// CHECK: fir.do_loop %[[VAL_37:.*]] = %[[VAL_1]] to %[[VAL_16]] step %[[VAL_1]] unordered { +// CHECK: %[[VAL_38:.*]] = fir.if %[[VAL_33]] -> (f32) { +// CHECK: fir.result %[[VAL_4]] : f32 +// CHECK: } else { +// CHECK: %[[VAL_39:.*]]:3 = fir.box_dims %[[VAL_29]], %[[VAL_6]] : (!fir.box>, index) -> (index, index, index) +// CHECK: %[[VAL_40:.*]] = arith.subi %[[VAL_39]]#0, %[[VAL_1]] overflow : index +// CHECK: %[[VAL_41:.*]] = arith.addi %[[VAL_37]], %[[VAL_40]] overflow : index +// CHECK: %[[VAL_42:.*]] = hlfir.designate %[[VAL_29]] (%[[VAL_41]]) : (!fir.box>, index) -> !fir.ref +// CHECK: %[[VAL_43:.*]] = fir.load %[[VAL_42]] : !fir.ref +// CHECK: fir.result %[[VAL_43]] : f32 +// CHECK: } +// CHECK: %[[VAL_44:.*]] = arith.cmpi slt, %[[VAL_31]], %[[VAL_0]] : i64 +// CHECK: %[[VAL_45:.*]] = arith.subi 
%[[VAL_0]], %[[VAL_31]] overflow : i64 +// CHECK: %[[VAL_46:.*]] = arith.select %[[VAL_44]], %[[VAL_45]], %[[VAL_0]] : i64 +// CHECK: %[[VAL_47:.*]] = arith.select %[[VAL_44]], %[[VAL_0]], %[[VAL_31]] : i64 +// CHECK: %[[VAL_48:.*]] = arith.subi %[[VAL_0]], %[[VAL_30]] overflow : i64 +// CHECK: %[[VAL_49:.*]] = arith.addi %[[VAL_30]], %[[VAL_31]] overflow : i64 +// CHECK: %[[VAL_50:.*]] = arith.cmpi sgt, %[[VAL_48]], %[[VAL_31]] : i64 +// CHECK: %[[VAL_51:.*]] = arith.select %[[VAL_50]], %[[VAL_0]], %[[VAL_49]] : i64 +// CHECK: %[[VAL_52:.*]] = arith.subi %[[VAL_30]], %[[VAL_31]] overflow : i64 +// CHECK: %[[VAL_53:.*]] = arith.cmpi slt, %[[VAL_30]], %[[VAL_31]] : i64 +// CHECK: %[[VAL_54:.*]] = arith.select %[[VAL_53]], %[[VAL_0]], %[[VAL_52]] : i64 +// CHECK: %[[VAL_55:.*]] = arith.select %[[VAL_44]], %[[VAL_51]], %[[VAL_54]] : i64 +// CHECK: %[[VAL_56:.*]] = fir.convert %[[VAL_55]] : (i64) -> index +// CHECK: fir.do_loop %[[VAL_57:.*]] = %[[VAL_1]] to %[[VAL_56]] step %[[VAL_1]] unordered { +// CHECK: %[[VAL_58:.*]] = fir.convert %[[VAL_57]] : (index) -> i64 +// CHECK: %[[VAL_59:.*]] = arith.addi %[[VAL_58]], %[[VAL_47]] overflow : i64 +// CHECK: %[[VAL_60:.*]] = hlfir.designate %[[VAL_18]]#0 (%[[VAL_59]], %[[VAL_37]]) : (!fir.box>, i64, index) -> !fir.ref +// CHECK: %[[VAL_61:.*]] = fir.load %[[VAL_60]] : !fir.ref +// CHECK: %[[VAL_62:.*]] = arith.addi %[[VAL_58]], %[[VAL_46]] overflow : i64 +// CHECK: %[[VAL_63:.*]] = hlfir.designate %[[VAL_36]] (%[[VAL_62]], %[[VAL_37]]) : (!fir.box>, i64, index) -> !fir.ref +// CHECK: hlfir.assign %[[VAL_61]] to %[[VAL_63]] : f32, !fir.ref +// CHECK: } +// CHECK: %[[VAL_64:.*]] = arith.subi %[[VAL_30]], %[[VAL_55]] overflow : i64 +// CHECK: %[[VAL_65:.*]] = arith.select %[[VAL_44]], %[[VAL_0]], %[[VAL_55]] : i64 +// CHECK: %[[VAL_66:.*]] = fir.convert %[[VAL_64]] : (i64) -> index +// CHECK: fir.do_loop %[[VAL_67:.*]] = %[[VAL_1]] to %[[VAL_66]] step %[[VAL_1]] unordered { +// CHECK: %[[VAL_68:.*]] = fir.convert %[[VAL_67]] : 
(index) -> i64 +// CHECK: %[[VAL_69:.*]] = arith.addi %[[VAL_68]], %[[VAL_65]] overflow : i64 +// CHECK: %[[VAL_70:.*]] = hlfir.designate %[[VAL_36]] (%[[VAL_69]], %[[VAL_37]]) : (!fir.box>, i64, index) -> !fir.ref +// CHECK: hlfir.assign %[[VAL_38]] to %[[VAL_70]] : f32, !fir.ref +// CHECK: } +// CHECK: } +// CHECK: } +// CHECK: hlfir.assign %[[VAL_34]] to %[[VAL_18]]#0 : !hlfir.expr, !fir.box> +// CHECK: hlfir.destroy %[[VAL_34]] : !hlfir.expr +// CHECK: return +// CHECK: } + +// ! Test contiguous 1D array with the array expression boundary. +// subroutine eoshift7(n, array) +// interface +// function real_boundary(n) +// integer :: n +// real :: real_boundary(n) +// end function +// end interface +// integer :: n +// real :: array(n,n) +// array = EOSHIFT(array, 2, real_boundary(n)) +// end subroutine +func.func @_QPeoshift7(%arg0: !fir.ref {fir.bindc_name = "n"}, %arg1: !fir.ref> {fir.bindc_name = "array"}) { + %c2_i32 = arith.constant 2 : i32 + %c0 = arith.constant 0 : index + %0 = fir.dummy_scope : !fir.dscope + %1:2 = hlfir.declare %arg0 dummy_scope %0 {uniq_name = "_QFeoshift7En"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) + %2 = fir.load %1#0 : !fir.ref + %3 = fir.convert %2 : (i32) -> index + %4 = arith.cmpi sgt, %3, %c0 : index + %5 = arith.select %4, %3, %c0 : index + %6 = fir.load %1#0 : !fir.ref + %7 = fir.convert %6 : (i32) -> index + %8 = arith.cmpi sgt, %7, %c0 : index + %9 = arith.select %8, %7, %c0 : index + %10 = fir.shape %5, %9 : (index, index) -> !fir.shape<2> + %11:2 = hlfir.declare %arg1(%10) dummy_scope %0 {uniq_name = "_QFeoshift7Earray"} : (!fir.ref>, !fir.shape<2>, !fir.dscope) -> (!fir.box>, !fir.ref>) + %12:2 = hlfir.declare %1#0 {uniq_name = "_QFeoshift7Freal_boundaryEn"} : (!fir.ref) -> (!fir.ref, !fir.ref) + %13 = fir.load %12#0 : !fir.ref + %14 = fir.convert %13 : (i32) -> index + %15 = arith.cmpi sgt, %14, %c0 : index + %16 = arith.select %15, %14, %c0 : index + %17 = fir.shape %16 : (index) -> !fir.shape<1> + %18 = 
hlfir.eval_in_mem shape %17 : (!fir.shape<1>) -> !hlfir.expr { + ^bb0(%arg2: !fir.ref>): + %20 = fir.call @_QPreal_boundary(%1#0) fastmath : (!fir.ref) -> !fir.array + fir.save_result %20 to %arg2(%17) : !fir.array, !fir.ref>, !fir.shape<1> + } + %19 = hlfir.eoshift %11#0 %c2_i32 boundary %18 : (!fir.box>, i32, !hlfir.expr) -> !hlfir.expr + hlfir.assign %19 to %11#0 : !hlfir.expr, !fir.box> + hlfir.destroy %19 : !hlfir.expr + hlfir.destroy %18 : !hlfir.expr + return +} +// CHECK-LABEL: func.func @_QPeoshift7( +// CHECK-SAME: %[[ARG0:.*]]: !fir.ref {fir.bindc_name = "n"}, +// CHECK-SAME: %[[ARG1:.*]]: !fir.ref> {fir.bindc_name = "array"}) { +// CHECK: %[[VAL_0:.*]] = arith.constant 0 : i64 +// CHECK: %[[VAL_1:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_2:.*]] = arith.constant 2 : i32 +// CHECK: %[[VAL_3:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_4:.*]] = fir.dummy_scope : !fir.dscope +// CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_4]] {uniq_name = "_QFeoshift7En"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) +// CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref +// CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_6]] : (i32) -> index +// CHECK: %[[VAL_8:.*]] = arith.cmpi sgt, %[[VAL_7]], %[[VAL_3]] : index +// CHECK: %[[VAL_9:.*]] = arith.select %[[VAL_8]], %[[VAL_7]], %[[VAL_3]] : index +// CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref +// CHECK: %[[VAL_11:.*]] = fir.convert %[[VAL_10]] : (i32) -> index +// CHECK: %[[VAL_12:.*]] = arith.cmpi sgt, %[[VAL_11]], %[[VAL_3]] : index +// CHECK: %[[VAL_13:.*]] = arith.select %[[VAL_12]], %[[VAL_11]], %[[VAL_3]] : index +// CHECK: %[[VAL_14:.*]] = fir.shape %[[VAL_9]], %[[VAL_13]] : (index, index) -> !fir.shape<2> +// CHECK: %[[VAL_15:.*]]:2 = hlfir.declare %[[ARG1]](%[[VAL_14]]) dummy_scope %[[VAL_4]] {uniq_name = "_QFeoshift7Earray"} : (!fir.ref>, !fir.shape<2>, !fir.dscope) -> (!fir.box>, !fir.ref>) +// CHECK: %[[VAL_16:.*]]:2 = hlfir.declare %[[VAL_5]]#0 {uniq_name = 
"_QFeoshift7Freal_boundaryEn"} : (!fir.ref) -> (!fir.ref, !fir.ref) +// CHECK: %[[VAL_17:.*]] = fir.load %[[VAL_16]]#0 : !fir.ref +// CHECK: %[[VAL_18:.*]] = fir.convert %[[VAL_17]] : (i32) -> index +// CHECK: %[[VAL_19:.*]] = arith.cmpi sgt, %[[VAL_18]], %[[VAL_3]] : index +// CHECK: %[[VAL_20:.*]] = arith.select %[[VAL_19]], %[[VAL_18]], %[[VAL_3]] : index +// CHECK: %[[VAL_21:.*]] = fir.shape %[[VAL_20]] : (index) -> !fir.shape<1> +// CHECK: %[[VAL_22:.*]] = hlfir.eval_in_mem shape %[[VAL_21]] : (!fir.shape<1>) -> !hlfir.expr { +// CHECK: ^bb0(%[[VAL_23:.*]]: !fir.ref>): +// CHECK: %[[VAL_24:.*]] = fir.call @_QPreal_boundary(%[[VAL_5]]#0) fastmath : (!fir.ref) -> !fir.array +// CHECK: fir.save_result %[[VAL_24]] to %[[VAL_23]](%[[VAL_21]]) : !fir.array, !fir.ref>, !fir.shape<1> +// CHECK: } +// CHECK: %[[VAL_25:.*]] = fir.convert %[[VAL_9]] : (index) -> i64 +// CHECK: %[[VAL_26:.*]] = fir.convert %[[VAL_2]] : (i32) -> i64 +// CHECK: %[[VAL_27:.*]] = hlfir.eval_in_mem shape %[[VAL_14]] : (!fir.shape<2>) -> !hlfir.expr { +// CHECK: ^bb0(%[[VAL_28:.*]]: !fir.ref>): +// CHECK: %[[VAL_29:.*]] = fir.embox %[[VAL_28]](%[[VAL_14]]) : (!fir.ref>, !fir.shape<2>) -> !fir.box> +// CHECK: fir.do_loop %[[VAL_30:.*]] = %[[VAL_1]] to %[[VAL_13]] step %[[VAL_1]] unordered { +// CHECK: %[[VAL_31:.*]] = hlfir.apply %[[VAL_22]], %[[VAL_30]] : (!hlfir.expr, index) -> f32 +// CHECK: %[[VAL_32:.*]] = arith.cmpi slt, %[[VAL_26]], %[[VAL_0]] : i64 +// CHECK: %[[VAL_33:.*]] = arith.subi %[[VAL_0]], %[[VAL_26]] overflow : i64 +// CHECK: %[[VAL_34:.*]] = arith.select %[[VAL_32]], %[[VAL_33]], %[[VAL_0]] : i64 +// CHECK: %[[VAL_35:.*]] = arith.select %[[VAL_32]], %[[VAL_0]], %[[VAL_26]] : i64 +// CHECK: %[[VAL_36:.*]] = arith.subi %[[VAL_0]], %[[VAL_25]] overflow : i64 +// CHECK: %[[VAL_37:.*]] = arith.addi %[[VAL_25]], %[[VAL_26]] overflow : i64 +// CHECK: %[[VAL_38:.*]] = arith.cmpi sgt, %[[VAL_36]], %[[VAL_26]] : i64 +// CHECK: %[[VAL_39:.*]] = arith.select %[[VAL_38]], %[[VAL_0]], 
%[[VAL_37]] : i64 +// CHECK: %[[VAL_40:.*]] = arith.subi %[[VAL_25]], %[[VAL_26]] overflow : i64 +// CHECK: %[[VAL_41:.*]] = arith.cmpi slt, %[[VAL_25]], %[[VAL_26]] : i64 +// CHECK: %[[VAL_42:.*]] = arith.select %[[VAL_41]], %[[VAL_0]], %[[VAL_40]] : i64 +// CHECK: %[[VAL_43:.*]] = arith.select %[[VAL_32]], %[[VAL_39]], %[[VAL_42]] : i64 +// CHECK: %[[VAL_44:.*]] = fir.convert %[[VAL_43]] : (i64) -> index +// CHECK: fir.do_loop %[[VAL_45:.*]] = %[[VAL_1]] to %[[VAL_44]] step %[[VAL_1]] unordered { +// CHECK: %[[VAL_46:.*]] = fir.convert %[[VAL_45]] : (index) -> i64 +// CHECK: %[[VAL_47:.*]] = arith.addi %[[VAL_46]], %[[VAL_35]] overflow : i64 +// CHECK: %[[VAL_48:.*]] = hlfir.designate %[[VAL_15]]#0 (%[[VAL_47]], %[[VAL_30]]) : (!fir.box>, i64, index) -> !fir.ref +// CHECK: %[[VAL_49:.*]] = fir.load %[[VAL_48]] : !fir.ref +// CHECK: %[[VAL_50:.*]] = arith.addi %[[VAL_46]], %[[VAL_34]] overflow : i64 +// CHECK: %[[VAL_51:.*]] = hlfir.designate %[[VAL_29]] (%[[VAL_50]], %[[VAL_30]]) : (!fir.box>, i64, index) -> !fir.ref +// CHECK: hlfir.assign %[[VAL_49]] to %[[VAL_51]] : f32, !fir.ref +// CHECK: } +// CHECK: %[[VAL_52:.*]] = arith.subi %[[VAL_25]], %[[VAL_43]] overflow : i64 +// CHECK: %[[VAL_53:.*]] = arith.select %[[VAL_32]], %[[VAL_0]], %[[VAL_43]] : i64 +// CHECK: %[[VAL_54:.*]] = fir.convert %[[VAL_52]] : (i64) -> index +// CHECK: fir.do_loop %[[VAL_55:.*]] = %[[VAL_1]] to %[[VAL_54]] step %[[VAL_1]] unordered { +// CHECK: %[[VAL_56:.*]] = fir.convert %[[VAL_55]] : (index) -> i64 +// CHECK: %[[VAL_57:.*]] = arith.addi %[[VAL_56]], %[[VAL_53]] overflow : i64 +// CHECK: %[[VAL_58:.*]] = hlfir.designate %[[VAL_29]] (%[[VAL_57]], %[[VAL_30]]) : (!fir.box>, i64, index) -> !fir.ref +// CHECK: hlfir.assign %[[VAL_31]] to %[[VAL_58]] : f32, !fir.ref +// CHECK: } +// CHECK: } +// CHECK: } +// CHECK: hlfir.assign %[[VAL_27]] to %[[VAL_15]]#0 : !hlfir.expr, !fir.box> +// CHECK: hlfir.destroy %[[VAL_27]] : !hlfir.expr +// CHECK: hlfir.destroy %[[VAL_22]] : !hlfir.expr +// 
CHECK: return +// CHECK: } + +// ! Tests for CHARACTER type (lowered via hlfir.elemental). + +// ! Test contiguous 1D array with statically absent boundary. +// ! CHARACTER with constant length. +// subroutine eoshift1c(n, array) +// integer :: n +// character(10,1) :: array(n) +// array = EOSHIFT(array, 2) +// end subroutine +func.func @_QPeoshift1c(%arg0: !fir.ref {fir.bindc_name = "n"}, %arg1: !fir.boxchar<1> {fir.bindc_name = "array"}) { + %c2_i32 = arith.constant 2 : i32 + %c0 = arith.constant 0 : index + %c10 = arith.constant 10 : index + %0 = fir.dummy_scope : !fir.dscope + %1:2 = hlfir.declare %arg0 dummy_scope %0 {uniq_name = "_QFeoshift1cEn"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) + %2:2 = fir.unboxchar %arg1 : (!fir.boxchar<1>) -> (!fir.ref>, index) + %3 = fir.convert %2#0 : (!fir.ref>) -> !fir.ref>> + %4 = fir.load %1#0 : !fir.ref + %5 = fir.convert %4 : (i32) -> index + %6 = arith.cmpi sgt, %5, %c0 : index + %7 = arith.select %6, %5, %c0 : index + %8 = fir.shape %7 : (index) -> !fir.shape<1> + %9:2 = hlfir.declare %3(%8) typeparams %c10 dummy_scope %0 {uniq_name = "_QFeoshift1cEarray"} : (!fir.ref>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box>>, !fir.ref>>) + %10 = hlfir.eoshift %9#0 %c2_i32 : (!fir.box>>, i32) -> !hlfir.expr> + hlfir.assign %10 to %9#0 : !hlfir.expr>, !fir.box>> + hlfir.destroy %10 : !hlfir.expr> + return +} +// CHECK-LABEL: func.func @_QPeoshift1c( +// CHECK-SAME: %[[ARG0:.*]]: !fir.ref {fir.bindc_name = "n"}, +// CHECK-SAME: %[[ARG1:.*]]: !fir.boxchar<1> {fir.bindc_name = "array"}) { +// CHECK: %[[VAL_0:.*]] = arith.constant 1 : i64 +// CHECK: %[[VAL_1:.*]] = arith.constant 2 : i32 +// CHECK: %[[VAL_2:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_3:.*]] = arith.constant 10 : index +// CHECK: %[[VAL_4:.*]] = fir.dummy_scope : !fir.dscope +// CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_4]] {uniq_name = "_QFeoshift1cEn"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) +// CHECK: 
%[[VAL_6:.*]]:2 = fir.unboxchar %[[ARG1]] : (!fir.boxchar<1>) -> (!fir.ref>, index) +// CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_6]]#0 : (!fir.ref>) -> !fir.ref>> +// CHECK: %[[VAL_8:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref +// CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i32) -> index +// CHECK: %[[VAL_10:.*]] = arith.cmpi sgt, %[[VAL_9]], %[[VAL_2]] : index +// CHECK: %[[VAL_11:.*]] = arith.select %[[VAL_10]], %[[VAL_9]], %[[VAL_2]] : index +// CHECK: %[[VAL_12:.*]] = fir.shape %[[VAL_11]] : (index) -> !fir.shape<1> +// CHECK: %[[VAL_13:.*]]:2 = hlfir.declare %[[VAL_7]](%[[VAL_12]]) typeparams %[[VAL_3]] dummy_scope %[[VAL_4]] {uniq_name = "_QFeoshift1cEarray"} : (!fir.ref>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box>>, !fir.ref>>) +// CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_11]] : (index) -> i64 +// CHECK: %[[VAL_15:.*]] = fir.convert %[[VAL_1]] : (i32) -> i64 +// CHECK: %[[VAL_16:.*]] = fir.alloca !fir.char<1,0> {bindc_name = ".chrtmp"} +// CHECK: %[[VAL_17:.*]] = fir.emboxchar %[[VAL_16]], %[[VAL_2]] : (!fir.ref>, index) -> !fir.boxchar<1> +// CHECK: %[[VAL_18:.*]] = hlfir.elemental %[[VAL_12]] typeparams %[[VAL_3]] unordered : (!fir.shape<1>, index) -> !hlfir.expr> { +// CHECK: ^bb0(%[[VAL_19:.*]]: index): +// CHECK: %[[VAL_20:.*]] = fir.convert %[[VAL_19]] : (index) -> i64 +// CHECK: %[[VAL_21:.*]] = arith.addi %[[VAL_20]], %[[VAL_15]] overflow : i64 +// CHECK: %[[VAL_22:.*]] = arith.cmpi sge, %[[VAL_21]], %[[VAL_0]] : i64 +// CHECK: %[[VAL_23:.*]] = arith.cmpi sle, %[[VAL_21]], %[[VAL_14]] : i64 +// CHECK: %[[VAL_24:.*]] = arith.andi %[[VAL_22]], %[[VAL_23]] : i1 +// CHECK: %[[VAL_25:.*]] = fir.if %[[VAL_24]] -> (!fir.boxchar<1>) { +// CHECK: %[[VAL_26:.*]] = fir.convert %[[VAL_21]] : (i64) -> index +// CHECK: %[[VAL_27:.*]] = hlfir.designate %[[VAL_13]]#0 (%[[VAL_26]]) typeparams %[[VAL_3]] : (!fir.box>>, index, index) -> !fir.ref> +// CHECK: %[[VAL_28:.*]] = fir.emboxchar %[[VAL_27]], %[[VAL_3]] : (!fir.ref>, index) -> !fir.boxchar<1> +// 
CHECK: fir.result %[[VAL_28]] : !fir.boxchar<1> +// CHECK: } else { +// CHECK: fir.result %[[VAL_17]] : !fir.boxchar<1> +// CHECK: } +// CHECK: hlfir.yield_element %[[VAL_25]] : !fir.boxchar<1> +// CHECK: } +// CHECK: hlfir.assign %[[VAL_18]] to %[[VAL_13]]#0 : !hlfir.expr>, !fir.box>> +// CHECK: hlfir.destroy %[[VAL_18]] : !hlfir.expr> +// CHECK: return +// CHECK: } + +// ! Test contiguous 1D array with statically absent boundary. +// ! CHARACTER with variable length. +// subroutine eoshift2c(n, array) +// integer :: n +// character(n,1) :: array(n) +// array = EOSHIFT(array, 2) +// end subroutine +func.func @_QPeoshift2c(%arg0: !fir.ref {fir.bindc_name = "n"}, %arg1: !fir.boxchar<1> {fir.bindc_name = "array"}) { + %c2_i32 = arith.constant 2 : i32 + %c0 = arith.constant 0 : index + %c0_i32 = arith.constant 0 : i32 + %0 = fir.dummy_scope : !fir.dscope + %1:2 = hlfir.declare %arg0 dummy_scope %0 {uniq_name = "_QFeoshift2cEn"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) + %2:2 = fir.unboxchar %arg1 : (!fir.boxchar<1>) -> (!fir.ref>, index) + %3 = fir.convert %2#0 : (!fir.ref>) -> !fir.ref>> + %4 = fir.load %1#0 : !fir.ref + %5 = arith.cmpi sgt, %4, %c0_i32 : i32 + %6 = arith.select %5, %4, %c0_i32 : i32 + %7 = fir.load %1#0 : !fir.ref + %8 = fir.convert %7 : (i32) -> index + %9 = arith.cmpi sgt, %8, %c0 : index + %10 = arith.select %9, %8, %c0 : index + %11 = fir.shape %10 : (index) -> !fir.shape<1> + %12:2 = hlfir.declare %3(%11) typeparams %6 dummy_scope %0 {uniq_name = "_QFeoshift2cEarray"} : (!fir.ref>>, !fir.shape<1>, i32, !fir.dscope) -> (!fir.box>>, !fir.ref>>) + %13 = hlfir.eoshift %12#0 %c2_i32 : (!fir.box>>, i32) -> !hlfir.expr> + hlfir.assign %13 to %12#0 : !hlfir.expr>, !fir.box>> + hlfir.destroy %13 : !hlfir.expr> + return +} +// CHECK-LABEL: func.func @_QPeoshift2c( +// CHECK-SAME: %[[ARG0:.*]]: !fir.ref {fir.bindc_name = "n"}, +// CHECK-SAME: %[[ARG1:.*]]: !fir.boxchar<1> {fir.bindc_name = "array"}) { +// CHECK: %[[VAL_0:.*]] = arith.constant 1 : 
i64 +// CHECK: %[[VAL_1:.*]] = arith.constant 2 : i32 +// CHECK: %[[VAL_2:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_3:.*]] = arith.constant 0 : i32 +// CHECK: %[[VAL_4:.*]] = fir.dummy_scope : !fir.dscope +// CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_4]] {uniq_name = "_QFeoshift2cEn"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) +// CHECK: %[[VAL_6:.*]]:2 = fir.unboxchar %[[ARG1]] : (!fir.boxchar<1>) -> (!fir.ref>, index) +// CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_6]]#0 : (!fir.ref>) -> !fir.ref>> +// CHECK: %[[VAL_8:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref +// CHECK: %[[VAL_9:.*]] = arith.cmpi sgt, %[[VAL_8]], %[[VAL_3]] : i32 +// CHECK: %[[VAL_10:.*]] = arith.select %[[VAL_9]], %[[VAL_8]], %[[VAL_3]] : i32 +// CHECK: %[[VAL_11:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref +// CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_11]] : (i32) -> index +// CHECK: %[[VAL_13:.*]] = arith.cmpi sgt, %[[VAL_12]], %[[VAL_2]] : index +// CHECK: %[[VAL_14:.*]] = arith.select %[[VAL_13]], %[[VAL_12]], %[[VAL_2]] : index +// CHECK: %[[VAL_15:.*]] = fir.shape %[[VAL_14]] : (index) -> !fir.shape<1> +// CHECK: %[[VAL_16:.*]]:2 = hlfir.declare %[[VAL_7]](%[[VAL_15]]) typeparams %[[VAL_10]] dummy_scope %[[VAL_4]] {uniq_name = "_QFeoshift2cEarray"} : (!fir.ref>>, !fir.shape<1>, i32, !fir.dscope) -> (!fir.box>>, !fir.ref>>) +// CHECK: %[[VAL_17:.*]] = fir.convert %[[VAL_14]] : (index) -> i64 +// CHECK: %[[VAL_18:.*]] = fir.convert %[[VAL_1]] : (i32) -> i64 +// CHECK: %[[VAL_19:.*]] = fir.alloca !fir.char<1,0> {bindc_name = ".chrtmp"} +// CHECK: %[[VAL_20:.*]] = fir.emboxchar %[[VAL_19]], %[[VAL_2]] : (!fir.ref>, index) -> !fir.boxchar<1> +// CHECK: %[[VAL_21:.*]] = hlfir.elemental %[[VAL_15]] typeparams %[[VAL_10]] unordered : (!fir.shape<1>, i32) -> !hlfir.expr> { +// CHECK: ^bb0(%[[VAL_22:.*]]: index): +// CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_22]] : (index) -> i64 +// CHECK: %[[VAL_24:.*]] = arith.addi %[[VAL_23]], %[[VAL_18]] overflow : i64 +// 
CHECK: %[[VAL_25:.*]] = arith.cmpi sge, %[[VAL_24]], %[[VAL_0]] : i64 +// CHECK: %[[VAL_26:.*]] = arith.cmpi sle, %[[VAL_24]], %[[VAL_17]] : i64 +// CHECK: %[[VAL_27:.*]] = arith.andi %[[VAL_25]], %[[VAL_26]] : i1 +// CHECK: %[[VAL_28:.*]] = fir.if %[[VAL_27]] -> (!fir.boxchar<1>) { +// CHECK: %[[VAL_29:.*]] = fir.convert %[[VAL_24]] : (i64) -> index +// CHECK: %[[VAL_30:.*]] = hlfir.designate %[[VAL_16]]#0 (%[[VAL_29]]) typeparams %[[VAL_10]] : (!fir.box>>, index, i32) -> !fir.boxchar<1> +// CHECK: fir.result %[[VAL_30]] : !fir.boxchar<1> +// CHECK: } else { +// CHECK: fir.result %[[VAL_20]] : !fir.boxchar<1> +// CHECK: } +// CHECK: hlfir.yield_element %[[VAL_28]] : !fir.boxchar<1> +// CHECK: } +// CHECK: hlfir.assign %[[VAL_21]] to %[[VAL_16]]#0 : !hlfir.expr>, !fir.box>> +// CHECK: hlfir.destroy %[[VAL_21]] : !hlfir.expr> +// CHECK: return +// CHECK: } + +// ! Test contiguous 1D array with statically absent boundary. +// ! CHARACTER with assumed length. +// subroutine eoshift3c(n, array) +// integer :: n +// character(*,1) :: array(n) +// array = EOSHIFT(array, 2) +// end subroutine +func.func @_QPeoshift3c(%arg0: !fir.ref {fir.bindc_name = "n"}, %arg1: !fir.boxchar<1> {fir.bindc_name = "array"}) { + %c2_i32 = arith.constant 2 : i32 + %c0 = arith.constant 0 : index + %0 = fir.dummy_scope : !fir.dscope + %1:2 = hlfir.declare %arg0 dummy_scope %0 {uniq_name = "_QFeoshift3cEn"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) + %2:2 = fir.unboxchar %arg1 : (!fir.boxchar<1>) -> (!fir.ref>, index) + %3 = fir.convert %2#0 : (!fir.ref>) -> !fir.ref>> + %4 = fir.load %1#0 : !fir.ref + %5 = fir.convert %4 : (i32) -> index + %6 = arith.cmpi sgt, %5, %c0 : index + %7 = arith.select %6, %5, %c0 : index + %8 = fir.shape %7 : (index) -> !fir.shape<1> + %9:2 = hlfir.declare %3(%8) typeparams %2#1 dummy_scope %0 {uniq_name = "_QFeoshift3cEarray"} : (!fir.ref>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box>>, !fir.ref>>) + %10 = hlfir.eoshift %9#0 %c2_i32 : (!fir.box>>, i32) 
-> !hlfir.expr> + hlfir.assign %10 to %9#0 : !hlfir.expr>, !fir.box>> + hlfir.destroy %10 : !hlfir.expr> + return +} +// CHECK-LABEL: func.func @_QPeoshift3c( +// CHECK-SAME: %[[ARG0:.*]]: !fir.ref {fir.bindc_name = "n"}, +// CHECK-SAME: %[[ARG1:.*]]: !fir.boxchar<1> {fir.bindc_name = "array"}) { +// CHECK: %[[VAL_0:.*]] = arith.constant 1 : i64 +// CHECK: %[[VAL_1:.*]] = arith.constant 2 : i32 +// CHECK: %[[VAL_2:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_3:.*]] = fir.dummy_scope : !fir.dscope +// CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_3]] {uniq_name = "_QFeoshift3cEn"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) +// CHECK: %[[VAL_5:.*]]:2 = fir.unboxchar %[[ARG1]] : (!fir.boxchar<1>) -> (!fir.ref>, index) +// CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_5]]#0 : (!fir.ref>) -> !fir.ref>> +// CHECK: %[[VAL_7:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref +// CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (i32) -> index +// CHECK: %[[VAL_9:.*]] = arith.cmpi sgt, %[[VAL_8]], %[[VAL_2]] : index +// CHECK: %[[VAL_10:.*]] = arith.select %[[VAL_9]], %[[VAL_8]], %[[VAL_2]] : index +// CHECK: %[[VAL_11:.*]] = fir.shape %[[VAL_10]] : (index) -> !fir.shape<1> +// CHECK: %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_6]](%[[VAL_11]]) typeparams %[[VAL_5]]#1 dummy_scope %[[VAL_3]] {uniq_name = "_QFeoshift3cEarray"} : (!fir.ref>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box>>, !fir.ref>>) +// CHECK: %[[VAL_13:.*]] = fir.convert %[[VAL_10]] : (index) -> i64 +// CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_1]] : (i32) -> i64 +// CHECK: %[[VAL_15:.*]] = fir.alloca !fir.char<1,0> {bindc_name = ".chrtmp"} +// CHECK: %[[VAL_16:.*]] = fir.emboxchar %[[VAL_15]], %[[VAL_2]] : (!fir.ref>, index) -> !fir.boxchar<1> +// CHECK: %[[VAL_17:.*]] = hlfir.elemental %[[VAL_11]] typeparams %[[VAL_5]]#1 unordered : (!fir.shape<1>, index) -> !hlfir.expr> { +// CHECK: ^bb0(%[[VAL_18:.*]]: index): +// CHECK: %[[VAL_19:.*]] = fir.convert %[[VAL_18]] : (index) -> i64 +// 
CHECK: %[[VAL_20:.*]] = arith.addi %[[VAL_19]], %[[VAL_14]] overflow : i64 +// CHECK: %[[VAL_21:.*]] = arith.cmpi sge, %[[VAL_20]], %[[VAL_0]] : i64 +// CHECK: %[[VAL_22:.*]] = arith.cmpi sle, %[[VAL_20]], %[[VAL_13]] : i64 +// CHECK: %[[VAL_23:.*]] = arith.andi %[[VAL_21]], %[[VAL_22]] : i1 +// CHECK: %[[VAL_24:.*]] = fir.if %[[VAL_23]] -> (!fir.boxchar<1>) { +// CHECK: %[[VAL_25:.*]] = fir.convert %[[VAL_20]] : (i64) -> index +// CHECK: %[[VAL_26:.*]] = hlfir.designate %[[VAL_12]]#0 (%[[VAL_25]]) typeparams %[[VAL_5]]#1 : (!fir.box>>, index, index) -> !fir.boxchar<1> +// CHECK: fir.result %[[VAL_26]] : !fir.boxchar<1> +// CHECK: } else { +// CHECK: fir.result %[[VAL_16]] : !fir.boxchar<1> +// CHECK: } +// CHECK: hlfir.yield_element %[[VAL_24]] : !fir.boxchar<1> +// CHECK: } +// CHECK: hlfir.assign %[[VAL_17]] to %[[VAL_12]]#0 : !hlfir.expr>, !fir.box>> +// CHECK: hlfir.destroy %[[VAL_17]] : !hlfir.expr> +// CHECK: return +// CHECK: } + +// ! Test contiguous 1D array with scalar constant boundary. 
+// subroutine eoshift4c(n, array) +// integer :: n +// character(10,1) :: array(n) +// array = EOSHIFT(array, 2, '0123456789') +// end subroutine +func.func @_QPeoshift4c(%arg0: !fir.ref {fir.bindc_name = "n"}, %arg1: !fir.boxchar<1> {fir.bindc_name = "array"}) { + %c2_i32 = arith.constant 2 : i32 + %c0 = arith.constant 0 : index + %c10 = arith.constant 10 : index + %0 = fir.dummy_scope : !fir.dscope + %1:2 = hlfir.declare %arg0 dummy_scope %0 {uniq_name = "_QFeoshift4cEn"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) + %2:2 = fir.unboxchar %arg1 : (!fir.boxchar<1>) -> (!fir.ref>, index) + %3 = fir.convert %2#0 : (!fir.ref>) -> !fir.ref>> + %4 = fir.load %1#0 : !fir.ref + %5 = fir.convert %4 : (i32) -> index + %6 = arith.cmpi sgt, %5, %c0 : index + %7 = arith.select %6, %5, %c0 : index + %8 = fir.shape %7 : (index) -> !fir.shape<1> + %9:2 = hlfir.declare %3(%8) typeparams %c10 dummy_scope %0 {uniq_name = "_QFeoshift4cEarray"} : (!fir.ref>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box>>, !fir.ref>>) + %10 = fir.address_of(@_QQclX30313233343536373839) : !fir.ref> + %11:2 = hlfir.declare %10 typeparams %c10 {fortran_attrs = #fir.var_attrs, uniq_name = "_QQclX30313233343536373839"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) + %12 = hlfir.eoshift %9#0 %c2_i32 boundary %11#0 : (!fir.box>>, i32, !fir.ref>) -> !hlfir.expr> + hlfir.assign %12 to %9#0 : !hlfir.expr>, !fir.box>> + hlfir.destroy %12 : !hlfir.expr> + return +} +// CHECK-LABEL: func.func @_QPeoshift4c( +// CHECK-SAME: %[[ARG0:.*]]: !fir.ref {fir.bindc_name = "n"}, +// CHECK-SAME: %[[ARG1:.*]]: !fir.boxchar<1> {fir.bindc_name = "array"}) { +// CHECK: %[[VAL_0:.*]] = arith.constant 1 : i64 +// CHECK: %[[VAL_1:.*]] = arith.constant 2 : i32 +// CHECK: %[[VAL_2:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_3:.*]] = arith.constant 10 : index +// CHECK: %[[VAL_4:.*]] = fir.dummy_scope : !fir.dscope +// CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_4]] {uniq_name = 
"_QFeoshift4cEn"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) +// CHECK: %[[VAL_6:.*]]:2 = fir.unboxchar %[[ARG1]] : (!fir.boxchar<1>) -> (!fir.ref>, index) +// CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_6]]#0 : (!fir.ref>) -> !fir.ref>> +// CHECK: %[[VAL_8:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref +// CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i32) -> index +// CHECK: %[[VAL_10:.*]] = arith.cmpi sgt, %[[VAL_9]], %[[VAL_2]] : index +// CHECK: %[[VAL_11:.*]] = arith.select %[[VAL_10]], %[[VAL_9]], %[[VAL_2]] : index +// CHECK: %[[VAL_12:.*]] = fir.shape %[[VAL_11]] : (index) -> !fir.shape<1> +// CHECK: %[[VAL_13:.*]]:2 = hlfir.declare %[[VAL_7]](%[[VAL_12]]) typeparams %[[VAL_3]] dummy_scope %[[VAL_4]] {uniq_name = "_QFeoshift4cEarray"} : (!fir.ref>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box>>, !fir.ref>>) +// CHECK: %[[VAL_14:.*]] = fir.address_of(@_QQclX30313233343536373839) : !fir.ref> +// CHECK: %[[VAL_15:.*]]:2 = hlfir.declare %[[VAL_14]] typeparams %[[VAL_3]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QQclX30313233343536373839"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>) +// CHECK: %[[VAL_16:.*]] = fir.convert %[[VAL_11]] : (index) -> i64 +// CHECK: %[[VAL_17:.*]] = fir.convert %[[VAL_1]] : (i32) -> i64 +// CHECK: %[[VAL_18:.*]] = fir.emboxchar %[[VAL_15]]#0, %[[VAL_3]] : (!fir.ref>, index) -> !fir.boxchar<1> +// CHECK: %[[VAL_19:.*]] = hlfir.elemental %[[VAL_12]] typeparams %[[VAL_3]] unordered : (!fir.shape<1>, index) -> !hlfir.expr> { +// CHECK: ^bb0(%[[VAL_20:.*]]: index): +// CHECK: %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (index) -> i64 +// CHECK: %[[VAL_22:.*]] = arith.addi %[[VAL_21]], %[[VAL_17]] overflow : i64 +// CHECK: %[[VAL_23:.*]] = arith.cmpi sge, %[[VAL_22]], %[[VAL_0]] : i64 +// CHECK: %[[VAL_24:.*]] = arith.cmpi sle, %[[VAL_22]], %[[VAL_16]] : i64 +// CHECK: %[[VAL_25:.*]] = arith.andi %[[VAL_23]], %[[VAL_24]] : i1 +// CHECK: %[[VAL_26:.*]] = fir.if %[[VAL_25]] -> (!fir.boxchar<1>) { +// CHECK: %[[VAL_27:.*]] = 
fir.convert %[[VAL_22]] : (i64) -> index +// CHECK: %[[VAL_28:.*]] = hlfir.designate %[[VAL_13]]#0 (%[[VAL_27]]) typeparams %[[VAL_3]] : (!fir.box>>, index, index) -> !fir.ref> +// CHECK: %[[VAL_29:.*]] = fir.emboxchar %[[VAL_28]], %[[VAL_3]] : (!fir.ref>, index) -> !fir.boxchar<1> +// CHECK: fir.result %[[VAL_29]] : !fir.boxchar<1> +// CHECK: } else { +// CHECK: fir.result %[[VAL_18]] : !fir.boxchar<1> +// CHECK: } +// CHECK: hlfir.yield_element %[[VAL_26]] : !fir.boxchar<1> +// CHECK: } +// CHECK: hlfir.assign %[[VAL_19]] to %[[VAL_13]]#0 : !hlfir.expr>, !fir.box>> +// CHECK: hlfir.destroy %[[VAL_19]] : !hlfir.expr> +// CHECK: return +// CHECK: } + +// ! Test contiguous 1D array with scalar always present boundary. +// ! CHARACTER with constant length. +// subroutine eoshift5c(n, array, boundary) +// integer :: n +// character(10,1) :: array(n), boundary +// array = EOSHIFT(array, 2, boundary) +// end subroutine +func.func @_QPeoshift5c(%arg0: !fir.ref {fir.bindc_name = "n"}, %arg1: !fir.boxchar<1> {fir.bindc_name = "array"}, %arg2: !fir.boxchar<1> {fir.bindc_name = "boundary"}) { + %c2_i32 = arith.constant 2 : i32 + %c0 = arith.constant 0 : index + %c10 = arith.constant 10 : index + %0 = fir.dummy_scope : !fir.dscope + %1:2 = hlfir.declare %arg0 dummy_scope %0 {uniq_name = "_QFeoshift5cEn"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) + %2:2 = fir.unboxchar %arg2 : (!fir.boxchar<1>) -> (!fir.ref>, index) + %3 = fir.convert %2#0 : (!fir.ref>) -> !fir.ref> + %4:2 = hlfir.declare %3 typeparams %c10 dummy_scope %0 {uniq_name = "_QFeoshift5cEboundary"} : (!fir.ref>, index, !fir.dscope) -> (!fir.ref>, !fir.ref>) + %5:2 = fir.unboxchar %arg1 : (!fir.boxchar<1>) -> (!fir.ref>, index) + %6 = fir.convert %5#0 : (!fir.ref>) -> !fir.ref>> + %7 = fir.load %1#0 : !fir.ref + %8 = fir.convert %7 : (i32) -> index + %9 = arith.cmpi sgt, %8, %c0 : index + %10 = arith.select %9, %8, %c0 : index + %11 = fir.shape %10 : (index) -> !fir.shape<1> + %12:2 = hlfir.declare %6(%11) 
typeparams %c10 dummy_scope %0 {uniq_name = "_QFeoshift5cEarray"} : (!fir.ref>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box>>, !fir.ref>>) + %13 = hlfir.eoshift %12#0 %c2_i32 boundary %4#0 : (!fir.box>>, i32, !fir.ref>) -> !hlfir.expr> + hlfir.assign %13 to %12#0 : !hlfir.expr>, !fir.box>> + hlfir.destroy %13 : !hlfir.expr> + return +} +// CHECK-LABEL: func.func @_QPeoshift5c( +// CHECK-SAME: %[[ARG0:.*]]: !fir.ref {fir.bindc_name = "n"}, +// CHECK-SAME: %[[ARG1:.*]]: !fir.boxchar<1> {fir.bindc_name = "array"}, +// CHECK-SAME: %[[ARG2:.*]]: !fir.boxchar<1> {fir.bindc_name = "boundary"}) { +// CHECK: %[[VAL_0:.*]] = arith.constant 1 : i64 +// CHECK: %[[VAL_1:.*]] = arith.constant 2 : i32 +// CHECK: %[[VAL_2:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_3:.*]] = arith.constant 10 : index +// CHECK: %[[VAL_4:.*]] = fir.dummy_scope : !fir.dscope +// CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_4]] {uniq_name = "_QFeoshift5cEn"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) +// CHECK: %[[VAL_6:.*]]:2 = fir.unboxchar %[[ARG2]] : (!fir.boxchar<1>) -> (!fir.ref>, index) +// CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_6]]#0 : (!fir.ref>) -> !fir.ref> +// CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] typeparams %[[VAL_3]] dummy_scope %[[VAL_4]] {uniq_name = "_QFeoshift5cEboundary"} : (!fir.ref>, index, !fir.dscope) -> (!fir.ref>, !fir.ref>) +// CHECK: %[[VAL_9:.*]]:2 = fir.unboxchar %[[ARG1]] : (!fir.boxchar<1>) -> (!fir.ref>, index) +// CHECK: %[[VAL_10:.*]] = fir.convert %[[VAL_9]]#0 : (!fir.ref>) -> !fir.ref>> +// CHECK: %[[VAL_11:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref +// CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_11]] : (i32) -> index +// CHECK: %[[VAL_13:.*]] = arith.cmpi sgt, %[[VAL_12]], %[[VAL_2]] : index +// CHECK: %[[VAL_14:.*]] = arith.select %[[VAL_13]], %[[VAL_12]], %[[VAL_2]] : index +// CHECK: %[[VAL_15:.*]] = fir.shape %[[VAL_14]] : (index) -> !fir.shape<1> +// CHECK: %[[VAL_16:.*]]:2 = hlfir.declare 
%[[VAL_10]](%[[VAL_15]]) typeparams %[[VAL_3]] dummy_scope %[[VAL_4]] {uniq_name = "_QFeoshift5cEarray"} : (!fir.ref>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box>>, !fir.ref>>) +// CHECK: %[[VAL_17:.*]] = fir.convert %[[VAL_14]] : (index) -> i64 +// CHECK: %[[VAL_18:.*]] = fir.convert %[[VAL_1]] : (i32) -> i64 +// CHECK: %[[VAL_19:.*]] = fir.emboxchar %[[VAL_8]]#0, %[[VAL_3]] : (!fir.ref>, index) -> !fir.boxchar<1> +// CHECK: %[[VAL_20:.*]] = hlfir.elemental %[[VAL_15]] typeparams %[[VAL_3]] unordered : (!fir.shape<1>, index) -> !hlfir.expr> { +// CHECK: ^bb0(%[[VAL_21:.*]]: index): +// CHECK: %[[VAL_22:.*]] = fir.convert %[[VAL_21]] : (index) -> i64 +// CHECK: %[[VAL_23:.*]] = arith.addi %[[VAL_22]], %[[VAL_18]] overflow : i64 +// CHECK: %[[VAL_24:.*]] = arith.cmpi sge, %[[VAL_23]], %[[VAL_0]] : i64 +// CHECK: %[[VAL_25:.*]] = arith.cmpi sle, %[[VAL_23]], %[[VAL_17]] : i64 +// CHECK: %[[VAL_26:.*]] = arith.andi %[[VAL_24]], %[[VAL_25]] : i1 +// CHECK: %[[VAL_27:.*]] = fir.if %[[VAL_26]] -> (!fir.boxchar<1>) { +// CHECK: %[[VAL_28:.*]] = fir.convert %[[VAL_23]] : (i64) -> index +// CHECK: %[[VAL_29:.*]] = hlfir.designate %[[VAL_16]]#0 (%[[VAL_28]]) typeparams %[[VAL_3]] : (!fir.box>>, index, index) -> !fir.ref> +// CHECK: %[[VAL_30:.*]] = fir.emboxchar %[[VAL_29]], %[[VAL_3]] : (!fir.ref>, index) -> !fir.boxchar<1> +// CHECK: fir.result %[[VAL_30]] : !fir.boxchar<1> +// CHECK: } else { +// CHECK: fir.result %[[VAL_19]] : !fir.boxchar<1> +// CHECK: } +// CHECK: hlfir.yield_element %[[VAL_27]] : !fir.boxchar<1> +// CHECK: } +// CHECK: hlfir.assign %[[VAL_20]] to %[[VAL_16]]#0 : !hlfir.expr>, !fir.box>> +// CHECK: hlfir.destroy %[[VAL_20]] : !hlfir.expr> +// CHECK: return +// CHECK: } + +// ! Test contiguous 1D array with scalar always present boundary. +// ! CHARACTER with variable length. 
+// subroutine eoshift6c(n, array, boundary) +// integer :: n +// character(n,1) :: array(n), boundary +// array = EOSHIFT(array, 2, boundary) +// end subroutine +func.func @_QPeoshift6c(%arg0: !fir.ref {fir.bindc_name = "n"}, %arg1: !fir.boxchar<1> {fir.bindc_name = "array"}, %arg2: !fir.boxchar<1> {fir.bindc_name = "boundary"}) { + %c2_i32 = arith.constant 2 : i32 + %c0 = arith.constant 0 : index + %c0_i32 = arith.constant 0 : i32 + %0 = fir.dummy_scope : !fir.dscope + %1:2 = hlfir.declare %arg0 dummy_scope %0 {uniq_name = "_QFeoshift6cEn"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) + %2:2 = fir.unboxchar %arg1 : (!fir.boxchar<1>) -> (!fir.ref>, index) + %3 = fir.convert %2#0 : (!fir.ref>) -> !fir.ref>> + %4 = fir.load %1#0 : !fir.ref + %5 = arith.cmpi sgt, %4, %c0_i32 : i32 + %6 = arith.select %5, %4, %c0_i32 : i32 + %7 = fir.load %1#0 : !fir.ref + %8 = fir.convert %7 : (i32) -> index + %9 = arith.cmpi sgt, %8, %c0 : index + %10 = arith.select %9, %8, %c0 : index + %11 = fir.shape %10 : (index) -> !fir.shape<1> + %12:2 = hlfir.declare %3(%11) typeparams %6 dummy_scope %0 {uniq_name = "_QFeoshift6cEarray"} : (!fir.ref>>, !fir.shape<1>, i32, !fir.dscope) -> (!fir.box>>, !fir.ref>>) + %13:2 = fir.unboxchar %arg2 : (!fir.boxchar<1>) -> (!fir.ref>, index) + %14 = fir.load %1#0 : !fir.ref + %15 = arith.cmpi sgt, %14, %c0_i32 : i32 + %16 = arith.select %15, %14, %c0_i32 : i32 + %17:2 = hlfir.declare %13#0 typeparams %16 dummy_scope %0 {uniq_name = "_QFeoshift6cEboundary"} : (!fir.ref>, i32, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref>) + %18 = hlfir.eoshift %12#0 %c2_i32 boundary %17#0 : (!fir.box>>, i32, !fir.boxchar<1>) -> !hlfir.expr> + hlfir.assign %18 to %12#0 : !hlfir.expr>, !fir.box>> + hlfir.destroy %18 : !hlfir.expr> + return +} +// CHECK-LABEL: func.func @_QPeoshift6c( +// CHECK-SAME: %[[ARG0:.*]]: !fir.ref {fir.bindc_name = "n"}, +// CHECK-SAME: %[[ARG1:.*]]: !fir.boxchar<1> {fir.bindc_name = "array"}, +// CHECK-SAME: %[[ARG2:.*]]: !fir.boxchar<1> 
{fir.bindc_name = "boundary"}) { +// CHECK: %[[VAL_0:.*]] = arith.constant 1 : i64 +// CHECK: %[[VAL_1:.*]] = arith.constant 2 : i32 +// CHECK: %[[VAL_2:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_3:.*]] = arith.constant 0 : i32 +// CHECK: %[[VAL_4:.*]] = fir.dummy_scope : !fir.dscope +// CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_4]] {uniq_name = "_QFeoshift6cEn"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) +// CHECK: %[[VAL_6:.*]]:2 = fir.unboxchar %[[ARG1]] : (!fir.boxchar<1>) -> (!fir.ref>, index) +// CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_6]]#0 : (!fir.ref>) -> !fir.ref>> +// CHECK: %[[VAL_8:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref +// CHECK: %[[VAL_9:.*]] = arith.cmpi sgt, %[[VAL_8]], %[[VAL_3]] : i32 +// CHECK: %[[VAL_10:.*]] = arith.select %[[VAL_9]], %[[VAL_8]], %[[VAL_3]] : i32 +// CHECK: %[[VAL_11:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref +// CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_11]] : (i32) -> index +// CHECK: %[[VAL_13:.*]] = arith.cmpi sgt, %[[VAL_12]], %[[VAL_2]] : index +// CHECK: %[[VAL_14:.*]] = arith.select %[[VAL_13]], %[[VAL_12]], %[[VAL_2]] : index +// CHECK: %[[VAL_15:.*]] = fir.shape %[[VAL_14]] : (index) -> !fir.shape<1> +// CHECK: %[[VAL_16:.*]]:2 = hlfir.declare %[[VAL_7]](%[[VAL_15]]) typeparams %[[VAL_10]] dummy_scope %[[VAL_4]] {uniq_name = "_QFeoshift6cEarray"} : (!fir.ref>>, !fir.shape<1>, i32, !fir.dscope) -> (!fir.box>>, !fir.ref>>) +// CHECK: %[[VAL_17:.*]]:2 = fir.unboxchar %[[ARG2]] : (!fir.boxchar<1>) -> (!fir.ref>, index) +// CHECK: %[[VAL_18:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref +// CHECK: %[[VAL_19:.*]] = arith.cmpi sgt, %[[VAL_18]], %[[VAL_3]] : i32 +// CHECK: %[[VAL_20:.*]] = arith.select %[[VAL_19]], %[[VAL_18]], %[[VAL_3]] : i32 +// CHECK: %[[VAL_21:.*]]:2 = hlfir.declare %[[VAL_17]]#0 typeparams %[[VAL_20]] dummy_scope %[[VAL_4]] {uniq_name = "_QFeoshift6cEboundary"} : (!fir.ref>, i32, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref>) +// CHECK: %[[VAL_22:.*]] = fir.convert 
%[[VAL_14]] : (index) -> i64 +// CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_1]] : (i32) -> i64 +// CHECK: %[[VAL_24:.*]] = hlfir.elemental %[[VAL_15]] typeparams %[[VAL_10]] unordered : (!fir.shape<1>, i32) -> !hlfir.expr> { +// CHECK: ^bb0(%[[VAL_25:.*]]: index): +// CHECK: %[[VAL_26:.*]] = fir.convert %[[VAL_25]] : (index) -> i64 +// CHECK: %[[VAL_27:.*]] = arith.addi %[[VAL_26]], %[[VAL_23]] overflow : i64 +// CHECK: %[[VAL_28:.*]] = arith.cmpi sge, %[[VAL_27]], %[[VAL_0]] : i64 +// CHECK: %[[VAL_29:.*]] = arith.cmpi sle, %[[VAL_27]], %[[VAL_22]] : i64 +// CHECK: %[[VAL_30:.*]] = arith.andi %[[VAL_28]], %[[VAL_29]] : i1 +// CHECK: %[[VAL_31:.*]] = fir.if %[[VAL_30]] -> (!fir.boxchar<1>) { +// CHECK: %[[VAL_32:.*]] = fir.convert %[[VAL_27]] : (i64) -> index +// CHECK: %[[VAL_33:.*]] = hlfir.designate %[[VAL_16]]#0 (%[[VAL_32]]) typeparams %[[VAL_10]] : (!fir.box>>, index, i32) -> !fir.boxchar<1> +// CHECK: fir.result %[[VAL_33]] : !fir.boxchar<1> +// CHECK: } else { +// CHECK: fir.result %[[VAL_21]]#0 : !fir.boxchar<1> +// CHECK: } +// CHECK: hlfir.yield_element %[[VAL_31]] : !fir.boxchar<1> +// CHECK: } +// CHECK: hlfir.assign %[[VAL_24]] to %[[VAL_16]]#0 : !hlfir.expr>, !fir.box>> +// CHECK: hlfir.destroy %[[VAL_24]] : !hlfir.expr> +// CHECK: return +// CHECK: } + +// ! Test contiguous 1D array with scalar always present boundary. +// ! CHARACTER with assumed length. 
+// subroutine eoshift7c(n, array, boundary) +// integer :: n +// character(*,1) :: array(n), boundary +// array = EOSHIFT(array, 2, boundary) +// end subroutine +func.func @_QPeoshift7c(%arg0: !fir.ref {fir.bindc_name = "n"}, %arg1: !fir.boxchar<1> {fir.bindc_name = "array"}, %arg2: !fir.boxchar<1> {fir.bindc_name = "boundary"}) { + %c2_i32 = arith.constant 2 : i32 + %c0 = arith.constant 0 : index + %0 = fir.dummy_scope : !fir.dscope + %1:2 = hlfir.declare %arg0 dummy_scope %0 {uniq_name = "_QFeoshift7cEn"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) + %2:2 = fir.unboxchar %arg2 : (!fir.boxchar<1>) -> (!fir.ref>, index) + %3:2 = hlfir.declare %2#0 typeparams %2#1 dummy_scope %0 {uniq_name = "_QFeoshift7cEboundary"} : (!fir.ref>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref>) + %4:2 = fir.unboxchar %arg1 : (!fir.boxchar<1>) -> (!fir.ref>, index) + %5 = fir.convert %4#0 : (!fir.ref>) -> !fir.ref>> + %6 = fir.load %1#0 : !fir.ref + %7 = fir.convert %6 : (i32) -> index + %8 = arith.cmpi sgt, %7, %c0 : index + %9 = arith.select %8, %7, %c0 : index + %10 = fir.shape %9 : (index) -> !fir.shape<1> + %11:2 = hlfir.declare %5(%10) typeparams %4#1 dummy_scope %0 {uniq_name = "_QFeoshift7cEarray"} : (!fir.ref>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box>>, !fir.ref>>) + %12 = hlfir.eoshift %11#0 %c2_i32 boundary %3#0 : (!fir.box>>, i32, !fir.boxchar<1>) -> !hlfir.expr> + hlfir.assign %12 to %11#0 : !hlfir.expr>, !fir.box>> + hlfir.destroy %12 : !hlfir.expr> + return +} +// CHECK-LABEL: func.func @_QPeoshift7c( +// CHECK-SAME: %[[ARG0:.*]]: !fir.ref {fir.bindc_name = "n"}, +// CHECK-SAME: %[[ARG1:.*]]: !fir.boxchar<1> {fir.bindc_name = "array"}, +// CHECK-SAME: %[[ARG2:.*]]: !fir.boxchar<1> {fir.bindc_name = "boundary"}) { +// CHECK: %[[VAL_0:.*]] = arith.constant 1 : i64 +// CHECK: %[[VAL_1:.*]] = arith.constant 2 : i32 +// CHECK: %[[VAL_2:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_3:.*]] = fir.dummy_scope : !fir.dscope +// CHECK: %[[VAL_4:.*]]:2 = 
hlfir.declare %[[ARG0]] dummy_scope %[[VAL_3]] {uniq_name = "_QFeoshift7cEn"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) +// CHECK: %[[VAL_5:.*]]:2 = fir.unboxchar %[[ARG2]] : (!fir.boxchar<1>) -> (!fir.ref>, index) +// CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]]#0 typeparams %[[VAL_5]]#1 dummy_scope %[[VAL_3]] {uniq_name = "_QFeoshift7cEboundary"} : (!fir.ref>, index, !fir.dscope) -> (!fir.boxchar<1>, !fir.ref>) +// CHECK: %[[VAL_7:.*]]:2 = fir.unboxchar %[[ARG1]] : (!fir.boxchar<1>) -> (!fir.ref>, index) +// CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_7]]#0 : (!fir.ref>) -> !fir.ref>> +// CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_4]]#0 : !fir.ref +// CHECK: %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (i32) -> index +// CHECK: %[[VAL_11:.*]] = arith.cmpi sgt, %[[VAL_10]], %[[VAL_2]] : index +// CHECK: %[[VAL_12:.*]] = arith.select %[[VAL_11]], %[[VAL_10]], %[[VAL_2]] : index +// CHECK: %[[VAL_13:.*]] = fir.shape %[[VAL_12]] : (index) -> !fir.shape<1> +// CHECK: %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_8]](%[[VAL_13]]) typeparams %[[VAL_7]]#1 dummy_scope %[[VAL_3]] {uniq_name = "_QFeoshift7cEarray"} : (!fir.ref>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box>>, !fir.ref>>) +// CHECK: %[[VAL_15:.*]] = fir.convert %[[VAL_12]] : (index) -> i64 +// CHECK: %[[VAL_16:.*]] = fir.convert %[[VAL_1]] : (i32) -> i64 +// CHECK: %[[VAL_17:.*]] = hlfir.elemental %[[VAL_13]] typeparams %[[VAL_7]]#1 unordered : (!fir.shape<1>, index) -> !hlfir.expr> { +// CHECK: ^bb0(%[[VAL_18:.*]]: index): +// CHECK: %[[VAL_19:.*]] = fir.convert %[[VAL_18]] : (index) -> i64 +// CHECK: %[[VAL_20:.*]] = arith.addi %[[VAL_19]], %[[VAL_16]] overflow : i64 +// CHECK: %[[VAL_21:.*]] = arith.cmpi sge, %[[VAL_20]], %[[VAL_0]] : i64 +// CHECK: %[[VAL_22:.*]] = arith.cmpi sle, %[[VAL_20]], %[[VAL_15]] : i64 +// CHECK: %[[VAL_23:.*]] = arith.andi %[[VAL_21]], %[[VAL_22]] : i1 +// CHECK: %[[VAL_24:.*]] = fir.if %[[VAL_23]] -> (!fir.boxchar<1>) { +// CHECK: %[[VAL_25:.*]] = fir.convert %[[VAL_20]] : 
(i64) -> index +// CHECK: %[[VAL_26:.*]] = hlfir.designate %[[VAL_14]]#0 (%[[VAL_25]]) typeparams %[[VAL_7]]#1 : (!fir.box>>, index, index) -> !fir.boxchar<1> +// CHECK: fir.result %[[VAL_26]] : !fir.boxchar<1> +// CHECK: } else { +// CHECK: fir.result %[[VAL_6]]#0 : !fir.boxchar<1> +// CHECK: } +// CHECK: hlfir.yield_element %[[VAL_24]] : !fir.boxchar<1> +// CHECK: } +// CHECK: hlfir.assign %[[VAL_17]] to %[[VAL_14]]#0 : !hlfir.expr>, !fir.box>> +// CHECK: hlfir.destroy %[[VAL_17]] : !hlfir.expr> +// CHECK: return +// CHECK: } + +// ! Test contiguous 1D array with the scalar optional boundary. +// ! CHARACTER with constant length. +// subroutine eoshift8c(n, array, boundary) +// integer :: n +// character(10,2) :: array(n) +// character(10,2), optional :: boundary +// array = EOSHIFT(array, 2, boundary) +// end subroutine +func.func @_QPeoshift8c(%arg0: !fir.ref {fir.bindc_name = "n"}, %arg1: !fir.boxchar<2> {fir.bindc_name = "array"}, %arg2: !fir.boxchar<2> {fir.bindc_name = "boundary", fir.optional}) { + %c2_i32 = arith.constant 2 : i32 + %c0 = arith.constant 0 : index + %c10 = arith.constant 10 : index + %0 = fir.dummy_scope : !fir.dscope + %1:2 = hlfir.declare %arg0 dummy_scope %0 {uniq_name = "_QFeoshift8cEn"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) + %2:2 = fir.unboxchar %arg2 : (!fir.boxchar<2>) -> (!fir.ref>, index) + %3 = fir.convert %2#0 : (!fir.ref>) -> !fir.ref> + %4:2 = hlfir.declare %3 typeparams %c10 dummy_scope %0 {fortran_attrs = #fir.var_attrs, uniq_name = "_QFeoshift8cEboundary"} : (!fir.ref>, index, !fir.dscope) -> (!fir.ref>, !fir.ref>) + %5:2 = fir.unboxchar %arg1 : (!fir.boxchar<2>) -> (!fir.ref>, index) + %6 = fir.convert %5#0 : (!fir.ref>) -> !fir.ref>> + %7 = fir.load %1#0 : !fir.ref + %8 = fir.convert %7 : (i32) -> index + %9 = arith.cmpi sgt, %8, %c0 : index + %10 = arith.select %9, %8, %c0 : index + %11 = fir.shape %10 : (index) -> !fir.shape<1> + %12:2 = hlfir.declare %6(%11) typeparams %c10 dummy_scope %0 {uniq_name = 
"_QFeoshift8cEarray"} : (!fir.ref>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box>>, !fir.ref>>) + %13 = fir.is_present %4#0 : (!fir.ref>) -> i1 + %14 = fir.embox %4#0 : (!fir.ref>) -> !fir.box> + %15 = fir.absent !fir.box> + %16 = arith.select %13, %14, %15 : !fir.box> + %17 = hlfir.eoshift %12#0 %c2_i32 boundary %16 : (!fir.box>>, i32, !fir.box>) -> !hlfir.expr> + hlfir.assign %17 to %12#0 : !hlfir.expr>, !fir.box>> + hlfir.destroy %17 : !hlfir.expr> + return +} +// CHECK-LABEL: func.func @_QPeoshift8c( +// CHECK-SAME: %[[ARG0:.*]]: !fir.ref {fir.bindc_name = "n"}, +// CHECK-SAME: %[[ARG1:.*]]: !fir.boxchar<2> {fir.bindc_name = "array"}, +// CHECK-SAME: %[[ARG2:.*]]: !fir.boxchar<2> {fir.bindc_name = "boundary", fir.optional}) { +// CHECK: %[[VAL_0:.*]] = arith.constant 1 : i64 +// CHECK: %[[VAL_1:.*]] = arith.constant 2 : i32 +// CHECK: %[[VAL_2:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_3:.*]] = arith.constant 10 : index +// CHECK: %[[VAL_4:.*]] = fir.dummy_scope : !fir.dscope +// CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_4]] {uniq_name = "_QFeoshift8cEn"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) +// CHECK: %[[VAL_6:.*]]:2 = fir.unboxchar %[[ARG2]] : (!fir.boxchar<2>) -> (!fir.ref>, index) +// CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_6]]#0 : (!fir.ref>) -> !fir.ref> +// CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] typeparams %[[VAL_3]] dummy_scope %[[VAL_4]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFeoshift8cEboundary"} : (!fir.ref>, index, !fir.dscope) -> (!fir.ref>, !fir.ref>) +// CHECK: %[[VAL_9:.*]]:2 = fir.unboxchar %[[ARG1]] : (!fir.boxchar<2>) -> (!fir.ref>, index) +// CHECK: %[[VAL_10:.*]] = fir.convert %[[VAL_9]]#0 : (!fir.ref>) -> !fir.ref>> +// CHECK: %[[VAL_11:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref +// CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_11]] : (i32) -> index +// CHECK: %[[VAL_13:.*]] = arith.cmpi sgt, %[[VAL_12]], %[[VAL_2]] : index +// CHECK: %[[VAL_14:.*]] = arith.select 
%[[VAL_13]], %[[VAL_12]], %[[VAL_2]] : index +// CHECK: %[[VAL_15:.*]] = fir.shape %[[VAL_14]] : (index) -> !fir.shape<1> +// CHECK: %[[VAL_16:.*]]:2 = hlfir.declare %[[VAL_10]](%[[VAL_15]]) typeparams %[[VAL_3]] dummy_scope %[[VAL_4]] {uniq_name = "_QFeoshift8cEarray"} : (!fir.ref>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box>>, !fir.ref>>) +// CHECK: %[[VAL_17:.*]] = fir.is_present %[[VAL_8]]#0 : (!fir.ref>) -> i1 +// CHECK: %[[VAL_18:.*]] = fir.embox %[[VAL_8]]#0 : (!fir.ref>) -> !fir.box> +// CHECK: %[[VAL_19:.*]] = fir.absent !fir.box> +// CHECK: %[[VAL_20:.*]] = arith.select %[[VAL_17]], %[[VAL_18]], %[[VAL_19]] : !fir.box> +// CHECK: %[[VAL_21:.*]] = fir.convert %[[VAL_14]] : (index) -> i64 +// CHECK: %[[VAL_22:.*]] = fir.convert %[[VAL_1]] : (i32) -> i64 +// CHECK: %[[VAL_23:.*]] = fir.is_present %[[VAL_20]] : (!fir.box>) -> i1 +// CHECK: %[[VAL_24:.*]] = fir.if %[[VAL_23]] -> (!fir.boxchar<2>) { +// CHECK: %[[VAL_25:.*]] = fir.box_addr %[[VAL_20]] : (!fir.box>) -> !fir.ref> +// CHECK: %[[VAL_26:.*]] = fir.emboxchar %[[VAL_25]], %[[VAL_3]] : (!fir.ref>, index) -> !fir.boxchar<2> +// CHECK: fir.result %[[VAL_26]] : !fir.boxchar<2> +// CHECK: } else { +// CHECK: %[[VAL_27:.*]] = fir.alloca !fir.char<2,0> {bindc_name = ".chrtmp"} +// CHECK: %[[VAL_28:.*]] = fir.emboxchar %[[VAL_27]], %[[VAL_2]] : (!fir.ref>, index) -> !fir.boxchar<2> +// CHECK: fir.result %[[VAL_28]] : !fir.boxchar<2> +// CHECK: } +// CHECK: %[[VAL_29:.*]] = hlfir.elemental %[[VAL_15]] typeparams %[[VAL_3]] unordered : (!fir.shape<1>, index) -> !hlfir.expr> { +// CHECK: ^bb0(%[[VAL_30:.*]]: index): +// CHECK: %[[VAL_31:.*]] = fir.convert %[[VAL_30]] : (index) -> i64 +// CHECK: %[[VAL_32:.*]] = arith.addi %[[VAL_31]], %[[VAL_22]] overflow : i64 +// CHECK: %[[VAL_33:.*]] = arith.cmpi sge, %[[VAL_32]], %[[VAL_0]] : i64 +// CHECK: %[[VAL_34:.*]] = arith.cmpi sle, %[[VAL_32]], %[[VAL_21]] : i64 +// CHECK: %[[VAL_35:.*]] = arith.andi %[[VAL_33]], %[[VAL_34]] : i1 +// CHECK: %[[VAL_36:.*]] = 
fir.if %[[VAL_35]] -> (!fir.boxchar<2>) { +// CHECK: %[[VAL_37:.*]] = fir.convert %[[VAL_32]] : (i64) -> index +// CHECK: %[[VAL_38:.*]] = hlfir.designate %[[VAL_16]]#0 (%[[VAL_37]]) typeparams %[[VAL_3]] : (!fir.box>>, index, index) -> !fir.ref> +// CHECK: %[[VAL_39:.*]] = fir.emboxchar %[[VAL_38]], %[[VAL_3]] : (!fir.ref>, index) -> !fir.boxchar<2> +// CHECK: fir.result %[[VAL_39]] : !fir.boxchar<2> +// CHECK: } else { +// CHECK: fir.result %[[VAL_24]] : !fir.boxchar<2> +// CHECK: } +// CHECK: hlfir.yield_element %[[VAL_36]] : !fir.boxchar<2> +// CHECK: } +// CHECK: hlfir.assign %[[VAL_29]] to %[[VAL_16]]#0 : !hlfir.expr>, !fir.box>> +// CHECK: hlfir.destroy %[[VAL_29]] : !hlfir.expr> +// CHECK: return +// CHECK: } + +// ! Test contiguous 1D array with the scalar optional boundary. +// ! CHARACTER with variable length. +// subroutine eoshift9c(n, array, boundary) +// integer :: n +// character(n,2) :: array(n) +// character(n,2), optional :: boundary +// array = EOSHIFT(array, 2, boundary) +// end subroutine +func.func @_QPeoshift9c(%arg0: !fir.ref {fir.bindc_name = "n"}, %arg1: !fir.boxchar<2> {fir.bindc_name = "array"}, %arg2: !fir.boxchar<2> {fir.bindc_name = "boundary", fir.optional}) { + %c2_i32 = arith.constant 2 : i32 + %c0 = arith.constant 0 : index + %c0_i32 = arith.constant 0 : i32 + %0 = fir.dummy_scope : !fir.dscope + %1:2 = hlfir.declare %arg0 dummy_scope %0 {uniq_name = "_QFeoshift9cEn"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) + %2:2 = fir.unboxchar %arg1 : (!fir.boxchar<2>) -> (!fir.ref>, index) + %3 = fir.convert %2#0 : (!fir.ref>) -> !fir.ref>> + %4 = fir.load %1#0 : !fir.ref + %5 = arith.cmpi sgt, %4, %c0_i32 : i32 + %6 = arith.select %5, %4, %c0_i32 : i32 + %7 = fir.load %1#0 : !fir.ref + %8 = fir.convert %7 : (i32) -> index + %9 = arith.cmpi sgt, %8, %c0 : index + %10 = arith.select %9, %8, %c0 : index + %11 = fir.shape %10 : (index) -> !fir.shape<1> + %12:2 = hlfir.declare %3(%11) typeparams %6 dummy_scope %0 {uniq_name = 
"_QFeoshift9cEarray"} : (!fir.ref>>, !fir.shape<1>, i32, !fir.dscope) -> (!fir.box>>, !fir.ref>>) + %13:2 = fir.unboxchar %arg2 : (!fir.boxchar<2>) -> (!fir.ref>, index) + %14 = fir.load %1#0 : !fir.ref + %15 = arith.cmpi sgt, %14, %c0_i32 : i32 + %16 = arith.select %15, %14, %c0_i32 : i32 + %17:2 = hlfir.declare %13#0 typeparams %16 dummy_scope %0 {fortran_attrs = #fir.var_attrs, uniq_name = "_QFeoshift9cEboundary"} : (!fir.ref>, i32, !fir.dscope) -> (!fir.boxchar<2>, !fir.ref>) + %18 = fir.is_present %17#0 : (!fir.boxchar<2>) -> i1 + %19 = fir.embox %17#1 typeparams %16 : (!fir.ref>, i32) -> !fir.box> + %20 = fir.absent !fir.box> + %21 = arith.select %18, %19, %20 : !fir.box> + %22 = hlfir.eoshift %12#0 %c2_i32 boundary %21 : (!fir.box>>, i32, !fir.box>) -> !hlfir.expr> + hlfir.assign %22 to %12#0 : !hlfir.expr>, !fir.box>> + hlfir.destroy %22 : !hlfir.expr> + return +} +// CHECK-LABEL: func.func @_QPeoshift9c( +// CHECK-SAME: %[[ARG0:.*]]: !fir.ref {fir.bindc_name = "n"}, +// CHECK-SAME: %[[ARG1:.*]]: !fir.boxchar<2> {fir.bindc_name = "array"}, +// CHECK-SAME: %[[ARG2:.*]]: !fir.boxchar<2> {fir.bindc_name = "boundary", fir.optional}) { +// CHECK: %[[VAL_0:.*]] = arith.constant 1 : i64 +// CHECK: %[[VAL_1:.*]] = arith.constant 2 : index +// CHECK: %[[VAL_2:.*]] = arith.constant 2 : i32 +// CHECK: %[[VAL_3:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_4:.*]] = arith.constant 0 : i32 +// CHECK: %[[VAL_5:.*]] = fir.dummy_scope : !fir.dscope +// CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_5]] {uniq_name = "_QFeoshift9cEn"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) +// CHECK: %[[VAL_7:.*]]:2 = fir.unboxchar %[[ARG1]] : (!fir.boxchar<2>) -> (!fir.ref>, index) +// CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_7]]#0 : (!fir.ref>) -> !fir.ref>> +// CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref +// CHECK: %[[VAL_10:.*]] = arith.cmpi sgt, %[[VAL_9]], %[[VAL_4]] : i32 +// CHECK: %[[VAL_11:.*]] = arith.select %[[VAL_10]], %[[VAL_9]], 
%[[VAL_4]] : i32 +// CHECK: %[[VAL_12:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref +// CHECK: %[[VAL_13:.*]] = fir.convert %[[VAL_12]] : (i32) -> index +// CHECK: %[[VAL_14:.*]] = arith.cmpi sgt, %[[VAL_13]], %[[VAL_3]] : index +// CHECK: %[[VAL_15:.*]] = arith.select %[[VAL_14]], %[[VAL_13]], %[[VAL_3]] : index +// CHECK: %[[VAL_16:.*]] = fir.shape %[[VAL_15]] : (index) -> !fir.shape<1> +// CHECK: %[[VAL_17:.*]]:2 = hlfir.declare %[[VAL_8]](%[[VAL_16]]) typeparams %[[VAL_11]] dummy_scope %[[VAL_5]] {uniq_name = "_QFeoshift9cEarray"} : (!fir.ref>>, !fir.shape<1>, i32, !fir.dscope) -> (!fir.box>>, !fir.ref>>) +// CHECK: %[[VAL_18:.*]]:2 = fir.unboxchar %[[ARG2]] : (!fir.boxchar<2>) -> (!fir.ref>, index) +// CHECK: %[[VAL_19:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref +// CHECK: %[[VAL_20:.*]] = arith.cmpi sgt, %[[VAL_19]], %[[VAL_4]] : i32 +// CHECK: %[[VAL_21:.*]] = arith.select %[[VAL_20]], %[[VAL_19]], %[[VAL_4]] : i32 +// CHECK: %[[VAL_22:.*]]:2 = hlfir.declare %[[VAL_18]]#0 typeparams %[[VAL_21]] dummy_scope %[[VAL_5]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFeoshift9cEboundary"} : (!fir.ref>, i32, !fir.dscope) -> (!fir.boxchar<2>, !fir.ref>) +// CHECK: %[[VAL_23:.*]] = fir.is_present %[[VAL_22]]#0 : (!fir.boxchar<2>) -> i1 +// CHECK: %[[VAL_24:.*]] = fir.embox %[[VAL_22]]#1 typeparams %[[VAL_21]] : (!fir.ref>, i32) -> !fir.box> +// CHECK: %[[VAL_25:.*]] = fir.absent !fir.box> +// CHECK: %[[VAL_26:.*]] = arith.select %[[VAL_23]], %[[VAL_24]], %[[VAL_25]] : !fir.box> +// CHECK: %[[VAL_27:.*]] = fir.convert %[[VAL_15]] : (index) -> i64 +// CHECK: %[[VAL_28:.*]] = fir.convert %[[VAL_2]] : (i32) -> i64 +// CHECK: %[[VAL_29:.*]] = fir.is_present %[[VAL_26]] : (!fir.box>) -> i1 +// CHECK: %[[VAL_30:.*]] = fir.if %[[VAL_29]] -> (!fir.boxchar<2>) { +// CHECK: %[[VAL_31:.*]] = fir.box_addr %[[VAL_26]] : (!fir.box>) -> !fir.ref> +// CHECK: %[[VAL_32:.*]] = fir.box_elesize %[[VAL_26]] : (!fir.box>) -> index +// CHECK: %[[VAL_33:.*]] = arith.divsi %[[VAL_32]], %[[VAL_1]] : 
index +// CHECK: %[[VAL_34:.*]] = fir.emboxchar %[[VAL_31]], %[[VAL_33]] : (!fir.ref>, index) -> !fir.boxchar<2> +// CHECK: fir.result %[[VAL_34]] : !fir.boxchar<2> +// CHECK: } else { +// CHECK: %[[VAL_35:.*]] = fir.alloca !fir.char<2,0> {bindc_name = ".chrtmp"} +// CHECK: %[[VAL_36:.*]] = fir.emboxchar %[[VAL_35]], %[[VAL_3]] : (!fir.ref>, index) -> !fir.boxchar<2> +// CHECK: fir.result %[[VAL_36]] : !fir.boxchar<2> +// CHECK: } +// CHECK: %[[VAL_37:.*]] = hlfir.elemental %[[VAL_16]] typeparams %[[VAL_11]] unordered : (!fir.shape<1>, i32) -> !hlfir.expr> { +// CHECK: ^bb0(%[[VAL_38:.*]]: index): +// CHECK: %[[VAL_39:.*]] = fir.convert %[[VAL_38]] : (index) -> i64 +// CHECK: %[[VAL_40:.*]] = arith.addi %[[VAL_39]], %[[VAL_28]] overflow : i64 +// CHECK: %[[VAL_41:.*]] = arith.cmpi sge, %[[VAL_40]], %[[VAL_0]] : i64 +// CHECK: %[[VAL_42:.*]] = arith.cmpi sle, %[[VAL_40]], %[[VAL_27]] : i64 +// CHECK: %[[VAL_43:.*]] = arith.andi %[[VAL_41]], %[[VAL_42]] : i1 +// CHECK: %[[VAL_44:.*]] = fir.if %[[VAL_43]] -> (!fir.boxchar<2>) { +// CHECK: %[[VAL_45:.*]] = fir.convert %[[VAL_40]] : (i64) -> index +// CHECK: %[[VAL_46:.*]] = hlfir.designate %[[VAL_17]]#0 (%[[VAL_45]]) typeparams %[[VAL_11]] : (!fir.box>>, index, i32) -> !fir.boxchar<2> +// CHECK: fir.result %[[VAL_46]] : !fir.boxchar<2> +// CHECK: } else { +// CHECK: fir.result %[[VAL_30]] : !fir.boxchar<2> +// CHECK: } +// CHECK: hlfir.yield_element %[[VAL_44]] : !fir.boxchar<2> +// CHECK: } +// CHECK: hlfir.assign %[[VAL_37]] to %[[VAL_17]]#0 : !hlfir.expr>, !fir.box>> +// CHECK: hlfir.destroy %[[VAL_37]] : !hlfir.expr> +// CHECK: return +// CHECK: } + +// ! Test contiguous 1D array with the scalar optional boundary. +// ! CHARACTER with assumed length. 
+// subroutine eoshift10c(n, array, boundary) +// integer :: n +// character(*,2) :: array(n) +// character(*,2), optional :: boundary +// array = EOSHIFT(array, 2, boundary) +// end subroutine +func.func @_QPeoshift10c(%arg0: !fir.ref {fir.bindc_name = "n"}, %arg1: !fir.boxchar<2> {fir.bindc_name = "array"}, %arg2: !fir.boxchar<2> {fir.bindc_name = "boundary", fir.optional}) { + %c2_i32 = arith.constant 2 : i32 + %c0 = arith.constant 0 : index + %0 = fir.dummy_scope : !fir.dscope + %1:2 = hlfir.declare %arg0 dummy_scope %0 {uniq_name = "_QFeoshift10cEn"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) + %2:2 = fir.unboxchar %arg2 : (!fir.boxchar<2>) -> (!fir.ref>, index) + %3:2 = hlfir.declare %2#0 typeparams %2#1 dummy_scope %0 {fortran_attrs = #fir.var_attrs, uniq_name = "_QFeoshift10cEboundary"} : (!fir.ref>, index, !fir.dscope) -> (!fir.boxchar<2>, !fir.ref>) + %4:2 = fir.unboxchar %arg1 : (!fir.boxchar<2>) -> (!fir.ref>, index) + %5 = fir.convert %4#0 : (!fir.ref>) -> !fir.ref>> + %6 = fir.load %1#0 : !fir.ref + %7 = fir.convert %6 : (i32) -> index + %8 = arith.cmpi sgt, %7, %c0 : index + %9 = arith.select %8, %7, %c0 : index + %10 = fir.shape %9 : (index) -> !fir.shape<1> + %11:2 = hlfir.declare %5(%10) typeparams %4#1 dummy_scope %0 {uniq_name = "_QFeoshift10cEarray"} : (!fir.ref>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box>>, !fir.ref>>) + %12 = fir.is_present %3#0 : (!fir.boxchar<2>) -> i1 + %13 = fir.embox %3#1 typeparams %2#1 : (!fir.ref>, index) -> !fir.box> + %14 = fir.absent !fir.box> + %15 = arith.select %12, %13, %14 : !fir.box> + %16 = hlfir.eoshift %11#0 %c2_i32 boundary %15 : (!fir.box>>, i32, !fir.box>) -> !hlfir.expr> + hlfir.assign %16 to %11#0 : !hlfir.expr>, !fir.box>> + hlfir.destroy %16 : !hlfir.expr> + return +} +// CHECK-LABEL: func.func @_QPeoshift10c( +// CHECK-SAME: %[[ARG0:.*]]: !fir.ref {fir.bindc_name = "n"}, +// CHECK-SAME: %[[ARG1:.*]]: !fir.boxchar<2> {fir.bindc_name = "array"}, +// CHECK-SAME: %[[ARG2:.*]]: 
!fir.boxchar<2> {fir.bindc_name = "boundary", fir.optional}) { +// CHECK: %[[VAL_0:.*]] = arith.constant 1 : i64 +// CHECK: %[[VAL_1:.*]] = arith.constant 2 : index +// CHECK: %[[VAL_2:.*]] = arith.constant 2 : i32 +// CHECK: %[[VAL_3:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_4:.*]] = fir.dummy_scope : !fir.dscope +// CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_4]] {uniq_name = "_QFeoshift10cEn"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) +// CHECK: %[[VAL_6:.*]]:2 = fir.unboxchar %[[ARG2]] : (!fir.boxchar<2>) -> (!fir.ref>, index) +// CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]]#0 typeparams %[[VAL_6]]#1 dummy_scope %[[VAL_4]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFeoshift10cEboundary"} : (!fir.ref>, index, !fir.dscope) -> (!fir.boxchar<2>, !fir.ref>) +// CHECK: %[[VAL_8:.*]]:2 = fir.unboxchar %[[ARG1]] : (!fir.boxchar<2>) -> (!fir.ref>, index) +// CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]]#0 : (!fir.ref>) -> !fir.ref>> +// CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref +// CHECK: %[[VAL_11:.*]] = fir.convert %[[VAL_10]] : (i32) -> index +// CHECK: %[[VAL_12:.*]] = arith.cmpi sgt, %[[VAL_11]], %[[VAL_3]] : index +// CHECK: %[[VAL_13:.*]] = arith.select %[[VAL_12]], %[[VAL_11]], %[[VAL_3]] : index +// CHECK: %[[VAL_14:.*]] = fir.shape %[[VAL_13]] : (index) -> !fir.shape<1> +// CHECK: %[[VAL_15:.*]]:2 = hlfir.declare %[[VAL_9]](%[[VAL_14]]) typeparams %[[VAL_8]]#1 dummy_scope %[[VAL_4]] {uniq_name = "_QFeoshift10cEarray"} : (!fir.ref>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box>>, !fir.ref>>) +// CHECK: %[[VAL_16:.*]] = fir.is_present %[[VAL_7]]#0 : (!fir.boxchar<2>) -> i1 +// CHECK: %[[VAL_17:.*]] = fir.embox %[[VAL_7]]#1 typeparams %[[VAL_6]]#1 : (!fir.ref>, index) -> !fir.box> +// CHECK: %[[VAL_18:.*]] = fir.absent !fir.box> +// CHECK: %[[VAL_19:.*]] = arith.select %[[VAL_16]], %[[VAL_17]], %[[VAL_18]] : !fir.box> +// CHECK: %[[VAL_20:.*]] = fir.convert %[[VAL_13]] : (index) -> i64 +// 
CHECK: %[[VAL_21:.*]] = fir.convert %[[VAL_2]] : (i32) -> i64 +// CHECK: %[[VAL_22:.*]] = fir.is_present %[[VAL_19]] : (!fir.box>) -> i1 +// CHECK: %[[VAL_23:.*]] = fir.if %[[VAL_22]] -> (!fir.boxchar<2>) { +// CHECK: %[[VAL_24:.*]] = fir.box_addr %[[VAL_19]] : (!fir.box>) -> !fir.ref> +// CHECK: %[[VAL_25:.*]] = fir.box_elesize %[[VAL_19]] : (!fir.box>) -> index +// CHECK: %[[VAL_26:.*]] = arith.divsi %[[VAL_25]], %[[VAL_1]] : index +// CHECK: %[[VAL_27:.*]] = fir.emboxchar %[[VAL_24]], %[[VAL_26]] : (!fir.ref>, index) -> !fir.boxchar<2> +// CHECK: fir.result %[[VAL_27]] : !fir.boxchar<2> +// CHECK: } else { +// CHECK: %[[VAL_28:.*]] = fir.alloca !fir.char<2,0> {bindc_name = ".chrtmp"} +// CHECK: %[[VAL_29:.*]] = fir.emboxchar %[[VAL_28]], %[[VAL_3]] : (!fir.ref>, index) -> !fir.boxchar<2> +// CHECK: fir.result %[[VAL_29]] : !fir.boxchar<2> +// CHECK: } +// CHECK: %[[VAL_30:.*]] = hlfir.elemental %[[VAL_14]] typeparams %[[VAL_8]]#1 unordered : (!fir.shape<1>, index) -> !hlfir.expr> { +// CHECK: ^bb0(%[[VAL_31:.*]]: index): +// CHECK: %[[VAL_32:.*]] = fir.convert %[[VAL_31]] : (index) -> i64 +// CHECK: %[[VAL_33:.*]] = arith.addi %[[VAL_32]], %[[VAL_21]] overflow : i64 +// CHECK: %[[VAL_34:.*]] = arith.cmpi sge, %[[VAL_33]], %[[VAL_0]] : i64 +// CHECK: %[[VAL_35:.*]] = arith.cmpi sle, %[[VAL_33]], %[[VAL_20]] : i64 +// CHECK: %[[VAL_36:.*]] = arith.andi %[[VAL_34]], %[[VAL_35]] : i1 +// CHECK: %[[VAL_37:.*]] = fir.if %[[VAL_36]] -> (!fir.boxchar<2>) { +// CHECK: %[[VAL_38:.*]] = fir.convert %[[VAL_33]] : (i64) -> index +// CHECK: %[[VAL_39:.*]] = hlfir.designate %[[VAL_15]]#0 (%[[VAL_38]]) typeparams %[[VAL_8]]#1 : (!fir.box>>, index, index) -> !fir.boxchar<2> +// CHECK: fir.result %[[VAL_39]] : !fir.boxchar<2> +// CHECK: } else { +// CHECK: fir.result %[[VAL_23]] : !fir.boxchar<2> +// CHECK: } +// CHECK: hlfir.yield_element %[[VAL_37]] : !fir.boxchar<2> +// CHECK: } +// CHECK: hlfir.assign %[[VAL_30]] to %[[VAL_15]]#0 : !hlfir.expr>, !fir.box>> +// CHECK: 
hlfir.destroy %[[VAL_30]] : !hlfir.expr> +// CHECK: return +// CHECK: } + +// ! Test contiguous 1D array with the array always present boundary. +// ! CHARACTER with constant length. +// subroutine eoshift11c(n, array, boundary) +// integer :: n +// character(10,4) :: array(n,n), boundary(:) +// array = EOSHIFT(array, 2, boundary) +// end subroutine +func.func @_QPeoshift11c(%arg0: !fir.ref {fir.bindc_name = "n"}, %arg1: !fir.boxchar<4> {fir.bindc_name = "array"}, %arg2: !fir.box>> {fir.bindc_name = "boundary"}) { + %c2_i32 = arith.constant 2 : i32 + %c0 = arith.constant 0 : index + %c10 = arith.constant 10 : index + %0 = fir.dummy_scope : !fir.dscope + %1:2 = hlfir.declare %arg0 dummy_scope %0 {uniq_name = "_QFeoshift11cEn"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) + %2:2 = hlfir.declare %arg2 typeparams %c10 dummy_scope %0 {uniq_name = "_QFeoshift11cEboundary"} : (!fir.box>>, index, !fir.dscope) -> (!fir.box>>, !fir.box>>) + %3:2 = fir.unboxchar %arg1 : (!fir.boxchar<4>) -> (!fir.ref>, index) + %4 = fir.convert %3#0 : (!fir.ref>) -> !fir.ref>> + %5 = fir.load %1#0 : !fir.ref + %6 = fir.convert %5 : (i32) -> index + %7 = arith.cmpi sgt, %6, %c0 : index + %8 = arith.select %7, %6, %c0 : index + %9 = fir.load %1#0 : !fir.ref + %10 = fir.convert %9 : (i32) -> index + %11 = arith.cmpi sgt, %10, %c0 : index + %12 = arith.select %11, %10, %c0 : index + %13 = fir.shape %8, %12 : (index, index) -> !fir.shape<2> + %14:2 = hlfir.declare %4(%13) typeparams %c10 dummy_scope %0 {uniq_name = "_QFeoshift11cEarray"} : (!fir.ref>>, !fir.shape<2>, index, !fir.dscope) -> (!fir.box>>, !fir.ref>>) + %15 = hlfir.eoshift %14#0 %c2_i32 boundary %2#0 : (!fir.box>>, i32, !fir.box>>) -> !hlfir.expr> + hlfir.assign %15 to %14#0 : !hlfir.expr>, !fir.box>> + hlfir.destroy %15 : !hlfir.expr> + return +} +// CHECK-LABEL: func.func @_QPeoshift11c( +// CHECK-SAME: %[[ARG0:.*]]: !fir.ref {fir.bindc_name = "n"}, +// CHECK-SAME: %[[ARG1:.*]]: !fir.boxchar<4> {fir.bindc_name = "array"}, +// 
CHECK-SAME: %[[ARG2:.*]]: !fir.box>> {fir.bindc_name = "boundary"}) { +// CHECK: %[[VAL_0:.*]] = arith.constant 1 : i64 +// CHECK: %[[VAL_1:.*]] = arith.constant 2 : i32 +// CHECK: %[[VAL_2:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_3:.*]] = arith.constant 10 : index +// CHECK: %[[VAL_4:.*]] = fir.dummy_scope : !fir.dscope +// CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_4]] {uniq_name = "_QFeoshift11cEn"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) +// CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[ARG2]] typeparams %[[VAL_3]] dummy_scope %[[VAL_4]] {uniq_name = "_QFeoshift11cEboundary"} : (!fir.box>>, index, !fir.dscope) -> (!fir.box>>, !fir.box>>) +// CHECK: %[[VAL_7:.*]]:2 = fir.unboxchar %[[ARG1]] : (!fir.boxchar<4>) -> (!fir.ref>, index) +// CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_7]]#0 : (!fir.ref>) -> !fir.ref>> +// CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref +// CHECK: %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (i32) -> index +// CHECK: %[[VAL_11:.*]] = arith.cmpi sgt, %[[VAL_10]], %[[VAL_2]] : index +// CHECK: %[[VAL_12:.*]] = arith.select %[[VAL_11]], %[[VAL_10]], %[[VAL_2]] : index +// CHECK: %[[VAL_13:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref +// CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_13]] : (i32) -> index +// CHECK: %[[VAL_15:.*]] = arith.cmpi sgt, %[[VAL_14]], %[[VAL_2]] : index +// CHECK: %[[VAL_16:.*]] = arith.select %[[VAL_15]], %[[VAL_14]], %[[VAL_2]] : index +// CHECK: %[[VAL_17:.*]] = fir.shape %[[VAL_12]], %[[VAL_16]] : (index, index) -> !fir.shape<2> +// CHECK: %[[VAL_18:.*]]:2 = hlfir.declare %[[VAL_8]](%[[VAL_17]]) typeparams %[[VAL_3]] dummy_scope %[[VAL_4]] {uniq_name = "_QFeoshift11cEarray"} : (!fir.ref>>, !fir.shape<2>, index, !fir.dscope) -> (!fir.box>>, !fir.ref>>) +// CHECK: %[[VAL_19:.*]] = fir.convert %[[VAL_12]] : (index) -> i64 +// CHECK: %[[VAL_20:.*]] = fir.convert %[[VAL_1]] : (i32) -> i64 +// CHECK: %[[VAL_21:.*]] = hlfir.elemental %[[VAL_17]] typeparams %[[VAL_3]] unordered : 
(!fir.shape<2>, index) -> !hlfir.expr> { +// CHECK: ^bb0(%[[VAL_22:.*]]: index, %[[VAL_23:.*]]: index): +// CHECK: %[[VAL_24:.*]] = hlfir.designate %[[VAL_6]]#0 (%[[VAL_23]]) typeparams %[[VAL_3]] : (!fir.box>>, index, index) -> !fir.ref> +// CHECK: %[[VAL_25:.*]] = fir.emboxchar %[[VAL_24]], %[[VAL_3]] : (!fir.ref>, index) -> !fir.boxchar<4> +// CHECK: %[[VAL_26:.*]] = fir.convert %[[VAL_22]] : (index) -> i64 +// CHECK: %[[VAL_27:.*]] = arith.addi %[[VAL_26]], %[[VAL_20]] overflow : i64 +// CHECK: %[[VAL_28:.*]] = arith.cmpi sge, %[[VAL_27]], %[[VAL_0]] : i64 +// CHECK: %[[VAL_29:.*]] = arith.cmpi sle, %[[VAL_27]], %[[VAL_19]] : i64 +// CHECK: %[[VAL_30:.*]] = arith.andi %[[VAL_28]], %[[VAL_29]] : i1 +// CHECK: %[[VAL_31:.*]] = fir.if %[[VAL_30]] -> (!fir.boxchar<4>) { +// CHECK: %[[VAL_32:.*]] = fir.convert %[[VAL_27]] : (i64) -> index +// CHECK: %[[VAL_33:.*]] = hlfir.designate %[[VAL_18]]#0 (%[[VAL_32]], %[[VAL_23]]) typeparams %[[VAL_3]] : (!fir.box>>, index, index, index) -> !fir.ref> +// CHECK: %[[VAL_34:.*]] = fir.emboxchar %[[VAL_33]], %[[VAL_3]] : (!fir.ref>, index) -> !fir.boxchar<4> +// CHECK: fir.result %[[VAL_34]] : !fir.boxchar<4> +// CHECK: } else { +// CHECK: fir.result %[[VAL_25]] : !fir.boxchar<4> +// CHECK: } +// CHECK: hlfir.yield_element %[[VAL_31]] : !fir.boxchar<4> +// CHECK: } +// CHECK: hlfir.assign %[[VAL_21]] to %[[VAL_18]]#0 : !hlfir.expr>, !fir.box>> +// CHECK: hlfir.destroy %[[VAL_21]] : !hlfir.expr> +// CHECK: return +// CHECK: } + +// ! Test contiguous 1D array with the array always present boundary. +// ! CHARACTER with variable length. 
+// subroutine eoshift12c(n, array, boundary) +// integer :: n +// character(n,4) :: array(n,n), boundary(:) +// array = EOSHIFT(array, 2, boundary) +// end subroutine +func.func @_QPeoshift12c(%arg0: !fir.ref {fir.bindc_name = "n"}, %arg1: !fir.boxchar<4> {fir.bindc_name = "array"}, %arg2: !fir.box>> {fir.bindc_name = "boundary"}) { + %c2_i32 = arith.constant 2 : i32 + %c0 = arith.constant 0 : index + %c0_i32 = arith.constant 0 : i32 + %0 = fir.dummy_scope : !fir.dscope + %1:2 = hlfir.declare %arg0 dummy_scope %0 {uniq_name = "_QFeoshift12cEn"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) + %2:2 = fir.unboxchar %arg1 : (!fir.boxchar<4>) -> (!fir.ref>, index) + %3 = fir.convert %2#0 : (!fir.ref>) -> !fir.ref>> + %4 = fir.load %1#0 : !fir.ref + %5 = arith.cmpi sgt, %4, %c0_i32 : i32 + %6 = arith.select %5, %4, %c0_i32 : i32 + %7 = fir.load %1#0 : !fir.ref + %8 = fir.convert %7 : (i32) -> index + %9 = arith.cmpi sgt, %8, %c0 : index + %10 = arith.select %9, %8, %c0 : index + %11 = fir.load %1#0 : !fir.ref + %12 = fir.convert %11 : (i32) -> index + %13 = arith.cmpi sgt, %12, %c0 : index + %14 = arith.select %13, %12, %c0 : index + %15 = fir.shape %10, %14 : (index, index) -> !fir.shape<2> + %16:2 = hlfir.declare %3(%15) typeparams %6 dummy_scope %0 {uniq_name = "_QFeoshift12cEarray"} : (!fir.ref>>, !fir.shape<2>, i32, !fir.dscope) -> (!fir.box>>, !fir.ref>>) + %17 = fir.load %1#0 : !fir.ref + %18 = arith.cmpi sgt, %17, %c0_i32 : i32 + %19 = arith.select %18, %17, %c0_i32 : i32 + %20:2 = hlfir.declare %arg2 typeparams %19 dummy_scope %0 {uniq_name = "_QFeoshift12cEboundary"} : (!fir.box>>, i32, !fir.dscope) -> (!fir.box>>, !fir.box>>) + %21 = hlfir.eoshift %16#0 %c2_i32 boundary %20#0 : (!fir.box>>, i32, !fir.box>>) -> !hlfir.expr> + hlfir.assign %21 to %16#0 : !hlfir.expr>, !fir.box>> + hlfir.destroy %21 : !hlfir.expr> + return +} +// CHECK-LABEL: func.func @_QPeoshift12c( +// CHECK-SAME: %[[ARG0:.*]]: !fir.ref {fir.bindc_name = "n"}, +// CHECK-SAME: 
%[[ARG1:.*]]: !fir.boxchar<4> {fir.bindc_name = "array"}, +// CHECK-SAME: %[[ARG2:.*]]: !fir.box>> {fir.bindc_name = "boundary"}) { +// CHECK: %[[VAL_0:.*]] = arith.constant 1 : i64 +// CHECK: %[[VAL_1:.*]] = arith.constant 2 : i32 +// CHECK: %[[VAL_2:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_3:.*]] = arith.constant 0 : i32 +// CHECK: %[[VAL_4:.*]] = fir.dummy_scope : !fir.dscope +// CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_4]] {uniq_name = "_QFeoshift12cEn"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) +// CHECK: %[[VAL_6:.*]]:2 = fir.unboxchar %[[ARG1]] : (!fir.boxchar<4>) -> (!fir.ref>, index) +// CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_6]]#0 : (!fir.ref>) -> !fir.ref>> +// CHECK: %[[VAL_8:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref +// CHECK: %[[VAL_9:.*]] = arith.cmpi sgt, %[[VAL_8]], %[[VAL_3]] : i32 +// CHECK: %[[VAL_10:.*]] = arith.select %[[VAL_9]], %[[VAL_8]], %[[VAL_3]] : i32 +// CHECK: %[[VAL_11:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref +// CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_11]] : (i32) -> index +// CHECK: %[[VAL_13:.*]] = arith.cmpi sgt, %[[VAL_12]], %[[VAL_2]] : index +// CHECK: %[[VAL_14:.*]] = arith.select %[[VAL_13]], %[[VAL_12]], %[[VAL_2]] : index +// CHECK: %[[VAL_15:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref +// CHECK: %[[VAL_16:.*]] = fir.convert %[[VAL_15]] : (i32) -> index +// CHECK: %[[VAL_17:.*]] = arith.cmpi sgt, %[[VAL_16]], %[[VAL_2]] : index +// CHECK: %[[VAL_18:.*]] = arith.select %[[VAL_17]], %[[VAL_16]], %[[VAL_2]] : index +// CHECK: %[[VAL_19:.*]] = fir.shape %[[VAL_14]], %[[VAL_18]] : (index, index) -> !fir.shape<2> +// CHECK: %[[VAL_20:.*]]:2 = hlfir.declare %[[VAL_7]](%[[VAL_19]]) typeparams %[[VAL_10]] dummy_scope %[[VAL_4]] {uniq_name = "_QFeoshift12cEarray"} : (!fir.ref>>, !fir.shape<2>, i32, !fir.dscope) -> (!fir.box>>, !fir.ref>>) +// CHECK: %[[VAL_21:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref +// CHECK: %[[VAL_22:.*]] = arith.cmpi sgt, %[[VAL_21]], %[[VAL_3]] : i32 +// CHECK: 
%[[VAL_23:.*]] = arith.select %[[VAL_22]], %[[VAL_21]], %[[VAL_3]] : i32 +// CHECK: %[[VAL_24:.*]]:2 = hlfir.declare %[[ARG2]] typeparams %[[VAL_23]] dummy_scope %[[VAL_4]] {uniq_name = "_QFeoshift12cEboundary"} : (!fir.box>>, i32, !fir.dscope) -> (!fir.box>>, !fir.box>>) +// CHECK: %[[VAL_25:.*]] = fir.convert %[[VAL_14]] : (index) -> i64 +// CHECK: %[[VAL_26:.*]] = fir.convert %[[VAL_1]] : (i32) -> i64 +// CHECK: %[[VAL_27:.*]] = hlfir.elemental %[[VAL_19]] typeparams %[[VAL_10]] unordered : (!fir.shape<2>, i32) -> !hlfir.expr> { +// CHECK: ^bb0(%[[VAL_28:.*]]: index, %[[VAL_29:.*]]: index): +// CHECK: %[[VAL_30:.*]] = hlfir.designate %[[VAL_24]]#0 (%[[VAL_29]]) typeparams %[[VAL_23]] : (!fir.box>>, index, i32) -> !fir.boxchar<4> +// CHECK: %[[VAL_31:.*]] = fir.convert %[[VAL_28]] : (index) -> i64 +// CHECK: %[[VAL_32:.*]] = arith.addi %[[VAL_31]], %[[VAL_26]] overflow : i64 +// CHECK: %[[VAL_33:.*]] = arith.cmpi sge, %[[VAL_32]], %[[VAL_0]] : i64 +// CHECK: %[[VAL_34:.*]] = arith.cmpi sle, %[[VAL_32]], %[[VAL_25]] : i64 +// CHECK: %[[VAL_35:.*]] = arith.andi %[[VAL_33]], %[[VAL_34]] : i1 +// CHECK: %[[VAL_36:.*]] = fir.if %[[VAL_35]] -> (!fir.boxchar<4>) { +// CHECK: %[[VAL_37:.*]] = fir.convert %[[VAL_32]] : (i64) -> index +// CHECK: %[[VAL_38:.*]] = hlfir.designate %[[VAL_20]]#0 (%[[VAL_37]], %[[VAL_29]]) typeparams %[[VAL_10]] : (!fir.box>>, index, index, i32) -> !fir.boxchar<4> +// CHECK: fir.result %[[VAL_38]] : !fir.boxchar<4> +// CHECK: } else { +// CHECK: fir.result %[[VAL_30]] : !fir.boxchar<4> +// CHECK: } +// CHECK: hlfir.yield_element %[[VAL_36]] : !fir.boxchar<4> +// CHECK: } +// CHECK: hlfir.assign %[[VAL_27]] to %[[VAL_20]]#0 : !hlfir.expr>, !fir.box>> +// CHECK: hlfir.destroy %[[VAL_27]] : !hlfir.expr> +// CHECK: return +// CHECK: } + +// ! Test contiguous 1D array with the array always present boundary. +// ! CHARACTER with assumed length. 
+// subroutine eoshift13c(n, array, boundary) +// integer :: n +// character(*,4) :: array(n,n), boundary(:) +// array = EOSHIFT(array, 2, boundary) +// end subroutine +func.func @_QPeoshift13c(%arg0: !fir.ref {fir.bindc_name = "n"}, %arg1: !fir.boxchar<4> {fir.bindc_name = "array"}, %arg2: !fir.box>> {fir.bindc_name = "boundary"}) { + %c2_i32 = arith.constant 2 : i32 + %c0 = arith.constant 0 : index + %0 = fir.dummy_scope : !fir.dscope + %1:2 = hlfir.declare %arg0 dummy_scope %0 {uniq_name = "_QFeoshift13cEn"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) + %2:2 = hlfir.declare %arg2 dummy_scope %0 {uniq_name = "_QFeoshift13cEboundary"} : (!fir.box>>, !fir.dscope) -> (!fir.box>>, !fir.box>>) + %3:2 = fir.unboxchar %arg1 : (!fir.boxchar<4>) -> (!fir.ref>, index) + %4 = fir.convert %3#0 : (!fir.ref>) -> !fir.ref>> + %5 = fir.load %1#0 : !fir.ref + %6 = fir.convert %5 : (i32) -> index + %7 = arith.cmpi sgt, %6, %c0 : index + %8 = arith.select %7, %6, %c0 : index + %9 = fir.load %1#0 : !fir.ref + %10 = fir.convert %9 : (i32) -> index + %11 = arith.cmpi sgt, %10, %c0 : index + %12 = arith.select %11, %10, %c0 : index + %13 = fir.shape %8, %12 : (index, index) -> !fir.shape<2> + %14:2 = hlfir.declare %4(%13) typeparams %3#1 dummy_scope %0 {uniq_name = "_QFeoshift13cEarray"} : (!fir.ref>>, !fir.shape<2>, index, !fir.dscope) -> (!fir.box>>, !fir.ref>>) + %15 = hlfir.eoshift %14#0 %c2_i32 boundary %2#0 : (!fir.box>>, i32, !fir.box>>) -> !hlfir.expr> + hlfir.assign %15 to %14#0 : !hlfir.expr>, !fir.box>> + hlfir.destroy %15 : !hlfir.expr> + return +} +// CHECK-LABEL: func.func @_QPeoshift13c( +// CHECK-SAME: %[[ARG0:.*]]: !fir.ref {fir.bindc_name = "n"}, +// CHECK-SAME: %[[ARG1:.*]]: !fir.boxchar<4> {fir.bindc_name = "array"}, +// CHECK-SAME: %[[ARG2:.*]]: !fir.box>> {fir.bindc_name = "boundary"}) { +// CHECK: %[[VAL_0:.*]] = arith.constant 1 : i64 +// CHECK: %[[VAL_1:.*]] = arith.constant 4 : index +// CHECK: %[[VAL_2:.*]] = arith.constant 2 : i32 +// CHECK: 
%[[VAL_3:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_4:.*]] = fir.dummy_scope : !fir.dscope +// CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_4]] {uniq_name = "_QFeoshift13cEn"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) +// CHECK: %[[VAL_6:.*]]:2 = hlfir.declare %[[ARG2]] dummy_scope %[[VAL_4]] {uniq_name = "_QFeoshift13cEboundary"} : (!fir.box>>, !fir.dscope) -> (!fir.box>>, !fir.box>>) +// CHECK: %[[VAL_7:.*]]:2 = fir.unboxchar %[[ARG1]] : (!fir.boxchar<4>) -> (!fir.ref>, index) +// CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_7]]#0 : (!fir.ref>) -> !fir.ref>> +// CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref +// CHECK: %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (i32) -> index +// CHECK: %[[VAL_11:.*]] = arith.cmpi sgt, %[[VAL_10]], %[[VAL_3]] : index +// CHECK: %[[VAL_12:.*]] = arith.select %[[VAL_11]], %[[VAL_10]], %[[VAL_3]] : index +// CHECK: %[[VAL_13:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref +// CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_13]] : (i32) -> index +// CHECK: %[[VAL_15:.*]] = arith.cmpi sgt, %[[VAL_14]], %[[VAL_3]] : index +// CHECK: %[[VAL_16:.*]] = arith.select %[[VAL_15]], %[[VAL_14]], %[[VAL_3]] : index +// CHECK: %[[VAL_17:.*]] = fir.shape %[[VAL_12]], %[[VAL_16]] : (index, index) -> !fir.shape<2> +// CHECK: %[[VAL_18:.*]]:2 = hlfir.declare %[[VAL_8]](%[[VAL_17]]) typeparams %[[VAL_7]]#1 dummy_scope %[[VAL_4]] {uniq_name = "_QFeoshift13cEarray"} : (!fir.ref>>, !fir.shape<2>, index, !fir.dscope) -> (!fir.box>>, !fir.ref>>) +// CHECK: %[[VAL_19:.*]] = fir.convert %[[VAL_12]] : (index) -> i64 +// CHECK: %[[VAL_20:.*]] = fir.convert %[[VAL_2]] : (i32) -> i64 +// CHECK: %[[VAL_21:.*]] = hlfir.elemental %[[VAL_17]] typeparams %[[VAL_7]]#1 unordered : (!fir.shape<2>, index) -> !hlfir.expr> { +// CHECK: ^bb0(%[[VAL_22:.*]]: index, %[[VAL_23:.*]]: index): +// CHECK: %[[VAL_24:.*]] = fir.box_elesize %[[VAL_6]]#1 : (!fir.box>>) -> index +// CHECK: %[[VAL_25:.*]] = arith.divsi %[[VAL_24]], %[[VAL_1]] : index +// 
CHECK: %[[VAL_26:.*]] = hlfir.designate %[[VAL_6]]#0 (%[[VAL_23]]) typeparams %[[VAL_25]] : (!fir.box>>, index, index) -> !fir.boxchar<4> +// CHECK: %[[VAL_27:.*]] = fir.convert %[[VAL_22]] : (index) -> i64 +// CHECK: %[[VAL_28:.*]] = arith.addi %[[VAL_27]], %[[VAL_20]] overflow : i64 +// CHECK: %[[VAL_29:.*]] = arith.cmpi sge, %[[VAL_28]], %[[VAL_0]] : i64 +// CHECK: %[[VAL_30:.*]] = arith.cmpi sle, %[[VAL_28]], %[[VAL_19]] : i64 +// CHECK: %[[VAL_31:.*]] = arith.andi %[[VAL_29]], %[[VAL_30]] : i1 +// CHECK: %[[VAL_32:.*]] = fir.if %[[VAL_31]] -> (!fir.boxchar<4>) { +// CHECK: %[[VAL_33:.*]] = fir.convert %[[VAL_28]] : (i64) -> index +// CHECK: %[[VAL_34:.*]] = hlfir.designate %[[VAL_18]]#0 (%[[VAL_33]], %[[VAL_23]]) typeparams %[[VAL_7]]#1 : (!fir.box>>, index, index, index) -> !fir.boxchar<4> +// CHECK: fir.result %[[VAL_34]] : !fir.boxchar<4> +// CHECK: } else { +// CHECK: fir.result %[[VAL_26]] : !fir.boxchar<4> +// CHECK: } +// CHECK: hlfir.yield_element %[[VAL_32]] : !fir.boxchar<4> +// CHECK: } +// CHECK: hlfir.assign %[[VAL_21]] to %[[VAL_18]]#0 : !hlfir.expr>, !fir.box>> +// CHECK: hlfir.destroy %[[VAL_21]] : !hlfir.expr> +// CHECK: return +// CHECK: } + +// ! Test contiguous 1D array with the array optional boundary. +// ! CHARACTER with constant length. 
+// subroutine eoshift14c(n, array, boundary) +// integer :: n +// character(10,1) :: array(n,n) +// character(10,1), optional :: boundary(n) +// array = EOSHIFT(array, 2, boundary) +// end subroutine +func.func @_QPeoshift14c(%arg0: !fir.ref {fir.bindc_name = "n"}, %arg1: !fir.boxchar<1> {fir.bindc_name = "array"}, %arg2: !fir.boxchar<1> {fir.bindc_name = "boundary", fir.optional}) { + %c2_i32 = arith.constant 2 : i32 + %c0 = arith.constant 0 : index + %c10 = arith.constant 10 : index + %0 = fir.dummy_scope : !fir.dscope + %1:2 = hlfir.declare %arg0 dummy_scope %0 {uniq_name = "_QFeoshift14cEn"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) + %2:2 = fir.unboxchar %arg1 : (!fir.boxchar<1>) -> (!fir.ref>, index) + %3 = fir.convert %2#0 : (!fir.ref>) -> !fir.ref>> + %4 = fir.load %1#0 : !fir.ref + %5 = fir.convert %4 : (i32) -> index + %6 = arith.cmpi sgt, %5, %c0 : index + %7 = arith.select %6, %5, %c0 : index + %8 = fir.load %1#0 : !fir.ref + %9 = fir.convert %8 : (i32) -> index + %10 = arith.cmpi sgt, %9, %c0 : index + %11 = arith.select %10, %9, %c0 : index + %12 = fir.shape %7, %11 : (index, index) -> !fir.shape<2> + %13:2 = hlfir.declare %3(%12) typeparams %c10 dummy_scope %0 {uniq_name = "_QFeoshift14cEarray"} : (!fir.ref>>, !fir.shape<2>, index, !fir.dscope) -> (!fir.box>>, !fir.ref>>) + %14:2 = fir.unboxchar %arg2 : (!fir.boxchar<1>) -> (!fir.ref>, index) + %15 = fir.convert %14#0 : (!fir.ref>) -> !fir.ref>> + %16 = fir.load %1#0 : !fir.ref + %17 = fir.convert %16 : (i32) -> index + %18 = arith.cmpi sgt, %17, %c0 : index + %19 = arith.select %18, %17, %c0 : index + %20 = fir.shape %19 : (index) -> !fir.shape<1> + %21:2 = hlfir.declare %15(%20) typeparams %c10 dummy_scope %0 {fortran_attrs = #fir.var_attrs, uniq_name = "_QFeoshift14cEboundary"} : (!fir.ref>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box>>, !fir.ref>>) + %22 = fir.is_present %21#0 : (!fir.box>>) -> i1 + %23 = fir.shape %19 : (index) -> !fir.shape<1> + %24 = fir.embox %21#1(%23) : 
(!fir.ref>>, !fir.shape<1>) -> !fir.box>> + %25 = fir.absent !fir.box>> + %26 = arith.select %22, %24, %25 : !fir.box>> + %27 = hlfir.eoshift %13#0 %c2_i32 boundary %26 : (!fir.box>>, i32, !fir.box>>) -> !hlfir.expr> + hlfir.assign %27 to %13#0 : !hlfir.expr>, !fir.box>> + hlfir.destroy %27 : !hlfir.expr> + return +} +// CHECK-LABEL: func.func @_QPeoshift14c( +// CHECK-SAME: %[[ARG0:.*]]: !fir.ref {fir.bindc_name = "n"}, +// CHECK-SAME: %[[ARG1:.*]]: !fir.boxchar<1> {fir.bindc_name = "array"}, +// CHECK-SAME: %[[ARG2:.*]]: !fir.boxchar<1> {fir.bindc_name = "boundary", fir.optional}) { +// CHECK: %[[VAL_0:.*]] = arith.constant 1 : i64 +// CHECK: %[[VAL_1:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_2:.*]] = arith.constant false +// CHECK: %[[VAL_3:.*]] = arith.constant true +// CHECK: %[[VAL_4:.*]] = arith.constant 2 : i32 +// CHECK: %[[VAL_5:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_6:.*]] = arith.constant 10 : index +// CHECK: %[[VAL_7:.*]] = fir.dummy_scope : !fir.dscope +// CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_7]] {uniq_name = "_QFeoshift14cEn"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) +// CHECK: %[[VAL_9:.*]]:2 = fir.unboxchar %[[ARG1]] : (!fir.boxchar<1>) -> (!fir.ref>, index) +// CHECK: %[[VAL_10:.*]] = fir.convert %[[VAL_9]]#0 : (!fir.ref>) -> !fir.ref>> +// CHECK: %[[VAL_11:.*]] = fir.load %[[VAL_8]]#0 : !fir.ref +// CHECK: %[[VAL_12:.*]] = fir.convert %[[VAL_11]] : (i32) -> index +// CHECK: %[[VAL_13:.*]] = arith.cmpi sgt, %[[VAL_12]], %[[VAL_5]] : index +// CHECK: %[[VAL_14:.*]] = arith.select %[[VAL_13]], %[[VAL_12]], %[[VAL_5]] : index +// CHECK: %[[VAL_15:.*]] = fir.load %[[VAL_8]]#0 : !fir.ref +// CHECK: %[[VAL_16:.*]] = fir.convert %[[VAL_15]] : (i32) -> index +// CHECK: %[[VAL_17:.*]] = arith.cmpi sgt, %[[VAL_16]], %[[VAL_5]] : index +// CHECK: %[[VAL_18:.*]] = arith.select %[[VAL_17]], %[[VAL_16]], %[[VAL_5]] : index +// CHECK: %[[VAL_19:.*]] = fir.shape %[[VAL_14]], %[[VAL_18]] : (index, index) 
-> !fir.shape<2> +// CHECK: %[[VAL_20:.*]]:2 = hlfir.declare %[[VAL_10]](%[[VAL_19]]) typeparams %[[VAL_6]] dummy_scope %[[VAL_7]] {uniq_name = "_QFeoshift14cEarray"} : (!fir.ref>>, !fir.shape<2>, index, !fir.dscope) -> (!fir.box>>, !fir.ref>>) +// CHECK: %[[VAL_21:.*]]:2 = fir.unboxchar %[[ARG2]] : (!fir.boxchar<1>) -> (!fir.ref>, index) +// CHECK: %[[VAL_22:.*]] = fir.convert %[[VAL_21]]#0 : (!fir.ref>) -> !fir.ref>> +// CHECK: %[[VAL_23:.*]] = fir.load %[[VAL_8]]#0 : !fir.ref +// CHECK: %[[VAL_24:.*]] = fir.convert %[[VAL_23]] : (i32) -> index +// CHECK: %[[VAL_25:.*]] = arith.cmpi sgt, %[[VAL_24]], %[[VAL_5]] : index +// CHECK: %[[VAL_26:.*]] = arith.select %[[VAL_25]], %[[VAL_24]], %[[VAL_5]] : index +// CHECK: %[[VAL_27:.*]] = fir.shape %[[VAL_26]] : (index) -> !fir.shape<1> +// CHECK: %[[VAL_28:.*]]:2 = hlfir.declare %[[VAL_22]](%[[VAL_27]]) typeparams %[[VAL_6]] dummy_scope %[[VAL_7]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFeoshift14cEboundary"} : (!fir.ref>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box>>, !fir.ref>>) +// CHECK: %[[VAL_29:.*]] = fir.is_present %[[VAL_28]]#0 : (!fir.box>>) -> i1 +// CHECK: %[[VAL_30:.*]] = fir.shape %[[VAL_26]] : (index) -> !fir.shape<1> +// CHECK: %[[VAL_31:.*]] = fir.embox %[[VAL_28]]#1(%[[VAL_30]]) : (!fir.ref>>, !fir.shape<1>) -> !fir.box>> +// CHECK: %[[VAL_32:.*]] = fir.absent !fir.box>> +// CHECK: %[[VAL_33:.*]] = arith.select %[[VAL_29]], %[[VAL_31]], %[[VAL_32]] : !fir.box>> +// CHECK: %[[VAL_34:.*]] = fir.convert %[[VAL_14]] : (index) -> i64 +// CHECK: %[[VAL_35:.*]] = fir.convert %[[VAL_4]] : (i32) -> i64 +// CHECK: %[[VAL_36:.*]] = fir.alloca !fir.char<1,0> {bindc_name = ".chrtmp"} +// CHECK: %[[VAL_37:.*]] = fir.emboxchar %[[VAL_36]], %[[VAL_5]] : (!fir.ref>, index) -> !fir.boxchar<1> +// CHECK: %[[VAL_38:.*]] = fir.is_present %[[VAL_33]] : (!fir.box>>) -> i1 +// CHECK: %[[VAL_39:.*]] = arith.select %[[VAL_38]], %[[VAL_2]], %[[VAL_3]] : i1 +// CHECK: %[[VAL_40:.*]] = hlfir.elemental %[[VAL_19]] 
typeparams %[[VAL_6]] unordered : (!fir.shape<2>, index) -> !hlfir.expr> { +// CHECK: ^bb0(%[[VAL_41:.*]]: index, %[[VAL_42:.*]]: index): +// CHECK: %[[VAL_43:.*]] = fir.if %[[VAL_39]] -> (!fir.boxchar<1>) { +// CHECK: fir.result %[[VAL_37]] : !fir.boxchar<1> +// CHECK: } else { +// CHECK: %[[VAL_44:.*]]:3 = fir.box_dims %[[VAL_33]], %[[VAL_5]] : (!fir.box>>, index) -> (index, index, index) +// CHECK: %[[VAL_45:.*]] = arith.subi %[[VAL_44]]#0, %[[VAL_1]] overflow : index +// CHECK: %[[VAL_46:.*]] = arith.addi %[[VAL_42]], %[[VAL_45]] overflow : index +// CHECK: %[[VAL_47:.*]] = hlfir.designate %[[VAL_33]] (%[[VAL_46]]) typeparams %[[VAL_6]] : (!fir.box>>, index, index) -> !fir.ref> +// CHECK: %[[VAL_48:.*]] = fir.emboxchar %[[VAL_47]], %[[VAL_6]] : (!fir.ref>, index) -> !fir.boxchar<1> +// CHECK: fir.result %[[VAL_48]] : !fir.boxchar<1> +// CHECK: } +// CHECK: %[[VAL_49:.*]] = fir.convert %[[VAL_41]] : (index) -> i64 +// CHECK: %[[VAL_50:.*]] = arith.addi %[[VAL_49]], %[[VAL_35]] overflow : i64 +// CHECK: %[[VAL_51:.*]] = arith.cmpi sge, %[[VAL_50]], %[[VAL_0]] : i64 +// CHECK: %[[VAL_52:.*]] = arith.cmpi sle, %[[VAL_50]], %[[VAL_34]] : i64 +// CHECK: %[[VAL_53:.*]] = arith.andi %[[VAL_51]], %[[VAL_52]] : i1 +// CHECK: %[[VAL_54:.*]] = fir.if %[[VAL_53]] -> (!fir.boxchar<1>) { +// CHECK: %[[VAL_55:.*]] = fir.convert %[[VAL_50]] : (i64) -> index +// CHECK: %[[VAL_56:.*]] = hlfir.designate %[[VAL_20]]#0 (%[[VAL_55]], %[[VAL_42]]) typeparams %[[VAL_6]] : (!fir.box>>, index, index, index) -> !fir.ref> +// CHECK: %[[VAL_57:.*]] = fir.emboxchar %[[VAL_56]], %[[VAL_6]] : (!fir.ref>, index) -> !fir.boxchar<1> +// CHECK: fir.result %[[VAL_57]] : !fir.boxchar<1> +// CHECK: } else { +// CHECK: fir.result %[[VAL_43]] : !fir.boxchar<1> +// CHECK: } +// CHECK: hlfir.yield_element %[[VAL_54]] : !fir.boxchar<1> +// CHECK: } +// CHECK: hlfir.assign %[[VAL_40]] to %[[VAL_20]]#0 : !hlfir.expr>, !fir.box>> +// CHECK: hlfir.destroy %[[VAL_40]] : !hlfir.expr> +// CHECK: return +// 
CHECK: } + +// ! Test contiguous 1D array with the array optional boundary. +// ! CHARACTER with variable length. +// subroutine eoshift15c(n, array, boundary) +// integer :: n +// character(n,1) :: array(n,n) +// character(n,1), optional :: boundary(n) +// array = EOSHIFT(array, 2, boundary) +// end subroutine +func.func @_QPeoshift15c(%arg0: !fir.ref {fir.bindc_name = "n"}, %arg1: !fir.boxchar<1> {fir.bindc_name = "array"}, %arg2: !fir.boxchar<1> {fir.bindc_name = "boundary", fir.optional}) { + %c2_i32 = arith.constant 2 : i32 + %c0 = arith.constant 0 : index + %c0_i32 = arith.constant 0 : i32 + %0 = fir.dummy_scope : !fir.dscope + %1:2 = hlfir.declare %arg0 dummy_scope %0 {uniq_name = "_QFeoshift15cEn"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) + %2:2 = fir.unboxchar %arg1 : (!fir.boxchar<1>) -> (!fir.ref>, index) + %3 = fir.convert %2#0 : (!fir.ref>) -> !fir.ref>> + %4 = fir.load %1#0 : !fir.ref + %5 = arith.cmpi sgt, %4, %c0_i32 : i32 + %6 = arith.select %5, %4, %c0_i32 : i32 + %7 = fir.load %1#0 : !fir.ref + %8 = fir.convert %7 : (i32) -> index + %9 = arith.cmpi sgt, %8, %c0 : index + %10 = arith.select %9, %8, %c0 : index + %11 = fir.load %1#0 : !fir.ref + %12 = fir.convert %11 : (i32) -> index + %13 = arith.cmpi sgt, %12, %c0 : index + %14 = arith.select %13, %12, %c0 : index + %15 = fir.shape %10, %14 : (index, index) -> !fir.shape<2> + %16:2 = hlfir.declare %3(%15) typeparams %6 dummy_scope %0 {uniq_name = "_QFeoshift15cEarray"} : (!fir.ref>>, !fir.shape<2>, i32, !fir.dscope) -> (!fir.box>>, !fir.ref>>) + %17:2 = fir.unboxchar %arg2 : (!fir.boxchar<1>) -> (!fir.ref>, index) + %18 = fir.convert %17#0 : (!fir.ref>) -> !fir.ref>> + %19 = fir.load %1#0 : !fir.ref + %20 = arith.cmpi sgt, %19, %c0_i32 : i32 + %21 = arith.select %20, %19, %c0_i32 : i32 + %22 = fir.load %1#0 : !fir.ref + %23 = fir.convert %22 : (i32) -> index + %24 = arith.cmpi sgt, %23, %c0 : index + %25 = arith.select %24, %23, %c0 : index + %26 = fir.shape %25 : (index) -> 
!fir.shape<1> + %27:2 = hlfir.declare %18(%26) typeparams %21 dummy_scope %0 {fortran_attrs = #fir.var_attrs, uniq_name = "_QFeoshift15cEboundary"} : (!fir.ref>>, !fir.shape<1>, i32, !fir.dscope) -> (!fir.box>>, !fir.ref>>) + %28 = fir.is_present %27#0 : (!fir.box>>) -> i1 + %29 = fir.shape %25 : (index) -> !fir.shape<1> + %30 = fir.embox %27#1(%29) typeparams %21 : (!fir.ref>>, !fir.shape<1>, i32) -> !fir.box>> + %31 = fir.absent !fir.box>> + %32 = arith.select %28, %30, %31 : !fir.box>> + %33 = hlfir.eoshift %16#0 %c2_i32 boundary %32 : (!fir.box>>, i32, !fir.box>>) -> !hlfir.expr> + hlfir.assign %33 to %16#0 : !hlfir.expr>, !fir.box>> + hlfir.destroy %33 : !hlfir.expr> + return +} +// CHECK-LABEL: func.func @_QPeoshift15c( +// CHECK-SAME: %[[ARG0:.*]]: !fir.ref {fir.bindc_name = "n"}, +// CHECK-SAME: %[[ARG1:.*]]: !fir.boxchar<1> {fir.bindc_name = "array"}, +// CHECK-SAME: %[[ARG2:.*]]: !fir.boxchar<1> {fir.bindc_name = "boundary", fir.optional}) { +// CHECK: %[[VAL_0:.*]] = arith.constant 1 : i64 +// CHECK: %[[VAL_1:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_2:.*]] = arith.constant false +// CHECK: %[[VAL_3:.*]] = arith.constant true +// CHECK: %[[VAL_4:.*]] = arith.constant 2 : i32 +// CHECK: %[[VAL_5:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_6:.*]] = arith.constant 0 : i32 +// CHECK: %[[VAL_7:.*]] = fir.dummy_scope : !fir.dscope +// CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_7]] {uniq_name = "_QFeoshift15cEn"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) +// CHECK: %[[VAL_9:.*]]:2 = fir.unboxchar %[[ARG1]] : (!fir.boxchar<1>) -> (!fir.ref>, index) +// CHECK: %[[VAL_10:.*]] = fir.convert %[[VAL_9]]#0 : (!fir.ref>) -> !fir.ref>> +// CHECK: %[[VAL_11:.*]] = fir.load %[[VAL_8]]#0 : !fir.ref +// CHECK: %[[VAL_12:.*]] = arith.cmpi sgt, %[[VAL_11]], %[[VAL_6]] : i32 +// CHECK: %[[VAL_13:.*]] = arith.select %[[VAL_12]], %[[VAL_11]], %[[VAL_6]] : i32 +// CHECK: %[[VAL_14:.*]] = fir.load %[[VAL_8]]#0 : !fir.ref +// CHECK: 
%[[VAL_15:.*]] = fir.convert %[[VAL_14]] : (i32) -> index +// CHECK: %[[VAL_16:.*]] = arith.cmpi sgt, %[[VAL_15]], %[[VAL_5]] : index +// CHECK: %[[VAL_17:.*]] = arith.select %[[VAL_16]], %[[VAL_15]], %[[VAL_5]] : index +// CHECK: %[[VAL_18:.*]] = fir.load %[[VAL_8]]#0 : !fir.ref +// CHECK: %[[VAL_19:.*]] = fir.convert %[[VAL_18]] : (i32) -> index +// CHECK: %[[VAL_20:.*]] = arith.cmpi sgt, %[[VAL_19]], %[[VAL_5]] : index +// CHECK: %[[VAL_21:.*]] = arith.select %[[VAL_20]], %[[VAL_19]], %[[VAL_5]] : index +// CHECK: %[[VAL_22:.*]] = fir.shape %[[VAL_17]], %[[VAL_21]] : (index, index) -> !fir.shape<2> +// CHECK: %[[VAL_23:.*]]:2 = hlfir.declare %[[VAL_10]](%[[VAL_22]]) typeparams %[[VAL_13]] dummy_scope %[[VAL_7]] {uniq_name = "_QFeoshift15cEarray"} : (!fir.ref>>, !fir.shape<2>, i32, !fir.dscope) -> (!fir.box>>, !fir.ref>>) +// CHECK: %[[VAL_24:.*]]:2 = fir.unboxchar %[[ARG2]] : (!fir.boxchar<1>) -> (!fir.ref>, index) +// CHECK: %[[VAL_25:.*]] = fir.convert %[[VAL_24]]#0 : (!fir.ref>) -> !fir.ref>> +// CHECK: %[[VAL_26:.*]] = fir.load %[[VAL_8]]#0 : !fir.ref +// CHECK: %[[VAL_27:.*]] = arith.cmpi sgt, %[[VAL_26]], %[[VAL_6]] : i32 +// CHECK: %[[VAL_28:.*]] = arith.select %[[VAL_27]], %[[VAL_26]], %[[VAL_6]] : i32 +// CHECK: %[[VAL_29:.*]] = fir.load %[[VAL_8]]#0 : !fir.ref +// CHECK: %[[VAL_30:.*]] = fir.convert %[[VAL_29]] : (i32) -> index +// CHECK: %[[VAL_31:.*]] = arith.cmpi sgt, %[[VAL_30]], %[[VAL_5]] : index +// CHECK: %[[VAL_32:.*]] = arith.select %[[VAL_31]], %[[VAL_30]], %[[VAL_5]] : index +// CHECK: %[[VAL_33:.*]] = fir.shape %[[VAL_32]] : (index) -> !fir.shape<1> +// CHECK: %[[VAL_34:.*]]:2 = hlfir.declare %[[VAL_25]](%[[VAL_33]]) typeparams %[[VAL_28]] dummy_scope %[[VAL_7]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFeoshift15cEboundary"} : (!fir.ref>>, !fir.shape<1>, i32, !fir.dscope) -> (!fir.box>>, !fir.ref>>) +// CHECK: %[[VAL_35:.*]] = fir.is_present %[[VAL_34]]#0 : (!fir.box>>) -> i1 +// CHECK: %[[VAL_36:.*]] = fir.shape %[[VAL_32]] : 
(index) -> !fir.shape<1> +// CHECK: %[[VAL_37:.*]] = fir.embox %[[VAL_34]]#1(%[[VAL_36]]) typeparams %[[VAL_28]] : (!fir.ref>>, !fir.shape<1>, i32) -> !fir.box>> +// CHECK: %[[VAL_38:.*]] = fir.absent !fir.box>> +// CHECK: %[[VAL_39:.*]] = arith.select %[[VAL_35]], %[[VAL_37]], %[[VAL_38]] : !fir.box>> +// CHECK: %[[VAL_40:.*]] = fir.convert %[[VAL_17]] : (index) -> i64 +// CHECK: %[[VAL_41:.*]] = fir.convert %[[VAL_4]] : (i32) -> i64 +// CHECK: %[[VAL_42:.*]] = fir.alloca !fir.char<1,0> {bindc_name = ".chrtmp"} +// CHECK: %[[VAL_43:.*]] = fir.emboxchar %[[VAL_42]], %[[VAL_5]] : (!fir.ref>, index) -> !fir.boxchar<1> +// CHECK: %[[VAL_44:.*]] = fir.is_present %[[VAL_39]] : (!fir.box>>) -> i1 +// CHECK: %[[VAL_45:.*]] = arith.select %[[VAL_44]], %[[VAL_2]], %[[VAL_3]] : i1 +// CHECK: %[[VAL_46:.*]] = hlfir.elemental %[[VAL_22]] typeparams %[[VAL_13]] unordered : (!fir.shape<2>, i32) -> !hlfir.expr> { +// CHECK: ^bb0(%[[VAL_47:.*]]: index, %[[VAL_48:.*]]: index): +// CHECK: %[[VAL_49:.*]] = fir.if %[[VAL_45]] -> (!fir.boxchar<1>) { +// CHECK: fir.result %[[VAL_43]] : !fir.boxchar<1> +// CHECK: } else { +// CHECK: %[[VAL_50:.*]] = fir.box_elesize %[[VAL_39]] : (!fir.box>>) -> index +// CHECK: %[[VAL_51:.*]]:3 = fir.box_dims %[[VAL_39]], %[[VAL_5]] : (!fir.box>>, index) -> (index, index, index) +// CHECK: %[[VAL_52:.*]] = arith.subi %[[VAL_51]]#0, %[[VAL_1]] overflow : index +// CHECK: %[[VAL_53:.*]] = arith.addi %[[VAL_48]], %[[VAL_52]] overflow : index +// CHECK: %[[VAL_54:.*]] = hlfir.designate %[[VAL_39]] (%[[VAL_53]]) typeparams %[[VAL_50]] : (!fir.box>>, index, index) -> !fir.boxchar<1> +// CHECK: fir.result %[[VAL_54]] : !fir.boxchar<1> +// CHECK: } +// CHECK: %[[VAL_55:.*]] = fir.convert %[[VAL_47]] : (index) -> i64 +// CHECK: %[[VAL_56:.*]] = arith.addi %[[VAL_55]], %[[VAL_41]] overflow : i64 +// CHECK: %[[VAL_57:.*]] = arith.cmpi sge, %[[VAL_56]], %[[VAL_0]] : i64 +// CHECK: %[[VAL_58:.*]] = arith.cmpi sle, %[[VAL_56]], %[[VAL_40]] : i64 +// CHECK: 
%[[VAL_59:.*]] = arith.andi %[[VAL_57]], %[[VAL_58]] : i1 +// CHECK: %[[VAL_60:.*]] = fir.if %[[VAL_59]] -> (!fir.boxchar<1>) { +// CHECK: %[[VAL_61:.*]] = fir.convert %[[VAL_56]] : (i64) -> index +// CHECK: %[[VAL_62:.*]] = hlfir.designate %[[VAL_23]]#0 (%[[VAL_61]], %[[VAL_48]]) typeparams %[[VAL_13]] : (!fir.box>>, index, index, i32) -> !fir.boxchar<1> +// CHECK: fir.result %[[VAL_62]] : !fir.boxchar<1> +// CHECK: } else { +// CHECK: fir.result %[[VAL_49]] : !fir.boxchar<1> +// CHECK: } +// CHECK: hlfir.yield_element %[[VAL_60]] : !fir.boxchar<1> +// CHECK: } +// CHECK: hlfir.assign %[[VAL_46]] to %[[VAL_23]]#0 : !hlfir.expr>, !fir.box>> +// CHECK: hlfir.destroy %[[VAL_46]] : !hlfir.expr> +// CHECK: return +// CHECK: } + +// ! Test contiguous 1D array with the array optional boundary. +// ! CHARACTER with assumed length. +// subroutine eoshift16c(n, array, boundary) +// integer :: n +// character(*,1) :: array(n,n) +// character(*,1), optional :: boundary(n) +// array = EOSHIFT(array, 2, boundary) +// end subroutine +func.func @_QPeoshift16c(%arg0: !fir.ref {fir.bindc_name = "n"}, %arg1: !fir.boxchar<1> {fir.bindc_name = "array"}, %arg2: !fir.boxchar<1> {fir.bindc_name = "boundary", fir.optional}) { + %c2_i32 = arith.constant 2 : i32 + %c0 = arith.constant 0 : index + %0 = fir.dummy_scope : !fir.dscope + %1:2 = hlfir.declare %arg0 dummy_scope %0 {uniq_name = "_QFeoshift16cEn"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) + %2:2 = fir.unboxchar %arg1 : (!fir.boxchar<1>) -> (!fir.ref>, index) + %3 = fir.convert %2#0 : (!fir.ref>) -> !fir.ref>> + %4 = fir.load %1#0 : !fir.ref + %5 = fir.convert %4 : (i32) -> index + %6 = arith.cmpi sgt, %5, %c0 : index + %7 = arith.select %6, %5, %c0 : index + %8 = fir.load %1#0 : !fir.ref + %9 = fir.convert %8 : (i32) -> index + %10 = arith.cmpi sgt, %9, %c0 : index + %11 = arith.select %10, %9, %c0 : index + %12 = fir.shape %7, %11 : (index, index) -> !fir.shape<2> + %13:2 = hlfir.declare %3(%12) typeparams %2#1 dummy_scope 
%0 {uniq_name = "_QFeoshift16cEarray"} : (!fir.ref>>, !fir.shape<2>, index, !fir.dscope) -> (!fir.box>>, !fir.ref>>) + %14:2 = fir.unboxchar %arg2 : (!fir.boxchar<1>) -> (!fir.ref>, index) + %15 = fir.convert %14#0 : (!fir.ref>) -> !fir.ref>> + %16 = fir.load %1#0 : !fir.ref + %17 = fir.convert %16 : (i32) -> index + %18 = arith.cmpi sgt, %17, %c0 : index + %19 = arith.select %18, %17, %c0 : index + %20 = fir.shape %19 : (index) -> !fir.shape<1> + %21:2 = hlfir.declare %15(%20) typeparams %14#1 dummy_scope %0 {fortran_attrs = #fir.var_attrs, uniq_name = "_QFeoshift16cEboundary"} : (!fir.ref>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box>>, !fir.ref>>) + %22 = fir.is_present %21#0 : (!fir.box>>) -> i1 + %23 = fir.shape %19 : (index) -> !fir.shape<1> + %24 = fir.embox %21#1(%23) typeparams %14#1 : (!fir.ref>>, !fir.shape<1>, index) -> !fir.box>> + %25 = fir.absent !fir.box>> + %26 = arith.select %22, %24, %25 : !fir.box>> + %27 = hlfir.eoshift %13#0 %c2_i32 boundary %26 : (!fir.box>>, i32, !fir.box>>) -> !hlfir.expr> + hlfir.assign %27 to %13#0 : !hlfir.expr>, !fir.box>> + hlfir.destroy %27 : !hlfir.expr> + return +} +// CHECK-LABEL: func.func @_QPeoshift16c( +// CHECK-SAME: %[[ARG0:.*]]: !fir.ref {fir.bindc_name = "n"}, +// CHECK-SAME: %[[ARG1:.*]]: !fir.boxchar<1> {fir.bindc_name = "array"}, +// CHECK-SAME: %[[ARG2:.*]]: !fir.boxchar<1> {fir.bindc_name = "boundary", fir.optional}) { +// CHECK: %[[VAL_0:.*]] = arith.constant 1 : i64 +// CHECK: %[[VAL_1:.*]] = arith.constant 1 : index +// CHECK: %[[VAL_2:.*]] = arith.constant false +// CHECK: %[[VAL_3:.*]] = arith.constant true +// CHECK: %[[VAL_4:.*]] = arith.constant 2 : i32 +// CHECK: %[[VAL_5:.*]] = arith.constant 0 : index +// CHECK: %[[VAL_6:.*]] = fir.dummy_scope : !fir.dscope +// CHECK: %[[VAL_7:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[VAL_6]] {uniq_name = "_QFeoshift16cEn"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) +// CHECK: %[[VAL_8:.*]]:2 = fir.unboxchar %[[ARG1]] : (!fir.boxchar<1>) -> 
(!fir.ref>, index) +// CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]]#0 : (!fir.ref>) -> !fir.ref>> +// CHECK: %[[VAL_10:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref +// CHECK: %[[VAL_11:.*]] = fir.convert %[[VAL_10]] : (i32) -> index +// CHECK: %[[VAL_12:.*]] = arith.cmpi sgt, %[[VAL_11]], %[[VAL_5]] : index +// CHECK: %[[VAL_13:.*]] = arith.select %[[VAL_12]], %[[VAL_11]], %[[VAL_5]] : index +// CHECK: %[[VAL_14:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref +// CHECK: %[[VAL_15:.*]] = fir.convert %[[VAL_14]] : (i32) -> index +// CHECK: %[[VAL_16:.*]] = arith.cmpi sgt, %[[VAL_15]], %[[VAL_5]] : index +// CHECK: %[[VAL_17:.*]] = arith.select %[[VAL_16]], %[[VAL_15]], %[[VAL_5]] : index +// CHECK: %[[VAL_18:.*]] = fir.shape %[[VAL_13]], %[[VAL_17]] : (index, index) -> !fir.shape<2> +// CHECK: %[[VAL_19:.*]]:2 = hlfir.declare %[[VAL_9]](%[[VAL_18]]) typeparams %[[VAL_8]]#1 dummy_scope %[[VAL_6]] {uniq_name = "_QFeoshift16cEarray"} : (!fir.ref>>, !fir.shape<2>, index, !fir.dscope) -> (!fir.box>>, !fir.ref>>) +// CHECK: %[[VAL_20:.*]]:2 = fir.unboxchar %[[ARG2]] : (!fir.boxchar<1>) -> (!fir.ref>, index) +// CHECK: %[[VAL_21:.*]] = fir.convert %[[VAL_20]]#0 : (!fir.ref>) -> !fir.ref>> +// CHECK: %[[VAL_22:.*]] = fir.load %[[VAL_7]]#0 : !fir.ref +// CHECK: %[[VAL_23:.*]] = fir.convert %[[VAL_22]] : (i32) -> index +// CHECK: %[[VAL_24:.*]] = arith.cmpi sgt, %[[VAL_23]], %[[VAL_5]] : index +// CHECK: %[[VAL_25:.*]] = arith.select %[[VAL_24]], %[[VAL_23]], %[[VAL_5]] : index +// CHECK: %[[VAL_26:.*]] = fir.shape %[[VAL_25]] : (index) -> !fir.shape<1> +// CHECK: %[[VAL_27:.*]]:2 = hlfir.declare %[[VAL_21]](%[[VAL_26]]) typeparams %[[VAL_20]]#1 dummy_scope %[[VAL_6]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFeoshift16cEboundary"} : (!fir.ref>>, !fir.shape<1>, index, !fir.dscope) -> (!fir.box>>, !fir.ref>>) +// CHECK: %[[VAL_28:.*]] = fir.is_present %[[VAL_27]]#0 : (!fir.box>>) -> i1 +// CHECK: %[[VAL_29:.*]] = fir.shape %[[VAL_25]] : (index) -> !fir.shape<1> +// CHECK: 
%[[VAL_30:.*]] = fir.embox %[[VAL_27]]#1(%[[VAL_29]]) typeparams %[[VAL_20]]#1 : (!fir.ref>>, !fir.shape<1>, index) -> !fir.box>> +// CHECK: %[[VAL_31:.*]] = fir.absent !fir.box>> +// CHECK: %[[VAL_32:.*]] = arith.select %[[VAL_28]], %[[VAL_30]], %[[VAL_31]] : !fir.box>> +// CHECK: %[[VAL_33:.*]] = fir.convert %[[VAL_13]] : (index) -> i64 +// CHECK: %[[VAL_34:.*]] = fir.convert %[[VAL_4]] : (i32) -> i64 +// CHECK: %[[VAL_35:.*]] = fir.alloca !fir.char<1,0> {bindc_name = ".chrtmp"} +// CHECK: %[[VAL_36:.*]] = fir.emboxchar %[[VAL_35]], %[[VAL_5]] : (!fir.ref>, index) -> !fir.boxchar<1> +// CHECK: %[[VAL_37:.*]] = fir.is_present %[[VAL_32]] : (!fir.box>>) -> i1 +// CHECK: %[[VAL_38:.*]] = arith.select %[[VAL_37]], %[[VAL_2]], %[[VAL_3]] : i1 +// CHECK: %[[VAL_39:.*]] = hlfir.elemental %[[VAL_18]] typeparams %[[VAL_8]]#1 unordered : (!fir.shape<2>, index) -> !hlfir.expr> { +// CHECK: ^bb0(%[[VAL_40:.*]]: index, %[[VAL_41:.*]]: index): +// CHECK: %[[VAL_42:.*]] = fir.if %[[VAL_38]] -> (!fir.boxchar<1>) { +// CHECK: fir.result %[[VAL_36]] : !fir.boxchar<1> +// CHECK: } else { +// CHECK: %[[VAL_43:.*]] = fir.box_elesize %[[VAL_32]] : (!fir.box>>) -> index +// CHECK: %[[VAL_44:.*]]:3 = fir.box_dims %[[VAL_32]], %[[VAL_5]] : (!fir.box>>, index) -> (index, index, index) +// CHECK: %[[VAL_45:.*]] = arith.subi %[[VAL_44]]#0, %[[VAL_1]] overflow : index +// CHECK: %[[VAL_46:.*]] = arith.addi %[[VAL_41]], %[[VAL_45]] overflow : index +// CHECK: %[[VAL_47:.*]] = hlfir.designate %[[VAL_32]] (%[[VAL_46]]) typeparams %[[VAL_43]] : (!fir.box>>, index, index) -> !fir.boxchar<1> +// CHECK: fir.result %[[VAL_47]] : !fir.boxchar<1> +// CHECK: } +// CHECK: %[[VAL_48:.*]] = fir.convert %[[VAL_40]] : (index) -> i64 +// CHECK: %[[VAL_49:.*]] = arith.addi %[[VAL_48]], %[[VAL_34]] overflow : i64 +// CHECK: %[[VAL_50:.*]] = arith.cmpi sge, %[[VAL_49]], %[[VAL_0]] : i64 +// CHECK: %[[VAL_51:.*]] = arith.cmpi sle, %[[VAL_49]], %[[VAL_33]] : i64 +// CHECK: %[[VAL_52:.*]] = arith.andi %[[VAL_50]], 
%[[VAL_51]] : i1 +// CHECK: %[[VAL_53:.*]] = fir.if %[[VAL_52]] -> (!fir.boxchar<1>) { +// CHECK: %[[VAL_54:.*]] = fir.convert %[[VAL_49]] : (i64) -> index +// CHECK: %[[VAL_55:.*]] = hlfir.designate %[[VAL_19]]#0 (%[[VAL_54]], %[[VAL_41]]) typeparams %[[VAL_8]]#1 : (!fir.box>>, index, index, index) -> !fir.boxchar<1> +// CHECK: fir.result %[[VAL_55]] : !fir.boxchar<1> +// CHECK: } else { +// CHECK: fir.result %[[VAL_42]] : !fir.boxchar<1> +// CHECK: } +// CHECK: hlfir.yield_element %[[VAL_53]] : !fir.boxchar<1> +// CHECK: } +// CHECK: hlfir.assign %[[VAL_39]] to %[[VAL_19]]#0 : !hlfir.expr>, !fir.box>> +// CHECK: hlfir.destroy %[[VAL_39]] : !hlfir.expr> +// CHECK: return +// CHECK: } + +// ! TODO: ARRAY or/and BOUNDARY are expressions of CHARACTER type. +// ! Test contiguous 1D array with the array expression boundary. +// ! CHARACTER with constant length. +// subroutine eoshift17c(n, array) +// interface +// function charc_boundary(n) +// integer :: n +// character(10,1) :: charc_boundary(n) +// end function +// end interface +// integer :: n +// character(10,1) :: array(n,n) +// array = EOSHIFT(array//array, 2, charc_boundary(n)) +// end subroutine +func.func @_QPeoshift17c(%arg0: !fir.ref {fir.bindc_name = "n"}, %arg1: !fir.boxchar<1> {fir.bindc_name = "array"}) { + %c20 = arith.constant 20 : index + %c2_i32 = arith.constant 2 : i32 + %c0 = arith.constant 0 : index + %c10 = arith.constant 10 : index + %0 = fir.dummy_scope : !fir.dscope + %1:2 = hlfir.declare %arg0 dummy_scope %0 {uniq_name = "_QFeoshift17cEn"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) + %2:2 = fir.unboxchar %arg1 : (!fir.boxchar<1>) -> (!fir.ref>, index) + %3 = fir.convert %2#0 : (!fir.ref>) -> !fir.ref>> + %4 = fir.load %1#0 : !fir.ref + %5 = fir.convert %4 : (i32) -> index + %6 = arith.cmpi sgt, %5, %c0 : index + %7 = arith.select %6, %5, %c0 : index + %8 = fir.load %1#0 : !fir.ref + %9 = fir.convert %8 : (i32) -> index + %10 = arith.cmpi sgt, %9, %c0 : index + %11 = arith.select %10, 
%9, %c0 : index + %12 = fir.shape %7, %11 : (index, index) -> !fir.shape<2> + %13:2 = hlfir.declare %3(%12) typeparams %c10 dummy_scope %0 {uniq_name = "_QFeoshift17cEarray"} : (!fir.ref>>, !fir.shape<2>, index, !fir.dscope) -> (!fir.box>>, !fir.ref>>) + %14 = hlfir.elemental %12 typeparams %c20 unordered : (!fir.shape<2>, index) -> !hlfir.expr> { + ^bb0(%arg2: index, %arg3: index): + %23 = hlfir.designate %13#0 (%arg2, %arg3) typeparams %c10 : (!fir.box>>, index, index, index) -> !fir.ref> + %24 = hlfir.designate %13#0 (%arg2, %arg3) typeparams %c10 : (!fir.box>>, index, index, index) -> !fir.ref> + %25 = hlfir.concat %23, %24 len %c20 : (!fir.ref>, !fir.ref>, index) -> !hlfir.expr> + hlfir.yield_element %25 : !hlfir.expr> + } + %15:2 = hlfir.declare %1#0 {uniq_name = "_QFeoshift17cFcharc_boundaryEn"} : (!fir.ref) -> (!fir.ref, !fir.ref) + %16 = fir.load %15#0 : !fir.ref + %17 = fir.convert %16 : (i32) -> index + %18 = arith.cmpi sgt, %17, %c0 : index + %19 = arith.select %18, %17, %c0 : index + %20 = fir.shape %19 : (index) -> !fir.shape<1> + %21 = hlfir.eval_in_mem shape %20 typeparams %c10 : (!fir.shape<1>, index) -> !hlfir.expr> { + ^bb0(%arg2: !fir.ref>>): + %23 = fir.call @_QPcharc_boundary(%1#0) fastmath : (!fir.ref) -> !fir.array> + fir.save_result %23 to %arg2(%20) typeparams %c10 : !fir.array>, !fir.ref>>, !fir.shape<1>, index + } + %22 = hlfir.eoshift %14 %c2_i32 boundary %21 : (!hlfir.expr>, i32, !hlfir.expr>) -> !hlfir.expr> + hlfir.assign %22 to %13#0 : !hlfir.expr>, !fir.box>> + hlfir.destroy %22 : !hlfir.expr> + hlfir.destroy %21 : !hlfir.expr> + hlfir.destroy %14 : !hlfir.expr> + return +} +// CHECK-LABEL: func.func @_QPeoshift17c( +// CHECK: hlfir.eoshift + +// ! Tests for derived types. + +// ! TODO: selecting between !fir.ref> and !fir.box> +// ! is not implemented. +// ! Test contiguous 1D array with the scalar optional boundary. 
+// subroutine eoshift1d(n, array, boundary) +// use eoshift_types +// integer :: n +// type(t) :: array(n) +// type(t), optional :: boundary +// array = EOSHIFT(array, 2, boundary) +// end subroutine +func.func @_QPeoshift1d(%arg0: !fir.ref {fir.bindc_name = "n"}, %arg1: !fir.ref>> {fir.bindc_name = "array"}, %arg2: !fir.ref> {fir.bindc_name = "boundary", fir.optional}) { + %c2_i32 = arith.constant 2 : i32 + %c0 = arith.constant 0 : index + %0 = fir.dummy_scope : !fir.dscope + %1:2 = hlfir.declare %arg0 dummy_scope %0 {uniq_name = "_QFeoshift1dEn"} : (!fir.ref, !fir.dscope) -> (!fir.ref, !fir.ref) + %2:2 = hlfir.declare %arg2 dummy_scope %0 {fortran_attrs = #fir.var_attrs, uniq_name = "_QFeoshift1dEboundary"} : (!fir.ref>, !fir.dscope) -> (!fir.ref>, !fir.ref>) + %3 = fir.load %1#0 : !fir.ref + %4 = fir.convert %3 : (i32) -> index + %5 = arith.cmpi sgt, %4, %c0 : index + %6 = arith.select %5, %4, %c0 : index + %7 = fir.shape %6 : (index) -> !fir.shape<1> + %8:2 = hlfir.declare %arg1(%7) dummy_scope %0 {uniq_name = "_QFeoshift1dEarray"} : (!fir.ref>>, !fir.shape<1>, !fir.dscope) -> (!fir.box>>, !fir.ref>>) + %9 = fir.is_present %2#0 : (!fir.ref>) -> i1 + %10 = fir.embox %2#0 : (!fir.ref>) -> !fir.box> + %11 = fir.absent !fir.box> + %12 = arith.select %9, %10, %11 : !fir.box> + %13 = hlfir.eoshift %8#0 %c2_i32 boundary %12 : (!fir.box>>, i32, !fir.box>) -> !hlfir.expr> + hlfir.assign %13 to %8#0 : !hlfir.expr>, !fir.box>> + hlfir.destroy %13 : !hlfir.expr> + return +} +// CHECK-LABEL: func.func @_QPeoshift1d( +// CHECK: hlfir.eoshift diff --git a/flang/test/Lower/Coarray/coarray-init.f90 b/flang/test/Lower/Coarray/coarray-init.f90 new file mode 100644 index 000000000000..055bc0fc4da7 --- /dev/null +++ b/flang/test/Lower/Coarray/coarray-init.f90 @@ -0,0 +1,11 @@ +! RUN: %flang_fc1 -emit-hlfir -fcoarray %s -o - | FileCheck %s --check-prefixes=ALL,COARRAY +! 
RUN: %flang_fc1 -emit-hlfir %s -o - | FileCheck %s --check-prefixes=ALL,NOCOARRAY + +program test_init + +end + +! ALL-LABEL: func.func @main +! ALL: fir.call @_FortranAProgramStart +! COARRAY: fir.call @_QMprifPprif_init(%[[ARG:.*]]) fastmath : (!fir.ref) -> () +! NOCOARRAY-NOT: fir.call @_QMprifPprif_init(%[[ARG:.*]]) fastmath : (!fir.ref) -> () diff --git a/flang/test/Lower/Intrinsics/acosd.f90 b/flang/test/Lower/Intrinsics/acosd.f90 index 7dfa28fd6494..175a4902620b 100644 --- a/flang/test/Lower/Intrinsics/acosd.f90 +++ b/flang/test/Lower/Intrinsics/acosd.f90 @@ -1,3 +1,4 @@ +! REQUIRES: flang-supports-f128-math ! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s --check-prefixes="CHECK" function test_real4(x) @@ -6,9 +7,8 @@ function test_real4(x) end function ! CHECK-LABEL: @_QPtest_real4 -! CHECK: %[[dfactor:.*]] = arith.constant 57.295779513082323 : f64 +! CHECK: %[[factor:.*]] = arith.constant 57.2957763 : f32 ! CHECK: %[[result:.*]] = math.acos %{{.*}} fastmath : f32 -! CHECK: %[[factor:.*]] = fir.convert %[[dfactor]] : (f64) -> f32 ! CHECK: %[[arg:.*]] = arith.mulf %[[result]], %[[factor]] fastmath : f32 function test_real8(x) @@ -17,6 +17,16 @@ function test_real8(x) end function ! CHECK-LABEL: @_QPtest_real8 -! CHECK: %[[dfactor:.*]] = arith.constant 57.295779513082323 : f64 +! CHECK: %[[factor:.*]] = arith.constant 57.295779513082323 : f64 ! CHECK: %[[result:.*]] = math.acos %{{.*}} fastmath : f64 -! CHECK: %[[arg:.*]] = arith.mulf %[[result]], %[[dfactor]] fastmath : f64 +! CHECK: %[[arg:.*]] = arith.mulf %[[result]], %[[factor]] fastmath : f64 + +function test_real16(x) + real(16) :: x, test_real16 + test_real16 = acosd(x) +end function + +! CHECK-LABEL: @_QPtest_real16 +! CHECK: %[[factor:.*]] = arith.constant 57.295779513082320876798154814105{{.*}} : f128 +! CHECK: %[[result:.*]] = fir.call @_FortranAAcosF128({{.*}}) fastmath : (f128) -> f128 +! 
CHECK: %[[arg:.*]] = arith.mulf %[[result]], %[[factor]] fastmath : f128 diff --git a/flang/test/Lower/OpenMP/privatize_predetermined_only_when_defined_by_eval.f90 b/flang/test/Lower/OpenMP/privatize_predetermined_only_when_defined_by_eval.f90 new file mode 100644 index 000000000000..7671073c2598 --- /dev/null +++ b/flang/test/Lower/OpenMP/privatize_predetermined_only_when_defined_by_eval.f90 @@ -0,0 +1,35 @@ +! Fixes a regression uncovered by Fujitsu test 0686_0024.f90. In particular, +! verifies that a pre-determined symbol is only privatized by its defining +! evaluation (e.g. the loop for which the symbol was marked as pre-determined). + +! RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s + +subroutine privatize_predetermined_when_defined_by_eval + integer::i,ii + integer::j + + !$omp parallel + !$omp do lastprivate(ii) + do i=1,10 + do ii=1,10 + enddo + enddo + + !$omp do + do j=1,ii + enddo + !$omp end parallel +end subroutine + +! Verify that nothing is privatized by the `omp.parallel` op. +! CHECK: omp.parallel { + +! Verify that `i` and `ii` are privatized by the first loop. +! CHECK: omp.wsloop private(@{{.*}}ii_private_i32 %{{.*}}#0 -> %{{.*}}, @{{.*}}i_private_i32 %2#0 -> %{{.*}} : {{.*}}) { +! CHECK: } + +! Verify that `j` is privatized by the second loop. +! CHECK: omp.wsloop private(@{{.*}}j_private_i32 %{{.*}}#0 -> %{{.*}} : {{.*}}) { +! CHECK: } + +! 
CHECK: } diff --git a/libc/benchmarks/gpu/BenchmarkLogger.cpp b/libc/benchmarks/gpu/BenchmarkLogger.cpp deleted file mode 100644 index d5996a74f6dd..000000000000 --- a/libc/benchmarks/gpu/BenchmarkLogger.cpp +++ /dev/null @@ -1,97 +0,0 @@ -#include "benchmarks/gpu/BenchmarkLogger.h" -#include "hdr/stdint_proxy.h" -#include "src/__support/CPP/string.h" -#include "src/__support/CPP/string_view.h" -#include "src/__support/OSUtil/io.h" // write_to_stderr -#include "src/__support/big_int.h" // is_big_int -#include "src/__support/macros/config.h" -#include "src/__support/macros/properties/types.h" // LIBC_TYPES_HAS_INT128 -#include "src/__support/uint128.h" - -namespace LIBC_NAMESPACE_DECL { -namespace benchmarks { - -// cpp::string_view specialization -template <> -BenchmarkLogger & - BenchmarkLogger::operator<< (cpp::string_view str) { - LIBC_NAMESPACE::write_to_stderr(str); - return *this; -} - -// cpp::string specialization -template <> -BenchmarkLogger &BenchmarkLogger::operator<< (cpp::string str) { - return *this << static_cast(str); -} - -// const char* specialization -template <> -BenchmarkLogger &BenchmarkLogger::operator<< (const char *str) { - return *this << cpp::string_view(str); -} - -// char* specialization -template <> BenchmarkLogger &BenchmarkLogger::operator<< (char *str) { - return *this << cpp::string_view(str); -} - -// char specialization -template <> BenchmarkLogger &BenchmarkLogger::operator<<(char ch) { - return *this << cpp::string_view(&ch, 1); -} - -// bool specialization -template <> BenchmarkLogger &BenchmarkLogger::operator<<(bool cond) { - return *this << (cond ? 
"true" : "false"); -} - -// void * specialization -template <> BenchmarkLogger &BenchmarkLogger::operator<<(void *addr) { - return *this << "0x" << cpp::to_string(reinterpret_cast(addr)); -} - -template BenchmarkLogger &BenchmarkLogger::operator<<(T t) { - if constexpr (is_big_int_v || - (cpp::is_integral_v && cpp::is_unsigned_v && - (sizeof(T) > sizeof(uint64_t)))) { - static_assert(sizeof(T) % 8 == 0, "Unsupported size of UInt"); - const IntegerToString buffer(t); - return *this << buffer.view(); - } else { - return *this << cpp::to_string(t); - } -} - -// is_integral specializations -// char is already specialized to handle character -template BenchmarkLogger &BenchmarkLogger::operator<< (short); -template BenchmarkLogger &BenchmarkLogger::operator<< (int); -template BenchmarkLogger &BenchmarkLogger::operator<< (long); -template BenchmarkLogger &BenchmarkLogger::operator<< (long long); -template BenchmarkLogger & - BenchmarkLogger::operator<< (unsigned char); -template BenchmarkLogger & - BenchmarkLogger::operator<< (unsigned short); -template BenchmarkLogger & - BenchmarkLogger::operator<< (unsigned int); -template BenchmarkLogger & - BenchmarkLogger::operator<< (unsigned long); -template BenchmarkLogger & - BenchmarkLogger::operator<< (unsigned long long); - -#ifdef LIBC_TYPES_HAS_INT128 -template BenchmarkLogger & - BenchmarkLogger::operator<< <__uint128_t>(__uint128_t); -#endif // LIBC_TYPES_HAS_INT128 -template BenchmarkLogger &BenchmarkLogger::operator<< >(UInt<128>); -template BenchmarkLogger &BenchmarkLogger::operator<< >(UInt<192>); -template BenchmarkLogger &BenchmarkLogger::operator<< >(UInt<256>); -template BenchmarkLogger &BenchmarkLogger::operator<< >(UInt<320>); - -// TODO: Add floating point formatting once it's supported by StringStream. 
- -BenchmarkLogger log; - -} // namespace benchmarks -} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/benchmarks/gpu/BenchmarkLogger.h b/libc/benchmarks/gpu/BenchmarkLogger.h deleted file mode 100644 index 2b22aba085f8..000000000000 --- a/libc/benchmarks/gpu/BenchmarkLogger.h +++ /dev/null @@ -1,29 +0,0 @@ -//===-- Utilities to log to standard output during tests --------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_BENCHMARKS_GPU_BENCHMARKLOGGER_H -#define LLVM_LIBC_BENCHMARKS_GPU_BENCHMARKLOGGER_H - -#include "src/__support/macros/config.h" - -namespace LIBC_NAMESPACE_DECL { -namespace benchmarks { - -// A class to log to standard output in the context of hermetic tests. -struct BenchmarkLogger { - constexpr BenchmarkLogger() = default; - template BenchmarkLogger &operator<<(T); -}; - -// A global TestLogger instance to be used in tests. 
-extern BenchmarkLogger log; - -} // namespace benchmarks -} // namespace LIBC_NAMESPACE_DECL - -#endif /* LLVM_LIBC_BENCHMARKS_GPU_BENCHMARKLOGGER_H */ diff --git a/libc/benchmarks/gpu/CMakeLists.txt b/libc/benchmarks/gpu/CMakeLists.txt index beedac78d482..6ca134b12a47 100644 --- a/libc/benchmarks/gpu/CMakeLists.txt +++ b/libc/benchmarks/gpu/CMakeLists.txt @@ -38,31 +38,25 @@ add_unittest_framework_library( SRCS LibcGpuBenchmark.cpp LibcGpuBenchmarkMain.cpp - BenchmarkLogger.cpp HDRS LibcGpuBenchmark.h - BenchmarkLogger.h DEPENDS + libc.benchmarks.gpu.timing.timing libc.hdr.stdint_proxy - libc.src.__support.big_int - libc.src.__support.c_string libc.src.__support.CPP.string libc.src.__support.CPP.string_view libc.src.__support.CPP.type_traits - libc.src.__support.CPP.limits libc.src.__support.CPP.algorithm libc.src.__support.CPP.atomic libc.src.__support.CPP.array - libc.src.__support.fixed_point.fx_rep - libc.src.__support.macros.properties.types - libc.src.__support.OSUtil.osutil - libc.src.__support.uint128 libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.nearest_integer_operations libc.src.__support.FPUtil.sqrt libc.src.__support.fixedvector - libc.src.time.clock - libc.benchmarks.gpu.timing.timing + libc.src.__support.GPU.utils + libc.src.__support.time.gpu.time_utils libc.src.stdio.printf + libc.src.time.clock ) add_subdirectory(src) diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp index ef816c51a87d..a4a0ff4ec46e 100644 --- a/libc/benchmarks/gpu/LibcGpuBenchmark.cpp +++ b/libc/benchmarks/gpu/LibcGpuBenchmark.cpp @@ -2,16 +2,17 @@ #include "hdr/stdint_proxy.h" #include "src/__support/CPP/algorithm.h" -#include "src/__support/CPP/array.h" #include "src/__support/CPP/atomic.h" #include "src/__support/CPP/string.h" #include "src/__support/FPUtil/FPBits.h" +#include "src/__support/FPUtil/NearestIntegerOperations.h" #include "src/__support/FPUtil/sqrt.h" #include "src/__support/GPU/utils.h" #include 
"src/__support/fixedvector.h" #include "src/__support/macros/config.h" #include "src/__support/time/gpu/time_utils.h" #include "src/stdio/printf.h" +#include "src/time/clock.h" namespace LIBC_NAMESPACE_DECL { namespace benchmarks { @@ -134,11 +135,13 @@ void print_results(Benchmark *b) { cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE); LIBC_NAMESPACE::printf( - "%-24s |%15.0f |%9.0f |%8llu |%8llu |%11llu |%9u |\n", + "%-24s |%15.0f |%9.0f |%8llu |%8llu |%15llu |%9u |\n", b->get_test_name().data(), final_result.cycles, - final_result.standard_deviation, (unsigned long long)final_result.min, - (unsigned long long)final_result.max, - (unsigned long long)final_result.total_iterations, (unsigned)num_threads); + final_result.standard_deviation, + static_cast(final_result.min), + static_cast(final_result.max), + static_cast(final_result.total_iterations), + static_cast(num_threads)); } void print_header() { @@ -147,7 +150,7 @@ void print_header() { benchmarks[0]->get_suite_name().data()); LIBC_NAMESPACE::printf("%s", RESET); cpp::string titles = "Benchmark | Cycles (Mean) | Stddev | " - " Min | Max | Iterations | Threads |\n"; + " Min | Max | Iterations | Threads |\n"; LIBC_NAMESPACE::printf(titles.data()); cpp::string separator(titles.size(), '-'); @@ -226,7 +229,8 @@ BenchmarkResult benchmark(const BenchmarkOptions &options, change_ratio < options.epsilon) break; - iterations = static_cast(iterations * options.scaling_factor); + iterations = static_cast( + fputil::ceil(iterations * options.scaling_factor)); } const auto &estimator = rep.get_estimator(); diff --git a/libc/benchmarks/gpu/LibcGpuBenchmark.h b/libc/benchmarks/gpu/LibcGpuBenchmark.h index 60f69edf8655..e36e93c7efc1 100644 --- a/libc/benchmarks/gpu/LibcGpuBenchmark.h +++ b/libc/benchmarks/gpu/LibcGpuBenchmark.h @@ -1,18 +1,16 @@ #ifndef LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H #define LLVM_LIBC_BENCHMARKS_LIBC_GPU_BENCHMARK_H -#include "benchmarks/gpu/BenchmarkLogger.h" #include 
"benchmarks/gpu/timing/timing.h" + #include "hdr/stdint_proxy.h" #include "src/__support/CPP/algorithm.h" #include "src/__support/CPP/array.h" -#include "src/__support/CPP/limits.h" #include "src/__support/CPP/string_view.h" #include "src/__support/CPP/type_traits.h" #include "src/__support/FPUtil/FPBits.h" #include "src/__support/FPUtil/sqrt.h" #include "src/__support/macros/config.h" -#include "src/time/clock.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt b/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt index d6a89d04dab9..f85152e69c34 100644 --- a/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt +++ b/libc/benchmarks/gpu/timing/amdgpu/CMakeLists.txt @@ -4,10 +4,11 @@ add_header_library( timing.h DEPENDS libc.hdr.stdint_proxy - libc.src.__support.common libc.src.__support.macros.config libc.src.__support.macros.attributes libc.src.__support.CPP.algorithm libc.src.__support.CPP.array + libc.src.__support.CPP.atomic libc.src.__support.CPP.type_traits + libc.src.__support.GPU.utils ) diff --git a/libc/benchmarks/gpu/timing/amdgpu/timing.h b/libc/benchmarks/gpu/timing/amdgpu/timing.h index de721a2d6ce6..8b92584b3923 100644 --- a/libc/benchmarks/gpu/timing/amdgpu/timing.h +++ b/libc/benchmarks/gpu/timing/amdgpu/timing.h @@ -15,7 +15,6 @@ #include "src/__support/CPP/atomic.h" #include "src/__support/CPP/type_traits.h" #include "src/__support/GPU/utils.h" -#include "src/__support/common.h" #include "src/__support/macros/attributes.h" #include "src/__support/macros/config.h" @@ -118,6 +117,8 @@ throughput_baseline(const cpp::array &inputs) { asm("" ::"s"(start)); T result{}; + +#pragma clang loop unroll(disable) for (auto input : inputs) { asm("" ::"v"(input)); result = input; @@ -147,6 +148,8 @@ static LIBC_INLINE uint64_t throughput(F f, const cpp::array &inputs) { asm("" ::"s"(start)); T result{}; + +#pragma clang loop unroll(disable) for (auto input : inputs) { asm("" ::"v"(input)); result = f(input); @@ -175,6 
+178,8 @@ static LIBC_INLINE uint64_t throughput_baseline( asm("" ::"s"(start)); T result{}; + +#pragma clang loop unroll(disable) for (size_t i = 0; i < N; i++) { T x = inputs1[i]; T y = inputs2[i]; @@ -207,6 +212,8 @@ static LIBC_INLINE uint64_t throughput(F f, const cpp::array &inputs1, asm("" ::"s"(start)); T result{}; + +#pragma clang loop unroll(disable) for (size_t i = 0; i < N; i++) { T x = inputs1[i]; T y = inputs2[i]; diff --git a/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt b/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt index 801080e7a6e9..4615f53e3d24 100644 --- a/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt +++ b/libc/benchmarks/gpu/timing/nvptx/CMakeLists.txt @@ -4,10 +4,11 @@ add_header_library( timing.h DEPENDS libc.hdr.stdint_proxy - libc.src.__support.common libc.src.__support.macros.config libc.src.__support.macros.attributes libc.src.__support.CPP.algorithm libc.src.__support.CPP.array + libc.src.__support.CPP.atomic libc.src.__support.CPP.type_traits + libc.src.__support.GPU.utils ) diff --git a/libc/benchmarks/gpu/timing/nvptx/timing.h b/libc/benchmarks/gpu/timing/nvptx/timing.h index 133032ca0842..944d3732eae6 100644 --- a/libc/benchmarks/gpu/timing/nvptx/timing.h +++ b/libc/benchmarks/gpu/timing/nvptx/timing.h @@ -13,9 +13,7 @@ #include "src/__support/CPP/algorithm.h" #include "src/__support/CPP/array.h" #include "src/__support/CPP/atomic.h" -#include "src/__support/CPP/type_traits.h" #include "src/__support/GPU/utils.h" -#include "src/__support/common.h" #include "src/__support/macros/attributes.h" #include "src/__support/macros/config.h" @@ -66,7 +64,7 @@ template uint64_t stop = gpu::processor_clock(); cpp::atomic_thread_fence(cpp::MemoryOrder::ACQ_REL); asm("" ::"r"(stop)); - volatile T output = result; + volatile auto output = result; // Return the time elapsed. 
return stop - start; @@ -108,6 +106,8 @@ throughput_baseline(const cpp::array &inputs) { asm("" ::"llr"(start)); T result{}; + +#pragma clang loop unroll(disable) for (auto input : inputs) { asm("" ::"r"(input)); result = input; @@ -137,6 +137,8 @@ static LIBC_INLINE uint64_t throughput(F f, const cpp::array &inputs) { asm("" ::"llr"(start)); T result{}; + +#pragma clang loop unroll(disable) for (auto input : inputs) { asm("" ::"r"(input)); result = f(input); @@ -165,6 +167,8 @@ static LIBC_INLINE uint64_t throughput_baseline( asm("" ::"llr"(start)); T result{}; + +#pragma clang loop unroll(disable) for (size_t i = 0; i < N; i++) { T x = inputs1[i]; T y = inputs2[i]; @@ -197,6 +201,8 @@ static LIBC_INLINE uint64_t throughput(F f, const cpp::array &inputs1, asm("" ::"llr"(start)); T result{}; + +#pragma clang loop unroll(disable) for (size_t i = 0; i < N; i++) { T x = inputs1[i]; T y = inputs2[i]; diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt index e71dc2ee0d02..1bc5df9d45a9 100644 --- a/libc/config/linux/aarch64/entrypoints.txt +++ b/libc/config/linux/aarch64/entrypoints.txt @@ -660,6 +660,7 @@ if(LIBC_TYPES_HAS_FLOAT16) list(APPEND TARGET_LIBM_ENTRYPOINTS # math.h C23 _Float16 entrypoints # libc.src.math.acoshf16 + libc.src.math.asinpif16 libc.src.math.canonicalizef16 libc.src.math.ceilf16 libc.src.math.copysignf16 diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index 1ee10e6d3cad..1fc9a2b901c1 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -704,6 +704,7 @@ if(LIBC_TYPES_HAS_FLOAT16) libc.src.math.acospif16 libc.src.math.asinf16 libc.src.math.asinhf16 + libc.src.math.asinpif16 libc.src.math.atanf16 libc.src.math.atanhf16 libc.src.math.canonicalizef16 diff --git a/libc/docs/headers/math/index.rst b/libc/docs/headers/math/index.rst index add34d0e877f..de2849d1418d 100644 --- a/libc/docs/headers/math/index.rst 
+++ b/libc/docs/headers/math/index.rst @@ -268,7 +268,7 @@ Higher Math Functions +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | asinh | |check| | | | |check| | | 7.12.5.2 | F.10.2.2 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ -| asinpi | | | | | | 7.12.4.9 | F.10.1.9 | +| asinpi | | | | |check| | | 7.12.4.9 | F.10.1.9 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | atan | |check| | 1 ULP | | |check| | | 7.12.4.3 | F.10.1.3 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ diff --git a/libc/include/math.yaml b/libc/include/math.yaml index e8ac7ee5033d..4e398676bf91 100644 --- a/libc/include/math.yaml +++ b/libc/include/math.yaml @@ -79,6 +79,13 @@ functions: arguments: - type: _Float16 guard: LIBC_TYPES_HAS_FLOAT16 + - name: asinpif16 + standards: + - stdc + return_type: _Float16 + arguments: + - type: _Float16 + guard: LIBC_TYPES_HAS_FLOAT16 - name: atan standards: - stdc diff --git a/libc/include/sched.yaml b/libc/include/sched.yaml index f14799ddf33f..8014aa7ed61f 100644 --- a/libc/include/sched.yaml +++ b/libc/include/sched.yaml @@ -20,7 +20,7 @@ functions: - type: const cpu_set_t * - name: getcpu standards: - - POSIX + - Linux return_type: int arguments: - type: unsigned int * diff --git a/libc/include/sys/ioctl.yaml b/libc/include/sys/ioctl.yaml index 5f7b7f333191..7eb66b657664 100644 --- a/libc/include/sys/ioctl.yaml +++ b/libc/include/sys/ioctl.yaml @@ -5,4 +5,12 @@ macros: [] types: [] enums: [] objects: [] -functions: [] +functions: 
+ - name: ioctl + standards: + - Linux + return_type: int + arguments: + - type: int + - type: unsigned long + - type: '...' diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt index 8db5901afa9c..187bc92e5c2c 100644 --- a/libc/src/math/CMakeLists.txt +++ b/libc/src/math/CMakeLists.txt @@ -58,6 +58,8 @@ add_math_entrypoint_object(asinh) add_math_entrypoint_object(asinhf) add_math_entrypoint_object(asinhf16) +add_math_entrypoint_object(asinpif16) + add_math_entrypoint_object(atan) add_math_entrypoint_object(atanf) add_math_entrypoint_object(atanf16) diff --git a/libc/src/math/asinpif16.h b/libc/src/math/asinpif16.h new file mode 100644 index 000000000000..b97166af63f5 --- /dev/null +++ b/libc/src/math/asinpif16.h @@ -0,0 +1,21 @@ +//===-- Implementation header for asinpif16 ---------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_ASINPIF16_H +#define LLVM_LIBC_SRC_MATH_ASINPIF16_H + +#include "src/__support/macros/config.h" +#include "src/__support/macros/properties/types.h" + +namespace LIBC_NAMESPACE_DECL { + +float16 asinpif16(float16 x); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_MATH_ASINPIF16_H diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt index 65e483926844..22aa3cfedfbc 100644 --- a/libc/src/math/generic/CMakeLists.txt +++ b/libc/src/math/generic/CMakeLists.txt @@ -4035,6 +4035,25 @@ add_entrypoint_object( libc.src.__support.math.asinhf16 ) +add_entrypoint_object( + asinpif16 + SRCS + asinpif16.cpp + HDRS + ../asinpif16.h + DEPENDS + libc.hdr.errno_macros + libc.hdr.fenv_macros + libc.src.__support.FPUtil.cast + libc.src.__support.FPUtil.except_value_utils + libc.src.__support.FPUtil.fenv_impl + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.multiply_add + libc.src.__support.FPUtil.polyeval + libc.src.__support.FPUtil.sqrt + libc.src.__support.macros.optimization +) + add_entrypoint_object( atanhf SRCS diff --git a/libc/src/math/generic/asinpif16.cpp b/libc/src/math/generic/asinpif16.cpp new file mode 100644 index 000000000000..aabc0863ba52 --- /dev/null +++ b/libc/src/math/generic/asinpif16.cpp @@ -0,0 +1,127 @@ +//===-- Half-precision asinpif16(x) function ------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception. 
+// +//===----------------------------------------------------------------------===// + +#include "src/math/asinpif16.h" +#include "hdr/errno_macros.h" +#include "hdr/fenv_macros.h" +#include "src/__support/FPUtil/FEnvImpl.h" +#include "src/__support/FPUtil/FPBits.h" +#include "src/__support/FPUtil/PolyEval.h" +#include "src/__support/FPUtil/cast.h" +#include "src/__support/FPUtil/except_value_utils.h" +#include "src/__support/FPUtil/multiply_add.h" +#include "src/__support/FPUtil/sqrt.h" +#include "src/__support/macros/optimization.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(float16, asinpif16, (float16 x)) { + using FPBits = fputil::FPBits; + + FPBits xbits(x); + bool is_neg = xbits.is_neg(); + double x_abs = fputil::cast(xbits.abs().get_val()); + + auto signed_result = [is_neg](auto r) -> auto { return is_neg ? -r : r; }; + + if (LIBC_UNLIKELY(x_abs > 1.0)) { + // asinpif16(NaN) = NaN + if (xbits.is_nan()) { + if (xbits.is_signaling_nan()) { + fputil::raise_except_if_required(FE_INVALID); + return FPBits::quiet_nan().get_val(); + } + return x; + } + + // 1 < |x| <= +/-inf + fputil::raise_except_if_required(FE_INVALID); + fputil::set_errno_if_required(EDOM); + + return FPBits::quiet_nan().get_val(); + } + + // the coefficients for the polynomial approximation of asin(x)/pi in the + // range [0, 0.5] extracted using python-sympy + // + // Python code to generate the coefficients: + // > from sympy import * + // > import math + // > x = symbols('x') + // > print(series(asin(x)/math.pi, x, 0, 21)) + // + // OUTPUT: + // + // 0.318309886183791*x + 0.0530516476972984*x**3 + 0.0238732414637843*x**5 + + // 0.0142102627760621*x**7 + 0.00967087327815336*x**9 + + // 0.00712127941391293*x**11 + 0.00552355646848375*x**13 + + // 0.00444514782463692*x**15 + 0.00367705242846804*x**17 + + // 0.00310721681820837*x**19 + O(x**21) + // + // it's very accurate in the range [0, 0.5] and has a maximum error of + // 0.0000000000000001 in the range [0, 0.5]. 
+ constexpr double POLY_COEFFS[] = { + 0x1.45f306dc9c889p-2, // x^1 + 0x1.b2995e7b7b5fdp-5, // x^3 + 0x1.8723a1d588a36p-6, // x^5 + 0x1.d1a452f20430dp-7, // x^7 + 0x1.3ce52a3a09f61p-7, // x^9 + 0x1.d2b33e303d375p-8, // x^11 + 0x1.69fde663c674fp-8, // x^13 + 0x1.235134885f19bp-8, // x^15 + }; + // polynomial evaluation using Horner's method + // works only for |x| in [0, 0.5] + auto asinpi_polyeval = [](double x) -> double { + return x * fputil::polyeval(x * x, POLY_COEFFS[0], POLY_COEFFS[1], + POLY_COEFFS[2], POLY_COEFFS[3], POLY_COEFFS[4], + POLY_COEFFS[5], POLY_COEFFS[6], POLY_COEFFS[7]); + }; + + // if |x| <= 0.5: + if (LIBC_UNLIKELY(x_abs <= 0.5)) { + // Use polynomial approximation of asin(x)/pi in the range [0, 0.5] + double result = asinpi_polyeval(fputil::cast(x)); + return fputil::cast(result); + } + + // If |x| > 0.5, we need to use the range reduction method: + // y = asin(x) => x = sin(y) + // because: sin(a) = cos(pi/2 - a) + // therefore: + // x = cos(pi/2 - y) + // let z = pi/2 - y, + // x = cos(z) + // because: cos(2a) = 1 - 2 * sin^2(a), z = 2a, a = z/2 + // therefore: + // cos(z) = 1 - 2 * sin^2(z/2) + // sin(z/2) = sqrt((1 - cos(z))/2) + // sin(z/2) = sqrt((1 - x)/2) + // let u = (1 - x)/2 + // then: + // sin(z/2) = sqrt(u) + // z/2 = asin(sqrt(u)) + // z = 2 * asin(sqrt(u)) + // pi/2 - y = 2 * asin(sqrt(u)) + // y = pi/2 - 2 * asin(sqrt(u)) + // y/pi = 1/2 - 2 * asin(sqrt(u))/pi + // + // Finally, we can write: + // asinpi(x) = 1/2 - 2 * asinpi(sqrt(u)) + // where u = (1 - x) / 2 + // = 0.5 - 0.5 * x + // = multiply_add(-0.5, x, 0.5) + + double u = fputil::multiply_add(-0.5, x_abs, 0.5); + double asinpi_sqrt_u = asinpi_polyeval(fputil::sqrt(u)); + double result = fputil::multiply_add(-2.0, asinpi_sqrt_u, 0.5); + + return fputil::cast(signed_result(result)); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt index 11bbf670c98d..e15df147c3c3 100644 --- 
a/libc/test/src/math/CMakeLists.txt +++ b/libc/test/src/math/CMakeLists.txt @@ -2282,6 +2282,17 @@ add_fp_unittest( libc.src.math.asinf16 ) +add_fp_unittest( + asinpif16_test + NEED_MPFR + SUITE + libc-math-unittests + SRCS + asinpif16_test.cpp + DEPENDS + libc.src.math.asinpif16 +) + add_fp_unittest( acosf_test NEED_MPFR diff --git a/libc/test/src/math/asinpif16_test.cpp b/libc/test/src/math/asinpif16_test.cpp new file mode 100644 index 000000000000..3718f39fd06a --- /dev/null +++ b/libc/test/src/math/asinpif16_test.cpp @@ -0,0 +1,40 @@ +//===-- Exhaustive test for asinpif16 -------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/asinpif16.h" +#include "test/UnitTest/FPMatcher.h" +#include "test/UnitTest/Test.h" +#include "utils/MPFRWrapper/MPFRUtils.h" + +using LlvmLibcAsinpif16Test = LIBC_NAMESPACE::testing::FPTest; + +namespace mpfr = LIBC_NAMESPACE::testing::mpfr; + +// Range: [0, Inf] +static constexpr uint16_t POS_START = 0x0000U; +static constexpr uint16_t POS_STOP = 0x7c00U; + +// Range: [-Inf, 0] +static constexpr uint16_t NEG_START = 0x8000U; +static constexpr uint16_t NEG_STOP = 0xfc00U; + +TEST_F(LlvmLibcAsinpif16Test, PositiveRange) { + for (uint16_t v = POS_START; v <= POS_STOP; ++v) { + float16 x = FPBits(v).get_val(); + EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Asinpi, x, + LIBC_NAMESPACE::asinpif16(x), 0.5); + } +} + +TEST_F(LlvmLibcAsinpif16Test, NegativeRange) { + for (uint16_t v = NEG_START; v <= NEG_STOP; ++v) { + float16 x = FPBits(v).get_val(); + EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Asinpi, x, + LIBC_NAMESPACE::asinpif16(x), 0.5); + } +} diff --git a/libc/test/src/math/smoke/CMakeLists.txt 
b/libc/test/src/math/smoke/CMakeLists.txt index 00881bd27f24..b800f7aba98d 100644 --- a/libc/test/src/math/smoke/CMakeLists.txt +++ b/libc/test/src/math/smoke/CMakeLists.txt @@ -4216,6 +4216,18 @@ add_fp_unittest( libc.src.math.asinhf16 ) +add_fp_unittest( + asinpif16_test + NEED_MPFR + SUITE + libc-math-unittests + SRCS + asinpif16_test.cpp + DEPENDS + libc.src.math.asinpif16 + libc.src.errno.errno +) + add_fp_unittest( acoshf_test SUITE diff --git a/libc/test/src/math/smoke/asinpif16_test.cpp b/libc/test/src/math/smoke/asinpif16_test.cpp new file mode 100644 index 000000000000..5303eed8f5da --- /dev/null +++ b/libc/test/src/math/smoke/asinpif16_test.cpp @@ -0,0 +1,86 @@ +//===-- Unittests for asinpif16 -------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/__support/libc_errno.h" +#include "src/math/asinpif16.h" +#include "test/UnitTest/FPMatcher.h" + +using LlvmLibcAsinpif16Test = LIBC_NAMESPACE::testing::FPTest; + +TEST_F(LlvmLibcAsinpif16Test, SpecialNumbers) { + // zero + EXPECT_FP_EQ(zero, LIBC_NAMESPACE::asinpif16(zero)); + + // +/-1 + EXPECT_FP_EQ(0.5f16, LIBC_NAMESPACE::asinpif16(1.0)); + EXPECT_FP_EQ(-0.5f16, LIBC_NAMESPACE::asinpif16(-1.0)); + + // NaN inputs + EXPECT_FP_EQ(FPBits::quiet_nan().get_val(), + LIBC_NAMESPACE::asinpif16(FPBits::quiet_nan().get_val())); + + EXPECT_FP_EQ(FPBits::quiet_nan().get_val(), + LIBC_NAMESPACE::asinpif16(FPBits::signaling_nan().get_val())); + + // infinity inputs -> should return NaN + libc_errno = 0; + EXPECT_FP_EQ(FPBits::quiet_nan().get_val(), LIBC_NAMESPACE::asinpif16(inf)); + EXPECT_MATH_ERRNO(EDOM); + + libc_errno = 0; + EXPECT_FP_EQ(FPBits::quiet_nan().get_val(), + LIBC_NAMESPACE::asinpif16(neg_inf)); + 
EXPECT_MATH_ERRNO(EDOM); +} + +TEST_F(LlvmLibcAsinpif16Test, OutOfRange) { + // Test values > 1 + libc_errno = 0; + EXPECT_FP_EQ(FPBits::quiet_nan().get_val(), + LIBC_NAMESPACE::asinpif16(1.5f16)); + EXPECT_MATH_ERRNO(EDOM); + + libc_errno = 0; + EXPECT_FP_EQ(FPBits::quiet_nan().get_val(), + LIBC_NAMESPACE::asinpif16(2.0f16)); + EXPECT_MATH_ERRNO(EDOM); + + // Test values < -1 + libc_errno = 0; + EXPECT_FP_EQ(FPBits::quiet_nan().get_val(), + LIBC_NAMESPACE::asinpif16(-1.5f16)); + EXPECT_MATH_ERRNO(EDOM); + + libc_errno = 0; + EXPECT_FP_EQ(FPBits::quiet_nan().get_val(), + LIBC_NAMESPACE::asinpif16(-2.0f16)); + EXPECT_MATH_ERRNO(EDOM); + + // Test maximum normal value (should be > 1 for float16) + libc_errno = 0; + EXPECT_FP_EQ(FPBits::quiet_nan().get_val(), + LIBC_NAMESPACE::asinpif16(FPBits::max_normal().get_val())); + EXPECT_MATH_ERRNO(EDOM); +} + +TEST_F(LlvmLibcAsinpif16Test, SymmetryProperty) { + // Test that asinpi(-x) = -asinpi(x) + constexpr float16 TEST_VALS[] = {0.1f16, 0.25f16, 0.5f16, 0.75f16, + 0.9f16, 0.99f16, 1.0f16}; + + for (float16 x : TEST_VALS) { + FPBits neg_x_bits(x); + neg_x_bits.set_sign(Sign::NEG); + float16 neg_x = neg_x_bits.get_val(); + + float16 pos_result = LIBC_NAMESPACE::asinpif16(x); + float16 neg_result = LIBC_NAMESPACE::asinpif16(neg_x); + + EXPECT_FP_EQ(pos_result, FPBits(neg_result).abs().get_val()); + } +} diff --git a/libc/utils/MPFRWrapper/MPCommon.cpp b/libc/utils/MPFRWrapper/MPCommon.cpp index 07339a06fff8..77039d4bf7df 100644 --- a/libc/utils/MPFRWrapper/MPCommon.cpp +++ b/libc/utils/MPFRWrapper/MPCommon.cpp @@ -105,6 +105,21 @@ MPFRNumber MPFRNumber::asinh() const { return result; } +MPFRNumber MPFRNumber::asinpi() const { + MPFRNumber result(*this); +#if MPFR_VERSION >= MPFR_VERSION_NUM(4, 2, 0) + mpfr_asinpi(result.value, value, mpfr_rounding); + return result; +#else + MPFRNumber value_asin(0.0, 1280); + mpfr_asin(value_asin.value, value, MPFR_RNDN); + MPFRNumber value_pi(0.0, 1280); + mpfr_const_pi(value_pi.value, 
MPFR_RNDN); + mpfr_div(result.value, value_asin.value, value_pi.value, mpfr_rounding); + return result; +#endif +} + MPFRNumber MPFRNumber::atan() const { MPFRNumber result(*this); mpfr_atan(result.value, value, mpfr_rounding); diff --git a/libc/utils/MPFRWrapper/MPCommon.h b/libc/utils/MPFRWrapper/MPCommon.h index 8bcc69c247a3..47d6293c06af 100644 --- a/libc/utils/MPFRWrapper/MPCommon.h +++ b/libc/utils/MPFRWrapper/MPCommon.h @@ -189,6 +189,7 @@ public: MPFRNumber add(const MPFRNumber &b) const; MPFRNumber asin() const; MPFRNumber asinh() const; + MPFRNumber asinpi() const; MPFRNumber atan() const; MPFRNumber atan2(const MPFRNumber &b); MPFRNumber atanh() const; diff --git a/libc/utils/MPFRWrapper/MPFRUtils.cpp b/libc/utils/MPFRWrapper/MPFRUtils.cpp index 3ab129a1a6fc..ff3bebb1c500 100644 --- a/libc/utils/MPFRWrapper/MPFRUtils.cpp +++ b/libc/utils/MPFRWrapper/MPFRUtils.cpp @@ -39,6 +39,8 @@ unary_operation(Operation op, InputType input, unsigned int precision, return mpfrInput.asin(); case Operation::Asinh: return mpfrInput.asinh(); + case Operation::Asinpi: + return mpfrInput.asinpi(); case Operation::Atan: return mpfrInput.atan(); case Operation::Atanh: diff --git a/libc/utils/MPFRWrapper/MPFRUtils.h b/libc/utils/MPFRWrapper/MPFRUtils.h index 45468c6cb19a..e805607328f6 100644 --- a/libc/utils/MPFRWrapper/MPFRUtils.h +++ b/libc/utils/MPFRWrapper/MPFRUtils.h @@ -30,6 +30,7 @@ enum class Operation : int { Acospi, Asin, Asinh, + Asinpi, Atan, Atanh, Cbrt, diff --git a/libclc/clc/include/clc/atomic/atomic_decl.inc b/libclc/clc/include/clc/atomic/atomic_decl.inc index b790a94c7d28..5e0f456e3400 100644 --- a/libclc/clc/include/clc/atomic/atomic_decl.inc +++ b/libclc/clc/include/clc/atomic/atomic_decl.inc @@ -14,23 +14,23 @@ #ifdef __CLC_NO_VALUE_ARG #define __CLC_DECLARE_ATOMIC(ADDRSPACE) \ - _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE FUNCTION( \ + _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION( \ volatile ADDRSPACE __CLC_GENTYPE *Ptr, int MemoryOrder, \ int 
MemoryScope); #elif defined(__CLC_RETURN_VOID) #define __CLC_DECLARE_ATOMIC(ADDRSPACE) \ - _CLC_OVERLOAD _CLC_DECL void FUNCTION(volatile ADDRSPACE __CLC_GENTYPE *Ptr, \ - __CLC_GENTYPE Value, int MemoryOrder, \ - int MemoryScope); + _CLC_OVERLOAD _CLC_DECL void __CLC_FUNCTION( \ + volatile ADDRSPACE __CLC_GENTYPE *Ptr, __CLC_GENTYPE Value, \ + int MemoryOrder, int MemoryScope); #elif defined(__CLC_COMPARE_EXCHANGE) #define __CLC_DECLARE_ATOMIC(ADDRSPACE) \ - _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE FUNCTION( \ + _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION( \ volatile ADDRSPACE __CLC_GENTYPE *Ptr, __CLC_GENTYPE Comparator, \ __CLC_GENTYPE Value, int MemoryOrderEqual, int MemoryOrderUnequal, \ int MemoryScope); #else #define __CLC_DECLARE_ATOMIC(ADDRSPACE) \ - _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE FUNCTION( \ + _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION( \ volatile ADDRSPACE __CLC_GENTYPE *Ptr, __CLC_GENTYPE Value, \ int MemoryOrder, int MemoryScope); #endif diff --git a/libclc/clc/include/clc/atomic/clc_atomic_compare_exchange.h b/libclc/clc/include/clc/atomic/clc_atomic_compare_exchange.h index ae7918ac32e4..31b816722350 100644 --- a/libclc/clc/include/clc/atomic/clc_atomic_compare_exchange.h +++ b/libclc/clc/include/clc/atomic/clc_atomic_compare_exchange.h @@ -11,7 +11,7 @@ #include -#define FUNCTION __clc_atomic_compare_exchange +#define __CLC_FUNCTION __clc_atomic_compare_exchange #define __CLC_COMPARE_EXCHANGE #define __CLC_BODY @@ -21,6 +21,6 @@ #include #undef __CLC_COMPARE_EXCHANGE -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_ATOMIC_CLC_ATOMIC_COMPARE_EXCHANGE_H__ diff --git a/libclc/clc/include/clc/atomic/clc_atomic_dec.h b/libclc/clc/include/clc/atomic/clc_atomic_dec.h index ada36ba3ff9b..66302b6b9ade 100644 --- a/libclc/clc/include/clc/atomic/clc_atomic_dec.h +++ b/libclc/clc/include/clc/atomic/clc_atomic_dec.h @@ -11,13 +11,13 @@ #include -#define FUNCTION __clc_atomic_dec +#define __CLC_FUNCTION __clc_atomic_dec #define 
__CLC_NO_VALUE_ARG #define __CLC_BODY #include #undef __CLC_NO_VALUE_ARG -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_ATOMIC_CLC_ATOMIC_DEC_H__ diff --git a/libclc/clc/include/clc/atomic/clc_atomic_exchange.h b/libclc/clc/include/clc/atomic/clc_atomic_exchange.h index 7e626d4a8830..321cfb403085 100644 --- a/libclc/clc/include/clc/atomic/clc_atomic_exchange.h +++ b/libclc/clc/include/clc/atomic/clc_atomic_exchange.h @@ -11,7 +11,7 @@ #include -#define FUNCTION __clc_atomic_exchange +#define __CLC_FUNCTION __clc_atomic_exchange #define __CLC_BODY #include @@ -19,6 +19,6 @@ #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_ATOMIC_CLC_ATOMIC_EXCHANGE_H__ diff --git a/libclc/clc/include/clc/atomic/clc_atomic_fetch_add.h b/libclc/clc/include/clc/atomic/clc_atomic_fetch_add.h index ad0c2eb4607a..4ef29fce50af 100644 --- a/libclc/clc/include/clc/atomic/clc_atomic_fetch_add.h +++ b/libclc/clc/include/clc/atomic/clc_atomic_fetch_add.h @@ -11,7 +11,7 @@ #include -#define FUNCTION __clc_atomic_fetch_add +#define __CLC_FUNCTION __clc_atomic_fetch_add #define __CLC_BODY #include @@ -19,6 +19,6 @@ #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_ATOMIC_CLC_ATOMIC_FETCH_ADD_H__ diff --git a/libclc/clc/include/clc/atomic/clc_atomic_fetch_and.h b/libclc/clc/include/clc/atomic/clc_atomic_fetch_and.h index 80810c38cbbb..688c11287db5 100644 --- a/libclc/clc/include/clc/atomic/clc_atomic_fetch_and.h +++ b/libclc/clc/include/clc/atomic/clc_atomic_fetch_and.h @@ -11,11 +11,11 @@ #include -#define FUNCTION __clc_atomic_fetch_and +#define __CLC_FUNCTION __clc_atomic_fetch_and #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_ATOMIC_CLC_ATOMIC_FETCH_AND_H__ diff --git a/libclc/clc/include/clc/atomic/clc_atomic_fetch_max.h b/libclc/clc/include/clc/atomic/clc_atomic_fetch_max.h index 56f511922e5c..a4e44b88a697 100644 --- a/libclc/clc/include/clc/atomic/clc_atomic_fetch_max.h +++ 
b/libclc/clc/include/clc/atomic/clc_atomic_fetch_max.h @@ -11,7 +11,7 @@ #include -#define FUNCTION __clc_atomic_fetch_max +#define __CLC_FUNCTION __clc_atomic_fetch_max #define __CLC_BODY #include @@ -19,6 +19,6 @@ #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_ATOMIC_CLC_ATOMIC_FETCH_MAX_H__ diff --git a/libclc/clc/include/clc/atomic/clc_atomic_fetch_min.h b/libclc/clc/include/clc/atomic/clc_atomic_fetch_min.h index f17408d28a35..b58b538649e1 100644 --- a/libclc/clc/include/clc/atomic/clc_atomic_fetch_min.h +++ b/libclc/clc/include/clc/atomic/clc_atomic_fetch_min.h @@ -11,7 +11,7 @@ #include -#define FUNCTION __clc_atomic_fetch_min +#define __CLC_FUNCTION __clc_atomic_fetch_min #define __CLC_BODY #include @@ -19,6 +19,6 @@ #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_ATOMIC_CLC_ATOMIC_FETCH_MIN_H__ diff --git a/libclc/clc/include/clc/atomic/clc_atomic_fetch_or.h b/libclc/clc/include/clc/atomic/clc_atomic_fetch_or.h index b82069e6f960..ab303cc673ba 100644 --- a/libclc/clc/include/clc/atomic/clc_atomic_fetch_or.h +++ b/libclc/clc/include/clc/atomic/clc_atomic_fetch_or.h @@ -11,11 +11,11 @@ #include -#define FUNCTION __clc_atomic_fetch_or +#define __CLC_FUNCTION __clc_atomic_fetch_or #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_ATOMIC_CLC_ATOMIC_FETCH_OR_H__ diff --git a/libclc/clc/include/clc/atomic/clc_atomic_fetch_sub.h b/libclc/clc/include/clc/atomic/clc_atomic_fetch_sub.h index 6cfd224629d6..d9deae085a61 100644 --- a/libclc/clc/include/clc/atomic/clc_atomic_fetch_sub.h +++ b/libclc/clc/include/clc/atomic/clc_atomic_fetch_sub.h @@ -11,7 +11,7 @@ #include -#define FUNCTION __clc_atomic_fetch_sub +#define __CLC_FUNCTION __clc_atomic_fetch_sub #define __CLC_BODY #include @@ -19,6 +19,6 @@ #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_ATOMIC_CLC_ATOMIC_FETCH_SUB_H__ diff --git 
a/libclc/clc/include/clc/atomic/clc_atomic_fetch_xor.h b/libclc/clc/include/clc/atomic/clc_atomic_fetch_xor.h index b007b47a9369..fe13a641fc1c 100644 --- a/libclc/clc/include/clc/atomic/clc_atomic_fetch_xor.h +++ b/libclc/clc/include/clc/atomic/clc_atomic_fetch_xor.h @@ -11,11 +11,11 @@ #include -#define FUNCTION __clc_atomic_fetch_xor +#define __CLC_FUNCTION __clc_atomic_fetch_xor #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_ATOMIC_CLC_ATOMIC_FETCH_XOR_H__ diff --git a/libclc/clc/include/clc/atomic/clc_atomic_inc.h b/libclc/clc/include/clc/atomic/clc_atomic_inc.h index 3ddef4a8bf35..c6fcdad30949 100644 --- a/libclc/clc/include/clc/atomic/clc_atomic_inc.h +++ b/libclc/clc/include/clc/atomic/clc_atomic_inc.h @@ -11,13 +11,13 @@ #include -#define FUNCTION __clc_atomic_inc +#define __CLC_FUNCTION __clc_atomic_inc #define __CLC_NO_VALUE_ARG #define __CLC_BODY #include #undef __CLC_NO_VALUE_ARG -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_ATOMIC_CLC_ATOMIC_INC_H__ diff --git a/libclc/clc/include/clc/atomic/clc_atomic_load.h b/libclc/clc/include/clc/atomic/clc_atomic_load.h index a4899b34b88a..3abfce86b194 100644 --- a/libclc/clc/include/clc/atomic/clc_atomic_load.h +++ b/libclc/clc/include/clc/atomic/clc_atomic_load.h @@ -11,7 +11,7 @@ #include -#define FUNCTION __clc_atomic_load +#define __CLC_FUNCTION __clc_atomic_load #define __CLC_NO_VALUE_ARG #define __CLC_BODY @@ -21,6 +21,6 @@ #include #undef __CLC_NO_VALUE_ARG -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_ATOMIC_CLC_ATOMIC_LOAD_H__ diff --git a/libclc/clc/include/clc/atomic/clc_atomic_store.h b/libclc/clc/include/clc/atomic/clc_atomic_store.h index 6baf0eb7ea32..94d77621735d 100644 --- a/libclc/clc/include/clc/atomic/clc_atomic_store.h +++ b/libclc/clc/include/clc/atomic/clc_atomic_store.h @@ -11,7 +11,7 @@ #include -#define FUNCTION __clc_atomic_store +#define __CLC_FUNCTION __clc_atomic_store #define __CLC_RETURN_VOID #define __CLC_BODY @@ -21,6 +21,6 
@@ #include #undef __CLC_RETURN_VOID -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_ATOMIC_CLC_ATOMIC_STORE_H__ diff --git a/libclc/clc/include/clc/clcmacro.h b/libclc/clc/include/clc/clcmacro.h index 5c67c937cb1d..9fa11489b145 100644 --- a/libclc/clc/include/clc/clcmacro.h +++ b/libclc/clc/include/clc/clcmacro.h @@ -12,58 +12,58 @@ #include #include -#define _CLC_V_V_VP_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, \ +#define _CLC_V_V_VP_VECTORIZE(DECLSPEC, RET_TYPE, __CLC_FUNCTION, ARG1_TYPE, \ ADDR_SPACE, ARG2_TYPE) \ DECLSPEC __CLC_XCONCAT(RET_TYPE, 2) \ - FUNCTION(__CLC_XCONCAT(ARG1_TYPE, 2) x, \ - ADDR_SPACE __CLC_XCONCAT(ARG2_TYPE, 2) * y) { \ + __CLC_FUNCTION(__CLC_XCONCAT(ARG1_TYPE, 2) x, \ + ADDR_SPACE __CLC_XCONCAT(ARG2_TYPE, 2) * y) { \ ADDR_SPACE ARG2_TYPE *ptr = (ADDR_SPACE ARG2_TYPE *)y; \ - return (__CLC_XCONCAT(RET_TYPE, 2))(FUNCTION(x.s0, ptr), \ - FUNCTION(x.s1, ptr + 1)); \ + return (__CLC_XCONCAT(RET_TYPE, 2))(__CLC_FUNCTION(x.s0, ptr), \ + __CLC_FUNCTION(x.s1, ptr + 1)); \ } \ \ DECLSPEC __CLC_XCONCAT(RET_TYPE, 3) \ - FUNCTION(__CLC_XCONCAT(ARG1_TYPE, 3) x, \ - ADDR_SPACE __CLC_XCONCAT(ARG2_TYPE, 3) * y) { \ + __CLC_FUNCTION(__CLC_XCONCAT(ARG1_TYPE, 3) x, \ + ADDR_SPACE __CLC_XCONCAT(ARG2_TYPE, 3) * y) { \ ADDR_SPACE ARG2_TYPE *ptr = (ADDR_SPACE ARG2_TYPE *)y; \ - return (__CLC_XCONCAT(RET_TYPE, 3))(FUNCTION(x.s0, ptr), \ - FUNCTION(x.s1, ptr + 1), \ - FUNCTION(x.s2, ptr + 2)); \ + return (__CLC_XCONCAT(RET_TYPE, 3))(__CLC_FUNCTION(x.s0, ptr), \ + __CLC_FUNCTION(x.s1, ptr + 1), \ + __CLC_FUNCTION(x.s2, ptr + 2)); \ } \ \ DECLSPEC __CLC_XCONCAT(RET_TYPE, 4) \ - FUNCTION(__CLC_XCONCAT(ARG1_TYPE, 4) x, \ - ADDR_SPACE __CLC_XCONCAT(ARG2_TYPE, 4) * y) { \ + __CLC_FUNCTION(__CLC_XCONCAT(ARG1_TYPE, 4) x, \ + ADDR_SPACE __CLC_XCONCAT(ARG2_TYPE, 4) * y) { \ ADDR_SPACE ARG2_TYPE *ptr = (ADDR_SPACE ARG2_TYPE *)y; \ return (__CLC_XCONCAT(RET_TYPE, 4))( \ - FUNCTION(x.s0, ptr), FUNCTION(x.s1, ptr + 1), FUNCTION(x.s2, ptr + 2), \ - 
FUNCTION(x.s3, ptr + 3)); \ + __CLC_FUNCTION(x.s0, ptr), __CLC_FUNCTION(x.s1, ptr + 1), \ + __CLC_FUNCTION(x.s2, ptr + 2), __CLC_FUNCTION(x.s3, ptr + 3)); \ } \ \ DECLSPEC __CLC_XCONCAT(RET_TYPE, 8) \ - FUNCTION(__CLC_XCONCAT(ARG1_TYPE, 8) x, \ - ADDR_SPACE __CLC_XCONCAT(ARG2_TYPE, 8) * y) { \ + __CLC_FUNCTION(__CLC_XCONCAT(ARG1_TYPE, 8) x, \ + ADDR_SPACE __CLC_XCONCAT(ARG2_TYPE, 8) * y) { \ ADDR_SPACE ARG2_TYPE *ptr = (ADDR_SPACE ARG2_TYPE *)y; \ return (__CLC_XCONCAT(RET_TYPE, 8))( \ - FUNCTION(x.s0, ptr), FUNCTION(x.s1, ptr + 1), FUNCTION(x.s2, ptr + 2), \ - FUNCTION(x.s3, ptr + 3), FUNCTION(x.s4, ptr + 4), \ - FUNCTION(x.s5, ptr + 5), FUNCTION(x.s6, ptr + 6), \ - FUNCTION(x.s7, ptr + 7)); \ + __CLC_FUNCTION(x.s0, ptr), __CLC_FUNCTION(x.s1, ptr + 1), \ + __CLC_FUNCTION(x.s2, ptr + 2), __CLC_FUNCTION(x.s3, ptr + 3), \ + __CLC_FUNCTION(x.s4, ptr + 4), __CLC_FUNCTION(x.s5, ptr + 5), \ + __CLC_FUNCTION(x.s6, ptr + 6), __CLC_FUNCTION(x.s7, ptr + 7)); \ } \ \ DECLSPEC __CLC_XCONCAT(RET_TYPE, 16) \ - FUNCTION(__CLC_XCONCAT(ARG1_TYPE, 16) x, \ - ADDR_SPACE __CLC_XCONCAT(ARG2_TYPE, 16) * y) { \ + __CLC_FUNCTION(__CLC_XCONCAT(ARG1_TYPE, 16) x, \ + ADDR_SPACE __CLC_XCONCAT(ARG2_TYPE, 16) * y) { \ ADDR_SPACE ARG2_TYPE *ptr = (ADDR_SPACE ARG2_TYPE *)y; \ return (__CLC_XCONCAT(RET_TYPE, 16))( \ - FUNCTION(x.s0, ptr), FUNCTION(x.s1, ptr + 1), FUNCTION(x.s2, ptr + 2), \ - FUNCTION(x.s3, ptr + 3), FUNCTION(x.s4, ptr + 4), \ - FUNCTION(x.s5, ptr + 5), FUNCTION(x.s6, ptr + 6), \ - FUNCTION(x.s7, ptr + 7), FUNCTION(x.s8, ptr + 8), \ - FUNCTION(x.s9, ptr + 9), FUNCTION(x.sa, ptr + 10), \ - FUNCTION(x.sb, ptr + 11), FUNCTION(x.sc, ptr + 12), \ - FUNCTION(x.sd, ptr + 13), FUNCTION(x.se, ptr + 14), \ - FUNCTION(x.sf, ptr + 15)); \ + __CLC_FUNCTION(x.s0, ptr), __CLC_FUNCTION(x.s1, ptr + 1), \ + __CLC_FUNCTION(x.s2, ptr + 2), __CLC_FUNCTION(x.s3, ptr + 3), \ + __CLC_FUNCTION(x.s4, ptr + 4), __CLC_FUNCTION(x.s5, ptr + 5), \ + __CLC_FUNCTION(x.s6, ptr + 6), __CLC_FUNCTION(x.s7, ptr + 7), \ 
+ __CLC_FUNCTION(x.s8, ptr + 8), __CLC_FUNCTION(x.s9, ptr + 9), \ + __CLC_FUNCTION(x.sa, ptr + 10), __CLC_FUNCTION(x.sb, ptr + 11), \ + __CLC_FUNCTION(x.sc, ptr + 12), __CLC_FUNCTION(x.sd, ptr + 13), \ + __CLC_FUNCTION(x.se, ptr + 14), __CLC_FUNCTION(x.sf, ptr + 15)); \ } #endif // __CLC_CLCMACRO_H__ diff --git a/libclc/clc/include/clc/common/clc_degrees.h b/libclc/clc/include/clc/common/clc_degrees.h index e633344c291c..5f04fac1f261 100644 --- a/libclc/clc/include/clc/common/clc_degrees.h +++ b/libclc/clc/include/clc/common/clc_degrees.h @@ -10,10 +10,10 @@ #define __CLC_COMMON_CLC_DEGREES_H__ #define __CLC_BODY -#define FUNCTION __clc_degrees +#define __CLC_FUNCTION __clc_degrees #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_COMMON_CLC_DEGREES_H__ diff --git a/libclc/clc/include/clc/common/clc_radians.h b/libclc/clc/include/clc/common/clc_radians.h index 0b075efbcce2..03d7c4b9e301 100644 --- a/libclc/clc/include/clc/common/clc_radians.h +++ b/libclc/clc/include/clc/common/clc_radians.h @@ -10,10 +10,10 @@ #define __CLC_COMMON_CLC_RADIANS_H__ #define __CLC_BODY -#define FUNCTION __clc_radians +#define __CLC_FUNCTION __clc_radians #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_COMMON_CLC_RADIANS_H__ diff --git a/libclc/clc/include/clc/common/clc_sign.h b/libclc/clc/include/clc/common/clc_sign.h index 0a8c4cc94d80..9737f1e54441 100644 --- a/libclc/clc/include/clc/common/clc_sign.h +++ b/libclc/clc/include/clc/common/clc_sign.h @@ -9,11 +9,11 @@ #ifndef __CLC_COMMON_CLC_SIGN_H__ #define __CLC_COMMON_CLC_SIGN_H__ -#define FUNCTION __clc_sign +#define __CLC_FUNCTION __clc_sign #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_COMMON_CLC_SIGN_H__ diff --git a/libclc/clc/include/clc/common/clc_step.h b/libclc/clc/include/clc/common/clc_step.h index 96b4759cabf7..6b093d06896c 100644 --- a/libclc/clc/include/clc/common/clc_step.h +++ b/libclc/clc/include/clc/common/clc_step.h @@ -9,11 +9,11 @@ #ifndef 
__CLC_COMMON_CLC_STEP_H__ #define __CLC_COMMON_CLC_STEP_H__ -#define FUNCTION __clc_step +#define __CLC_FUNCTION __clc_step #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_COMMON_CLC_STEP_H__ diff --git a/libclc/clc/include/clc/geometric/binary_decl.inc b/libclc/clc/include/clc/geometric/binary_decl.inc index 4a4235ae44db..ab64c9bece32 100644 --- a/libclc/clc/include/clc/geometric/binary_decl.inc +++ b/libclc/clc/include/clc/geometric/binary_decl.inc @@ -11,6 +11,6 @@ __CLC_VECSIZE_OR_1 == 3 || __CLC_VECSIZE_OR_1 == 4) _CLC_OVERLOAD _CLC_CONST _CLC_DECL __CLC_SCALAR_GENTYPE -FUNCTION(__CLC_GENTYPE a, __CLC_GENTYPE b); +__CLC_FUNCTION(__CLC_GENTYPE a, __CLC_GENTYPE b); #endif diff --git a/libclc/clc/include/clc/geometric/binary_def.inc b/libclc/clc/include/clc/geometric/binary_def.inc index ad4fa4c01683..bbd4f5798002 100644 --- a/libclc/clc/include/clc/geometric/binary_def.inc +++ b/libclc/clc/include/clc/geometric/binary_def.inc @@ -8,17 +8,17 @@ #include -#ifndef __IMPL_FUNCTION -#define __IMPL_FUNCTION(x) __CLC_CONCAT(__clc_, x) +#ifndef __CLC_IMPL_FUNCTION +#define __CLC_IMPL_FUNCTION(x) __CLC_CONCAT(__clc_, x) #endif // Geometric functions are only defined for scalar, vec2, vec3 and vec4 #if (__CLC_VECSIZE_OR_1 == 1 || __CLC_VECSIZE_OR_1 == 2 || \ __CLC_VECSIZE_OR_1 == 3 || __CLC_VECSIZE_OR_1 == 4) -_CLC_OVERLOAD _CLC_DEF __CLC_SCALAR_GENTYPE FUNCTION(__CLC_GENTYPE a, - __CLC_GENTYPE b) { - return __IMPL_FUNCTION(FUNCTION)(a, b); +_CLC_OVERLOAD _CLC_DEF __CLC_SCALAR_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE a, + __CLC_GENTYPE b) { + return __CLC_IMPL_FUNCTION(__CLC_FUNCTION)(a, b); } #endif diff --git a/libclc/clc/include/clc/geometric/clc_distance.h b/libclc/clc/include/clc/geometric/clc_distance.h index 666e2c881bc9..1acea411a08b 100644 --- a/libclc/clc/include/clc/geometric/clc_distance.h +++ b/libclc/clc/include/clc/geometric/clc_distance.h @@ -9,11 +9,11 @@ #ifndef __CLC_GEOMETRIC_CLC_DISTANCE_H__ #define 
__CLC_GEOMETRIC_CLC_DISTANCE_H__ -#define FUNCTION __clc_distance +#define __CLC_FUNCTION __clc_distance #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_GEOMETRIC_CLC_DISTANCE_H__ diff --git a/libclc/clc/include/clc/geometric/clc_dot.h b/libclc/clc/include/clc/geometric/clc_dot.h index 4bfed6e3abc5..a5a97d1bdd2b 100644 --- a/libclc/clc/include/clc/geometric/clc_dot.h +++ b/libclc/clc/include/clc/geometric/clc_dot.h @@ -9,11 +9,11 @@ #ifndef __CLC_GEOMETRIC_CLC_DOT_H__ #define __CLC_GEOMETRIC_CLC_DOT_H__ -#define FUNCTION __clc_dot +#define __CLC_FUNCTION __clc_dot #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_GEOMETRIC_CLC_DOT_H__ diff --git a/libclc/clc/include/clc/geometric/clc_fast_distance.h b/libclc/clc/include/clc/geometric/clc_fast_distance.h index 2932d0147a6d..47292130e9d6 100644 --- a/libclc/clc/include/clc/geometric/clc_fast_distance.h +++ b/libclc/clc/include/clc/geometric/clc_fast_distance.h @@ -9,12 +9,12 @@ #ifndef __CLC_GEOMETRIC_CLC_FAST_DISTANCE_H__ #define __CLC_GEOMETRIC_CLC_FAST_DISTANCE_H__ -#define __FLOAT_ONLY -#define FUNCTION __clc_fast_distance +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION __clc_fast_distance #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_GEOMETRIC_CLC_FAST_DISTANCE_H__ diff --git a/libclc/clc/include/clc/geometric/clc_fast_length.h b/libclc/clc/include/clc/geometric/clc_fast_length.h index 903ecc8b9175..2244d8c77fea 100644 --- a/libclc/clc/include/clc/geometric/clc_fast_length.h +++ b/libclc/clc/include/clc/geometric/clc_fast_length.h @@ -9,12 +9,12 @@ #ifndef __CLC_GEOMETRIC_CLC_FAST_LENGTH_H__ #define __CLC_GEOMETRIC_CLC_FAST_LENGTH_H__ -#define __FLOAT_ONLY -#define FUNCTION __clc_fast_length +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION __clc_fast_length #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_GEOMETRIC_CLC_FAST_LENGTH_H__ diff --git 
a/libclc/clc/include/clc/geometric/clc_fast_normalize.h b/libclc/clc/include/clc/geometric/clc_fast_normalize.h index a91862fe61cb..ee9747c1af1e 100644 --- a/libclc/clc/include/clc/geometric/clc_fast_normalize.h +++ b/libclc/clc/include/clc/geometric/clc_fast_normalize.h @@ -9,13 +9,13 @@ #ifndef __CLC_GEOMETRIC_CLC_FAST_NORMALIZE_H__ #define __CLC_GEOMETRIC_CLC_FAST_NORMALIZE_H__ -#define __FLOAT_ONLY +#define __CLC_FLOAT_ONLY #define __CLC_GEOMETRIC_RET_GENTYPE -#define FUNCTION __clc_fast_normalize +#define __CLC_FUNCTION __clc_fast_normalize #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #undef __CLC_GEOMETRIC_RET_GENTYPE #endif // __CLC_GEOMETRIC_CLC_FAST_NORMALIZE_H__ diff --git a/libclc/clc/include/clc/geometric/clc_length.h b/libclc/clc/include/clc/geometric/clc_length.h index 35b206e6c3fb..f8ce12db0787 100644 --- a/libclc/clc/include/clc/geometric/clc_length.h +++ b/libclc/clc/include/clc/geometric/clc_length.h @@ -9,11 +9,11 @@ #ifndef __CLC_GEOMETRIC_CLC_LENGTH_H__ #define __CLC_GEOMETRIC_CLC_LENGTH_H__ -#define FUNCTION __clc_length +#define __CLC_FUNCTION __clc_length #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_GEOMETRIC_CLC_LENGTH_H__ diff --git a/libclc/clc/include/clc/geometric/clc_normalize.h b/libclc/clc/include/clc/geometric/clc_normalize.h index 745ed7f4887d..3058a72b2bbb 100644 --- a/libclc/clc/include/clc/geometric/clc_normalize.h +++ b/libclc/clc/include/clc/geometric/clc_normalize.h @@ -10,11 +10,11 @@ #define __CLC_GEOMETRIC_CLC_NORMALIZE_H__ #define __CLC_GEOMETRIC_RET_GENTYPE -#define FUNCTION __clc_normalize +#define __CLC_FUNCTION __clc_normalize #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #undef __CLC_GEOMETRIC_RET_GENTYPE #endif // __CLC_GEOMETRIC_CLC_NORMALIZE_H__ diff --git a/libclc/clc/include/clc/geometric/unary_decl.inc b/libclc/clc/include/clc/geometric/unary_decl.inc index bca1f3b78f17..6641d28d0967 100644 --- 
a/libclc/clc/include/clc/geometric/unary_decl.inc +++ b/libclc/clc/include/clc/geometric/unary_decl.inc @@ -16,6 +16,6 @@ _CLC_OVERLOAD _CLC_CONST _CLC_DECL #else __CLC_SCALAR_GENTYPE #endif - FUNCTION(__CLC_GENTYPE a); + __CLC_FUNCTION(__CLC_GENTYPE a); #endif diff --git a/libclc/clc/include/clc/geometric/unary_def.inc b/libclc/clc/include/clc/geometric/unary_def.inc index ea90de0ec581..78c144b35af2 100644 --- a/libclc/clc/include/clc/geometric/unary_def.inc +++ b/libclc/clc/include/clc/geometric/unary_def.inc @@ -8,8 +8,8 @@ #include -#ifndef __IMPL_FUNCTION -#define __IMPL_FUNCTION(x) __CLC_CONCAT(__clc_, x) +#ifndef __CLC_IMPL_FUNCTION +#define __CLC_IMPL_FUNCTION(x) __CLC_CONCAT(__clc_, x) #endif // Geometric functions are only defined for scalar, vec2, vec3 and vec4 @@ -22,8 +22,8 @@ _CLC_OVERLOAD _CLC_DEF #else __CLC_SCALAR_GENTYPE #endif - FUNCTION(__CLC_GENTYPE a) { - return __IMPL_FUNCTION(FUNCTION)(a); + __CLC_FUNCTION(__CLC_GENTYPE a) { + return __CLC_IMPL_FUNCTION(__CLC_FUNCTION)(a); } #endif diff --git a/libclc/clc/include/clc/integer/clc_add_sat.h b/libclc/clc/include/clc/integer/clc_add_sat.h index be095d38225e..8c3495f6c519 100644 --- a/libclc/clc/include/clc/integer/clc_add_sat.h +++ b/libclc/clc/include/clc/integer/clc_add_sat.h @@ -9,11 +9,11 @@ #ifndef __CLC_INTEGER_CLC_ADD_SAT_H__ #define __CLC_INTEGER_CLC_ADD_SAT_H__ -#define FUNCTION __clc_add_sat +#define __CLC_FUNCTION __clc_add_sat #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_INTEGER_CLC_ADD_SAT_H__ diff --git a/libclc/clc/include/clc/integer/clc_bit_reverse.h b/libclc/clc/include/clc/integer/clc_bit_reverse.h index c945e326c74f..de0c354b3fa4 100644 --- a/libclc/clc/include/clc/integer/clc_bit_reverse.h +++ b/libclc/clc/include/clc/integer/clc_bit_reverse.h @@ -9,11 +9,11 @@ #ifndef __CLC_INTEGER_CLC_BIT_REVERSE_H__ #define __CLC_INTEGER_CLC_BIT_REVERSE_H__ -#define FUNCTION __clc_bit_reverse +#define __CLC_FUNCTION __clc_bit_reverse #define 
__CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_INTEGER_CLC_BIT_REVERSE_H__ diff --git a/libclc/clc/include/clc/integer/clc_bitfield_extract_decl.inc b/libclc/clc/include/clc/integer/clc_bitfield_extract_decl.inc index b3f0e71a5f53..2f2366701120 100644 --- a/libclc/clc/include/clc/integer/clc_bitfield_extract_decl.inc +++ b/libclc/clc/include/clc/integer/clc_bitfield_extract_decl.inc @@ -6,5 +6,5 @@ // //===----------------------------------------------------------------------===// -_CLC_OVERLOAD _CLC_CONST _CLC_DECL __RETTYPE FUNCTION(__CLC_GENTYPE base, - uint offset, uint count); +_CLC_OVERLOAD _CLC_CONST _CLC_DECL __CLC_RETTYPE +__CLC_FUNCTION(__CLC_GENTYPE base, uint offset, uint count); diff --git a/libclc/clc/include/clc/integer/clc_bitfield_extract_signed.h b/libclc/clc/include/clc/integer/clc_bitfield_extract_signed.h index 9c2e047b8be0..7998196512b3 100644 --- a/libclc/clc/include/clc/integer/clc_bitfield_extract_signed.h +++ b/libclc/clc/include/clc/integer/clc_bitfield_extract_signed.h @@ -11,13 +11,13 @@ #include -#define FUNCTION __clc_bitfield_extract_signed -#define __RETTYPE __CLC_S_GENTYPE +#define __CLC_FUNCTION __clc_bitfield_extract_signed +#define __CLC_RETTYPE __CLC_S_GENTYPE #define __CLC_BODY #include -#undef __RETTYPE -#undef FUNCTION +#undef __CLC_RETTYPE +#undef __CLC_FUNCTION #endif // __CLC_INTEGER_CLC_BITFIELD_EXTRACT_SIGNED_H__ diff --git a/libclc/clc/include/clc/integer/clc_bitfield_extract_unsigned.h b/libclc/clc/include/clc/integer/clc_bitfield_extract_unsigned.h index 95305a3027e5..84614056417f 100644 --- a/libclc/clc/include/clc/integer/clc_bitfield_extract_unsigned.h +++ b/libclc/clc/include/clc/integer/clc_bitfield_extract_unsigned.h @@ -11,13 +11,13 @@ #include -#define FUNCTION __clc_bitfield_extract_unsigned -#define __RETTYPE __CLC_U_GENTYPE +#define __CLC_FUNCTION __clc_bitfield_extract_unsigned +#define __CLC_RETTYPE __CLC_U_GENTYPE #define __CLC_BODY #include -#undef __RETTYPE -#undef 
FUNCTION +#undef __CLC_RETTYPE +#undef __CLC_FUNCTION #endif // __CLC_INTEGER_CLC_BITFIELD_EXTRACT_SIGNED_H__ diff --git a/libclc/clc/include/clc/integer/clc_bitfield_insert.h b/libclc/clc/include/clc/integer/clc_bitfield_insert.h index f4d36b2ad2d2..b284c0bf932d 100644 --- a/libclc/clc/include/clc/integer/clc_bitfield_insert.h +++ b/libclc/clc/include/clc/integer/clc_bitfield_insert.h @@ -11,7 +11,7 @@ #include -#define FUNCTION __clc_bitfield_insert +#define __CLC_FUNCTION __clc_bitfield_insert #define __CLC_BODY #include diff --git a/libclc/clc/include/clc/integer/clc_bitfield_insert.inc b/libclc/clc/include/clc/integer/clc_bitfield_insert.inc index de0b3d946999..00a363fa4112 100644 --- a/libclc/clc/include/clc/integer/clc_bitfield_insert.inc +++ b/libclc/clc/include/clc/integer/clc_bitfield_insert.inc @@ -6,7 +6,5 @@ // //===----------------------------------------------------------------------===// -_CLC_OVERLOAD _CLC_CONST _CLC_DECL __CLC_GENTYPE FUNCTION(__CLC_GENTYPE base, - __CLC_GENTYPE insert, - uint offset, - uint count); +_CLC_OVERLOAD _CLC_CONST _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION( + __CLC_GENTYPE base, __CLC_GENTYPE insert, uint offset, uint count); diff --git a/libclc/clc/include/clc/integer/clc_clz.h b/libclc/clc/include/clc/integer/clc_clz.h index 3e0530bad6f1..dd62cc935f8c 100644 --- a/libclc/clc/include/clc/integer/clc_clz.h +++ b/libclc/clc/include/clc/integer/clc_clz.h @@ -9,11 +9,11 @@ #ifndef __CLC_INTEGER_CLC_CLZ_H__ #define __CLC_INTEGER_CLC_CLZ_H__ -#define FUNCTION __clc_clz +#define __CLC_FUNCTION __clc_clz #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_INTEGER_CLC_CLZ_H__ diff --git a/libclc/clc/include/clc/integer/clc_ctz.h b/libclc/clc/include/clc/integer/clc_ctz.h index 1495dc5560f1..3f2079e023f6 100644 --- a/libclc/clc/include/clc/integer/clc_ctz.h +++ b/libclc/clc/include/clc/integer/clc_ctz.h @@ -9,11 +9,11 @@ #ifndef __CLC_INTEGER_CLC_CTZ_H__ #define __CLC_INTEGER_CLC_CTZ_H__ -#define 
FUNCTION __clc_ctz +#define __CLC_FUNCTION __clc_ctz #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_INTEGER_CLC_CTZ_H__ diff --git a/libclc/clc/include/clc/integer/clc_hadd.h b/libclc/clc/include/clc/integer/clc_hadd.h index 7aaee9b601a2..679369d3c9d4 100644 --- a/libclc/clc/include/clc/integer/clc_hadd.h +++ b/libclc/clc/include/clc/integer/clc_hadd.h @@ -9,11 +9,11 @@ #ifndef __CLC_INTEGER_CLC_HADD_H__ #define __CLC_INTEGER_CLC_HADD_H__ -#define FUNCTION __clc_hadd +#define __CLC_FUNCTION __clc_hadd #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_INTEGER_CLC_HADD_H__ diff --git a/libclc/clc/include/clc/integer/clc_mad24.h b/libclc/clc/include/clc/integer/clc_mad24.h index 121840f81d63..e19bd905d22e 100644 --- a/libclc/clc/include/clc/integer/clc_mad24.h +++ b/libclc/clc/include/clc/integer/clc_mad24.h @@ -9,11 +9,11 @@ #ifndef __CLC_INTEGER_CLC_MAD24_H__ #define __CLC_INTEGER_CLC_MAD24_H__ -#define FUNCTION __clc_mad24 +#define __CLC_FUNCTION __clc_mad24 #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_INTEGER_CLC_MAD24_H__ diff --git a/libclc/clc/include/clc/integer/clc_mad_sat.h b/libclc/clc/include/clc/integer/clc_mad_sat.h index 0c418068ca4a..845378b3e7c4 100644 --- a/libclc/clc/include/clc/integer/clc_mad_sat.h +++ b/libclc/clc/include/clc/integer/clc_mad_sat.h @@ -9,11 +9,11 @@ #ifndef __CLC_INTEGER_CLC_MAD_SAT_H__ #define __CLC_INTEGER_CLC_MAD_SAT_H__ -#define FUNCTION __clc_mad_sat +#define __CLC_FUNCTION __clc_mad_sat #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_INTEGER_CLC_MAD_SAT_H__ diff --git a/libclc/clc/include/clc/integer/clc_mul24.h b/libclc/clc/include/clc/integer/clc_mul24.h index 9310458d5981..f813de55c71e 100644 --- a/libclc/clc/include/clc/integer/clc_mul24.h +++ b/libclc/clc/include/clc/integer/clc_mul24.h @@ -9,11 +9,11 @@ #ifndef __CLC_INTEGER_CLC_MUL24_H__ #define __CLC_INTEGER_CLC_MUL24_H__ 
-#define FUNCTION __clc_mul24 +#define __CLC_FUNCTION __clc_mul24 #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_INTEGER_CLC_MUL24_H__ diff --git a/libclc/clc/include/clc/integer/clc_mul_hi.h b/libclc/clc/include/clc/integer/clc_mul_hi.h index 6542f237d09b..e4738e5570e8 100644 --- a/libclc/clc/include/clc/integer/clc_mul_hi.h +++ b/libclc/clc/include/clc/integer/clc_mul_hi.h @@ -9,11 +9,11 @@ #ifndef __CLC_INTEGER_CLC_MUL_HI_H__ #define __CLC_INTEGER_CLC_MUL_HI_H__ -#define FUNCTION __clc_mul_hi +#define __CLC_FUNCTION __clc_mul_hi #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_INTEGER_CLC_MUL_HI_H__ diff --git a/libclc/clc/include/clc/integer/clc_popcount.h b/libclc/clc/include/clc/integer/clc_popcount.h index 8ece2e998862..55ca6ccfcec2 100644 --- a/libclc/clc/include/clc/integer/clc_popcount.h +++ b/libclc/clc/include/clc/integer/clc_popcount.h @@ -9,12 +9,12 @@ #ifndef __CLC_INTEGER_CLC_POPCOUNT_H__ #define __CLC_INTEGER_CLC_POPCOUNT_H__ -#define FUNCTION __clc_popcount +#define __CLC_FUNCTION __clc_popcount #define __CLC_BODY #include #undef __CLC_INTRINSIC -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_INTEGER_CLC_POPCOUNT_H__ diff --git a/libclc/clc/include/clc/integer/clc_rhadd.h b/libclc/clc/include/clc/integer/clc_rhadd.h index 882f950656df..b8b0b1a3a56e 100644 --- a/libclc/clc/include/clc/integer/clc_rhadd.h +++ b/libclc/clc/include/clc/integer/clc_rhadd.h @@ -9,11 +9,11 @@ #ifndef __CLC_INTEGER_CLC_RHADD_H__ #define __CLC_INTEGER_CLC_RHADD_H__ -#define FUNCTION __clc_rhadd +#define __CLC_FUNCTION __clc_rhadd #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_INTEGER_CLC_RHADD_H__ diff --git a/libclc/clc/include/clc/integer/clc_rotate.h b/libclc/clc/include/clc/integer/clc_rotate.h index 61cf08788ef3..513bf9852623 100644 --- a/libclc/clc/include/clc/integer/clc_rotate.h +++ b/libclc/clc/include/clc/integer/clc_rotate.h @@ -9,11 +9,11 @@ 
#ifndef __CLC_INTEGER_CLC_ROTATE_H__ #define __CLC_INTEGER_CLC_ROTATE_H__ -#define FUNCTION __clc_rotate +#define __CLC_FUNCTION __clc_rotate #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_INTEGER_CLC_ROTATE_H__ diff --git a/libclc/clc/include/clc/integer/clc_sub_sat.h b/libclc/clc/include/clc/integer/clc_sub_sat.h index 1b694a4a1780..94a5484adefc 100644 --- a/libclc/clc/include/clc/integer/clc_sub_sat.h +++ b/libclc/clc/include/clc/integer/clc_sub_sat.h @@ -9,11 +9,11 @@ #ifndef __CLC_INTEGER_CLC_SUB_SAT_H__ #define __CLC_INTEGER_CLC_SUB_SAT_H__ -#define FUNCTION __clc_sub_sat +#define __CLC_FUNCTION __clc_sub_sat #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_INTEGER_CLC_SUB_SAT_H__ diff --git a/libclc/clc/include/clc/internal/math/clc_sw_fma.h b/libclc/clc/include/clc/internal/math/clc_sw_fma.h index 8497f4efd113..5d6c76879ceb 100644 --- a/libclc/clc/include/clc/internal/math/clc_sw_fma.h +++ b/libclc/clc/include/clc/internal/math/clc_sw_fma.h @@ -9,11 +9,11 @@ #ifndef __CLC_INTERNAL_MATH_CLC_SW_FMA_H__ #define __CLC_INTERNAL_MATH_CLC_SW_FMA_H__ -#define FUNCTION __clc_sw_fma +#define __CLC_FUNCTION __clc_sw_fma #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_INTERNAL_MATH_CLC_SW_FMA_H__ diff --git a/libclc/clc/include/clc/math/binary_decl_with_scalar_second_arg.inc b/libclc/clc/include/clc/math/binary_decl_with_scalar_second_arg.inc index 45f39b73013b..b9acf8c4af33 100644 --- a/libclc/clc/include/clc/math/binary_decl_with_scalar_second_arg.inc +++ b/libclc/clc/include/clc/math/binary_decl_with_scalar_second_arg.inc @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -_CLC_OVERLOAD _CLC_CONST _CLC_DECL __CLC_GENTYPE FUNCTION(__CLC_GENTYPE a, - __CLC_GENTYPE b); _CLC_OVERLOAD _CLC_CONST _CLC_DECL __CLC_GENTYPE -FUNCTION(__CLC_GENTYPE a, __CLC_SCALAR_GENTYPE b); +__CLC_FUNCTION(__CLC_GENTYPE a, __CLC_GENTYPE 
b); +_CLC_OVERLOAD _CLC_CONST _CLC_DECL __CLC_GENTYPE +__CLC_FUNCTION(__CLC_GENTYPE a, __CLC_SCALAR_GENTYPE b); diff --git a/libclc/clc/include/clc/math/binary_def_via_fp32.inc b/libclc/clc/include/clc/math/binary_def_via_fp32.inc index c8cdb1e9e5c7..ae22903dd3d0 100644 --- a/libclc/clc/include/clc/math/binary_def_via_fp32.inc +++ b/libclc/clc/include/clc/math/binary_def_via_fp32.inc @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -_CLC_OVERLOAD _CLC_CONST _CLC_DEF __CLC_GENTYPE FUNCTION(__CLC_GENTYPE x, - __CLC_GENTYPE y) { +_CLC_OVERLOAD _CLC_CONST _CLC_DEF __CLC_GENTYPE +__CLC_FUNCTION(__CLC_GENTYPE x, __CLC_GENTYPE y) { return __CLC_CONVERT_GENTYPE( - FUNCTION(__CLC_CONVERT_FLOATN(x), __CLC_CONVERT_FLOATN(y))); + __CLC_FUNCTION(__CLC_CONVERT_FLOATN(x), __CLC_CONVERT_FLOATN(y))); } diff --git a/libclc/clc/include/clc/math/clc_acos.h b/libclc/clc/include/clc/math/clc_acos.h index 130e3111ec1c..1d484fa5b56c 100644 --- a/libclc/clc/include/clc/math/clc_acos.h +++ b/libclc/clc/include/clc/math/clc_acos.h @@ -10,10 +10,10 @@ #define __CLC_MATH_CLC_ACOS_H__ #define __CLC_BODY -#define FUNCTION __clc_acos +#define __CLC_FUNCTION __clc_acos #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_ACOS_H__ diff --git a/libclc/clc/include/clc/math/clc_acosh.h b/libclc/clc/include/clc/math/clc_acosh.h index 9b8c7184b974..9492724bb27b 100644 --- a/libclc/clc/include/clc/math/clc_acosh.h +++ b/libclc/clc/include/clc/math/clc_acosh.h @@ -10,10 +10,10 @@ #define __CLC_MATH_CLC_ACOSH_H__ #define __CLC_BODY -#define FUNCTION __clc_acosh +#define __CLC_FUNCTION __clc_acosh #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_ACOSH_H__ diff --git a/libclc/clc/include/clc/math/clc_acospi.h b/libclc/clc/include/clc/math/clc_acospi.h index 6d4dc694900a..90cfc06b7dff 100644 --- a/libclc/clc/include/clc/math/clc_acospi.h +++ b/libclc/clc/include/clc/math/clc_acospi.h @@ -10,10 +10,10 @@ #define 
__CLC_MATH_CLC_ACOSPI_H__ #define __CLC_BODY -#define FUNCTION __clc_acospi +#define __CLC_FUNCTION __clc_acospi #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_ACOSPI_H__ diff --git a/libclc/clc/include/clc/math/clc_asin.h b/libclc/clc/include/clc/math/clc_asin.h index f3d69a7bf995..f6246c49f802 100644 --- a/libclc/clc/include/clc/math/clc_asin.h +++ b/libclc/clc/include/clc/math/clc_asin.h @@ -10,10 +10,10 @@ #define __CLC_MATH_CLC_ASIN_H__ #define __CLC_BODY -#define FUNCTION __clc_asin +#define __CLC_FUNCTION __clc_asin #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_ASIN_H__ diff --git a/libclc/clc/include/clc/math/clc_asinh.h b/libclc/clc/include/clc/math/clc_asinh.h index dc4530fa68b8..05e44fa438ce 100644 --- a/libclc/clc/include/clc/math/clc_asinh.h +++ b/libclc/clc/include/clc/math/clc_asinh.h @@ -10,10 +10,10 @@ #define __CLC_MATH_CLC_ASINH_H__ #define __CLC_BODY -#define FUNCTION __clc_asinh +#define __CLC_FUNCTION __clc_asinh #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_ASINH_H__ diff --git a/libclc/clc/include/clc/math/clc_asinpi.h b/libclc/clc/include/clc/math/clc_asinpi.h index 19d8ec3ad0c5..8f308dd06022 100644 --- a/libclc/clc/include/clc/math/clc_asinpi.h +++ b/libclc/clc/include/clc/math/clc_asinpi.h @@ -10,10 +10,10 @@ #define __CLC_MATH_CLC_ASINPI_H__ #define __CLC_BODY -#define FUNCTION __clc_asinpi +#define __CLC_FUNCTION __clc_asinpi #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_ASINPI_H__ diff --git a/libclc/clc/include/clc/math/clc_atan.h b/libclc/clc/include/clc/math/clc_atan.h index 9f10efeae976..203e46c2b8a4 100644 --- a/libclc/clc/include/clc/math/clc_atan.h +++ b/libclc/clc/include/clc/math/clc_atan.h @@ -10,10 +10,10 @@ #define __CLC_MATH_CLC_ATAN_H__ #define __CLC_BODY -#define FUNCTION __clc_atan +#define __CLC_FUNCTION __clc_atan #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_ATAN_H__ diff --git 
a/libclc/clc/include/clc/math/clc_atan2.h b/libclc/clc/include/clc/math/clc_atan2.h index 99bd0760f3fe..6edc9dda48f4 100644 --- a/libclc/clc/include/clc/math/clc_atan2.h +++ b/libclc/clc/include/clc/math/clc_atan2.h @@ -10,10 +10,10 @@ #define __CLC_MATH_CLC_ATAN2_H__ #define __CLC_BODY -#define FUNCTION __clc_atan2 +#define __CLC_FUNCTION __clc_atan2 #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_ATAN2_H__ diff --git a/libclc/clc/include/clc/math/clc_atan2pi.h b/libclc/clc/include/clc/math/clc_atan2pi.h index ae044b7bbfaf..2f26d1d9ae98 100644 --- a/libclc/clc/include/clc/math/clc_atan2pi.h +++ b/libclc/clc/include/clc/math/clc_atan2pi.h @@ -10,10 +10,10 @@ #define __CLC_MATH_CLC_ATAN2PI_H__ #define __CLC_BODY -#define FUNCTION __clc_atan2pi +#define __CLC_FUNCTION __clc_atan2pi #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_ATAN2PI_H__ diff --git a/libclc/clc/include/clc/math/clc_atanh.h b/libclc/clc/include/clc/math/clc_atanh.h index c1a95d4acba1..85f9fc53be1a 100644 --- a/libclc/clc/include/clc/math/clc_atanh.h +++ b/libclc/clc/include/clc/math/clc_atanh.h @@ -10,10 +10,10 @@ #define __CLC_MATH_CLC_ATANH_H__ #define __CLC_BODY -#define FUNCTION __clc_atanh +#define __CLC_FUNCTION __clc_atanh #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_ATANH_H__ diff --git a/libclc/clc/include/clc/math/clc_atanpi.h b/libclc/clc/include/clc/math/clc_atanpi.h index 43f752ecfe98..2b3c6b373ed4 100644 --- a/libclc/clc/include/clc/math/clc_atanpi.h +++ b/libclc/clc/include/clc/math/clc_atanpi.h @@ -10,10 +10,10 @@ #define __CLC_MATH_CLC_ATANPI_H__ #define __CLC_BODY -#define FUNCTION __clc_atanpi +#define __CLC_FUNCTION __clc_atanpi #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_ATANPI_H__ diff --git a/libclc/clc/include/clc/math/clc_cbrt.inc b/libclc/clc/include/clc/math/clc_cbrt.inc index 019b5174a901..fd9b750c8fd8 100644 --- a/libclc/clc/include/clc/math/clc_cbrt.inc +++ 
b/libclc/clc/include/clc/math/clc_cbrt.inc @@ -10,10 +10,10 @@ #define __CLC_MATH_CLC_CBRT_H__ #define __CLC_BODY -#define FUNCTION __clc_cbrt +#define __CLC_FUNCTION __clc_cbrt #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_CBRT_H__ diff --git a/libclc/clc/include/clc/math/clc_ceil.h b/libclc/clc/include/clc/math/clc_ceil.h index c60b794c83c6..45668224c617 100644 --- a/libclc/clc/include/clc/math/clc_ceil.h +++ b/libclc/clc/include/clc/math/clc_ceil.h @@ -10,10 +10,10 @@ #define __CLC_MATH_CLC_CEIL_H__ #define __CLC_BODY -#define FUNCTION __clc_ceil +#define __CLC_FUNCTION __clc_ceil #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_CEIL_H__ diff --git a/libclc/clc/include/clc/math/clc_copysign.h b/libclc/clc/include/clc/math/clc_copysign.h index 5976e42de250..563dd2827839 100644 --- a/libclc/clc/include/clc/math/clc_copysign.h +++ b/libclc/clc/include/clc/math/clc_copysign.h @@ -10,10 +10,10 @@ #define __CLC_MATH_CLC_COPYSIGN_H__ #define __CLC_BODY -#define FUNCTION __clc_copysign +#define __CLC_FUNCTION __clc_copysign #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_COPYSIGN_H__ diff --git a/libclc/clc/include/clc/math/clc_cos.h b/libclc/clc/include/clc/math/clc_cos.h index 3a1ae9342e26..44681608efc3 100644 --- a/libclc/clc/include/clc/math/clc_cos.h +++ b/libclc/clc/include/clc/math/clc_cos.h @@ -10,10 +10,10 @@ #define __CLC_MATH_CLC_COS_H__ #define __CLC_BODY -#define FUNCTION __clc_cos +#define __CLC_FUNCTION __clc_cos #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_COS_H__ diff --git a/libclc/clc/include/clc/math/clc_cosh.h b/libclc/clc/include/clc/math/clc_cosh.h index 3033d76000dc..7bcfa2ea26e0 100644 --- a/libclc/clc/include/clc/math/clc_cosh.h +++ b/libclc/clc/include/clc/math/clc_cosh.h @@ -10,10 +10,10 @@ #define __CLC_MATH_CLC_COSH_H__ #define __CLC_BODY -#define FUNCTION __clc_cosh +#define __CLC_FUNCTION __clc_cosh #include -#undef FUNCTION +#undef 
__CLC_FUNCTION #endif // __CLC_MATH_CLC_COSH_H__ diff --git a/libclc/clc/include/clc/math/clc_cospi.h b/libclc/clc/include/clc/math/clc_cospi.h index 3e4e73f096a0..4d82100649b0 100644 --- a/libclc/clc/include/clc/math/clc_cospi.h +++ b/libclc/clc/include/clc/math/clc_cospi.h @@ -10,10 +10,10 @@ #define __CLC_MATH_CLC_COSPI_H__ #define __CLC_BODY -#define FUNCTION __clc_cospi +#define __CLC_FUNCTION __clc_cospi #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_COSPI_H__ diff --git a/libclc/clc/include/clc/math/clc_erf.h b/libclc/clc/include/clc/math/clc_erf.h index 04b990c65133..01a21b36b352 100644 --- a/libclc/clc/include/clc/math/clc_erf.h +++ b/libclc/clc/include/clc/math/clc_erf.h @@ -10,10 +10,10 @@ #define __CLC_MATH_CLC_ERF_H__ #define __CLC_BODY -#define FUNCTION __clc_erf +#define __CLC_FUNCTION __clc_erf #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_ERF_H__ diff --git a/libclc/clc/include/clc/math/clc_erfc.h b/libclc/clc/include/clc/math/clc_erfc.h index be2578b4bfb0..efd581542879 100644 --- a/libclc/clc/include/clc/math/clc_erfc.h +++ b/libclc/clc/include/clc/math/clc_erfc.h @@ -10,10 +10,10 @@ #define __CLC_MATH_CLC_ERFC_H__ #define __CLC_BODY -#define FUNCTION __clc_erfc +#define __CLC_FUNCTION __clc_erfc #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_ERFC_H__ diff --git a/libclc/clc/include/clc/math/clc_exp.h b/libclc/clc/include/clc/math/clc_exp.h index 355148934427..84369bb8fbdd 100644 --- a/libclc/clc/include/clc/math/clc_exp.h +++ b/libclc/clc/include/clc/math/clc_exp.h @@ -10,10 +10,10 @@ #define __CLC_MATH_CLC_EXP_H__ #define __CLC_BODY -#define FUNCTION __clc_exp +#define __CLC_FUNCTION __clc_exp #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_EXP_H__ diff --git a/libclc/clc/include/clc/math/clc_exp10.h b/libclc/clc/include/clc/math/clc_exp10.h index c281ee84dd3a..500271cb0742 100644 --- a/libclc/clc/include/clc/math/clc_exp10.h +++ 
b/libclc/clc/include/clc/math/clc_exp10.h @@ -10,10 +10,10 @@ #define __CLC_MATH_CLC_EXP10_H__ #define __CLC_BODY -#define FUNCTION __clc_exp10 +#define __CLC_FUNCTION __clc_exp10 #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_EXP10_H__ diff --git a/libclc/clc/include/clc/math/clc_exp2.h b/libclc/clc/include/clc/math/clc_exp2.h index bb10f12d34e2..4ad0ddea48a7 100644 --- a/libclc/clc/include/clc/math/clc_exp2.h +++ b/libclc/clc/include/clc/math/clc_exp2.h @@ -10,10 +10,10 @@ #define __CLC_MATH_CLC_EXP2_H__ #define __CLC_BODY -#define FUNCTION __clc_exp2 +#define __CLC_FUNCTION __clc_exp2 #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_EXP2_H__ diff --git a/libclc/clc/include/clc/math/clc_exp_helper.h b/libclc/clc/include/clc/math/clc_exp_helper.h index 0c5028bd2385..a2b3f1bebaf4 100644 --- a/libclc/clc/include/clc/math/clc_exp_helper.h +++ b/libclc/clc/include/clc/math/clc_exp_helper.h @@ -9,7 +9,7 @@ #ifndef __CLC_MATH_CLC_EXP_HELPER #define __CLC_MATH_CLC_EXP_HELPER -#define __DOUBLE_ONLY +#define __CLC_DOUBLE_ONLY #define __CLC_BODY #include diff --git a/libclc/clc/include/clc/math/clc_expm1.h b/libclc/clc/include/clc/math/clc_expm1.h index 7a199dca17cb..5c60960ca5b0 100644 --- a/libclc/clc/include/clc/math/clc_expm1.h +++ b/libclc/clc/include/clc/math/clc_expm1.h @@ -10,10 +10,10 @@ #define __CLC_MATH_CLC_EXPM1_H__ #define __CLC_BODY -#define FUNCTION __clc_expm1 +#define __CLC_FUNCTION __clc_expm1 #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_EXPM1_H__ diff --git a/libclc/clc/include/clc/math/clc_fabs.h b/libclc/clc/include/clc/math/clc_fabs.h index 8af3bb572142..1ef38a8c7595 100644 --- a/libclc/clc/include/clc/math/clc_fabs.h +++ b/libclc/clc/include/clc/math/clc_fabs.h @@ -10,10 +10,10 @@ #define __CLC_MATH_CLC_FABS_H__ #define __CLC_BODY -#define FUNCTION __clc_fabs +#define __CLC_FUNCTION __clc_fabs #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // 
__CLC_MATH_CLC_FABS_H__ diff --git a/libclc/clc/include/clc/math/clc_fdim.h b/libclc/clc/include/clc/math/clc_fdim.h index c678eb25f59b..50ad045db62c 100644 --- a/libclc/clc/include/clc/math/clc_fdim.h +++ b/libclc/clc/include/clc/math/clc_fdim.h @@ -10,10 +10,10 @@ #define __CLC_MATH_CLC_FDIM_H__ #define __CLC_BODY -#define FUNCTION __clc_fdim +#define __CLC_FUNCTION __clc_fdim #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_FDIM_H__ diff --git a/libclc/clc/include/clc/math/clc_floor.h b/libclc/clc/include/clc/math/clc_floor.h index b3fe8ea95ce1..d243819e0a42 100644 --- a/libclc/clc/include/clc/math/clc_floor.h +++ b/libclc/clc/include/clc/math/clc_floor.h @@ -10,10 +10,10 @@ #define __CLC_MATH_CLC_FLOOR_H__ #define __CLC_BODY -#define FUNCTION __clc_floor +#define __CLC_FUNCTION __clc_floor #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_FLOOR_H__ diff --git a/libclc/clc/include/clc/math/clc_fma.h b/libclc/clc/include/clc/math/clc_fma.h index efe20bd150eb..5e87f4c47085 100644 --- a/libclc/clc/include/clc/math/clc_fma.h +++ b/libclc/clc/include/clc/math/clc_fma.h @@ -9,11 +9,11 @@ #ifndef __CLC_MATH_CLC_FMA_H__ #define __CLC_MATH_CLC_FMA_H__ -#define FUNCTION __clc_fma +#define __CLC_FUNCTION __clc_fma #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_FMA_H__ diff --git a/libclc/clc/include/clc/math/clc_fmax.h b/libclc/clc/include/clc/math/clc_fmax.h index fd5d38987ec1..f367a9b6ea1c 100644 --- a/libclc/clc/include/clc/math/clc_fmax.h +++ b/libclc/clc/include/clc/math/clc_fmax.h @@ -9,11 +9,11 @@ #ifndef __CLC_MATH_CLC_FMAX_H__ #define __CLC_MATH_CLC_FMAX_H__ -#define FUNCTION __clc_fmax +#define __CLC_FUNCTION __clc_fmax #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_FMAX_H__ diff --git a/libclc/clc/include/clc/math/clc_fmin.h b/libclc/clc/include/clc/math/clc_fmin.h index 9fcdb56475ff..47ed3fad2d8c 100644 --- 
a/libclc/clc/include/clc/math/clc_fmin.h +++ b/libclc/clc/include/clc/math/clc_fmin.h @@ -9,11 +9,11 @@ #ifndef __CLC_MATH_CLC_FMIN_H__ #define __CLC_MATH_CLC_FMIN_H__ -#define FUNCTION __clc_fmin +#define __CLC_FUNCTION __clc_fmin #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_FMIN_H__ diff --git a/libclc/clc/include/clc/math/clc_fmod.h b/libclc/clc/include/clc/math/clc_fmod.h index 840bb850d73c..44179babd7fe 100644 --- a/libclc/clc/include/clc/math/clc_fmod.h +++ b/libclc/clc/include/clc/math/clc_fmod.h @@ -9,11 +9,11 @@ #ifndef __CLC_MATH_CLC_FMOD_H__ #define __CLC_MATH_CLC_FMOD_H__ -#define FUNCTION __clc_fmod +#define __CLC_FUNCTION __clc_fmod #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_FMOD_H__ diff --git a/libclc/clc/include/clc/math/clc_fract.h b/libclc/clc/include/clc/math/clc_fract.h index 26c402c79641..af762ecc59ea 100644 --- a/libclc/clc/include/clc/math/clc_fract.h +++ b/libclc/clc/include/clc/math/clc_fract.h @@ -9,11 +9,11 @@ #ifndef __CLC_MATH_CLC_FRACT_H__ #define __CLC_MATH_CLC_FRACT_H__ -#define FUNCTION __clc_fract +#define __CLC_FUNCTION __clc_fract #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_FRACT_H__ diff --git a/libclc/clc/include/clc/math/clc_frexp.h b/libclc/clc/include/clc/math/clc_frexp.h index e3c9f6b363dd..d6ed2754d0d4 100644 --- a/libclc/clc/include/clc/math/clc_frexp.h +++ b/libclc/clc/include/clc/math/clc_frexp.h @@ -9,10 +9,10 @@ #ifndef __CLC_MATH_CLC_FREXP_H__ #define __CLC_MATH_CLC_FREXP_H__ -#define FUNCTION __clc_frexp +#define __CLC_FUNCTION __clc_frexp #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_FREXP_H__ diff --git a/libclc/clc/include/clc/math/clc_half_cos.h b/libclc/clc/include/clc/math/clc_half_cos.h index 0f9f0b61443f..d0122987899c 100644 --- a/libclc/clc/include/clc/math/clc_half_cos.h +++ 
b/libclc/clc/include/clc/math/clc_half_cos.h @@ -9,12 +9,12 @@ #ifndef __CLC_MATH_CLC_HALF_COS_H__ #define __CLC_MATH_CLC_HALF_COS_H__ -#define __FLOAT_ONLY -#define FUNCTION __clc_half_cos +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION __clc_half_cos #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_HALF_COS_H__ diff --git a/libclc/clc/include/clc/math/clc_half_divide.h b/libclc/clc/include/clc/math/clc_half_divide.h index 44d90f39b4f6..e5d5cdc9df34 100644 --- a/libclc/clc/include/clc/math/clc_half_divide.h +++ b/libclc/clc/include/clc/math/clc_half_divide.h @@ -9,12 +9,12 @@ #ifndef __CLC_MATH_CLC_HALF_DIVIDE_H__ #define __CLC_MATH_CLC_HALF_DIVIDE_H__ -#define __FLOAT_ONLY -#define FUNCTION __clc_half_divide +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION __clc_half_divide #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_HALF_DIVIDE_H__ diff --git a/libclc/clc/include/clc/math/clc_half_exp.h b/libclc/clc/include/clc/math/clc_half_exp.h index 158d28c16aab..2284eaf8df36 100644 --- a/libclc/clc/include/clc/math/clc_half_exp.h +++ b/libclc/clc/include/clc/math/clc_half_exp.h @@ -9,12 +9,12 @@ #ifndef __CLC_MATH_CLC_HALF_EXP_H__ #define __CLC_MATH_CLC_HALF_EXP_H__ -#define __FLOAT_ONLY -#define FUNCTION __clc_half_exp +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION __clc_half_exp #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_HALF_EXP_H__ diff --git a/libclc/clc/include/clc/math/clc_half_exp10.h b/libclc/clc/include/clc/math/clc_half_exp10.h index 4982b618e505..662df93e0685 100644 --- a/libclc/clc/include/clc/math/clc_half_exp10.h +++ b/libclc/clc/include/clc/math/clc_half_exp10.h @@ -9,12 +9,12 @@ #ifndef __CLC_MATH_CLC_HALF_EXP10_H__ #define __CLC_MATH_CLC_HALF_EXP10_H__ -#define __FLOAT_ONLY -#define FUNCTION __clc_half_exp10 +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION __clc_half_exp10 #define __CLC_BODY #include -#undef 
FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_HALF_EXP10_H__ diff --git a/libclc/clc/include/clc/math/clc_half_exp2.h b/libclc/clc/include/clc/math/clc_half_exp2.h index dc0f67659bf9..d2af0f2e3fa9 100644 --- a/libclc/clc/include/clc/math/clc_half_exp2.h +++ b/libclc/clc/include/clc/math/clc_half_exp2.h @@ -9,12 +9,12 @@ #ifndef __CLC_MATH_CLC_HALF_EXP2_H__ #define __CLC_MATH_CLC_HALF_EXP2_H__ -#define __FLOAT_ONLY -#define FUNCTION __clc_half_exp2 +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION __clc_half_exp2 #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_HALF_EXP2_H__ diff --git a/libclc/clc/include/clc/math/clc_half_log.h b/libclc/clc/include/clc/math/clc_half_log.h index df527e1a8fb5..8ae30e5abe5f 100644 --- a/libclc/clc/include/clc/math/clc_half_log.h +++ b/libclc/clc/include/clc/math/clc_half_log.h @@ -9,12 +9,12 @@ #ifndef __CLC_MATH_CLC_HALF_LOG_H__ #define __CLC_MATH_CLC_HALF_LOG_H__ -#define __FLOAT_ONLY -#define FUNCTION __clc_half_log +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION __clc_half_log #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_HALF_LOG_H__ diff --git a/libclc/clc/include/clc/math/clc_half_log10.h b/libclc/clc/include/clc/math/clc_half_log10.h index 47db4cd32593..fe5248350aba 100644 --- a/libclc/clc/include/clc/math/clc_half_log10.h +++ b/libclc/clc/include/clc/math/clc_half_log10.h @@ -9,12 +9,12 @@ #ifndef __CLC_MATH_CLC_HALF_LOG10_H__ #define __CLC_MATH_CLC_HALF_LOG10_H__ -#define __FLOAT_ONLY -#define FUNCTION __clc_half_log10 +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION __clc_half_log10 #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_HALF_LOG10_H__ diff --git a/libclc/clc/include/clc/math/clc_half_log2.h b/libclc/clc/include/clc/math/clc_half_log2.h index 2b95b444b3f2..c57aa1bffb33 100644 --- a/libclc/clc/include/clc/math/clc_half_log2.h +++ 
b/libclc/clc/include/clc/math/clc_half_log2.h @@ -9,12 +9,12 @@ #ifndef __CLC_MATH_CLC_HALF_LOG2_H__ #define __CLC_MATH_CLC_HALF_LOG2_H__ -#define __FLOAT_ONLY -#define FUNCTION __clc_half_log2 +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION __clc_half_log2 #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_HALF_LOG2_H__ diff --git a/libclc/clc/include/clc/math/clc_half_powr.h b/libclc/clc/include/clc/math/clc_half_powr.h index 2fd62213b824..188b32b8b49c 100644 --- a/libclc/clc/include/clc/math/clc_half_powr.h +++ b/libclc/clc/include/clc/math/clc_half_powr.h @@ -9,12 +9,12 @@ #ifndef __CLC_MATH_CLC_HALF_POWR_H__ #define __CLC_MATH_CLC_HALF_POWR_H__ -#define __FLOAT_ONLY -#define FUNCTION __clc_half_powr +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION __clc_half_powr #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_HALF_POWR_H__ diff --git a/libclc/clc/include/clc/math/clc_half_recip.h b/libclc/clc/include/clc/math/clc_half_recip.h index d2ace8b7a17e..f5f71fa1988c 100644 --- a/libclc/clc/include/clc/math/clc_half_recip.h +++ b/libclc/clc/include/clc/math/clc_half_recip.h @@ -9,12 +9,12 @@ #ifndef __CLC_MATH_CLC_HALF_RECIP_H__ #define __CLC_MATH_CLC_HALF_RECIP_H__ -#define __FLOAT_ONLY -#define FUNCTION __clc_half_recip +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION __clc_half_recip #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_HALF_RECIP_H__ diff --git a/libclc/clc/include/clc/math/clc_half_rsqrt.h b/libclc/clc/include/clc/math/clc_half_rsqrt.h index a50dabccdaaf..d5a05ac8b84a 100644 --- a/libclc/clc/include/clc/math/clc_half_rsqrt.h +++ b/libclc/clc/include/clc/math/clc_half_rsqrt.h @@ -9,12 +9,12 @@ #ifndef __CLC_MATH_CLC_HALF_RSQRT_H__ #define __CLC_MATH_CLC_HALF_RSQRT_H__ -#define __FLOAT_ONLY -#define FUNCTION __clc_half_rsqrt +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION __clc_half_rsqrt #define __CLC_BODY #include 
-#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_HALF_RSQRT_H__ diff --git a/libclc/clc/include/clc/math/clc_half_sin.h b/libclc/clc/include/clc/math/clc_half_sin.h index f79f6afc2b4e..d2d3da9b11db 100644 --- a/libclc/clc/include/clc/math/clc_half_sin.h +++ b/libclc/clc/include/clc/math/clc_half_sin.h @@ -9,12 +9,12 @@ #ifndef __CLC_MATH_CLC_HALF_SIN_H__ #define __CLC_MATH_CLC_HALF_SIN_H__ -#define __FLOAT_ONLY -#define FUNCTION __clc_half_sin +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION __clc_half_sin #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_HALF_SIN_H__ diff --git a/libclc/clc/include/clc/math/clc_half_sqrt.h b/libclc/clc/include/clc/math/clc_half_sqrt.h index df5fc222a5c9..a54b8122cb15 100644 --- a/libclc/clc/include/clc/math/clc_half_sqrt.h +++ b/libclc/clc/include/clc/math/clc_half_sqrt.h @@ -9,12 +9,12 @@ #ifndef __CLC_MATH_CLC_HALF_SQRT_H__ #define __CLC_MATH_CLC_HALF_SQRT_H__ -#define __FLOAT_ONLY -#define FUNCTION __clc_half_sqrt +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION __clc_half_sqrt #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_HALF_SQRT_H__ diff --git a/libclc/clc/include/clc/math/clc_half_tan.h b/libclc/clc/include/clc/math/clc_half_tan.h index 0f6ef8790722..503ebe29c3ce 100644 --- a/libclc/clc/include/clc/math/clc_half_tan.h +++ b/libclc/clc/include/clc/math/clc_half_tan.h @@ -9,12 +9,12 @@ #ifndef __CLC_MATH_CLC_HALF_TAN_H__ #define __CLC_MATH_CLC_HALF_TAN_H__ -#define __FLOAT_ONLY -#define FUNCTION __clc_half_tan +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION __clc_half_tan #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_HALF_TAN_H__ diff --git a/libclc/clc/include/clc/math/clc_hypot.h b/libclc/clc/include/clc/math/clc_hypot.h index b2d69ebda96e..40b354154fa5 100644 --- a/libclc/clc/include/clc/math/clc_hypot.h +++ b/libclc/clc/include/clc/math/clc_hypot.h @@ -10,10 +10,10 
@@ #define __CLC_MATH_CLC_HYPOT_H__ #define __CLC_BODY -#define FUNCTION __clc_hypot +#define __CLC_FUNCTION __clc_hypot #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_HYPOT_H__ diff --git a/libclc/clc/include/clc/math/clc_ilogb.h b/libclc/clc/include/clc/math/clc_ilogb.h index e02108ff1444..6aeaf6be3ac6 100644 --- a/libclc/clc/include/clc/math/clc_ilogb.h +++ b/libclc/clc/include/clc/math/clc_ilogb.h @@ -9,10 +9,10 @@ #ifndef __CLC_MATH_CLC_ILOGB_H__ #define __CLC_MATH_CLC_ILOGB_H__ -#define FUNCTION __clc_ilogb +#define __CLC_FUNCTION __clc_ilogb #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_ILOGB_H__ diff --git a/libclc/clc/include/clc/math/clc_lgamma.h b/libclc/clc/include/clc/math/clc_lgamma.h index 69b1ff9701ca..f54c5e2b61b6 100644 --- a/libclc/clc/include/clc/math/clc_lgamma.h +++ b/libclc/clc/include/clc/math/clc_lgamma.h @@ -10,10 +10,10 @@ #define __CLC_MATH_CLC_LGAMMA_H__ #define __CLC_BODY -#define FUNCTION __clc_lgamma +#define __CLC_FUNCTION __clc_lgamma #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_LGAMMA_H__ diff --git a/libclc/clc/include/clc/math/clc_lgamma_r.h b/libclc/clc/include/clc/math/clc_lgamma_r.h index a98f760d8032..262e62372227 100644 --- a/libclc/clc/include/clc/math/clc_lgamma_r.h +++ b/libclc/clc/include/clc/math/clc_lgamma_r.h @@ -9,11 +9,11 @@ #ifndef __CLC_MATH_CLC_LGAMMA_R_H__ #define __CLC_MATH_CLC_LGAMMA_R_H__ -#define FUNCTION __clc_lgamma_r +#define __CLC_FUNCTION __clc_lgamma_r #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_LGAMMA_R_H__ diff --git a/libclc/clc/include/clc/math/clc_log.h b/libclc/clc/include/clc/math/clc_log.h index 6bddb5ff746e..8b0e44b425a8 100644 --- a/libclc/clc/include/clc/math/clc_log.h +++ b/libclc/clc/include/clc/math/clc_log.h @@ -9,11 +9,11 @@ #ifndef __CLC_MATH_CLC_LOG_H__ #define __CLC_MATH_CLC_LOG_H__ -#define FUNCTION __clc_log +#define __CLC_FUNCTION 
__clc_log #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_LOG_H__ diff --git a/libclc/clc/include/clc/math/clc_log10.h b/libclc/clc/include/clc/math/clc_log10.h index e3a5b19eb418..6e086690c6ad 100644 --- a/libclc/clc/include/clc/math/clc_log10.h +++ b/libclc/clc/include/clc/math/clc_log10.h @@ -9,11 +9,11 @@ #ifndef __CLC_MATH_CLC_LOG10_H__ #define __CLC_MATH_CLC_LOG10_H__ -#define FUNCTION __clc_log10 +#define __CLC_FUNCTION __clc_log10 #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_LOG10_H__ diff --git a/libclc/clc/include/clc/math/clc_log1p.h b/libclc/clc/include/clc/math/clc_log1p.h index f502802dafb2..1be8c8548d48 100644 --- a/libclc/clc/include/clc/math/clc_log1p.h +++ b/libclc/clc/include/clc/math/clc_log1p.h @@ -10,10 +10,10 @@ #define __CLC_MATH_CLC_LOG1P_H__ #define __CLC_BODY -#define FUNCTION __clc_log1p +#define __CLC_FUNCTION __clc_log1p #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_LOG1P_H__ diff --git a/libclc/clc/include/clc/math/clc_log2.h b/libclc/clc/include/clc/math/clc_log2.h index 7344cabaeffa..c5f9e37f3984 100644 --- a/libclc/clc/include/clc/math/clc_log2.h +++ b/libclc/clc/include/clc/math/clc_log2.h @@ -9,11 +9,11 @@ #ifndef __CLC_MATH_CLC_LOG2_H__ #define __CLC_MATH_CLC_LOG2_H__ -#define FUNCTION __clc_log2 +#define __CLC_FUNCTION __clc_log2 #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_LOG2_H__ diff --git a/libclc/clc/include/clc/math/clc_logb.h b/libclc/clc/include/clc/math/clc_logb.h index 1339d5819171..75a2ce7e578b 100644 --- a/libclc/clc/include/clc/math/clc_logb.h +++ b/libclc/clc/include/clc/math/clc_logb.h @@ -9,10 +9,10 @@ #ifndef __CLC_MATH_CLC_LOGB_H__ #define __CLC_MATH_CLC_LOGB_H__ -#define FUNCTION __clc_logb +#define __CLC_FUNCTION __clc_logb #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_LOGB_H__ diff --git 
a/libclc/clc/include/clc/math/clc_mad.h b/libclc/clc/include/clc/math/clc_mad.h index 72014b7c0ef7..117e361380ed 100644 --- a/libclc/clc/include/clc/math/clc_mad.h +++ b/libclc/clc/include/clc/math/clc_mad.h @@ -10,10 +10,10 @@ #define __CLC_MATH_CLC_MAD_H__ #define __CLC_BODY -#define FUNCTION __clc_mad +#define __CLC_FUNCTION __clc_mad #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_MAD_H__ diff --git a/libclc/clc/include/clc/math/clc_maxmag.h b/libclc/clc/include/clc/math/clc_maxmag.h index 058230e6ebfc..857ad7867b05 100644 --- a/libclc/clc/include/clc/math/clc_maxmag.h +++ b/libclc/clc/include/clc/math/clc_maxmag.h @@ -10,10 +10,10 @@ #define __CLC_MATH_CLC_MAXMAG_H__ #define __CLC_BODY -#define FUNCTION __clc_maxmag +#define __CLC_FUNCTION __clc_maxmag #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_MAXMAG_H__ diff --git a/libclc/clc/include/clc/math/clc_minmag.h b/libclc/clc/include/clc/math/clc_minmag.h index 581b95782823..9615d029bfed 100644 --- a/libclc/clc/include/clc/math/clc_minmag.h +++ b/libclc/clc/include/clc/math/clc_minmag.h @@ -10,10 +10,10 @@ #define __CLC_MATH_CLC_MINMAG_H__ #define __CLC_BODY -#define FUNCTION __clc_minmag +#define __CLC_FUNCTION __clc_minmag #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_MINMAG_H__ diff --git a/libclc/clc/include/clc/math/clc_modf.h b/libclc/clc/include/clc/math/clc_modf.h index fbdd3f8ffd4b..aa8f9e20892a 100644 --- a/libclc/clc/include/clc/math/clc_modf.h +++ b/libclc/clc/include/clc/math/clc_modf.h @@ -9,10 +9,10 @@ #ifndef __CLC_MATH_CLC_MODF_H__ #define __CLC_MATH_CLC_MODF_H__ -#define FUNCTION __clc_modf +#define __CLC_FUNCTION __clc_modf #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_MODF_H__ diff --git a/libclc/clc/include/clc/math/clc_nan.h b/libclc/clc/include/clc/math/clc_nan.h index 45d91184cf14..91901fd9369f 100644 --- a/libclc/clc/include/clc/math/clc_nan.h +++ 
b/libclc/clc/include/clc/math/clc_nan.h @@ -9,11 +9,11 @@ #ifndef __CLC_MATH_CLC_NAN_H__ #define __CLC_MATH_CLC_NAN_H__ -#define FUNCTION __clc_nan +#define __CLC_FUNCTION __clc_nan #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_NAN_H__ diff --git a/libclc/clc/include/clc/math/clc_native_cos.h b/libclc/clc/include/clc/math/clc_native_cos.h index 08d51baca1f6..eefd133911e4 100644 --- a/libclc/clc/include/clc/math/clc_native_cos.h +++ b/libclc/clc/include/clc/math/clc_native_cos.h @@ -9,12 +9,12 @@ #ifndef __CLC_MATH_CLC_NATIVE_COS_H__ #define __CLC_MATH_CLC_NATIVE_COS_H__ -#define __FLOAT_ONLY -#define FUNCTION __clc_native_cos +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION __clc_native_cos #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_NATIVE_COS_H__ diff --git a/libclc/clc/include/clc/math/clc_native_divide.h b/libclc/clc/include/clc/math/clc_native_divide.h index a10ed512f10d..97d97c2670a6 100644 --- a/libclc/clc/include/clc/math/clc_native_divide.h +++ b/libclc/clc/include/clc/math/clc_native_divide.h @@ -9,12 +9,12 @@ #ifndef __CLC_MATH_CLC_NATIVE_DIVIDE_H__ #define __CLC_MATH_CLC_NATIVE_DIVIDE_H__ -#define __FLOAT_ONLY -#define FUNCTION __clc_native_divide +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION __clc_native_divide #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_NATIVE_DIVIDE_H__ diff --git a/libclc/clc/include/clc/math/clc_native_exp.h b/libclc/clc/include/clc/math/clc_native_exp.h index 30801fe2f8ad..30c0d1edf18f 100644 --- a/libclc/clc/include/clc/math/clc_native_exp.h +++ b/libclc/clc/include/clc/math/clc_native_exp.h @@ -9,12 +9,12 @@ #ifndef __CLC_MATH_CLC_NATIVE_EXP_H__ #define __CLC_MATH_CLC_NATIVE_EXP_H__ -#define __FLOAT_ONLY -#define FUNCTION __clc_native_exp +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION __clc_native_exp #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // 
__CLC_MATH_CLC_NATIVE_EXP_H__ diff --git a/libclc/clc/include/clc/math/clc_native_exp10.h b/libclc/clc/include/clc/math/clc_native_exp10.h index 91cf60ef6a2b..a88b5a389737 100644 --- a/libclc/clc/include/clc/math/clc_native_exp10.h +++ b/libclc/clc/include/clc/math/clc_native_exp10.h @@ -9,12 +9,12 @@ #ifndef __CLC_MATH_CLC_NATIVE_EXP10_H__ #define __CLC_MATH_CLC_NATIVE_EXP10_H__ -#define __FLOAT_ONLY -#define FUNCTION __clc_native_exp10 +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION __clc_native_exp10 #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_NATIVE_EXP10_H__ diff --git a/libclc/clc/include/clc/math/clc_native_exp2.h b/libclc/clc/include/clc/math/clc_native_exp2.h index fe4728053619..09f4bfe6979d 100644 --- a/libclc/clc/include/clc/math/clc_native_exp2.h +++ b/libclc/clc/include/clc/math/clc_native_exp2.h @@ -9,12 +9,12 @@ #ifndef __CLC_MATH_CLC_NATIVE_EXP2_H__ #define __CLC_MATH_CLC_NATIVE_EXP2_H__ -#define __FLOAT_ONLY -#define FUNCTION __clc_native_exp2 +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION __clc_native_exp2 #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_NATIVE_EXP2_H__ diff --git a/libclc/clc/include/clc/math/clc_native_log.h b/libclc/clc/include/clc/math/clc_native_log.h index 25a7054d2a59..c32fc812d477 100644 --- a/libclc/clc/include/clc/math/clc_native_log.h +++ b/libclc/clc/include/clc/math/clc_native_log.h @@ -9,12 +9,12 @@ #ifndef __CLC_MATH_CLC_NATIVE_LOG_H__ #define __CLC_MATH_CLC_NATIVE_LOG_H__ -#define __FLOAT_ONLY -#define FUNCTION __clc_native_log +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION __clc_native_log #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_NATIVE_LOG_H__ diff --git a/libclc/clc/include/clc/math/clc_native_log10.h b/libclc/clc/include/clc/math/clc_native_log10.h index d46124d655ee..329b8a743755 100644 --- a/libclc/clc/include/clc/math/clc_native_log10.h +++ 
b/libclc/clc/include/clc/math/clc_native_log10.h @@ -9,12 +9,12 @@ #ifndef __CLC_MATH_CLC_NATIVE_LOG10_H__ #define __CLC_MATH_CLC_NATIVE_LOG10_H__ -#define __FLOAT_ONLY -#define FUNCTION __clc_native_log10 +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION __clc_native_log10 #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_NATIVE_LOG10_H__ diff --git a/libclc/clc/include/clc/math/clc_native_log2.h b/libclc/clc/include/clc/math/clc_native_log2.h index 8998553b4984..b8ad663b470d 100644 --- a/libclc/clc/include/clc/math/clc_native_log2.h +++ b/libclc/clc/include/clc/math/clc_native_log2.h @@ -9,12 +9,12 @@ #ifndef __CLC_MATH_CLC_NATIVE_LOG2_H__ #define __CLC_MATH_CLC_NATIVE_LOG2_H__ -#define __FLOAT_ONLY -#define FUNCTION __clc_native_log2 +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION __clc_native_log2 #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_NATIVE_LOG2_H__ diff --git a/libclc/clc/include/clc/math/clc_native_powr.h b/libclc/clc/include/clc/math/clc_native_powr.h index 932cfa66aeb9..3a0c8cc8b9d5 100644 --- a/libclc/clc/include/clc/math/clc_native_powr.h +++ b/libclc/clc/include/clc/math/clc_native_powr.h @@ -9,12 +9,12 @@ #ifndef __CLC_MATH_CLC_NATIVE_POWR_H__ #define __CLC_MATH_CLC_NATIVE_POWR_H__ -#define __FLOAT_ONLY -#define FUNCTION __clc_native_powr +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION __clc_native_powr #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_NATIVE_POWR_H__ diff --git a/libclc/clc/include/clc/math/clc_native_recip.h b/libclc/clc/include/clc/math/clc_native_recip.h index 6b94cbc3d6c6..a6540e846f2b 100644 --- a/libclc/clc/include/clc/math/clc_native_recip.h +++ b/libclc/clc/include/clc/math/clc_native_recip.h @@ -9,12 +9,12 @@ #ifndef __CLC_MATH_CLC_NATIVE_RECIP_H__ #define __CLC_MATH_CLC_NATIVE_RECIP_H__ -#define __FLOAT_ONLY -#define FUNCTION __clc_native_recip +#define __CLC_FLOAT_ONLY +#define 
__CLC_FUNCTION __clc_native_recip #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_NATIVE_RECIP_H__ diff --git a/libclc/clc/include/clc/math/clc_native_rsqrt.h b/libclc/clc/include/clc/math/clc_native_rsqrt.h index 5fa870b3f868..0c8f3da03929 100644 --- a/libclc/clc/include/clc/math/clc_native_rsqrt.h +++ b/libclc/clc/include/clc/math/clc_native_rsqrt.h @@ -9,12 +9,12 @@ #ifndef __CLC_MATH_CLC_NATIVE_RSQRT_H__ #define __CLC_MATH_CLC_NATIVE_RSQRT_H__ -#define __FLOAT_ONLY -#define FUNCTION __clc_native_rsqrt +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION __clc_native_rsqrt #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_NATIVE_RSQRT_H__ diff --git a/libclc/clc/include/clc/math/clc_native_sin.h b/libclc/clc/include/clc/math/clc_native_sin.h index d25968c74409..22ab7868e20f 100644 --- a/libclc/clc/include/clc/math/clc_native_sin.h +++ b/libclc/clc/include/clc/math/clc_native_sin.h @@ -9,12 +9,12 @@ #ifndef __CLC_MATH_CLC_NATIVE_SIN_H__ #define __CLC_MATH_CLC_NATIVE_SIN_H__ -#define __FLOAT_ONLY -#define FUNCTION __clc_native_sin +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION __clc_native_sin #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_NATIVE_SIN_H__ diff --git a/libclc/clc/include/clc/math/clc_native_sqrt.h b/libclc/clc/include/clc/math/clc_native_sqrt.h index e8a28b7f7046..fd071f1d463a 100644 --- a/libclc/clc/include/clc/math/clc_native_sqrt.h +++ b/libclc/clc/include/clc/math/clc_native_sqrt.h @@ -9,12 +9,12 @@ #ifndef __CLC_MATH_CLC_NATIVE_SQRT_H__ #define __CLC_MATH_CLC_NATIVE_SQRT_H__ -#define __FLOAT_ONLY -#define FUNCTION __clc_native_sqrt +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION __clc_native_sqrt #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_NATIVE_SQRT_H__ diff --git a/libclc/clc/include/clc/math/clc_native_tan.h b/libclc/clc/include/clc/math/clc_native_tan.h 
index 36fdd6d8c011..4b381312d8d5 100644 --- a/libclc/clc/include/clc/math/clc_native_tan.h +++ b/libclc/clc/include/clc/math/clc_native_tan.h @@ -9,12 +9,12 @@ #ifndef __CLC_MATH_CLC_NATIVE_TAN_H__ #define __CLC_MATH_CLC_NATIVE_TAN_H__ -#define __FLOAT_ONLY -#define FUNCTION __clc_native_tan +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION __clc_native_tan #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_NATIVE_TAN_H__ diff --git a/libclc/clc/include/clc/math/clc_nextafter.h b/libclc/clc/include/clc/math/clc_nextafter.h index d080a40a06f0..f24742b9e9c3 100644 --- a/libclc/clc/include/clc/math/clc_nextafter.h +++ b/libclc/clc/include/clc/math/clc_nextafter.h @@ -10,10 +10,10 @@ #define __CLC_MATH_CLC_NEXTAFTER_H__ #define __CLC_BODY -#define FUNCTION __clc_nextafter +#define __CLC_FUNCTION __clc_nextafter #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_NEXTAFTER_H__ diff --git a/libclc/clc/include/clc/math/clc_pow.h b/libclc/clc/include/clc/math/clc_pow.h index b33391e10a42..5e37e5bf6da6 100644 --- a/libclc/clc/include/clc/math/clc_pow.h +++ b/libclc/clc/include/clc/math/clc_pow.h @@ -10,10 +10,10 @@ #define __CLC_MATH_CLC_POW_H__ #define __CLC_BODY -#define FUNCTION __clc_pow +#define __CLC_FUNCTION __clc_pow #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_POW_H__ diff --git a/libclc/clc/include/clc/math/clc_pown.h b/libclc/clc/include/clc/math/clc_pown.h index 473bbe1603c5..30628efb1900 100644 --- a/libclc/clc/include/clc/math/clc_pown.h +++ b/libclc/clc/include/clc/math/clc_pown.h @@ -10,10 +10,10 @@ #define __CLC_MATH_CLC_POWN_H__ #define __CLC_BODY -#define FUNCTION __clc_pown +#define __CLC_FUNCTION __clc_pown #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_POWN_H__ diff --git a/libclc/clc/include/clc/math/clc_powr.h b/libclc/clc/include/clc/math/clc_powr.h index a820516f1866..baa494cce698 100644 --- 
a/libclc/clc/include/clc/math/clc_powr.h +++ b/libclc/clc/include/clc/math/clc_powr.h @@ -10,10 +10,10 @@ #define __CLC_MATH_CLC_POWR_H__ #define __CLC_BODY -#define FUNCTION __clc_powr +#define __CLC_FUNCTION __clc_powr #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_POWR_H__ diff --git a/libclc/clc/include/clc/math/clc_remainder.h b/libclc/clc/include/clc/math/clc_remainder.h index bc53b043045c..a7700c95103b 100644 --- a/libclc/clc/include/clc/math/clc_remainder.h +++ b/libclc/clc/include/clc/math/clc_remainder.h @@ -9,11 +9,11 @@ #ifndef __CLC_MATH_CLC_REMAINDER_H__ #define __CLC_MATH_CLC_REMAINDER_H__ -#define FUNCTION __clc_remainder +#define __CLC_FUNCTION __clc_remainder #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_REMAINDER_H__ diff --git a/libclc/clc/include/clc/math/clc_remquo.h b/libclc/clc/include/clc/math/clc_remquo.h index b46f94ba9447..48a8844a6e38 100644 --- a/libclc/clc/include/clc/math/clc_remquo.h +++ b/libclc/clc/include/clc/math/clc_remquo.h @@ -9,11 +9,11 @@ #ifndef __CLC_MATH_CLC_REMQUO_H__ #define __CLC_MATH_CLC_REMQUO_H__ -#define FUNCTION __clc_remquo +#define __CLC_FUNCTION __clc_remquo #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_REMQUO_H__ diff --git a/libclc/clc/include/clc/math/clc_rint.h b/libclc/clc/include/clc/math/clc_rint.h index c446c7576471..17b1549f439b 100644 --- a/libclc/clc/include/clc/math/clc_rint.h +++ b/libclc/clc/include/clc/math/clc_rint.h @@ -10,10 +10,10 @@ #define __CLC_MATH_CLC_RINT_H__ #define __CLC_BODY -#define FUNCTION __clc_rint +#define __CLC_FUNCTION __clc_rint #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_RINT_H__ diff --git a/libclc/clc/include/clc/math/clc_rootn.h b/libclc/clc/include/clc/math/clc_rootn.h index fb7817ee250f..90a25ad52d86 100644 --- a/libclc/clc/include/clc/math/clc_rootn.h +++ b/libclc/clc/include/clc/math/clc_rootn.h @@ -10,10 +10,10 @@ 
#define __CLC_MATH_CLC_ROOTN_H__ #define __CLC_BODY -#define FUNCTION __clc_rootn +#define __CLC_FUNCTION __clc_rootn #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_ROOTN_H__ diff --git a/libclc/clc/include/clc/math/clc_round.h b/libclc/clc/include/clc/math/clc_round.h index fd1872a5cf22..78ebf4db99e9 100644 --- a/libclc/clc/include/clc/math/clc_round.h +++ b/libclc/clc/include/clc/math/clc_round.h @@ -10,10 +10,10 @@ #define __CLC_MATH_CLC_ROUND_H__ #define __CLC_BODY -#define FUNCTION __clc_round +#define __CLC_FUNCTION __clc_round #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_ROUND_H__ diff --git a/libclc/clc/include/clc/math/clc_rsqrt.h b/libclc/clc/include/clc/math/clc_rsqrt.h index 1690ede1b892..59402fbead2b 100644 --- a/libclc/clc/include/clc/math/clc_rsqrt.h +++ b/libclc/clc/include/clc/math/clc_rsqrt.h @@ -10,10 +10,10 @@ #define __CLC_MATH_CLC_RSQRT_H__ #define __CLC_BODY -#define FUNCTION __clc_rsqrt +#define __CLC_FUNCTION __clc_rsqrt #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_RSQRT_H__ diff --git a/libclc/clc/include/clc/math/clc_sin.h b/libclc/clc/include/clc/math/clc_sin.h index 89ae444eb6e3..de4c722ca123 100644 --- a/libclc/clc/include/clc/math/clc_sin.h +++ b/libclc/clc/include/clc/math/clc_sin.h @@ -10,10 +10,10 @@ #define __CLC_MATH_CLC_SIN_H__ #define __CLC_BODY -#define FUNCTION __clc_sin +#define __CLC_FUNCTION __clc_sin #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_SIN_H__ diff --git a/libclc/clc/include/clc/math/clc_sincos.h b/libclc/clc/include/clc/math/clc_sincos.h index bf3e0806db38..e26dc7c024c9 100644 --- a/libclc/clc/include/clc/math/clc_sincos.h +++ b/libclc/clc/include/clc/math/clc_sincos.h @@ -10,10 +10,10 @@ #define __CLC_MATH_CLC_SINCOS_H__ #define __CLC_BODY -#define FUNCTION __clc_sincos +#define __CLC_FUNCTION __clc_sincos #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_SINCOS_H__ diff 
--git a/libclc/clc/include/clc/math/clc_sincos_helpers.h b/libclc/clc/include/clc/math/clc_sincos_helpers.h index f9ceba3bf2cf..909afe476ad9 100644 --- a/libclc/clc/include/clc/math/clc_sincos_helpers.h +++ b/libclc/clc/include/clc/math/clc_sincos_helpers.h @@ -9,12 +9,12 @@ #ifndef __CLC_MATH_CLC_SINCOS_HELPERS_H__ #define __CLC_MATH_CLC_SINCOS_HELPERS_H__ -#define __FLOAT_ONLY +#define __CLC_FLOAT_ONLY #define __CLC_BODY #include -#define __DOUBLE_ONLY +#define __CLC_DOUBLE_ONLY #define __CLC_BODY #include diff --git a/libclc/clc/include/clc/math/clc_sinh.h b/libclc/clc/include/clc/math/clc_sinh.h index c94d8e37acc6..fa9cabe79cb0 100644 --- a/libclc/clc/include/clc/math/clc_sinh.h +++ b/libclc/clc/include/clc/math/clc_sinh.h @@ -10,10 +10,10 @@ #define __CLC_MATH_CLC_SINH_H__ #define __CLC_BODY -#define FUNCTION __clc_sinh +#define __CLC_FUNCTION __clc_sinh #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_SINH_H__ diff --git a/libclc/clc/include/clc/math/clc_sinpi.h b/libclc/clc/include/clc/math/clc_sinpi.h index 725e559600ed..db563c2e7e14 100644 --- a/libclc/clc/include/clc/math/clc_sinpi.h +++ b/libclc/clc/include/clc/math/clc_sinpi.h @@ -10,10 +10,10 @@ #define __CLC_MATH_CLC_SINPI_H__ #define __CLC_BODY -#define FUNCTION __clc_sinpi +#define __CLC_FUNCTION __clc_sinpi #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_SINPI_H__ diff --git a/libclc/clc/include/clc/math/clc_sqrt.h b/libclc/clc/include/clc/math/clc_sqrt.h index 74aea8e85809..708451553329 100644 --- a/libclc/clc/include/clc/math/clc_sqrt.h +++ b/libclc/clc/include/clc/math/clc_sqrt.h @@ -10,10 +10,10 @@ #define __CLC_MATH_CLC_SQRT_H__ #define __CLC_BODY -#define FUNCTION __clc_sqrt +#define __CLC_FUNCTION __clc_sqrt #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_SQRT_H__ diff --git a/libclc/clc/include/clc/math/clc_tan.h b/libclc/clc/include/clc/math/clc_tan.h index d16370420a03..028ff28ecd69 100644 --- 
a/libclc/clc/include/clc/math/clc_tan.h +++ b/libclc/clc/include/clc/math/clc_tan.h @@ -10,10 +10,10 @@ #define __CLC_MATH_CLC_TAN_H__ #define __CLC_BODY -#define FUNCTION __clc_tan +#define __CLC_FUNCTION __clc_tan #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_TAN_H__ diff --git a/libclc/clc/include/clc/math/clc_tanh.h b/libclc/clc/include/clc/math/clc_tanh.h index 2c860c165ca6..88fce2340d99 100644 --- a/libclc/clc/include/clc/math/clc_tanh.h +++ b/libclc/clc/include/clc/math/clc_tanh.h @@ -10,10 +10,10 @@ #define __CLC_MATH_CLC_TANH_H__ #define __CLC_BODY -#define FUNCTION __clc_tanh +#define __CLC_FUNCTION __clc_tanh #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_TANH_H__ diff --git a/libclc/clc/include/clc/math/clc_tanpi.h b/libclc/clc/include/clc/math/clc_tanpi.h index 2691fa696c0a..596bb72b4b8d 100644 --- a/libclc/clc/include/clc/math/clc_tanpi.h +++ b/libclc/clc/include/clc/math/clc_tanpi.h @@ -10,10 +10,10 @@ #define __CLC_MATH_CLC_TANPI_H__ #define __CLC_BODY -#define FUNCTION __clc_tanpi +#define __CLC_FUNCTION __clc_tanpi #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_TANPI_H__ diff --git a/libclc/clc/include/clc/math/clc_tgamma.h b/libclc/clc/include/clc/math/clc_tgamma.h index d067703c2e9d..dd76e041875d 100644 --- a/libclc/clc/include/clc/math/clc_tgamma.h +++ b/libclc/clc/include/clc/math/clc_tgamma.h @@ -10,10 +10,10 @@ #define __CLC_MATH_CLC_TGAMMA_H__ #define __CLC_BODY -#define FUNCTION __clc_tgamma +#define __CLC_FUNCTION __clc_tgamma #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_TGAMMA_H__ diff --git a/libclc/clc/include/clc/math/clc_trunc.h b/libclc/clc/include/clc/math/clc_trunc.h index 6a8353a4f6d4..33445f9fe15d 100644 --- a/libclc/clc/include/clc/math/clc_trunc.h +++ b/libclc/clc/include/clc/math/clc_trunc.h @@ -10,10 +10,10 @@ #define __CLC_MATH_CLC_TRUNC_H__ #define __CLC_BODY -#define FUNCTION __clc_trunc +#define __CLC_FUNCTION 
__clc_trunc #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MATH_CLC_TRUNC_H__ diff --git a/libclc/clc/include/clc/math/gentype.inc b/libclc/clc/include/clc/math/gentype.inc index 299ce6633cf0..3373f5fa5002 100644 --- a/libclc/clc/include/clc/math/gentype.inc +++ b/libclc/clc/include/clc/math/gentype.inc @@ -70,7 +70,7 @@ #define __CLC_CONVERT_S_GENTYPE __CLC_XCONCAT(__clc_convert_, __CLC_S_GENTYPE) #define __CLC_CONVERT_U_GENTYPE __CLC_XCONCAT(__clc_convert_, __CLC_U_GENTYPE) -#if (!defined(__HALF_ONLY) && !defined(__DOUBLE_ONLY)) +#if (!defined(__CLC_HALF_ONLY) && !defined(__CLC_DOUBLE_ONLY)) #define __CLC_SCALAR_GENTYPE float #define __CLC_FPSIZE 32 #define __CLC_FP_LIT(x) (__CLC_GENTYPE) x##F @@ -145,7 +145,7 @@ #endif -#if (!defined(__HALF_ONLY) && !defined(__FLOAT_ONLY)) +#if (!defined(__CLC_HALF_ONLY) && !defined(__CLC_FLOAT_ONLY)) #ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64 : enable @@ -223,7 +223,7 @@ #endif #endif -#if (!defined(__FLOAT_ONLY) && !defined(__DOUBLE_ONLY)) +#if (!defined(__CLC_FLOAT_ONLY) && !defined(__CLC_DOUBLE_ONLY)) #ifdef cl_khr_fp16 #pragma OPENCL EXTENSION cl_khr_fp16 : enable @@ -356,6 +356,6 @@ #undef __CLC_AS_GENTYPE #undef __CLC_CONVERT_GENTYPE -#undef __HALF_ONLY -#undef __FLOAT_ONLY -#undef __DOUBLE_ONLY +#undef __CLC_HALF_ONLY +#undef __CLC_FLOAT_ONLY +#undef __CLC_DOUBLE_ONLY diff --git a/libclc/clc/include/clc/math/math.h b/libclc/clc/include/clc/math/math.h index 5df53f1b5e5d..c2647f66b400 100644 --- a/libclc/clc/include/clc/math/math.h +++ b/libclc/clc/include/clc/math/math.h @@ -120,6 +120,4 @@ _CLC_OVERLOAD _CLC_INLINE float __clc_flush_denormal_if_not_supported(float x) { #endif // cl_khr_fp16 -#define ALIGNED(x) __attribute__((aligned(x))) - #endif // __CLC_MATH_MATH_H__ diff --git a/libclc/clc/include/clc/math/remquo_decl.inc b/libclc/clc/include/clc/math/remquo_decl.inc index 24d96b048805..cba28a7244eb 100644 --- a/libclc/clc/include/clc/math/remquo_decl.inc +++ 
b/libclc/clc/include/clc/math/remquo_decl.inc @@ -6,15 +6,19 @@ // //===----------------------------------------------------------------------===// -_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE FUNCTION(__CLC_GENTYPE x, __CLC_GENTYPE y, - private __CLC_INTN *q); +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE x, + __CLC_GENTYPE y, + private __CLC_INTN *q); -_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE FUNCTION(__CLC_GENTYPE x, __CLC_GENTYPE y, - global __CLC_INTN *q); +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE x, + __CLC_GENTYPE y, + global __CLC_INTN *q); -_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE FUNCTION(__CLC_GENTYPE x, __CLC_GENTYPE y, - local __CLC_INTN *q); +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE x, + __CLC_GENTYPE y, + local __CLC_INTN *q); #if _CLC_GENERIC_AS_SUPPORTED -_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE FUNCTION(__CLC_GENTYPE x, __CLC_GENTYPE y, - generic __CLC_INTN *q); +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE x, + __CLC_GENTYPE y, + generic __CLC_INTN *q); #endif diff --git a/libclc/clc/include/clc/math/tables.h b/libclc/clc/include/clc/math/tables.h index 0fec778b5367..ad4b0547f8c6 100644 --- a/libclc/clc/include/clc/math/tables.h +++ b/libclc/clc/include/clc/math/tables.h @@ -11,35 +11,36 @@ #include -#define TABLE_SPACE __constant +#define __CLC_TABLE_SPACE __constant -#define TABLE_MANGLE(NAME) __clc_##NAME +#define __CLC_TABLE_MANGLE(NAME) __clc_##NAME -#define DECLARE_TABLE(TYPE, NAME, LENGTH) TABLE_SPACE TYPE NAME[LENGTH] +#define __CLC_DECLARE_TABLE(TYPE, NAME, LENGTH) \ + __CLC_TABLE_SPACE TYPE NAME[LENGTH] -#define TABLE_FUNCTION(TYPE, TABLE, NAME) \ - TYPE TABLE_MANGLE(NAME)(size_t idx) { return TABLE[idx]; } +#define __CLC_TABLE_FUNCTION(TYPE, TABLE, NAME) \ + TYPE __CLC_TABLE_MANGLE(NAME)(size_t idx) { return TABLE[idx]; } -#define CLC_TABLE_FUNCTION(TYPE, TABLE, NAME) \ - _CLC_DEF _CLC_OVERLOAD TYPE TABLE_MANGLE(NAME)(int idx) { \ +#define 
__CLC_TABLE_FUNCTION_VEC(TYPE, TABLE, NAME) \ + _CLC_DEF _CLC_OVERLOAD TYPE __CLC_TABLE_MANGLE(NAME)(int idx) { \ return TABLE[idx]; \ } \ - _CLC_DEF _CLC_OVERLOAD TYPE##2 TABLE_MANGLE(NAME)(int##2 idx) { \ + _CLC_DEF _CLC_OVERLOAD TYPE##2 __CLC_TABLE_MANGLE(NAME)(int##2 idx) { \ return (TYPE##2){TABLE[idx.s0], TABLE[idx.s1]}; \ } \ - _CLC_DEF _CLC_OVERLOAD TYPE##3 TABLE_MANGLE(NAME)(int##3 idx) { \ + _CLC_DEF _CLC_OVERLOAD TYPE##3 __CLC_TABLE_MANGLE(NAME)(int##3 idx) { \ return (TYPE##3){TABLE[idx.s0], TABLE[idx.s1], TABLE[idx.s2]}; \ } \ - _CLC_DEF _CLC_OVERLOAD TYPE##4 TABLE_MANGLE(NAME)(int##4 idx) { \ + _CLC_DEF _CLC_OVERLOAD TYPE##4 __CLC_TABLE_MANGLE(NAME)(int##4 idx) { \ return (TYPE##4){TABLE[idx.s0], TABLE[idx.s1], TABLE[idx.s2], \ TABLE[idx.s3]}; \ } \ - _CLC_DEF _CLC_OVERLOAD TYPE##8 TABLE_MANGLE(NAME)(int##8 idx) { \ + _CLC_DEF _CLC_OVERLOAD TYPE##8 __CLC_TABLE_MANGLE(NAME)(int##8 idx) { \ return (TYPE##8){TABLE[idx.s0], TABLE[idx.s1], TABLE[idx.s2], \ TABLE[idx.s3], TABLE[idx.s4], TABLE[idx.s5], \ TABLE[idx.s6], TABLE[idx.s7]}; \ } \ - _CLC_DEF _CLC_OVERLOAD TYPE##16 TABLE_MANGLE(NAME)(int##16 idx) { \ + _CLC_DEF _CLC_OVERLOAD TYPE##16 __CLC_TABLE_MANGLE(NAME)(int##16 idx) { \ return (TYPE##16){ \ TABLE[idx.s0], TABLE[idx.s1], TABLE[idx.s2], TABLE[idx.s3], \ TABLE[idx.s4], TABLE[idx.s5], TABLE[idx.s6], TABLE[idx.s7], \ @@ -47,58 +48,59 @@ TABLE[idx.sC], TABLE[idx.sD], TABLE[idx.sE], TABLE[idx.sF]}; \ } -#define TABLE_FUNCTION_DECL(TYPE, NAME) TYPE TABLE_MANGLE(NAME)(size_t idx); +#define __CLC_TABLE_FUNCTION_DECL(TYPE, NAME) \ + TYPE __CLC_TABLE_MANGLE(NAME)(size_t idx); -#define CLC_TABLE_FUNCTION_DECL(TYPE, NAME) \ - _CLC_DECL _CLC_OVERLOAD TYPE TABLE_MANGLE(NAME)(int idx); \ - _CLC_DECL _CLC_OVERLOAD TYPE##2 TABLE_MANGLE(NAME)(int##2 idx); \ - _CLC_DECL _CLC_OVERLOAD TYPE##3 TABLE_MANGLE(NAME)(int##3 idx); \ - _CLC_DECL _CLC_OVERLOAD TYPE##4 TABLE_MANGLE(NAME)(int##4 idx); \ - _CLC_DECL _CLC_OVERLOAD TYPE##8 TABLE_MANGLE(NAME)(int##8 idx); \ - 
_CLC_DECL _CLC_OVERLOAD TYPE##16 TABLE_MANGLE(NAME)(int##16 idx); +#define __CLC_TABLE_FUNCTION_DECL_VEC(TYPE, NAME) \ + _CLC_DECL _CLC_OVERLOAD TYPE __CLC_TABLE_MANGLE(NAME)(int idx); \ + _CLC_DECL _CLC_OVERLOAD TYPE##2 __CLC_TABLE_MANGLE(NAME)(int##2 idx); \ + _CLC_DECL _CLC_OVERLOAD TYPE##3 __CLC_TABLE_MANGLE(NAME)(int##3 idx); \ + _CLC_DECL _CLC_OVERLOAD TYPE##4 __CLC_TABLE_MANGLE(NAME)(int##4 idx); \ + _CLC_DECL _CLC_OVERLOAD TYPE##8 __CLC_TABLE_MANGLE(NAME)(int##8 idx); \ + _CLC_DECL _CLC_OVERLOAD TYPE##16 __CLC_TABLE_MANGLE(NAME)(int##16 idx); -#define USE_TABLE(NAME, IDX) TABLE_MANGLE(NAME)(IDX) +#define __CLC_USE_TABLE(NAME, IDX) __CLC_TABLE_MANGLE(NAME)(IDX) -TABLE_FUNCTION_DECL(float2, log2_tbl); -TABLE_FUNCTION_DECL(float2, log10_tbl); +__CLC_TABLE_FUNCTION_DECL(float2, log2_tbl); +__CLC_TABLE_FUNCTION_DECL(float2, log10_tbl); -CLC_TABLE_FUNCTION_DECL(float, log_inv_tbl_ep_head); -CLC_TABLE_FUNCTION_DECL(float, log_inv_tbl_ep_tail); -CLC_TABLE_FUNCTION_DECL(float, loge_tbl_lo); -CLC_TABLE_FUNCTION_DECL(float, loge_tbl_hi); -CLC_TABLE_FUNCTION_DECL(float, log_inv_tbl); -CLC_TABLE_FUNCTION_DECL(float, exp_tbl); -CLC_TABLE_FUNCTION_DECL(float, exp_tbl_ep_head); -CLC_TABLE_FUNCTION_DECL(float, exp_tbl_ep_tail); -CLC_TABLE_FUNCTION_DECL(float, cbrt_tbl_head); -CLC_TABLE_FUNCTION_DECL(float, cbrt_tbl_tail); -CLC_TABLE_FUNCTION_DECL(float, sinhcosh_tbl_head); -CLC_TABLE_FUNCTION_DECL(float, sinhcosh_tbl_tail); -CLC_TABLE_FUNCTION_DECL(ulong, pibits_tbl); +__CLC_TABLE_FUNCTION_DECL_VEC(float, log_inv_tbl_ep_head); +__CLC_TABLE_FUNCTION_DECL_VEC(float, log_inv_tbl_ep_tail); +__CLC_TABLE_FUNCTION_DECL_VEC(float, loge_tbl_lo); +__CLC_TABLE_FUNCTION_DECL_VEC(float, loge_tbl_hi); +__CLC_TABLE_FUNCTION_DECL_VEC(float, log_inv_tbl); +__CLC_TABLE_FUNCTION_DECL_VEC(float, exp_tbl); +__CLC_TABLE_FUNCTION_DECL_VEC(float, exp_tbl_ep_head); +__CLC_TABLE_FUNCTION_DECL_VEC(float, exp_tbl_ep_tail); +__CLC_TABLE_FUNCTION_DECL_VEC(float, cbrt_tbl_head); 
+__CLC_TABLE_FUNCTION_DECL_VEC(float, cbrt_tbl_tail); +__CLC_TABLE_FUNCTION_DECL_VEC(float, sinhcosh_tbl_head); +__CLC_TABLE_FUNCTION_DECL_VEC(float, sinhcosh_tbl_tail); +__CLC_TABLE_FUNCTION_DECL_VEC(ulong, pibits_tbl); #ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64 : enable -CLC_TABLE_FUNCTION_DECL(double, ln_tbl_lo); -CLC_TABLE_FUNCTION_DECL(double, ln_tbl_hi); -CLC_TABLE_FUNCTION_DECL(double, atan_jby256_tbl_head); -CLC_TABLE_FUNCTION_DECL(double, atan_jby256_tbl_tail); -CLC_TABLE_FUNCTION_DECL(double, two_to_jby64_ep_tbl_head); -CLC_TABLE_FUNCTION_DECL(double, two_to_jby64_ep_tbl_tail); -CLC_TABLE_FUNCTION_DECL(double, sinh_tbl_head); -CLC_TABLE_FUNCTION_DECL(double, sinh_tbl_tail); -CLC_TABLE_FUNCTION_DECL(double, cosh_tbl_head); -CLC_TABLE_FUNCTION_DECL(double, cosh_tbl_tail); -CLC_TABLE_FUNCTION_DECL(double, cbrt_inv_tbl); -CLC_TABLE_FUNCTION_DECL(double, cbrt_dbl_tbl_head); -CLC_TABLE_FUNCTION_DECL(double, cbrt_dbl_tbl_tail); -CLC_TABLE_FUNCTION_DECL(double, cbrt_rem_tbl_head); -CLC_TABLE_FUNCTION_DECL(double, cbrt_rem_tbl_tail); -CLC_TABLE_FUNCTION_DECL(double, powlog_tbl_head); -CLC_TABLE_FUNCTION_DECL(double, powlog_tbl_tail); -CLC_TABLE_FUNCTION_DECL(double, log_f_inv_tbl_head); -CLC_TABLE_FUNCTION_DECL(double, log_f_inv_tbl_tail); +__CLC_TABLE_FUNCTION_DECL_VEC(double, ln_tbl_lo); +__CLC_TABLE_FUNCTION_DECL_VEC(double, ln_tbl_hi); +__CLC_TABLE_FUNCTION_DECL_VEC(double, atan_jby256_tbl_head); +__CLC_TABLE_FUNCTION_DECL_VEC(double, atan_jby256_tbl_tail); +__CLC_TABLE_FUNCTION_DECL_VEC(double, two_to_jby64_ep_tbl_head); +__CLC_TABLE_FUNCTION_DECL_VEC(double, two_to_jby64_ep_tbl_tail); +__CLC_TABLE_FUNCTION_DECL_VEC(double, sinh_tbl_head); +__CLC_TABLE_FUNCTION_DECL_VEC(double, sinh_tbl_tail); +__CLC_TABLE_FUNCTION_DECL_VEC(double, cosh_tbl_head); +__CLC_TABLE_FUNCTION_DECL_VEC(double, cosh_tbl_tail); +__CLC_TABLE_FUNCTION_DECL_VEC(double, cbrt_inv_tbl); +__CLC_TABLE_FUNCTION_DECL_VEC(double, cbrt_dbl_tbl_head); 
+__CLC_TABLE_FUNCTION_DECL_VEC(double, cbrt_dbl_tbl_tail); +__CLC_TABLE_FUNCTION_DECL_VEC(double, cbrt_rem_tbl_head); +__CLC_TABLE_FUNCTION_DECL_VEC(double, cbrt_rem_tbl_tail); +__CLC_TABLE_FUNCTION_DECL_VEC(double, powlog_tbl_head); +__CLC_TABLE_FUNCTION_DECL_VEC(double, powlog_tbl_tail); +__CLC_TABLE_FUNCTION_DECL_VEC(double, log_f_inv_tbl_head); +__CLC_TABLE_FUNCTION_DECL_VEC(double, log_f_inv_tbl_tail); #endif // cl_khr_fp64 diff --git a/libclc/clc/include/clc/math/unary_decl.inc b/libclc/clc/include/clc/math/unary_decl.inc index 46108c5fac9e..19f1c8a876e5 100644 --- a/libclc/clc/include/clc/math/unary_decl.inc +++ b/libclc/clc/include/clc/math/unary_decl.inc @@ -6,4 +6,5 @@ // //===----------------------------------------------------------------------===// -_CLC_OVERLOAD _CLC_CONST _CLC_DECL __CLC_GENTYPE FUNCTION(__CLC_GENTYPE x); +_CLC_OVERLOAD _CLC_CONST _CLC_DECL __CLC_GENTYPE +__CLC_FUNCTION(__CLC_GENTYPE x); diff --git a/libclc/clc/include/clc/math/unary_decl_with_int_ptr.inc b/libclc/clc/include/clc/math/unary_decl_with_int_ptr.inc index d62046a5292c..8bfe17c3c681 100644 --- a/libclc/clc/include/clc/math/unary_decl_with_int_ptr.inc +++ b/libclc/clc/include/clc/math/unary_decl_with_int_ptr.inc @@ -6,13 +6,13 @@ // //===----------------------------------------------------------------------===// -_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE FUNCTION(__CLC_GENTYPE x, - global __CLC_INTN *iptr); -_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE FUNCTION(__CLC_GENTYPE x, - local __CLC_INTN *iptr); -_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE FUNCTION(__CLC_GENTYPE x, - private __CLC_INTN *iptr); +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE x, + global __CLC_INTN *iptr); +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE x, + local __CLC_INTN *iptr); +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE x, + private __CLC_INTN *iptr); #if _CLC_GENERIC_AS_SUPPORTED -_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE FUNCTION(__CLC_GENTYPE x, - 
generic __CLC_INTN *iptr); +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE x, + generic __CLC_INTN *iptr); #endif diff --git a/libclc/clc/include/clc/math/unary_decl_with_int_return.inc b/libclc/clc/include/clc/math/unary_decl_with_int_return.inc index 64fcf4267da3..2e86a310b018 100644 --- a/libclc/clc/include/clc/math/unary_decl_with_int_return.inc +++ b/libclc/clc/include/clc/math/unary_decl_with_int_return.inc @@ -6,4 +6,4 @@ // //===----------------------------------------------------------------------===// -_CLC_OVERLOAD _CLC_CONST _CLC_DECL __CLC_INTN FUNCTION(__CLC_GENTYPE x); +_CLC_OVERLOAD _CLC_CONST _CLC_DECL __CLC_INTN __CLC_FUNCTION(__CLC_GENTYPE x); diff --git a/libclc/clc/include/clc/math/unary_decl_with_ptr.inc b/libclc/clc/include/clc/math/unary_decl_with_ptr.inc index ca0077bec558..63a20e3d21c5 100644 --- a/libclc/clc/include/clc/math/unary_decl_with_ptr.inc +++ b/libclc/clc/include/clc/math/unary_decl_with_ptr.inc @@ -6,14 +6,14 @@ // //===----------------------------------------------------------------------===// -_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE FUNCTION(__CLC_GENTYPE x, - global __CLC_GENTYPE *ptr); -_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE FUNCTION(__CLC_GENTYPE x, - local __CLC_GENTYPE *ptr); -_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE FUNCTION(__CLC_GENTYPE x, - private __CLC_GENTYPE *ptr); +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE x, + global __CLC_GENTYPE *ptr); +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE x, + local __CLC_GENTYPE *ptr); +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE +__CLC_FUNCTION(__CLC_GENTYPE x, private __CLC_GENTYPE *ptr); #if _CLC_GENERIC_AS_SUPPORTED -_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE FUNCTION(__CLC_GENTYPE x, - generic __CLC_GENTYPE *ptr); +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE +__CLC_FUNCTION(__CLC_GENTYPE x, generic __CLC_GENTYPE *ptr); #endif diff --git a/libclc/clc/include/clc/math/unary_def_with_int_ptr.inc 
b/libclc/clc/include/clc/math/unary_def_with_int_ptr.inc index a8ed34ad04bb..db6472ad0e64 100644 --- a/libclc/clc/include/clc/math/unary_def_with_int_ptr.inc +++ b/libclc/clc/include/clc/math/unary_def_with_int_ptr.inc @@ -8,28 +8,28 @@ #include -#ifndef __IMPL_FUNCTION -#define __IMPL_FUNCTION(x) __CLC_CONCAT(__clc_, x) +#ifndef __CLC_IMPL_FUNCTION +#define __CLC_IMPL_FUNCTION(x) __CLC_CONCAT(__clc_, x) #endif -_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE FUNCTION(__CLC_GENTYPE x, - private __CLC_INTN *iptr) { - return __IMPL_FUNCTION(FUNCTION)(x, iptr); +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE x, + private __CLC_INTN *iptr) { + return __CLC_IMPL_FUNCTION(__CLC_FUNCTION)(x, iptr); } -_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE FUNCTION(__CLC_GENTYPE x, - global __CLC_INTN *iptr) { - return __IMPL_FUNCTION(FUNCTION)(x, iptr); +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE x, + global __CLC_INTN *iptr) { + return __CLC_IMPL_FUNCTION(__CLC_FUNCTION)(x, iptr); } -_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE FUNCTION(__CLC_GENTYPE x, - local __CLC_INTN *iptr) { - return __IMPL_FUNCTION(FUNCTION)(x, iptr); +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE x, + local __CLC_INTN *iptr) { + return __CLC_IMPL_FUNCTION(__CLC_FUNCTION)(x, iptr); } #if _CLC_DISTINCT_GENERIC_AS_SUPPORTED -_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE FUNCTION(__CLC_GENTYPE x, - generic __CLC_INTN *iptr) { - return __IMPL_FUNCTION(FUNCTION)(x, iptr); +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE x, + generic __CLC_INTN *iptr) { + return __CLC_IMPL_FUNCTION(__CLC_FUNCTION)(x, iptr); } #endif diff --git a/libclc/clc/include/clc/math/unary_def_with_int_return.inc b/libclc/clc/include/clc/math/unary_def_with_int_return.inc index c7bb84cf5f7c..7a47aae8ce63 100644 --- a/libclc/clc/include/clc/math/unary_def_with_int_return.inc +++ b/libclc/clc/include/clc/math/unary_def_with_int_return.inc @@ -8,10 +8,10 @@ #include -#ifndef __IMPL_FUNCTION -#define 
__IMPL_FUNCTION(x) __CLC_CONCAT(__clc_, x) +#ifndef __CLC_IMPL_FUNCTION +#define __CLC_IMPL_FUNCTION(x) __CLC_CONCAT(__clc_, x) #endif -_CLC_OVERLOAD _CLC_DEF __CLC_INTN FUNCTION(__CLC_GENTYPE a) { - return __IMPL_FUNCTION(FUNCTION)(a); +_CLC_OVERLOAD _CLC_DEF __CLC_INTN __CLC_FUNCTION(__CLC_GENTYPE a) { + return __CLC_IMPL_FUNCTION(__CLC_FUNCTION)(a); } diff --git a/libclc/clc/include/clc/math/unary_def_with_ptr.inc b/libclc/clc/include/clc/math/unary_def_with_ptr.inc index c3faa9aab87c..8a490e1bba16 100644 --- a/libclc/clc/include/clc/math/unary_def_with_ptr.inc +++ b/libclc/clc/include/clc/math/unary_def_with_ptr.inc @@ -8,28 +8,28 @@ #include -#ifndef __IMPL_FUNCTION -#define __IMPL_FUNCTION(x) __CLC_CONCAT(__clc_, x) +#ifndef __CLC_IMPL_FUNCTION +#define __CLC_IMPL_FUNCTION(x) __CLC_CONCAT(__clc_, x) #endif -_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE FUNCTION(__CLC_GENTYPE x, - private __CLC_GENTYPE *ptr) { - return __IMPL_FUNCTION(FUNCTION)(x, ptr); +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE +__CLC_FUNCTION(__CLC_GENTYPE x, private __CLC_GENTYPE *ptr) { + return __CLC_IMPL_FUNCTION(__CLC_FUNCTION)(x, ptr); } -_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE FUNCTION(__CLC_GENTYPE x, - global __CLC_GENTYPE *ptr) { - return __IMPL_FUNCTION(FUNCTION)(x, ptr); +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE x, + global __CLC_GENTYPE *ptr) { + return __CLC_IMPL_FUNCTION(__CLC_FUNCTION)(x, ptr); } -_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE FUNCTION(__CLC_GENTYPE x, - local __CLC_GENTYPE *ptr) { - return __IMPL_FUNCTION(FUNCTION)(x, ptr); +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE x, + local __CLC_GENTYPE *ptr) { + return __CLC_IMPL_FUNCTION(__CLC_FUNCTION)(x, ptr); } #if _CLC_DISTINCT_GENERIC_AS_SUPPORTED -_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE FUNCTION(__CLC_GENTYPE x, - generic __CLC_GENTYPE *ptr) { - return __IMPL_FUNCTION(FUNCTION)(x, ptr); +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE +__CLC_FUNCTION(__CLC_GENTYPE x, generic __CLC_GENTYPE *ptr) { + return 
__CLC_IMPL_FUNCTION(__CLC_FUNCTION)(x, ptr); } #endif diff --git a/libclc/clc/include/clc/misc/clc_shuffle.h b/libclc/clc/include/clc/misc/clc_shuffle.h index 1e9ec6828f95..ba3d7a2bddd6 100644 --- a/libclc/clc/include/clc/misc/clc_shuffle.h +++ b/libclc/clc/include/clc/misc/clc_shuffle.h @@ -9,7 +9,7 @@ #ifndef __CLC_MISC_CLC_SHUFFLE_H__ #define __CLC_MISC_CLC_SHUFFLE_H__ -#define FUNCTION __clc_shuffle +#define __CLC_FUNCTION __clc_shuffle // Integer-type decls #define __CLC_BODY @@ -19,6 +19,6 @@ #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MISC_CLC_SHUFFLE_H__ diff --git a/libclc/clc/include/clc/misc/clc_shuffle2.h b/libclc/clc/include/clc/misc/clc_shuffle2.h index fb5361cfeb95..84bafba12b21 100644 --- a/libclc/clc/include/clc/misc/clc_shuffle2.h +++ b/libclc/clc/include/clc/misc/clc_shuffle2.h @@ -9,7 +9,7 @@ #ifndef __CLC_MISC_CLC_SHUFFLE2_H__ #define __CLC_MISC_CLC_SHUFFLE2_H__ -#define FUNCTION __clc_shuffle2 +#define __CLC_FUNCTION __clc_shuffle2 // Integer-type decls #define __CLC_BODY @@ -19,6 +19,6 @@ #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_MISC_CLC_SHUFFLE2_H__ diff --git a/libclc/clc/include/clc/misc/shuffle2_decl.inc b/libclc/clc/include/clc/misc/shuffle2_decl.inc index 3504e7876277..37f83cba9939 100644 --- a/libclc/clc/include/clc/misc/shuffle2_decl.inc +++ b/libclc/clc/include/clc/misc/shuffle2_decl.inc @@ -13,16 +13,16 @@ // size as the mask. Elements in the mask must be the same size (number of bits) // as the input value., e.g. 
char8 ret = shuffle(char2 x, uchar8 mask); _CLC_OVERLOAD _CLC_CONST _CLC_DECL __CLC_GENTYPE -FUNCTION(__CLC_XCONCAT(__CLC_SCALAR_GENTYPE, 2) x, - __CLC_XCONCAT(__CLC_SCALAR_GENTYPE, 2) y, __CLC_U_GENTYPE mask); +__CLC_FUNCTION(__CLC_XCONCAT(__CLC_SCALAR_GENTYPE, 2) x, + __CLC_XCONCAT(__CLC_SCALAR_GENTYPE, 2) y, __CLC_U_GENTYPE mask); _CLC_OVERLOAD _CLC_CONST _CLC_DECL __CLC_GENTYPE -FUNCTION(__CLC_XCONCAT(__CLC_SCALAR_GENTYPE, 4) x, - __CLC_XCONCAT(__CLC_SCALAR_GENTYPE, 4) y, __CLC_U_GENTYPE mask); +__CLC_FUNCTION(__CLC_XCONCAT(__CLC_SCALAR_GENTYPE, 4) x, + __CLC_XCONCAT(__CLC_SCALAR_GENTYPE, 4) y, __CLC_U_GENTYPE mask); _CLC_OVERLOAD _CLC_CONST _CLC_DECL __CLC_GENTYPE -FUNCTION(__CLC_XCONCAT(__CLC_SCALAR_GENTYPE, 8) x, - __CLC_XCONCAT(__CLC_SCALAR_GENTYPE, 8) y, __CLC_U_GENTYPE mask); +__CLC_FUNCTION(__CLC_XCONCAT(__CLC_SCALAR_GENTYPE, 8) x, + __CLC_XCONCAT(__CLC_SCALAR_GENTYPE, 8) y, __CLC_U_GENTYPE mask); _CLC_OVERLOAD _CLC_CONST _CLC_DECL __CLC_GENTYPE -FUNCTION(__CLC_XCONCAT(__CLC_SCALAR_GENTYPE, 16) x, - __CLC_XCONCAT(__CLC_SCALAR_GENTYPE, 16) y, __CLC_U_GENTYPE mask); +__CLC_FUNCTION(__CLC_XCONCAT(__CLC_SCALAR_GENTYPE, 16) x, + __CLC_XCONCAT(__CLC_SCALAR_GENTYPE, 16) y, __CLC_U_GENTYPE mask); #endif diff --git a/libclc/clc/include/clc/misc/shuffle2_def.inc b/libclc/clc/include/clc/misc/shuffle2_def.inc index 0415b8c18729..f25d281af116 100644 --- a/libclc/clc/include/clc/misc/shuffle2_def.inc +++ b/libclc/clc/include/clc/misc/shuffle2_def.inc @@ -11,32 +11,32 @@ #include -#ifndef __IMPL_FUNCTION -#define __IMPL_FUNCTION(x) __CLC_CONCAT(__clc_, x) +#ifndef __CLC_IMPL_FUNCTION +#define __CLC_IMPL_FUNCTION(x) __CLC_CONCAT(__clc_, x) #endif // The return type is same base type as the input type, with the same vector // size as the mask. Elements in the mask must be the same size (number of bits) // as the input value., e.g. 
char8 ret = shuffle(char2 x, uchar8 mask); _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE -FUNCTION(__CLC_XCONCAT(__CLC_SCALAR_GENTYPE, 2) x, - __CLC_XCONCAT(__CLC_SCALAR_GENTYPE, 2) y, __CLC_U_GENTYPE mask) { - return __IMPL_FUNCTION(FUNCTION)(x, y, mask); +__CLC_FUNCTION(__CLC_XCONCAT(__CLC_SCALAR_GENTYPE, 2) x, + __CLC_XCONCAT(__CLC_SCALAR_GENTYPE, 2) y, __CLC_U_GENTYPE mask) { + return __CLC_IMPL_FUNCTION(__CLC_FUNCTION)(x, y, mask); } _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE -FUNCTION(__CLC_XCONCAT(__CLC_SCALAR_GENTYPE, 4) x, - __CLC_XCONCAT(__CLC_SCALAR_GENTYPE, 4) y, __CLC_U_GENTYPE mask) { - return __IMPL_FUNCTION(FUNCTION)(x, y, mask); +__CLC_FUNCTION(__CLC_XCONCAT(__CLC_SCALAR_GENTYPE, 4) x, + __CLC_XCONCAT(__CLC_SCALAR_GENTYPE, 4) y, __CLC_U_GENTYPE mask) { + return __CLC_IMPL_FUNCTION(__CLC_FUNCTION)(x, y, mask); } _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE -FUNCTION(__CLC_XCONCAT(__CLC_SCALAR_GENTYPE, 8) x, - __CLC_XCONCAT(__CLC_SCALAR_GENTYPE, 8) y, __CLC_U_GENTYPE mask) { - return __IMPL_FUNCTION(FUNCTION)(x, y, mask); +__CLC_FUNCTION(__CLC_XCONCAT(__CLC_SCALAR_GENTYPE, 8) x, + __CLC_XCONCAT(__CLC_SCALAR_GENTYPE, 8) y, __CLC_U_GENTYPE mask) { + return __CLC_IMPL_FUNCTION(__CLC_FUNCTION)(x, y, mask); } -_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE -FUNCTION(__CLC_XCONCAT(__CLC_SCALAR_GENTYPE, 16) x, - __CLC_XCONCAT(__CLC_SCALAR_GENTYPE, 16) y, __CLC_U_GENTYPE mask) { - return __IMPL_FUNCTION(FUNCTION)(x, y, mask); +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION( + __CLC_XCONCAT(__CLC_SCALAR_GENTYPE, 16) x, + __CLC_XCONCAT(__CLC_SCALAR_GENTYPE, 16) y, __CLC_U_GENTYPE mask) { + return __CLC_IMPL_FUNCTION(__CLC_FUNCTION)(x, y, mask); } #endif diff --git a/libclc/clc/include/clc/misc/shuffle_decl.inc b/libclc/clc/include/clc/misc/shuffle_decl.inc index 1445aaf74052..0056d12f0f42 100644 --- a/libclc/clc/include/clc/misc/shuffle_decl.inc +++ b/libclc/clc/include/clc/misc/shuffle_decl.inc @@ -13,12 +13,12 @@ // size as the mask. 
Elements in the mask must be the same size (number of bits) // as the input value., e.g. char8 ret = shuffle(char2 x, uchar8 mask); _CLC_OVERLOAD _CLC_CONST _CLC_DECL __CLC_GENTYPE -FUNCTION(__CLC_XCONCAT(__CLC_SCALAR_GENTYPE, 2) x, __CLC_U_GENTYPE mask); +__CLC_FUNCTION(__CLC_XCONCAT(__CLC_SCALAR_GENTYPE, 2) x, __CLC_U_GENTYPE mask); _CLC_OVERLOAD _CLC_CONST _CLC_DECL __CLC_GENTYPE -FUNCTION(__CLC_XCONCAT(__CLC_SCALAR_GENTYPE, 4) x, __CLC_U_GENTYPE mask); +__CLC_FUNCTION(__CLC_XCONCAT(__CLC_SCALAR_GENTYPE, 4) x, __CLC_U_GENTYPE mask); _CLC_OVERLOAD _CLC_CONST _CLC_DECL __CLC_GENTYPE -FUNCTION(__CLC_XCONCAT(__CLC_SCALAR_GENTYPE, 8) x, __CLC_U_GENTYPE mask); +__CLC_FUNCTION(__CLC_XCONCAT(__CLC_SCALAR_GENTYPE, 8) x, __CLC_U_GENTYPE mask); _CLC_OVERLOAD _CLC_CONST _CLC_DECL __CLC_GENTYPE -FUNCTION(__CLC_XCONCAT(__CLC_SCALAR_GENTYPE, 16) x, __CLC_U_GENTYPE mask); +__CLC_FUNCTION(__CLC_XCONCAT(__CLC_SCALAR_GENTYPE, 16) x, __CLC_U_GENTYPE mask); #endif diff --git a/libclc/clc/include/clc/misc/shuffle_def.inc b/libclc/clc/include/clc/misc/shuffle_def.inc index d2a088498fc6..49a47daf821f 100644 --- a/libclc/clc/include/clc/misc/shuffle_def.inc +++ b/libclc/clc/include/clc/misc/shuffle_def.inc @@ -11,28 +11,28 @@ #include -#ifndef __IMPL_FUNCTION -#define __IMPL_FUNCTION(x) __CLC_CONCAT(__clc_, x) +#ifndef __CLC_IMPL_FUNCTION +#define __CLC_IMPL_FUNCTION(x) __CLC_CONCAT(__clc_, x) #endif // The return type is same base type as the input type, with the same vector // size as the mask. Elements in the mask must be the same size (number of bits) // as the input value., e.g. 
char8 ret = shuffle(char2 x, uchar8 mask); _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE -FUNCTION(__CLC_XCONCAT(__CLC_SCALAR_GENTYPE, 2) x, __CLC_U_GENTYPE mask) { - return __IMPL_FUNCTION(FUNCTION)(x, mask); +__CLC_FUNCTION(__CLC_XCONCAT(__CLC_SCALAR_GENTYPE, 2) x, __CLC_U_GENTYPE mask) { + return __CLC_IMPL_FUNCTION(__CLC_FUNCTION)(x, mask); } _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE -FUNCTION(__CLC_XCONCAT(__CLC_SCALAR_GENTYPE, 4) x, __CLC_U_GENTYPE mask) { - return __IMPL_FUNCTION(FUNCTION)(x, mask); +__CLC_FUNCTION(__CLC_XCONCAT(__CLC_SCALAR_GENTYPE, 4) x, __CLC_U_GENTYPE mask) { + return __CLC_IMPL_FUNCTION(__CLC_FUNCTION)(x, mask); } _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE -FUNCTION(__CLC_XCONCAT(__CLC_SCALAR_GENTYPE, 8) x, __CLC_U_GENTYPE mask) { - return __IMPL_FUNCTION(FUNCTION)(x, mask); +__CLC_FUNCTION(__CLC_XCONCAT(__CLC_SCALAR_GENTYPE, 8) x, __CLC_U_GENTYPE mask) { + return __CLC_IMPL_FUNCTION(__CLC_FUNCTION)(x, mask); } -_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE -FUNCTION(__CLC_XCONCAT(__CLC_SCALAR_GENTYPE, 16) x, __CLC_U_GENTYPE mask) { - return __IMPL_FUNCTION(FUNCTION)(x, mask); +_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION( + __CLC_XCONCAT(__CLC_SCALAR_GENTYPE, 16) x, __CLC_U_GENTYPE mask) { + return __CLC_IMPL_FUNCTION(__CLC_FUNCTION)(x, mask); } #endif diff --git a/libclc/clc/include/clc/relational/binary_decl.inc b/libclc/clc/include/clc/relational/binary_decl.inc index 87cbc8c37cdf..34749559c7e0 100644 --- a/libclc/clc/include/clc/relational/binary_decl.inc +++ b/libclc/clc/include/clc/relational/binary_decl.inc @@ -7,12 +7,12 @@ //===----------------------------------------------------------------------===// #if __CLC_VECSIZE_OR_1 == 1 -#define __RETTYPE __CLC_INTN +#define __CLC_RETTYPE __CLC_INTN #else -#define __RETTYPE __CLC_BIT_INTN +#define __CLC_RETTYPE __CLC_BIT_INTN #endif -_CLC_OVERLOAD _CLC_CONST _CLC_DECL __RETTYPE FUNCTION(__CLC_GENTYPE a, - __CLC_GENTYPE b); +_CLC_OVERLOAD _CLC_CONST _CLC_DECL __CLC_RETTYPE 
+__CLC_FUNCTION(__CLC_GENTYPE a, __CLC_GENTYPE b); -#undef __RETTYPE +#undef __CLC_RETTYPE diff --git a/libclc/clc/include/clc/relational/clc_isfinite.h b/libclc/clc/include/clc/relational/clc_isfinite.h index 444d73303981..596b81f51b85 100644 --- a/libclc/clc/include/clc/relational/clc_isfinite.h +++ b/libclc/clc/include/clc/relational/clc_isfinite.h @@ -9,11 +9,11 @@ #ifndef __CLC_RELATIONAL_CLC_ISFINITE_H__ #define __CLC_RELATIONAL_CLC_ISFINITE_H__ -#define FUNCTION __clc_isfinite +#define __CLC_FUNCTION __clc_isfinite #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_RELATIONAL_CLC_ISFINITE_H__ diff --git a/libclc/clc/include/clc/relational/clc_isgreater.h b/libclc/clc/include/clc/relational/clc_isgreater.h index 88de46854961..fc5b3cdad084 100644 --- a/libclc/clc/include/clc/relational/clc_isgreater.h +++ b/libclc/clc/include/clc/relational/clc_isgreater.h @@ -9,11 +9,11 @@ #ifndef __CLC_RELATIONAL_CLC_ISGREATER_H__ #define __CLC_RELATIONAL_CLC_ISGREATER_H__ -#define FUNCTION __clc_isgreater +#define __CLC_FUNCTION __clc_isgreater #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_RELATIONAL_CLC_ISGREATER_H__ diff --git a/libclc/clc/include/clc/relational/clc_isgreaterequal.h b/libclc/clc/include/clc/relational/clc_isgreaterequal.h index 42308036f102..6209a066cf46 100644 --- a/libclc/clc/include/clc/relational/clc_isgreaterequal.h +++ b/libclc/clc/include/clc/relational/clc_isgreaterequal.h @@ -9,11 +9,11 @@ #ifndef __CLC_RELATIONAL_CLC_ISGREATEREQUAL_H__ #define __CLC_RELATIONAL_CLC_ISGREATEREQUAL_H__ -#define FUNCTION __clc_isgreaterequal +#define __CLC_FUNCTION __clc_isgreaterequal #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_RELATIONAL_CLC_ISGREATEREQUAL_H__ diff --git a/libclc/clc/include/clc/relational/clc_isless.h b/libclc/clc/include/clc/relational/clc_isless.h index 6fdc6c54947c..4e340db12177 100644 --- 
a/libclc/clc/include/clc/relational/clc_isless.h +++ b/libclc/clc/include/clc/relational/clc_isless.h @@ -9,11 +9,11 @@ #ifndef __CLC_RELATIONAL_CLC_ISLESS_H__ #define __CLC_RELATIONAL_CLC_ISLESS_H__ -#define FUNCTION __clc_isless +#define __CLC_FUNCTION __clc_isless #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_RELATIONAL_CLC_ISLESS_H__ diff --git a/libclc/clc/include/clc/relational/clc_islessequal.h b/libclc/clc/include/clc/relational/clc_islessequal.h index e592287b2309..14c5a1030183 100644 --- a/libclc/clc/include/clc/relational/clc_islessequal.h +++ b/libclc/clc/include/clc/relational/clc_islessequal.h @@ -9,11 +9,11 @@ #ifndef __CLC_RELATIONAL_CLC_ISLESSEQUAL_H__ #define __CLC_RELATIONAL_CLC_ISLESSEQUAL_H__ -#define FUNCTION __clc_islessequal +#define __CLC_FUNCTION __clc_islessequal #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_RELATIONAL_CLC_ISLESSEQUAL_H__ diff --git a/libclc/clc/include/clc/relational/clc_islessgreater.h b/libclc/clc/include/clc/relational/clc_islessgreater.h index a2f10707a677..530564bbf059 100644 --- a/libclc/clc/include/clc/relational/clc_islessgreater.h +++ b/libclc/clc/include/clc/relational/clc_islessgreater.h @@ -9,11 +9,11 @@ #ifndef __CLC_RELATIONAL_CLC_ISLESSGREATER_H__ #define __CLC_RELATIONAL_CLC_ISLESSGREATER_H__ -#define FUNCTION __clc_islessgreater +#define __CLC_FUNCTION __clc_islessgreater #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_RELATIONAL_CLC_ISLESSGREATER_H__ diff --git a/libclc/clc/include/clc/relational/clc_isnormal.h b/libclc/clc/include/clc/relational/clc_isnormal.h index 2281bc4245d0..9572b0200468 100644 --- a/libclc/clc/include/clc/relational/clc_isnormal.h +++ b/libclc/clc/include/clc/relational/clc_isnormal.h @@ -9,11 +9,11 @@ #ifndef __CLC_RELATIONAL_CLC_ISNORMAL_H__ #define __CLC_RELATIONAL_CLC_ISNORMAL_H__ -#define FUNCTION __clc_isnormal +#define __CLC_FUNCTION __clc_isnormal #define 
__CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_RELATIONAL_CLC_ISNORMAL_H__ diff --git a/libclc/clc/include/clc/relational/clc_isnotequal.h b/libclc/clc/include/clc/relational/clc_isnotequal.h index c2640fc0899a..f90622db9fe1 100644 --- a/libclc/clc/include/clc/relational/clc_isnotequal.h +++ b/libclc/clc/include/clc/relational/clc_isnotequal.h @@ -9,11 +9,11 @@ #ifndef __CLC_RELATIONAL_CLC_ISNOTEQUAL_H__ #define __CLC_RELATIONAL_CLC_ISNOTEQUAL_H__ -#define FUNCTION __clc_isnotequal +#define __CLC_FUNCTION __clc_isnotequal #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_RELATIONAL_CLC_ISNOTEQUAL_H__ diff --git a/libclc/clc/include/clc/relational/clc_isordered.h b/libclc/clc/include/clc/relational/clc_isordered.h index cb9be3131157..3f5cc097f4a7 100644 --- a/libclc/clc/include/clc/relational/clc_isordered.h +++ b/libclc/clc/include/clc/relational/clc_isordered.h @@ -9,11 +9,11 @@ #ifndef __CLC_RELATIONAL_CLC_ISORDERED_H__ #define __CLC_RELATIONAL_CLC_ISORDERED_H__ -#define FUNCTION __clc_isordered +#define __CLC_FUNCTION __clc_isordered #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_RELATIONAL_CLC_ISORDERED_H__ diff --git a/libclc/clc/include/clc/relational/clc_isunordered.h b/libclc/clc/include/clc/relational/clc_isunordered.h index 36d314ff0e1b..6134ba849d22 100644 --- a/libclc/clc/include/clc/relational/clc_isunordered.h +++ b/libclc/clc/include/clc/relational/clc_isunordered.h @@ -9,11 +9,11 @@ #ifndef __CLC_RELATIONAL_CLC_ISUNORDERED_H__ #define __CLC_RELATIONAL_CLC_ISUNORDERED_H__ -#define FUNCTION __clc_isunordered +#define __CLC_FUNCTION __clc_isunordered #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_RELATIONAL_CLC_ISUNORDERED_H__ diff --git a/libclc/clc/include/clc/relational/clc_signbit.h b/libclc/clc/include/clc/relational/clc_signbit.h index 9e423ab44895..a8e04ce494f2 100644 --- 
a/libclc/clc/include/clc/relational/clc_signbit.h +++ b/libclc/clc/include/clc/relational/clc_signbit.h @@ -9,11 +9,11 @@ #ifndef __CLC_RELATIONAL_CLC_SIGNBIT_H__ #define __CLC_RELATIONAL_CLC_SIGNBIT_H__ -#define FUNCTION __clc_signbit +#define __CLC_FUNCTION __clc_signbit #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_RELATIONAL_CLC_SIGNBIT_H__ diff --git a/libclc/clc/include/clc/relational/relational.h b/libclc/clc/include/clc/relational/relational.h index e2b577019a50..fa2223234994 100644 --- a/libclc/clc/include/clc/relational/relational.h +++ b/libclc/clc/include/clc/relational/relational.h @@ -14,29 +14,29 @@ * when the result is true. */ -#define _CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(RET_TYPE, RET_TYPE_VEC, FUNCTION, \ - ARG1_TYPE, ARG2_TYPE) \ - _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG1_TYPE x, ARG2_TYPE y) { \ +#define _CLC_DEFINE_SIMPLE_RELATIONAL_BINARY( \ + RET_TYPE, RET_TYPE_VEC, __CLC_FUNCTION, ARG1_TYPE, ARG2_TYPE) \ + _CLC_DEF _CLC_OVERLOAD RET_TYPE __CLC_FUNCTION(ARG1_TYPE x, ARG2_TYPE y) { \ return _CLC_RELATIONAL_OP(x, y); \ } \ - _CLC_DEF _CLC_OVERLOAD RET_TYPE_VEC##2 FUNCTION(ARG1_TYPE##2 x, \ - ARG2_TYPE##2 y) { \ + _CLC_DEF _CLC_OVERLOAD RET_TYPE_VEC##2 __CLC_FUNCTION(ARG1_TYPE##2 x, \ + ARG2_TYPE##2 y) { \ return _CLC_RELATIONAL_OP(x, y); \ } \ - _CLC_DEF _CLC_OVERLOAD RET_TYPE_VEC##3 FUNCTION(ARG1_TYPE##3 x, \ - ARG2_TYPE##3 y) { \ + _CLC_DEF _CLC_OVERLOAD RET_TYPE_VEC##3 __CLC_FUNCTION(ARG1_TYPE##3 x, \ + ARG2_TYPE##3 y) { \ return _CLC_RELATIONAL_OP(x, y); \ } \ - _CLC_DEF _CLC_OVERLOAD RET_TYPE_VEC##4 FUNCTION(ARG1_TYPE##4 x, \ - ARG2_TYPE##4 y) { \ + _CLC_DEF _CLC_OVERLOAD RET_TYPE_VEC##4 __CLC_FUNCTION(ARG1_TYPE##4 x, \ + ARG2_TYPE##4 y) { \ return _CLC_RELATIONAL_OP(x, y); \ } \ - _CLC_DEF _CLC_OVERLOAD RET_TYPE_VEC##8 FUNCTION(ARG1_TYPE##8 x, \ - ARG2_TYPE##8 y) { \ + _CLC_DEF _CLC_OVERLOAD RET_TYPE_VEC##8 __CLC_FUNCTION(ARG1_TYPE##8 x, \ + ARG2_TYPE##8 y) { \ return _CLC_RELATIONAL_OP(x, y); \ 
} \ - _CLC_DEF _CLC_OVERLOAD RET_TYPE_VEC##16 FUNCTION(ARG1_TYPE##16 x, \ - ARG2_TYPE##16 y) { \ + _CLC_DEF _CLC_OVERLOAD RET_TYPE_VEC##16 __CLC_FUNCTION(ARG1_TYPE##16 x, \ + ARG2_TYPE##16 y) { \ return _CLC_RELATIONAL_OP(x, y); \ } @@ -49,20 +49,25 @@ (__FPCLASS_NEGNORMAL | __FPCLASS_NEGSUBNORMAL | __FPCLASS_NEGZERO) #define fcFinite (fcPosFinite | fcNegFinite) -#define _CLC_DEFINE_ISFPCLASS_VEC(RET_TYPE, FUNCTION, MASK, ARG_TYPE) \ - _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG_TYPE x) { \ +#define _CLC_DEFINE_ISFPCLASS_VEC(RET_TYPE, __CLC_FUNCTION, MASK, ARG_TYPE) \ + _CLC_DEF _CLC_OVERLOAD RET_TYPE __CLC_FUNCTION(ARG_TYPE x) { \ return (RET_TYPE)(__builtin_isfpclass(x, (MASK)) != (RET_TYPE)0); \ } -#define _CLC_DEFINE_ISFPCLASS(RET_TYPE, VEC_RET_TYPE, FUNCTION, MASK, \ +#define _CLC_DEFINE_ISFPCLASS(RET_TYPE, VEC_RET_TYPE, __CLC_FUNCTION, MASK, \ ARG_TYPE) \ - _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG_TYPE x) { \ + _CLC_DEF _CLC_OVERLOAD RET_TYPE __CLC_FUNCTION(ARG_TYPE x) { \ return __builtin_isfpclass(x, (MASK)); \ } \ - _CLC_DEFINE_ISFPCLASS_VEC(VEC_RET_TYPE##2, FUNCTION, MASK, ARG_TYPE##2) \ - _CLC_DEFINE_ISFPCLASS_VEC(VEC_RET_TYPE##3, FUNCTION, MASK, ARG_TYPE##3) \ - _CLC_DEFINE_ISFPCLASS_VEC(VEC_RET_TYPE##4, FUNCTION, MASK, ARG_TYPE##4) \ - _CLC_DEFINE_ISFPCLASS_VEC(VEC_RET_TYPE##8, FUNCTION, MASK, ARG_TYPE##8) \ - _CLC_DEFINE_ISFPCLASS_VEC(VEC_RET_TYPE##16, FUNCTION, MASK, ARG_TYPE##16) + _CLC_DEFINE_ISFPCLASS_VEC(VEC_RET_TYPE##2, __CLC_FUNCTION, MASK, \ + ARG_TYPE##2) \ + _CLC_DEFINE_ISFPCLASS_VEC(VEC_RET_TYPE##3, __CLC_FUNCTION, MASK, \ + ARG_TYPE##3) \ + _CLC_DEFINE_ISFPCLASS_VEC(VEC_RET_TYPE##4, __CLC_FUNCTION, MASK, \ + ARG_TYPE##4) \ + _CLC_DEFINE_ISFPCLASS_VEC(VEC_RET_TYPE##8, __CLC_FUNCTION, MASK, \ + ARG_TYPE##8) \ + _CLC_DEFINE_ISFPCLASS_VEC(VEC_RET_TYPE##16, __CLC_FUNCTION, MASK, \ + ARG_TYPE##16) #endif // __CLC_RELATIONAL_RELATIONAL_H__ diff --git a/libclc/clc/include/clc/relational/unary_decl.inc 
b/libclc/clc/include/clc/relational/unary_decl.inc index f8123eee316c..df7419404025 100644 --- a/libclc/clc/include/clc/relational/unary_decl.inc +++ b/libclc/clc/include/clc/relational/unary_decl.inc @@ -7,11 +7,12 @@ //===----------------------------------------------------------------------===// #if __CLC_VECSIZE_OR_1 == 1 -#define __RETTYPE __CLC_INTN +#define __CLC_RETTYPE __CLC_INTN #else -#define __RETTYPE __CLC_BIT_INTN +#define __CLC_RETTYPE __CLC_BIT_INTN #endif -_CLC_OVERLOAD _CLC_CONST _CLC_DECL __RETTYPE FUNCTION(__CLC_GENTYPE x); +_CLC_OVERLOAD _CLC_CONST _CLC_DECL __CLC_RETTYPE +__CLC_FUNCTION(__CLC_GENTYPE x); -#undef __RETTYPE +#undef __CLC_RETTYPE diff --git a/libclc/clc/include/clc/shared/binary_decl.inc b/libclc/clc/include/clc/shared/binary_decl.inc index c6e2b2a5b060..f7f49519f6fc 100644 --- a/libclc/clc/include/clc/shared/binary_decl.inc +++ b/libclc/clc/include/clc/shared/binary_decl.inc @@ -6,5 +6,5 @@ // //===----------------------------------------------------------------------===// -_CLC_OVERLOAD _CLC_CONST _CLC_DECL __CLC_GENTYPE FUNCTION(__CLC_GENTYPE x, - __CLC_GENTYPE y); +_CLC_OVERLOAD _CLC_CONST _CLC_DECL __CLC_GENTYPE +__CLC_FUNCTION(__CLC_GENTYPE x, __CLC_GENTYPE y); diff --git a/libclc/clc/include/clc/shared/binary_decl_with_int_second_arg.inc b/libclc/clc/include/clc/shared/binary_decl_with_int_second_arg.inc index a091eb8e8d1d..25cf52c84811 100644 --- a/libclc/clc/include/clc/shared/binary_decl_with_int_second_arg.inc +++ b/libclc/clc/include/clc/shared/binary_decl_with_int_second_arg.inc @@ -8,5 +8,5 @@ #include -_CLC_OVERLOAD _CLC_CONST _CLC_DECL __CLC_GENTYPE FUNCTION(__CLC_GENTYPE x, - __CLC_INTN y); +_CLC_OVERLOAD _CLC_CONST _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE x, + __CLC_INTN y); diff --git a/libclc/clc/include/clc/shared/binary_def.inc b/libclc/clc/include/clc/shared/binary_def.inc index 9ab5a7c5a79e..5487ad926768 100644 --- a/libclc/clc/include/clc/shared/binary_def.inc +++ 
b/libclc/clc/include/clc/shared/binary_def.inc @@ -8,11 +8,11 @@ #include -#ifndef __IMPL_FUNCTION -#define __IMPL_FUNCTION(x) __CLC_CONCAT(__clc_, x) +#ifndef __CLC_IMPL_FUNCTION +#define __CLC_IMPL_FUNCTION(x) __CLC_CONCAT(__clc_, x) #endif -_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE FUNCTION(__CLC_GENTYPE a, - __CLC_GENTYPE b) { - return __IMPL_FUNCTION(FUNCTION)(a, b); +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE a, + __CLC_GENTYPE b) { + return __CLC_IMPL_FUNCTION(__CLC_FUNCTION)(a, b); } diff --git a/libclc/clc/include/clc/shared/binary_def_scalarize.inc b/libclc/clc/include/clc/shared/binary_def_scalarize.inc index 53d4d3fa23b1..2505f2ee0da2 100644 --- a/libclc/clc/include/clc/shared/binary_def_scalarize.inc +++ b/libclc/clc/include/clc/shared/binary_def_scalarize.inc @@ -14,8 +14,8 @@ #define __CLC_MIN_VECSIZE 2 #endif -#ifndef __IMPL_FUNCTION -#define __IMPL_FUNCTION FUNCTION +#ifndef __CLC_IMPL_FUNCTION +#define __CLC_IMPL_FUNCTION __CLC_FUNCTION #endif #ifndef __CLC_DEF_SPEC @@ -35,9 +35,9 @@ #endif #if __CLC_MIN_VECSIZE == 1 -_CLC_OVERLOAD __CLC_DEF_SPEC __CLC_RET_TYPE FUNCTION(__CLC_ARG1_TYPE x, - __CLC_ARG2_TYPE y) { - return __IMPL_FUNCTION(x, y); +_CLC_OVERLOAD __CLC_DEF_SPEC __CLC_RET_TYPE __CLC_FUNCTION(__CLC_ARG1_TYPE x, + __CLC_ARG2_TYPE y) { + return __CLC_IMPL_FUNCTION(x, y); } #endif // __CLC_MIN_VECSIZE == 1 @@ -45,10 +45,10 @@ _CLC_OVERLOAD __CLC_DEF_SPEC __CLC_RET_TYPE FUNCTION(__CLC_ARG1_TYPE x, #define __CLC_RET_TYPE2 __CLC_XCONCAT(__CLC_RET_TYPE, 2) #define __CLC_ARG1_TYPE2 __CLC_XCONCAT(__CLC_ARG1_TYPE, 2) #define __CLC_ARG2_TYPE2 __CLC_XCONCAT(__CLC_ARG2_TYPE, 2) -_CLC_OVERLOAD __CLC_DEF_SPEC __CLC_RET_TYPE2 FUNCTION(__CLC_ARG1_TYPE2 x, - __CLC_ARG2_TYPE2 y) { - return (__CLC_RET_TYPE2)(__IMPL_FUNCTION(x.s0, y.s0), - __IMPL_FUNCTION(x.s1, y.s1)); +_CLC_OVERLOAD __CLC_DEF_SPEC __CLC_RET_TYPE2 +__CLC_FUNCTION(__CLC_ARG1_TYPE2 x, __CLC_ARG2_TYPE2 y) { + return (__CLC_RET_TYPE2)(__CLC_IMPL_FUNCTION(x.s0, y.s0), + 
__CLC_IMPL_FUNCTION(x.s1, y.s1)); } #undef __CLC_RET_TYPE2 #undef __CLC_ARG1_TYPE2 @@ -62,11 +62,11 @@ _CLC_OVERLOAD __CLC_DEF_SPEC __CLC_RET_TYPE2 FUNCTION(__CLC_ARG1_TYPE2 x, #define __CLC_RET_TYPE3 __CLC_XCONCAT(__CLC_RET_TYPE, 3) #define __CLC_ARG1_TYPE3 __CLC_XCONCAT(__CLC_ARG1_TYPE, 3) #define __CLC_ARG2_TYPE3 __CLC_XCONCAT(__CLC_ARG2_TYPE, 3) -_CLC_OVERLOAD __CLC_DEF_SPEC __CLC_RET_TYPE3 FUNCTION(__CLC_ARG1_TYPE3 x, - __CLC_ARG2_TYPE3 y) { - return (__CLC_RET_TYPE3)(__IMPL_FUNCTION(x.s0, y.s0), - __IMPL_FUNCTION(x.s1, y.s1), - __IMPL_FUNCTION(x.s2, y.s2)); +_CLC_OVERLOAD __CLC_DEF_SPEC __CLC_RET_TYPE3 +__CLC_FUNCTION(__CLC_ARG1_TYPE3 x, __CLC_ARG2_TYPE3 y) { + return (__CLC_RET_TYPE3)(__CLC_IMPL_FUNCTION(x.s0, y.s0), + __CLC_IMPL_FUNCTION(x.s1, y.s1), + __CLC_IMPL_FUNCTION(x.s2, y.s2)); } #undef __CLC_RET_TYPE3 #undef __CLC_ARG1_TYPE3 @@ -75,12 +75,12 @@ _CLC_OVERLOAD __CLC_DEF_SPEC __CLC_RET_TYPE3 FUNCTION(__CLC_ARG1_TYPE3 x, #define __CLC_RET_TYPE4 __CLC_XCONCAT(__CLC_RET_TYPE, 4) #define __CLC_ARG1_TYPE4 __CLC_XCONCAT(__CLC_ARG1_TYPE, 4) #define __CLC_ARG2_TYPE4 __CLC_XCONCAT(__CLC_ARG2_TYPE, 4) -_CLC_OVERLOAD __CLC_DEF_SPEC __CLC_RET_TYPE4 FUNCTION(__CLC_ARG1_TYPE4 x, - __CLC_ARG2_TYPE4 y) { - return (__CLC_RET_TYPE4)(__IMPL_FUNCTION(x.s0, y.s0), - __IMPL_FUNCTION(x.s1, y.s1), - __IMPL_FUNCTION(x.s2, y.s2), - __IMPL_FUNCTION(x.s3, y.s3)); +_CLC_OVERLOAD __CLC_DEF_SPEC __CLC_RET_TYPE4 +__CLC_FUNCTION(__CLC_ARG1_TYPE4 x, __CLC_ARG2_TYPE4 y) { + return (__CLC_RET_TYPE4)(__CLC_IMPL_FUNCTION(x.s0, y.s0), + __CLC_IMPL_FUNCTION(x.s1, y.s1), + __CLC_IMPL_FUNCTION(x.s2, y.s2), + __CLC_IMPL_FUNCTION(x.s3, y.s3)); } #undef __CLC_RET_TYPE4 #undef __CLC_ARG1_TYPE4 @@ -89,14 +89,16 @@ _CLC_OVERLOAD __CLC_DEF_SPEC __CLC_RET_TYPE4 FUNCTION(__CLC_ARG1_TYPE4 x, #define __CLC_RET_TYPE8 __CLC_XCONCAT(__CLC_RET_TYPE, 8) #define __CLC_ARG1_TYPE8 __CLC_XCONCAT(__CLC_ARG1_TYPE, 8) #define __CLC_ARG2_TYPE8 __CLC_XCONCAT(__CLC_ARG2_TYPE, 8) -_CLC_OVERLOAD __CLC_DEF_SPEC 
__CLC_RET_TYPE8 FUNCTION(__CLC_ARG1_TYPE8 x, - __CLC_ARG2_TYPE8 y) { - return ( - __CLC_RET_TYPE8)(__IMPL_FUNCTION(x.s0, y.s0), __IMPL_FUNCTION(x.s1, y.s1), - __IMPL_FUNCTION(x.s2, y.s2), __IMPL_FUNCTION(x.s3, y.s3), - __IMPL_FUNCTION(x.s4, y.s4), __IMPL_FUNCTION(x.s5, y.s5), - __IMPL_FUNCTION(x.s6, y.s6), - __IMPL_FUNCTION(x.s7, y.s7)); +_CLC_OVERLOAD __CLC_DEF_SPEC __CLC_RET_TYPE8 +__CLC_FUNCTION(__CLC_ARG1_TYPE8 x, __CLC_ARG2_TYPE8 y) { + return (__CLC_RET_TYPE8)(__CLC_IMPL_FUNCTION(x.s0, y.s0), + __CLC_IMPL_FUNCTION(x.s1, y.s1), + __CLC_IMPL_FUNCTION(x.s2, y.s2), + __CLC_IMPL_FUNCTION(x.s3, y.s3), + __CLC_IMPL_FUNCTION(x.s4, y.s4), + __CLC_IMPL_FUNCTION(x.s5, y.s5), + __CLC_IMPL_FUNCTION(x.s6, y.s6), + __CLC_IMPL_FUNCTION(x.s7, y.s7)); } #undef __CLC_RET_TYPE8 #undef __CLC_ARG1_TYPE8 @@ -105,24 +107,24 @@ _CLC_OVERLOAD __CLC_DEF_SPEC __CLC_RET_TYPE8 FUNCTION(__CLC_ARG1_TYPE8 x, #define __CLC_RET_TYPE16 __CLC_XCONCAT(__CLC_RET_TYPE, 16) #define __CLC_ARG1_TYPE16 __CLC_XCONCAT(__CLC_ARG1_TYPE, 16) #define __CLC_ARG2_TYPE16 __CLC_XCONCAT(__CLC_ARG2_TYPE, 16) -_CLC_OVERLOAD __CLC_DEF_SPEC __CLC_RET_TYPE16 FUNCTION(__CLC_ARG1_TYPE16 x, - __CLC_ARG2_TYPE16 y) { - return (__CLC_RET_TYPE16)(__IMPL_FUNCTION(x.s0, y.s0), - __IMPL_FUNCTION(x.s1, y.s1), - __IMPL_FUNCTION(x.s2, y.s2), - __IMPL_FUNCTION(x.s3, y.s3), - __IMPL_FUNCTION(x.s4, y.s4), - __IMPL_FUNCTION(x.s5, y.s5), - __IMPL_FUNCTION(x.s6, y.s6), - __IMPL_FUNCTION(x.s7, y.s7), - __IMPL_FUNCTION(x.s8, y.s8), - __IMPL_FUNCTION(x.s9, y.s9), - __IMPL_FUNCTION(x.sa, y.sa), - __IMPL_FUNCTION(x.sb, y.sb), - __IMPL_FUNCTION(x.sc, y.sc), - __IMPL_FUNCTION(x.sd, y.sd), - __IMPL_FUNCTION(x.se, y.se), - __IMPL_FUNCTION(x.sf, y.sf)); +_CLC_OVERLOAD __CLC_DEF_SPEC __CLC_RET_TYPE16 +__CLC_FUNCTION(__CLC_ARG1_TYPE16 x, __CLC_ARG2_TYPE16 y) { + return (__CLC_RET_TYPE16)(__CLC_IMPL_FUNCTION(x.s0, y.s0), + __CLC_IMPL_FUNCTION(x.s1, y.s1), + __CLC_IMPL_FUNCTION(x.s2, y.s2), + __CLC_IMPL_FUNCTION(x.s3, y.s3), + 
__CLC_IMPL_FUNCTION(x.s4, y.s4), + __CLC_IMPL_FUNCTION(x.s5, y.s5), + __CLC_IMPL_FUNCTION(x.s6, y.s6), + __CLC_IMPL_FUNCTION(x.s7, y.s7), + __CLC_IMPL_FUNCTION(x.s8, y.s8), + __CLC_IMPL_FUNCTION(x.s9, y.s9), + __CLC_IMPL_FUNCTION(x.sa, y.sa), + __CLC_IMPL_FUNCTION(x.sb, y.sb), + __CLC_IMPL_FUNCTION(x.sc, y.sc), + __CLC_IMPL_FUNCTION(x.sd, y.sd), + __CLC_IMPL_FUNCTION(x.se, y.se), + __CLC_IMPL_FUNCTION(x.sf, y.sf)); } #undef __CLC_RET_TYPE16 #undef __CLC_ARG1_TYPE16 diff --git a/libclc/clc/include/clc/shared/binary_def_with_int_second_arg.inc b/libclc/clc/include/clc/shared/binary_def_with_int_second_arg.inc index 262b6b21cc64..2c32754044db 100644 --- a/libclc/clc/include/clc/shared/binary_def_with_int_second_arg.inc +++ b/libclc/clc/include/clc/shared/binary_def_with_int_second_arg.inc @@ -8,10 +8,11 @@ #include -#ifndef __IMPL_FUNCTION -#define __IMPL_FUNCTION(x) __CLC_CONCAT(__clc_, x) +#ifndef __CLC_IMPL_FUNCTION +#define __CLC_IMPL_FUNCTION(x) __CLC_CONCAT(__clc_, x) #endif -_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE FUNCTION(__CLC_GENTYPE x, __CLC_INTN y) { - return __IMPL_FUNCTION(FUNCTION)(x, y); +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE x, + __CLC_INTN y) { + return __CLC_IMPL_FUNCTION(__CLC_FUNCTION)(x, y); } diff --git a/libclc/clc/include/clc/shared/binary_def_with_scalar_second_arg.inc b/libclc/clc/include/clc/shared/binary_def_with_scalar_second_arg.inc index 103f775ab87e..7a6283d7190d 100644 --- a/libclc/clc/include/clc/shared/binary_def_with_scalar_second_arg.inc +++ b/libclc/clc/include/clc/shared/binary_def_with_scalar_second_arg.inc @@ -8,18 +8,18 @@ #include -#ifndef __IMPL_FUNCTION -#define __IMPL_FUNCTION(x) __CLC_CONCAT(__clc_, x) +#ifndef __CLC_IMPL_FUNCTION +#define __CLC_IMPL_FUNCTION(x) __CLC_CONCAT(__clc_, x) #endif -_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE FUNCTION(__CLC_GENTYPE a, - __CLC_GENTYPE b) { - return __IMPL_FUNCTION(FUNCTION)(a, b); +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE a, + 
__CLC_GENTYPE b) { + return __CLC_IMPL_FUNCTION(__CLC_FUNCTION)(a, b); } #ifndef __CLC_SCALAR -_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE FUNCTION(__CLC_GENTYPE a, - __CLC_SCALAR_GENTYPE b) { - return __IMPL_FUNCTION(FUNCTION)(a, (__CLC_GENTYPE)b); +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE a, + __CLC_SCALAR_GENTYPE b) { + return __CLC_IMPL_FUNCTION(__CLC_FUNCTION)(a, (__CLC_GENTYPE)b); } #endif diff --git a/libclc/clc/include/clc/shared/clc_vload.inc b/libclc/clc/include/clc/shared/clc_vload.inc index 8f3b00ec0445..70bca250e413 100644 --- a/libclc/clc/include/clc/shared/clc_vload.inc +++ b/libclc/clc/include/clc/shared/clc_vload.inc @@ -6,29 +6,29 @@ // //===----------------------------------------------------------------------===// -#define CLC_VLOAD_NAME __CLC_XCONCAT(__clc_vload, __CLC_VECSIZE) -#define CLC_VLOAD_HALF_NAME __CLC_XCONCAT(__clc_vload_half, __CLC_VECSIZE) -#define CLC_VLOADA_HALF_NAME __CLC_XCONCAT(__clc_vloada_half, __CLC_VECSIZE) +#define __CLC_VLOAD_NAME __CLC_XCONCAT(__clc_vload, __CLC_VECSIZE) +#define __CLC_VLOAD_HALF_NAME __CLC_XCONCAT(__clc_vload_half, __CLC_VECSIZE) +#define __CLC_VLOADA_HALF_NAME __CLC_XCONCAT(__clc_vloada_half, __CLC_VECSIZE) #ifndef __CLC_SCALAR -#define CLC_VLOAD_TY __CLC_XCONCAT(less_aligned_, __CLC_GENTYPE) +#define __CLC_VLOAD_TY __CLC_XCONCAT(less_aligned_, __CLC_GENTYPE) -#define CLC_VLOAD_DECL(ADDRSPACE) \ - _CLC_OVERLOAD _CLC_DECL CLC_VLOAD_TY CLC_VLOAD_NAME( \ +#define __CLC_VLOAD_DECL(ADDRSPACE) \ + _CLC_OVERLOAD _CLC_DECL __CLC_VLOAD_TY __CLC_VLOAD_NAME( \ size_t offset, const ADDRSPACE __CLC_SCALAR_GENTYPE *x); -CLC_VLOAD_DECL(__private) -CLC_VLOAD_DECL(__local) -CLC_VLOAD_DECL(__constant) -CLC_VLOAD_DECL(__global) +__CLC_VLOAD_DECL(__private) +__CLC_VLOAD_DECL(__local) +__CLC_VLOAD_DECL(__constant) +__CLC_VLOAD_DECL(__global) #if _CLC_DISTINCT_GENERIC_AS_SUPPORTED -CLC_VLOAD_DECL(__generic) +__CLC_VLOAD_DECL(__generic) #endif -#undef CLC_VLOAD_DECL -#undef CLC_VLOAD_TY +#undef 
__CLC_VLOAD_DECL +#undef __CLC_VLOAD_TY #endif // __CLC_SCALAR @@ -38,27 +38,27 @@ CLC_VLOAD_DECL(__generic) #ifdef __CLC_FPSIZE #if __CLC_FPSIZE == 32 -#define CLC_VLOAD_HALF_DECL(ADDRSPACE) \ - _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE CLC_VLOAD_HALF_NAME( \ +#define __CLC_VLOAD_HALF_DECL(ADDRSPACE) \ + _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_VLOAD_HALF_NAME( \ size_t offset, const ADDRSPACE half *mem); \ \ - _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE CLC_VLOADA_HALF_NAME( \ + _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_VLOADA_HALF_NAME( \ size_t offset, const ADDRSPACE half *mem); -CLC_VLOAD_HALF_DECL(__private) -CLC_VLOAD_HALF_DECL(__local) -CLC_VLOAD_HALF_DECL(__constant) -CLC_VLOAD_HALF_DECL(__global) +__CLC_VLOAD_HALF_DECL(__private) +__CLC_VLOAD_HALF_DECL(__local) +__CLC_VLOAD_HALF_DECL(__constant) +__CLC_VLOAD_HALF_DECL(__global) #if _CLC_DISTINCT_GENERIC_AS_SUPPORTED -CLC_VLOAD_HALF_DECL(__generic) +__CLC_VLOAD_HALF_DECL(__generic) #endif -#undef CLC_VLOAD_HALF_DECL +#undef __CLC_VLOAD_HALF_DECL #endif #endif -#undef CLC_VLOAD_NAME -#undef CLC_VLOAD_HALF_NAME -#undef CLC_VLOADA_HALF_NAME +#undef __CLC_VLOAD_NAME +#undef __CLC_VLOAD_HALF_NAME +#undef __CLC_VLOADA_HALF_NAME diff --git a/libclc/clc/include/clc/shared/clc_vstore.inc b/libclc/clc/include/clc/shared/clc_vstore.inc index 38d54b2f1b67..cd3377e077af 100644 --- a/libclc/clc/include/clc/shared/clc_vstore.inc +++ b/libclc/clc/include/clc/shared/clc_vstore.inc @@ -6,28 +6,28 @@ // //===----------------------------------------------------------------------===// -#define CLC_VSTORE_TY __CLC_XCONCAT(less_aligned_, __CLC_GENTYPE) -#define CLC_VSTORE_NAME __CLC_XCONCAT(__clc_vstore, __CLC_VECSIZE) -#define CLC_VSTORE_HALF_NAME(x) \ +#define __CLC_VSTORE_TY __CLC_XCONCAT(less_aligned_, __CLC_GENTYPE) +#define __CLC_VSTORE_NAME __CLC_XCONCAT(__clc_vstore, __CLC_VECSIZE) +#define __CLC_VSTORE_HALF_NAME(x) \ __CLC_XCONCAT(__CLC_XCONCAT(__clc_vstore_half, __CLC_VECSIZE), x) -#define CLC_VSTOREA_HALF_NAME(x) \ 
+#define __CLC_VSTOREA_HALF_NAME(x) \ __CLC_XCONCAT(__CLC_XCONCAT(__clc_vstorea_half, __CLC_VECSIZE), x) #ifndef __CLC_SCALAR -#define CLC_VSTORE_DECL(ADDRSPACE) \ - _CLC_OVERLOAD _CLC_DECL void CLC_VSTORE_NAME( \ - CLC_VSTORE_TY data, size_t offset, ADDRSPACE __CLC_SCALAR_GENTYPE *p); +#define __CLC_VSTORE_DECL(ADDRSPACE) \ + _CLC_OVERLOAD _CLC_DECL void __CLC_VSTORE_NAME( \ + __CLC_VSTORE_TY data, size_t offset, ADDRSPACE __CLC_SCALAR_GENTYPE *p); -CLC_VSTORE_DECL(__private) -CLC_VSTORE_DECL(__local) -CLC_VSTORE_DECL(__global) +__CLC_VSTORE_DECL(__private) +__CLC_VSTORE_DECL(__local) +__CLC_VSTORE_DECL(__global) #if _CLC_DISTINCT_GENERIC_AS_SUPPORTED -CLC_VSTORE_DECL(__generic) +__CLC_VSTORE_DECL(__generic) #endif -#undef CLC_VSTORE_DECL +#undef __CLC_VSTORE_DECL #endif // __CLC_SCALAR @@ -36,35 +36,35 @@ CLC_VSTORE_DECL(__generic) #ifdef __CLC_FPSIZE #if __CLC_FPSIZE == 32 || __CLC_FPSIZE == 64 -#define CLC_VSTORE_HALF_DECL(ADDRSPACE, SUFFIX) \ - _CLC_OVERLOAD _CLC_DECL void CLC_VSTORE_HALF_NAME(SUFFIX)( \ - CLC_VSTORE_TY data, size_t offset, ADDRSPACE half *p); \ +#define __CLC_VSTORE_HALF_DECL(ADDRSPACE, SUFFIX) \ + _CLC_OVERLOAD _CLC_DECL void __CLC_VSTORE_HALF_NAME(SUFFIX)( \ + __CLC_VSTORE_TY data, size_t offset, ADDRSPACE half *p); \ \ - _CLC_OVERLOAD _CLC_DECL void CLC_VSTOREA_HALF_NAME(SUFFIX)( \ - CLC_VSTORE_TY data, size_t offset, ADDRSPACE half *p); + _CLC_OVERLOAD _CLC_DECL void __CLC_VSTOREA_HALF_NAME(SUFFIX)( \ + __CLC_VSTORE_TY data, size_t offset, ADDRSPACE half *p); -#define CLC_VSTORE_HALF_DECL_ALL_MODES(ADDRSPACE) \ - CLC_VSTORE_HALF_DECL(ADDRSPACE, ) \ - CLC_VSTORE_HALF_DECL(ADDRSPACE, _rtz) \ - CLC_VSTORE_HALF_DECL(ADDRSPACE, _rtn) \ - CLC_VSTORE_HALF_DECL(ADDRSPACE, _rtp) \ - CLC_VSTORE_HALF_DECL(ADDRSPACE, _rte) +#define __CLC_VSTORE_HALF_DECL_ALL_MODES(ADDRSPACE) \ + __CLC_VSTORE_HALF_DECL(ADDRSPACE, ) \ + __CLC_VSTORE_HALF_DECL(ADDRSPACE, _rtz) \ + __CLC_VSTORE_HALF_DECL(ADDRSPACE, _rtn) \ + __CLC_VSTORE_HALF_DECL(ADDRSPACE, _rtp) \ + 
__CLC_VSTORE_HALF_DECL(ADDRSPACE, _rte) -CLC_VSTORE_HALF_DECL_ALL_MODES(__private) -CLC_VSTORE_HALF_DECL_ALL_MODES(__local) -CLC_VSTORE_HALF_DECL_ALL_MODES(__global) +__CLC_VSTORE_HALF_DECL_ALL_MODES(__private) +__CLC_VSTORE_HALF_DECL_ALL_MODES(__local) +__CLC_VSTORE_HALF_DECL_ALL_MODES(__global) #if _CLC_DISTINCT_GENERIC_AS_SUPPORTED -CLC_VSTORE_HALF_DECL_ALL_MODES(__generic) +__CLC_VSTORE_HALF_DECL_ALL_MODES(__generic) #endif -#undef CLC_VSTORE_HALF_DECL -#undef CLC_VSTORE_HALF_DECL_ALL_MODES +#undef __CLC_VSTORE_HALF_DECL +#undef __CLC_VSTORE_HALF_DECL_ALL_MODES #endif #endif -#undef CLC_VSTORE_TY -#undef CLC_VSTORE_NAME -#undef CLC_VSTORE_HALF_NAME -#undef CLC_VSTOREA_HALF_NAME +#undef __CLC_VSTORE_TY +#undef __CLC_VSTORE_NAME +#undef __CLC_VSTORE_HALF_NAME +#undef __CLC_VSTOREA_HALF_NAME diff --git a/libclc/clc/include/clc/shared/ternary_decl.inc b/libclc/clc/include/clc/shared/ternary_decl.inc index 1dc8672e3f42..21acb0042518 100644 --- a/libclc/clc/include/clc/shared/ternary_decl.inc +++ b/libclc/clc/include/clc/shared/ternary_decl.inc @@ -6,6 +6,5 @@ // //===----------------------------------------------------------------------===// -_CLC_OVERLOAD _CLC_CONST _CLC_DECL __CLC_GENTYPE FUNCTION(__CLC_GENTYPE a, - __CLC_GENTYPE b, - __CLC_GENTYPE c); +_CLC_OVERLOAD _CLC_CONST _CLC_DECL __CLC_GENTYPE +__CLC_FUNCTION(__CLC_GENTYPE a, __CLC_GENTYPE b, __CLC_GENTYPE c); diff --git a/libclc/clc/include/clc/shared/ternary_def.inc b/libclc/clc/include/clc/shared/ternary_def.inc index 92134e8cb6f2..566cfb4ca58c 100644 --- a/libclc/clc/include/clc/shared/ternary_def.inc +++ b/libclc/clc/include/clc/shared/ternary_def.inc @@ -8,11 +8,12 @@ #include -#ifndef __IMPL_FUNCTION -#define __IMPL_FUNCTION(x) __CLC_CONCAT(__clc_, x) +#ifndef __CLC_IMPL_FUNCTION +#define __CLC_IMPL_FUNCTION(x) __CLC_CONCAT(__clc_, x) #endif -_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE FUNCTION(__CLC_GENTYPE a, __CLC_GENTYPE b, - __CLC_GENTYPE c) { - return __IMPL_FUNCTION(FUNCTION)(a, b, c); 
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE a, + __CLC_GENTYPE b, + __CLC_GENTYPE c) { + return __CLC_IMPL_FUNCTION(__CLC_FUNCTION)(a, b, c); } diff --git a/libclc/clc/include/clc/shared/ternary_def_scalarize.inc b/libclc/clc/include/clc/shared/ternary_def_scalarize.inc index c216fa4db4ca..7bdaee85a612 100644 --- a/libclc/clc/include/clc/shared/ternary_def_scalarize.inc +++ b/libclc/clc/include/clc/shared/ternary_def_scalarize.inc @@ -14,8 +14,8 @@ #define __CLC_MIN_VECSIZE 2 #endif -#ifndef __IMPL_FUNCTION -#define __IMPL_FUNCTION FUNCTION +#ifndef __CLC_IMPL_FUNCTION +#define __CLC_IMPL_FUNCTION __CLC_FUNCTION #endif #ifndef __CLC_DEF_SPEC @@ -39,10 +39,10 @@ #endif #if __CLC_MIN_VECSIZE == 1 -_CLC_OVERLOAD __CLC_DEF_SPEC __CLC_RET_TYPE FUNCTION(__CLC_ARG1_TYPE x, - __CLC_ARG2_TYPE y, - __CLC_ARG3_TYPE z) { - return __IMPL_FUNCTION(x, y, z); +_CLC_OVERLOAD __CLC_DEF_SPEC __CLC_RET_TYPE __CLC_FUNCTION(__CLC_ARG1_TYPE x, + __CLC_ARG2_TYPE y, + __CLC_ARG3_TYPE z) { + return __CLC_IMPL_FUNCTION(x, y, z); } #endif // __CLC_MIN_VECSIZE == 1 @@ -51,11 +51,10 @@ _CLC_OVERLOAD __CLC_DEF_SPEC __CLC_RET_TYPE FUNCTION(__CLC_ARG1_TYPE x, #define __CLC_ARG1_TYPE2 __CLC_XCONCAT(__CLC_ARG1_TYPE, 2) #define __CLC_ARG2_TYPE2 __CLC_XCONCAT(__CLC_ARG2_TYPE, 2) #define __CLC_ARG3_TYPE2 __CLC_XCONCAT(__CLC_ARG3_TYPE, 2) -_CLC_OVERLOAD __CLC_DEF_SPEC __CLC_RET_TYPE2 FUNCTION(__CLC_ARG1_TYPE2 x, - __CLC_ARG2_TYPE2 y, - __CLC_ARG3_TYPE2 z) { - return (__CLC_RET_TYPE2)(__IMPL_FUNCTION(x.s0, y.s0, z.s0), - __IMPL_FUNCTION(x.s1, y.s1, z.s1)); +_CLC_OVERLOAD __CLC_DEF_SPEC __CLC_RET_TYPE2 +__CLC_FUNCTION(__CLC_ARG1_TYPE2 x, __CLC_ARG2_TYPE2 y, __CLC_ARG3_TYPE2 z) { + return (__CLC_RET_TYPE2)(__CLC_IMPL_FUNCTION(x.s0, y.s0, z.s0), + __CLC_IMPL_FUNCTION(x.s1, y.s1, z.s1)); } #undef __CLC_RET_TYPE2 #undef __CLC_ARG1_TYPE2 @@ -71,12 +70,11 @@ _CLC_OVERLOAD __CLC_DEF_SPEC __CLC_RET_TYPE2 FUNCTION(__CLC_ARG1_TYPE2 x, #define __CLC_ARG1_TYPE3 __CLC_XCONCAT(__CLC_ARG1_TYPE, 3) 
#define __CLC_ARG2_TYPE3 __CLC_XCONCAT(__CLC_ARG2_TYPE, 3) #define __CLC_ARG3_TYPE3 __CLC_XCONCAT(__CLC_ARG3_TYPE, 3) -_CLC_OVERLOAD __CLC_DEF_SPEC __CLC_RET_TYPE3 FUNCTION(__CLC_ARG1_TYPE3 x, - __CLC_ARG2_TYPE3 y, - __CLC_ARG3_TYPE3 z) { - return (__CLC_RET_TYPE3)(__IMPL_FUNCTION(x.s0, y.s0, z.s0), - __IMPL_FUNCTION(x.s1, y.s1, z.s1), - __IMPL_FUNCTION(x.s2, y.s2, z.s2)); +_CLC_OVERLOAD __CLC_DEF_SPEC __CLC_RET_TYPE3 +__CLC_FUNCTION(__CLC_ARG1_TYPE3 x, __CLC_ARG2_TYPE3 y, __CLC_ARG3_TYPE3 z) { + return (__CLC_RET_TYPE3)(__CLC_IMPL_FUNCTION(x.s0, y.s0, z.s0), + __CLC_IMPL_FUNCTION(x.s1, y.s1, z.s1), + __CLC_IMPL_FUNCTION(x.s2, y.s2, z.s2)); } #undef __CLC_RET_TYPE3 #undef __CLC_ARG1_TYPE3 @@ -87,13 +85,12 @@ _CLC_OVERLOAD __CLC_DEF_SPEC __CLC_RET_TYPE3 FUNCTION(__CLC_ARG1_TYPE3 x, #define __CLC_ARG1_TYPE4 __CLC_XCONCAT(__CLC_ARG1_TYPE, 4) #define __CLC_ARG2_TYPE4 __CLC_XCONCAT(__CLC_ARG2_TYPE, 4) #define __CLC_ARG3_TYPE4 __CLC_XCONCAT(__CLC_ARG3_TYPE, 4) -_CLC_OVERLOAD __CLC_DEF_SPEC __CLC_RET_TYPE4 FUNCTION(__CLC_ARG1_TYPE4 x, - __CLC_ARG2_TYPE4 y, - __CLC_ARG3_TYPE4 z) { - return (__CLC_RET_TYPE4)(__IMPL_FUNCTION(x.s0, y.s0, z.s0), - __IMPL_FUNCTION(x.s1, y.s1, z.s1), - __IMPL_FUNCTION(x.s2, y.s2, z.s2), - __IMPL_FUNCTION(x.s3, y.s3, z.s3)); +_CLC_OVERLOAD __CLC_DEF_SPEC __CLC_RET_TYPE4 +__CLC_FUNCTION(__CLC_ARG1_TYPE4 x, __CLC_ARG2_TYPE4 y, __CLC_ARG3_TYPE4 z) { + return (__CLC_RET_TYPE4)(__CLC_IMPL_FUNCTION(x.s0, y.s0, z.s0), + __CLC_IMPL_FUNCTION(x.s1, y.s1, z.s1), + __CLC_IMPL_FUNCTION(x.s2, y.s2, z.s2), + __CLC_IMPL_FUNCTION(x.s3, y.s3, z.s3)); } #undef __CLC_RET_TYPE4 #undef __CLC_ARG1_TYPE4 @@ -104,17 +101,16 @@ _CLC_OVERLOAD __CLC_DEF_SPEC __CLC_RET_TYPE4 FUNCTION(__CLC_ARG1_TYPE4 x, #define __CLC_ARG1_TYPE8 __CLC_XCONCAT(__CLC_ARG1_TYPE, 8) #define __CLC_ARG2_TYPE8 __CLC_XCONCAT(__CLC_ARG2_TYPE, 8) #define __CLC_ARG3_TYPE8 __CLC_XCONCAT(__CLC_ARG3_TYPE, 8) -_CLC_OVERLOAD __CLC_DEF_SPEC __CLC_RET_TYPE8 FUNCTION(__CLC_ARG1_TYPE8 x, - __CLC_ARG2_TYPE8 y, - 
__CLC_ARG3_TYPE8 z) { - return (__CLC_RET_TYPE8)(__IMPL_FUNCTION(x.s0, y.s0, z.s0), - __IMPL_FUNCTION(x.s1, y.s1, z.s1), - __IMPL_FUNCTION(x.s2, y.s2, z.s2), - __IMPL_FUNCTION(x.s3, y.s3, z.s3), - __IMPL_FUNCTION(x.s4, y.s4, z.s4), - __IMPL_FUNCTION(x.s5, y.s5, z.s5), - __IMPL_FUNCTION(x.s6, y.s6, z.s6), - __IMPL_FUNCTION(x.s7, y.s7, z.s7)); +_CLC_OVERLOAD __CLC_DEF_SPEC __CLC_RET_TYPE8 +__CLC_FUNCTION(__CLC_ARG1_TYPE8 x, __CLC_ARG2_TYPE8 y, __CLC_ARG3_TYPE8 z) { + return (__CLC_RET_TYPE8)(__CLC_IMPL_FUNCTION(x.s0, y.s0, z.s0), + __CLC_IMPL_FUNCTION(x.s1, y.s1, z.s1), + __CLC_IMPL_FUNCTION(x.s2, y.s2, z.s2), + __CLC_IMPL_FUNCTION(x.s3, y.s3, z.s3), + __CLC_IMPL_FUNCTION(x.s4, y.s4, z.s4), + __CLC_IMPL_FUNCTION(x.s5, y.s5, z.s5), + __CLC_IMPL_FUNCTION(x.s6, y.s6, z.s6), + __CLC_IMPL_FUNCTION(x.s7, y.s7, z.s7)); } #undef __CLC_RET_TYPE8 #undef __CLC_ARG1_TYPE8 @@ -125,25 +121,24 @@ _CLC_OVERLOAD __CLC_DEF_SPEC __CLC_RET_TYPE8 FUNCTION(__CLC_ARG1_TYPE8 x, #define __CLC_ARG1_TYPE16 __CLC_XCONCAT(__CLC_ARG1_TYPE, 16) #define __CLC_ARG2_TYPE16 __CLC_XCONCAT(__CLC_ARG2_TYPE, 16) #define __CLC_ARG3_TYPE16 __CLC_XCONCAT(__CLC_ARG3_TYPE, 16) -_CLC_OVERLOAD __CLC_DEF_SPEC __CLC_RET_TYPE16 FUNCTION(__CLC_ARG1_TYPE16 x, - __CLC_ARG2_TYPE16 y, - __CLC_ARG3_TYPE16 z) { - return (__CLC_RET_TYPE16)(__IMPL_FUNCTION(x.s0, y.s0, z.s0), - __IMPL_FUNCTION(x.s1, y.s1, z.s1), - __IMPL_FUNCTION(x.s2, y.s2, z.s2), - __IMPL_FUNCTION(x.s3, y.s3, z.s3), - __IMPL_FUNCTION(x.s4, y.s4, z.s4), - __IMPL_FUNCTION(x.s5, y.s5, z.s5), - __IMPL_FUNCTION(x.s6, y.s6, z.s6), - __IMPL_FUNCTION(x.s7, y.s7, z.s7), - __IMPL_FUNCTION(x.s8, y.s8, z.s8), - __IMPL_FUNCTION(x.s9, y.s9, z.s9), - __IMPL_FUNCTION(x.sa, y.sa, z.sa), - __IMPL_FUNCTION(x.sb, y.sb, z.sb), - __IMPL_FUNCTION(x.sc, y.sc, z.sc), - __IMPL_FUNCTION(x.sd, y.sd, z.sd), - __IMPL_FUNCTION(x.se, y.se, z.se), - __IMPL_FUNCTION(x.sf, y.sf, z.sf)); +_CLC_OVERLOAD __CLC_DEF_SPEC __CLC_RET_TYPE16 +__CLC_FUNCTION(__CLC_ARG1_TYPE16 x, __CLC_ARG2_TYPE16 y, 
__CLC_ARG3_TYPE16 z) { + return (__CLC_RET_TYPE16)(__CLC_IMPL_FUNCTION(x.s0, y.s0, z.s0), + __CLC_IMPL_FUNCTION(x.s1, y.s1, z.s1), + __CLC_IMPL_FUNCTION(x.s2, y.s2, z.s2), + __CLC_IMPL_FUNCTION(x.s3, y.s3, z.s3), + __CLC_IMPL_FUNCTION(x.s4, y.s4, z.s4), + __CLC_IMPL_FUNCTION(x.s5, y.s5, z.s5), + __CLC_IMPL_FUNCTION(x.s6, y.s6, z.s6), + __CLC_IMPL_FUNCTION(x.s7, y.s7, z.s7), + __CLC_IMPL_FUNCTION(x.s8, y.s8, z.s8), + __CLC_IMPL_FUNCTION(x.s9, y.s9, z.s9), + __CLC_IMPL_FUNCTION(x.sa, y.sa, z.sa), + __CLC_IMPL_FUNCTION(x.sb, y.sb, z.sb), + __CLC_IMPL_FUNCTION(x.sc, y.sc, z.sc), + __CLC_IMPL_FUNCTION(x.sd, y.sd, z.sd), + __CLC_IMPL_FUNCTION(x.se, y.se, z.se), + __CLC_IMPL_FUNCTION(x.sf, y.sf, z.sf)); } #undef __CLC_RET_TYPE16 #undef __CLC_ARG1_TYPE16 diff --git a/libclc/clc/include/clc/shared/unary_decl.inc b/libclc/clc/include/clc/shared/unary_decl.inc index 46108c5fac9e..19f1c8a876e5 100644 --- a/libclc/clc/include/clc/shared/unary_decl.inc +++ b/libclc/clc/include/clc/shared/unary_decl.inc @@ -6,4 +6,5 @@ // //===----------------------------------------------------------------------===// -_CLC_OVERLOAD _CLC_CONST _CLC_DECL __CLC_GENTYPE FUNCTION(__CLC_GENTYPE x); +_CLC_OVERLOAD _CLC_CONST _CLC_DECL __CLC_GENTYPE +__CLC_FUNCTION(__CLC_GENTYPE x); diff --git a/libclc/clc/include/clc/shared/unary_def.inc b/libclc/clc/include/clc/shared/unary_def.inc index e746a6bc89f3..b6826b023c70 100644 --- a/libclc/clc/include/clc/shared/unary_def.inc +++ b/libclc/clc/include/clc/shared/unary_def.inc @@ -8,10 +8,10 @@ #include -#ifndef __IMPL_FUNCTION -#define __IMPL_FUNCTION(x) __CLC_CONCAT(__clc_, x) +#ifndef __CLC_IMPL_FUNCTION +#define __CLC_IMPL_FUNCTION(x) __CLC_CONCAT(__clc_, x) #endif -_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE FUNCTION(__CLC_GENTYPE a) { - return __IMPL_FUNCTION(FUNCTION)(a); +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __CLC_FUNCTION(__CLC_GENTYPE a) { + return __CLC_IMPL_FUNCTION(__CLC_FUNCTION)(a); } diff --git a/libclc/clc/include/clc/shared/unary_def_scalarize.inc 
b/libclc/clc/include/clc/shared/unary_def_scalarize.inc index 6c82babad2b5..e941918312da 100644 --- a/libclc/clc/include/clc/shared/unary_def_scalarize.inc +++ b/libclc/clc/include/clc/shared/unary_def_scalarize.inc @@ -14,8 +14,8 @@ #define __CLC_MIN_VECSIZE 2 #endif -#ifndef __IMPL_FUNCTION -#define __IMPL_FUNCTION FUNCTION +#ifndef __CLC_IMPL_FUNCTION +#define __CLC_IMPL_FUNCTION __CLC_FUNCTION #endif #ifndef __CLC_RET_TYPE @@ -31,16 +31,17 @@ #endif #if __CLC_MIN_VECSIZE == 1 -_CLC_OVERLOAD _CLC_DEF __CLC_RET_TYPE FUNCTION(__CLC_ARG1_TYPE x) { - return __IMPL_FUNCTION(x); +_CLC_OVERLOAD _CLC_DEF __CLC_RET_TYPE __CLC_FUNCTION(__CLC_ARG1_TYPE x) { + return __CLC_IMPL_FUNCTION(x); } #endif // __CLC_MIN_VECSIZE == 1 #if __CLC_MIN_VECSIZE <= 2 #define __CLC_RET_TYPE2 __CLC_XCONCAT(__CLC_RET_TYPE, 2) #define __CLC_ARG1_TYPE2 __CLC_XCONCAT(__CLC_ARG1_TYPE, 2) -_CLC_OVERLOAD _CLC_DEF __CLC_RET_TYPE2 FUNCTION(__CLC_ARG1_TYPE2 x) { - return (__CLC_RET_TYPE2)(__IMPL_FUNCTION(x.s0), __IMPL_FUNCTION(x.s1)); +_CLC_OVERLOAD _CLC_DEF __CLC_RET_TYPE2 __CLC_FUNCTION(__CLC_ARG1_TYPE2 x) { + return (__CLC_RET_TYPE2)(__CLC_IMPL_FUNCTION(x.s0), + __CLC_IMPL_FUNCTION(x.s1)); } #undef __CLC_RET_TYPE2 #undef __CLC_ARG1_TYPE2 @@ -52,44 +53,47 @@ _CLC_OVERLOAD _CLC_DEF __CLC_RET_TYPE2 FUNCTION(__CLC_ARG1_TYPE2 x) { #define __CLC_RET_TYPE3 __CLC_XCONCAT(__CLC_RET_TYPE, 3) #define __CLC_ARG1_TYPE3 __CLC_XCONCAT(__CLC_ARG1_TYPE, 3) -_CLC_OVERLOAD _CLC_DEF __CLC_RET_TYPE3 FUNCTION(__CLC_ARG1_TYPE3 x) { - return (__CLC_RET_TYPE3)(__IMPL_FUNCTION(x.s0), __IMPL_FUNCTION(x.s1), - __IMPL_FUNCTION(x.s2)); +_CLC_OVERLOAD _CLC_DEF __CLC_RET_TYPE3 __CLC_FUNCTION(__CLC_ARG1_TYPE3 x) { + return (__CLC_RET_TYPE3)(__CLC_IMPL_FUNCTION(x.s0), __CLC_IMPL_FUNCTION(x.s1), + __CLC_IMPL_FUNCTION(x.s2)); } #undef __CLC_RET_TYPE3 #undef __CLC_ARG1_TYPE3 #define __CLC_RET_TYPE4 __CLC_XCONCAT(__CLC_RET_TYPE, 4) #define __CLC_ARG1_TYPE4 __CLC_XCONCAT(__CLC_ARG1_TYPE, 4) -_CLC_OVERLOAD _CLC_DEF __CLC_RET_TYPE4 
FUNCTION(__CLC_ARG1_TYPE4 x) { - return (__CLC_RET_TYPE4)(__IMPL_FUNCTION(x.s0), __IMPL_FUNCTION(x.s1), - __IMPL_FUNCTION(x.s2), __IMPL_FUNCTION(x.s3)); +_CLC_OVERLOAD _CLC_DEF __CLC_RET_TYPE4 __CLC_FUNCTION(__CLC_ARG1_TYPE4 x) { + return (__CLC_RET_TYPE4)(__CLC_IMPL_FUNCTION(x.s0), __CLC_IMPL_FUNCTION(x.s1), + __CLC_IMPL_FUNCTION(x.s2), + __CLC_IMPL_FUNCTION(x.s3)); } #undef __CLC_RET_TYPE4 #undef __CLC_ARG1_TYPE4 #define __CLC_RET_TYPE8 __CLC_XCONCAT(__CLC_RET_TYPE, 8) #define __CLC_ARG1_TYPE8 __CLC_XCONCAT(__CLC_ARG1_TYPE, 8) -_CLC_OVERLOAD _CLC_DEF __CLC_RET_TYPE8 FUNCTION(__CLC_ARG1_TYPE8 x) { - return (__CLC_RET_TYPE8)(__IMPL_FUNCTION(x.s0), __IMPL_FUNCTION(x.s1), - __IMPL_FUNCTION(x.s2), __IMPL_FUNCTION(x.s3), - __IMPL_FUNCTION(x.s4), __IMPL_FUNCTION(x.s5), - __IMPL_FUNCTION(x.s6), __IMPL_FUNCTION(x.s7)); +_CLC_OVERLOAD _CLC_DEF __CLC_RET_TYPE8 __CLC_FUNCTION(__CLC_ARG1_TYPE8 x) { + return (__CLC_RET_TYPE8)(__CLC_IMPL_FUNCTION(x.s0), __CLC_IMPL_FUNCTION(x.s1), + __CLC_IMPL_FUNCTION(x.s2), __CLC_IMPL_FUNCTION(x.s3), + __CLC_IMPL_FUNCTION(x.s4), __CLC_IMPL_FUNCTION(x.s5), + __CLC_IMPL_FUNCTION(x.s6), + __CLC_IMPL_FUNCTION(x.s7)); } #undef __CLC_RET_TYPE8 #undef __CLC_ARG1_TYPE8 #define __CLC_RET_TYPE16 __CLC_XCONCAT(__CLC_RET_TYPE, 16) #define __CLC_ARG1_TYPE16 __CLC_XCONCAT(__CLC_ARG1_TYPE, 16) -_CLC_OVERLOAD _CLC_DEF __CLC_RET_TYPE16 FUNCTION(__CLC_ARG1_TYPE16 x) { - return (__CLC_RET_TYPE16)(__IMPL_FUNCTION(x.s0), __IMPL_FUNCTION(x.s1), - __IMPL_FUNCTION(x.s2), __IMPL_FUNCTION(x.s3), - __IMPL_FUNCTION(x.s4), __IMPL_FUNCTION(x.s5), - __IMPL_FUNCTION(x.s6), __IMPL_FUNCTION(x.s7), - __IMPL_FUNCTION(x.s8), __IMPL_FUNCTION(x.s9), - __IMPL_FUNCTION(x.sa), __IMPL_FUNCTION(x.sb), - __IMPL_FUNCTION(x.sc), __IMPL_FUNCTION(x.sd), - __IMPL_FUNCTION(x.se), __IMPL_FUNCTION(x.sf)); +_CLC_OVERLOAD _CLC_DEF __CLC_RET_TYPE16 __CLC_FUNCTION(__CLC_ARG1_TYPE16 x) { + return ( + __CLC_RET_TYPE16)(__CLC_IMPL_FUNCTION(x.s0), __CLC_IMPL_FUNCTION(x.s1), + __CLC_IMPL_FUNCTION(x.s2), 
__CLC_IMPL_FUNCTION(x.s3), + __CLC_IMPL_FUNCTION(x.s4), __CLC_IMPL_FUNCTION(x.s5), + __CLC_IMPL_FUNCTION(x.s6), __CLC_IMPL_FUNCTION(x.s7), + __CLC_IMPL_FUNCTION(x.s8), __CLC_IMPL_FUNCTION(x.s9), + __CLC_IMPL_FUNCTION(x.sa), __CLC_IMPL_FUNCTION(x.sb), + __CLC_IMPL_FUNCTION(x.sc), __CLC_IMPL_FUNCTION(x.sd), + __CLC_IMPL_FUNCTION(x.se), __CLC_IMPL_FUNCTION(x.sf)); } #undef __CLC_RET_TYPE16 #undef __CLC_ARG1_TYPE16 diff --git a/libclc/clc/lib/amdgcn/math/clc_ldexp_override.cl b/libclc/clc/lib/amdgcn/math/clc_ldexp_override.cl index b2990647a00a..90bd50ac1551 100644 --- a/libclc/clc/lib/amdgcn/math/clc_ldexp_override.cl +++ b/libclc/clc/lib/amdgcn/math/clc_ldexp_override.cl @@ -10,25 +10,25 @@ #include #include -#define FUNCTION __clc_ldexp +#define __CLC_FUNCTION __clc_ldexp #define __CLC_ARG2_TYPE int #define __CLC_MIN_VECSIZE 1 #ifdef __HAS_LDEXPF__ // This defines all the ldexp(floatN, intN) variants. -#define __FLOAT_ONLY -#define __IMPL_FUNCTION __builtin_amdgcn_ldexpf +#define __CLC_FLOAT_ONLY +#define __CLC_IMPL_FUNCTION __builtin_amdgcn_ldexpf #define __CLC_BODY #include -#undef __IMPL_FUNCTION +#undef __CLC_IMPL_FUNCTION #endif #ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64 : enable // This defines all the ldexp(doubleN, intN) variants. 
-#define __DOUBLE_ONLY -#define __IMPL_FUNCTION __builtin_amdgcn_ldexp +#define __CLC_DOUBLE_ONLY +#define __CLC_IMPL_FUNCTION __builtin_amdgcn_ldexp #define __CLC_BODY #include -#undef __IMPL_FUNCTION +#undef __CLC_IMPL_FUNCTION #endif diff --git a/libclc/clc/lib/amdgcn/workitem/clc_get_group_id.cl b/libclc/clc/lib/amdgcn/workitem/clc_get_group_id.cl index aea927c3460b..4dab7905ba30 100644 --- a/libclc/clc/lib/amdgcn/workitem/clc_get_group_id.cl +++ b/libclc/clc/lib/amdgcn/workitem/clc_get_group_id.cl @@ -17,6 +17,6 @@ _CLC_DEF _CLC_OVERLOAD size_t __clc_get_group_id(uint dim) { case 2: return __builtin_amdgcn_workgroup_id_z(); default: - return 1; + return 0; } } diff --git a/libclc/clc/lib/amdgcn/workitem/clc_get_local_id.cl b/libclc/clc/lib/amdgcn/workitem/clc_get_local_id.cl index b7b7a43e735d..8d1d16d4762f 100644 --- a/libclc/clc/lib/amdgcn/workitem/clc_get_local_id.cl +++ b/libclc/clc/lib/amdgcn/workitem/clc_get_local_id.cl @@ -17,6 +17,6 @@ _CLC_DEF _CLC_OVERLOAD size_t __clc_get_local_id(uint dim) { case 2: return __builtin_amdgcn_workitem_id_z(); default: - return 1; + return 0; } } diff --git a/libclc/clc/lib/amdgpu/math/clc_half_exp.cl b/libclc/clc/lib/amdgpu/math/clc_half_exp.cl index 5fa9d0b9a9de..c332708615e1 100644 --- a/libclc/clc/lib/amdgpu/math/clc_half_exp.cl +++ b/libclc/clc/lib/amdgpu/math/clc_half_exp.cl @@ -9,6 +9,6 @@ #include #define __CLC_FUNC exp -#define __FLOAT_ONLY +#define __CLC_FLOAT_ONLY #define __CLC_BODY #include diff --git a/libclc/clc/lib/amdgpu/math/clc_half_exp10.cl b/libclc/clc/lib/amdgpu/math/clc_half_exp10.cl index 5c119eb5a535..5560f7ac9796 100644 --- a/libclc/clc/lib/amdgpu/math/clc_half_exp10.cl +++ b/libclc/clc/lib/amdgpu/math/clc_half_exp10.cl @@ -9,6 +9,6 @@ #include #define __CLC_FUNC exp10 -#define __FLOAT_ONLY +#define __CLC_FLOAT_ONLY #define __CLC_BODY #include diff --git a/libclc/clc/lib/amdgpu/math/clc_half_exp2.cl b/libclc/clc/lib/amdgpu/math/clc_half_exp2.cl index 9750d5008986..3e38cbee3285 100644 --- 
a/libclc/clc/lib/amdgpu/math/clc_half_exp2.cl +++ b/libclc/clc/lib/amdgpu/math/clc_half_exp2.cl @@ -10,6 +10,6 @@ #include #define __CLC_FUNC exp2 -#define __FLOAT_ONLY +#define __CLC_FLOAT_ONLY #define __CLC_BODY #include diff --git a/libclc/clc/lib/amdgpu/math/clc_half_log.cl b/libclc/clc/lib/amdgpu/math/clc_half_log.cl index ea19bd5a4c7a..0b731c202e20 100644 --- a/libclc/clc/lib/amdgpu/math/clc_half_log.cl +++ b/libclc/clc/lib/amdgpu/math/clc_half_log.cl @@ -9,6 +9,6 @@ #include #define __CLC_FUNC log -#define __FLOAT_ONLY +#define __CLC_FLOAT_ONLY #define __CLC_BODY #include diff --git a/libclc/clc/lib/amdgpu/math/clc_half_log10.cl b/libclc/clc/lib/amdgpu/math/clc_half_log10.cl index af3a2695396d..1fd8ba520411 100644 --- a/libclc/clc/lib/amdgpu/math/clc_half_log10.cl +++ b/libclc/clc/lib/amdgpu/math/clc_half_log10.cl @@ -9,6 +9,6 @@ #include #define __CLC_FUNC log10 -#define __FLOAT_ONLY +#define __CLC_FLOAT_ONLY #define __CLC_BODY #include diff --git a/libclc/clc/lib/amdgpu/math/clc_half_log2.cl b/libclc/clc/lib/amdgpu/math/clc_half_log2.cl index 81795f012c77..e4d27421f8fe 100644 --- a/libclc/clc/lib/amdgpu/math/clc_half_log2.cl +++ b/libclc/clc/lib/amdgpu/math/clc_half_log2.cl @@ -9,6 +9,6 @@ #include #define __CLC_FUNC log2 -#define __FLOAT_ONLY +#define __CLC_FLOAT_ONLY #define __CLC_BODY #include diff --git a/libclc/clc/lib/amdgpu/math/clc_half_recip.cl b/libclc/clc/lib/amdgpu/math/clc_half_recip.cl index edbec07e4484..0a1c17037531 100644 --- a/libclc/clc/lib/amdgpu/math/clc_half_recip.cl +++ b/libclc/clc/lib/amdgpu/math/clc_half_recip.cl @@ -9,6 +9,6 @@ #include #define __CLC_FUNC recip -#define __FLOAT_ONLY +#define __CLC_FLOAT_ONLY #define __CLC_BODY #include diff --git a/libclc/clc/lib/amdgpu/math/clc_half_rsqrt.cl b/libclc/clc/lib/amdgpu/math/clc_half_rsqrt.cl index c0a5489ec8fc..a17ac508b5ad 100644 --- a/libclc/clc/lib/amdgpu/math/clc_half_rsqrt.cl +++ b/libclc/clc/lib/amdgpu/math/clc_half_rsqrt.cl @@ -9,6 +9,6 @@ #include #define __CLC_FUNC rsqrt 
-#define __FLOAT_ONLY +#define __CLC_FLOAT_ONLY #define __CLC_BODY #include diff --git a/libclc/clc/lib/amdgpu/math/clc_half_sqrt.cl b/libclc/clc/lib/amdgpu/math/clc_half_sqrt.cl index 4dc6fa31f21a..142a87881853 100644 --- a/libclc/clc/lib/amdgpu/math/clc_half_sqrt.cl +++ b/libclc/clc/lib/amdgpu/math/clc_half_sqrt.cl @@ -9,6 +9,6 @@ #include #define __CLC_FUNC sqrt -#define __FLOAT_ONLY +#define __CLC_FLOAT_ONLY #define __CLC_BODY #include diff --git a/libclc/clc/lib/amdgpu/math/clc_native_exp.cl b/libclc/clc/lib/amdgpu/math/clc_native_exp.cl index 591ecb0ac00b..946b9beed5be 100644 --- a/libclc/clc/lib/amdgpu/math/clc_native_exp.cl +++ b/libclc/clc/lib/amdgpu/math/clc_native_exp.cl @@ -11,5 +11,5 @@ #include #define __CLC_BODY -#define __FLOAT_ONLY +#define __CLC_FLOAT_ONLY #include diff --git a/libclc/clc/lib/amdgpu/math/clc_native_exp2.cl b/libclc/clc/lib/amdgpu/math/clc_native_exp2.cl index 63f28921dc17..5959ea2786a5 100644 --- a/libclc/clc/lib/amdgpu/math/clc_native_exp2.cl +++ b/libclc/clc/lib/amdgpu/math/clc_native_exp2.cl @@ -9,9 +9,9 @@ #include #include -#define __FLOAT_ONLY +#define __CLC_FLOAT_ONLY #define __CLC_MIN_VECSIZE 1 -#define FUNCTION __clc_native_exp2 -#define __IMPL_FUNCTION __builtin_amdgcn_exp2f +#define __CLC_FUNCTION __clc_native_exp2 +#define __CLC_IMPL_FUNCTION __builtin_amdgcn_exp2f #define __CLC_BODY #include diff --git a/libclc/clc/lib/amdgpu/math/clc_native_log10.cl b/libclc/clc/lib/amdgpu/math/clc_native_log10.cl index 0668a635d24d..c356b22b6b6e 100644 --- a/libclc/clc/lib/amdgpu/math/clc_native_log10.cl +++ b/libclc/clc/lib/amdgpu/math/clc_native_log10.cl @@ -11,5 +11,5 @@ #include #define __CLC_BODY -#define __FLOAT_ONLY +#define __CLC_FLOAT_ONLY #include diff --git a/libclc/clc/lib/amdgpu/math/clc_sqrt_fp64.cl b/libclc/clc/lib/amdgpu/math/clc_sqrt_fp64.cl index b7cb635a2ae8..d2790f3a8260 100644 --- a/libclc/clc/lib/amdgpu/math/clc_sqrt_fp64.cl +++ b/libclc/clc/lib/amdgpu/math/clc_sqrt_fp64.cl @@ -43,8 +43,8 @@ _CLC_OVERLOAD 
_CLC_DEF double __clc_sqrt(double x) { return (x == __builtin_inf() || (x == 0.0)) ? v01 : v23; } -#define __DOUBLE_ONLY -#define FUNCTION __clc_sqrt +#define __CLC_DOUBLE_ONLY +#define __CLC_FUNCTION __clc_sqrt #define __CLC_BODY #include diff --git a/libclc/clc/lib/clspv/math/clc_sw_fma.cl b/libclc/clc/lib/clspv/math/clc_sw_fma.cl index 266269644721..1dc9a0e6407b 100644 --- a/libclc/clc/lib/clspv/math/clc_sw_fma.cl +++ b/libclc/clc/lib/clspv/math/clc_sw_fma.cl @@ -269,7 +269,7 @@ _CLC_DEF _CLC_OVERLOAD float __clc_sw_fma(float a, float b, float c) { ((uint)st_fma.mantissa.lo & 0x7fffff)); } -#define __FLOAT_ONLY -#define FUNCTION __clc_sw_fma +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION __clc_sw_fma #define __CLC_BODY #include diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_dec.cl b/libclc/clc/lib/generic/atomic/clc_atomic_dec.cl index f35a9624fd01..7984dba5731e 100644 --- a/libclc/clc/lib/generic/atomic/clc_atomic_dec.cl +++ b/libclc/clc/lib/generic/atomic/clc_atomic_dec.cl @@ -8,8 +8,8 @@ #include -#define FUNCTION __clc_atomic_dec -#define __IMPL_FUNCTION __scoped_atomic_fetch_add +#define __CLC_FUNCTION __clc_atomic_dec +#define __CLC_IMPL_FUNCTION __scoped_atomic_fetch_add #define __CLC_INC_DEC #define __CLC_BODY diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_def.inc b/libclc/clc/lib/generic/atomic/clc_atomic_def.inc index 2c45f49f6084..c1a0731eb843 100644 --- a/libclc/clc/lib/generic/atomic/clc_atomic_def.inc +++ b/libclc/clc/lib/generic/atomic/clc_atomic_def.inc @@ -31,37 +31,37 @@ #ifdef __CLC_NO_VALUE_ARG #define __CLC_DEFINE_ATOMIC(ADDRSPACE) \ - _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE FUNCTION( \ + _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION( \ volatile ADDRSPACE __CLC_GENTYPE *Ptr, int MemoryOrder, \ int MemoryScope) { \ - return __CLC_AS_RETTYPE(__IMPL_FUNCTION( \ + return __CLC_AS_RETTYPE(__CLC_IMPL_FUNCTION( \ (ADDRSPACE __CLC_PTR_CASTTYPE *)Ptr, MemoryOrder, MemoryScope)); \ } #elif defined(__CLC_INC_DEC) #define 
__CLC_DEFINE_ATOMIC(ADDRSPACE) \ - _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE FUNCTION( \ + _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION( \ volatile ADDRSPACE __CLC_GENTYPE *Ptr, int MemoryOrder, \ int MemoryScope) { \ return __CLC_AS_RETTYPE( \ - __IMPL_FUNCTION((ADDRSPACE __CLC_PTR_CASTTYPE *)Ptr, (__CLC_GENTYPE)1, \ - MemoryOrder, MemoryScope)); \ + __CLC_IMPL_FUNCTION((ADDRSPACE __CLC_PTR_CASTTYPE *)Ptr, \ + (__CLC_GENTYPE)1, MemoryOrder, MemoryScope)); \ } #elif defined(__CLC_RETURN_VOID) #define __CLC_DEFINE_ATOMIC(ADDRSPACE) \ - _CLC_OVERLOAD _CLC_DECL void FUNCTION(volatile ADDRSPACE __CLC_GENTYPE *Ptr, \ - __CLC_GENTYPE Value, int MemoryOrder, \ - int MemoryScope) { \ - __IMPL_FUNCTION((ADDRSPACE __CLC_PTR_CASTTYPE *)Ptr, Value, MemoryOrder, \ - MemoryScope); \ + _CLC_OVERLOAD _CLC_DECL void __CLC_FUNCTION( \ + volatile ADDRSPACE __CLC_GENTYPE *Ptr, __CLC_GENTYPE Value, \ + int MemoryOrder, int MemoryScope) { \ + __CLC_IMPL_FUNCTION((ADDRSPACE __CLC_PTR_CASTTYPE *)Ptr, Value, \ + MemoryOrder, MemoryScope); \ } #else #define __CLC_DEFINE_ATOMIC(ADDRSPACE) \ - _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE FUNCTION( \ + _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION( \ volatile ADDRSPACE __CLC_GENTYPE *Ptr, __CLC_GENTYPE Value, \ int MemoryOrder, int MemoryScope) { \ return __CLC_AS_RETTYPE( \ - __IMPL_FUNCTION((ADDRSPACE __CLC_PTR_CASTTYPE *)Ptr, Value, \ - MemoryOrder, MemoryScope)); \ + __CLC_IMPL_FUNCTION((ADDRSPACE __CLC_PTR_CASTTYPE *)Ptr, Value, \ + MemoryOrder, MemoryScope)); \ } #endif diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_exchange.cl b/libclc/clc/lib/generic/atomic/clc_atomic_exchange.cl index 52fd11afed6a..ee80256d3dbb 100644 --- a/libclc/clc/lib/generic/atomic/clc_atomic_exchange.cl +++ b/libclc/clc/lib/generic/atomic/clc_atomic_exchange.cl @@ -8,8 +8,8 @@ #include -#define FUNCTION __clc_atomic_exchange -#define __IMPL_FUNCTION __scoped_atomic_exchange_n +#define __CLC_FUNCTION __clc_atomic_exchange +#define __CLC_IMPL_FUNCTION 
__scoped_atomic_exchange_n #define __CLC_BODY #include diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_fetch_add.cl b/libclc/clc/lib/generic/atomic/clc_atomic_fetch_add.cl index 0dc44919627b..06fa21344ec8 100644 --- a/libclc/clc/lib/generic/atomic/clc_atomic_fetch_add.cl +++ b/libclc/clc/lib/generic/atomic/clc_atomic_fetch_add.cl @@ -8,8 +8,8 @@ #include -#define FUNCTION __clc_atomic_fetch_add -#define __IMPL_FUNCTION __scoped_atomic_fetch_add +#define __CLC_FUNCTION __clc_atomic_fetch_add +#define __CLC_IMPL_FUNCTION __scoped_atomic_fetch_add #define __CLC_BODY #include diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_fetch_and.cl b/libclc/clc/lib/generic/atomic/clc_atomic_fetch_and.cl index ec89738bc0f6..7b9750d8d4db 100644 --- a/libclc/clc/lib/generic/atomic/clc_atomic_fetch_and.cl +++ b/libclc/clc/lib/generic/atomic/clc_atomic_fetch_and.cl @@ -8,8 +8,8 @@ #include -#define FUNCTION __clc_atomic_fetch_and -#define __IMPL_FUNCTION __scoped_atomic_fetch_and +#define __CLC_FUNCTION __clc_atomic_fetch_and +#define __CLC_IMPL_FUNCTION __scoped_atomic_fetch_and #define __CLC_BODY #include diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_fetch_max.cl b/libclc/clc/lib/generic/atomic/clc_atomic_fetch_max.cl index 0acac711aa96..6401d3e9439f 100644 --- a/libclc/clc/lib/generic/atomic/clc_atomic_fetch_max.cl +++ b/libclc/clc/lib/generic/atomic/clc_atomic_fetch_max.cl @@ -8,8 +8,8 @@ #include -#define FUNCTION __clc_atomic_fetch_max -#define __IMPL_FUNCTION __scoped_atomic_fetch_max +#define __CLC_FUNCTION __clc_atomic_fetch_max +#define __CLC_IMPL_FUNCTION __scoped_atomic_fetch_max #define __CLC_BODY #include diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_fetch_min.cl b/libclc/clc/lib/generic/atomic/clc_atomic_fetch_min.cl index 7a098588ec00..51e3904a5c6e 100644 --- a/libclc/clc/lib/generic/atomic/clc_atomic_fetch_min.cl +++ b/libclc/clc/lib/generic/atomic/clc_atomic_fetch_min.cl @@ -8,8 +8,8 @@ #include -#define FUNCTION __clc_atomic_fetch_min 
-#define __IMPL_FUNCTION __scoped_atomic_fetch_min +#define __CLC_FUNCTION __clc_atomic_fetch_min +#define __CLC_IMPL_FUNCTION __scoped_atomic_fetch_min #define __CLC_BODY #include diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_fetch_or.cl b/libclc/clc/lib/generic/atomic/clc_atomic_fetch_or.cl index e0f48fa40835..e92e351db3f4 100644 --- a/libclc/clc/lib/generic/atomic/clc_atomic_fetch_or.cl +++ b/libclc/clc/lib/generic/atomic/clc_atomic_fetch_or.cl @@ -8,8 +8,8 @@ #include -#define FUNCTION __clc_atomic_fetch_or -#define __IMPL_FUNCTION __scoped_atomic_fetch_or +#define __CLC_FUNCTION __clc_atomic_fetch_or +#define __CLC_IMPL_FUNCTION __scoped_atomic_fetch_or #define __CLC_BODY #include diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_fetch_sub.cl b/libclc/clc/lib/generic/atomic/clc_atomic_fetch_sub.cl index a4c2c1da1555..fbbf7a4def56 100644 --- a/libclc/clc/lib/generic/atomic/clc_atomic_fetch_sub.cl +++ b/libclc/clc/lib/generic/atomic/clc_atomic_fetch_sub.cl @@ -8,8 +8,8 @@ #include -#define FUNCTION __clc_atomic_fetch_sub -#define __IMPL_FUNCTION __scoped_atomic_fetch_sub +#define __CLC_FUNCTION __clc_atomic_fetch_sub +#define __CLC_IMPL_FUNCTION __scoped_atomic_fetch_sub #define __CLC_BODY #include diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_fetch_xor.cl b/libclc/clc/lib/generic/atomic/clc_atomic_fetch_xor.cl index 4424a298178f..d041fd164d38 100644 --- a/libclc/clc/lib/generic/atomic/clc_atomic_fetch_xor.cl +++ b/libclc/clc/lib/generic/atomic/clc_atomic_fetch_xor.cl @@ -8,8 +8,8 @@ #include -#define FUNCTION __clc_atomic_fetch_xor -#define __IMPL_FUNCTION __scoped_atomic_fetch_xor +#define __CLC_FUNCTION __clc_atomic_fetch_xor +#define __CLC_IMPL_FUNCTION __scoped_atomic_fetch_xor #define __CLC_BODY #include diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_inc.cl b/libclc/clc/lib/generic/atomic/clc_atomic_inc.cl index 019aa8d9d6dd..7171f84c30ce 100644 --- a/libclc/clc/lib/generic/atomic/clc_atomic_inc.cl +++ 
b/libclc/clc/lib/generic/atomic/clc_atomic_inc.cl @@ -8,8 +8,8 @@ #include -#define FUNCTION __clc_atomic_inc -#define __IMPL_FUNCTION __scoped_atomic_fetch_sub +#define __CLC_FUNCTION __clc_atomic_inc +#define __CLC_IMPL_FUNCTION __scoped_atomic_fetch_sub #define __CLC_INC_DEC #define __CLC_BODY diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_load.cl b/libclc/clc/lib/generic/atomic/clc_atomic_load.cl index 1f083073e43f..f7fe2510569e 100644 --- a/libclc/clc/lib/generic/atomic/clc_atomic_load.cl +++ b/libclc/clc/lib/generic/atomic/clc_atomic_load.cl @@ -8,8 +8,8 @@ #include -#define FUNCTION __clc_atomic_load -#define __IMPL_FUNCTION __scoped_atomic_load_n +#define __CLC_FUNCTION __clc_atomic_load +#define __CLC_IMPL_FUNCTION __scoped_atomic_load_n #define __CLC_NO_VALUE_ARG #define __CLC_BODY diff --git a/libclc/clc/lib/generic/atomic/clc_atomic_store.cl b/libclc/clc/lib/generic/atomic/clc_atomic_store.cl index 8fd165b9a83b..a93d21e8430c 100644 --- a/libclc/clc/lib/generic/atomic/clc_atomic_store.cl +++ b/libclc/clc/lib/generic/atomic/clc_atomic_store.cl @@ -8,8 +8,8 @@ #include -#define FUNCTION __clc_atomic_store -#define __IMPL_FUNCTION __scoped_atomic_store_n +#define __CLC_FUNCTION __clc_atomic_store +#define __CLC_IMPL_FUNCTION __scoped_atomic_store_n #define __CLC_RETURN_VOID #define __CLC_BODY diff --git a/libclc/clc/lib/generic/common/clc_radians.cl b/libclc/clc/lib/generic/common/clc_radians.cl index f8a1635ecb5c..497263a15f51 100644 --- a/libclc/clc/lib/generic/common/clc_radians.cl +++ b/libclc/clc/lib/generic/common/clc_radians.cl @@ -9,27 +9,27 @@ #include #include -#define RADIANS_SINGLE_DEF(TYPE, LITERAL) \ +#define __CLC_RADIANS_SINGLE_DEF(TYPE, LITERAL) \ _CLC_OVERLOAD _CLC_DEF TYPE __clc_radians(TYPE radians) { \ return (TYPE)LITERAL * radians; \ } -#define RADIANS_DEF(TYPE, LITERAL) \ - RADIANS_SINGLE_DEF(TYPE, LITERAL) \ - RADIANS_SINGLE_DEF(TYPE##2, LITERAL) \ - RADIANS_SINGLE_DEF(TYPE##3, LITERAL) \ - RADIANS_SINGLE_DEF(TYPE##4, 
LITERAL) \ - RADIANS_SINGLE_DEF(TYPE##8, LITERAL) \ - RADIANS_SINGLE_DEF(TYPE##16, LITERAL) +#define __CLC_RADIANS_DEF(TYPE, LITERAL) \ + __CLC_RADIANS_SINGLE_DEF(TYPE, LITERAL) \ + __CLC_RADIANS_SINGLE_DEF(TYPE##2, LITERAL) \ + __CLC_RADIANS_SINGLE_DEF(TYPE##3, LITERAL) \ + __CLC_RADIANS_SINGLE_DEF(TYPE##4, LITERAL) \ + __CLC_RADIANS_SINGLE_DEF(TYPE##8, LITERAL) \ + __CLC_RADIANS_SINGLE_DEF(TYPE##16, LITERAL) // pi/180 = ~0.01745329251994329577 or 0x1.1df46a2529d39p-6 or 0x1.1df46ap-6F -RADIANS_DEF(float, 0x1.1df46ap-6F) +__CLC_RADIANS_DEF(float, 0x1.1df46ap-6F) #ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64 : enable // pi/180 = ~0.01745329251994329577 or 0x1.1df46a2529d39p-6 or 0x1.1df46ap-6F -RADIANS_DEF(double, 0x1.1df46a2529d39p-6) +__CLC_RADIANS_DEF(double, 0x1.1df46a2529d39p-6) #endif @@ -37,6 +37,6 @@ RADIANS_DEF(double, 0x1.1df46a2529d39p-6) #pragma OPENCL EXTENSION cl_khr_fp16 : enable // pi/180 = ~0.01745329251994329577 or 0x1.1df46a2529d39p-6 or 0x1.1df46ap-6F -RADIANS_DEF(half, (half)0x1.1df46a2529d39p-6) +__CLC_RADIANS_DEF(half, (half)0x1.1df46a2529d39p-6) #endif diff --git a/libclc/clc/lib/generic/geometric/clc_fast_distance.cl b/libclc/clc/lib/generic/geometric/clc_fast_distance.cl index 996039898480..6b645df0b3dd 100644 --- a/libclc/clc/lib/generic/geometric/clc_fast_distance.cl +++ b/libclc/clc/lib/generic/geometric/clc_fast_distance.cl @@ -9,6 +9,6 @@ #include #include -#define __FLOAT_ONLY +#define __CLC_FLOAT_ONLY #define __CLC_BODY #include diff --git a/libclc/clc/lib/generic/geometric/clc_fast_normalize.cl b/libclc/clc/lib/generic/geometric/clc_fast_normalize.cl index 85684d0f49bc..a5883efe7bd9 100644 --- a/libclc/clc/lib/generic/geometric/clc_fast_normalize.cl +++ b/libclc/clc/lib/generic/geometric/clc_fast_normalize.cl @@ -10,6 +10,6 @@ #include #include -#define __FLOAT_ONLY +#define __CLC_FLOAT_ONLY #define __CLC_BODY #include diff --git a/libclc/clc/lib/generic/integer/clc_add_sat.cl 
b/libclc/clc/lib/generic/integer/clc_add_sat.cl index 8241ee5d547a..08337bf533fc 100644 --- a/libclc/clc/lib/generic/integer/clc_add_sat.cl +++ b/libclc/clc/lib/generic/integer/clc_add_sat.cl @@ -8,8 +8,8 @@ #include -#define FUNCTION __clc_add_sat -#define __IMPL_FUNCTION(x) __builtin_elementwise_add_sat +#define __CLC_FUNCTION __clc_add_sat +#define __CLC_IMPL_FUNCTION(x) __builtin_elementwise_add_sat #define __CLC_BODY #include diff --git a/libclc/clc/lib/generic/integer/clc_bit_reverse.cl b/libclc/clc/lib/generic/integer/clc_bit_reverse.cl index 439957383f58..9bf57e2c7d83 100644 --- a/libclc/clc/lib/generic/integer/clc_bit_reverse.cl +++ b/libclc/clc/lib/generic/integer/clc_bit_reverse.cl @@ -8,8 +8,8 @@ #include -#define FUNCTION __clc_bit_reverse -#define __IMPL_FUNCTION(x) __builtin_elementwise_bitreverse +#define __CLC_FUNCTION __clc_bit_reverse +#define __CLC_IMPL_FUNCTION(x) __builtin_elementwise_bitreverse #define __CLC_BODY #include diff --git a/libclc/clc/lib/generic/integer/clc_clz.cl b/libclc/clc/lib/generic/integer/clc_clz.cl index c6e1da680b7b..0d0c80b7cd1f 100644 --- a/libclc/clc/lib/generic/integer/clc_clz.cl +++ b/libclc/clc/lib/generic/integer/clc_clz.cl @@ -42,6 +42,6 @@ _CLC_OVERLOAD _CLC_DEF ulong __clc_clz(ulong x) { return x ? 
__builtin_clzl(x) : 64; } -#define FUNCTION __clc_clz +#define __CLC_FUNCTION __clc_clz #define __CLC_BODY #include diff --git a/libclc/clc/lib/generic/integer/clc_ctz.cl b/libclc/clc/lib/generic/integer/clc_ctz.cl index d82d99d539df..8dbebb3f345a 100644 --- a/libclc/clc/lib/generic/integer/clc_ctz.cl +++ b/libclc/clc/lib/generic/integer/clc_ctz.cl @@ -38,6 +38,6 @@ _CLC_OVERLOAD _CLC_DEF ulong __clc_ctz(ulong x) { return __builtin_ctzg(x, 64); } -#define FUNCTION __clc_ctz +#define __CLC_FUNCTION __clc_ctz #define __CLC_BODY #include diff --git a/libclc/clc/lib/generic/integer/clc_popcount.cl b/libclc/clc/lib/generic/integer/clc_popcount.cl index 078ceecc9769..2781ead638c9 100644 --- a/libclc/clc/lib/generic/integer/clc_popcount.cl +++ b/libclc/clc/lib/generic/integer/clc_popcount.cl @@ -8,8 +8,8 @@ #include -#define FUNCTION __clc_popcount -#define __IMPL_FUNCTION(x) __builtin_elementwise_popcount +#define __CLC_FUNCTION __clc_popcount +#define __CLC_IMPL_FUNCTION(x) __builtin_elementwise_popcount #define __CLC_BODY #include diff --git a/libclc/clc/lib/generic/integer/clc_sub_sat.cl b/libclc/clc/lib/generic/integer/clc_sub_sat.cl index e664f04b1748..8a7ffd06f8b0 100644 --- a/libclc/clc/lib/generic/integer/clc_sub_sat.cl +++ b/libclc/clc/lib/generic/integer/clc_sub_sat.cl @@ -8,8 +8,8 @@ #include -#define FUNCTION __clc_sub_sat -#define __IMPL_FUNCTION(x) __builtin_elementwise_sub_sat +#define __CLC_FUNCTION __clc_sub_sat +#define __CLC_IMPL_FUNCTION(x) __builtin_elementwise_sub_sat #define __CLC_BODY #include diff --git a/libclc/clc/lib/generic/math/clc_atan2.inc b/libclc/clc/lib/generic/math/clc_atan2.inc index 61ffeebbc5d1..19eaaeee0092 100644 --- a/libclc/clc/lib/generic/math/clc_atan2.inc +++ b/libclc/clc/lib/generic/math/clc_atan2.inc @@ -131,8 +131,8 @@ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_atan2(__CLC_GENTYPE y, { __CLC_GENTYPE val = vbyu > 0.0625 ? 
vbyu : 0.063; __CLC_INTN index = __CLC_CONVERT_INTN(__clc_fma(256.0, val, 0.5)); - q1 = USE_TABLE(atan_jby256_tbl_head, index - 16); - q2 = USE_TABLE(atan_jby256_tbl_tail, index - 16); + q1 = __CLC_USE_TABLE(atan_jby256_tbl_head, index - 16); + q2 = __CLC_USE_TABLE(atan_jby256_tbl_tail, index - 16); __CLC_GENTYPE c = __CLC_CONVERT_GENTYPE(index) * 0x1.0p-8; // We're going to scale u and v by 2^(-u_exponent) to bring them close to 1 diff --git a/libclc/clc/lib/generic/math/clc_atan2pi.inc b/libclc/clc/lib/generic/math/clc_atan2pi.inc index 79b2551e077c..9f901947887d 100644 --- a/libclc/clc/lib/generic/math/clc_atan2pi.inc +++ b/libclc/clc/lib/generic/math/clc_atan2pi.inc @@ -110,8 +110,8 @@ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_atan2pi(__CLC_GENTYPE y, { __CLC_GENTYPE val = vbyu > 0.0625 ? vbyu : 0.063; __CLC_INTN index = __CLC_CONVERT_INTN(__clc_fma(256.0, val, 0.5)); - q1 = USE_TABLE(atan_jby256_tbl_head, (index - 16)); - q2 = USE_TABLE(atan_jby256_tbl_tail, (index - 16)); + q1 = __CLC_USE_TABLE(atan_jby256_tbl_head, (index - 16)); + q2 = __CLC_USE_TABLE(atan_jby256_tbl_tail, (index - 16)); __CLC_GENTYPE c = __CLC_CONVERT_GENTYPE(index) * 0x1.0p-8; // We're going to scale u and v by 2^(-u_exponent) to bring them close to 1 diff --git a/libclc/clc/lib/generic/math/clc_cbrt.inc b/libclc/clc/lib/generic/math/clc_cbrt.inc index 2aba2484e33f..ce0e0fe45316 100644 --- a/libclc/clc/lib/generic/math/clc_cbrt.inc +++ b/libclc/clc/lib/generic/math/clc_cbrt.inc @@ -32,7 +32,7 @@ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_cbrt(__CLC_GENTYPE x) { __CLC_AS_GENTYPE(indx | 0x3f000000); indx >>= 16; - __CLC_GENTYPE r = f * USE_TABLE(log_inv_tbl, __CLC_AS_INTN(indx)); + __CLC_GENTYPE r = f * __CLC_USE_TABLE(log_inv_tbl, __CLC_AS_INTN(indx)); __CLC_GENTYPE poly = __clc_mad(__clc_mad(r, 0x1.f9add4p-5f, -0x1.c71c72p-4f), r * r, r * 0x1.555556p-2f); @@ -52,8 +52,8 @@ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_cbrt(__CLC_GENTYPE x) { remH = rem == 2 ? 
0x1.964000p+0f : remH; remT = rem == 2 ? 0x1.fea53ep-12f : remT; - __CLC_GENTYPE cbrtH = USE_TABLE(cbrt_tbl_head, __CLC_AS_INTN(indx)); - __CLC_GENTYPE cbrtT = USE_TABLE(cbrt_tbl_tail, __CLC_AS_INTN(indx)); + __CLC_GENTYPE cbrtH = __CLC_USE_TABLE(cbrt_tbl_head, __CLC_AS_INTN(indx)); + __CLC_GENTYPE cbrtT = __CLC_USE_TABLE(cbrt_tbl_tail, __CLC_AS_INTN(indx)); __CLC_GENTYPE bH = cbrtH * remH; __CLC_GENTYPE bT = @@ -97,7 +97,7 @@ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_cbrt(__CLC_GENTYPE x) { __CLC_GENTYPE F = __CLC_CONVERT_GENTYPE(index) * 0x1.0p-9; __CLC_GENTYPE f = Y - F; - __CLC_GENTYPE r = f * USE_TABLE(cbrt_inv_tbl, index - 256); + __CLC_GENTYPE r = f * __CLC_USE_TABLE(cbrt_inv_tbl, index - 256); __CLC_GENTYPE z = r * __clc_fma( @@ -112,11 +112,11 @@ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_cbrt(__CLC_GENTYPE x) { -0x1.c71c71c71c71cp-4), 0x1.5555555555555p-2); - __CLC_GENTYPE Rem_h = USE_TABLE(cbrt_rem_tbl_head, rem + 2); - __CLC_GENTYPE Rem_t = USE_TABLE(cbrt_rem_tbl_tail, rem + 2); + __CLC_GENTYPE Rem_h = __CLC_USE_TABLE(cbrt_rem_tbl_head, rem + 2); + __CLC_GENTYPE Rem_t = __CLC_USE_TABLE(cbrt_rem_tbl_tail, rem + 2); - __CLC_GENTYPE F_h = USE_TABLE(cbrt_dbl_tbl_head, index - 256); - __CLC_GENTYPE F_t = USE_TABLE(cbrt_dbl_tbl_tail, index - 256); + __CLC_GENTYPE F_h = __CLC_USE_TABLE(cbrt_dbl_tbl_head, index - 256); + __CLC_GENTYPE F_t = __CLC_USE_TABLE(cbrt_dbl_tbl_tail, index - 256); __CLC_GENTYPE b_h = F_h * Rem_h; __CLC_GENTYPE b_t = __clc_fma(Rem_t, F_h, __clc_fma(F_t, Rem_h, F_t * Rem_t)); diff --git a/libclc/clc/lib/generic/math/clc_ceil.cl b/libclc/clc/lib/generic/math/clc_ceil.cl index b06cb364b58d..e48ad1a482ef 100644 --- a/libclc/clc/lib/generic/math/clc_ceil.cl +++ b/libclc/clc/lib/generic/math/clc_ceil.cl @@ -8,8 +8,8 @@ #include -#define FUNCTION __clc_ceil -#define __IMPL_FUNCTION(x) __builtin_elementwise_ceil +#define __CLC_FUNCTION __clc_ceil +#define __CLC_IMPL_FUNCTION(x) __builtin_elementwise_ceil #define __CLC_BODY #include diff --git 
a/libclc/clc/lib/generic/math/clc_copysign.cl b/libclc/clc/lib/generic/math/clc_copysign.cl index 06f126b16dfd..89829e2375ea 100644 --- a/libclc/clc/lib/generic/math/clc_copysign.cl +++ b/libclc/clc/lib/generic/math/clc_copysign.cl @@ -8,8 +8,8 @@ #include -#define FUNCTION __clc_copysign -#define __IMPL_FUNCTION(x) __builtin_elementwise_copysign +#define __CLC_FUNCTION __clc_copysign +#define __CLC_IMPL_FUNCTION(x) __builtin_elementwise_copysign #define __CLC_BODY #include diff --git a/libclc/clc/lib/generic/math/clc_cosh.inc b/libclc/clc/lib/generic/math/clc_cosh.inc index a9fa1bb14140..c80b1a38c776 100644 --- a/libclc/clc/lib/generic/math/clc_cosh.inc +++ b/libclc/clc/lib/generic/math/clc_cosh.inc @@ -71,8 +71,8 @@ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_cosh(__CLC_GENTYPE x) { 0.500000000000000005911074e0f); cdy = __clc_mad(cdy, dy2, 1.0f); - __CLC_GENTYPE sinhcoshh = USE_TABLE(sinhcosh_tbl_head, ind); - __CLC_GENTYPE sinhcosht = USE_TABLE(sinhcosh_tbl_tail, ind); + __CLC_GENTYPE sinhcoshh = __CLC_USE_TABLE(sinhcosh_tbl_head, ind); + __CLC_GENTYPE sinhcosht = __CLC_USE_TABLE(sinhcosh_tbl_tail, ind); __CLC_GENTYPE z = __clc_mad(sinhcoshh, sdy, sinhcosht * cdy); // When exp(-x) is insignificant compared to exp(x), return exp(x)/2 @@ -162,10 +162,10 @@ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_cosh(__CLC_GENTYPE x) { // At this point sinh(dy) is approximated by dy + sdy, // and cosh(dy) is approximated by 1 + cdy. 
- __CLC_GENTYPE cl = USE_TABLE(cosh_tbl_head, ind); - __CLC_GENTYPE ct = USE_TABLE(cosh_tbl_tail, ind); - __CLC_GENTYPE sl = USE_TABLE(sinh_tbl_head, ind); - __CLC_GENTYPE st = USE_TABLE(sinh_tbl_tail, ind); + __CLC_GENTYPE cl = __CLC_USE_TABLE(cosh_tbl_head, ind); + __CLC_GENTYPE ct = __CLC_USE_TABLE(cosh_tbl_tail, ind); + __CLC_GENTYPE sl = __CLC_USE_TABLE(sinh_tbl_head, ind); + __CLC_GENTYPE st = __CLC_USE_TABLE(sinh_tbl_tail, ind); __CLC_GENTYPE z = __clc_fma( diff --git a/libclc/clc/lib/generic/math/clc_ep_log.inc b/libclc/clc/lib/generic/math/clc_ep_log.inc index f51e487bedca..d8cc886ffc74 100644 --- a/libclc/clc/lib/generic/math/clc_ep_log.inc +++ b/libclc/clc/lib/generic/math/clc_ep_log.inc @@ -58,8 +58,8 @@ _CLC_DEF _CLC_OVERLOAD void __clc_ep_log(__CLC_GENTYPE x, __CLC_GENTYPE f2 = f - f1; __CLC_GENTYPE u2 = MATH_DIVIDE(f2, __clc_fma(0.5, f2, f1)); - __CLC_GENTYPE z1 = USE_TABLE(ln_tbl_lo, (index - 64)); - __CLC_GENTYPE q = USE_TABLE(ln_tbl_hi, (index - 64)); + __CLC_GENTYPE z1 = __CLC_USE_TABLE(ln_tbl_lo, (index - 64)); + __CLC_GENTYPE q = __CLC_USE_TABLE(ln_tbl_hi, (index - 64)); z1 = near_one ? r : z1; q = near_one ? 
0.0 : q; diff --git a/libclc/clc/lib/generic/math/clc_erf.cl b/libclc/clc/lib/generic/math/clc_erf.cl index bea392445c93..34c7d586131e 100644 --- a/libclc/clc/lib/generic/math/clc_erf.cl +++ b/libclc/clc/lib/generic/math/clc_erf.cl @@ -506,6 +506,6 @@ _CLC_OVERLOAD _CLC_DEF half __clc_erf(half x) { #endif -#define FUNCTION __clc_erf +#define __CLC_FUNCTION __clc_erf #define __CLC_BODY #include diff --git a/libclc/clc/lib/generic/math/clc_erfc.cl b/libclc/clc/lib/generic/math/clc_erfc.cl index fab696004602..7922807818ea 100644 --- a/libclc/clc/lib/generic/math/clc_erfc.cl +++ b/libclc/clc/lib/generic/math/clc_erfc.cl @@ -517,6 +517,6 @@ _CLC_OVERLOAD _CLC_DEF half __clc_erfc(half x) { #endif -#define FUNCTION __clc_erfc +#define __CLC_FUNCTION __clc_erfc #define __CLC_BODY #include diff --git a/libclc/clc/lib/generic/math/clc_exp10.inc b/libclc/clc/lib/generic/math/clc_exp10.inc index 96bc5331fef1..8bf4ec4c0b37 100644 --- a/libclc/clc/lib/generic/math/clc_exp10.inc +++ b/libclc/clc/lib/generic/math/clc_exp10.inc @@ -69,7 +69,7 @@ _CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_exp10(__CLC_GENTYPE x) { 0x1.000000p-1f), r * r, r); - __CLC_GENTYPE two_to_jby64 = USE_TABLE(exp_tbl, j); + __CLC_GENTYPE two_to_jby64 = __CLC_USE_TABLE(exp_tbl, j); z2 = __clc_mad(two_to_jby64, z2, two_to_jby64); __CLC_GENTYPE z2s = z2 * __CLC_AS_GENTYPE((__CLC_UINTN)0x1 << (m + 149)); @@ -122,8 +122,8 @@ _CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_exp10(__CLC_GENTYPE x) { 0x1.0000000000000p-1), 1.0); - __CLC_GENTYPE tv0 = USE_TABLE(two_to_jby64_ep_tbl_head, j); - __CLC_GENTYPE tv1 = USE_TABLE(two_to_jby64_ep_tbl_tail, j); + __CLC_GENTYPE tv0 = __CLC_USE_TABLE(two_to_jby64_ep_tbl_head, j); + __CLC_GENTYPE tv1 = __CLC_USE_TABLE(two_to_jby64_ep_tbl_tail, j); z2 = __clc_fma(tv0 + tv1, z2, tv1) + tv0; __CLC_INTN small_value = diff --git a/libclc/clc/lib/generic/math/clc_exp_helper.cl b/libclc/clc/lib/generic/math/clc_exp_helper.cl index 92ff8f7fe4e6..1670b73b6c98 100644 --- 
a/libclc/clc/lib/generic/math/clc_exp_helper.cl +++ b/libclc/clc/lib/generic/math/clc_exp_helper.cl @@ -14,7 +14,7 @@ #include #include -#define __DOUBLE_ONLY +#define __CLC_DOUBLE_ONLY #define __CLC_BODY #include diff --git a/libclc/clc/lib/generic/math/clc_exp_helper.inc b/libclc/clc/lib/generic/math/clc_exp_helper.inc index 70ced7e9ea48..4ccf2d12b8cf 100644 --- a/libclc/clc/lib/generic/math/clc_exp_helper.inc +++ b/libclc/clc/lib/generic/math/clc_exp_helper.inc @@ -29,8 +29,8 @@ _CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_exp_helper(__CLC_GENTYPE x, 0x1.0000000000000p-1), 1.0); - __CLC_GENTYPE tv0 = USE_TABLE(two_to_jby64_ep_tbl_head, j); - __CLC_GENTYPE tv1 = USE_TABLE(two_to_jby64_ep_tbl_tail, j); + __CLC_GENTYPE tv0 = __CLC_USE_TABLE(two_to_jby64_ep_tbl_head, j); + __CLC_GENTYPE tv1 = __CLC_USE_TABLE(two_to_jby64_ep_tbl_tail, j); z2 = __clc_fma(tv0 + tv1, z2, tv1) + tv0; __CLC_INTN small_value = diff --git a/libclc/clc/lib/generic/math/clc_expm1.inc b/libclc/clc/lib/generic/math/clc_expm1.inc index 6abee9b3f0cc..a7ff90ea86ca 100644 --- a/libclc/clc/lib/generic/math/clc_expm1.inc +++ b/libclc/clc/lib/generic/math/clc_expm1.inc @@ -37,8 +37,8 @@ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_expm1(__CLC_GENTYPE x) { r); __CLC_GENTYPE m2 = __CLC_AS_GENTYPE((m + EXPBIAS_SP32) << EXPSHIFTBITS_SP32); - __CLC_GENTYPE exp_head = USE_TABLE(exp_tbl_ep_head, j); - __CLC_GENTYPE exp_tail = USE_TABLE(exp_tbl_ep_tail, j); + __CLC_GENTYPE exp_head = __CLC_USE_TABLE(exp_tbl_ep_head, j); + __CLC_GENTYPE exp_tail = __CLC_USE_TABLE(exp_tbl_ep_tail, j); __CLC_GENTYPE two_to_jby64_h = exp_head * m2; __CLC_GENTYPE two_to_jby64_t = exp_tail * m2; @@ -108,8 +108,8 @@ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_expm1(__CLC_GENTYPE x) { __CLC_INTN j = n & 0x3f; __CLC_INTN m = n >> 6; - __CLC_GENTYPE f1 = USE_TABLE(two_to_jby64_ep_tbl_head, j); - __CLC_GENTYPE f2 = USE_TABLE(two_to_jby64_ep_tbl_tail, j); + __CLC_GENTYPE f1 = __CLC_USE_TABLE(two_to_jby64_ep_tbl_head, j); + __CLC_GENTYPE f2 = 
__CLC_USE_TABLE(two_to_jby64_ep_tbl_tail, j); __CLC_GENTYPE f = f1 + f2; __CLC_GENTYPE dn = __CLC_CONVERT_GENTYPE(-n); diff --git a/libclc/clc/lib/generic/math/clc_fabs.cl b/libclc/clc/lib/generic/math/clc_fabs.cl index a0f0f3af07e2..e0c7685bfdb6 100644 --- a/libclc/clc/lib/generic/math/clc_fabs.cl +++ b/libclc/clc/lib/generic/math/clc_fabs.cl @@ -8,8 +8,8 @@ #include -#define FUNCTION __clc_fabs -#define __IMPL_FUNCTION(x) __builtin_elementwise_abs +#define __CLC_FUNCTION __clc_fabs +#define __CLC_IMPL_FUNCTION(x) __builtin_elementwise_abs #define __CLC_BODY #include diff --git a/libclc/clc/lib/generic/math/clc_floor.cl b/libclc/clc/lib/generic/math/clc_floor.cl index a14adb98297e..f31bed2e9ee6 100644 --- a/libclc/clc/lib/generic/math/clc_floor.cl +++ b/libclc/clc/lib/generic/math/clc_floor.cl @@ -8,8 +8,8 @@ #include -#define FUNCTION __clc_floor -#define __IMPL_FUNCTION(x) __builtin_elementwise_floor +#define __CLC_FUNCTION __clc_floor +#define __CLC_IMPL_FUNCTION(x) __builtin_elementwise_floor #define __CLC_BODY #include diff --git a/libclc/clc/lib/generic/math/clc_fmax.cl b/libclc/clc/lib/generic/math/clc_fmax.cl index b334207365b9..0fdbaa5eef5f 100644 --- a/libclc/clc/lib/generic/math/clc_fmax.cl +++ b/libclc/clc/lib/generic/math/clc_fmax.cl @@ -8,8 +8,8 @@ #include -#define FUNCTION __clc_fmax -#define __IMPL_FUNCTION(x) __builtin_elementwise_maximumnum +#define __CLC_FUNCTION __clc_fmax +#define __CLC_IMPL_FUNCTION(x) __builtin_elementwise_maximumnum #define __CLC_BODY #include diff --git a/libclc/clc/lib/generic/math/clc_fmin.cl b/libclc/clc/lib/generic/math/clc_fmin.cl index d21bb8d07679..572d54b128a6 100644 --- a/libclc/clc/lib/generic/math/clc_fmin.cl +++ b/libclc/clc/lib/generic/math/clc_fmin.cl @@ -8,8 +8,8 @@ #include -#define FUNCTION __clc_fmin -#define __IMPL_FUNCTION(x) __builtin_elementwise_minimumnum +#define __CLC_FUNCTION __clc_fmin +#define __CLC_IMPL_FUNCTION(x) __builtin_elementwise_minimumnum #define __CLC_BODY #include diff --git 
a/libclc/clc/lib/generic/math/clc_fmod.cl b/libclc/clc/lib/generic/math/clc_fmod.cl index 3162ef60d46e..3cb01e67292e 100644 --- a/libclc/clc/lib/generic/math/clc_fmod.cl +++ b/libclc/clc/lib/generic/math/clc_fmod.cl @@ -64,11 +64,11 @@ _CLC_DEF _CLC_OVERLOAD float __clc_fmod(float x, float y) { return xr; } -#define __FLOAT_ONLY -#define FUNCTION __clc_fmod +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION __clc_fmod #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #ifdef cl_khr_fp64 @@ -176,11 +176,11 @@ _CLC_DEF _CLC_OVERLOAD double __clc_fmod(double x, double y) { return ret; } -#define __DOUBLE_ONLY -#define FUNCTION __clc_fmod +#define __CLC_DOUBLE_ONLY +#define __CLC_FUNCTION __clc_fmod #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif @@ -189,8 +189,8 @@ _CLC_DEF _CLC_OVERLOAD double __clc_fmod(double x, double y) { #pragma OPENCL EXTENSION cl_khr_fp16 : enable // Forward the half version of this builtin onto the float one -#define __HALF_ONLY -#define FUNCTION __clc_fmod +#define __CLC_HALF_ONLY +#define __CLC_FUNCTION __clc_fmod #define __CLC_BODY #include diff --git a/libclc/clc/lib/generic/math/clc_fract.inc b/libclc/clc/lib/generic/math/clc_fract.inc index f0466e339b34..5b12c74f293f 100644 --- a/libclc/clc/lib/generic/math/clc_fract.inc +++ b/libclc/clc/lib/generic/math/clc_fract.inc @@ -23,7 +23,7 @@ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_fract(__CLC_GENTYPE x, return r; } -#define FRACT_DEF(addrspace) \ +#define __CLC_FRACT_DEF(addrspace) \ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_fract( \ __CLC_GENTYPE x, addrspace __CLC_GENTYPE *iptr) { \ __CLC_GENTYPE private_iptr; \ @@ -32,10 +32,10 @@ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_fract(__CLC_GENTYPE x, return ret; \ } -FRACT_DEF(local); -FRACT_DEF(global); +__CLC_FRACT_DEF(local); +__CLC_FRACT_DEF(global); #if _CLC_DISTINCT_GENERIC_AS_SUPPORTED -FRACT_DEF(generic); +__CLC_FRACT_DEF(generic); #endif #undef MIN_CONSTANT diff --git 
a/libclc/clc/lib/generic/math/clc_half_cos.cl b/libclc/clc/lib/generic/math/clc_half_cos.cl index 79a7ee7b483f..4420ad31afae 100644 --- a/libclc/clc/lib/generic/math/clc_half_cos.cl +++ b/libclc/clc/lib/generic/math/clc_half_cos.cl @@ -8,9 +8,9 @@ #include -#define __FLOAT_ONLY -#define FUNCTION __clc_half_cos -#define __IMPL_FUNCTION(x) __clc_cos +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION __clc_half_cos +#define __CLC_IMPL_FUNCTION(x) __clc_cos #define __CLC_BODY #include diff --git a/libclc/clc/lib/generic/math/clc_half_divide.cl b/libclc/clc/lib/generic/math/clc_half_divide.cl index 88676a4ac785..16af197d29a7 100644 --- a/libclc/clc/lib/generic/math/clc_half_divide.cl +++ b/libclc/clc/lib/generic/math/clc_half_divide.cl @@ -8,7 +8,7 @@ #include -#define __FLOAT_ONLY +#define __CLC_FLOAT_ONLY #define __CLC_BODY #include diff --git a/libclc/clc/lib/generic/math/clc_half_exp.cl b/libclc/clc/lib/generic/math/clc_half_exp.cl index b53454580e3c..4b4717672d36 100644 --- a/libclc/clc/lib/generic/math/clc_half_exp.cl +++ b/libclc/clc/lib/generic/math/clc_half_exp.cl @@ -8,9 +8,9 @@ #include -#define __FLOAT_ONLY -#define FUNCTION __clc_half_exp -#define __IMPL_FUNCTION(x) __clc_exp +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION __clc_half_exp +#define __CLC_IMPL_FUNCTION(x) __clc_exp #define __CLC_BODY #include diff --git a/libclc/clc/lib/generic/math/clc_half_exp10.cl b/libclc/clc/lib/generic/math/clc_half_exp10.cl index 7bd107bf59e8..5bacca1ee0b7 100644 --- a/libclc/clc/lib/generic/math/clc_half_exp10.cl +++ b/libclc/clc/lib/generic/math/clc_half_exp10.cl @@ -8,9 +8,9 @@ #include -#define __FLOAT_ONLY -#define FUNCTION __clc_half_exp10 -#define __IMPL_FUNCTION(x) __clc_exp10 +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION __clc_half_exp10 +#define __CLC_IMPL_FUNCTION(x) __clc_exp10 #define __CLC_BODY #include diff --git a/libclc/clc/lib/generic/math/clc_half_exp2.cl b/libclc/clc/lib/generic/math/clc_half_exp2.cl index a20d79bfff6e..9a8f2f980c5a 100644 --- 
a/libclc/clc/lib/generic/math/clc_half_exp2.cl +++ b/libclc/clc/lib/generic/math/clc_half_exp2.cl @@ -8,9 +8,9 @@ #include -#define __FLOAT_ONLY -#define FUNCTION __clc_half_exp2 -#define __IMPL_FUNCTION(x) __clc_exp2 +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION __clc_half_exp2 +#define __CLC_IMPL_FUNCTION(x) __clc_exp2 #define __CLC_BODY #include diff --git a/libclc/clc/lib/generic/math/clc_half_log.cl b/libclc/clc/lib/generic/math/clc_half_log.cl index 26b2c756c988..160b193e8040 100644 --- a/libclc/clc/lib/generic/math/clc_half_log.cl +++ b/libclc/clc/lib/generic/math/clc_half_log.cl @@ -8,9 +8,9 @@ #include -#define __FLOAT_ONLY -#define FUNCTION __clc_half_log -#define __IMPL_FUNCTION(x) __clc_log +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION __clc_half_log +#define __CLC_IMPL_FUNCTION(x) __clc_log #define __CLC_BODY #include diff --git a/libclc/clc/lib/generic/math/clc_half_log10.cl b/libclc/clc/lib/generic/math/clc_half_log10.cl index 36f5bf55bb53..83d57db1bdf2 100644 --- a/libclc/clc/lib/generic/math/clc_half_log10.cl +++ b/libclc/clc/lib/generic/math/clc_half_log10.cl @@ -8,9 +8,9 @@ #include -#define __FLOAT_ONLY -#define FUNCTION __clc_half_log10 -#define __IMPL_FUNCTION(x) __clc_log10 +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION __clc_half_log10 +#define __CLC_IMPL_FUNCTION(x) __clc_log10 #define __CLC_BODY #include diff --git a/libclc/clc/lib/generic/math/clc_half_log2.cl b/libclc/clc/lib/generic/math/clc_half_log2.cl index cbf1d350dd8e..7de0053fd750 100644 --- a/libclc/clc/lib/generic/math/clc_half_log2.cl +++ b/libclc/clc/lib/generic/math/clc_half_log2.cl @@ -8,9 +8,9 @@ #include -#define __FLOAT_ONLY -#define FUNCTION __clc_half_log2 -#define __IMPL_FUNCTION(x) __clc_log2 +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION __clc_half_log2 +#define __CLC_IMPL_FUNCTION(x) __clc_log2 #define __CLC_BODY #include diff --git a/libclc/clc/lib/generic/math/clc_half_powr.cl b/libclc/clc/lib/generic/math/clc_half_powr.cl index 
f7ef2074b840..7065377ede66 100644 --- a/libclc/clc/lib/generic/math/clc_half_powr.cl +++ b/libclc/clc/lib/generic/math/clc_half_powr.cl @@ -8,9 +8,9 @@ #include -#define __FLOAT_ONLY -#define FUNCTION __clc_half_powr -#define __IMPL_FUNCTION(x) __clc_powr +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION __clc_half_powr +#define __CLC_IMPL_FUNCTION(x) __clc_powr #define __CLC_BODY #include diff --git a/libclc/clc/lib/generic/math/clc_half_recip.cl b/libclc/clc/lib/generic/math/clc_half_recip.cl index 0ae1e922d489..a988ad5e05a8 100644 --- a/libclc/clc/lib/generic/math/clc_half_recip.cl +++ b/libclc/clc/lib/generic/math/clc_half_recip.cl @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#define __FLOAT_ONLY +#define __CLC_FLOAT_ONLY #define __CLC_BODY #include diff --git a/libclc/clc/lib/generic/math/clc_half_rsqrt.cl b/libclc/clc/lib/generic/math/clc_half_rsqrt.cl index 7e5d9e052685..4e0cdd252f65 100644 --- a/libclc/clc/lib/generic/math/clc_half_rsqrt.cl +++ b/libclc/clc/lib/generic/math/clc_half_rsqrt.cl @@ -8,9 +8,9 @@ #include -#define __FLOAT_ONLY -#define FUNCTION __clc_half_rsqrt -#define __IMPL_FUNCTION(x) __clc_rsqrt +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION __clc_half_rsqrt +#define __CLC_IMPL_FUNCTION(x) __clc_rsqrt #define __CLC_BODY #include diff --git a/libclc/clc/lib/generic/math/clc_half_sin.cl b/libclc/clc/lib/generic/math/clc_half_sin.cl index ef333dae3797..acc5d43156ab 100644 --- a/libclc/clc/lib/generic/math/clc_half_sin.cl +++ b/libclc/clc/lib/generic/math/clc_half_sin.cl @@ -8,9 +8,9 @@ #include -#define __FLOAT_ONLY -#define FUNCTION __clc_half_sin -#define __IMPL_FUNCTION(x) __clc_sin +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION __clc_half_sin +#define __CLC_IMPL_FUNCTION(x) __clc_sin #define __CLC_BODY #include diff --git a/libclc/clc/lib/generic/math/clc_half_sqrt.cl b/libclc/clc/lib/generic/math/clc_half_sqrt.cl index 293eb81196b6..01e779960df6 100644 --- 
a/libclc/clc/lib/generic/math/clc_half_sqrt.cl +++ b/libclc/clc/lib/generic/math/clc_half_sqrt.cl @@ -8,9 +8,9 @@ #include -#define __FLOAT_ONLY -#define FUNCTION __clc_half_sqrt -#define __IMPL_FUNCTION(x) __clc_sqrt +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION __clc_half_sqrt +#define __CLC_IMPL_FUNCTION(x) __clc_sqrt #define __CLC_BODY #include diff --git a/libclc/clc/lib/generic/math/clc_half_tan.cl b/libclc/clc/lib/generic/math/clc_half_tan.cl index ecc9f8f32831..1427fe02555f 100644 --- a/libclc/clc/lib/generic/math/clc_half_tan.cl +++ b/libclc/clc/lib/generic/math/clc_half_tan.cl @@ -8,9 +8,9 @@ #include -#define __FLOAT_ONLY -#define FUNCTION __clc_half_tan -#define __IMPL_FUNCTION(x) __clc_tan +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION __clc_half_tan +#define __CLC_IMPL_FUNCTION(x) __clc_tan #define __CLC_BODY #include diff --git a/libclc/clc/lib/generic/math/clc_ldexp.cl b/libclc/clc/lib/generic/math/clc_ldexp.cl index cb4185d89c72..a5327c58e424 100644 --- a/libclc/clc/lib/generic/math/clc_ldexp.cl +++ b/libclc/clc/lib/generic/math/clc_ldexp.cl @@ -131,7 +131,7 @@ _CLC_OVERLOAD _CLC_DEF_ldexp half __clc_ldexp(half x, int n) { #endif -#define FUNCTION __clc_ldexp +#define __CLC_FUNCTION __clc_ldexp #define __CLC_DEF_SPEC _CLC_DEF_ldexp #define __CLC_ARG2_TYPE int #define __CLC_BODY diff --git a/libclc/clc/lib/generic/math/clc_log.cl b/libclc/clc/lib/generic/math/clc_log.cl index cf5628f206ca..7eb0180de971 100644 --- a/libclc/clc/lib/generic/math/clc_log.cl +++ b/libclc/clc/lib/generic/math/clc_log.cl @@ -39,6 +39,6 @@ _CLC_OVERLOAD _CLC_DEF half __clc_log(half x) { #endif // cl_khr_fp16 -#define FUNCTION __clc_log +#define __CLC_FUNCTION __clc_log #define __CLC_BODY #include diff --git a/libclc/clc/lib/generic/math/clc_log10.cl b/libclc/clc/lib/generic/math/clc_log10.cl index f5f0e8cc7083..35489f467562 100644 --- a/libclc/clc/lib/generic/math/clc_log10.cl +++ b/libclc/clc/lib/generic/math/clc_log10.cl @@ -22,6 +22,6 @@ #include "clc_log_base.h" 
#undef COMPILING_LOG10 -#define FUNCTION __clc_log10 +#define __CLC_FUNCTION __clc_log10 #define __CLC_BODY #include diff --git a/libclc/clc/lib/generic/math/clc_log1p.inc b/libclc/clc/lib/generic/math/clc_log1p.inc index 8c7dcfc48c18..2dd616818c71 100644 --- a/libclc/clc/lib/generic/math/clc_log1p.inc +++ b/libclc/clc/lib/generic/math/clc_log1p.inc @@ -56,7 +56,7 @@ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_log1p(__CLC_GENTYPE x) { __CLC_GENTYPE f = mf > 24.0f ? fg24 : fl24; indx = indx >> 16; - __CLC_GENTYPE r = f * USE_TABLE(log_inv_tbl, __CLC_CONVERT_INTN(indx)); + __CLC_GENTYPE r = f * __CLC_USE_TABLE(log_inv_tbl, __CLC_CONVERT_INTN(indx)); // 1/3, 1/2 __CLC_GENTYPE poly = @@ -65,8 +65,8 @@ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_log1p(__CLC_GENTYPE x) { const __CLC_GENTYPE LOG2_HEAD = 0x1.62e000p-1f; // 0.693115234 const __CLC_GENTYPE LOG2_TAIL = 0x1.0bfbe8p-15f; // 0.0000319461833 - __CLC_GENTYPE tv0 = USE_TABLE(loge_tbl_lo, __CLC_AS_INTN(indx)); - __CLC_GENTYPE tv1 = USE_TABLE(loge_tbl_hi, __CLC_AS_INTN(indx)); + __CLC_GENTYPE tv0 = __CLC_USE_TABLE(loge_tbl_lo, __CLC_AS_INTN(indx)); + __CLC_GENTYPE tv1 = __CLC_USE_TABLE(loge_tbl_hi, __CLC_AS_INTN(indx)); __CLC_GENTYPE z1 = __clc_mad(mf, LOG2_HEAD, tv0); __CLC_GENTYPE z2 = __clc_mad(mf, LOG2_TAIL, -poly) + tv1; __CLC_GENTYPE z = z1 + z2; @@ -108,8 +108,8 @@ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_log1p(__CLC_GENTYPE x) { f2 = __CLC_CONVERT_LONGN(xexp <= -2 || (xexp >= MANTLENGTH_DP64 + 8)) ? 
f2temp : f2; - __CLC_GENTYPE z1 = USE_TABLE(ln_tbl_lo, j); - __CLC_GENTYPE q = USE_TABLE(ln_tbl_hi, j); + __CLC_GENTYPE z1 = __CLC_USE_TABLE(ln_tbl_lo, j); + __CLC_GENTYPE q = __CLC_USE_TABLE(ln_tbl_hi, j); __CLC_GENTYPE u = MATH_DIVIDE(f2, __clc_fma(0.5, f2, f1)); __CLC_GENTYPE v = u * u; diff --git a/libclc/clc/lib/generic/math/clc_log2.cl b/libclc/clc/lib/generic/math/clc_log2.cl index 335488af2f3d..d9d8cef54255 100644 --- a/libclc/clc/lib/generic/math/clc_log2.cl +++ b/libclc/clc/lib/generic/math/clc_log2.cl @@ -22,6 +22,6 @@ #include "clc_log_base.h" #undef COMPILING_LOG2 -#define FUNCTION __clc_log2 +#define __CLC_FUNCTION __clc_log2 #define __CLC_BODY #include diff --git a/libclc/clc/lib/generic/math/clc_log_base.h b/libclc/clc/lib/generic/math/clc_log_base.h index 9418535db827..ba92679dbaaf 100644 --- a/libclc/clc/lib/generic/math/clc_log_base.h +++ b/libclc/clc/lib/generic/math/clc_log_base.h @@ -142,21 +142,21 @@ __clc_log(float x) __clc_as_float(0x3f000000 | (xin & MANTBITS_SP32)); indx = indx >> 16; - r = f * USE_TABLE(log_inv_tbl, indx); + r = f * __CLC_USE_TABLE(log_inv_tbl, indx); // 1/3, 1/2 float poly = __clc_mad(__clc_mad(r, 0x1.555556p-2f, 0.5f), r * r, r); #if defined(COMPILING_LOG2) - float2 tv = USE_TABLE(log2_tbl, indx); + float2 tv = __CLC_USE_TABLE(log2_tbl, indx); z1 = tv.s0 + mf; z2 = __clc_mad(poly, -LOG2E, tv.s1); #elif defined(COMPILING_LOG10) - float2 tv = USE_TABLE(log10_tbl, indx); + float2 tv = __CLC_USE_TABLE(log10_tbl, indx); z1 = __clc_mad(mf, LOG10_2_HEAD, tv.s0); z2 = __clc_mad(poly, -LOG10E, mf * LOG10_2_TAIL) + tv.s1; #else - float2 tv = USE_TABLE(log_tbl, indx); + float2 tv = __CLC_USE_TABLE(log_tbl, indx); z1 = __clc_mad(mf, LOG2_HEAD, tv.s0); z2 = __clc_mad(mf, LOG2_TAIL, -poly) + tv.s1; #endif @@ -261,8 +261,8 @@ __clc_log(double x) int index = __clc_as_int2(ux).hi >> 13; index = ((0x80 | (index & 0x7e)) >> 1) + (index & 0x1); - double z1 = USE_TABLE(ln_tbl_lo, index - 64); - double q = USE_TABLE(ln_tbl_hi, index - 64); 
+ double z1 = __CLC_USE_TABLE(ln_tbl_lo, index - 64); + double q = __CLC_USE_TABLE(ln_tbl_hi, index - 64); double f1 = index * 0x1.0p-7; double f2 = f - f1; diff --git a/libclc/clc/lib/generic/math/clc_modf.inc b/libclc/clc/lib/generic/math/clc_modf.inc index 962d859ce9f6..4344c65a12be 100644 --- a/libclc/clc/lib/generic/math/clc_modf.inc +++ b/libclc/clc/lib/generic/math/clc_modf.inc @@ -11,7 +11,7 @@ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_modf(__CLC_GENTYPE x, return __clc_copysign(__clc_isinf(x) ? __CLC_FP_LIT(0.0) : x - *iptr, x); } -#define CLC_MODF_DEF(addrspace) \ +#define __CLC_MODF_DEF(addrspace) \ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_modf( \ __CLC_GENTYPE x, addrspace __CLC_GENTYPE *iptr) { \ __CLC_GENTYPE private_iptr; \ @@ -20,11 +20,11 @@ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_modf(__CLC_GENTYPE x, return ret; \ } -CLC_MODF_DEF(local); -CLC_MODF_DEF(global); +__CLC_MODF_DEF(local); +__CLC_MODF_DEF(global); #if _CLC_DISTINCT_GENERIC_AS_SUPPORTED -CLC_MODF_DEF(generic); +__CLC_MODF_DEF(generic); #endif -#undef CLC_MODF_DEF +#undef __CLC_MODF_DEF diff --git a/libclc/clc/lib/generic/math/clc_native_cos.cl b/libclc/clc/lib/generic/math/clc_native_cos.cl index 2e63e9c9493f..4e9a6c91b426 100644 --- a/libclc/clc/lib/generic/math/clc_native_cos.cl +++ b/libclc/clc/lib/generic/math/clc_native_cos.cl @@ -8,9 +8,9 @@ #include -#define __FLOAT_ONLY -#define FUNCTION __clc_native_cos -#define __IMPL_FUNCTION(x) __builtin_elementwise_cos +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION __clc_native_cos +#define __CLC_IMPL_FUNCTION(x) __builtin_elementwise_cos #define __CLC_BODY #include diff --git a/libclc/clc/lib/generic/math/clc_native_divide.cl b/libclc/clc/lib/generic/math/clc_native_divide.cl index 005089b1ba15..abec9add4fd6 100644 --- a/libclc/clc/lib/generic/math/clc_native_divide.cl +++ b/libclc/clc/lib/generic/math/clc_native_divide.cl @@ -8,7 +8,7 @@ #include -#define __FLOAT_ONLY +#define __CLC_FLOAT_ONLY #define __CLC_BODY #include diff 
--git a/libclc/clc/lib/generic/math/clc_native_exp.cl b/libclc/clc/lib/generic/math/clc_native_exp.cl index 2f50a0aefac9..63b4e5691397 100644 --- a/libclc/clc/lib/generic/math/clc_native_exp.cl +++ b/libclc/clc/lib/generic/math/clc_native_exp.cl @@ -8,9 +8,9 @@ #include -#define __FLOAT_ONLY -#define FUNCTION __clc_native_exp -#define __IMPL_FUNCTION(x) __builtin_elementwise_exp +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION __clc_native_exp +#define __CLC_IMPL_FUNCTION(x) __builtin_elementwise_exp #define __CLC_BODY #include diff --git a/libclc/clc/lib/generic/math/clc_native_exp10.cl b/libclc/clc/lib/generic/math/clc_native_exp10.cl index 7ca5d1992c61..2819cd59c4be 100644 --- a/libclc/clc/lib/generic/math/clc_native_exp10.cl +++ b/libclc/clc/lib/generic/math/clc_native_exp10.cl @@ -10,7 +10,7 @@ #include #include -#define __FLOAT_ONLY +#define __CLC_FLOAT_ONLY #define __CLC_BODY #include diff --git a/libclc/clc/lib/generic/math/clc_native_exp2.cl b/libclc/clc/lib/generic/math/clc_native_exp2.cl index 25f557306c35..a995b0347cac 100644 --- a/libclc/clc/lib/generic/math/clc_native_exp2.cl +++ b/libclc/clc/lib/generic/math/clc_native_exp2.cl @@ -8,9 +8,9 @@ #include -#define __FLOAT_ONLY -#define FUNCTION __clc_native_exp2 -#define __IMPL_FUNCTION(x) __builtin_elementwise_exp2 +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION __clc_native_exp2 +#define __CLC_IMPL_FUNCTION(x) __builtin_elementwise_exp2 #define __CLC_BODY #include diff --git a/libclc/clc/lib/generic/math/clc_native_log.cl b/libclc/clc/lib/generic/math/clc_native_log.cl index b9b9d274f8b9..617908277fb6 100644 --- a/libclc/clc/lib/generic/math/clc_native_log.cl +++ b/libclc/clc/lib/generic/math/clc_native_log.cl @@ -8,9 +8,9 @@ #include -#define __FLOAT_ONLY -#define FUNCTION __clc_native_log -#define __IMPL_FUNCTION(x) __builtin_elementwise_log +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION __clc_native_log +#define __CLC_IMPL_FUNCTION(x) __builtin_elementwise_log #define __CLC_BODY #include diff 
--git a/libclc/clc/lib/generic/math/clc_native_log10.cl b/libclc/clc/lib/generic/math/clc_native_log10.cl index 221aa406466c..0558f8cb7049 100644 --- a/libclc/clc/lib/generic/math/clc_native_log10.cl +++ b/libclc/clc/lib/generic/math/clc_native_log10.cl @@ -8,9 +8,9 @@ #include -#define __FLOAT_ONLY -#define FUNCTION __clc_native_log10 -#define __IMPL_FUNCTION(x) __builtin_elementwise_log10 +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION __clc_native_log10 +#define __CLC_IMPL_FUNCTION(x) __builtin_elementwise_log10 #define __CLC_BODY #include diff --git a/libclc/clc/lib/generic/math/clc_native_log2.cl b/libclc/clc/lib/generic/math/clc_native_log2.cl index c3008ce242c7..c897ac07acb4 100644 --- a/libclc/clc/lib/generic/math/clc_native_log2.cl +++ b/libclc/clc/lib/generic/math/clc_native_log2.cl @@ -8,9 +8,9 @@ #include -#define __FLOAT_ONLY -#define FUNCTION __clc_native_log2 -#define __IMPL_FUNCTION(x) __builtin_elementwise_log2 +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION __clc_native_log2 +#define __CLC_IMPL_FUNCTION(x) __builtin_elementwise_log2 #define __CLC_BODY #include diff --git a/libclc/clc/lib/generic/math/clc_native_powr.cl b/libclc/clc/lib/generic/math/clc_native_powr.cl index 7c7dcaf8c508..dac9b7edf78b 100644 --- a/libclc/clc/lib/generic/math/clc_native_powr.cl +++ b/libclc/clc/lib/generic/math/clc_native_powr.cl @@ -10,7 +10,7 @@ #include #include -#define __FLOAT_ONLY +#define __CLC_FLOAT_ONLY #define __CLC_BODY #include diff --git a/libclc/clc/lib/generic/math/clc_native_recip.cl b/libclc/clc/lib/generic/math/clc_native_recip.cl index 4377f10b1543..75d832c8467e 100644 --- a/libclc/clc/lib/generic/math/clc_native_recip.cl +++ b/libclc/clc/lib/generic/math/clc_native_recip.cl @@ -8,7 +8,7 @@ #include -#define __FLOAT_ONLY +#define __CLC_FLOAT_ONLY #define __CLC_BODY #include diff --git a/libclc/clc/lib/generic/math/clc_native_rsqrt.cl b/libclc/clc/lib/generic/math/clc_native_rsqrt.cl index d5e6fcdae491..9b88e72126f8 100644 --- 
a/libclc/clc/lib/generic/math/clc_native_rsqrt.cl +++ b/libclc/clc/lib/generic/math/clc_native_rsqrt.cl @@ -10,5 +10,5 @@ #include #define __CLC_BODY -#define __FLOAT_ONLY +#define __CLC_FLOAT_ONLY #include diff --git a/libclc/clc/lib/generic/math/clc_native_sin.cl b/libclc/clc/lib/generic/math/clc_native_sin.cl index 533f8d726d41..941342f387a1 100644 --- a/libclc/clc/lib/generic/math/clc_native_sin.cl +++ b/libclc/clc/lib/generic/math/clc_native_sin.cl @@ -8,9 +8,9 @@ #include -#define __FLOAT_ONLY -#define FUNCTION __clc_native_sin -#define __IMPL_FUNCTION(x) __builtin_elementwise_sin +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION __clc_native_sin +#define __CLC_IMPL_FUNCTION(x) __builtin_elementwise_sin #define __CLC_BODY #include diff --git a/libclc/clc/lib/generic/math/clc_native_sqrt.cl b/libclc/clc/lib/generic/math/clc_native_sqrt.cl index e0f028e988b4..cc18a4b7ba1d 100644 --- a/libclc/clc/lib/generic/math/clc_native_sqrt.cl +++ b/libclc/clc/lib/generic/math/clc_native_sqrt.cl @@ -8,9 +8,9 @@ #include -#define __FLOAT_ONLY -#define FUNCTION __clc_native_sqrt -#define __IMPL_FUNCTION(x) __builtin_elementwise_sqrt +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION __clc_native_sqrt +#define __CLC_IMPL_FUNCTION(x) __builtin_elementwise_sqrt #define __CLC_BODY #include diff --git a/libclc/clc/lib/generic/math/clc_native_tan.cl b/libclc/clc/lib/generic/math/clc_native_tan.cl index a9ac46688c8f..6b69067be388 100644 --- a/libclc/clc/lib/generic/math/clc_native_tan.cl +++ b/libclc/clc/lib/generic/math/clc_native_tan.cl @@ -10,7 +10,7 @@ #include #include -#define __FLOAT_ONLY +#define __CLC_FLOAT_ONLY #define __CLC_BODY #include diff --git a/libclc/clc/lib/generic/math/clc_nextafter.cl b/libclc/clc/lib/generic/math/clc_nextafter.cl index 6fc699ce711a..40e1b5a2c57f 100644 --- a/libclc/clc/lib/generic/math/clc_nextafter.cl +++ b/libclc/clc/lib/generic/math/clc_nextafter.cl @@ -14,67 +14,69 @@ // This file provides OpenCL C implementations of __clc_nextafter for // 
targets that don't support the clang builtin. -#define CLC_AS_TYPE(x) __clc_as_##x +#define __CLC_CLC_AS_TYPE(x) __clc_as_##x -#define NEXTAFTER(FLOAT_TYPE, UINT_TYPE, INT_TYPE, INT_TYPE_SCALAR) \ +#define __CLC_NEXTAFTER(FLOAT_TYPE, UINT_TYPE, INT_TYPE, INT_TYPE_SCALAR) \ _CLC_OVERLOAD _CLC_DEF FLOAT_TYPE __clc_nextafter(FLOAT_TYPE x, \ FLOAT_TYPE y) { \ const UINT_TYPE sign_bit = (UINT_TYPE)1 \ << (sizeof(INT_TYPE_SCALAR) * 8 - 1); \ - UINT_TYPE ix = CLC_AS_TYPE(UINT_TYPE)(x); \ + UINT_TYPE ix = __CLC_CLC_AS_TYPE(UINT_TYPE)(x); \ FLOAT_TYPE absx = __clc_fabs(x); \ UINT_TYPE mxu = sign_bit - ix; \ - INT_TYPE mx = CLC_AS_TYPE(INT_TYPE)(mxu); \ - mx = CLC_AS_TYPE(INT_TYPE)(ix) < (INT_TYPE)0 ? mx \ - : CLC_AS_TYPE(INT_TYPE)(ix); \ - UINT_TYPE iy = CLC_AS_TYPE(UINT_TYPE)(y); \ + INT_TYPE mx = __CLC_CLC_AS_TYPE(INT_TYPE)(mxu); \ + mx = __CLC_CLC_AS_TYPE(INT_TYPE)(ix) < (INT_TYPE)0 \ + ? mx \ + : __CLC_CLC_AS_TYPE(INT_TYPE)(ix); \ + UINT_TYPE iy = __CLC_CLC_AS_TYPE(UINT_TYPE)(y); \ FLOAT_TYPE absy = __clc_fabs(y); \ UINT_TYPE myu = sign_bit - iy; \ - INT_TYPE my = CLC_AS_TYPE(INT_TYPE)(myu); \ - my = CLC_AS_TYPE(INT_TYPE)(iy) < (INT_TYPE)0 ? my \ - : CLC_AS_TYPE(INT_TYPE)(iy); \ - INT_TYPE t = mx + (mx < my ? (INT_TYPE)1 : (INT_TYPE)-1); \ - UINT_TYPE r = sign_bit - CLC_AS_TYPE(UINT_TYPE)(t); \ + INT_TYPE my = __CLC_CLC_AS_TYPE(INT_TYPE)(myu); \ + my = __CLC_CLC_AS_TYPE(INT_TYPE)(iy) < (INT_TYPE)0 \ + ? my \ + : __CLC_CLC_AS_TYPE(INT_TYPE)(iy); \ + INT_TYPE t = mx + (mx < my ? (INT_TYPE)1 : (INT_TYPE) - 1); \ + UINT_TYPE r = sign_bit - __CLC_CLC_AS_TYPE(UINT_TYPE)(t); \ r = (t < (INT_TYPE)0 || (t == (INT_TYPE)0 && mx < my)) \ ? r \ - : CLC_AS_TYPE(UINT_TYPE)(t); \ + : __CLC_CLC_AS_TYPE(UINT_TYPE)(t); \ r = __clc_isnan(x) ? ix : r; \ r = __clc_isnan(y) ? 
iy : r; \ - r = ((CLC_AS_TYPE(UINT_TYPE)(absx) | CLC_AS_TYPE(UINT_TYPE)(absy)) == \ - (UINT_TYPE)0 || \ + r = ((__CLC_CLC_AS_TYPE(UINT_TYPE)(absx) | \ + __CLC_CLC_AS_TYPE(UINT_TYPE)(absy)) == (UINT_TYPE)0 || \ ix == iy) \ ? iy \ : r; \ - return CLC_AS_TYPE(FLOAT_TYPE)(r); \ + return __CLC_CLC_AS_TYPE(FLOAT_TYPE)(r); \ } -NEXTAFTER(float, uint, int, int) -NEXTAFTER(float2, uint2, int2, int) -NEXTAFTER(float3, uint3, int3, int) -NEXTAFTER(float4, uint4, int4, int) -NEXTAFTER(float8, uint8, int8, int) -NEXTAFTER(float16, uint16, int16, int) +__CLC_NEXTAFTER(float, uint, int, int) +__CLC_NEXTAFTER(float2, uint2, int2, int) +__CLC_NEXTAFTER(float3, uint3, int3, int) +__CLC_NEXTAFTER(float4, uint4, int4, int) +__CLC_NEXTAFTER(float8, uint8, int8, int) +__CLC_NEXTAFTER(float16, uint16, int16, int) #ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64 : enable -NEXTAFTER(double, ulong, long, long) -NEXTAFTER(double2, ulong2, long2, long) -NEXTAFTER(double3, ulong3, long3, long) -NEXTAFTER(double4, ulong4, long4, long) -NEXTAFTER(double8, ulong8, long8, long) -NEXTAFTER(double16, ulong16, long16, long) +__CLC_NEXTAFTER(double, ulong, long, long) +__CLC_NEXTAFTER(double2, ulong2, long2, long) +__CLC_NEXTAFTER(double3, ulong3, long3, long) +__CLC_NEXTAFTER(double4, ulong4, long4, long) +__CLC_NEXTAFTER(double8, ulong8, long8, long) +__CLC_NEXTAFTER(double16, ulong16, long16, long) #endif #ifdef cl_khr_fp16 #pragma OPENCL EXTENSION cl_khr_fp16 : enable -NEXTAFTER(half, ushort, short, short) -NEXTAFTER(half2, ushort2, short2, short) -NEXTAFTER(half3, ushort3, short3, short) -NEXTAFTER(half4, ushort4, short4, short) -NEXTAFTER(half8, ushort8, short8, short) -NEXTAFTER(half16, ushort16, short16, short) +__CLC_NEXTAFTER(half, ushort, short, short) +__CLC_NEXTAFTER(half2, ushort2, short2, short) +__CLC_NEXTAFTER(half3, ushort3, short3, short) +__CLC_NEXTAFTER(half4, ushort4, short4, short) +__CLC_NEXTAFTER(half8, ushort8, short8, short) +__CLC_NEXTAFTER(half16, ushort16, short16, 
short) #endif diff --git a/libclc/clc/lib/generic/math/clc_pow.inc b/libclc/clc/lib/generic/math/clc_pow.inc index 8b1f820268ba..35cbcdae8fff 100644 --- a/libclc/clc/lib/generic/math/clc_pow.inc +++ b/libclc/clc/lib/generic/math/clc_pow.inc @@ -103,8 +103,8 @@ _CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_pow(__CLC_GENTYPE x, __CLC_AS_GENTYPE(0x3f000000 | (ixn & MANTBITS_SP32)); indx = indx >> 16; - __CLC_GENTYPE rh = f * USE_TABLE(log_inv_tbl_ep_head, indx); - __CLC_GENTYPE rt = f * USE_TABLE(log_inv_tbl_ep_tail, indx); + __CLC_GENTYPE rh = f * __CLC_USE_TABLE(log_inv_tbl_ep_head, indx); + __CLC_GENTYPE rt = f * __CLC_USE_TABLE(log_inv_tbl_ep_tail, indx); r = rh + rt; poly = __clc_mad(r, __clc_mad(r, 0x1.0p-2f, 0x1.555556p-2f), 0x1.0p-1f) * @@ -113,8 +113,8 @@ _CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_pow(__CLC_GENTYPE x, const __CLC_GENTYPE LOG2_HEAD = 0x1.62e000p-1f; /* 0.693115234 */ const __CLC_GENTYPE LOG2_TAIL = 0x1.0bfbe8p-15f; /* 0.0000319461833 */ - __CLC_GENTYPE logel = USE_TABLE(loge_tbl_lo, indx); - __CLC_GENTYPE logeh = USE_TABLE(loge_tbl_hi, indx); + __CLC_GENTYPE logel = __CLC_USE_TABLE(loge_tbl_lo, indx); + __CLC_GENTYPE logeh = __CLC_USE_TABLE(loge_tbl_hi, indx); __CLC_GENTYPE lth = -r; __CLC_GENTYPE ltt = __clc_mad(mfn, LOG2_TAIL, -poly) + logeh; __CLC_GENTYPE lt = lth + ltt; @@ -161,8 +161,8 @@ _CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_pow(__CLC_GENTYPE x, 0x1.000000p-1f), r * r, r); - __CLC_GENTYPE exp_head = USE_TABLE(exp_tbl_ep_head, j); - __CLC_GENTYPE exp_tail = USE_TABLE(exp_tbl_ep_tail, j); + __CLC_GENTYPE exp_head = __CLC_USE_TABLE(exp_tbl_ep_head, j); + __CLC_GENTYPE exp_tail = __CLC_USE_TABLE(exp_tbl_ep_tail, j); __CLC_GENTYPE expylogx = __clc_mad(exp_head, poly, __clc_mad(exp_tail, poly, exp_tail)) + exp_head; @@ -275,8 +275,8 @@ _CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_pow(__CLC_GENTYPE x, __CLC_GENTYPE F = __CLC_AS_GENTYPE(rax | 0x3FE0000000000000L); __CLC_GENTYPE Y = __CLC_AS_GENTYPE(mantissa | 0x3FE0000000000000L); __CLC_GENTYPE 
f = F - Y; - __CLC_GENTYPE log_h = USE_TABLE(log_f_inv_tbl_head, index); - __CLC_GENTYPE log_t = USE_TABLE(log_f_inv_tbl_tail, index); + __CLC_GENTYPE log_h = __CLC_USE_TABLE(log_f_inv_tbl_head, index); + __CLC_GENTYPE log_t = __CLC_USE_TABLE(log_f_inv_tbl_tail, index); __CLC_GENTYPE f_inv = (log_h + log_t) * f; __CLC_GENTYPE r1 = __CLC_AS_GENTYPE(__CLC_AS_ULONGN(f_inv) & 0xfffffffff8000000L); @@ -296,8 +296,8 @@ _CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_pow(__CLC_GENTYPE x, __CLC_GENTYPE poly0t = r1 - poly0h + hr1r1; poly = __clc_fma(r1, r2, __clc_fma(0.5 * r2, r2, poly)) + r2 + poly0t; - log_h = USE_TABLE(powlog_tbl_head, index); - log_t = USE_TABLE(powlog_tbl_tail, index); + log_h = __CLC_USE_TABLE(powlog_tbl_head, index); + log_t = __CLC_USE_TABLE(powlog_tbl_tail, index); __CLC_GENTYPE resT_t = __clc_fma(xexp, real_log2_tail, +log_t) - poly; __CLC_GENTYPE resT = resT_t - poly0h; @@ -345,8 +345,8 @@ _CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_pow(__CLC_GENTYPE x, __CLC_INTN j = n & 0x0000003f; __CLC_INTN m = n >> 6; - __CLC_GENTYPE f1 = USE_TABLE(two_to_jby64_ep_tbl_head, j); - __CLC_GENTYPE f2 = USE_TABLE(two_to_jby64_ep_tbl_tail, j); + __CLC_GENTYPE f1 = __CLC_USE_TABLE(two_to_jby64_ep_tbl_head, j); + __CLC_GENTYPE f2 = __CLC_USE_TABLE(two_to_jby64_ep_tbl_tail, j); __CLC_GENTYPE f = f1 + f2; __CLC_GENTYPE r1 = __clc_fma(dn, -lnof2_by_64_head, v); diff --git a/libclc/clc/lib/generic/math/clc_pown.inc b/libclc/clc/lib/generic/math/clc_pown.inc index 483fd2faf271..1a681b5e4b39 100644 --- a/libclc/clc/lib/generic/math/clc_pown.inc +++ b/libclc/clc/lib/generic/math/clc_pown.inc @@ -104,8 +104,8 @@ _CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_pown(__CLC_GENTYPE x, __CLC_AS_GENTYPE(0x3f000000 | (ixn & MANTBITS_SP32)); indx = indx >> 16; - __CLC_GENTYPE rh = f * USE_TABLE(log_inv_tbl_ep_head, indx); - __CLC_GENTYPE rt = f * USE_TABLE(log_inv_tbl_ep_tail, indx); + __CLC_GENTYPE rh = f * __CLC_USE_TABLE(log_inv_tbl_ep_head, indx); + __CLC_GENTYPE rt = f * 
__CLC_USE_TABLE(log_inv_tbl_ep_tail, indx); r = rh + rt; poly = __clc_mad(r, __clc_mad(r, 0x1.0p-2f, 0x1.555556p-2f), 0x1.0p-1f) * @@ -114,8 +114,8 @@ _CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_pown(__CLC_GENTYPE x, const __CLC_GENTYPE LOG2_HEAD = 0x1.62e000p-1f; // 0.693115234 const __CLC_GENTYPE LOG2_TAIL = 0x1.0bfbe8p-15f; // 0.0000319461833 - __CLC_GENTYPE logel = USE_TABLE(loge_tbl_lo, indx); - __CLC_GENTYPE logeh = USE_TABLE(loge_tbl_hi, indx); + __CLC_GENTYPE logel = __CLC_USE_TABLE(loge_tbl_lo, indx); + __CLC_GENTYPE logeh = __CLC_USE_TABLE(loge_tbl_hi, indx); __CLC_GENTYPE lth = -r; __CLC_GENTYPE ltt = __clc_mad(mfn, LOG2_TAIL, -poly) + logeh; __CLC_GENTYPE lt = lth + ltt; @@ -162,8 +162,8 @@ _CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_pown(__CLC_GENTYPE x, 0x1.000000p-1f), r * r, r); - __CLC_GENTYPE exp_head = USE_TABLE(exp_tbl_ep_head, j); - __CLC_GENTYPE exp_tail = USE_TABLE(exp_tbl_ep_tail, j); + __CLC_GENTYPE exp_head = __CLC_USE_TABLE(exp_tbl_ep_head, j); + __CLC_GENTYPE exp_tail = __CLC_USE_TABLE(exp_tbl_ep_tail, j); __CLC_GENTYPE expylogx = __clc_mad(exp_head, poly, __clc_mad(exp_tail, poly, exp_tail)) + exp_head; @@ -256,8 +256,8 @@ _CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_pown(__CLC_GENTYPE x, __CLC_GENTYPE F = __CLC_AS_GENTYPE(rax | 0x3FE0000000000000L); __CLC_GENTYPE Y = __CLC_AS_GENTYPE(mantissa | 0x3FE0000000000000L); __CLC_GENTYPE f = F - Y; - __CLC_GENTYPE log_h = USE_TABLE(log_f_inv_tbl_head, index); - __CLC_GENTYPE log_t = USE_TABLE(log_f_inv_tbl_tail, index); + __CLC_GENTYPE log_h = __CLC_USE_TABLE(log_f_inv_tbl_head, index); + __CLC_GENTYPE log_t = __CLC_USE_TABLE(log_f_inv_tbl_tail, index); __CLC_GENTYPE f_inv = (log_h + log_t) * f; __CLC_GENTYPE r1 = __CLC_AS_GENTYPE(__CLC_AS_ULONGN(f_inv) & 0xfffffffff8000000L); @@ -277,8 +277,8 @@ _CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_pown(__CLC_GENTYPE x, __CLC_GENTYPE poly0t = r1 - poly0h + hr1r1; poly = __clc_fma(r1, r2, __clc_fma(0.5 * r2, r2, poly)) + r2 + poly0t; - log_h = 
USE_TABLE(powlog_tbl_head, index); - log_t = USE_TABLE(powlog_tbl_tail, index); + log_h = __CLC_USE_TABLE(powlog_tbl_head, index); + log_t = __CLC_USE_TABLE(powlog_tbl_tail, index); __CLC_GENTYPE resT_t = __clc_fma(xexp, real_log2_tail, +log_t) - poly; __CLC_GENTYPE resT = resT_t - poly0h; @@ -332,8 +332,8 @@ _CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_pown(__CLC_GENTYPE x, __CLC_INTN j = n & 0x0000003f; __CLC_INTN m = n >> 6; - __CLC_GENTYPE f1 = USE_TABLE(two_to_jby64_ep_tbl_head, j); - __CLC_GENTYPE f2 = USE_TABLE(two_to_jby64_ep_tbl_tail, j); + __CLC_GENTYPE f1 = __CLC_USE_TABLE(two_to_jby64_ep_tbl_head, j); + __CLC_GENTYPE f2 = __CLC_USE_TABLE(two_to_jby64_ep_tbl_tail, j); __CLC_GENTYPE f = f1 + f2; __CLC_GENTYPE r1 = __clc_fma(dn, -lnof2_by_64_head, v); diff --git a/libclc/clc/lib/generic/math/clc_powr.inc b/libclc/clc/lib/generic/math/clc_powr.inc index 1244f7f6ac5d..b94dbfdcbdeb 100644 --- a/libclc/clc/lib/generic/math/clc_powr.inc +++ b/libclc/clc/lib/generic/math/clc_powr.inc @@ -102,8 +102,8 @@ _CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_powr(__CLC_GENTYPE x, __CLC_AS_GENTYPE(0x3f000000 | (ixn & MANTBITS_SP32)); indx = indx >> 16; - __CLC_GENTYPE rh = f * USE_TABLE(log_inv_tbl_ep_head, indx); - __CLC_GENTYPE rt = f * USE_TABLE(log_inv_tbl_ep_tail, indx); + __CLC_GENTYPE rh = f * __CLC_USE_TABLE(log_inv_tbl_ep_head, indx); + __CLC_GENTYPE rt = f * __CLC_USE_TABLE(log_inv_tbl_ep_tail, indx); r = rh + rt; poly = __clc_mad(r, __clc_mad(r, 0x1.0p-2f, 0x1.555556p-2f), 0x1.0p-1f) * @@ -112,8 +112,8 @@ _CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_powr(__CLC_GENTYPE x, const __CLC_GENTYPE LOG2_HEAD = 0x1.62e000p-1f; // 0.693115234 const __CLC_GENTYPE LOG2_TAIL = 0x1.0bfbe8p-15f; // 0.0000319461833 - __CLC_GENTYPE logel = USE_TABLE(loge_tbl_lo, indx); - __CLC_GENTYPE logeh = USE_TABLE(loge_tbl_hi, indx); + __CLC_GENTYPE logel = __CLC_USE_TABLE(loge_tbl_lo, indx); + __CLC_GENTYPE logeh = __CLC_USE_TABLE(loge_tbl_hi, indx); __CLC_GENTYPE lth = -r; __CLC_GENTYPE ltt = 
__clc_mad(mfn, LOG2_TAIL, -poly) + logeh; __CLC_GENTYPE lt = lth + ltt; @@ -159,8 +159,8 @@ _CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_powr(__CLC_GENTYPE x, 0x1.000000p-1f), r * r, r); - __CLC_GENTYPE exp_head = USE_TABLE(exp_tbl_ep_head, j); - __CLC_GENTYPE exp_tail = USE_TABLE(exp_tbl_ep_tail, j); + __CLC_GENTYPE exp_head = __CLC_USE_TABLE(exp_tbl_ep_head, j); + __CLC_GENTYPE exp_tail = __CLC_USE_TABLE(exp_tbl_ep_tail, j); __CLC_GENTYPE expylogx = __clc_mad(exp_head, poly, __clc_mad(exp_tail, poly, exp_tail)) + exp_head; @@ -261,8 +261,8 @@ _CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_powr(__CLC_GENTYPE x, __CLC_GENTYPE F = __CLC_AS_GENTYPE(rax | 0x3FE0000000000000L); __CLC_GENTYPE Y = __CLC_AS_GENTYPE(mantissa | 0x3FE0000000000000L); __CLC_GENTYPE f = F - Y; - __CLC_GENTYPE log_h = USE_TABLE(log_f_inv_tbl_head, index); - __CLC_GENTYPE log_t = USE_TABLE(log_f_inv_tbl_tail, index); + __CLC_GENTYPE log_h = __CLC_USE_TABLE(log_f_inv_tbl_head, index); + __CLC_GENTYPE log_t = __CLC_USE_TABLE(log_f_inv_tbl_tail, index); __CLC_GENTYPE f_inv = (log_h + log_t) * f; __CLC_GENTYPE r1 = __CLC_AS_GENTYPE(__CLC_AS_ULONGN(f_inv) & 0xfffffffff8000000L); @@ -282,8 +282,8 @@ _CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_powr(__CLC_GENTYPE x, __CLC_GENTYPE poly0t = r1 - poly0h + hr1r1; poly = __clc_fma(r1, r2, __clc_fma(0.5 * r2, r2, poly)) + r2 + poly0t; - log_h = USE_TABLE(powlog_tbl_head, index); - log_t = USE_TABLE(powlog_tbl_tail, index); + log_h = __CLC_USE_TABLE(powlog_tbl_head, index); + log_t = __CLC_USE_TABLE(powlog_tbl_tail, index); __CLC_GENTYPE resT_t = __clc_fma(xexp, real_log2_tail, +log_t) - poly; __CLC_GENTYPE resT = resT_t - poly0h; @@ -331,8 +331,8 @@ _CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_powr(__CLC_GENTYPE x, __CLC_INTN j = n & 0x0000003f; __CLC_INTN m = n >> 6; - __CLC_GENTYPE f1 = USE_TABLE(two_to_jby64_ep_tbl_head, j); - __CLC_GENTYPE f2 = USE_TABLE(two_to_jby64_ep_tbl_tail, j); + __CLC_GENTYPE f1 = __CLC_USE_TABLE(two_to_jby64_ep_tbl_head, j); + 
__CLC_GENTYPE f2 = __CLC_USE_TABLE(two_to_jby64_ep_tbl_tail, j); __CLC_GENTYPE f = f1 + f2; __CLC_GENTYPE r1 = __clc_fma(dn, -lnof2_by_64_head, v); diff --git a/libclc/clc/lib/generic/math/clc_remainder.cl b/libclc/clc/lib/generic/math/clc_remainder.cl index 2b3d185a8bee..488dde73f67a 100644 --- a/libclc/clc/lib/generic/math/clc_remainder.cl +++ b/libclc/clc/lib/generic/math/clc_remainder.cl @@ -74,11 +74,11 @@ _CLC_DEF _CLC_OVERLOAD float __clc_remainder(float x, float y) { return xr; } -#define __FLOAT_ONLY -#define FUNCTION __clc_remainder +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION __clc_remainder #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #ifdef cl_khr_fp64 @@ -212,11 +212,11 @@ _CLC_DEF _CLC_OVERLOAD double __clc_remainder(double x, double y) { return ret; } -#define __DOUBLE_ONLY -#define FUNCTION __clc_remainder +#define __CLC_DOUBLE_ONLY +#define __CLC_FUNCTION __clc_remainder #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif @@ -225,8 +225,8 @@ _CLC_DEF _CLC_OVERLOAD double __clc_remainder(double x, double y) { #pragma OPENCL EXTENSION cl_khr_fp16 : enable // Forward the half version of this builtin onto the float one -#define __HALF_ONLY -#define FUNCTION __clc_remainder +#define __CLC_HALF_ONLY +#define __CLC_FUNCTION __clc_remainder #define __CLC_BODY #include diff --git a/libclc/clc/lib/generic/math/clc_remquo.inc b/libclc/clc/lib/generic/math/clc_remquo.inc index 28f51c4e2fc2..3a76ffed7f03 100644 --- a/libclc/clc/lib/generic/math/clc_remquo.inc +++ b/libclc/clc/lib/generic/math/clc_remquo.inc @@ -74,7 +74,7 @@ _CLC_DEF _CLC_OVERLOAD float __clc_remquo(float x, float y, } // remquo signature is special, we don't have macro for this -#define __VEC_REMQUO(TYPE, VEC_SIZE, HALF_VEC_SIZE) \ +#define __CLC_VEC_REMQUO(TYPE, VEC_SIZE, HALF_VEC_SIZE) \ _CLC_DEF _CLC_OVERLOAD TYPE##VEC_SIZE __clc_remquo( \ TYPE##VEC_SIZE x, TYPE##VEC_SIZE y, \ __CLC_ADDRESS_SPACE int##VEC_SIZE *quo) { \ @@ -87,7 +87,7 @@ 
_CLC_DEF _CLC_OVERLOAD float __clc_remquo(float x, float y, return ret; \ } -#define __VEC3_REMQUO(TYPE) \ +#define __CLC_VEC3_REMQUO(TYPE) \ _CLC_DEF _CLC_OVERLOAD TYPE##3 __clc_remquo( \ TYPE##3 x, TYPE##3 y, __CLC_ADDRESS_SPACE int##3 * quo) { \ int2 lo; \ @@ -99,11 +99,11 @@ _CLC_DEF _CLC_OVERLOAD float __clc_remquo(float x, float y, (*quo).s2 = hi; \ return ret; \ } -__VEC_REMQUO(float, 2, ) -__VEC3_REMQUO(float) -__VEC_REMQUO(float, 4, 2) -__VEC_REMQUO(float, 8, 4) -__VEC_REMQUO(float, 16, 8) +__CLC_VEC_REMQUO(float, 2, ) +__CLC_VEC3_REMQUO(float) +__CLC_VEC_REMQUO(float, 4, 2) +__CLC_VEC_REMQUO(float, 8, 4) +__CLC_VEC_REMQUO(float, 16, 8) #ifdef cl_khr_fp64 @@ -246,11 +246,11 @@ _CLC_DEF _CLC_OVERLOAD double __clc_remquo(double x, double y, *pquo = quo; return ret; } -__VEC_REMQUO(double, 2, ) -__VEC3_REMQUO(double) -__VEC_REMQUO(double, 4, 2) -__VEC_REMQUO(double, 8, 4) -__VEC_REMQUO(double, 16, 8) +__CLC_VEC_REMQUO(double, 2, ) +__CLC_VEC3_REMQUO(double) +__CLC_VEC_REMQUO(double, 4, 2) +__CLC_VEC_REMQUO(double, 8, 4) +__CLC_VEC_REMQUO(double, 16, 8) #endif @@ -262,10 +262,10 @@ _CLC_OVERLOAD _CLC_DEF half __clc_remquo(half x, half y, __CLC_ADDRESS_SPACE int *pquo) { return (half)__clc_remquo((float)x, (float)y, pquo); } -__VEC_REMQUO(half, 2, ) -__VEC3_REMQUO(half) -__VEC_REMQUO(half, 4, 2) -__VEC_REMQUO(half, 8, 4) -__VEC_REMQUO(half, 16, 8) +__CLC_VEC_REMQUO(half, 2, ) +__CLC_VEC3_REMQUO(half) +__CLC_VEC_REMQUO(half, 4, 2) +__CLC_VEC_REMQUO(half, 8, 4) +__CLC_VEC_REMQUO(half, 16, 8) #endif diff --git a/libclc/clc/lib/generic/math/clc_rint.cl b/libclc/clc/lib/generic/math/clc_rint.cl index b3f94490e3b5..e3aaae0ffec6 100644 --- a/libclc/clc/lib/generic/math/clc_rint.cl +++ b/libclc/clc/lib/generic/math/clc_rint.cl @@ -8,8 +8,8 @@ #include -#define FUNCTION __clc_rint -#define __IMPL_FUNCTION(x) __builtin_elementwise_rint +#define __CLC_FUNCTION __clc_rint +#define __CLC_IMPL_FUNCTION(x) __builtin_elementwise_rint #define __CLC_BODY #include diff --git 
a/libclc/clc/lib/generic/math/clc_rootn.inc b/libclc/clc/lib/generic/math/clc_rootn.inc index 996f88f14535..fd3d0becb0df 100644 --- a/libclc/clc/lib/generic/math/clc_rootn.inc +++ b/libclc/clc/lib/generic/math/clc_rootn.inc @@ -103,8 +103,8 @@ _CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_rootn(__CLC_GENTYPE x, __CLC_AS_GENTYPE(0x3f000000 | (ixn & MANTBITS_SP32)); indx = indx >> 16; - __CLC_GENTYPE rh = f * USE_TABLE(log_inv_tbl_ep_head, indx); - __CLC_GENTYPE rt = f * USE_TABLE(log_inv_tbl_ep_tail, indx); + __CLC_GENTYPE rh = f * __CLC_USE_TABLE(log_inv_tbl_ep_head, indx); + __CLC_GENTYPE rt = f * __CLC_USE_TABLE(log_inv_tbl_ep_tail, indx); ; r = rh + rt; @@ -116,9 +116,10 @@ _CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_rootn(__CLC_GENTYPE x, const __CLC_GENTYPE LOG2_TAIL = 0x1.0bfbe8p-15f; // 0.0000319461833 __CLC_GENTYPE lth = -r; __CLC_GENTYPE ltt = - __clc_mad(mfn, LOG2_TAIL, -poly) + USE_TABLE(loge_tbl_hi, indx); + __clc_mad(mfn, LOG2_TAIL, -poly) + __CLC_USE_TABLE(loge_tbl_hi, indx); __CLC_GENTYPE lt = lth + ltt; - __CLC_GENTYPE lh = __clc_mad(mfn, LOG2_HEAD, USE_TABLE(loge_tbl_lo, indx)); + __CLC_GENTYPE lh = + __clc_mad(mfn, LOG2_HEAD, __CLC_USE_TABLE(loge_tbl_lo, indx)); __CLC_GENTYPE l = lh + lt; // Select near 1 or not @@ -165,8 +166,8 @@ _CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_rootn(__CLC_GENTYPE x, 0x1.000000p-1f), r * r, r); - __CLC_GENTYPE exph = USE_TABLE(exp_tbl_ep_head, j); - __CLC_GENTYPE expt = USE_TABLE(exp_tbl_ep_tail, j); + __CLC_GENTYPE exph = __CLC_USE_TABLE(exp_tbl_ep_head, j); + __CLC_GENTYPE expt = __CLC_USE_TABLE(exp_tbl_ep_tail, j); __CLC_GENTYPE expylogx = __clc_mad(exph, poly, __clc_mad(expt, poly, expt)) + exph; @@ -263,8 +264,8 @@ _CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_rootn(__CLC_GENTYPE x, __CLC_GENTYPE F = __CLC_AS_GENTYPE(rax | 0x3FE0000000000000L); __CLC_GENTYPE Y = __CLC_AS_GENTYPE(mantissa | 0x3FE0000000000000L); __CLC_GENTYPE f = F - Y; - __CLC_GENTYPE log_h = USE_TABLE(log_f_inv_tbl_head, index); - __CLC_GENTYPE log_t = 
USE_TABLE(log_f_inv_tbl_tail, index); + __CLC_GENTYPE log_h = __CLC_USE_TABLE(log_f_inv_tbl_head, index); + __CLC_GENTYPE log_t = __CLC_USE_TABLE(log_f_inv_tbl_tail, index); __CLC_GENTYPE f_inv = (log_h + log_t) * f; __CLC_GENTYPE r1 = __CLC_AS_GENTYPE(__CLC_AS_ULONGN(f_inv) & 0xfffffffff8000000L); @@ -284,8 +285,8 @@ _CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_rootn(__CLC_GENTYPE x, __CLC_GENTYPE poly0t = r1 - poly0h + hr1r1; poly = __clc_fma(r1, r2, __clc_fma(0.5 * r2, r2, poly)) + r2 + poly0t; - log_h = USE_TABLE(powlog_tbl_head, index); - log_t = USE_TABLE(powlog_tbl_tail, index); + log_h = __CLC_USE_TABLE(powlog_tbl_head, index); + log_t = __CLC_USE_TABLE(powlog_tbl_tail, index); __CLC_GENTYPE resT_t = __clc_fma(xexp, real_log2_tail, +log_t) - poly; __CLC_GENTYPE resT = resT_t - poly0h; @@ -338,8 +339,8 @@ _CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_rootn(__CLC_GENTYPE x, __CLC_INTN j = n & 0x0000003f; __CLC_INTN m = n >> 6; - __CLC_GENTYPE f1 = USE_TABLE(two_to_jby64_ep_tbl_head, j); - __CLC_GENTYPE f2 = USE_TABLE(two_to_jby64_ep_tbl_tail, j); + __CLC_GENTYPE f1 = __CLC_USE_TABLE(two_to_jby64_ep_tbl_head, j); + __CLC_GENTYPE f2 = __CLC_USE_TABLE(two_to_jby64_ep_tbl_tail, j); __CLC_GENTYPE f = f1 + f2; __CLC_GENTYPE r1 = __clc_fma(dn, -lnof2_by_64_head, v); diff --git a/libclc/clc/lib/generic/math/clc_round.cl b/libclc/clc/lib/generic/math/clc_round.cl index 8c298d8c3483..5ed8d08ba4f4 100644 --- a/libclc/clc/lib/generic/math/clc_round.cl +++ b/libclc/clc/lib/generic/math/clc_round.cl @@ -8,8 +8,8 @@ #include -#define FUNCTION __clc_round -#define __IMPL_FUNCTION(x) __builtin_elementwise_round +#define __CLC_FUNCTION __clc_round +#define __CLC_IMPL_FUNCTION(x) __builtin_elementwise_round #define __CLC_BODY #include diff --git a/libclc/clc/lib/generic/math/clc_sincos_helpers.cl b/libclc/clc/lib/generic/math/clc_sincos_helpers.cl index 0ea1195fffa7..19705c42f6f0 100644 --- a/libclc/clc/lib/generic/math/clc_sincos_helpers.cl +++ 
b/libclc/clc/lib/generic/math/clc_sincos_helpers.cl @@ -18,16 +18,16 @@ #define bitalign(hi, lo, shift) ((hi) << (32 - (shift))) | ((lo) >> (shift)); -#define FULL_MUL(A, B, HI, LO) \ +#define __CLC_FULL_MUL(A, B, HI, LO) \ LO = A * B; \ HI = __clc_mul_hi(A, B) -#define FULL_MAD(A, B, C, HI, LO) \ +#define __CLC_FULL_MAD(A, B, C, HI, LO) \ LO = ((A) * (B) + (C)); \ HI = __clc_mul_hi(A, B); \ HI += LO < C ? 1U : 0U; -#define __FLOAT_ONLY +#define __CLC_FLOAT_ONLY #define __CLC_BODY #include @@ -45,7 +45,7 @@ ((__CLC_CONVERT_LONGN((src0)) << 32) | __CLC_CONVERT_LONGN((src1))) >> \ (((src2) & 3) * 8))) -#define __DOUBLE_ONLY +#define __CLC_DOUBLE_ONLY #define __CLC_BODY #include diff --git a/libclc/clc/lib/generic/math/clc_sincos_helpers.inc b/libclc/clc/lib/generic/math/clc_sincos_helpers.inc index 29c742136b7e..bddc0998cf95 100644 --- a/libclc/clc/lib/generic/math/clc_sincos_helpers.inc +++ b/libclc/clc/lib/generic/math/clc_sincos_helpers.inc @@ -175,13 +175,13 @@ _CLC_DEF _CLC_OVERLOAD __CLC_INTN __clc_argReductionLargeS( __CLC_UINTN p0, p1, p2, p3, p4, p5, p6, p7, c0, c1; - FULL_MUL(xm, b0, c0, p0); - FULL_MAD(xm, b1, c0, c1, p1); - FULL_MAD(xm, b2, c1, c0, p2); - FULL_MAD(xm, b3, c0, c1, p3); - FULL_MAD(xm, b4, c1, c0, p4); - FULL_MAD(xm, b5, c0, c1, p5); - FULL_MAD(xm, b6, c1, p7, p6); + __CLC_FULL_MUL(xm, b0, c0, p0); + __CLC_FULL_MAD(xm, b1, c0, c1, p1); + __CLC_FULL_MAD(xm, b2, c1, c0, p2); + __CLC_FULL_MAD(xm, b3, c0, c1, p3); + __CLC_FULL_MAD(xm, b4, c1, c0, p4); + __CLC_FULL_MAD(xm, b5, c0, c1, p5); + __CLC_FULL_MAD(xm, b6, c1, p7, p6); __CLC_UINTN fbits = (__CLC_UINTN)224 + (__CLC_UINTN)23 - __CLC_AS_UINTN(xe); diff --git a/libclc/clc/lib/generic/math/clc_sincos_helpers_fp64.inc b/libclc/clc/lib/generic/math/clc_sincos_helpers_fp64.inc index 9b5776d9b05e..8fae90c9cc5a 100644 --- a/libclc/clc/lib/generic/math/clc_sincos_helpers_fp64.inc +++ b/libclc/clc/lib/generic/math/clc_sincos_helpers_fp64.inc @@ -67,11 +67,11 @@ 
__clc_remainder_piby2_large(__CLC_DOUBLEN x, private __CLC_DOUBLEN *r, // The following extracts 192 consecutive bits of 2/pi aligned on an arbitrary // byte boundary - __CLC_ULONGN q0 = USE_TABLE(pibits_tbl, j16); - __CLC_ULONGN q1 = USE_TABLE(pibits_tbl, (j16 + 8)); - __CLC_ULONGN q2 = USE_TABLE(pibits_tbl, (j16 + 16)); - __CLC_ULONGN q3 = USE_TABLE(pibits_tbl, (j16 + 24)); - __CLC_ULONGN q4 = USE_TABLE(pibits_tbl, (j16 + 32)); + __CLC_ULONGN q0 = __CLC_USE_TABLE(pibits_tbl, j16); + __CLC_ULONGN q1 = __CLC_USE_TABLE(pibits_tbl, (j16 + 8)); + __CLC_ULONGN q2 = __CLC_USE_TABLE(pibits_tbl, (j16 + 16)); + __CLC_ULONGN q3 = __CLC_USE_TABLE(pibits_tbl, (j16 + 24)); + __CLC_ULONGN q4 = __CLC_USE_TABLE(pibits_tbl, (j16 + 32)); __CLC_UINTN q0s0 = __CLC_CONVERT_UINTN(q0); __CLC_UINTN q0s1 = __CLC_CONVERT_UINTN(q0 >> 32); diff --git a/libclc/clc/lib/generic/math/clc_sinh.inc b/libclc/clc/lib/generic/math/clc_sinh.inc index 799cc3210508..d39059166ffb 100644 --- a/libclc/clc/lib/generic/math/clc_sinh.inc +++ b/libclc/clc/lib/generic/math/clc_sinh.inc @@ -69,8 +69,8 @@ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_sinh(__CLC_GENTYPE x) { 0.500000000000000005911074e0f); cdy = __clc_mad(cdy, dy2, 1.0f); - __CLC_GENTYPE sinhcoshh = USE_TABLE(sinhcosh_tbl_head, ind); - __CLC_GENTYPE sinhcosht = USE_TABLE(sinhcosh_tbl_tail, ind); + __CLC_GENTYPE sinhcoshh = __CLC_USE_TABLE(sinhcosh_tbl_head, ind); + __CLC_GENTYPE sinhcosht = __CLC_USE_TABLE(sinhcosh_tbl_tail, ind); __CLC_GENTYPE z = __clc_mad(sinhcosht, sdy, sinhcoshh * cdy); z = __CLC_AS_GENTYPE(xs | __CLC_AS_UINTN(z)); @@ -167,10 +167,10 @@ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_sinh(__CLC_GENTYPE x) { __CLC_AS_GENTYPE(__CLC_AS_ULONGN(dy) & 0xfffffffff8000000UL); __CLC_GENTYPE sdy2 = sdy + (dy - sdy1); - __CLC_GENTYPE cl = USE_TABLE(cosh_tbl_head, ind); - __CLC_GENTYPE ct = USE_TABLE(cosh_tbl_tail, ind); - __CLC_GENTYPE sl = USE_TABLE(sinh_tbl_head, ind); - __CLC_GENTYPE st = USE_TABLE(sinh_tbl_tail, ind); + __CLC_GENTYPE cl = 
__CLC_USE_TABLE(cosh_tbl_head, ind); + __CLC_GENTYPE ct = __CLC_USE_TABLE(cosh_tbl_tail, ind); + __CLC_GENTYPE sl = __CLC_USE_TABLE(sinh_tbl_head, ind); + __CLC_GENTYPE st = __CLC_USE_TABLE(sinh_tbl_tail, ind); __CLC_GENTYPE z = __clc_fma(cl, sdy1, diff --git a/libclc/clc/lib/generic/math/clc_sw_fma.cl b/libclc/clc/lib/generic/math/clc_sw_fma.cl index ee4734078d69..550ca5e18f3f 100644 --- a/libclc/clc/lib/generic/math/clc_sw_fma.cl +++ b/libclc/clc/lib/generic/math/clc_sw_fma.cl @@ -160,7 +160,7 @@ _CLC_DEF _CLC_OVERLOAD float __clc_sw_fma(float a, float b, float c) { ((uint)st_fma.mantissa & 0x7fffff)); } -#define __FLOAT_ONLY -#define FUNCTION __clc_sw_fma +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION __clc_sw_fma #define __CLC_BODY #include diff --git a/libclc/clc/lib/generic/math/clc_tables.cl b/libclc/clc/lib/generic/math/clc_tables.cl index 7db00532c8be..67a17525014d 100644 --- a/libclc/clc/lib/generic/math/clc_tables.cl +++ b/libclc/clc/lib/generic/math/clc_tables.cl @@ -9,7 +9,7 @@ #include #include -DECLARE_TABLE(float, LOG_INV_TBL_EP_HEAD, 129) = { +__CLC_DECLARE_TABLE(float, LOG_INV_TBL_EP_HEAD, 129) = { 0x1.000000p+1f, 0x1.fc0000p+0f, 0x1.f80000p+0f, 0x1.f40000p+0f, 0x1.f00000p+0f, 0x1.ec0000p+0f, 0x1.e80000p+0f, 0x1.e40000p+0f, 0x1.e00000p+0f, 0x1.de0000p+0f, 0x1.da0000p+0f, 0x1.d60000p+0f, @@ -45,9 +45,9 @@ DECLARE_TABLE(float, LOG_INV_TBL_EP_HEAD, 129) = { 0x1.000000p+0f, }; -CLC_TABLE_FUNCTION(float, LOG_INV_TBL_EP_HEAD, log_inv_tbl_ep_head); +__CLC_TABLE_FUNCTION_VEC(float, LOG_INV_TBL_EP_HEAD, log_inv_tbl_ep_head); -DECLARE_TABLE(float, LOG_INV_TBL_EP_TAIL, 129) = { +__CLC_DECLARE_TABLE(float, LOG_INV_TBL_EP_TAIL, 129) = { 0x0.000000p+0f, 0x1.fc07f0p-14f, 0x1.f81f82p-12f, 0x1.196792p-10f, 0x1.f07c20p-10f, 0x1.80f660p-9f, 0x1.131ac0p-8f, 0x1.73ac90p-8f, 0x1.e1e1e2p-8f, 0x1.75b8fep-10f, 0x1.cc0ed8p-9f, 0x1.7b654cp-8f, @@ -82,9 +82,9 @@ DECLARE_TABLE(float, LOG_INV_TBL_EP_TAIL, 129) = { 0x1.041042p-12f, 0x1.091b52p-8f, 0x1.020408p-14f, 
0x1.010102p-8f, 0x0.000000p+0f, }; -CLC_TABLE_FUNCTION(float, LOG_INV_TBL_EP_TAIL, log_inv_tbl_ep_tail); +__CLC_TABLE_FUNCTION_VEC(float, LOG_INV_TBL_EP_TAIL, log_inv_tbl_ep_tail); -DECLARE_TABLE(float, LOGE_TBL_LO, 129) = { +__CLC_DECLARE_TABLE(float, LOGE_TBL_LO, 129) = { 0x0.000000p+0f, 0x1.fe0000p-8f, 0x1.fc0000p-7f, 0x1.7b8000p-6f, 0x1.f82000p-6f, 0x1.39e000p-5f, 0x1.774000p-5f, 0x1.b42000p-5f, 0x1.f0a000p-5f, 0x1.164000p-4f, 0x1.340000p-4f, 0x1.51a000p-4f, @@ -120,7 +120,7 @@ DECLARE_TABLE(float, LOGE_TBL_LO, 129) = { 0x1.62e000p-1f, }; -DECLARE_TABLE(float, LOGE_TBL_HI, 129) = { +__CLC_DECLARE_TABLE(float, LOGE_TBL_HI, 129) = { 0x0.000000p+0f, 0x1.535882p-23f, 0x1.5161f8p-20f, 0x1.1b07d4p-18f, 0x1.361cf0p-19f, 0x1.0f73fcp-18f, 0x1.63d8cap-19f, 0x1.bae232p-18f, 0x1.86008ap-20f, 0x1.36eea2p-16f, 0x1.d7961ap-16f, 0x1.073f06p-16f, @@ -156,10 +156,10 @@ DECLARE_TABLE(float, LOGE_TBL_HI, 129) = { 0x1.0bfbe8p-15f, }; -CLC_TABLE_FUNCTION(float, LOGE_TBL_LO, loge_tbl_lo); -CLC_TABLE_FUNCTION(float, LOGE_TBL_HI, loge_tbl_hi); +__CLC_TABLE_FUNCTION_VEC(float, LOGE_TBL_LO, loge_tbl_lo); +__CLC_TABLE_FUNCTION_VEC(float, LOGE_TBL_HI, loge_tbl_hi); -DECLARE_TABLE(float, LOG_INV_TBL, 129) = { +__CLC_DECLARE_TABLE(float, LOG_INV_TBL, 129) = { 0x1.000000p+1f, 0x1.fc07f0p+0f, 0x1.f81f82p+0f, 0x1.f4465ap+0f, 0x1.f07c20p+0f, 0x1.ecc07cp+0f, 0x1.e9131ap+0f, 0x1.e573acp+0f, 0x1.e1e1e2p+0f, 0x1.de5d6ep+0f, 0x1.dae608p+0f, 0x1.d77b66p+0f, @@ -195,143 +195,142 @@ DECLARE_TABLE(float, LOG_INV_TBL, 129) = { 0x1.000000p+0f, }; -CLC_TABLE_FUNCTION(float, LOG_INV_TBL, log_inv_tbl); +__CLC_TABLE_FUNCTION_VEC(float, LOG_INV_TBL, log_inv_tbl); -DECLARE_TABLE(float2, LOG2_TBL, 129) = { - (float2)(0x0.000000p+0f, 0x0.000000p+0f), - (float2)(0x1.6f8000p-7f, 0x1.942dbap-17f), - (float2)(0x1.6e0000p-6f, 0x1.e5a170p-16f), - (float2)(0x1.118000p-5f, 0x1.347544p-15f), - (float2)(0x1.6b8000p-5f, 0x1.69bac6p-16f), - (float2)(0x1.c48000p-5f, 0x1.7eae42p-15f), - (float2)(0x1.0e8000p-4f, 
0x1.9c4fd0p-15f), - (float2)(0x1.3a8000p-4f, 0x1.17ee92p-15f), - (float2)(0x1.660000p-4f, 0x1.fb7d64p-15f), - (float2)(0x1.918000p-4f, 0x1.42dc8cp-17f), - (float2)(0x1.bc8000p-4f, 0x1.0902b6p-18f), - (float2)(0x1.e70000p-4f, 0x1.7608bep-15f), - (float2)(0x1.088000p-3f, 0x1.162336p-13f), - (float2)(0x1.1d8000p-3f, 0x1.3465d4p-13f), - (float2)(0x1.328000p-3f, 0x1.74f13cp-14f), - (float2)(0x1.470000p-3f, 0x1.aa7e60p-13f), - (float2)(0x1.5c0000p-3f, 0x1.a39fbcp-19f), - (float2)(0x1.700000p-3f, 0x1.d0b53ap-13f), - (float2)(0x1.848000p-3f, 0x1.0af40ap-13f), - (float2)(0x1.988000p-3f, 0x1.b741dep-13f), - (float2)(0x1.ac8000p-3f, 0x1.d78b6cp-13f), - (float2)(0x1.c08000p-3f, 0x1.6db376p-13f), - (float2)(0x1.d48000p-3f, 0x1.ee4c32p-15f), - (float2)(0x1.e80000p-3f, 0x1.02f9d2p-13f), - (float2)(0x1.fb8000p-3f, 0x1.05ae40p-13f), - (float2)(0x1.078000p-2f, 0x1.0adbb0p-14f), - (float2)(0x1.110000p-2f, 0x1.83ed68p-13f), - (float2)(0x1.1a8000p-2f, 0x1.016ca4p-12f), - (float2)(0x1.240000p-2f, 0x1.01eac2p-12f), - (float2)(0x1.2d8000p-2f, 0x1.887e26p-13f), - (float2)(0x1.370000p-2f, 0x1.24cea4p-14f), - (float2)(0x1.400000p-2f, 0x1.918ec6p-12f), - (float2)(0x1.498000p-2f, 0x1.3c25e6p-13f), - (float2)(0x1.528000p-2f, 0x1.6f7f12p-12f), - (float2)(0x1.5c0000p-2f, 0x1.a39fbcp-18f), - (float2)(0x1.650000p-2f, 0x1.8fe466p-14f), - (float2)(0x1.6e0000p-2f, 0x1.10e6cep-13f), - (float2)(0x1.770000p-2f, 0x1.d2ba7ep-14f), - (float2)(0x1.800000p-2f, 0x1.4ac62cp-15f), - (float2)(0x1.888000p-2f, 0x1.a71cb8p-12f), - (float2)(0x1.918000p-2f, 0x1.dd448ep-13f), - (float2)(0x1.9a8000p-2f, 0x1.1c8f10p-21f), - (float2)(0x1.a30000p-2f, 0x1.bb053ep-13f), - (float2)(0x1.ab8000p-2f, 0x1.861e5ep-12f), - (float2)(0x1.b40000p-2f, 0x1.fafdcep-12f), - (float2)(0x1.bd0000p-2f, 0x1.e5d3cep-15f), - (float2)(0x1.c58000p-2f, 0x1.2fad28p-14f), - (float2)(0x1.ce0000p-2f, 0x1.492474p-15f), - (float2)(0x1.d60000p-2f, 0x1.d4f80cp-12f), - (float2)(0x1.de8000p-2f, 0x1.4ff510p-12f), - (float2)(0x1.e70000p-2f, 0x1.3550f2p-13f), - 
(float2)(0x1.ef0000p-2f, 0x1.b59ccap-12f), - (float2)(0x1.f78000p-2f, 0x1.42b464p-13f), - (float2)(0x1.ff8000p-2f, 0x1.5e66a0p-12f), - (float2)(0x1.038000p-1f, 0x1.f6a2e4p-11f), - (float2)(0x1.080000p-1f, 0x1.39e4fep-14f), - (float2)(0x1.0c0000p-1f, 0x1.0500d6p-13f), - (float2)(0x1.100000p-1f, 0x1.13b152p-13f), - (float2)(0x1.140000p-1f, 0x1.93f542p-14f), - (float2)(0x1.180000p-1f, 0x1.467b94p-16f), - (float2)(0x1.1b8000p-1f, 0x1.cc47a4p-11f), - (float2)(0x1.1f8000p-1f, 0x1.78f4c2p-11f), - (float2)(0x1.238000p-1f, 0x1.107508p-11f), - (float2)(0x1.278000p-1f, 0x1.2602c2p-12f), - (float2)(0x1.2b8000p-1f, 0x1.a39fbcp-20f), - (float2)(0x1.2f0000p-1f, 0x1.5a1d7ap-11f), - (float2)(0x1.330000p-1f, 0x1.3e355ap-12f), - (float2)(0x1.368000p-1f, 0x1.cffedap-11f), - (float2)(0x1.3a8000p-1f, 0x1.d9fd50p-12f), - (float2)(0x1.3e0000p-1f, 0x1.f64de6p-11f), - (float2)(0x1.420000p-1f, 0x1.d83f4cp-12f), - (float2)(0x1.458000p-1f, 0x1.cea628p-11f), - (float2)(0x1.498000p-1f, 0x1.3c25e6p-12f), - (float2)(0x1.4d0000p-1f, 0x1.5a96ccp-11f), - (float2)(0x1.510000p-1f, 0x1.18708ap-17f), - (float2)(0x1.548000p-1f, 0x1.374652p-12f), - (float2)(0x1.580000p-1f, 0x1.2089a6p-11f), - (float2)(0x1.5b8000p-1f, 0x1.93432cp-11f), - (float2)(0x1.5f0000p-1f, 0x1.f3fd06p-11f), - (float2)(0x1.630000p-1f, 0x1.0b8f54p-13f), - (float2)(0x1.668000p-1f, 0x1.004722p-12f), - (float2)(0x1.6a0000p-1f, 0x1.57cf2cp-12f), - (float2)(0x1.6d8000p-1f, 0x1.8cb53ap-12f), - (float2)(0x1.710000p-1f, 0x1.9f4d8ap-12f), - (float2)(0x1.748000p-1f, 0x1.8feb26p-12f), - (float2)(0x1.780000p-1f, 0x1.5edfeep-12f), - (float2)(0x1.7b8000p-1f, 0x1.0c7c9ap-12f), - (float2)(0x1.7f0000p-1f, 0x1.322182p-13f), - (float2)(0x1.828000p-1f, 0x1.3ab7cep-18f), - (float2)(0x1.858000p-1f, 0x1.a82c2cp-11f), - (float2)(0x1.890000p-1f, 0x1.3dd2c0p-11f), - (float2)(0x1.8c8000p-1f, 0x1.871da4p-12f), - (float2)(0x1.900000p-1f, 0x1.cc2c00p-14f), - (float2)(0x1.930000p-1f, 0x1.9fdb68p-11f), - (float2)(0x1.968000p-1f, 0x1.ed6956p-12f), - 
(float2)(0x1.9a0000p-1f, 0x1.f1a760p-14f), - (float2)(0x1.9d0000p-1f, 0x1.767f54p-11f), - (float2)(0x1.a08000p-1f, 0x1.3f6d26p-12f), - (float2)(0x1.a38000p-1f, 0x1.b9fce2p-11f), - (float2)(0x1.a70000p-1f, 0x1.8ae816p-12f), - (float2)(0x1.aa0000p-1f, 0x1.c23d60p-11f), - (float2)(0x1.ad8000p-1f, 0x1.60f388p-12f), - (float2)(0x1.b08000p-1f, 0x1.9049aep-11f), - (float2)(0x1.b40000p-1f, 0x1.8734a8p-13f), - (float2)(0x1.b70000p-1f, 0x1.2523d4p-11f), - (float2)(0x1.ba0000p-1f, 0x1.da6ce6p-11f), - (float2)(0x1.bd8000p-1f, 0x1.038e62p-12f), - (float2)(0x1.c08000p-1f, 0x1.1b511ep-11f), - (float2)(0x1.c38000p-1f, 0x1.a728b8p-11f), - (float2)(0x1.c70000p-1f, 0x1.2b5d22p-14f), - (float2)(0x1.ca0000p-1f, 0x1.2c6e54p-12f), - (float2)(0x1.cd0000p-1f, 0x1.f35064p-12f), - (float2)(0x1.d00000p-1f, 0x1.4fdb48p-11f), - (float2)(0x1.d30000p-1f, 0x1.98ec9ep-11f), - (float2)(0x1.d60000p-1f, 0x1.d4f80cp-11f), - (float2)(0x1.d98000p-1f, 0x1.0643d6p-17f), - (float2)(0x1.dc8000p-1f, 0x1.33567ep-14f), - (float2)(0x1.df8000p-1f, 0x1.e0410cp-14f), - (float2)(0x1.e28000p-1f, 0x1.142e0ep-13f), - (float2)(0x1.e58000p-1f, 0x1.063c88p-13f), - (float2)(0x1.e88000p-1f, 0x1.8d66c4p-14f), - (float2)(0x1.eb8000p-1f, 0x1.57e32ap-15f), - (float2)(0x1.ee0000p-1f, 0x1.ed1c6cp-11f), - (float2)(0x1.f10000p-1f, 0x1.b8a076p-11f), - (float2)(0x1.f40000p-1f, 0x1.7822f2p-11f), - (float2)(0x1.f70000p-1f, 0x1.2bbc3ap-11f), - (float2)(0x1.fa0000p-1f, 0x1.a708bap-12f), - (float2)(0x1.fd0000p-1f, 0x1.be4c7ep-13f), - (float2)(0x1.000000p+0f, 0x0.000000p+0f) -}; +__CLC_DECLARE_TABLE(float2, LOG2_TBL, + 129) = {(float2)(0x0.000000p+0f, 0x0.000000p+0f), + (float2)(0x1.6f8000p-7f, 0x1.942dbap-17f), + (float2)(0x1.6e0000p-6f, 0x1.e5a170p-16f), + (float2)(0x1.118000p-5f, 0x1.347544p-15f), + (float2)(0x1.6b8000p-5f, 0x1.69bac6p-16f), + (float2)(0x1.c48000p-5f, 0x1.7eae42p-15f), + (float2)(0x1.0e8000p-4f, 0x1.9c4fd0p-15f), + (float2)(0x1.3a8000p-4f, 0x1.17ee92p-15f), + (float2)(0x1.660000p-4f, 0x1.fb7d64p-15f), + 
(float2)(0x1.918000p-4f, 0x1.42dc8cp-17f), + (float2)(0x1.bc8000p-4f, 0x1.0902b6p-18f), + (float2)(0x1.e70000p-4f, 0x1.7608bep-15f), + (float2)(0x1.088000p-3f, 0x1.162336p-13f), + (float2)(0x1.1d8000p-3f, 0x1.3465d4p-13f), + (float2)(0x1.328000p-3f, 0x1.74f13cp-14f), + (float2)(0x1.470000p-3f, 0x1.aa7e60p-13f), + (float2)(0x1.5c0000p-3f, 0x1.a39fbcp-19f), + (float2)(0x1.700000p-3f, 0x1.d0b53ap-13f), + (float2)(0x1.848000p-3f, 0x1.0af40ap-13f), + (float2)(0x1.988000p-3f, 0x1.b741dep-13f), + (float2)(0x1.ac8000p-3f, 0x1.d78b6cp-13f), + (float2)(0x1.c08000p-3f, 0x1.6db376p-13f), + (float2)(0x1.d48000p-3f, 0x1.ee4c32p-15f), + (float2)(0x1.e80000p-3f, 0x1.02f9d2p-13f), + (float2)(0x1.fb8000p-3f, 0x1.05ae40p-13f), + (float2)(0x1.078000p-2f, 0x1.0adbb0p-14f), + (float2)(0x1.110000p-2f, 0x1.83ed68p-13f), + (float2)(0x1.1a8000p-2f, 0x1.016ca4p-12f), + (float2)(0x1.240000p-2f, 0x1.01eac2p-12f), + (float2)(0x1.2d8000p-2f, 0x1.887e26p-13f), + (float2)(0x1.370000p-2f, 0x1.24cea4p-14f), + (float2)(0x1.400000p-2f, 0x1.918ec6p-12f), + (float2)(0x1.498000p-2f, 0x1.3c25e6p-13f), + (float2)(0x1.528000p-2f, 0x1.6f7f12p-12f), + (float2)(0x1.5c0000p-2f, 0x1.a39fbcp-18f), + (float2)(0x1.650000p-2f, 0x1.8fe466p-14f), + (float2)(0x1.6e0000p-2f, 0x1.10e6cep-13f), + (float2)(0x1.770000p-2f, 0x1.d2ba7ep-14f), + (float2)(0x1.800000p-2f, 0x1.4ac62cp-15f), + (float2)(0x1.888000p-2f, 0x1.a71cb8p-12f), + (float2)(0x1.918000p-2f, 0x1.dd448ep-13f), + (float2)(0x1.9a8000p-2f, 0x1.1c8f10p-21f), + (float2)(0x1.a30000p-2f, 0x1.bb053ep-13f), + (float2)(0x1.ab8000p-2f, 0x1.861e5ep-12f), + (float2)(0x1.b40000p-2f, 0x1.fafdcep-12f), + (float2)(0x1.bd0000p-2f, 0x1.e5d3cep-15f), + (float2)(0x1.c58000p-2f, 0x1.2fad28p-14f), + (float2)(0x1.ce0000p-2f, 0x1.492474p-15f), + (float2)(0x1.d60000p-2f, 0x1.d4f80cp-12f), + (float2)(0x1.de8000p-2f, 0x1.4ff510p-12f), + (float2)(0x1.e70000p-2f, 0x1.3550f2p-13f), + (float2)(0x1.ef0000p-2f, 0x1.b59ccap-12f), + (float2)(0x1.f78000p-2f, 0x1.42b464p-13f), + 
(float2)(0x1.ff8000p-2f, 0x1.5e66a0p-12f), + (float2)(0x1.038000p-1f, 0x1.f6a2e4p-11f), + (float2)(0x1.080000p-1f, 0x1.39e4fep-14f), + (float2)(0x1.0c0000p-1f, 0x1.0500d6p-13f), + (float2)(0x1.100000p-1f, 0x1.13b152p-13f), + (float2)(0x1.140000p-1f, 0x1.93f542p-14f), + (float2)(0x1.180000p-1f, 0x1.467b94p-16f), + (float2)(0x1.1b8000p-1f, 0x1.cc47a4p-11f), + (float2)(0x1.1f8000p-1f, 0x1.78f4c2p-11f), + (float2)(0x1.238000p-1f, 0x1.107508p-11f), + (float2)(0x1.278000p-1f, 0x1.2602c2p-12f), + (float2)(0x1.2b8000p-1f, 0x1.a39fbcp-20f), + (float2)(0x1.2f0000p-1f, 0x1.5a1d7ap-11f), + (float2)(0x1.330000p-1f, 0x1.3e355ap-12f), + (float2)(0x1.368000p-1f, 0x1.cffedap-11f), + (float2)(0x1.3a8000p-1f, 0x1.d9fd50p-12f), + (float2)(0x1.3e0000p-1f, 0x1.f64de6p-11f), + (float2)(0x1.420000p-1f, 0x1.d83f4cp-12f), + (float2)(0x1.458000p-1f, 0x1.cea628p-11f), + (float2)(0x1.498000p-1f, 0x1.3c25e6p-12f), + (float2)(0x1.4d0000p-1f, 0x1.5a96ccp-11f), + (float2)(0x1.510000p-1f, 0x1.18708ap-17f), + (float2)(0x1.548000p-1f, 0x1.374652p-12f), + (float2)(0x1.580000p-1f, 0x1.2089a6p-11f), + (float2)(0x1.5b8000p-1f, 0x1.93432cp-11f), + (float2)(0x1.5f0000p-1f, 0x1.f3fd06p-11f), + (float2)(0x1.630000p-1f, 0x1.0b8f54p-13f), + (float2)(0x1.668000p-1f, 0x1.004722p-12f), + (float2)(0x1.6a0000p-1f, 0x1.57cf2cp-12f), + (float2)(0x1.6d8000p-1f, 0x1.8cb53ap-12f), + (float2)(0x1.710000p-1f, 0x1.9f4d8ap-12f), + (float2)(0x1.748000p-1f, 0x1.8feb26p-12f), + (float2)(0x1.780000p-1f, 0x1.5edfeep-12f), + (float2)(0x1.7b8000p-1f, 0x1.0c7c9ap-12f), + (float2)(0x1.7f0000p-1f, 0x1.322182p-13f), + (float2)(0x1.828000p-1f, 0x1.3ab7cep-18f), + (float2)(0x1.858000p-1f, 0x1.a82c2cp-11f), + (float2)(0x1.890000p-1f, 0x1.3dd2c0p-11f), + (float2)(0x1.8c8000p-1f, 0x1.871da4p-12f), + (float2)(0x1.900000p-1f, 0x1.cc2c00p-14f), + (float2)(0x1.930000p-1f, 0x1.9fdb68p-11f), + (float2)(0x1.968000p-1f, 0x1.ed6956p-12f), + (float2)(0x1.9a0000p-1f, 0x1.f1a760p-14f), + (float2)(0x1.9d0000p-1f, 0x1.767f54p-11f), + 
(float2)(0x1.a08000p-1f, 0x1.3f6d26p-12f), + (float2)(0x1.a38000p-1f, 0x1.b9fce2p-11f), + (float2)(0x1.a70000p-1f, 0x1.8ae816p-12f), + (float2)(0x1.aa0000p-1f, 0x1.c23d60p-11f), + (float2)(0x1.ad8000p-1f, 0x1.60f388p-12f), + (float2)(0x1.b08000p-1f, 0x1.9049aep-11f), + (float2)(0x1.b40000p-1f, 0x1.8734a8p-13f), + (float2)(0x1.b70000p-1f, 0x1.2523d4p-11f), + (float2)(0x1.ba0000p-1f, 0x1.da6ce6p-11f), + (float2)(0x1.bd8000p-1f, 0x1.038e62p-12f), + (float2)(0x1.c08000p-1f, 0x1.1b511ep-11f), + (float2)(0x1.c38000p-1f, 0x1.a728b8p-11f), + (float2)(0x1.c70000p-1f, 0x1.2b5d22p-14f), + (float2)(0x1.ca0000p-1f, 0x1.2c6e54p-12f), + (float2)(0x1.cd0000p-1f, 0x1.f35064p-12f), + (float2)(0x1.d00000p-1f, 0x1.4fdb48p-11f), + (float2)(0x1.d30000p-1f, 0x1.98ec9ep-11f), + (float2)(0x1.d60000p-1f, 0x1.d4f80cp-11f), + (float2)(0x1.d98000p-1f, 0x1.0643d6p-17f), + (float2)(0x1.dc8000p-1f, 0x1.33567ep-14f), + (float2)(0x1.df8000p-1f, 0x1.e0410cp-14f), + (float2)(0x1.e28000p-1f, 0x1.142e0ep-13f), + (float2)(0x1.e58000p-1f, 0x1.063c88p-13f), + (float2)(0x1.e88000p-1f, 0x1.8d66c4p-14f), + (float2)(0x1.eb8000p-1f, 0x1.57e32ap-15f), + (float2)(0x1.ee0000p-1f, 0x1.ed1c6cp-11f), + (float2)(0x1.f10000p-1f, 0x1.b8a076p-11f), + (float2)(0x1.f40000p-1f, 0x1.7822f2p-11f), + (float2)(0x1.f70000p-1f, 0x1.2bbc3ap-11f), + (float2)(0x1.fa0000p-1f, 0x1.a708bap-12f), + (float2)(0x1.fd0000p-1f, 0x1.be4c7ep-13f), + (float2)(0x1.000000p+0f, 0x0.000000p+0f)}; -TABLE_FUNCTION(float2, LOG2_TBL, log2_tbl); +__CLC_TABLE_FUNCTION(float2, LOG2_TBL, log2_tbl); -DECLARE_TABLE(float2, LOG10_TBL, 129) = { +__CLC_DECLARE_TABLE(float2, LOG10_TBL, 129) = { (float2)(0x0.000000p+0f, 0x0.000000p+0f), (float2)(0x1.ba8000p-9f, 0x1.f51c88p-19f), (float2)(0x1.b90000p-8f, 0x1.1da93ep-18f), @@ -463,10 +462,9 @@ DECLARE_TABLE(float2, LOG10_TBL, 129) = { (float2)(0x1.340000p-2f, 0x1.04d426p-12f), }; -TABLE_FUNCTION(float2, LOG10_TBL, log10_tbl); +__CLC_TABLE_FUNCTION(float2, LOG10_TBL, log10_tbl); - -DECLARE_TABLE(float, EXP_TBL, 65) 
= { +__CLC_DECLARE_TABLE(float, EXP_TBL, 65) = { 0x1.000000p+0f, 0x1.02c9a4p+0f, 0x1.059b0ep+0f, 0x1.087452p+0f, 0x1.0b5586p+0f, 0x1.0e3ec4p+0f, 0x1.11301ep+0f, 0x1.1429aap+0f, 0x1.172b84p+0f, 0x1.1a35bep+0f, 0x1.1d4874p+0f, 0x1.2063b8p+0f, @@ -486,9 +484,9 @@ DECLARE_TABLE(float, EXP_TBL, 65) = { 0x1.000000p+1f, }; -CLC_TABLE_FUNCTION(float, EXP_TBL, exp_tbl); +__CLC_TABLE_FUNCTION_VEC(float, EXP_TBL, exp_tbl); -DECLARE_TABLE(float, EXP_TBL_EP_HEAD, 65) = { +__CLC_DECLARE_TABLE(float, EXP_TBL_EP_HEAD, 65) = { 0x1.000000p+0f, 0x1.02c000p+0f, 0x1.058000p+0f, 0x1.084000p+0f, 0x1.0b4000p+0f, 0x1.0e0000p+0f, 0x1.110000p+0f, 0x1.140000p+0f, 0x1.170000p+0f, 0x1.1a0000p+0f, 0x1.1d4000p+0f, 0x1.204000p+0f, @@ -508,9 +506,9 @@ DECLARE_TABLE(float, EXP_TBL_EP_HEAD, 65) = { 0x1.000000p+1f, }; -CLC_TABLE_FUNCTION(float, EXP_TBL_EP_HEAD, exp_tbl_ep_head); +__CLC_TABLE_FUNCTION_VEC(float, EXP_TBL_EP_HEAD, exp_tbl_ep_head); -DECLARE_TABLE(float, EXP_TBL_EP_TAIL, 65) = { +__CLC_DECLARE_TABLE(float, EXP_TBL_EP_TAIL, 65) = { 0x0.000000p+0f, 0x1.347ceep-13f, 0x1.b0d314p-12f, 0x1.a28c3ap-11f, 0x1.586cf8p-12f, 0x1.f61968p-11f, 0x1.80e808p-11f, 0x1.4d5754p-11f, 0x1.5c1e3ep-11f, 0x1.adf5b6p-11f, 0x1.0e62d0p-13f, 0x1.1dc430p-11f, @@ -530,9 +528,9 @@ DECLARE_TABLE(float, EXP_TBL_EP_TAIL, 65) = { 0x0.000000p+0f, }; -CLC_TABLE_FUNCTION(float, EXP_TBL_EP_TAIL, exp_tbl_ep_tail); +__CLC_TABLE_FUNCTION_VEC(float, EXP_TBL_EP_TAIL, exp_tbl_ep_tail); -DECLARE_TABLE(float, CBRT_TBL_HEAD, 129) = { +__CLC_DECLARE_TABLE(float, CBRT_TBL_HEAD, 129) = { 0x1.000000p+0f, 0x1.008000p+0f, 0x1.014000p+0f, 0x1.01c000p+0f, 0x1.028000p+0f, 0x1.034000p+0f, 0x1.03c000p+0f, 0x1.048000p+0f, 0x1.050000p+0f, 0x1.05c000p+0f, 0x1.068000p+0f, 0x1.070000p+0f, @@ -568,9 +566,9 @@ DECLARE_TABLE(float, CBRT_TBL_HEAD, 129) = { 0x1.428000p+0f, }; -CLC_TABLE_FUNCTION(float, CBRT_TBL_HEAD, cbrt_tbl_head); +__CLC_TABLE_FUNCTION_VEC(float, CBRT_TBL_HEAD, cbrt_tbl_head); -DECLARE_TABLE(float, CBRT_TBL_TAIL, 129) = { 
+__CLC_DECLARE_TABLE(float, CBRT_TBL_TAIL, 129) = { 0x0.000000p+0f, 0x1.51cb0ap-11f, 0x1.39221ep-12f, 0x1.e06908p-11f, 0x1.1d6978p-11f, 0x1.4ea1bep-13f, 0x1.833b8ep-11f, 0x1.587002p-12f, 0x1.ceb290p-11f, 0x1.d57f34p-12f, 0x1.cc53acp-21f, 0x1.0fe098p-11f, @@ -606,10 +604,10 @@ DECLARE_TABLE(float, CBRT_TBL_TAIL, 129) = { 0x1.45f31ap-13f, }; -CLC_TABLE_FUNCTION(float, CBRT_TBL_TAIL, cbrt_tbl_tail); +__CLC_TABLE_FUNCTION_VEC(float, CBRT_TBL_TAIL, cbrt_tbl_tail); // Tabulated values of sinh(i) and cosh(i) for i = 0,...,36. -DECLARE_TABLE(float, SINHCOSH_TBL_HEAD, 37) = { +__CLC_DECLARE_TABLE(float, SINHCOSH_TBL_HEAD, 37) = { 0x0.000000p+0f, 0x1.2cd9fcp+0f, 0x1.d03cf6p+1f, 0x1.40926ep+3f, 0x1.b4a380p+4f, 0x1.28d016p+6f, 0x1.936d22p+7f, 0x1.122876p+9f, 0x1.749ea6p+10f, 0x1.fa7158p+11f, 0x1.5829dcp+13f, 0x1.d3c448p+14f, @@ -622,9 +620,9 @@ DECLARE_TABLE(float, SINHCOSH_TBL_HEAD, 37) = { 0x1.ea215ap+50f, }; -CLC_TABLE_FUNCTION(float, SINHCOSH_TBL_HEAD, sinhcosh_tbl_head); +__CLC_TABLE_FUNCTION_VEC(float, SINHCOSH_TBL_HEAD, sinhcosh_tbl_head); -DECLARE_TABLE(float, SINHCOSH_TBL_TAIL, 37) = { +__CLC_DECLARE_TABLE(float, SINHCOSH_TBL_TAIL, 37) = { 0x1.000000p+0f, 0x1.8b0756p+0f, 0x1.e18fa0p+1f, 0x1.422a4ap+3f, 0x1.b4ee86p+4f, 0x1.28d6fcp+6f, 0x1.936e68p+7f, 0x1.122894p+9f, 0x1.749eaap+10f, 0x1.fa7158p+11f, 0x1.5829dep+13f, 0x1.d3c448p+14f, @@ -637,13 +635,13 @@ DECLARE_TABLE(float, SINHCOSH_TBL_TAIL, 37) = { 0x1.ea215ap+50f, }; -CLC_TABLE_FUNCTION(float, SINHCOSH_TBL_TAIL, sinhcosh_tbl_tail); +__CLC_TABLE_FUNCTION_VEC(float, SINHCOSH_TBL_TAIL, sinhcosh_tbl_tail); #ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64 : enable -DECLARE_TABLE(double, LN_TBL_LO, 65) = { +__CLC_DECLARE_TABLE(double, LN_TBL_LO, 65) = { 0x0.0000000000000p+0, 0x1.fc0a800000000p-7, 0x1.f829800000000p-6, 0x1.7745800000000p-5, 0x1.f0a3000000000p-5, 0x1.341d700000000p-4, 0x1.6f0d200000000p-4, 0x1.a926d00000000p-4, 0x1.e270700000000p-4, @@ -668,9 +666,9 @@ DECLARE_TABLE(double, LN_TBL_LO, 65) = { 
0x1.5ee02a0000000p-1, 0x1.62e42e0000000p-1, }; -CLC_TABLE_FUNCTION(double, LN_TBL_LO, ln_tbl_lo); +__CLC_TABLE_FUNCTION_VEC(double, LN_TBL_LO, ln_tbl_lo); -DECLARE_TABLE(double, LN_TBL_HI, 65) = { +__CLC_DECLARE_TABLE(double, LN_TBL_HI, 65) = { 0x0.0000000000000p+0, 0x1.61f807c79f3dbp-28, 0x1.873c1980267c8p-25, 0x1.ec65b9f88c69ep-26, 0x1.8022c54cc2f99p-26, 0x1.2c37a3a125330p-25, 0x1.15cad69737c93p-25, 0x1.d256ab1b285e9p-27, 0x1.b8abcb97a7aa2p-26, @@ -695,14 +693,14 @@ DECLARE_TABLE(double, LN_TBL_HI, 65) = { 0x1.2482ceae1ac12p-26, 0x1.efa39ef35793cp-25, }; -CLC_TABLE_FUNCTION(double, LN_TBL_HI, ln_tbl_hi); +__CLC_TABLE_FUNCTION_VEC(double, LN_TBL_HI, ln_tbl_hi); // Arrays atan_jby256_head and atan_jby256_tail contain leading and trailing // parts respectively of precomputed values of atan(j/256), for j = 16, 17, ..., // 256. atan_jby256_head contains the first 21 bits of precision, and // atan_jby256_tail contains a further 53 bits precision. -DECLARE_TABLE(double, ATAN_JBY256_TBL_HEAD, 241) = { +__CLC_DECLARE_TABLE(double, ATAN_JBY256_TBL_HEAD, 241) = { 0x1.ff55b00000000p-5, 0x1.0f99e00000000p-4, 0x1.1f86d00000000p-4, 0x1.2f71900000000p-4, 0x1.3f59f00000000p-4, 0x1.4f3fd00000000p-4, 0x1.5f23200000000p-4, 0x1.6f03b00000000p-4, 0x1.7ee1800000000p-4, @@ -786,9 +784,9 @@ DECLARE_TABLE(double, ATAN_JBY256_TBL_HEAD, 241) = { 0x1.921fb00000000p-1, }; -CLC_TABLE_FUNCTION(double, ATAN_JBY256_TBL_HEAD, atan_jby256_tbl_head); +__CLC_TABLE_FUNCTION_VEC(double, ATAN_JBY256_TBL_HEAD, atan_jby256_tbl_head); -DECLARE_TABLE(double, ATAN_JBY256_TBL_TAIL, 241) = { +__CLC_DECLARE_TABLE(double, ATAN_JBY256_TBL_TAIL, 241) = { 0x1.6e59fbd38db2cp-26, 0x1.4e3aa54dedf96p-25, 0x1.7e105ab1bda88p-25, 0x1.8c5254d013fd0p-27, 0x1.cf8ab3ad62670p-29, 0x1.9dca4bec80468p-26, 0x1.3f4b5ec98a8dap-26, 0x1.b9d49619d81fep-25, 0x1.3017887460934p-27, @@ -872,9 +870,9 @@ DECLARE_TABLE(double, ATAN_JBY256_TBL_TAIL, 241) = { 0x1.5110b4611a626p-23, }; -CLC_TABLE_FUNCTION(double, ATAN_JBY256_TBL_TAIL, 
atan_jby256_tbl_tail); +__CLC_TABLE_FUNCTION_VEC(double, ATAN_JBY256_TBL_TAIL, atan_jby256_tbl_tail); -DECLARE_TABLE(double, LOG_F_INV_TBL_HEAD, 258) = { +__CLC_DECLARE_TABLE(double, LOG_F_INV_TBL_HEAD, 258) = { 0x1.0000000000000p+1, 0x1.fe00000000000p+0, 0x1.fc00000000000p+0, 0x1.fa00000000000p+0, 0x1.f800000000000p+0, 0x1.f600000000000p+0, 0x1.f400000000000p+0, 0x1.f200000000000p+0, 0x1.f000000000000p+0, @@ -963,9 +961,9 @@ DECLARE_TABLE(double, LOG_F_INV_TBL_HEAD, 258) = { 0x1.0000000000000p+0, 0x1.0000000000000p+0, }; -CLC_TABLE_FUNCTION(double, LOG_F_INV_TBL_HEAD, log_f_inv_tbl_head); +__CLC_TABLE_FUNCTION_VEC(double, LOG_F_INV_TBL_HEAD, log_f_inv_tbl_head); -DECLARE_TABLE(double, LOG_F_INV_TBL_TAIL, 258) = { +__CLC_DECLARE_TABLE(double, LOG_F_INV_TBL_TAIL, 258) = { 0x0.0000000000000p+0, 0x1.fe01fe01fe020p-16, 0x1.fc07f01fc07f0p-14, 0x1.1caa01fa11caap-12, 0x1.f81f81f81f820p-12, 0x1.8856506ddaba6p-11, 0x1.196792909c560p-10, 0x1.7d9108c2ad433p-10, 0x1.f07c1f07c1f08p-10, @@ -1054,9 +1052,9 @@ DECLARE_TABLE(double, LOG_F_INV_TBL_TAIL, 258) = { 0x1.0080402010080p-9, 0x0.0000000000000p+0, }; -CLC_TABLE_FUNCTION(double, LOG_F_INV_TBL_TAIL, log_f_inv_tbl_tail); +__CLC_TABLE_FUNCTION_VEC(double, LOG_F_INV_TBL_TAIL, log_f_inv_tbl_tail); -DECLARE_TABLE(double, POWLOG_TBL_HEAD, 258) = { +__CLC_DECLARE_TABLE(double, POWLOG_TBL_HEAD, 258) = { 0x0.0000000000000p+0, 0x1.ff00aa0000000p-9, 0x1.fe02a60000000p-8, 0x1.7dc4750000000p-7, 0x1.fc0a8b0000000p-7, 0x1.3cea440000000p-6, 0x1.7b91b00000000p-6, 0x1.b9fc020000000p-6, 0x1.f829b00000000p-6, @@ -1145,9 +1143,9 @@ DECLARE_TABLE(double, POWLOG_TBL_HEAD, 258) = { 0x1.61e3ef0000000p-1, 0x1.62e42e0000000p-1, 0x0.0000000000000p+0, }; -CLC_TABLE_FUNCTION(double, POWLOG_TBL_HEAD, powlog_tbl_head); +__CLC_TABLE_FUNCTION_VEC(double, POWLOG_TBL_HEAD, powlog_tbl_head); -DECLARE_TABLE(double, POWLOG_TBL_TAIL, 258) = { +__CLC_DECLARE_TABLE(double, POWLOG_TBL_TAIL, 258) = { 0x0.0000000000000p+0, 0x1.5885e0250435ap-36, 0x1.620cf11f86ed2p-33, 
0x1.f0214edba4a25p-32, 0x1.f807c79f3db4ep-36, 0x1.a352ba779a52bp-33, 0x1.f56c46aa49fd5p-32, 0x1.ebe465fef5196p-32, 0x1.cf0660099f1f8p-31, @@ -1236,9 +1234,9 @@ DECLARE_TABLE(double, POWLOG_TBL_TAIL, 258) = { 0x1.b48c8cd2f246cp-26, 0x1.efa39ef35793cp-25, 0x0.0000000000000p+0, }; -CLC_TABLE_FUNCTION(double, POWLOG_TBL_TAIL, powlog_tbl_tail); +__CLC_TABLE_FUNCTION_VEC(double, POWLOG_TBL_TAIL, powlog_tbl_tail); -DECLARE_TABLE(double, TWO_TO_JBY64_EP_HEAD, 64) = { +__CLC_DECLARE_TABLE(double, TWO_TO_JBY64_EP_HEAD, 64) = { 0x1.0000000000000p+0, 0x1.02c9a30000000p+0, 0x1.059b0d0000000p+0, 0x1.0874510000000p+0, 0x1.0b55860000000p+0, 0x1.0e3ec30000000p+0, 0x1.11301d0000000p+0, 0x1.1429aa0000000p+0, 0x1.172b830000000p+0, @@ -1263,9 +1261,10 @@ DECLARE_TABLE(double, TWO_TO_JBY64_EP_HEAD, 64) = { 0x1.fa7c180000000p+0, }; -CLC_TABLE_FUNCTION(double, TWO_TO_JBY64_EP_HEAD, two_to_jby64_ep_tbl_head); +__CLC_TABLE_FUNCTION_VEC(double, TWO_TO_JBY64_EP_HEAD, + two_to_jby64_ep_tbl_head); -DECLARE_TABLE(double, TWO_TO_JBY64_EP_TAIL, 64) = { +__CLC_DECLARE_TABLE(double, TWO_TO_JBY64_EP_TAIL, 64) = { 0x0.0000000000000p+0, 0x1.cef00c1dcdef9p-25, 0x1.8ac2ba1d73e2ap-27, 0x1.0eb37901186bep-25, 0x1.9f3121ec53172p-25, 0x1.69e8d10103a17p-27, 0x1.25b50a4ebbf1ap-32, 0x1.d525bbf668203p-25, 0x1.8faa2f5b9bef9p-25, @@ -1290,9 +1289,10 @@ DECLARE_TABLE(double, TWO_TO_JBY64_EP_TAIL, 64) = { 0x1.9e90d82e90a7ep-28, }; -CLC_TABLE_FUNCTION(double, TWO_TO_JBY64_EP_TAIL, two_to_jby64_ep_tbl_tail); +__CLC_TABLE_FUNCTION_VEC(double, TWO_TO_JBY64_EP_TAIL, + two_to_jby64_ep_tbl_tail); -DECLARE_TABLE(double, CBRT_INV_TBL, 257) = { +__CLC_DECLARE_TABLE(double, CBRT_INV_TBL, 257) = { 0x1.0000000000000p+1, 0x1.fe01fe01fe020p+0, 0x1.fc07f01fc07f0p+0, 0x1.fa11caa01fa12p+0, 0x1.f81f81f81f820p+0, 0x1.f6310aca0dbb5p+0, 0x1.f44659e4a4271p+0, 0x1.f25f644230ab5p+0, 0x1.f07c1f07c1f08p+0, @@ -1381,9 +1381,9 @@ DECLARE_TABLE(double, CBRT_INV_TBL, 257) = { 0x1.0080402010080p+0, 0x1.0000000000000p+0, }; 
-CLC_TABLE_FUNCTION(double, CBRT_INV_TBL, cbrt_inv_tbl); +__CLC_TABLE_FUNCTION_VEC(double, CBRT_INV_TBL, cbrt_inv_tbl); -DECLARE_TABLE(double, CBRT_DBL_TBL_HEAD, 257) = { +__CLC_DECLARE_TABLE(double, CBRT_DBL_TBL_HEAD, 257) = { 0x1.0000000000000p+0, 0x1.0055380000000p+0, 0x1.00aa390000000p+0, 0x1.00ff010000000p+0, 0x1.0153920000000p+0, 0x1.01a7eb0000000p+0, 0x1.01fc0d0000000p+0, 0x1.024ff80000000p+0, 0x1.02a3ad0000000p+0, @@ -1472,9 +1472,9 @@ DECLARE_TABLE(double, CBRT_DBL_TBL_HEAD, 257) = { 0x1.4254640000000p+0, 0x1.428a2f0000000p+0, }; -CLC_TABLE_FUNCTION(double, CBRT_DBL_TBL_HEAD, cbrt_dbl_tbl_head); +__CLC_TABLE_FUNCTION_VEC(double, CBRT_DBL_TBL_HEAD, cbrt_dbl_tbl_head); -DECLARE_TABLE(double, CBRT_DBL_TBL_TAIL, 257) = { +__CLC_DECLARE_TABLE(double, CBRT_DBL_TBL_TAIL, 257) = { 0x0.0000000000000p+0, 0x1.e6a24c81e4294p-25, 0x1.8548511e3a785p-26, 0x1.4eb9336ec07f6p-25, 0x1.0ea64b8b750e1p-27, 0x1.61637cff8a53cp-27, 0x1.0733bf7bd1943p-27, 0x1.666911345ccedp-26, 0x1.77b7a3f592f14p-27, @@ -1563,23 +1563,23 @@ DECLARE_TABLE(double, CBRT_DBL_TBL_TAIL, 257) = { 0x1.dc43f1ed210b4p-25, 0x1.31ae515c447bbp-25, }; -CLC_TABLE_FUNCTION(double, CBRT_DBL_TBL_TAIL, cbrt_dbl_tbl_tail); +__CLC_TABLE_FUNCTION_VEC(double, CBRT_DBL_TBL_TAIL, cbrt_dbl_tbl_tail); -DECLARE_TABLE(double, CBRT_REM_TBL_HEAD, 5) = { +__CLC_DECLARE_TABLE(double, CBRT_REM_TBL_HEAD, 5) = { 0x1.428a2f0000000p-1, 0x1.965fea0000000p-1, 0x1.0000000000000p+0, 0x1.428a2f0000000p+0, 0x1.965fea0000000p+0, }; -CLC_TABLE_FUNCTION(double, CBRT_REM_TBL_HEAD, cbrt_rem_tbl_head); +__CLC_TABLE_FUNCTION_VEC(double, CBRT_REM_TBL_HEAD, cbrt_rem_tbl_head); -DECLARE_TABLE(double, CBRT_REM_TBL_TAIL, 5) = { +__CLC_DECLARE_TABLE(double, CBRT_REM_TBL_TAIL, 5) = { 0x1.31ae515c447bbp-26, 0x1.4f5b8f20ac166p-27, 0x0.0000000000000p+0, 0x1.31ae515c447bbp-25, 0x1.4f5b8f20ac166p-26, }; -CLC_TABLE_FUNCTION(double, CBRT_REM_TBL_TAIL, cbrt_rem_tbl_tail); +__CLC_TABLE_FUNCTION_VEC(double, CBRT_REM_TBL_TAIL, cbrt_rem_tbl_tail); 
-DECLARE_TABLE(double, SINH_TBL_HEAD, 37) = { +__CLC_DECLARE_TABLE(double, SINH_TBL_HEAD, 37) = { 0x0.0000000000000p+0, 0x1.2cd9fc0000000p+0, 0x1.d03cf60000000p+1, 0x1.40926e0000000p+3, 0x1.b4a3800000000p+4, 0x1.28d0160000000p+6, 0x1.936d228000000p+7, 0x1.1228768000000p+9, 0x1.749ea50000000p+10, @@ -1595,7 +1595,7 @@ DECLARE_TABLE(double, SINH_TBL_HEAD, 37) = { 0x1.ea215a0000000p+50, }; -DECLARE_TABLE(double, SINH_TBL_TAIL, 37) = { +__CLC_DECLARE_TABLE(double, SINH_TBL_TAIL, 37) = { 0x0.0000000000000p+0, 0x1.13ae6096a0092p-26, 0x1.db70cfb79a640p-26, 0x1.c2526b66dc067p-23, 0x1.b81b18647f380p-23, 0x1.bc1cdd1e1eb08p-20, 0x1.d9f201534fb09p-19, 0x1.d1c064a4e9954p-18, 0x1.4eca65d06ea74p-18, @@ -1611,7 +1611,7 @@ DECLARE_TABLE(double, SINH_TBL_TAIL, 37) = { 0x1.d20d76744835cp+22, }; -DECLARE_TABLE(double, COSH_TBL_HEAD, 37) = { +__CLC_DECLARE_TABLE(double, COSH_TBL_HEAD, 37) = { 0x1.0000000000000p+0, 0x1.8b07550000000p+0, 0x1.e18fa08000000p+1, 0x1.422a490000000p+3, 0x1.b4ee858000000p+4, 0x1.28d6fc8000000p+6, 0x1.936e678000000p+7, 0x1.1228948000000p+9, 0x1.749eaa8000000p+10, @@ -1627,7 +1627,7 @@ DECLARE_TABLE(double, COSH_TBL_HEAD, 37) = { 0x1.ea215a0000000p+50, }; -DECLARE_TABLE(double, COSH_TBL_TAIL, 37) = { +__CLC_DECLARE_TABLE(double, COSH_TBL_TAIL, 37) = { 0x0.0000000000000p+0, 0x1.d9f5504c2bd28p-28, 0x1.7cb66f0a4c9fdp-25, 0x1.f58617928e588p-23, 0x1.bc7d000c38d48p-25, 0x1.f7f9d4e329998p-21, 0x1.6e6e464885269p-19, 0x1.ba3a8b946c154p-19, 0x1.3f4e76110d5a4p-18, @@ -1643,12 +1643,12 @@ DECLARE_TABLE(double, COSH_TBL_TAIL, 37) = { 0x1.d20d76744835cp+22, }; -CLC_TABLE_FUNCTION(double, SINH_TBL_HEAD, sinh_tbl_head); -CLC_TABLE_FUNCTION(double, SINH_TBL_TAIL, sinh_tbl_tail); -CLC_TABLE_FUNCTION(double, COSH_TBL_HEAD, cosh_tbl_head); -CLC_TABLE_FUNCTION(double, COSH_TBL_TAIL, cosh_tbl_tail); +__CLC_TABLE_FUNCTION_VEC(double, SINH_TBL_HEAD, sinh_tbl_head); +__CLC_TABLE_FUNCTION_VEC(double, SINH_TBL_TAIL, sinh_tbl_tail); +__CLC_TABLE_FUNCTION_VEC(double, COSH_TBL_HEAD, 
cosh_tbl_head); +__CLC_TABLE_FUNCTION_VEC(double, COSH_TBL_TAIL, cosh_tbl_tail); -DECLARE_TABLE(uchar, PIBITS_TBL, ) = { +__CLC_DECLARE_TABLE(uchar, PIBITS_TBL, ) = { 224, 241, 27, 193, 12, 88, 33, 116, 53, 126, 196, 126, 237, 175, 169, 75, 74, 41, 222, 231, 28, 244, 236, 197, 151, 175, 31, 235, 158, 212, 181, 168, 127, 121, 154, 253, 24, 61, 221, 38, 44, 159, 60, 251, 217, @@ -1661,25 +1661,25 @@ DECLARE_TABLE(uchar, PIBITS_TBL, ) = { 119, 211, 212, 71, 95, 157, 240, 167, 84, 16, 57, 185, 13, 230, 139, 2, 0, 0, 0, 0, 0, 0, 0}; -_CLC_DEF _CLC_OVERLOAD ulong TABLE_MANGLE(pibits_tbl)(int idx) { +_CLC_DEF _CLC_OVERLOAD ulong __CLC_TABLE_MANGLE(pibits_tbl)(int idx) { return *(__constant ulong *)(PIBITS_TBL + idx); } -_CLC_DEF _CLC_OVERLOAD ulong2 TABLE_MANGLE(pibits_tbl)(int2 idx) { +_CLC_DEF _CLC_OVERLOAD ulong2 __CLC_TABLE_MANGLE(pibits_tbl)(int2 idx) { return (ulong2){*(__constant ulong *)(PIBITS_TBL + idx.s0), *(__constant ulong *)(PIBITS_TBL + idx.s1)}; } -_CLC_DEF _CLC_OVERLOAD ulong3 TABLE_MANGLE(pibits_tbl)(int3 idx) { +_CLC_DEF _CLC_OVERLOAD ulong3 __CLC_TABLE_MANGLE(pibits_tbl)(int3 idx) { return (ulong3){*(__constant ulong *)(PIBITS_TBL + idx.s0), *(__constant ulong *)(PIBITS_TBL + idx.s1), *(__constant ulong *)(PIBITS_TBL + idx.s2)}; } -_CLC_DEF _CLC_OVERLOAD ulong4 TABLE_MANGLE(pibits_tbl)(int4 idx) { +_CLC_DEF _CLC_OVERLOAD ulong4 __CLC_TABLE_MANGLE(pibits_tbl)(int4 idx) { return (ulong4){*(__constant ulong *)(PIBITS_TBL + idx.s0), *(__constant ulong *)(PIBITS_TBL + idx.s1), *(__constant ulong *)(PIBITS_TBL + idx.s2), *(__constant ulong *)(PIBITS_TBL + idx.s3)}; } -_CLC_DEF _CLC_OVERLOAD ulong8 TABLE_MANGLE(pibits_tbl)(int8 idx) { +_CLC_DEF _CLC_OVERLOAD ulong8 __CLC_TABLE_MANGLE(pibits_tbl)(int8 idx) { return (ulong8){*(__constant ulong *)(PIBITS_TBL + idx.s0), *(__constant ulong *)(PIBITS_TBL + idx.s1), *(__constant ulong *)(PIBITS_TBL + idx.s2), @@ -1689,7 +1689,7 @@ _CLC_DEF _CLC_OVERLOAD ulong8 TABLE_MANGLE(pibits_tbl)(int8 idx) { *(__constant ulong 
*)(PIBITS_TBL + idx.s6), *(__constant ulong *)(PIBITS_TBL + idx.s7)}; } -_CLC_DEF _CLC_OVERLOAD ulong16 TABLE_MANGLE(pibits_tbl)(int16 idx) { +_CLC_DEF _CLC_OVERLOAD ulong16 __CLC_TABLE_MANGLE(pibits_tbl)(int16 idx) { return (ulong16){*(__constant ulong *)(PIBITS_TBL + idx.s0), *(__constant ulong *)(PIBITS_TBL + idx.s1), *(__constant ulong *)(PIBITS_TBL + idx.s2), diff --git a/libclc/clc/lib/generic/math/clc_tgamma.cl b/libclc/clc/lib/generic/math/clc_tgamma.cl index 83b09cc33ecc..c379cd16b5a6 100644 --- a/libclc/clc/lib/generic/math/clc_tgamma.cl +++ b/libclc/clc/lib/generic/math/clc_tgamma.cl @@ -65,6 +65,6 @@ _CLC_OVERLOAD _CLC_DEF half __clc_tgamma(half x) { #endif -#define FUNCTION __clc_tgamma +#define __CLC_FUNCTION __clc_tgamma #define __CLC_BODY #include diff --git a/libclc/clc/lib/generic/math/clc_trunc.cl b/libclc/clc/lib/generic/math/clc_trunc.cl index 6261d04cd26d..d9c54a1888d9 100644 --- a/libclc/clc/lib/generic/math/clc_trunc.cl +++ b/libclc/clc/lib/generic/math/clc_trunc.cl @@ -8,8 +8,8 @@ #include -#define FUNCTION __clc_trunc -#define __IMPL_FUNCTION(x) __builtin_elementwise_trunc +#define __CLC_FUNCTION __clc_trunc +#define __CLC_IMPL_FUNCTION(x) __builtin_elementwise_trunc #define __CLC_BODY #include diff --git a/libclc/clc/lib/generic/relational/clc_bitselect.cl b/libclc/clc/lib/generic/relational/clc_bitselect.cl index 2976b8947e54..9c6e82e6be4b 100644 --- a/libclc/clc/lib/generic/relational/clc_bitselect.cl +++ b/libclc/clc/lib/generic/relational/clc_bitselect.cl @@ -12,7 +12,7 @@ #define __CLC_BODY #include -#define FLOAT_BITSELECT(f_type, i_type, width) \ +#define __CLC_FLOAT_BITSELECT(f_type, i_type, width) \ _CLC_OVERLOAD _CLC_DEF f_type##width __clc_bitselect( \ f_type##width x, f_type##width y, f_type##width z) { \ return __clc_as_##f_type##width(__clc_bitselect( \ @@ -20,33 +20,33 @@ __clc_as_##i_type##width(z))); \ } -FLOAT_BITSELECT(float, uint, ) -FLOAT_BITSELECT(float, uint, 2) -FLOAT_BITSELECT(float, uint, 3) 
-FLOAT_BITSELECT(float, uint, 4) -FLOAT_BITSELECT(float, uint, 8) -FLOAT_BITSELECT(float, uint, 16) +__CLC_FLOAT_BITSELECT(float, uint, ) +__CLC_FLOAT_BITSELECT(float, uint, 2) +__CLC_FLOAT_BITSELECT(float, uint, 3) +__CLC_FLOAT_BITSELECT(float, uint, 4) +__CLC_FLOAT_BITSELECT(float, uint, 8) +__CLC_FLOAT_BITSELECT(float, uint, 16) #ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64 : enable -FLOAT_BITSELECT(double, ulong, ) -FLOAT_BITSELECT(double, ulong, 2) -FLOAT_BITSELECT(double, ulong, 3) -FLOAT_BITSELECT(double, ulong, 4) -FLOAT_BITSELECT(double, ulong, 8) -FLOAT_BITSELECT(double, ulong, 16) +__CLC_FLOAT_BITSELECT(double, ulong, ) +__CLC_FLOAT_BITSELECT(double, ulong, 2) +__CLC_FLOAT_BITSELECT(double, ulong, 3) +__CLC_FLOAT_BITSELECT(double, ulong, 4) +__CLC_FLOAT_BITSELECT(double, ulong, 8) +__CLC_FLOAT_BITSELECT(double, ulong, 16) #endif #ifdef cl_khr_fp16 #pragma OPENCL EXTENSION cl_khr_fp16 : enable -FLOAT_BITSELECT(half, ushort, ) -FLOAT_BITSELECT(half, ushort, 2) -FLOAT_BITSELECT(half, ushort, 3) -FLOAT_BITSELECT(half, ushort, 4) -FLOAT_BITSELECT(half, ushort, 8) -FLOAT_BITSELECT(half, ushort, 16) +__CLC_FLOAT_BITSELECT(half, ushort, ) +__CLC_FLOAT_BITSELECT(half, ushort, 2) +__CLC_FLOAT_BITSELECT(half, ushort, 3) +__CLC_FLOAT_BITSELECT(half, ushort, 4) +__CLC_FLOAT_BITSELECT(half, ushort, 8) +__CLC_FLOAT_BITSELECT(half, ushort, 16) #endif diff --git a/libclc/clc/lib/generic/relational/clc_signbit.cl b/libclc/clc/lib/generic/relational/clc_signbit.cl index d8736a72b394..cb7000ddb075 100644 --- a/libclc/clc/lib/generic/relational/clc_signbit.cl +++ b/libclc/clc/lib/generic/relational/clc_signbit.cl @@ -9,50 +9,54 @@ #include #include -#define _CLC_DEFINE_RELATIONAL_UNARY_VEC2(RET_TYPE, FUNCTION, ARG_TYPE) \ - _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG_TYPE x) { \ - return (RET_TYPE)((RET_TYPE){FUNCTION(x.lo), FUNCTION(x.hi)} != \ - (RET_TYPE)0); \ +#define _CLC_DEFINE_RELATIONAL_UNARY_VEC2(RET_TYPE, __CLC_FUNCTION, ARG_TYPE) \ + _CLC_DEF 
_CLC_OVERLOAD RET_TYPE __CLC_FUNCTION(ARG_TYPE x) { \ + return (RET_TYPE)((RET_TYPE){__CLC_FUNCTION(x.lo), \ + __CLC_FUNCTION(x.hi)} != (RET_TYPE)0); \ } -#define _CLC_DEFINE_RELATIONAL_UNARY_VEC3(RET_TYPE, FUNCTION, ARG_TYPE) \ - _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG_TYPE x) { \ - return (RET_TYPE)((RET_TYPE){FUNCTION(x.s0), FUNCTION(x.s1), \ - FUNCTION(x.s2)} != (RET_TYPE)0); \ +#define _CLC_DEFINE_RELATIONAL_UNARY_VEC3(RET_TYPE, __CLC_FUNCTION, ARG_TYPE) \ + _CLC_DEF _CLC_OVERLOAD RET_TYPE __CLC_FUNCTION(ARG_TYPE x) { \ + return (RET_TYPE)((RET_TYPE){__CLC_FUNCTION(x.s0), __CLC_FUNCTION(x.s1), \ + __CLC_FUNCTION(x.s2)} != (RET_TYPE)0); \ } -#define _CLC_DEFINE_RELATIONAL_UNARY_VEC4(RET_TYPE, FUNCTION, ARG_TYPE) \ - _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG_TYPE x) { \ - return (RET_TYPE)((RET_TYPE){FUNCTION(x.s0), FUNCTION(x.s1), \ - FUNCTION(x.s2), \ - FUNCTION(x.s3)} != (RET_TYPE)0); \ +#define _CLC_DEFINE_RELATIONAL_UNARY_VEC4(RET_TYPE, __CLC_FUNCTION, ARG_TYPE) \ + _CLC_DEF _CLC_OVERLOAD RET_TYPE __CLC_FUNCTION(ARG_TYPE x) { \ + return (RET_TYPE)((RET_TYPE){__CLC_FUNCTION(x.s0), __CLC_FUNCTION(x.s1), \ + __CLC_FUNCTION(x.s2), \ + __CLC_FUNCTION(x.s3)} != (RET_TYPE)0); \ } -#define _CLC_DEFINE_RELATIONAL_UNARY_VEC8(RET_TYPE, FUNCTION, ARG_TYPE) \ - _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG_TYPE x) { \ - return ( \ - RET_TYPE)((RET_TYPE){FUNCTION(x.s0), FUNCTION(x.s1), FUNCTION(x.s2), \ - FUNCTION(x.s3), FUNCTION(x.s4), FUNCTION(x.s5), \ - FUNCTION(x.s6), FUNCTION(x.s7)} != (RET_TYPE)0); \ +#define _CLC_DEFINE_RELATIONAL_UNARY_VEC8(RET_TYPE, __CLC_FUNCTION, ARG_TYPE) \ + _CLC_DEF _CLC_OVERLOAD RET_TYPE __CLC_FUNCTION(ARG_TYPE x) { \ + return (RET_TYPE)((RET_TYPE){__CLC_FUNCTION(x.s0), __CLC_FUNCTION(x.s1), \ + __CLC_FUNCTION(x.s2), __CLC_FUNCTION(x.s3), \ + __CLC_FUNCTION(x.s4), __CLC_FUNCTION(x.s5), \ + __CLC_FUNCTION(x.s6), \ + __CLC_FUNCTION(x.s7)} != (RET_TYPE)0); \ } -#define _CLC_DEFINE_RELATIONAL_UNARY_VEC16(RET_TYPE, FUNCTION, ARG_TYPE) \ 
- _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG_TYPE x) { \ - return ( \ - RET_TYPE)((RET_TYPE){FUNCTION(x.s0), FUNCTION(x.s1), FUNCTION(x.s2), \ - FUNCTION(x.s3), FUNCTION(x.s4), FUNCTION(x.s5), \ - FUNCTION(x.s6), FUNCTION(x.s7), FUNCTION(x.s8), \ - FUNCTION(x.s9), FUNCTION(x.sa), FUNCTION(x.sb), \ - FUNCTION(x.sc), FUNCTION(x.sd), FUNCTION(x.se), \ - FUNCTION(x.sf)} != (RET_TYPE)0); \ +#define _CLC_DEFINE_RELATIONAL_UNARY_VEC16(RET_TYPE, __CLC_FUNCTION, ARG_TYPE) \ + _CLC_DEF _CLC_OVERLOAD RET_TYPE __CLC_FUNCTION(ARG_TYPE x) { \ + return (RET_TYPE)((RET_TYPE){__CLC_FUNCTION(x.s0), __CLC_FUNCTION(x.s1), \ + __CLC_FUNCTION(x.s2), __CLC_FUNCTION(x.s3), \ + __CLC_FUNCTION(x.s4), __CLC_FUNCTION(x.s5), \ + __CLC_FUNCTION(x.s6), __CLC_FUNCTION(x.s7), \ + __CLC_FUNCTION(x.s8), __CLC_FUNCTION(x.s9), \ + __CLC_FUNCTION(x.sa), __CLC_FUNCTION(x.sb), \ + __CLC_FUNCTION(x.sc), __CLC_FUNCTION(x.sd), \ + __CLC_FUNCTION(x.se), \ + __CLC_FUNCTION(x.sf)} != (RET_TYPE)0); \ } -#define _CLC_DEFINE_RELATIONAL_UNARY_VEC_ALL(RET_TYPE, FUNCTION, ARG_TYPE) \ - _CLC_DEFINE_RELATIONAL_UNARY_VEC2(RET_TYPE##2, FUNCTION, ARG_TYPE##2) \ - _CLC_DEFINE_RELATIONAL_UNARY_VEC3(RET_TYPE##3, FUNCTION, ARG_TYPE##3) \ - _CLC_DEFINE_RELATIONAL_UNARY_VEC4(RET_TYPE##4, FUNCTION, ARG_TYPE##4) \ - _CLC_DEFINE_RELATIONAL_UNARY_VEC8(RET_TYPE##8, FUNCTION, ARG_TYPE##8) \ - _CLC_DEFINE_RELATIONAL_UNARY_VEC16(RET_TYPE##16, FUNCTION, ARG_TYPE##16) +#define _CLC_DEFINE_RELATIONAL_UNARY_VEC_ALL(RET_TYPE, __CLC_FUNCTION, \ + ARG_TYPE) \ + _CLC_DEFINE_RELATIONAL_UNARY_VEC2(RET_TYPE##2, __CLC_FUNCTION, ARG_TYPE##2) \ + _CLC_DEFINE_RELATIONAL_UNARY_VEC3(RET_TYPE##3, __CLC_FUNCTION, ARG_TYPE##3) \ + _CLC_DEFINE_RELATIONAL_UNARY_VEC4(RET_TYPE##4, __CLC_FUNCTION, ARG_TYPE##4) \ + _CLC_DEFINE_RELATIONAL_UNARY_VEC8(RET_TYPE##8, __CLC_FUNCTION, ARG_TYPE##8) \ + _CLC_DEFINE_RELATIONAL_UNARY_VEC16(RET_TYPE##16, __CLC_FUNCTION, ARG_TYPE##16) _CLC_DEF _CLC_OVERLOAD int __clc_signbit(float x) { return __builtin_signbitf(x); diff 
--git a/libclc/clc/lib/generic/shared/clc_vload.cl b/libclc/clc/lib/generic/shared/clc_vload.cl index e4003e4a9673..5942f10d2de5 100644 --- a/libclc/clc/lib/generic/shared/clc_vload.cl +++ b/libclc/clc/lib/generic/shared/clc_vload.cl @@ -9,7 +9,7 @@ #include #include -#define VLOAD_VECTORIZE(PRIM_TYPE, ADDR_SPACE) \ +#define __CLC_VLOAD_VECTORIZE(PRIM_TYPE, ADDR_SPACE) \ _CLC_OVERLOAD _CLC_DEF PRIM_TYPE##2 __clc_vload2( \ size_t offset, const ADDR_SPACE PRIM_TYPE *x) { \ return *( \ @@ -42,89 +42,89 @@ } #if _CLC_DISTINCT_GENERIC_AS_SUPPORTED -#define VLOAD_VECTORIZE_GENERIC VLOAD_VECTORIZE +#define __CLC_VLOAD_VECTORIZE_GENERIC __CLC_VLOAD_VECTORIZE #else // The generic address space isn't available, so make the macro do nothing -#define VLOAD_VECTORIZE_GENERIC(X, Y) +#define __CLC_VLOAD_VECTORIZE_GENERIC(X, Y) #endif -#define VLOAD_ADDR_SPACES(__CLC_SCALAR_GENTYPE) \ - VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __private) \ - VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __local) \ - VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __constant) \ - VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __global) \ - VLOAD_VECTORIZE_GENERIC(__CLC_SCALAR_GENTYPE, __generic) +#define __CLC_VLOAD_ADDR_SPACES(__CLC_SCALAR_GENTYPE) \ + __CLC_VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __private) \ + __CLC_VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __local) \ + __CLC_VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __constant) \ + __CLC_VLOAD_VECTORIZE(__CLC_SCALAR_GENTYPE, __global) \ + __CLC_VLOAD_VECTORIZE_GENERIC(__CLC_SCALAR_GENTYPE, __generic) -#define VLOAD_TYPES() \ - VLOAD_ADDR_SPACES(char) \ - VLOAD_ADDR_SPACES(uchar) \ - VLOAD_ADDR_SPACES(short) \ - VLOAD_ADDR_SPACES(ushort) \ - VLOAD_ADDR_SPACES(int) \ - VLOAD_ADDR_SPACES(uint) \ - VLOAD_ADDR_SPACES(long) \ - VLOAD_ADDR_SPACES(ulong) \ - VLOAD_ADDR_SPACES(float) +#define __CLC_VLOAD_TYPES() \ + __CLC_VLOAD_ADDR_SPACES(char) \ + __CLC_VLOAD_ADDR_SPACES(uchar) \ + __CLC_VLOAD_ADDR_SPACES(short) \ + __CLC_VLOAD_ADDR_SPACES(ushort) \ + __CLC_VLOAD_ADDR_SPACES(int) \ + 
__CLC_VLOAD_ADDR_SPACES(uint) \ + __CLC_VLOAD_ADDR_SPACES(long) \ + __CLC_VLOAD_ADDR_SPACES(ulong) \ + __CLC_VLOAD_ADDR_SPACES(float) -VLOAD_TYPES() +__CLC_VLOAD_TYPES() #ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64 : enable -VLOAD_ADDR_SPACES(double) +__CLC_VLOAD_ADDR_SPACES(double) #endif #ifdef cl_khr_fp16 #pragma OPENCL EXTENSION cl_khr_fp16 : enable -VLOAD_ADDR_SPACES(half) +__CLC_VLOAD_ADDR_SPACES(half) #endif /* vload_half are legal even without cl_khr_fp16 */ /* no vload_half for double */ -#define VEC_LOAD1(val, AS) val = __builtin_load_halff(&mem[offset++]); -#define VEC_LOAD2(val, AS) \ - VEC_LOAD1(val.lo, AS) \ - VEC_LOAD1(val.hi, AS) -#define VEC_LOAD3(val, AS) \ - VEC_LOAD1(val.s0, AS) \ - VEC_LOAD1(val.s1, AS) \ - VEC_LOAD1(val.s2, AS) -#define VEC_LOAD4(val, AS) \ - VEC_LOAD2(val.lo, AS) \ - VEC_LOAD2(val.hi, AS) -#define VEC_LOAD8(val, AS) \ - VEC_LOAD4(val.lo, AS) \ - VEC_LOAD4(val.hi, AS) -#define VEC_LOAD16(val, AS) \ - VEC_LOAD8(val.lo, AS) \ - VEC_LOAD8(val.hi, AS) +#define __CLC_VEC_LOAD1(val, AS) val = __builtin_load_halff(&mem[offset++]); +#define __CLC_VEC_LOAD2(val, AS) \ + __CLC_VEC_LOAD1(val.lo, AS) \ + __CLC_VEC_LOAD1(val.hi, AS) +#define __CLC_VEC_LOAD3(val, AS) \ + __CLC_VEC_LOAD1(val.s0, AS) \ + __CLC_VEC_LOAD1(val.s1, AS) \ + __CLC_VEC_LOAD1(val.s2, AS) +#define __CLC_VEC_LOAD4(val, AS) \ + __CLC_VEC_LOAD2(val.lo, AS) \ + __CLC_VEC_LOAD2(val.hi, AS) +#define __CLC_VEC_LOAD8(val, AS) \ + __CLC_VEC_LOAD4(val.lo, AS) \ + __CLC_VEC_LOAD4(val.hi, AS) +#define __CLC_VEC_LOAD16(val, AS) \ + __CLC_VEC_LOAD8(val.lo, AS) \ + __CLC_VEC_LOAD8(val.hi, AS) -#define __FUNC(SUFFIX, VEC_SIZE, OFFSET_SIZE, TYPE, AS) \ +#define __CLC_FUNC_IMPL(SUFFIX, VEC_SIZE, OFFSET_SIZE, TYPE, AS) \ _CLC_OVERLOAD _CLC_DEF TYPE __clc_vload_half##SUFFIX(size_t offset, \ const AS half *mem) { \ offset *= VEC_SIZE; \ TYPE __tmp; \ - VEC_LOAD##VEC_SIZE(__tmp, AS) return __tmp; \ + __CLC_VEC_LOAD##VEC_SIZE(__tmp, AS) return __tmp; \ } \ _CLC_OVERLOAD _CLC_DEF 
TYPE __clc_vloada_half##SUFFIX(size_t offset, \ const AS half *mem) { \ offset *= OFFSET_SIZE; \ TYPE __tmp; \ - VEC_LOAD##VEC_SIZE(__tmp, AS) return __tmp; \ + __CLC_VEC_LOAD##VEC_SIZE(__tmp, AS) return __tmp; \ } -#define FUNC(SUFFIX, VEC_SIZE, OFFSET_SIZE, TYPE, AS) \ - __FUNC(SUFFIX, VEC_SIZE, OFFSET_SIZE, TYPE, AS) +#define __CLC_FUNC(SUFFIX, VEC_SIZE, OFFSET_SIZE, TYPE, AS) \ + __CLC_FUNC_IMPL(SUFFIX, VEC_SIZE, OFFSET_SIZE, TYPE, AS) #define __CLC_BODY "clc_vload_half.inc" #include -#undef FUNC -#undef __FUNC -#undef VEC_LOAD16 -#undef VEC_LOAD8 -#undef VEC_LOAD4 -#undef VEC_LOAD3 -#undef VEC_LOAD2 -#undef VEC_LOAD1 -#undef VLOAD_TYPES -#undef VLOAD_ADDR_SPACES -#undef VLOAD_VECTORIZE -#undef VLOAD_VECTORIZE_GENERIC +#undef __CLC_FUNC +#undef __CLC_FUNC_IMPL +#undef __CLC_VEC_LOAD16 +#undef __CLC_VEC_LOAD8 +#undef __CLC_VEC_LOAD4 +#undef __CLC_VEC_LOAD3 +#undef __CLC_VEC_LOAD2 +#undef __CLC_VEC_LOAD1 +#undef __CLC_VLOAD_TYPES +#undef __CLC_VLOAD_ADDR_SPACES +#undef __CLC_VLOAD_VECTORIZE +#undef __CLC_VLOAD_VECTORIZE_GENERIC diff --git a/libclc/clc/lib/generic/shared/clc_vload_half.inc b/libclc/clc/lib/generic/shared/clc_vload_half.inc index aaf067d75daa..2d5fdb357b0a 100644 --- a/libclc/clc/lib/generic/shared/clc_vload_half.inc +++ b/libclc/clc/lib/generic/shared/clc_vload_half.inc @@ -16,22 +16,25 @@ #define __CLC_OFFSET __CLC_VECSIZE #endif -FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_OFFSET, __CLC_GENTYPE, __private); -FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_OFFSET, __CLC_GENTYPE, __local); -FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_OFFSET, __CLC_GENTYPE, __global); -FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_OFFSET, __CLC_GENTYPE, __constant); +__CLC_FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_OFFSET, __CLC_GENTYPE, + __private); +__CLC_FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_OFFSET, __CLC_GENTYPE, __local); +__CLC_FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_OFFSET, __CLC_GENTYPE, __global); +__CLC_FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_OFFSET, __CLC_GENTYPE, + 
__constant); #if _CLC_DISTINCT_GENERIC_AS_SUPPORTED -FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_OFFSET, __CLC_GENTYPE, __generic); +__CLC_FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_OFFSET, __CLC_GENTYPE, + __generic); #endif #undef __CLC_OFFSET #else -FUNC(, 1, 1, __CLC_GENTYPE, __private); -FUNC(, 1, 1, __CLC_GENTYPE, __local); -FUNC(, 1, 1, __CLC_GENTYPE, __global); -FUNC(, 1, 1, __CLC_GENTYPE, __constant); +__CLC_FUNC(, 1, 1, __CLC_GENTYPE, __private); +__CLC_FUNC(, 1, 1, __CLC_GENTYPE, __local); +__CLC_FUNC(, 1, 1, __CLC_GENTYPE, __global); +__CLC_FUNC(, 1, 1, __CLC_GENTYPE, __constant); #if _CLC_DISTINCT_GENERIC_AS_SUPPORTED -FUNC(, 1, 1, __CLC_GENTYPE, __generic); +__CLC_FUNC(, 1, 1, __CLC_GENTYPE, __generic); #endif #endif #endif diff --git a/libclc/clc/lib/generic/shared/clc_vstore.cl b/libclc/clc/lib/generic/shared/clc_vstore.cl index adde58aec915..ac9692739512 100644 --- a/libclc/clc/lib/generic/shared/clc_vstore.cl +++ b/libclc/clc/lib/generic/shared/clc_vstore.cl @@ -17,7 +17,7 @@ #pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable -#define VSTORE_VECTORIZE(PRIM_TYPE, ADDR_SPACE) \ +#define __CLC_VSTORE_VECTORIZE(PRIM_TYPE, ADDR_SPACE) \ typedef PRIM_TYPE##2 less_aligned_##ADDR_SPACE##PRIM_TYPE##2 \ __attribute__((aligned(sizeof(PRIM_TYPE)))); \ _CLC_OVERLOAD _CLC_DEF void __clc_vstore2(PRIM_TYPE##2 vec, size_t offset, \ @@ -58,67 +58,68 @@ } #if _CLC_DISTINCT_GENERIC_AS_SUPPORTED -#define VSTORE_VECTORIZE_GENERIC VSTORE_VECTORIZE +#define __CLC_VSTORE_VECTORIZE_GENERIC __CLC_VSTORE_VECTORIZE #else // The generic address space isn't available, so make the macro do nothing -#define VSTORE_VECTORIZE_GENERIC(X, Y) +#define __CLC_VSTORE_VECTORIZE_GENERIC(X, Y) #endif -#define VSTORE_ADDR_SPACES(__CLC_SCALAR___CLC_GENTYPE) \ - VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __private) \ - VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __local) \ - VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __global) \ - 
VSTORE_VECTORIZE_GENERIC(__CLC_SCALAR___CLC_GENTYPE, __generic) +#define __CLC_VSTORE_ADDR_SPACES(__CLC_SCALAR___CLC_GENTYPE) \ + __CLC_VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __private) \ + __CLC_VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __local) \ + __CLC_VSTORE_VECTORIZE(__CLC_SCALAR___CLC_GENTYPE, __global) \ + __CLC_VSTORE_VECTORIZE_GENERIC(__CLC_SCALAR___CLC_GENTYPE, __generic) -VSTORE_ADDR_SPACES(char) -VSTORE_ADDR_SPACES(uchar) -VSTORE_ADDR_SPACES(short) -VSTORE_ADDR_SPACES(ushort) -VSTORE_ADDR_SPACES(int) -VSTORE_ADDR_SPACES(uint) -VSTORE_ADDR_SPACES(long) -VSTORE_ADDR_SPACES(ulong) -VSTORE_ADDR_SPACES(float) +__CLC_VSTORE_ADDR_SPACES(char) +__CLC_VSTORE_ADDR_SPACES(uchar) +__CLC_VSTORE_ADDR_SPACES(short) +__CLC_VSTORE_ADDR_SPACES(ushort) +__CLC_VSTORE_ADDR_SPACES(int) +__CLC_VSTORE_ADDR_SPACES(uint) +__CLC_VSTORE_ADDR_SPACES(long) +__CLC_VSTORE_ADDR_SPACES(ulong) +__CLC_VSTORE_ADDR_SPACES(float) #ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64 : enable -VSTORE_ADDR_SPACES(double) +__CLC_VSTORE_ADDR_SPACES(double) #endif #ifdef cl_khr_fp16 #pragma OPENCL EXTENSION cl_khr_fp16 : enable -VSTORE_ADDR_SPACES(half) +__CLC_VSTORE_ADDR_SPACES(half) #endif -#define VEC_STORE1(val, ROUNDF, BUILTIN) BUILTIN(ROUNDF(val), &mem[offset++]); +#define __CLC_VEC_STORE1(val, ROUNDF, BUILTIN) \ + BUILTIN(ROUNDF(val), &mem[offset++]); -#define VEC_STORE2(val, ROUNDF, BUILTIN) \ - VEC_STORE1(val.lo, ROUNDF, BUILTIN) \ - VEC_STORE1(val.hi, ROUNDF, BUILTIN) -#define VEC_STORE3(val, ROUNDF, BUILTIN) \ - VEC_STORE1(val.s0, ROUNDF, BUILTIN) \ - VEC_STORE1(val.s1, ROUNDF, BUILTIN) \ - VEC_STORE1(val.s2, ROUNDF, BUILTIN) -#define VEC_STORE4(val, ROUNDF, BUILTIN) \ - VEC_STORE2(val.lo, ROUNDF, BUILTIN) \ - VEC_STORE2(val.hi, ROUNDF, BUILTIN) -#define VEC_STORE8(val, ROUNDF, BUILTIN) \ - VEC_STORE4(val.lo, ROUNDF, BUILTIN) \ - VEC_STORE4(val.hi, ROUNDF, BUILTIN) -#define VEC_STORE16(val, ROUNDF, BUILTIN) \ - VEC_STORE8(val.lo, ROUNDF, BUILTIN) \ - VEC_STORE8(val.hi, 
ROUNDF, BUILTIN) +#define __CLC_VEC_STORE2(val, ROUNDF, BUILTIN) \ + __CLC_VEC_STORE1(val.lo, ROUNDF, BUILTIN) \ + __CLC_VEC_STORE1(val.hi, ROUNDF, BUILTIN) +#define __CLC_VEC_STORE3(val, ROUNDF, BUILTIN) \ + __CLC_VEC_STORE1(val.s0, ROUNDF, BUILTIN) \ + __CLC_VEC_STORE1(val.s1, ROUNDF, BUILTIN) \ + __CLC_VEC_STORE1(val.s2, ROUNDF, BUILTIN) +#define __CLC_VEC_STORE4(val, ROUNDF, BUILTIN) \ + __CLC_VEC_STORE2(val.lo, ROUNDF, BUILTIN) \ + __CLC_VEC_STORE2(val.hi, ROUNDF, BUILTIN) +#define __CLC_VEC_STORE8(val, ROUNDF, BUILTIN) \ + __CLC_VEC_STORE4(val.lo, ROUNDF, BUILTIN) \ + __CLC_VEC_STORE4(val.hi, ROUNDF, BUILTIN) +#define __CLC_VEC_STORE16(val, ROUNDF, BUILTIN) \ + __CLC_VEC_STORE8(val.lo, ROUNDF, BUILTIN) \ + __CLC_VEC_STORE8(val.hi, ROUNDF, BUILTIN) -#define __FUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, AS, ROUNDF, BUILTIN) \ +#define __CLC_XFUNC_IMPL(SUFFIX, VEC_SIZE, OFFSET, TYPE, AS, ROUNDF, BUILTIN) \ _CLC_OVERLOAD _CLC_DEF void __clc_vstore_half##SUFFIX( \ TYPE vec, size_t offset, AS half *mem) { \ offset *= VEC_SIZE; \ - VEC_STORE##VEC_SIZE(vec, ROUNDF, BUILTIN) \ + __CLC_VEC_STORE##VEC_SIZE(vec, ROUNDF, BUILTIN) \ } \ _CLC_OVERLOAD _CLC_DEF void __clc_vstorea_half##SUFFIX( \ TYPE vec, size_t offset, AS half *mem) { \ offset *= OFFSET; \ - VEC_STORE##VEC_SIZE(vec, ROUNDF, BUILTIN) \ + __CLC_VEC_STORE##VEC_SIZE(vec, ROUNDF, BUILTIN) \ } _CLC_DEF _CLC_OVERLOAD float __clc_noop(float x) { return x; } @@ -241,28 +242,30 @@ _CLC_DEF _CLC_OVERLOAD double __clc_rte(double x) { } #endif -#define __XFUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, AS, BUILTIN) \ - __FUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, AS, __clc_noop, BUILTIN) \ - __FUNC(SUFFIX##_rtz, VEC_SIZE, OFFSET, TYPE, AS, __clc_rtz, BUILTIN) \ - __FUNC(SUFFIX##_rtn, VEC_SIZE, OFFSET, TYPE, AS, __clc_rtn, BUILTIN) \ - __FUNC(SUFFIX##_rtp, VEC_SIZE, OFFSET, TYPE, AS, __clc_rtp, BUILTIN) \ - __FUNC(SUFFIX##_rte, VEC_SIZE, OFFSET, TYPE, AS, __clc_rte, BUILTIN) +#define __CLC_XFUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, AS, BUILTIN) \ + 
__CLC_XFUNC_IMPL(SUFFIX, VEC_SIZE, OFFSET, TYPE, AS, __clc_noop, BUILTIN) \ + __CLC_XFUNC_IMPL(SUFFIX##_rtz, VEC_SIZE, OFFSET, TYPE, AS, __clc_rtz, \ + BUILTIN) \ + __CLC_XFUNC_IMPL(SUFFIX##_rtn, VEC_SIZE, OFFSET, TYPE, AS, __clc_rtn, \ + BUILTIN) \ + __CLC_XFUNC_IMPL(SUFFIX##_rtp, VEC_SIZE, OFFSET, TYPE, AS, __clc_rtp, \ + BUILTIN) \ + __CLC_XFUNC_IMPL(SUFFIX##_rte, VEC_SIZE, OFFSET, TYPE, AS, __clc_rte, BUILTIN) -#define FUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, AS, BUILTIN) \ - __XFUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, AS, BUILTIN) +#define __CLC_FUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, AS, BUILTIN) \ + __CLC_XFUNC(SUFFIX, VEC_SIZE, OFFSET, TYPE, AS, BUILTIN) #define __CLC_BODY "clc_vstore_half.inc" #include -#undef FUNC -#undef __XFUNC -#undef __FUNC -#undef VEC_LOAD16 -#undef VEC_LOAD8 -#undef VEC_LOAD4 -#undef VEC_LOAD3 -#undef VEC_LOAD2 -#undef VEC_LOAD1 -#undef DECLARE_HELPER -#undef VSTORE_ADDR_SPACES -#undef VSTORE_VECTORIZE -#undef VSTORE_VECTORIZE_GENERIC +#undef __CLC_FUNC +#undef __CLC_XFUNC +#undef __CLC_XFUNC_IMPL +#undef __CLC_VEC_STORE16 +#undef __CLC_VEC_STORE8 +#undef __CLC_VEC_STORE4 +#undef __CLC_VEC_STORE3 +#undef __CLC_VEC_STORE2 +#undef __CLC_VEC_STORE1 +#undef __CLC_VSTORE_ADDR_SPACES +#undef __CLC_VSTORE_VECTORIZE +#undef __CLC_VSTORE_VECTORIZE_GENERIC diff --git a/libclc/clc/lib/generic/shared/clc_vstore_half.inc b/libclc/clc/lib/generic/shared/clc_vstore_half.inc index 02d645357f85..9188f96e2969 100644 --- a/libclc/clc/lib/generic/shared/clc_vstore_half.inc +++ b/libclc/clc/lib/generic/shared/clc_vstore_half.inc @@ -25,24 +25,24 @@ #define __CLC_OFFSET __CLC_VECSIZE #endif -FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_OFFSET, __CLC_GENTYPE, __private, - STORE_HALF_BUILTIN); -FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_OFFSET, __CLC_GENTYPE, __local, - STORE_HALF_BUILTIN); -FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_OFFSET, __CLC_GENTYPE, __global, - STORE_HALF_BUILTIN); +__CLC_FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_OFFSET, __CLC_GENTYPE, __private, + 
STORE_HALF_BUILTIN); +__CLC_FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_OFFSET, __CLC_GENTYPE, __local, + STORE_HALF_BUILTIN); +__CLC_FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_OFFSET, __CLC_GENTYPE, __global, + STORE_HALF_BUILTIN); #if _CLC_DISTINCT_GENERIC_AS_SUPPORTED -FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_OFFSET, __CLC_GENTYPE, __generic, - STORE_HALF_BUILTIN); +__CLC_FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_OFFSET, __CLC_GENTYPE, __generic, + STORE_HALF_BUILTIN); #endif #undef __CLC_OFFSET #else -FUNC(, 1, 1, __CLC_GENTYPE, __private, STORE_HALF_BUILTIN); -FUNC(, 1, 1, __CLC_GENTYPE, __local, STORE_HALF_BUILTIN); -FUNC(, 1, 1, __CLC_GENTYPE, __global, STORE_HALF_BUILTIN); +__CLC_FUNC(, 1, 1, __CLC_GENTYPE, __private, STORE_HALF_BUILTIN); +__CLC_FUNC(, 1, 1, __CLC_GENTYPE, __local, STORE_HALF_BUILTIN); +__CLC_FUNC(, 1, 1, __CLC_GENTYPE, __global, STORE_HALF_BUILTIN); #if _CLC_DISTINCT_GENERIC_AS_SUPPORTED -FUNC(, 1, 1, __CLC_GENTYPE, __generic, STORE_HALF_BUILTIN); +__CLC_FUNC(, 1, 1, __CLC_GENTYPE, __generic, STORE_HALF_BUILTIN); #endif #endif diff --git a/libclc/clc/lib/ptx-nvidiacl/math/clc_log.cl b/libclc/clc/lib/ptx-nvidiacl/math/clc_log.cl index 9c2778bfd1a7..4ccdfaf4e3fc 100644 --- a/libclc/clc/lib/ptx-nvidiacl/math/clc_log.cl +++ b/libclc/clc/lib/ptx-nvidiacl/math/clc_log.cl @@ -29,6 +29,6 @@ _CLC_OVERLOAD _CLC_DEF half __clc_log(half x) { #endif -#define FUNCTION __clc_log +#define __CLC_FUNCTION __clc_log #define __CLC_BODY #include diff --git a/libclc/clc/lib/ptx-nvidiacl/math/clc_rsqrt.cl b/libclc/clc/lib/ptx-nvidiacl/math/clc_rsqrt.cl index beea123180c5..786d57046728 100644 --- a/libclc/clc/lib/ptx-nvidiacl/math/clc_rsqrt.cl +++ b/libclc/clc/lib/ptx-nvidiacl/math/clc_rsqrt.cl @@ -29,6 +29,6 @@ _CLC_OVERLOAD _CLC_DEF half __clc_rsqrt(half x) { #endif -#define FUNCTION __clc_rsqrt +#define __CLC_FUNCTION __clc_rsqrt #define __CLC_BODY #include diff --git a/libclc/clc/lib/ptx-nvidiacl/math/clc_sinpi.cl b/libclc/clc/lib/ptx-nvidiacl/math/clc_sinpi.cl 
index 40903ba58b84..78f7038b2644 100644 --- a/libclc/clc/lib/ptx-nvidiacl/math/clc_sinpi.cl +++ b/libclc/clc/lib/ptx-nvidiacl/math/clc_sinpi.cl @@ -29,6 +29,6 @@ _CLC_OVERLOAD _CLC_DEF half __clc_sinpi(half x) { #endif -#define FUNCTION __clc_sinpi +#define __CLC_FUNCTION __clc_sinpi #define __CLC_BODY #include diff --git a/libclc/clc/lib/ptx-nvidiacl/math/clc_sqrt.cl b/libclc/clc/lib/ptx-nvidiacl/math/clc_sqrt.cl index ea91894b9dcf..ac86a57fca3d 100644 --- a/libclc/clc/lib/ptx-nvidiacl/math/clc_sqrt.cl +++ b/libclc/clc/lib/ptx-nvidiacl/math/clc_sqrt.cl @@ -29,6 +29,6 @@ _CLC_OVERLOAD _CLC_DEF half __clc_sqrt(half x) { #endif -#define FUNCTION __clc_sqrt +#define __CLC_FUNCTION __clc_sqrt #define __CLC_BODY #include diff --git a/libclc/clc/lib/ptx-nvidiacl/relational/clc_isinf.cl b/libclc/clc/lib/ptx-nvidiacl/relational/clc_isinf.cl index d518efad6a04..93f14d79fe6b 100644 --- a/libclc/clc/lib/ptx-nvidiacl/relational/clc_isinf.cl +++ b/libclc/clc/lib/ptx-nvidiacl/relational/clc_isinf.cl @@ -27,7 +27,7 @@ _CLC_OVERLOAD _CLC_DEF int __clc_isinf(half x) { return __clc_isinf((float)x); } #endif -#define FUNCTION __clc_isinf +#define __CLC_FUNCTION __clc_isinf #define __CLC_BODY #define __CLC_RET_TYPE __CLC_BIT_INT #include diff --git a/libclc/clc/lib/ptx-nvidiacl/workitem/clc_get_local_size.cl b/libclc/clc/lib/ptx-nvidiacl/workitem/clc_get_local_size.cl index 4525c85f1e38..2d547d3af249 100644 --- a/libclc/clc/lib/ptx-nvidiacl/workitem/clc_get_local_size.cl +++ b/libclc/clc/lib/ptx-nvidiacl/workitem/clc_get_local_size.cl @@ -17,6 +17,6 @@ _CLC_OVERLOAD _CLC_DEF size_t __clc_get_local_size(uint dim) { case 2: return __nvvm_read_ptx_sreg_ntid_z(); default: - return 0; + return 1; } } diff --git a/libclc/clc/lib/ptx-nvidiacl/workitem/clc_get_num_groups.cl b/libclc/clc/lib/ptx-nvidiacl/workitem/clc_get_num_groups.cl index 495864751ef6..87151ea4be62 100644 --- a/libclc/clc/lib/ptx-nvidiacl/workitem/clc_get_num_groups.cl +++ 
b/libclc/clc/lib/ptx-nvidiacl/workitem/clc_get_num_groups.cl @@ -17,6 +17,6 @@ _CLC_OVERLOAD _CLC_DEF size_t __clc_get_num_groups(uint dim) { case 2: return __nvvm_read_ptx_sreg_nctaid_z(); default: - return 0; + return 1; } } diff --git a/libclc/clc/lib/r600/math/clc_native_rsqrt.cl b/libclc/clc/lib/r600/math/clc_native_rsqrt.cl index b5966570804c..da001574afc0 100644 --- a/libclc/clc/lib/r600/math/clc_native_rsqrt.cl +++ b/libclc/clc/lib/r600/math/clc_native_rsqrt.cl @@ -13,7 +13,7 @@ _CLC_OVERLOAD _CLC_DEF float __clc_native_rsqrt(float x) { return __builtin_r600_recipsqrt_ieeef(x); } -#define __FLOAT_ONLY -#define FUNCTION __clc_native_rsqrt +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION __clc_native_rsqrt #define __CLC_BODY #include diff --git a/libclc/clc/lib/r600/math/clc_rsqrt_override.cl b/libclc/clc/lib/r600/math/clc_rsqrt_override.cl index 75355df56d32..f20046b34117 100644 --- a/libclc/clc/lib/r600/math/clc_rsqrt_override.cl +++ b/libclc/clc/lib/r600/math/clc_rsqrt_override.cl @@ -13,11 +13,11 @@ _CLC_OVERLOAD _CLC_DEF float __clc_rsqrt(float x) { return __builtin_r600_recipsqrt_ieeef(x); } -#define __FLOAT_ONLY -#define FUNCTION __clc_rsqrt +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION __clc_rsqrt #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #ifdef cl_khr_fp64 @@ -27,10 +27,10 @@ _CLC_OVERLOAD _CLC_DEF double __clc_rsqrt(double x) { return __builtin_r600_recipsqrt_ieee(x); } -#define __DOUBLE_ONLY -#define FUNCTION __clc_rsqrt +#define __CLC_DOUBLE_ONLY +#define __CLC_FUNCTION __clc_rsqrt #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif diff --git a/libclc/clc/lib/spirv/math/clc_fmax.cl b/libclc/clc/lib/spirv/math/clc_fmax.cl index be660fe2b29b..f63a9d00c642 100644 --- a/libclc/clc/lib/spirv/math/clc_fmax.cl +++ b/libclc/clc/lib/spirv/math/clc_fmax.cl @@ -27,6 +27,6 @@ _CLC_DEF _CLC_OVERLOAD half __clc_fmax(half x, half y) { } #endif -#define FUNCTION __clc_fmax +#define __CLC_FUNCTION __clc_fmax 
#define __CLC_BODY #include diff --git a/libclc/clc/lib/spirv/math/clc_fmin.cl b/libclc/clc/lib/spirv/math/clc_fmin.cl index 9f3fa66c0ab7..ddb0126035f8 100644 --- a/libclc/clc/lib/spirv/math/clc_fmin.cl +++ b/libclc/clc/lib/spirv/math/clc_fmin.cl @@ -27,6 +27,6 @@ _CLC_DEF _CLC_OVERLOAD half __clc_fmin(half x, half y) { } #endif -#define FUNCTION __clc_fmin +#define __CLC_FUNCTION __clc_fmin #define __CLC_BODY #include diff --git a/libclc/opencl/include/clc/opencl/atomic/atom_add.h b/libclc/opencl/include/clc/opencl/atomic/atom_add.h index 7f68fca0fc3a..b26a2a5361b6 100644 --- a/libclc/opencl/include/clc/opencl/atomic/atom_add.h +++ b/libclc/opencl/include/clc/opencl/atomic/atom_add.h @@ -12,19 +12,19 @@ #include #ifdef cl_khr_global_int32_base_atomics -#define FUNCTION atom_add +#define __CLC_FUNCTION atom_add #define __CLC_ADDRESS_SPACE global #include #endif // cl_khr_global_int32_base_atomics #ifdef cl_khr_local_int32_base_atomics -#define FUNCTION atom_add +#define __CLC_FUNCTION atom_add #define __CLC_ADDRESS_SPACE local #include #endif // cl_khr_local_int32_base_atomics #ifdef cl_khr_int64_base_atomics -#define FUNCTION atom_add +#define __CLC_FUNCTION atom_add #include #endif // cl_khr_int64_base_atomics diff --git a/libclc/opencl/include/clc/opencl/atomic/atom_and.h b/libclc/opencl/include/clc/opencl/atomic/atom_and.h index 5c40aa4db70a..eacbd7a33b6e 100644 --- a/libclc/opencl/include/clc/opencl/atomic/atom_and.h +++ b/libclc/opencl/include/clc/opencl/atomic/atom_and.h @@ -12,19 +12,19 @@ #include #ifdef cl_khr_global_int32_extended_atomics -#define FUNCTION atom_and +#define __CLC_FUNCTION atom_and #define __CLC_ADDRESS_SPACE global #include #endif // cl_khr_global_int32_extended_atomics #ifdef cl_khr_local_int32_extended_atomics -#define FUNCTION atom_and +#define __CLC_FUNCTION atom_and #define __CLC_ADDRESS_SPACE local #include #endif // cl_khr_local_int32_extended_atomics #ifdef cl_khr_int64_extended_atomics -#define FUNCTION atom_and +#define 
__CLC_FUNCTION atom_and #include #endif // cl_khr_int64_extended_atomics diff --git a/libclc/opencl/include/clc/opencl/atomic/atom_decl_int32.inc b/libclc/opencl/include/clc/opencl/atomic/atom_decl_int32.inc index b8631ceac3a4..866d8903db81 100644 --- a/libclc/opencl/include/clc/opencl/atomic/atom_decl_int32.inc +++ b/libclc/opencl/include/clc/opencl/atomic/atom_decl_int32.inc @@ -10,12 +10,12 @@ #include #define __CLC_DECLARE_ATOM(ADDRSPACE, TYPE) \ - _CLC_OVERLOAD _CLC_DECL TYPE FUNCTION(volatile ADDRSPACE TYPE *, TYPE); + _CLC_OVERLOAD _CLC_DECL TYPE __CLC_FUNCTION(volatile ADDRSPACE TYPE *, TYPE); __CLC_DECLARE_ATOM(__CLC_ADDRESS_SPACE, int) __CLC_DECLARE_ATOM(__CLC_ADDRESS_SPACE, uint) #undef __CLC_DECLARE_ATOM -#undef FUNCTION +#undef __CLC_FUNCTION #undef __CLC_ADDRESS_SPACE diff --git a/libclc/opencl/include/clc/opencl/atomic/atom_decl_int64.inc b/libclc/opencl/include/clc/opencl/atomic/atom_decl_int64.inc index 3918a7cae006..146de3412fc2 100644 --- a/libclc/opencl/include/clc/opencl/atomic/atom_decl_int64.inc +++ b/libclc/opencl/include/clc/opencl/atomic/atom_decl_int64.inc @@ -10,7 +10,7 @@ #include #define __CLC_DECLARE_ATOM(ADDRSPACE, TYPE) \ - _CLC_OVERLOAD _CLC_DECL TYPE FUNCTION(volatile ADDRSPACE TYPE *, TYPE); + _CLC_OVERLOAD _CLC_DECL TYPE __CLC_FUNCTION(volatile ADDRSPACE TYPE *, TYPE); __CLC_DECLARE_ATOM(local, long) __CLC_DECLARE_ATOM(local, ulong) @@ -19,4 +19,4 @@ __CLC_DECLARE_ATOM(global, ulong) #undef __CLC_DECLARE_ATOM -#undef FUNCTION +#undef __CLC_FUNCTION diff --git a/libclc/opencl/include/clc/opencl/atomic/atom_max.h b/libclc/opencl/include/clc/opencl/atomic/atom_max.h index 87c70f20058d..35a1e2aa06d4 100644 --- a/libclc/opencl/include/clc/opencl/atomic/atom_max.h +++ b/libclc/opencl/include/clc/opencl/atomic/atom_max.h @@ -12,19 +12,19 @@ #include #ifdef cl_khr_global_int32_extended_atomics -#define FUNCTION atom_max +#define __CLC_FUNCTION atom_max #define __CLC_ADDRESS_SPACE global #include #endif // 
cl_khr_global_int32_extended_atomics #ifdef cl_khr_local_int32_extended_atomics -#define FUNCTION atom_max +#define __CLC_FUNCTION atom_max #define __CLC_ADDRESS_SPACE local #include #endif // cl_khr_local_int32_extended_atomics #ifdef cl_khr_int64_extended_atomics -#define FUNCTION atom_max +#define __CLC_FUNCTION atom_max #include #endif // cl_khr_int64_extended_atomics diff --git a/libclc/opencl/include/clc/opencl/atomic/atom_min.h b/libclc/opencl/include/clc/opencl/atomic/atom_min.h index 487364c617ab..ded96a5b29a4 100644 --- a/libclc/opencl/include/clc/opencl/atomic/atom_min.h +++ b/libclc/opencl/include/clc/opencl/atomic/atom_min.h @@ -12,19 +12,19 @@ #include #ifdef cl_khr_global_int32_extended_atomics -#define FUNCTION atom_min +#define __CLC_FUNCTION atom_min #define __CLC_ADDRESS_SPACE global #include #endif // cl_khr_global_int32_extended_atomics #ifdef cl_khr_local_int32_extended_atomics -#define FUNCTION atom_min +#define __CLC_FUNCTION atom_min #define __CLC_ADDRESS_SPACE local #include #endif // cl_khr_local_int32_extended_atomics #ifdef cl_khr_int64_extended_atomics -#define FUNCTION atom_min +#define __CLC_FUNCTION atom_min #include #endif // cl_khr_int64_extended_atomics diff --git a/libclc/opencl/include/clc/opencl/atomic/atom_or.h b/libclc/opencl/include/clc/opencl/atomic/atom_or.h index 1c3ae6984eea..42a69bd3f7b4 100644 --- a/libclc/opencl/include/clc/opencl/atomic/atom_or.h +++ b/libclc/opencl/include/clc/opencl/atomic/atom_or.h @@ -12,19 +12,19 @@ #include #ifdef cl_khr_global_int32_extended_atomics -#define FUNCTION atom_or +#define __CLC_FUNCTION atom_or #define __CLC_ADDRESS_SPACE global #include #endif // cl_khr_global_int32_extended_atomics #ifdef cl_khr_local_int32_extended_atomics -#define FUNCTION atom_or +#define __CLC_FUNCTION atom_or #define __CLC_ADDRESS_SPACE local #include #endif // cl_khr_local_int32_extended_atomics #ifdef cl_khr_int64_extended_atomics -#define FUNCTION atom_or +#define __CLC_FUNCTION atom_or #include #endif 
// cl_khr_int64_extended_atomics diff --git a/libclc/opencl/include/clc/opencl/atomic/atom_sub.h b/libclc/opencl/include/clc/opencl/atomic/atom_sub.h index eb17aa8ce73f..9763ee603341 100644 --- a/libclc/opencl/include/clc/opencl/atomic/atom_sub.h +++ b/libclc/opencl/include/clc/opencl/atomic/atom_sub.h @@ -12,19 +12,19 @@ #include #ifdef cl_khr_global_int32_base_atomics -#define FUNCTION atom_sub +#define __CLC_FUNCTION atom_sub #define __CLC_ADDRESS_SPACE global #include #endif // cl_khr_global_int32_base_atomics #ifdef cl_khr_local_int32_base_atomics -#define FUNCTION atom_sub +#define __CLC_FUNCTION atom_sub #define __CLC_ADDRESS_SPACE local #include #endif // cl_khr_local_int32_base_atomics #ifdef cl_khr_int64_base_atomics -#define FUNCTION atom_sub +#define __CLC_FUNCTION atom_sub #include #endif // cl_khr_int64_base_atomics diff --git a/libclc/opencl/include/clc/opencl/atomic/atom_xchg.h b/libclc/opencl/include/clc/opencl/atomic/atom_xchg.h index df17d31d6e67..ab2c5b5156d6 100644 --- a/libclc/opencl/include/clc/opencl/atomic/atom_xchg.h +++ b/libclc/opencl/include/clc/opencl/atomic/atom_xchg.h @@ -12,19 +12,19 @@ #include #ifdef cl_khr_global_int32_base_atomics -#define FUNCTION atom_xchg +#define __CLC_FUNCTION atom_xchg #define __CLC_ADDRESS_SPACE global #include #endif // cl_khr_global_int32_base_atomics #ifdef cl_khr_local_int32_base_atomics -#define FUNCTION atom_xchg +#define __CLC_FUNCTION atom_xchg #define __CLC_ADDRESS_SPACE local #include #endif // cl_khr_local_int32_base_atomics #ifdef cl_khr_int64_base_atomics -#define FUNCTION atom_xchg +#define __CLC_FUNCTION atom_xchg #include #endif // cl_khr_int64_base_atomics diff --git a/libclc/opencl/include/clc/opencl/atomic/atom_xor.h b/libclc/opencl/include/clc/opencl/atomic/atom_xor.h index 8a9781882158..b33168db72cf 100644 --- a/libclc/opencl/include/clc/opencl/atomic/atom_xor.h +++ b/libclc/opencl/include/clc/opencl/atomic/atom_xor.h @@ -12,19 +12,19 @@ #include #ifdef 
cl_khr_global_int32_extended_atomics -#define FUNCTION atom_xor +#define __CLC_FUNCTION atom_xor #define __CLC_ADDRESS_SPACE global #include #endif // cl_khr_global_int32_extended_atomics #ifdef cl_khr_local_int32_extended_atomics -#define FUNCTION atom_xor +#define __CLC_FUNCTION atom_xor #define __CLC_ADDRESS_SPACE local #include #endif // cl_khr_local_int32_extended_atomics #ifdef cl_khr_int64_extended_atomics -#define FUNCTION atom_xor +#define __CLC_FUNCTION atom_xor #include #endif // cl_khr_int64_extended_atomics diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_add.h b/libclc/opencl/include/clc/opencl/atomic/atomic_add.h index 50fb99d1362f..edb13bd6bde4 100644 --- a/libclc/opencl/include/clc/opencl/atomic/atomic_add.h +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_add.h @@ -11,7 +11,7 @@ #include -#define FUNCTION atomic_add +#define __CLC_FUNCTION atomic_add #include #endif // __CLC_OPENCL_ATOMIC_ATOMIC_ADD_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_and.h b/libclc/opencl/include/clc/opencl/atomic/atomic_and.h index 8ce328c9739a..4875f5374c69 100644 --- a/libclc/opencl/include/clc/opencl/atomic/atomic_and.h +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_and.h @@ -11,7 +11,7 @@ #include -#define FUNCTION atomic_and +#define __CLC_FUNCTION atomic_and #include #endif // __CLC_OPENCL_ATOMIC_ATOMIC_AND_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_compare_exchange_strong.h b/libclc/opencl/include/clc/opencl/atomic/atomic_compare_exchange_strong.h index 76eeda7ba346..59bfa0e87dd8 100644 --- a/libclc/opencl/include/clc/opencl/atomic/atomic_compare_exchange_strong.h +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_compare_exchange_strong.h @@ -9,7 +9,7 @@ #ifndef __CLC_OPENCL_ATOMIC_ATOMIC_COMPARE_EXCHANGE_STRONG_H__ #define __CLC_OPENCL_ATOMIC_ATOMIC_COMPARE_EXCHANGE_STRONG_H__ -#define FUNCTION atomic_compare_exchange_strong +#define __CLC_FUNCTION atomic_compare_exchange_strong #define 
__CLC_COMPARE_EXCHANGE #define __CLC_BODY @@ -19,6 +19,6 @@ #include #undef __CLC_COMPARE_EXCHANGE -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_ATOMIC_ATOMIC_COMPARE_EXCHANGE_STRONG_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_compare_exchange_weak.h b/libclc/opencl/include/clc/opencl/atomic/atomic_compare_exchange_weak.h index 12788ad03a2d..7106c3e061d6 100644 --- a/libclc/opencl/include/clc/opencl/atomic/atomic_compare_exchange_weak.h +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_compare_exchange_weak.h @@ -9,7 +9,7 @@ #ifndef __CLC_OPENCL_ATOMIC_ATOMIC_COMPARE_EXCHANGE_WEAK_H__ #define __CLC_OPENCL_ATOMIC_ATOMIC_COMPARE_EXCHANGE_WEAK_H__ -#define FUNCTION atomic_compare_exchange_weak +#define __CLC_FUNCTION atomic_compare_exchange_weak #define __CLC_COMPARE_EXCHANGE #define __CLC_BODY @@ -19,6 +19,6 @@ #include #undef __CLC_COMPARE_EXCHANGE -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_ATOMIC_ATOMIC_COMPARE_EXCHANGE_WEAK_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_decl.inc b/libclc/opencl/include/clc/opencl/atomic/atomic_decl.inc index 1b2bf17bd6df..38d250f0693f 100644 --- a/libclc/opencl/include/clc/opencl/atomic/atomic_decl.inc +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_decl.inc @@ -10,36 +10,37 @@ #if defined(__opencl_c_fp64) && (defined(cl_khr_int64_base_atomics) && \ defined(cl_khr_int64_extended_atomics)) -#define HAVE_64_ATOMIC +#define __CLC_HAVE_64_ATOMIC #endif -#if defined(__CLC_FPSIZE) && (__CLC_FPSIZE < 64 || defined(HAVE_64_ATOMIC)) -#define HAVE_FP_ATOMIC +#if defined(__CLC_FPSIZE) && \ + (__CLC_FPSIZE < 64 || defined(__CLC_HAVE_64_ATOMIC)) +#define __CLC_HAVE_FP_ATOMIC #endif #if defined(__CLC_GENSIZE) && \ ((__CLC_GENSIZE == 32) || \ - (__CLC_GENSIZE == 64 && defined(HAVE_64_ATOMIC))) -#define HAVE_INT_ATOMIC + (__CLC_GENSIZE == 64 && defined(__CLC_HAVE_64_ATOMIC))) +#define __CLC_HAVE_INT_ATOMIC #endif -#if defined(HAVE_FP_ATOMIC) || 
defined(HAVE_INT_ATOMIC) +#if defined(__CLC_HAVE_FP_ATOMIC) || defined(__CLC_HAVE_INT_ATOMIC) #define __CLC_ATOMIC_GENTYPE __CLC_XCONCAT(atomic_, __CLC_GENTYPE) #ifdef __CLC_NO_VALUE_ARG #define __CLC_DEFINE_ATOMIC(ADDRSPACE) \ - _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE FUNCTION( \ + _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION( \ volatile ADDRSPACE __CLC_ATOMIC_GENTYPE *Ptr); #elif defined(__CLC_RETURN_VOID) #define __CLC_DEFINE_ATOMIC(ADDRSPACE) \ - _CLC_OVERLOAD _CLC_DECL void FUNCTION( \ + _CLC_OVERLOAD _CLC_DECL void __CLC_FUNCTION( \ volatile ADDRSPACE __CLC_ATOMIC_GENTYPE *Ptr, __CLC_GENTYPE Value); #elif defined(__CLC_COMPARE_EXCHANGE) #define __CLC_DEFINE_ATOMIC(ADDRSPACE) \ - _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE FUNCTION( \ + _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION( \ volatile ADDRSPACE __CLC_ATOMIC_GENTYPE *Ptr, \ ADDRSPACE __CLC_GENTYPE *Expected, __CLC_GENTYPE Desired); #else #define __CLC_DEFINE_ATOMIC(ADDRSPACE) \ - _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE FUNCTION( \ + _CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE __CLC_FUNCTION( \ volatile ADDRSPACE __CLC_ATOMIC_GENTYPE *Ptr, __CLC_GENTYPE Value); #endif @@ -51,10 +52,10 @@ __CLC_DEFINE_ATOMIC() #undef __CLC_DEFINE_ATOMIC -#endif // HAVE_FP_ATOMIC || HAVE_INT_ATOMIC +#endif // __CLC_HAVE_FP_ATOMIC || __CLC_HAVE_INT_ATOMIC -#undef HAVE_INT_ATOMIC -#undef HAVE_FP_ATOMIC -#undef HAVE_64_ATOMIC +#undef __CLC_HAVE_INT_ATOMIC +#undef __CLC_HAVE_FP_ATOMIC +#undef __CLC_HAVE_64_ATOMIC #endif // __CLC_SCALAR diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_decl_legacy.inc b/libclc/opencl/include/clc/opencl/atomic/atomic_decl_legacy.inc index e060e3aaea16..0cfd4c3eab5f 100644 --- a/libclc/opencl/include/clc/opencl/atomic/atomic_decl_legacy.inc +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_decl_legacy.inc @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// #define __CLC_DECLARE_ATOMIC(ADDRSPACE, TYPE) \ - _CLC_OVERLOAD _CLC_DECL TYPE 
FUNCTION(volatile ADDRSPACE TYPE *, TYPE); + _CLC_OVERLOAD _CLC_DECL TYPE __CLC_FUNCTION(volatile ADDRSPACE TYPE *, TYPE); #define __CLC_DECLARE_ATOMIC_ADDRSPACE(TYPE) \ __CLC_DECLARE_ATOMIC(global, TYPE) \ @@ -19,4 +19,4 @@ __CLC_DECLARE_ATOMIC_ADDRSPACE(uint) #undef __CLC_DECLARE_ATOMIC_ADDRSPACE #undef __CLC_DECLARE_ATOMIC -#undef FUNCTION +#undef __CLC_FUNCTION diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_exchange.h b/libclc/opencl/include/clc/opencl/atomic/atomic_exchange.h index 3949bc13401f..9d949825b58c 100644 --- a/libclc/opencl/include/clc/opencl/atomic/atomic_exchange.h +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_exchange.h @@ -9,7 +9,7 @@ #ifndef __CLC_OPENCL_ATOMIC_ATOMIC_EXCHANGE_H__ #define __CLC_OPENCL_ATOMIC_ATOMIC_EXCHANGE_H__ -#define FUNCTION atomic_exchange +#define __CLC_FUNCTION atomic_exchange #define __CLC_BODY #include @@ -17,6 +17,6 @@ #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_ATOMIC_ATOMIC_EXCHANGE_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_add.h b/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_add.h index 972c1fa69fe7..bae5a7a7e19b 100644 --- a/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_add.h +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_add.h @@ -9,7 +9,7 @@ #ifndef __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_ADD_H__ #define __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_ADD_H__ -#define FUNCTION atomic_fetch_add +#define __CLC_FUNCTION atomic_fetch_add #define __CLC_BODY #include @@ -17,6 +17,6 @@ #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_ADD_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_and.h b/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_and.h index fdac049a74d3..9f9d2225f910 100644 --- a/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_and.h +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_and.h @@ -9,11 +9,11 @@ 
#ifndef __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_AND_H__ #define __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_AND_H__ -#define FUNCTION atomic_fetch_and +#define __CLC_FUNCTION atomic_fetch_and #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_AND_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_max.h b/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_max.h index 513b60fec272..bef102dc82f4 100644 --- a/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_max.h +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_max.h @@ -9,7 +9,7 @@ #ifndef __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_MAX_H__ #define __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_MAX_H__ -#define FUNCTION atomic_fetch_max +#define __CLC_FUNCTION atomic_fetch_max #define __CLC_BODY #include @@ -17,6 +17,6 @@ #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_MAX_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_min.h b/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_min.h index c961c4a64165..d7e346dc4436 100644 --- a/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_min.h +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_min.h @@ -9,7 +9,7 @@ #ifndef __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_MIN_H__ #define __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_MIN_H__ -#define FUNCTION atomic_fetch_min +#define __CLC_FUNCTION atomic_fetch_min #define __CLC_BODY #include @@ -17,6 +17,6 @@ #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_MIN_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_or.h b/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_or.h index 25923e3647e3..aa00982e15a5 100644 --- a/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_or.h +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_or.h @@ -9,11 +9,11 @@ #ifndef __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_OR_H__ #define 
__CLC_OPENCL_ATOMIC_ATOMIC_FETCH_OR_H__ -#define FUNCTION atomic_fetch_or +#define __CLC_FUNCTION atomic_fetch_or #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_OR_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_sub.h b/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_sub.h index b307c30a298b..3d04ed7ba34f 100644 --- a/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_sub.h +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_sub.h @@ -9,7 +9,7 @@ #ifndef __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_SUB_H__ #define __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_SUB_H__ -#define FUNCTION atomic_fetch_sub +#define __CLC_FUNCTION atomic_fetch_sub #define __CLC_BODY #include @@ -17,6 +17,6 @@ #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_SUB_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_xor.h b/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_xor.h index 52510d018574..2cdff0806902 100644 --- a/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_xor.h +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_fetch_xor.h @@ -9,11 +9,11 @@ #ifndef __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_XOR_H__ #define __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_XOR_H__ -#define FUNCTION atomic_fetch_xor +#define __CLC_FUNCTION atomic_fetch_xor #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_ATOMIC_ATOMIC_FETCH_XOR_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_load.h b/libclc/opencl/include/clc/opencl/atomic/atomic_load.h index 3998a4de9452..7db259b136ec 100644 --- a/libclc/opencl/include/clc/opencl/atomic/atomic_load.h +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_load.h @@ -9,7 +9,7 @@ #ifndef __CLC_OPENCL_ATOMIC_ATOMIC_LOAD_H__ #define __CLC_OPENCL_ATOMIC_ATOMIC_LOAD_H__ -#define FUNCTION atomic_load +#define __CLC_FUNCTION atomic_load #define __CLC_NO_VALUE_ARG #define 
__CLC_BODY @@ -19,6 +19,6 @@ #include #undef __CLC_NO_VALUE_ARG -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_ATOMIC_ATOMIC_LOAD_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_max.h b/libclc/opencl/include/clc/opencl/atomic/atomic_max.h index 6b95ad7e68d9..e16f4ba7122d 100644 --- a/libclc/opencl/include/clc/opencl/atomic/atomic_max.h +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_max.h @@ -11,7 +11,7 @@ #include -#define FUNCTION atomic_max +#define __CLC_FUNCTION atomic_max #include #endif // __CLC_OPENCL_ATOMIC_ATOMIC_MAX_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_min.h b/libclc/opencl/include/clc/opencl/atomic/atomic_min.h index c1dfacb40b74..d422388614c2 100644 --- a/libclc/opencl/include/clc/opencl/atomic/atomic_min.h +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_min.h @@ -11,7 +11,7 @@ #include -#define FUNCTION atomic_min +#define __CLC_FUNCTION atomic_min #include #endif // __CLC_OPENCL_ATOMIC_ATOMIC_MIN_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_or.h b/libclc/opencl/include/clc/opencl/atomic/atomic_or.h index 30c32fe4889d..53c9b1e3e89b 100644 --- a/libclc/opencl/include/clc/opencl/atomic/atomic_or.h +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_or.h @@ -11,7 +11,7 @@ #include -#define FUNCTION atomic_or +#define __CLC_FUNCTION atomic_or #include #endif // __CLC_OPENCL_ATOMIC_ATOMIC_OR_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_store.h b/libclc/opencl/include/clc/opencl/atomic/atomic_store.h index 4893a5b88df0..b3cdfc6ffaea 100644 --- a/libclc/opencl/include/clc/opencl/atomic/atomic_store.h +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_store.h @@ -9,7 +9,7 @@ #ifndef __CLC_OPENCL_ATOMIC_ATOMIC_STORE_H__ #define __CLC_OPENCL_ATOMIC_ATOMIC_STORE_H__ -#define FUNCTION atomic_store +#define __CLC_FUNCTION atomic_store #define __CLC_RETURN_VOID #define __CLC_BODY @@ -19,6 +19,6 @@ #include #undef __CLC_RETURN_VOID -#undef FUNCTION +#undef 
__CLC_FUNCTION #endif // __CLC_OPENCL_ATOMIC_ATOMIC_STORE_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_sub.h b/libclc/opencl/include/clc/opencl/atomic/atomic_sub.h index 1e7ac5505b07..1b77bd7ced46 100644 --- a/libclc/opencl/include/clc/opencl/atomic/atomic_sub.h +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_sub.h @@ -11,7 +11,7 @@ #include -#define FUNCTION atomic_sub +#define __CLC_FUNCTION atomic_sub #include #endif // __CLC_OPENCL_ATOMIC_ATOMIC_SUB_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_xchg.h b/libclc/opencl/include/clc/opencl/atomic/atomic_xchg.h index 043d7825483e..47d4a4638db3 100644 --- a/libclc/opencl/include/clc/opencl/atomic/atomic_xchg.h +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_xchg.h @@ -11,10 +11,10 @@ #include -#define FUNCTION atomic_xchg +#define __CLC_FUNCTION atomic_xchg -_CLC_OVERLOAD _CLC_DECL float FUNCTION(volatile local float *, float); -_CLC_OVERLOAD _CLC_DECL float FUNCTION(volatile global float *, float); +_CLC_OVERLOAD _CLC_DECL float __CLC_FUNCTION(volatile local float *, float); +_CLC_OVERLOAD _CLC_DECL float __CLC_FUNCTION(volatile global float *, float); #include #endif // __CLC_OPENCL_ATOMIC_ATOMIC_XCHG_H__ diff --git a/libclc/opencl/include/clc/opencl/atomic/atomic_xor.h b/libclc/opencl/include/clc/opencl/atomic/atomic_xor.h index a9bee007b934..9c808d802594 100644 --- a/libclc/opencl/include/clc/opencl/atomic/atomic_xor.h +++ b/libclc/opencl/include/clc/opencl/atomic/atomic_xor.h @@ -11,7 +11,7 @@ #include -#define FUNCTION atomic_xor +#define __CLC_FUNCTION atomic_xor #include #endif // __CLC_OPENCL_ATOMIC_ATOMIC_XOR_H__ diff --git a/libclc/opencl/include/clc/opencl/common/degrees.h b/libclc/opencl/include/clc/opencl/common/degrees.h index 5bd6f4e8f7fd..8a3e77e559cf 100644 --- a/libclc/opencl/include/clc/opencl/common/degrees.h +++ b/libclc/opencl/include/clc/opencl/common/degrees.h @@ -9,11 +9,11 @@ #ifndef __CLC_OPENCL_COMMON_DEGREES_H__ #define 
__CLC_OPENCL_COMMON_DEGREES_H__ -#define FUNCTION degrees +#define __CLC_FUNCTION degrees #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_COMMON_DEGREES_H__ diff --git a/libclc/opencl/include/clc/opencl/common/radians.h b/libclc/opencl/include/clc/opencl/common/radians.h index 3761a7e875be..accef6eb6260 100644 --- a/libclc/opencl/include/clc/opencl/common/radians.h +++ b/libclc/opencl/include/clc/opencl/common/radians.h @@ -9,11 +9,11 @@ #ifndef __CLC_OPENCL_COMMON_RADIANS_H__ #define __CLC_OPENCL_COMMON_RADIANS_H__ -#define FUNCTION radians +#define __CLC_FUNCTION radians #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_COMMON_RADIANS_H__ diff --git a/libclc/opencl/include/clc/opencl/common/sign.h b/libclc/opencl/include/clc/opencl/common/sign.h index 619e68ff1e14..58669b4e1fc9 100644 --- a/libclc/opencl/include/clc/opencl/common/sign.h +++ b/libclc/opencl/include/clc/opencl/common/sign.h @@ -9,9 +9,9 @@ #ifndef __CLC_OPENCL_COMMON_SIGN_H__ #define __CLC_OPENCL_COMMON_SIGN_H__ -#define FUNCTION sign +#define __CLC_FUNCTION sign #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_COMMON_SIGN_H__ diff --git a/libclc/opencl/include/clc/opencl/geometric/distance.h b/libclc/opencl/include/clc/opencl/geometric/distance.h index f9c219c44a49..dc5abf11c10c 100644 --- a/libclc/opencl/include/clc/opencl/geometric/distance.h +++ b/libclc/opencl/include/clc/opencl/geometric/distance.h @@ -9,11 +9,11 @@ #ifndef __CLC_OPENCL_GEOMETRIC_DISTANCE_H__ #define __CLC_OPENCL_GEOMETRIC_DISTANCE_H__ -#define FUNCTION distance +#define __CLC_FUNCTION distance #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_GEOMETRIC_DISTANCE_H__ diff --git a/libclc/opencl/include/clc/opencl/geometric/dot.h b/libclc/opencl/include/clc/opencl/geometric/dot.h index 0b391a10aa07..8cd702961fa4 100644 --- 
a/libclc/opencl/include/clc/opencl/geometric/dot.h +++ b/libclc/opencl/include/clc/opencl/geometric/dot.h @@ -9,11 +9,11 @@ #ifndef __CLC_OPENCL_GEOMETRIC_DOT_H__ #define __CLC_OPENCL_GEOMETRIC_DOT_H__ -#define FUNCTION dot +#define __CLC_FUNCTION dot #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_GEOMETRIC_DOT_H__ diff --git a/libclc/opencl/include/clc/opencl/geometric/fast_distance.h b/libclc/opencl/include/clc/opencl/geometric/fast_distance.h index 95b4966dfac1..3eb7d70cdddd 100644 --- a/libclc/opencl/include/clc/opencl/geometric/fast_distance.h +++ b/libclc/opencl/include/clc/opencl/geometric/fast_distance.h @@ -9,12 +9,12 @@ #ifndef __CLC_OPENCL_GEOMETRIC_FAST_DISTANCE_H__ #define __CLC_OPENCL_GEOMETRIC_FAST_DISTANCE_H__ -#define __FLOAT_ONLY -#define FUNCTION fast_distance +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION fast_distance #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_GEOMETRIC_FAST_DISTANCE_H__ diff --git a/libclc/opencl/include/clc/opencl/geometric/fast_length.h b/libclc/opencl/include/clc/opencl/geometric/fast_length.h index 78ad1fea981c..ff383de9ec72 100644 --- a/libclc/opencl/include/clc/opencl/geometric/fast_length.h +++ b/libclc/opencl/include/clc/opencl/geometric/fast_length.h @@ -9,12 +9,12 @@ #ifndef __CLC_OPENCL_GEOMETRIC_FAST_LENGTH_H__ #define __CLC_OPENCL_GEOMETRIC_FAST_LENGTH_H__ -#define __FLOAT_ONLY -#define FUNCTION fast_length +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION fast_length #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_GEOMETRIC_FAST_LENGTH_H__ diff --git a/libclc/opencl/include/clc/opencl/geometric/fast_normalize.h b/libclc/opencl/include/clc/opencl/geometric/fast_normalize.h index 7a70dd5df519..fb677d7d3f79 100644 --- a/libclc/opencl/include/clc/opencl/geometric/fast_normalize.h +++ b/libclc/opencl/include/clc/opencl/geometric/fast_normalize.h @@ -9,14 +9,14 @@ #ifndef 
__CLC_OPENCL_GEOMETRIC_FAST_NORMALIZE_H__ #define __CLC_OPENCL_GEOMETRIC_FAST_NORMALIZE_H__ -#define __FLOAT_ONLY -#define FUNCTION fast_normalize +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION fast_normalize #define __CLC_GEOMETRIC_RET_GENTYPE #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #undef __CLC_GEOMETRIC_RET_GENTYPE #endif // __CLC_OPENCL_GEOMETRIC_FAST_NORMALIZE_H__ diff --git a/libclc/opencl/include/clc/opencl/geometric/length.h b/libclc/opencl/include/clc/opencl/geometric/length.h index 5dd6ff5a9cad..686d554d6042 100644 --- a/libclc/opencl/include/clc/opencl/geometric/length.h +++ b/libclc/opencl/include/clc/opencl/geometric/length.h @@ -9,11 +9,11 @@ #ifndef __CLC_OPENCL_GEOMETRIC_LENGTH_H__ #define __CLC_OPENCL_GEOMETRIC_LENGTH_H__ -#define FUNCTION length +#define __CLC_FUNCTION length #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_GEOMETRIC_LENGTH_H__ diff --git a/libclc/opencl/include/clc/opencl/geometric/normalize.h b/libclc/opencl/include/clc/opencl/geometric/normalize.h index 7d3e8fa98f8e..3ed6dae53269 100644 --- a/libclc/opencl/include/clc/opencl/geometric/normalize.h +++ b/libclc/opencl/include/clc/opencl/geometric/normalize.h @@ -9,13 +9,13 @@ #ifndef __CLC_OPENCL_GEOMETRIC_NORMALIZE_H__ #define __CLC_OPENCL_GEOMETRIC_NORMALIZE_H__ -#define FUNCTION normalize +#define __CLC_FUNCTION normalize #define __CLC_GEOMETRIC_RET_GENTYPE #define __CLC_BODY #include #undef __CLC_GEOMETRIC_RET_GENTYPE -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_GEOMETRIC_NORMALIZE_H__ diff --git a/libclc/opencl/include/clc/opencl/integer/add_sat.h b/libclc/opencl/include/clc/opencl/integer/add_sat.h index cd2c17a09629..e77eee0c0b05 100644 --- a/libclc/opencl/include/clc/opencl/integer/add_sat.h +++ b/libclc/opencl/include/clc/opencl/integer/add_sat.h @@ -11,11 +11,11 @@ #include -#define FUNCTION add_sat +#define __CLC_FUNCTION add_sat #define __CLC_BODY #include -#undef FUNCTION 
+#undef __CLC_FUNCTION #endif // __CLC_OPENCL_INTEGER_ADD_SAT_H__ diff --git a/libclc/opencl/include/clc/opencl/integer/bit_reverse.h b/libclc/opencl/include/clc/opencl/integer/bit_reverse.h index 46b589557631..9c3e45d08519 100644 --- a/libclc/opencl/include/clc/opencl/integer/bit_reverse.h +++ b/libclc/opencl/include/clc/opencl/integer/bit_reverse.h @@ -13,12 +13,12 @@ #include -#define FUNCTION bit_reverse +#define __CLC_FUNCTION bit_reverse #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // cl_khr_extended_bit_ops diff --git a/libclc/opencl/include/clc/opencl/integer/bitfield_extract_signed.h b/libclc/opencl/include/clc/opencl/integer/bitfield_extract_signed.h index 0a902b2a21d6..c6e48d9820db 100644 --- a/libclc/opencl/include/clc/opencl/integer/bitfield_extract_signed.h +++ b/libclc/opencl/include/clc/opencl/integer/bitfield_extract_signed.h @@ -13,14 +13,14 @@ #include -#define FUNCTION bitfield_extract_signed -#define __RETTYPE __CLC_S_GENTYPE +#define __CLC_FUNCTION bitfield_extract_signed +#define __CLC_RETTYPE __CLC_S_GENTYPE #define __CLC_BODY #include -#undef __RETTYPE -#undef FUNCTION +#undef __CLC_RETTYPE +#undef __CLC_FUNCTION #endif // cl_khr_extended_bit_ops diff --git a/libclc/opencl/include/clc/opencl/integer/bitfield_extract_unsigned.h b/libclc/opencl/include/clc/opencl/integer/bitfield_extract_unsigned.h index 28064c08b113..63dd4368f97a 100644 --- a/libclc/opencl/include/clc/opencl/integer/bitfield_extract_unsigned.h +++ b/libclc/opencl/include/clc/opencl/integer/bitfield_extract_unsigned.h @@ -13,14 +13,14 @@ #include -#define FUNCTION bitfield_extract_unsigned -#define __RETTYPE __CLC_U_GENTYPE +#define __CLC_FUNCTION bitfield_extract_unsigned +#define __CLC_RETTYPE __CLC_U_GENTYPE #define __CLC_BODY #include -#undef __RETTYPE -#undef FUNCTION +#undef __CLC_RETTYPE +#undef __CLC_FUNCTION #endif // cl_khr_extended_bit_ops diff --git a/libclc/opencl/include/clc/opencl/integer/bitfield_insert.h 
b/libclc/opencl/include/clc/opencl/integer/bitfield_insert.h index e77d7a4f0b95..40c7d5982585 100644 --- a/libclc/opencl/include/clc/opencl/integer/bitfield_insert.h +++ b/libclc/opencl/include/clc/opencl/integer/bitfield_insert.h @@ -16,7 +16,7 @@ #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // cl_khr_extended_bit_ops diff --git a/libclc/opencl/include/clc/opencl/integer/clz.h b/libclc/opencl/include/clc/opencl/integer/clz.h index 77032ceeaf0f..1b47e975b52a 100644 --- a/libclc/opencl/include/clc/opencl/integer/clz.h +++ b/libclc/opencl/include/clc/opencl/integer/clz.h @@ -11,11 +11,11 @@ #include -#define FUNCTION clz +#define __CLC_FUNCTION clz #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_INTEGER_CLZ_H__ diff --git a/libclc/opencl/include/clc/opencl/integer/ctz.h b/libclc/opencl/include/clc/opencl/integer/ctz.h index b861225846ee..ba0116140730 100644 --- a/libclc/opencl/include/clc/opencl/integer/ctz.h +++ b/libclc/opencl/include/clc/opencl/integer/ctz.h @@ -13,12 +13,12 @@ #include -#define FUNCTION ctz +#define __CLC_FUNCTION ctz #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __OPENCL_C_VERSION__ >= CL_VERSION_2_0 diff --git a/libclc/opencl/include/clc/opencl/integer/hadd.h b/libclc/opencl/include/clc/opencl/integer/hadd.h index c4c8f7daf401..f200c7019435 100644 --- a/libclc/opencl/include/clc/opencl/integer/hadd.h +++ b/libclc/opencl/include/clc/opencl/integer/hadd.h @@ -11,11 +11,11 @@ #include -#define FUNCTION hadd +#define __CLC_FUNCTION hadd #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_INTEGER_HADD_H__ diff --git a/libclc/opencl/include/clc/opencl/integer/mad24.h b/libclc/opencl/include/clc/opencl/integer/mad24.h index daee9a038347..5bf3d385a432 100644 --- a/libclc/opencl/include/clc/opencl/integer/mad24.h +++ b/libclc/opencl/include/clc/opencl/integer/mad24.h @@ -11,11 +11,11 @@ #include -#define 
FUNCTION mad24 +#define __CLC_FUNCTION mad24 #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_INTEGER_MAD24_H__ diff --git a/libclc/opencl/include/clc/opencl/integer/mad_hi.h b/libclc/opencl/include/clc/opencl/integer/mad_hi.h index bbcdcd9c82eb..98b525ba8268 100644 --- a/libclc/opencl/include/clc/opencl/integer/mad_hi.h +++ b/libclc/opencl/include/clc/opencl/integer/mad_hi.h @@ -11,11 +11,11 @@ #include -#define FUNCTION mad_hi +#define __CLC_FUNCTION mad_hi #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_INTEGER_MAD_HI_H__ diff --git a/libclc/opencl/include/clc/opencl/integer/mad_sat.h b/libclc/opencl/include/clc/opencl/integer/mad_sat.h index 28045477975e..9e2afe4f1d9f 100644 --- a/libclc/opencl/include/clc/opencl/integer/mad_sat.h +++ b/libclc/opencl/include/clc/opencl/integer/mad_sat.h @@ -11,11 +11,11 @@ #include -#define FUNCTION mad_sat +#define __CLC_FUNCTION mad_sat #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_INTEGER_MAD_SAT_H__ diff --git a/libclc/opencl/include/clc/opencl/integer/mul24.h b/libclc/opencl/include/clc/opencl/integer/mul24.h index 6aa799a28b7e..434da55a7173 100644 --- a/libclc/opencl/include/clc/opencl/integer/mul24.h +++ b/libclc/opencl/include/clc/opencl/integer/mul24.h @@ -11,11 +11,11 @@ #include -#define FUNCTION mul24 +#define __CLC_FUNCTION mul24 #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_INTEGER_MUL24_H__ diff --git a/libclc/opencl/include/clc/opencl/integer/mul_hi.h b/libclc/opencl/include/clc/opencl/integer/mul_hi.h index ed7ac7319cab..7a9317f94cd2 100644 --- a/libclc/opencl/include/clc/opencl/integer/mul_hi.h +++ b/libclc/opencl/include/clc/opencl/integer/mul_hi.h @@ -11,11 +11,11 @@ #include -#define FUNCTION mul_hi +#define __CLC_FUNCTION mul_hi #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // 
__CLC_OPENCL_INTEGER_MUL_HI_H__ diff --git a/libclc/opencl/include/clc/opencl/integer/popcount.h b/libclc/opencl/include/clc/opencl/integer/popcount.h index 43d5bc0f0b33..112640c7ce63 100644 --- a/libclc/opencl/include/clc/opencl/integer/popcount.h +++ b/libclc/opencl/include/clc/opencl/integer/popcount.h @@ -11,11 +11,11 @@ #include -#define FUNCTION popcount +#define __CLC_FUNCTION popcount #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_INTEGER_POPCOUNT_H__ diff --git a/libclc/opencl/include/clc/opencl/integer/rhadd.h b/libclc/opencl/include/clc/opencl/integer/rhadd.h index 2c93c100cf02..5ed8b27df8fc 100644 --- a/libclc/opencl/include/clc/opencl/integer/rhadd.h +++ b/libclc/opencl/include/clc/opencl/integer/rhadd.h @@ -9,11 +9,11 @@ #ifndef __CLC_OPENCL_INTEGER_RHADD_H__ #define __CLC_OPENCL_INTEGER_RHADD_H__ -#define FUNCTION rhadd +#define __CLC_FUNCTION rhadd #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_INTEGER_RHADD_H__ diff --git a/libclc/opencl/include/clc/opencl/integer/rotate.h b/libclc/opencl/include/clc/opencl/integer/rotate.h index 156155ae0638..cc92a1254fed 100644 --- a/libclc/opencl/include/clc/opencl/integer/rotate.h +++ b/libclc/opencl/include/clc/opencl/integer/rotate.h @@ -11,11 +11,11 @@ #include -#define FUNCTION rotate +#define __CLC_FUNCTION rotate #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_INTEGER_ROTATE_H__ diff --git a/libclc/opencl/include/clc/opencl/integer/sub_sat.h b/libclc/opencl/include/clc/opencl/integer/sub_sat.h index a7861ee1ac4a..1ff26d94778c 100644 --- a/libclc/opencl/include/clc/opencl/integer/sub_sat.h +++ b/libclc/opencl/include/clc/opencl/integer/sub_sat.h @@ -9,11 +9,11 @@ #ifndef __CLC_OPENCL_INTEGER_SUB_SAT_H__ #define __CLC_OPENCL_INTEGER_SUB_SAT_H__ -#define FUNCTION sub_sat +#define __CLC_FUNCTION sub_sat #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // 
__CLC_OPENCL_INTEGER_SUB_SAT_H__ diff --git a/libclc/opencl/include/clc/opencl/math/acos.h b/libclc/opencl/include/clc/opencl/math/acos.h index f9e9b44bb904..aa0d7af13ee9 100644 --- a/libclc/opencl/include/clc/opencl/math/acos.h +++ b/libclc/opencl/include/clc/opencl/math/acos.h @@ -10,10 +10,10 @@ #define __CLC_OPENCL_MATH_ACOS_H__ #define __CLC_BODY -#define FUNCTION acos +#define __CLC_FUNCTION acos #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_ACOS_H__ diff --git a/libclc/opencl/include/clc/opencl/math/acosh.h b/libclc/opencl/include/clc/opencl/math/acosh.h index 241b305e6bc8..333ab6986d8c 100644 --- a/libclc/opencl/include/clc/opencl/math/acosh.h +++ b/libclc/opencl/include/clc/opencl/math/acosh.h @@ -10,10 +10,10 @@ #define __CLC_OPENCL_MATH_ACOSH_H__ #define __CLC_BODY -#define FUNCTION acosh +#define __CLC_FUNCTION acosh #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_ACOSH_H__ diff --git a/libclc/opencl/include/clc/opencl/math/acospi.h b/libclc/opencl/include/clc/opencl/math/acospi.h index 7014889dfb7e..fd838acd5e52 100644 --- a/libclc/opencl/include/clc/opencl/math/acospi.h +++ b/libclc/opencl/include/clc/opencl/math/acospi.h @@ -10,10 +10,10 @@ #define __CLC_OPENCL_MATH_ACOSPI_H__ #define __CLC_BODY -#define FUNCTION acospi +#define __CLC_FUNCTION acospi #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_ACOSPI_H__ diff --git a/libclc/opencl/include/clc/opencl/math/asin.h b/libclc/opencl/include/clc/opencl/math/asin.h index 1ba1f459a902..de3b8ad219d1 100644 --- a/libclc/opencl/include/clc/opencl/math/asin.h +++ b/libclc/opencl/include/clc/opencl/math/asin.h @@ -10,10 +10,10 @@ #define __CLC_OPENCL_MATH_ASIN_H__ #define __CLC_BODY -#define FUNCTION asin +#define __CLC_FUNCTION asin #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_ASIN_H__ diff --git a/libclc/opencl/include/clc/opencl/math/asinh.h b/libclc/opencl/include/clc/opencl/math/asinh.h 
index 9a530a51fa1e..a550ec8dee4c 100644 --- a/libclc/opencl/include/clc/opencl/math/asinh.h +++ b/libclc/opencl/include/clc/opencl/math/asinh.h @@ -10,10 +10,10 @@ #define __CLC_OPENCL_MATH_ASINH_H__ #define __CLC_BODY -#define FUNCTION asinh +#define __CLC_FUNCTION asinh #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_ASINH_H__ diff --git a/libclc/opencl/include/clc/opencl/math/asinpi.h b/libclc/opencl/include/clc/opencl/math/asinpi.h index 844848fff043..cb6133ddc3f5 100644 --- a/libclc/opencl/include/clc/opencl/math/asinpi.h +++ b/libclc/opencl/include/clc/opencl/math/asinpi.h @@ -10,10 +10,10 @@ #define __CLC_OPENCL_MATH_ASINPI_H__ #define __CLC_BODY -#define FUNCTION asinpi +#define __CLC_FUNCTION asinpi #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_ASINPI_H__ diff --git a/libclc/opencl/include/clc/opencl/math/atan.h b/libclc/opencl/include/clc/opencl/math/atan.h index f2312b8fb073..165bbc3dfb18 100644 --- a/libclc/opencl/include/clc/opencl/math/atan.h +++ b/libclc/opencl/include/clc/opencl/math/atan.h @@ -10,10 +10,10 @@ #define __CLC_OPENCL_MATH_ATAN_H__ #define __CLC_BODY -#define FUNCTION atan +#define __CLC_FUNCTION atan #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_ATAN_H__ diff --git a/libclc/opencl/include/clc/opencl/math/atan2.h b/libclc/opencl/include/clc/opencl/math/atan2.h index 72412d44aafb..287487670709 100644 --- a/libclc/opencl/include/clc/opencl/math/atan2.h +++ b/libclc/opencl/include/clc/opencl/math/atan2.h @@ -9,11 +9,11 @@ #ifndef __CLC_OPENCL_MATH_ATAN2_H__ #define __CLC_OPENCL_MATH_ATAN2_H__ -#define FUNCTION atan2 +#define __CLC_FUNCTION atan2 #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_ATAN2_H__ diff --git a/libclc/opencl/include/clc/opencl/math/atan2pi.h b/libclc/opencl/include/clc/opencl/math/atan2pi.h index f12a19776ef2..7c09a94fffa8 100644 --- a/libclc/opencl/include/clc/opencl/math/atan2pi.h 
+++ b/libclc/opencl/include/clc/opencl/math/atan2pi.h @@ -9,11 +9,11 @@ #ifndef __CLC_OPENCL_MATH_ATAN2PI_H__ #define __CLC_OPENCL_MATH_ATAN2PI_H__ -#define FUNCTION atan2pi +#define __CLC_FUNCTION atan2pi #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_ATAN2PI_H__ diff --git a/libclc/opencl/include/clc/opencl/math/atanh.h b/libclc/opencl/include/clc/opencl/math/atanh.h index 147a76b543e6..ccfe41fdb2cf 100644 --- a/libclc/opencl/include/clc/opencl/math/atanh.h +++ b/libclc/opencl/include/clc/opencl/math/atanh.h @@ -10,10 +10,10 @@ #define __CLC_OPENCL_MATH_ATANH_H__ #define __CLC_BODY -#define FUNCTION atanh +#define __CLC_FUNCTION atanh #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_ATANH_H__ diff --git a/libclc/opencl/include/clc/opencl/math/atanpi.h b/libclc/opencl/include/clc/opencl/math/atanpi.h index 12e9ea250170..8041cc3e813e 100644 --- a/libclc/opencl/include/clc/opencl/math/atanpi.h +++ b/libclc/opencl/include/clc/opencl/math/atanpi.h @@ -10,10 +10,10 @@ #define __CLC_OPENCL_MATH_ATANPI_H__ #define __CLC_BODY -#define FUNCTION atanpi +#define __CLC_FUNCTION atanpi #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_ATANPI_H__ diff --git a/libclc/opencl/include/clc/opencl/math/cbrt.h b/libclc/opencl/include/clc/opencl/math/cbrt.h index fdb408ac9685..3beb5b4b596a 100644 --- a/libclc/opencl/include/clc/opencl/math/cbrt.h +++ b/libclc/opencl/include/clc/opencl/math/cbrt.h @@ -10,10 +10,10 @@ #define __CLC_OPENCL_MATH_CBRT_H__ #define __CLC_BODY -#define FUNCTION cbrt +#define __CLC_FUNCTION cbrt #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_CBRT_H__ diff --git a/libclc/opencl/include/clc/opencl/math/ceil.h b/libclc/opencl/include/clc/opencl/math/ceil.h index dad2ea4f6980..ab32419225ce 100644 --- a/libclc/opencl/include/clc/opencl/math/ceil.h +++ b/libclc/opencl/include/clc/opencl/math/ceil.h @@ -10,10 +10,10 @@ #define 
__CLC_OPENCL_MATH_CEIL_H__ #define __CLC_BODY -#define FUNCTION ceil +#define __CLC_FUNCTION ceil #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_CEIL_H__ diff --git a/libclc/opencl/include/clc/opencl/math/copysign.h b/libclc/opencl/include/clc/opencl/math/copysign.h index 10d6c90edf81..626ce2e01cc3 100644 --- a/libclc/opencl/include/clc/opencl/math/copysign.h +++ b/libclc/opencl/include/clc/opencl/math/copysign.h @@ -9,11 +9,11 @@ #ifndef __CLC_OPENCL_MATH_COPYSIGN_H__ #define __CLC_OPENCL_MATH_COPYSIGN_H__ -#define FUNCTION copysign +#define __CLC_FUNCTION copysign #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_COPYSIGN_H__ diff --git a/libclc/opencl/include/clc/opencl/math/cos.h b/libclc/opencl/include/clc/opencl/math/cos.h index 4e3aec6c6962..8d26f0f78d2d 100644 --- a/libclc/opencl/include/clc/opencl/math/cos.h +++ b/libclc/opencl/include/clc/opencl/math/cos.h @@ -10,10 +10,10 @@ #define __CLC_OPENCL_MATH_COS_H__ #define __CLC_BODY -#define FUNCTION cos +#define __CLC_FUNCTION cos #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_COS_H__ diff --git a/libclc/opencl/include/clc/opencl/math/cosh.h b/libclc/opencl/include/clc/opencl/math/cosh.h index f316433a2351..2afe77fae6c7 100644 --- a/libclc/opencl/include/clc/opencl/math/cosh.h +++ b/libclc/opencl/include/clc/opencl/math/cosh.h @@ -10,10 +10,10 @@ #define __CLC_OPENCL_MATH_COSH_H__ #define __CLC_BODY -#define FUNCTION cosh +#define __CLC_FUNCTION cosh #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_COSH_H__ diff --git a/libclc/opencl/include/clc/opencl/math/cospi.h b/libclc/opencl/include/clc/opencl/math/cospi.h index b7dfb0dbafb5..8e848ac0f883 100644 --- a/libclc/opencl/include/clc/opencl/math/cospi.h +++ b/libclc/opencl/include/clc/opencl/math/cospi.h @@ -10,10 +10,10 @@ #define __CLC_OPENCL_MATH_COSPI_H__ #define __CLC_BODY -#define FUNCTION cospi +#define __CLC_FUNCTION 
cospi #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_COSPI_H__ diff --git a/libclc/opencl/include/clc/opencl/math/erf.h b/libclc/opencl/include/clc/opencl/math/erf.h index 8ce262f276e3..71c211e693bd 100644 --- a/libclc/opencl/include/clc/opencl/math/erf.h +++ b/libclc/opencl/include/clc/opencl/math/erf.h @@ -12,10 +12,10 @@ #undef erfc #define __CLC_BODY -#define FUNCTION erf +#define __CLC_FUNCTION erf #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_ERF_H__ diff --git a/libclc/opencl/include/clc/opencl/math/erfc.h b/libclc/opencl/include/clc/opencl/math/erfc.h index 26afe1315df6..393910882045 100644 --- a/libclc/opencl/include/clc/opencl/math/erfc.h +++ b/libclc/opencl/include/clc/opencl/math/erfc.h @@ -12,10 +12,10 @@ #undef erfc #define __CLC_BODY -#define FUNCTION erfc +#define __CLC_FUNCTION erfc #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_ERFC_H__ diff --git a/libclc/opencl/include/clc/opencl/math/exp.h b/libclc/opencl/include/clc/opencl/math/exp.h index fb8f674a8c7e..acf6d4885dbd 100644 --- a/libclc/opencl/include/clc/opencl/math/exp.h +++ b/libclc/opencl/include/clc/opencl/math/exp.h @@ -12,10 +12,10 @@ #undef exp #define __CLC_BODY -#define FUNCTION exp +#define __CLC_FUNCTION exp #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_EXP_H__ diff --git a/libclc/opencl/include/clc/opencl/math/exp10.h b/libclc/opencl/include/clc/opencl/math/exp10.h index cfd5c6df9d15..11dcae74e061 100644 --- a/libclc/opencl/include/clc/opencl/math/exp10.h +++ b/libclc/opencl/include/clc/opencl/math/exp10.h @@ -12,10 +12,10 @@ #undef exp10 #define __CLC_BODY -#define FUNCTION exp10 +#define __CLC_FUNCTION exp10 #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_EXP10_H__ diff --git a/libclc/opencl/include/clc/opencl/math/exp2.h b/libclc/opencl/include/clc/opencl/math/exp2.h index db0405eb1334..e220654ac72e 100644 --- 
a/libclc/opencl/include/clc/opencl/math/exp2.h +++ b/libclc/opencl/include/clc/opencl/math/exp2.h @@ -10,10 +10,10 @@ #define __CLC_OPENCL_MATH_EXP2_H__ #define __CLC_BODY -#define FUNCTION exp2 +#define __CLC_FUNCTION exp2 #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_EXP2_H__ diff --git a/libclc/opencl/include/clc/opencl/math/expm1.h b/libclc/opencl/include/clc/opencl/math/expm1.h index 0b3628d48468..556dbea8fae0 100644 --- a/libclc/opencl/include/clc/opencl/math/expm1.h +++ b/libclc/opencl/include/clc/opencl/math/expm1.h @@ -12,10 +12,10 @@ #undef exp #define __CLC_BODY -#define FUNCTION expm1 +#define __CLC_FUNCTION expm1 #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_EXPM1_H__ diff --git a/libclc/opencl/include/clc/opencl/math/fabs.h b/libclc/opencl/include/clc/opencl/math/fabs.h index f20016f4eb8b..ad8c05198a70 100644 --- a/libclc/opencl/include/clc/opencl/math/fabs.h +++ b/libclc/opencl/include/clc/opencl/math/fabs.h @@ -10,10 +10,10 @@ #define __CLC_OPENCL_MATH_FABS_H__ #define __CLC_BODY -#define FUNCTION fabs +#define __CLC_FUNCTION fabs #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_FABS_H__ diff --git a/libclc/opencl/include/clc/opencl/math/fdim.h b/libclc/opencl/include/clc/opencl/math/fdim.h index 82f70fd83236..ed30904e3c24 100644 --- a/libclc/opencl/include/clc/opencl/math/fdim.h +++ b/libclc/opencl/include/clc/opencl/math/fdim.h @@ -9,11 +9,11 @@ #ifndef __CLC_OPENCL_MATH_FDIM_H__ #define __CLC_OPENCL_MATH_FDIM_H__ -#define FUNCTION fdim +#define __CLC_FUNCTION fdim #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_FDIM_H__ diff --git a/libclc/opencl/include/clc/opencl/math/floor.h b/libclc/opencl/include/clc/opencl/math/floor.h index 1930aeca74ff..8d7a0358c465 100644 --- a/libclc/opencl/include/clc/opencl/math/floor.h +++ b/libclc/opencl/include/clc/opencl/math/floor.h @@ -10,10 +10,10 @@ #define 
__CLC_OPENCL_MATH_FLOOR_H__ #define __CLC_BODY -#define FUNCTION floor +#define __CLC_FUNCTION floor #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_FLOOR_H__ diff --git a/libclc/opencl/include/clc/opencl/math/fma.h b/libclc/opencl/include/clc/opencl/math/fma.h index 6fed3fc9b256..7a002318384a 100644 --- a/libclc/opencl/include/clc/opencl/math/fma.h +++ b/libclc/opencl/include/clc/opencl/math/fma.h @@ -10,10 +10,10 @@ #define __CLC_OPENCL_MATH_FMA_H__ #define __CLC_BODY -#define FUNCTION fma +#define __CLC_FUNCTION fma #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_FMA_H__ diff --git a/libclc/opencl/include/clc/opencl/math/fmax.h b/libclc/opencl/include/clc/opencl/math/fmax.h index 84c754e2c0c7..8ada84403e0a 100644 --- a/libclc/opencl/include/clc/opencl/math/fmax.h +++ b/libclc/opencl/include/clc/opencl/math/fmax.h @@ -10,10 +10,10 @@ #define __CLC_OPENCL_MATH_FMAX_H__ #define __CLC_BODY -#define FUNCTION fmax +#define __CLC_FUNCTION fmax #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_FMAX_H__ diff --git a/libclc/opencl/include/clc/opencl/math/fmin.h b/libclc/opencl/include/clc/opencl/math/fmin.h index b35573e4f633..3a464a63db73 100644 --- a/libclc/opencl/include/clc/opencl/math/fmin.h +++ b/libclc/opencl/include/clc/opencl/math/fmin.h @@ -10,10 +10,10 @@ #define __CLC_OPENCL_MATH_FMIN_H__ #define __CLC_BODY -#define FUNCTION fmin +#define __CLC_FUNCTION fmin #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_FMIN_H__ diff --git a/libclc/opencl/include/clc/opencl/math/fmod.h b/libclc/opencl/include/clc/opencl/math/fmod.h index 13515006fd2b..f826c8a6f6c3 100644 --- a/libclc/opencl/include/clc/opencl/math/fmod.h +++ b/libclc/opencl/include/clc/opencl/math/fmod.h @@ -9,9 +9,9 @@ #ifndef __CLC_OPENCL_MATH_FMOD_H__ #define __CLC_OPENCL_MATH_FMOD_H__ -#define FUNCTION fmod +#define __CLC_FUNCTION fmod #define __CLC_BODY #include -#undef FUNCTION +#undef 
__CLC_FUNCTION #endif // __CLC_OPENCL_MATH_FMOD_H__ diff --git a/libclc/opencl/include/clc/opencl/math/fract.h b/libclc/opencl/include/clc/opencl/math/fract.h index de953f78c594..342280f70ed6 100644 --- a/libclc/opencl/include/clc/opencl/math/fract.h +++ b/libclc/opencl/include/clc/opencl/math/fract.h @@ -9,9 +9,9 @@ #ifndef __CLC_OPENCL_MATH_FRACT_H__ #define __CLC_OPENCL_MATH_FRACT_H__ -#define FUNCTION fract +#define __CLC_FUNCTION fract #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_FRACT_H__ diff --git a/libclc/opencl/include/clc/opencl/math/frexp.h b/libclc/opencl/include/clc/opencl/math/frexp.h index c7ed7cbd1cbc..9f0701aff10a 100644 --- a/libclc/opencl/include/clc/opencl/math/frexp.h +++ b/libclc/opencl/include/clc/opencl/math/frexp.h @@ -9,10 +9,10 @@ #ifndef __CLC_OPENCL_MATH_FREXP_H__ #define __CLC_OPENCL_MATH_FREXP_H__ -#define FUNCTION frexp +#define __CLC_FUNCTION frexp #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_FREXP_H__ diff --git a/libclc/opencl/include/clc/opencl/math/half_cos.h b/libclc/opencl/include/clc/opencl/math/half_cos.h index 5ab0b69ca422..9f95f5d7e277 100644 --- a/libclc/opencl/include/clc/opencl/math/half_cos.h +++ b/libclc/opencl/include/clc/opencl/math/half_cos.h @@ -10,11 +10,11 @@ #define __CLC_OPENCL_MATH_HALF_COS_H__ #define __CLC_BODY -#define FUNCTION half_cos -#define __FLOAT_ONLY +#define __CLC_FUNCTION half_cos +#define __CLC_FLOAT_ONLY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_HALF_COS_H__ diff --git a/libclc/opencl/include/clc/opencl/math/half_divide.h b/libclc/opencl/include/clc/opencl/math/half_divide.h index ff506d0c9fd6..c61fb08e2119 100644 --- a/libclc/opencl/include/clc/opencl/math/half_divide.h +++ b/libclc/opencl/include/clc/opencl/math/half_divide.h @@ -10,10 +10,10 @@ #define __CLC_OPENCL_MATH_HALF_DIVIDE_H__ #define __CLC_BODY -#define FUNCTION half_divide +#define 
__CLC_FUNCTION half_divide #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_HALF_DIVIDE_H__ diff --git a/libclc/opencl/include/clc/opencl/math/half_exp.h b/libclc/opencl/include/clc/opencl/math/half_exp.h index 22780c144923..53b75a62f4df 100644 --- a/libclc/opencl/include/clc/opencl/math/half_exp.h +++ b/libclc/opencl/include/clc/opencl/math/half_exp.h @@ -10,11 +10,11 @@ #define __CLC_OPENCL_MATH_HALF_EXP_H__ #define __CLC_BODY -#define FUNCTION half_exp -#define __FLOAT_ONLY +#define __CLC_FUNCTION half_exp +#define __CLC_FLOAT_ONLY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_HALF_EXP_H__ diff --git a/libclc/opencl/include/clc/opencl/math/half_exp10.h b/libclc/opencl/include/clc/opencl/math/half_exp10.h index 1a8195e67eee..88b05a4df081 100644 --- a/libclc/opencl/include/clc/opencl/math/half_exp10.h +++ b/libclc/opencl/include/clc/opencl/math/half_exp10.h @@ -10,11 +10,11 @@ #define __CLC_OPENCL_MATH_HALF_EXP10_H__ #define __CLC_BODY -#define FUNCTION half_exp10 -#define __FLOAT_ONLY +#define __CLC_FUNCTION half_exp10 +#define __CLC_FLOAT_ONLY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_HALF_EXP10_H__ diff --git a/libclc/opencl/include/clc/opencl/math/half_exp2.h b/libclc/opencl/include/clc/opencl/math/half_exp2.h index 2ca69aa1ea07..e440b996e58a 100644 --- a/libclc/opencl/include/clc/opencl/math/half_exp2.h +++ b/libclc/opencl/include/clc/opencl/math/half_exp2.h @@ -10,11 +10,11 @@ #define __CLC_OPENCL_MATH_HALF_EXP2_H__ #define __CLC_BODY -#define FUNCTION half_exp2 -#define __FLOAT_ONLY +#define __CLC_FUNCTION half_exp2 +#define __CLC_FLOAT_ONLY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_HALF_EXP2_H__ diff --git a/libclc/opencl/include/clc/opencl/math/half_log.h b/libclc/opencl/include/clc/opencl/math/half_log.h index d2eef868790f..d4432312c6af 100644 --- a/libclc/opencl/include/clc/opencl/math/half_log.h +++ 
b/libclc/opencl/include/clc/opencl/math/half_log.h @@ -10,11 +10,11 @@ #define __CLC_OPENCL_MATH_HALF_LOG_H__ #define __CLC_BODY -#define FUNCTION half_log -#define __FLOAT_ONLY +#define __CLC_FUNCTION half_log +#define __CLC_FLOAT_ONLY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_HALF_LOG_H__ diff --git a/libclc/opencl/include/clc/opencl/math/half_log10.h b/libclc/opencl/include/clc/opencl/math/half_log10.h index e0f85fff3f3e..f219f950f949 100644 --- a/libclc/opencl/include/clc/opencl/math/half_log10.h +++ b/libclc/opencl/include/clc/opencl/math/half_log10.h @@ -10,11 +10,11 @@ #define __CLC_OPENCL_MATH_HALF_LOG10_H__ #define __CLC_BODY -#define FUNCTION half_log10 -#define __FLOAT_ONLY +#define __CLC_FUNCTION half_log10 +#define __CLC_FLOAT_ONLY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_HALF_LOG10_H__ diff --git a/libclc/opencl/include/clc/opencl/math/half_log2.h b/libclc/opencl/include/clc/opencl/math/half_log2.h index ba0105d88356..932f546e8761 100644 --- a/libclc/opencl/include/clc/opencl/math/half_log2.h +++ b/libclc/opencl/include/clc/opencl/math/half_log2.h @@ -10,11 +10,11 @@ #define __CLC_OPENCL_MATH_HALF_LOG2_H__ #define __CLC_BODY -#define FUNCTION half_log2 -#define __FLOAT_ONLY +#define __CLC_FUNCTION half_log2 +#define __CLC_FLOAT_ONLY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_HALF_LOG2_H__ diff --git a/libclc/opencl/include/clc/opencl/math/half_powr.h b/libclc/opencl/include/clc/opencl/math/half_powr.h index 4b792c0c6cba..e9af720aa5ac 100644 --- a/libclc/opencl/include/clc/opencl/math/half_powr.h +++ b/libclc/opencl/include/clc/opencl/math/half_powr.h @@ -10,10 +10,10 @@ #define __CLC_OPENCL_MATH_HALF_POWR_H__ #define __CLC_BODY -#define FUNCTION half_powr +#define __CLC_FUNCTION half_powr #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_HALF_POWR_H__ diff --git a/libclc/opencl/include/clc/opencl/math/half_recip.h 
b/libclc/opencl/include/clc/opencl/math/half_recip.h index 3425df4e476d..6fb7f02138a6 100644 --- a/libclc/opencl/include/clc/opencl/math/half_recip.h +++ b/libclc/opencl/include/clc/opencl/math/half_recip.h @@ -10,11 +10,11 @@ #define __CLC_OPENCL_MATH_HALF_RECIP_H__ #define __CLC_BODY -#define FUNCTION half_recip -#define __FLOAT_ONLY +#define __CLC_FUNCTION half_recip +#define __CLC_FLOAT_ONLY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_HALF_RECIP_H__ diff --git a/libclc/opencl/include/clc/opencl/math/half_rsqrt.h b/libclc/opencl/include/clc/opencl/math/half_rsqrt.h index e22a7375cacd..a130d3bb080a 100644 --- a/libclc/opencl/include/clc/opencl/math/half_rsqrt.h +++ b/libclc/opencl/include/clc/opencl/math/half_rsqrt.h @@ -10,9 +10,9 @@ #define __CLC_OPENCL_MATH_HALF_RSQRT_H__ #define __CLC_BODY -#define FUNCTION half_rsqrt -#define __FLOAT_ONLY +#define __CLC_FUNCTION half_rsqrt +#define __CLC_FLOAT_ONLY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_HALF_RSQRT_H__ diff --git a/libclc/opencl/include/clc/opencl/math/half_sin.h b/libclc/opencl/include/clc/opencl/math/half_sin.h index cb0462c45436..cde9549bd3b3 100644 --- a/libclc/opencl/include/clc/opencl/math/half_sin.h +++ b/libclc/opencl/include/clc/opencl/math/half_sin.h @@ -10,11 +10,11 @@ #define __CLC_OPENCL_MATH_HALF_SIN_H__ #define __CLC_BODY -#define FUNCTION half_sin -#define __FLOAT_ONLY +#define __CLC_FUNCTION half_sin +#define __CLC_FLOAT_ONLY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_HALF_SIN_H__ diff --git a/libclc/opencl/include/clc/opencl/math/half_sqrt.h b/libclc/opencl/include/clc/opencl/math/half_sqrt.h index f755fee7b4be..017bed1c066e 100644 --- a/libclc/opencl/include/clc/opencl/math/half_sqrt.h +++ b/libclc/opencl/include/clc/opencl/math/half_sqrt.h @@ -10,9 +10,9 @@ #define __CLC_OPENCL_MATH_HALF_SQRT_H__ #define __CLC_BODY -#define FUNCTION half_sqrt -#define __FLOAT_ONLY +#define 
__CLC_FUNCTION half_sqrt +#define __CLC_FLOAT_ONLY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_HALF_SQRT_H__ diff --git a/libclc/opencl/include/clc/opencl/math/half_tan.h b/libclc/opencl/include/clc/opencl/math/half_tan.h index 91b5b11ae178..b8e71fcba43d 100644 --- a/libclc/opencl/include/clc/opencl/math/half_tan.h +++ b/libclc/opencl/include/clc/opencl/math/half_tan.h @@ -10,11 +10,11 @@ #define __CLC_OPENCL_MATH_HALF_TAN_H__ #define __CLC_BODY -#define FUNCTION half_tan -#define __FLOAT_ONLY +#define __CLC_FUNCTION half_tan +#define __CLC_FLOAT_ONLY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_HALF_TAN_H__ diff --git a/libclc/opencl/include/clc/opencl/math/hypot.h b/libclc/opencl/include/clc/opencl/math/hypot.h index 76ff6ef0655a..53bef98772e4 100644 --- a/libclc/opencl/include/clc/opencl/math/hypot.h +++ b/libclc/opencl/include/clc/opencl/math/hypot.h @@ -9,11 +9,11 @@ #ifndef __CLC_OPENCL_MATH_HYPOT_H__ #define __CLC_OPENCL_MATH_HYPOT_H__ -#define FUNCTION hypot +#define __CLC_FUNCTION hypot #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_HYPOT_H__ diff --git a/libclc/opencl/include/clc/opencl/math/ilogb.h b/libclc/opencl/include/clc/opencl/math/ilogb.h index d9752a644bee..29d92632f7f5 100644 --- a/libclc/opencl/include/clc/opencl/math/ilogb.h +++ b/libclc/opencl/include/clc/opencl/math/ilogb.h @@ -9,11 +9,11 @@ #ifndef __CLC_OPENCL_MATH_ILOGB_H__ #define __CLC_OPENCL_MATH_ILOGB_H__ -#define FUNCTION ilogb +#define __CLC_FUNCTION ilogb #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_ILOGB_H__ diff --git a/libclc/opencl/include/clc/opencl/math/ldexp.h b/libclc/opencl/include/clc/opencl/math/ldexp.h index b0194d9c9ada..cbbd030d7340 100644 --- a/libclc/opencl/include/clc/opencl/math/ldexp.h +++ b/libclc/opencl/include/clc/opencl/math/ldexp.h @@ -9,10 +9,10 @@ #ifndef __CLC_OPENCL_MATH_LDEXP_H__ #define 
__CLC_OPENCL_MATH_LDEXP_H__ -#define FUNCTION ldexp +#define __CLC_FUNCTION ldexp #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #define __CLC_BODY #include diff --git a/libclc/opencl/include/clc/opencl/math/lgamma.h b/libclc/opencl/include/clc/opencl/math/lgamma.h index 95ed07c3531e..42c260f65942 100644 --- a/libclc/opencl/include/clc/opencl/math/lgamma.h +++ b/libclc/opencl/include/clc/opencl/math/lgamma.h @@ -10,10 +10,10 @@ #define __CLC_OPENCL_MATH_LGAMMA_H__ #define __CLC_BODY -#define FUNCTION lgamma +#define __CLC_FUNCTION lgamma #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_LGAMMA_H__ diff --git a/libclc/opencl/include/clc/opencl/math/lgamma_r.h b/libclc/opencl/include/clc/opencl/math/lgamma_r.h index 4853d2dd5c28..03f47b30fb71 100644 --- a/libclc/opencl/include/clc/opencl/math/lgamma_r.h +++ b/libclc/opencl/include/clc/opencl/math/lgamma_r.h @@ -9,11 +9,11 @@ #ifndef __CLC_OPENCL_MATH_LGAMMA_R_H__ #define __CLC_OPENCL_MATH_LGAMMA_R_H__ -#define FUNCTION lgamma_r +#define __CLC_FUNCTION lgamma_r #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_LGAMMA_R_H__ diff --git a/libclc/opencl/include/clc/opencl/math/log.h b/libclc/opencl/include/clc/opencl/math/log.h index 2d536d6b0926..f863dee3e8f2 100644 --- a/libclc/opencl/include/clc/opencl/math/log.h +++ b/libclc/opencl/include/clc/opencl/math/log.h @@ -10,10 +10,10 @@ #define __CLC_OPENCL_MATH_LOG_H__ #define __CLC_BODY -#define FUNCTION log +#define __CLC_FUNCTION log #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_LOG_H__ diff --git a/libclc/opencl/include/clc/opencl/math/log10.h b/libclc/opencl/include/clc/opencl/math/log10.h index 0f83b8c2a03e..d328057ecf5e 100644 --- a/libclc/opencl/include/clc/opencl/math/log10.h +++ b/libclc/opencl/include/clc/opencl/math/log10.h @@ -10,10 +10,10 @@ #define __CLC_OPENCL_MATH_LOG10_H__ #define __CLC_BODY -#define FUNCTION log10 +#define 
__CLC_FUNCTION log10 #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_LOG10_H__ diff --git a/libclc/opencl/include/clc/opencl/math/log1p.h b/libclc/opencl/include/clc/opencl/math/log1p.h index bd8adeda88d4..0dba062aa9fd 100644 --- a/libclc/opencl/include/clc/opencl/math/log1p.h +++ b/libclc/opencl/include/clc/opencl/math/log1p.h @@ -10,10 +10,10 @@ #define __CLC_OPENCL_MATH_LOG1P_H__ #define __CLC_BODY -#define FUNCTION log1p +#define __CLC_FUNCTION log1p #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_LOG1P_H__ diff --git a/libclc/opencl/include/clc/opencl/math/log2.h b/libclc/opencl/include/clc/opencl/math/log2.h index e4f6cb619acf..3903b2730e29 100644 --- a/libclc/opencl/include/clc/opencl/math/log2.h +++ b/libclc/opencl/include/clc/opencl/math/log2.h @@ -10,10 +10,10 @@ #define __CLC_OPENCL_MATH_LOG2_H__ #define __CLC_BODY -#define FUNCTION log2 +#define __CLC_FUNCTION log2 #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_LOG2_H__ diff --git a/libclc/opencl/include/clc/opencl/math/logb.h b/libclc/opencl/include/clc/opencl/math/logb.h index 3bbc8476d2ff..ef4f7b763c59 100644 --- a/libclc/opencl/include/clc/opencl/math/logb.h +++ b/libclc/opencl/include/clc/opencl/math/logb.h @@ -10,10 +10,10 @@ #define __CLC_OPENCL_MATH_LOGB_H__ #define __CLC_BODY -#define FUNCTION logb +#define __CLC_FUNCTION logb #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_LOGB_H__ diff --git a/libclc/opencl/include/clc/opencl/math/mad.h b/libclc/opencl/include/clc/opencl/math/mad.h index 18732eb7be66..440892cfbb17 100644 --- a/libclc/opencl/include/clc/opencl/math/mad.h +++ b/libclc/opencl/include/clc/opencl/math/mad.h @@ -10,10 +10,10 @@ #define __CLC_OPENCL_MATH_MAD_H__ #define __CLC_BODY -#define FUNCTION mad +#define __CLC_FUNCTION mad #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_MAD_H__ diff --git 
a/libclc/opencl/include/clc/opencl/math/maxmag.h b/libclc/opencl/include/clc/opencl/math/maxmag.h index c8f3d5700fa2..1f3df1034d8e 100644 --- a/libclc/opencl/include/clc/opencl/math/maxmag.h +++ b/libclc/opencl/include/clc/opencl/math/maxmag.h @@ -10,10 +10,10 @@ #define __CLC_OPENCL_MATH_MAXMAG_H__ #define __CLC_BODY -#define FUNCTION maxmag +#define __CLC_FUNCTION maxmag #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_MAXMAG_H__ diff --git a/libclc/opencl/include/clc/opencl/math/minmag.h b/libclc/opencl/include/clc/opencl/math/minmag.h index c90a9d9bdbeb..c87fc1a20e40 100644 --- a/libclc/opencl/include/clc/opencl/math/minmag.h +++ b/libclc/opencl/include/clc/opencl/math/minmag.h @@ -10,10 +10,10 @@ #define __CLC_OPENCL_MATH_MINMAG_H__ #define __CLC_BODY -#define FUNCTION minmag +#define __CLC_FUNCTION minmag #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_MINMAG_H__ diff --git a/libclc/opencl/include/clc/opencl/math/modf.h b/libclc/opencl/include/clc/opencl/math/modf.h index a336c9e242c1..df07f7f5948d 100644 --- a/libclc/opencl/include/clc/opencl/math/modf.h +++ b/libclc/opencl/include/clc/opencl/math/modf.h @@ -9,10 +9,10 @@ #ifndef __CLC_OPENCL_MATH_MODF_H__ #define __CLC_OPENCL_MATH_MODF_H__ -#define FUNCTION modf +#define __CLC_FUNCTION modf #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_MODF_H__ diff --git a/libclc/opencl/include/clc/opencl/math/native_cos.h b/libclc/opencl/include/clc/opencl/math/native_cos.h index 566c32e95b32..9c6728103adc 100644 --- a/libclc/opencl/include/clc/opencl/math/native_cos.h +++ b/libclc/opencl/include/clc/opencl/math/native_cos.h @@ -10,11 +10,11 @@ #define __CLC_OPENCL_MATH_NATIVE_COS_H__ #define __CLC_BODY -#define FUNCTION native_cos -#define __FLOAT_ONLY +#define __CLC_FUNCTION native_cos +#define __CLC_FLOAT_ONLY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_NATIVE_COS_H__ diff --git 
a/libclc/opencl/include/clc/opencl/math/native_divide.h b/libclc/opencl/include/clc/opencl/math/native_divide.h index 23360b4bdeb9..66c42f20d222 100644 --- a/libclc/opencl/include/clc/opencl/math/native_divide.h +++ b/libclc/opencl/include/clc/opencl/math/native_divide.h @@ -10,10 +10,10 @@ #define __CLC_OPENCL_MATH_NATIVE_DIVIDE_H__ #define __CLC_BODY -#define FUNCTION native_divide +#define __CLC_FUNCTION native_divide #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_NATIVE_DIVIDE_H__ diff --git a/libclc/opencl/include/clc/opencl/math/native_exp.h b/libclc/opencl/include/clc/opencl/math/native_exp.h index 5133469509ff..d8c6f70d3f2c 100644 --- a/libclc/opencl/include/clc/opencl/math/native_exp.h +++ b/libclc/opencl/include/clc/opencl/math/native_exp.h @@ -10,11 +10,11 @@ #define __CLC_OPENCL_MATH_NATIVE_EXP_H__ #define __CLC_BODY -#define FUNCTION native_exp -#define __FLOAT_ONLY +#define __CLC_FUNCTION native_exp +#define __CLC_FLOAT_ONLY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_NATIVE_EXP_H__ diff --git a/libclc/opencl/include/clc/opencl/math/native_exp10.h b/libclc/opencl/include/clc/opencl/math/native_exp10.h index 8a2788cd3bca..484eeb5de0a5 100644 --- a/libclc/opencl/include/clc/opencl/math/native_exp10.h +++ b/libclc/opencl/include/clc/opencl/math/native_exp10.h @@ -10,11 +10,11 @@ #define __CLC_OPENCL_MATH_NATIVE_EXP10_H__ #define __CLC_BODY -#define FUNCTION native_exp10 -#define __FLOAT_ONLY +#define __CLC_FUNCTION native_exp10 +#define __CLC_FLOAT_ONLY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_NATIVE_EXP10_H__ diff --git a/libclc/opencl/include/clc/opencl/math/native_exp2.h b/libclc/opencl/include/clc/opencl/math/native_exp2.h index 5a6aed09a838..bc1e6babc62b 100644 --- a/libclc/opencl/include/clc/opencl/math/native_exp2.h +++ b/libclc/opencl/include/clc/opencl/math/native_exp2.h @@ -10,11 +10,11 @@ #define __CLC_OPENCL_MATH_NATIVE_EXP2_H__ #define 
__CLC_BODY -#define FUNCTION native_exp2 -#define __FLOAT_ONLY +#define __CLC_FUNCTION native_exp2 +#define __CLC_FLOAT_ONLY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_NATIVE_EXP2_H__ diff --git a/libclc/opencl/include/clc/opencl/math/native_log.h b/libclc/opencl/include/clc/opencl/math/native_log.h index 58164960644f..c4bf498738f4 100644 --- a/libclc/opencl/include/clc/opencl/math/native_log.h +++ b/libclc/opencl/include/clc/opencl/math/native_log.h @@ -10,11 +10,11 @@ #define __CLC_OPENCL_MATH_NATIVE_LOG_H__ #define __CLC_BODY -#define FUNCTION native_log -#define __FLOAT_ONLY +#define __CLC_FUNCTION native_log +#define __CLC_FLOAT_ONLY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_NATIVE_LOG_H__ diff --git a/libclc/opencl/include/clc/opencl/math/native_log10.h b/libclc/opencl/include/clc/opencl/math/native_log10.h index c215c733c042..ab61ee3df5c0 100644 --- a/libclc/opencl/include/clc/opencl/math/native_log10.h +++ b/libclc/opencl/include/clc/opencl/math/native_log10.h @@ -10,11 +10,11 @@ #define __CLC_OPENCL_MATH_NATIVE_LOG10_H__ #define __CLC_BODY -#define FUNCTION native_log10 -#define __FLOAT_ONLY +#define __CLC_FUNCTION native_log10 +#define __CLC_FLOAT_ONLY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_NATIVE_LOG10_H__ diff --git a/libclc/opencl/include/clc/opencl/math/native_log2.h b/libclc/opencl/include/clc/opencl/math/native_log2.h index 783abe6cd8e4..6ba0d3bf490f 100644 --- a/libclc/opencl/include/clc/opencl/math/native_log2.h +++ b/libclc/opencl/include/clc/opencl/math/native_log2.h @@ -10,11 +10,11 @@ #define __CLC_OPENCL_MATH_NATIVE_LOG2_H__ #define __CLC_BODY -#define FUNCTION native_log2 -#define __FLOAT_ONLY +#define __CLC_FUNCTION native_log2 +#define __CLC_FLOAT_ONLY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_NATIVE_LOG2_H__ diff --git a/libclc/opencl/include/clc/opencl/math/native_powr.h 
b/libclc/opencl/include/clc/opencl/math/native_powr.h index 9c8e171afc21..6b63138aa782 100644 --- a/libclc/opencl/include/clc/opencl/math/native_powr.h +++ b/libclc/opencl/include/clc/opencl/math/native_powr.h @@ -10,10 +10,10 @@ #define __CLC_OPENCL_MATH_NATIVE_POWR_H__ #define __CLC_BODY -#define FUNCTION native_powr +#define __CLC_FUNCTION native_powr #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_NATIVE_POWR_H__ diff --git a/libclc/opencl/include/clc/opencl/math/native_recip.h b/libclc/opencl/include/clc/opencl/math/native_recip.h index 1c88c076e03c..967eea38d80d 100644 --- a/libclc/opencl/include/clc/opencl/math/native_recip.h +++ b/libclc/opencl/include/clc/opencl/math/native_recip.h @@ -10,11 +10,11 @@ #define __CLC_OPENCL_MATH_NATIVE_RECIP_H__ #define __CLC_BODY -#define FUNCTION native_recip -#define __FLOAT_ONLY +#define __CLC_FUNCTION native_recip +#define __CLC_FLOAT_ONLY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_NATIVE_RECIP_H__ diff --git a/libclc/opencl/include/clc/opencl/math/native_rsqrt.h b/libclc/opencl/include/clc/opencl/math/native_rsqrt.h index 2806f9128d94..9146df891372 100644 --- a/libclc/opencl/include/clc/opencl/math/native_rsqrt.h +++ b/libclc/opencl/include/clc/opencl/math/native_rsqrt.h @@ -10,11 +10,11 @@ #define __CLC_OPENCL_MATH_NATIVE_RSQRT_H__ #define __CLC_BODY -#define FUNCTION native_rsqrt -#define __FLOAT_ONLY +#define __CLC_FUNCTION native_rsqrt +#define __CLC_FLOAT_ONLY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_NATIVE_RSQRT_H__ diff --git a/libclc/opencl/include/clc/opencl/math/native_sin.h b/libclc/opencl/include/clc/opencl/math/native_sin.h index 81f664153cee..766b76781a9b 100644 --- a/libclc/opencl/include/clc/opencl/math/native_sin.h +++ b/libclc/opencl/include/clc/opencl/math/native_sin.h @@ -10,11 +10,11 @@ #define __CLC_OPENCL_MATH_NATIVE_SIN_H__ #define __CLC_BODY -#define FUNCTION native_sin -#define __FLOAT_ONLY 
+#define __CLC_FUNCTION native_sin +#define __CLC_FLOAT_ONLY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_NATIVE_SIN_H__ diff --git a/libclc/opencl/include/clc/opencl/math/native_sqrt.h b/libclc/opencl/include/clc/opencl/math/native_sqrt.h index dca0e542e0ac..5d44f28db928 100644 --- a/libclc/opencl/include/clc/opencl/math/native_sqrt.h +++ b/libclc/opencl/include/clc/opencl/math/native_sqrt.h @@ -10,11 +10,11 @@ #define __CLC_OPENCL_MATH_NATIVE_SQRT_H__ #define __CLC_BODY -#define FUNCTION native_sqrt -#define __FLOAT_ONLY +#define __CLC_FUNCTION native_sqrt +#define __CLC_FLOAT_ONLY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_NATIVE_SQRT_H__ diff --git a/libclc/opencl/include/clc/opencl/math/native_tan.h b/libclc/opencl/include/clc/opencl/math/native_tan.h index cf942110e312..075aa59757b4 100644 --- a/libclc/opencl/include/clc/opencl/math/native_tan.h +++ b/libclc/opencl/include/clc/opencl/math/native_tan.h @@ -10,11 +10,11 @@ #define __CLC_OPENCL_MATH_NATIVE_TAN_H__ #define __CLC_BODY -#define FUNCTION native_tan -#define __FLOAT_ONLY +#define __CLC_FUNCTION native_tan +#define __CLC_FLOAT_ONLY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_NATIVE_TAN_H__ diff --git a/libclc/opencl/include/clc/opencl/math/nextafter.h b/libclc/opencl/include/clc/opencl/math/nextafter.h index e338f5bf6c2a..6843f2977025 100644 --- a/libclc/opencl/include/clc/opencl/math/nextafter.h +++ b/libclc/opencl/include/clc/opencl/math/nextafter.h @@ -9,11 +9,11 @@ #ifndef __CLC_OPENCL_MATH_NEXTAFTER_H__ #define __CLC_OPENCL_MATH_NEXTAFTER_H__ -#define FUNCTION nextafter +#define __CLC_FUNCTION nextafter #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_NEXTAFTER_H__ diff --git a/libclc/opencl/include/clc/opencl/math/pow.h b/libclc/opencl/include/clc/opencl/math/pow.h index 3f4c1746faa9..ce17b66b541d 100644 --- 
a/libclc/opencl/include/clc/opencl/math/pow.h +++ b/libclc/opencl/include/clc/opencl/math/pow.h @@ -9,9 +9,9 @@ #ifndef __CLC_OPENCL_MATH_POW_H__ #define __CLC_OPENCL_MATH_POW_H__ -#define FUNCTION pow +#define __CLC_FUNCTION pow #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_POW_H__ diff --git a/libclc/opencl/include/clc/opencl/math/pown.h b/libclc/opencl/include/clc/opencl/math/pown.h index fd97a705c8c8..24fc298da4f1 100644 --- a/libclc/opencl/include/clc/opencl/math/pown.h +++ b/libclc/opencl/include/clc/opencl/math/pown.h @@ -9,11 +9,11 @@ #ifndef __CLC_OPENCL_MATH_POWN_H__ #define __CLC_OPENCL_MATH_POWN_H__ -#define FUNCTION pown +#define __CLC_FUNCTION pown #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_POWN_H__ diff --git a/libclc/opencl/include/clc/opencl/math/powr.h b/libclc/opencl/include/clc/opencl/math/powr.h index 47f653a73afa..6aebc346080f 100644 --- a/libclc/opencl/include/clc/opencl/math/powr.h +++ b/libclc/opencl/include/clc/opencl/math/powr.h @@ -9,9 +9,9 @@ #ifndef __CLC_OPENCL_MATH_POWR_H__ #define __CLC_OPENCL_MATH_POWR_H__ -#define FUNCTION powr +#define __CLC_FUNCTION powr #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_POWR_H__ diff --git a/libclc/opencl/include/clc/opencl/math/remainder.h b/libclc/opencl/include/clc/opencl/math/remainder.h index 416a0422c9a6..d30a315a4ce3 100644 --- a/libclc/opencl/include/clc/opencl/math/remainder.h +++ b/libclc/opencl/include/clc/opencl/math/remainder.h @@ -9,9 +9,9 @@ #ifndef __CLC_OPENCL_MATH_REMAINDER_H__ #define __CLC_OPENCL_MATH_REMAINDER_H__ -#define FUNCTION remainder +#define __CLC_FUNCTION remainder #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_REMAINDER_H__ diff --git a/libclc/opencl/include/clc/opencl/math/remquo.h b/libclc/opencl/include/clc/opencl/math/remquo.h index 3525c6bd0f68..28fc74c00fd9 100644 
--- a/libclc/opencl/include/clc/opencl/math/remquo.h +++ b/libclc/opencl/include/clc/opencl/math/remquo.h @@ -9,7 +9,7 @@ #ifndef __CLC_OPENCL_MATH_REMQUO_H__ #define __CLC_OPENCL_MATH_REMQUO_H__ -#define FUNCTION remquo +#define __CLC_FUNCTION remquo #define __CLC_BODY #include @@ -21,6 +21,6 @@ #undef __CLC_ADDRESS_SPACE #endif -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_REMQUO_H__ diff --git a/libclc/opencl/include/clc/opencl/math/rint.h b/libclc/opencl/include/clc/opencl/math/rint.h index 8f08fd20c3c9..ceca56535a86 100644 --- a/libclc/opencl/include/clc/opencl/math/rint.h +++ b/libclc/opencl/include/clc/opencl/math/rint.h @@ -10,10 +10,10 @@ #define __CLC_OPENCL_MATH_RINT_H__ #define __CLC_BODY -#define FUNCTION rint +#define __CLC_FUNCTION rint #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_RINT_H__ diff --git a/libclc/opencl/include/clc/opencl/math/rootn.h b/libclc/opencl/include/clc/opencl/math/rootn.h index 20b52e7c01dc..9dce895a3f2d 100644 --- a/libclc/opencl/include/clc/opencl/math/rootn.h +++ b/libclc/opencl/include/clc/opencl/math/rootn.h @@ -10,10 +10,10 @@ #define __CLC_OPENCL_MATH_ROOTN_H__ #define __CLC_BODY -#define FUNCTION rootn +#define __CLC_FUNCTION rootn #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_ROOTN_H__ diff --git a/libclc/opencl/include/clc/opencl/math/round.h b/libclc/opencl/include/clc/opencl/math/round.h index 65bfc265ced0..740ae9241f69 100644 --- a/libclc/opencl/include/clc/opencl/math/round.h +++ b/libclc/opencl/include/clc/opencl/math/round.h @@ -10,10 +10,10 @@ #define __CLC_OPENCL_MATH_ROUND_H__ #define __CLC_BODY -#define FUNCTION round +#define __CLC_FUNCTION round #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_ROUND_H__ diff --git a/libclc/opencl/include/clc/opencl/math/rsqrt.h b/libclc/opencl/include/clc/opencl/math/rsqrt.h index 41fb54ea9bb5..ea141d4bc45b 100644 --- 
a/libclc/opencl/include/clc/opencl/math/rsqrt.h +++ b/libclc/opencl/include/clc/opencl/math/rsqrt.h @@ -10,10 +10,10 @@ #define __CLC_OPENCL_MATH_RSQRT_H__ #define __CLC_BODY -#define FUNCTION rsqrt +#define __CLC_FUNCTION rsqrt #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_RSQRT_H__ diff --git a/libclc/opencl/include/clc/opencl/math/sin.h b/libclc/opencl/include/clc/opencl/math/sin.h index fc7741e164fd..60de98de0958 100644 --- a/libclc/opencl/include/clc/opencl/math/sin.h +++ b/libclc/opencl/include/clc/opencl/math/sin.h @@ -10,10 +10,10 @@ #define __CLC_OPENCL_MATH_SIN_H__ #define __CLC_BODY -#define FUNCTION sin +#define __CLC_FUNCTION sin #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_SIN_H__ diff --git a/libclc/opencl/include/clc/opencl/math/sincos.h b/libclc/opencl/include/clc/opencl/math/sincos.h index 0231560a6b9d..4989bcf21c03 100644 --- a/libclc/opencl/include/clc/opencl/math/sincos.h +++ b/libclc/opencl/include/clc/opencl/math/sincos.h @@ -10,8 +10,8 @@ #define __CLC_OPENCL_MATH_SINCOS_H__ #define __CLC_BODY -#define FUNCTION sincos +#define __CLC_FUNCTION sincos #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_SINCOS_H__ diff --git a/libclc/opencl/include/clc/opencl/math/sinh.h b/libclc/opencl/include/clc/opencl/math/sinh.h index cbe30551ec64..0c9f0985be52 100644 --- a/libclc/opencl/include/clc/opencl/math/sinh.h +++ b/libclc/opencl/include/clc/opencl/math/sinh.h @@ -10,10 +10,10 @@ #define __CLC_OPENCL_MATH_SINH_H__ #define __CLC_BODY -#define FUNCTION sinh +#define __CLC_FUNCTION sinh #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_SINH_H__ diff --git a/libclc/opencl/include/clc/opencl/math/sinpi.h b/libclc/opencl/include/clc/opencl/math/sinpi.h index 3ef55a1cd42d..a24339f8cf03 100644 --- a/libclc/opencl/include/clc/opencl/math/sinpi.h +++ b/libclc/opencl/include/clc/opencl/math/sinpi.h @@ -10,10 +10,10 @@ #define 
__CLC_OPENCL_MATH_SINPI_H__ #define __CLC_BODY -#define FUNCTION sinpi +#define __CLC_FUNCTION sinpi #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_SINPI_H__ diff --git a/libclc/opencl/include/clc/opencl/math/sqrt.h b/libclc/opencl/include/clc/opencl/math/sqrt.h index 8da37e94e756..41f604f9652d 100644 --- a/libclc/opencl/include/clc/opencl/math/sqrt.h +++ b/libclc/opencl/include/clc/opencl/math/sqrt.h @@ -10,10 +10,10 @@ #define __CLC_OPENCL_MATH_SQRT_H__ #define __CLC_BODY -#define FUNCTION sqrt +#define __CLC_FUNCTION sqrt #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_SQRT_H__ diff --git a/libclc/opencl/include/clc/opencl/math/tan.h b/libclc/opencl/include/clc/opencl/math/tan.h index b7bab825c935..5c3f5f6a3a41 100644 --- a/libclc/opencl/include/clc/opencl/math/tan.h +++ b/libclc/opencl/include/clc/opencl/math/tan.h @@ -10,10 +10,10 @@ #define __CLC_OPENCL_MATH_TAN_H__ #define __CLC_BODY -#define FUNCTION tan +#define __CLC_FUNCTION tan #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_TAN_H__ diff --git a/libclc/opencl/include/clc/opencl/math/tanh.h b/libclc/opencl/include/clc/opencl/math/tanh.h index 168c06d50cb2..dce89867f25e 100644 --- a/libclc/opencl/include/clc/opencl/math/tanh.h +++ b/libclc/opencl/include/clc/opencl/math/tanh.h @@ -10,10 +10,10 @@ #define __CLC_OPENCL_MATH_TANH_H__ #define __CLC_BODY -#define FUNCTION tanh +#define __CLC_FUNCTION tanh #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_TANH_H__ diff --git a/libclc/opencl/include/clc/opencl/math/tanpi.h b/libclc/opencl/include/clc/opencl/math/tanpi.h index 0475027086f6..2a903b9a1fe7 100644 --- a/libclc/opencl/include/clc/opencl/math/tanpi.h +++ b/libclc/opencl/include/clc/opencl/math/tanpi.h @@ -10,10 +10,10 @@ #define __CLC_OPENCL_MATH_TANPI_H__ #define __CLC_BODY -#define FUNCTION tanpi +#define __CLC_FUNCTION tanpi #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // 
__CLC_OPENCL_MATH_TANPI_H__ diff --git a/libclc/opencl/include/clc/opencl/math/tgamma.h b/libclc/opencl/include/clc/opencl/math/tgamma.h index 6629c0200245..5cb0a7fd48f9 100644 --- a/libclc/opencl/include/clc/opencl/math/tgamma.h +++ b/libclc/opencl/include/clc/opencl/math/tgamma.h @@ -10,10 +10,10 @@ #define __CLC_OPENCL_MATH_TGAMMA_H__ #define __CLC_BODY -#define FUNCTION tgamma +#define __CLC_FUNCTION tgamma #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_TGAMMA_H__ diff --git a/libclc/opencl/include/clc/opencl/math/trunc.h b/libclc/opencl/include/clc/opencl/math/trunc.h index 0e61f6e1679f..ccb60a6a54d6 100644 --- a/libclc/opencl/include/clc/opencl/math/trunc.h +++ b/libclc/opencl/include/clc/opencl/math/trunc.h @@ -10,10 +10,10 @@ #define __CLC_OPENCL_MATH_TRUNC_H__ #define __CLC_BODY -#define FUNCTION trunc +#define __CLC_FUNCTION trunc #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MATH_TRUNC_H__ diff --git a/libclc/opencl/include/clc/opencl/misc/shuffle.h b/libclc/opencl/include/clc/opencl/misc/shuffle.h index 03c2718b0c4f..a1d4b6aa3824 100644 --- a/libclc/opencl/include/clc/opencl/misc/shuffle.h +++ b/libclc/opencl/include/clc/opencl/misc/shuffle.h @@ -9,7 +9,7 @@ #ifndef __CLC_OPENCL_MISC_SHUFFLE_H__ #define __CLC_OPENCL_MISC_SHUFFLE_H__ -#define FUNCTION shuffle +#define __CLC_FUNCTION shuffle // Integer-type decls #define __CLC_BODY @@ -19,6 +19,6 @@ #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MISC_SHUFFLE_H__ diff --git a/libclc/opencl/include/clc/opencl/misc/shuffle2.h b/libclc/opencl/include/clc/opencl/misc/shuffle2.h index 5b6ba9968651..0f93a802b3f3 100644 --- a/libclc/opencl/include/clc/opencl/misc/shuffle2.h +++ b/libclc/opencl/include/clc/opencl/misc/shuffle2.h @@ -9,7 +9,7 @@ #ifndef __CLC_OPENCL_MISC_SHUFFLE2_H__ #define __CLC_OPENCL_MISC_SHUFFLE2_H__ -#define FUNCTION shuffle2 +#define __CLC_FUNCTION shuffle2 // Integer-type decls #define 
__CLC_BODY @@ -19,6 +19,6 @@ #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_MISC_SHUFFLE2_H__ diff --git a/libclc/opencl/include/clc/opencl/relational/isfinite.h b/libclc/opencl/include/clc/opencl/relational/isfinite.h index ac3db6764073..ff996befa937 100644 --- a/libclc/opencl/include/clc/opencl/relational/isfinite.h +++ b/libclc/opencl/include/clc/opencl/relational/isfinite.h @@ -11,11 +11,11 @@ #include -#define FUNCTION isfinite +#define __CLC_FUNCTION isfinite #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_RELATIONAL_ISFINITE_H__ diff --git a/libclc/opencl/include/clc/opencl/relational/isgreater.h b/libclc/opencl/include/clc/opencl/relational/isgreater.h index 2230055115bc..1885faac0d6d 100644 --- a/libclc/opencl/include/clc/opencl/relational/isgreater.h +++ b/libclc/opencl/include/clc/opencl/relational/isgreater.h @@ -11,11 +11,11 @@ #include -#define FUNCTION isgreater +#define __CLC_FUNCTION isgreater #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_RELATIONAL_ISGREATER_H__ diff --git a/libclc/opencl/include/clc/opencl/relational/isgreaterequal.h b/libclc/opencl/include/clc/opencl/relational/isgreaterequal.h index f99a620dabd7..36def480081b 100644 --- a/libclc/opencl/include/clc/opencl/relational/isgreaterequal.h +++ b/libclc/opencl/include/clc/opencl/relational/isgreaterequal.h @@ -11,11 +11,11 @@ #include -#define FUNCTION isgreaterequal +#define __CLC_FUNCTION isgreaterequal #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_RELATIONAL_ISGREATEREQUAL_H__ diff --git a/libclc/opencl/include/clc/opencl/relational/isless.h b/libclc/opencl/include/clc/opencl/relational/isless.h index 74280e543e0b..ce516890f19e 100644 --- a/libclc/opencl/include/clc/opencl/relational/isless.h +++ b/libclc/opencl/include/clc/opencl/relational/isless.h @@ -11,11 +11,11 @@ #include -#define FUNCTION isless 
+#define __CLC_FUNCTION isless #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_RELATIONAL_ISLESS_H__ diff --git a/libclc/opencl/include/clc/opencl/relational/islessequal.h b/libclc/opencl/include/clc/opencl/relational/islessequal.h index dcc26c37b73c..571ed13c0ab8 100644 --- a/libclc/opencl/include/clc/opencl/relational/islessequal.h +++ b/libclc/opencl/include/clc/opencl/relational/islessequal.h @@ -11,11 +11,11 @@ #include -#define FUNCTION islessequal +#define __CLC_FUNCTION islessequal #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_RELATIONAL_ISLESSEQUAL_H__ diff --git a/libclc/opencl/include/clc/opencl/relational/islessgreater.h b/libclc/opencl/include/clc/opencl/relational/islessgreater.h index 15a1eb557753..eefd2c41b346 100644 --- a/libclc/opencl/include/clc/opencl/relational/islessgreater.h +++ b/libclc/opencl/include/clc/opencl/relational/islessgreater.h @@ -11,11 +11,11 @@ #include -#define FUNCTION islessgreater +#define __CLC_FUNCTION islessgreater #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_RELATIONAL_ISLESSGREATER_H__ diff --git a/libclc/opencl/include/clc/opencl/relational/isnormal.h b/libclc/opencl/include/clc/opencl/relational/isnormal.h index bbb06aad0df2..29ef762174f0 100644 --- a/libclc/opencl/include/clc/opencl/relational/isnormal.h +++ b/libclc/opencl/include/clc/opencl/relational/isnormal.h @@ -11,11 +11,11 @@ #include -#define FUNCTION isnormal +#define __CLC_FUNCTION isnormal #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_RELATIONAL_ISNORMAL_H__ diff --git a/libclc/opencl/include/clc/opencl/relational/isnotequal.h b/libclc/opencl/include/clc/opencl/relational/isnotequal.h index c13aca8ef4be..6670b9946d18 100644 --- a/libclc/opencl/include/clc/opencl/relational/isnotequal.h +++ b/libclc/opencl/include/clc/opencl/relational/isnotequal.h @@ -11,11 +11,11 @@ #include 
-#define FUNCTION isnotequal +#define __CLC_FUNCTION isnotequal #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_RELATIONAL_ISNOTEQUAL_H__ diff --git a/libclc/opencl/include/clc/opencl/relational/isordered.h b/libclc/opencl/include/clc/opencl/relational/isordered.h index ea4ba3fa6fe8..09531ed86721 100644 --- a/libclc/opencl/include/clc/opencl/relational/isordered.h +++ b/libclc/opencl/include/clc/opencl/relational/isordered.h @@ -11,11 +11,11 @@ #include -#define FUNCTION isordered +#define __CLC_FUNCTION isordered #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_RELATIONAL_ISORDERED_H__ diff --git a/libclc/opencl/include/clc/opencl/relational/isunordered.h b/libclc/opencl/include/clc/opencl/relational/isunordered.h index 76bf85604d1c..d1f6dabbe20c 100644 --- a/libclc/opencl/include/clc/opencl/relational/isunordered.h +++ b/libclc/opencl/include/clc/opencl/relational/isunordered.h @@ -11,11 +11,11 @@ #include -#define FUNCTION isunordered +#define __CLC_FUNCTION isunordered #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_RELATIONAL_ISUNORDERED_H__ diff --git a/libclc/opencl/include/clc/opencl/relational/signbit.h b/libclc/opencl/include/clc/opencl/relational/signbit.h index 6ad6595c7e29..fff6f33508fa 100644 --- a/libclc/opencl/include/clc/opencl/relational/signbit.h +++ b/libclc/opencl/include/clc/opencl/relational/signbit.h @@ -11,11 +11,11 @@ #include -#define FUNCTION signbit +#define __CLC_FUNCTION signbit #define __CLC_BODY #include -#undef FUNCTION +#undef __CLC_FUNCTION #endif // __CLC_OPENCL_RELATIONAL_SIGNBIT_H__ diff --git a/libclc/opencl/lib/clspv/math/fma.cl b/libclc/opencl/lib/clspv/math/fma.cl index 83504f63772c..0e328903ba26 100644 --- a/libclc/opencl/lib/clspv/math/fma.cl +++ b/libclc/opencl/lib/clspv/math/fma.cl @@ -9,9 +9,9 @@ #include #include -#define __FLOAT_ONLY -#define FUNCTION fma -#define __IMPL_FUNCTION(x) 
__clc_sw_fma +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION fma +#define __CLC_IMPL_FUNCTION(x) __clc_sw_fma #define __CLC_BODY #include diff --git a/libclc/opencl/lib/clspv/shared/vstore_half.cl b/libclc/opencl/lib/clspv/shared/vstore_half.cl index df5f30a711df..66973eb9ac4d 100644 --- a/libclc/opencl/lib/clspv/shared/vstore_half.cl +++ b/libclc/opencl/lib/clspv/shared/vstore_half.cl @@ -18,37 +18,37 @@ #pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable -#define ROUND_VEC1(out, in, ROUNDF) out = ROUNDF(in); -#define ROUND_VEC2(out, in, ROUNDF) \ - ROUND_VEC1(out.lo, in.lo, ROUNDF); \ - ROUND_VEC1(out.hi, in.hi, ROUNDF); -#define ROUND_VEC3(out, in, ROUNDF) \ - ROUND_VEC1(out.s0, in.s0, ROUNDF); \ - ROUND_VEC1(out.s1, in.s1, ROUNDF); \ - ROUND_VEC1(out.s2, in.s2, ROUNDF); -#define ROUND_VEC4(out, in, ROUNDF) \ - ROUND_VEC2(out.lo, in.lo, ROUNDF); \ - ROUND_VEC2(out.hi, in.hi, ROUNDF); -#define ROUND_VEC8(out, in, ROUNDF) \ - ROUND_VEC4(out.lo, in.lo, ROUNDF); \ - ROUND_VEC4(out.hi, in.hi, ROUNDF); -#define ROUND_VEC16(out, in, ROUNDF) \ - ROUND_VEC8(out.lo, in.lo, ROUNDF); \ - ROUND_VEC8(out.hi, in.hi, ROUNDF); +#define __CLC_ROUND_VEC1(out, in, ROUNDF) out = ROUNDF(in); +#define __CLC_ROUND_VEC2(out, in, ROUNDF) \ + __CLC_ROUND_VEC1(out.lo, in.lo, ROUNDF); \ + __CLC_ROUND_VEC1(out.hi, in.hi, ROUNDF); +#define __CLC_ROUND_VEC3(out, in, ROUNDF) \ + __CLC_ROUND_VEC1(out.s0, in.s0, ROUNDF); \ + __CLC_ROUND_VEC1(out.s1, in.s1, ROUNDF); \ + __CLC_ROUND_VEC1(out.s2, in.s2, ROUNDF); +#define __CLC_ROUND_VEC4(out, in, ROUNDF) \ + __CLC_ROUND_VEC2(out.lo, in.lo, ROUNDF); \ + __CLC_ROUND_VEC2(out.hi, in.hi, ROUNDF); +#define __CLC_ROUND_VEC8(out, in, ROUNDF) \ + __CLC_ROUND_VEC4(out.lo, in.lo, ROUNDF); \ + __CLC_ROUND_VEC4(out.hi, in.hi, ROUNDF); +#define __CLC_ROUND_VEC16(out, in, ROUNDF) \ + __CLC_ROUND_VEC8(out.lo, in.lo, ROUNDF); \ + __CLC_ROUND_VEC8(out.hi, in.hi, ROUNDF); -#define __FUNC(SUFFIX, VEC_SIZE, TYPE, AS, ROUNDF) \ +#define 
__CLC_XFUNC_IMPL(SUFFIX, VEC_SIZE, TYPE, AS, ROUNDF) \ void _CLC_OVERLOAD vstore_half_##VEC_SIZE(TYPE, size_t, AS half *); \ _CLC_OVERLOAD _CLC_DEF void vstore_half##SUFFIX(TYPE vec, size_t offset, \ AS half *mem) { \ TYPE rounded_vec; \ - ROUND_VEC##VEC_SIZE(rounded_vec, vec, ROUNDF); \ + __CLC_ROUND_VEC##VEC_SIZE(rounded_vec, vec, ROUNDF); \ vstore_half_##VEC_SIZE(rounded_vec, offset, mem); \ } \ void _CLC_OVERLOAD vstorea_half_##VEC_SIZE(TYPE, size_t, AS half *); \ _CLC_OVERLOAD _CLC_DEF void vstorea_half##SUFFIX(TYPE vec, size_t offset, \ AS half *mem) { \ TYPE rounded_vec; \ - ROUND_VEC##VEC_SIZE(rounded_vec, vec, ROUNDF); \ + __CLC_ROUND_VEC##VEC_SIZE(rounded_vec, vec, ROUNDF); \ vstorea_half_##VEC_SIZE(rounded_vec, offset, mem); \ } @@ -134,17 +134,18 @@ _CLC_DEF _CLC_OVERLOAD float __clc_rte(float x) { return roundup ? __clc_rti(x) : __clc_rtz(x); } -#define __XFUNC(SUFFIX, VEC_SIZE, TYPE, AS) \ - __FUNC(SUFFIX, VEC_SIZE, TYPE, AS, __clc_rte) \ - __FUNC(SUFFIX##_rtz, VEC_SIZE, TYPE, AS, __clc_rtz) \ - __FUNC(SUFFIX##_rtn, VEC_SIZE, TYPE, AS, __clc_rtn) \ - __FUNC(SUFFIX##_rtp, VEC_SIZE, TYPE, AS, __clc_rtp) \ - __FUNC(SUFFIX##_rte, VEC_SIZE, TYPE, AS, __clc_rte) +#define __CLC_XFUNC(SUFFIX, VEC_SIZE, TYPE, AS) \ + __CLC_XFUNC_IMPL(SUFFIX, VEC_SIZE, TYPE, AS, __clc_rte) \ + __CLC_XFUNC_IMPL(SUFFIX##_rtz, VEC_SIZE, TYPE, AS, __clc_rtz) \ + __CLC_XFUNC_IMPL(SUFFIX##_rtn, VEC_SIZE, TYPE, AS, __clc_rtn) \ + __CLC_XFUNC_IMPL(SUFFIX##_rtp, VEC_SIZE, TYPE, AS, __clc_rtp) \ + __CLC_XFUNC_IMPL(SUFFIX##_rte, VEC_SIZE, TYPE, AS, __clc_rte) -#define FUNC(SUFFIX, VEC_SIZE, TYPE, AS) __XFUNC(SUFFIX, VEC_SIZE, TYPE, AS) +#define __CLC_FUNC(SUFFIX, VEC_SIZE, TYPE, AS) \ + __CLC_XFUNC(SUFFIX, VEC_SIZE, TYPE, AS) #define __CLC_BODY "vstore_half.inc" #include -#undef FUNC -#undef __XFUNC -#undef __FUNC +#undef __CLC_FUNC +#undef __CLC_XFUNC +#undef __CLC_XFUNC_IMPL diff --git a/libclc/opencl/lib/clspv/shared/vstore_half.inc b/libclc/opencl/lib/clspv/shared/vstore_half.inc 
index f2c02748246a..d78fbb80976b 100644 --- a/libclc/opencl/lib/clspv/shared/vstore_half.inc +++ b/libclc/opencl/lib/clspv/shared/vstore_half.inc @@ -10,20 +10,20 @@ #if __CLC_FPSIZE == 32 #ifndef __CLC_SCALAR -FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_GENTYPE, __private); -FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_GENTYPE, __local); -FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_GENTYPE, __global); +__CLC_FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_GENTYPE, __private); +__CLC_FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_GENTYPE, __local); +__CLC_FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_GENTYPE, __global); #if _CLC_DISTINCT_GENERIC_AS_SUPPORTED -FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_GENTYPE, __generic); +__CLC_FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_GENTYPE, __generic); #endif #undef __CLC_OFFSET #else -FUNC(, 1, __CLC_GENTYPE, __private); -FUNC(, 1, __CLC_GENTYPE, __local); -FUNC(, 1, __CLC_GENTYPE, __global); +__CLC_FUNC(, 1, __CLC_GENTYPE, __private); +__CLC_FUNC(, 1, __CLC_GENTYPE, __local); +__CLC_FUNC(, 1, __CLC_GENTYPE, __global); #if _CLC_DISTINCT_GENERIC_AS_SUPPORTED -FUNC(, 1, __CLC_GENTYPE, __generic); +__CLC_FUNC(, 1, __CLC_GENTYPE, __generic); #endif #endif #endif diff --git a/libclc/opencl/lib/generic/async/async_work_group_strided_copy.inc b/libclc/opencl/lib/generic/async/async_work_group_strided_copy.inc index c9c32a7d9431..f421298e2ca0 100644 --- a/libclc/opencl/lib/generic/async/async_work_group_strided_copy.inc +++ b/libclc/opencl/lib/generic/async/async_work_group_strided_copy.inc @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#define STRIDED_COPY(dst, src, num_gentypes, dst_stride, src_stride) \ +#define __CLC_STRIDED_COPY(dst, src, num_gentypes, dst_stride, src_stride) \ size_t size = get_local_size(0) * get_local_size(1) * get_local_size(2); \ size_t id = (get_local_size(0) * get_local_size(1) * get_local_id(2)) + \ (get_local_size(0) * get_local_id(1)) + get_local_id(0); \ @@ -20,7 +20,7 @@ 
_CLC_OVERLOAD _CLC_DEF event_t async_work_group_strided_copy( local __CLC_GENTYPE *dst, const global __CLC_GENTYPE *src, size_t num_gentypes, size_t src_stride, event_t event) { - STRIDED_COPY(dst, src, num_gentypes, 1, src_stride); + __CLC_STRIDED_COPY(dst, src, num_gentypes, 1, src_stride); return event; } @@ -28,6 +28,6 @@ _CLC_OVERLOAD _CLC_DEF event_t async_work_group_strided_copy( global __CLC_GENTYPE *dst, const local __CLC_GENTYPE *src, size_t num_gentypes, size_t dst_stride, event_t event) { - STRIDED_COPY(dst, src, num_gentypes, dst_stride, 1); + __CLC_STRIDED_COPY(dst, src, num_gentypes, dst_stride, 1); return event; } diff --git a/libclc/opencl/lib/generic/atomic/atom_add.cl b/libclc/opencl/lib/generic/atomic/atom_add.cl index 65d7ad595ace..08fb3fecd5bc 100644 --- a/libclc/opencl/lib/generic/atomic/atom_add.cl +++ b/libclc/opencl/lib/generic/atomic/atom_add.cl @@ -23,15 +23,15 @@ #ifdef cl_khr_int64_base_atomics -#define IMPL(AS, TYPE) \ +#define __CLC_IMPL(AS, TYPE) \ _CLC_OVERLOAD _CLC_DEF TYPE atom_add(volatile AS TYPE *p, TYPE val) { \ return __sync_fetch_and_add_8(p, val); \ } -IMPL(global, long) -IMPL(global, unsigned long) -IMPL(local, long) -IMPL(local, unsigned long) -#undef IMPL +__CLC_IMPL(global, long) +__CLC_IMPL(global, unsigned long) +__CLC_IMPL(local, long) +__CLC_IMPL(local, unsigned long) +#undef __CLC_IMPL #endif // cl_khr_int64_base_atomics diff --git a/libclc/opencl/lib/generic/atomic/atom_and.cl b/libclc/opencl/lib/generic/atomic/atom_and.cl index 371e5f0aa4d0..1dddd8e72f30 100644 --- a/libclc/opencl/lib/generic/atomic/atom_and.cl +++ b/libclc/opencl/lib/generic/atomic/atom_and.cl @@ -23,15 +23,15 @@ #ifdef cl_khr_int64_extended_atomics -#define IMPL(AS, TYPE) \ +#define __CLC_IMPL(AS, TYPE) \ _CLC_OVERLOAD _CLC_DEF TYPE atom_and(volatile AS TYPE *p, TYPE val) { \ return __sync_fetch_and_and_8(p, val); \ } -IMPL(global, long) -IMPL(global, unsigned long) -IMPL(local, long) -IMPL(local, unsigned long) -#undef IMPL 
+__CLC_IMPL(global, long) +__CLC_IMPL(global, unsigned long) +__CLC_IMPL(local, long) +__CLC_IMPL(local, unsigned long) +#undef __CLC_IMPL #endif // cl_khr_int64_extended_atomics diff --git a/libclc/opencl/lib/generic/atomic/atom_cmpxchg.cl b/libclc/opencl/lib/generic/atomic/atom_cmpxchg.cl index f129be9809e5..5ae6aa30a835 100644 --- a/libclc/opencl/lib/generic/atomic/atom_cmpxchg.cl +++ b/libclc/opencl/lib/generic/atomic/atom_cmpxchg.cl @@ -9,35 +9,35 @@ #include #include -#define IMPL(AS, TYPE) \ +#define __CLC_IMPL(AS, TYPE) \ _CLC_OVERLOAD _CLC_DEF TYPE atom_cmpxchg(volatile AS TYPE *p, TYPE cmp, \ TYPE val) { \ return atomic_cmpxchg(p, cmp, val); \ } #ifdef cl_khr_global_int32_base_atomics -IMPL(global, int) -IMPL(global, unsigned int) +__CLC_IMPL(global, int) +__CLC_IMPL(global, unsigned int) #endif // cl_khr_global_int32_base_atomics #ifdef cl_khr_local_int32_base_atomics -IMPL(local, int) -IMPL(local, unsigned int) +__CLC_IMPL(local, int) +__CLC_IMPL(local, unsigned int) #endif // cl_khr_local_int32_base_atomics -#undef IMPL +#undef __CLC_IMPL #ifdef cl_khr_int64_base_atomics -#define IMPL(AS, TYPE) \ +#define __CLC_IMPL(AS, TYPE) \ _CLC_OVERLOAD _CLC_DEF TYPE atom_cmpxchg(volatile AS TYPE *p, TYPE cmp, \ TYPE val) { \ return __sync_val_compare_and_swap_8(p, cmp, val); \ } -IMPL(global, long) -IMPL(global, unsigned long) -IMPL(local, long) -IMPL(local, unsigned long) -#undef IMPL +__CLC_IMPL(global, long) +__CLC_IMPL(global, unsigned long) +__CLC_IMPL(local, long) +__CLC_IMPL(local, unsigned long) +#undef __CLC_IMPL #endif // cl_khr_int64_base_atomics diff --git a/libclc/opencl/lib/generic/atomic/atom_dec.cl b/libclc/opencl/lib/generic/atomic/atom_dec.cl index 2df721cae365..af811042d307 100644 --- a/libclc/opencl/lib/generic/atomic/atom_dec.cl +++ b/libclc/opencl/lib/generic/atomic/atom_dec.cl @@ -10,33 +10,33 @@ #include #include -#define IMPL(AS, TYPE) \ +#define __CLC_IMPL(AS, TYPE) \ _CLC_OVERLOAD _CLC_DEF TYPE atom_dec(volatile AS TYPE *p) { \ return 
atomic_dec(p); \ } #ifdef cl_khr_global_int32_base_atomics -IMPL(global, int) -IMPL(global, unsigned int) +__CLC_IMPL(global, int) +__CLC_IMPL(global, unsigned int) #endif // cl_khr_global_int32_base_atomics #ifdef cl_khr_local_int32_base_atomics -IMPL(local, int) -IMPL(local, unsigned int) +__CLC_IMPL(local, int) +__CLC_IMPL(local, unsigned int) #endif // cl_khr_local_int32_base_atomics -#undef IMPL +#undef __CLC_IMPL #ifdef cl_khr_int64_base_atomics -#define IMPL(AS, TYPE) \ +#define __CLC_IMPL(AS, TYPE) \ _CLC_OVERLOAD _CLC_DEF TYPE atom_dec(volatile AS TYPE *p) { \ return atom_sub(p, (TYPE)1); \ } -IMPL(global, long) -IMPL(global, unsigned long) -IMPL(local, long) -IMPL(local, unsigned long) -#undef IMPL +__CLC_IMPL(global, long) +__CLC_IMPL(global, unsigned long) +__CLC_IMPL(local, long) +__CLC_IMPL(local, unsigned long) +#undef __CLC_IMPL #endif // cl_khr_int64_base_atomics diff --git a/libclc/opencl/lib/generic/atomic/atom_inc.cl b/libclc/opencl/lib/generic/atomic/atom_inc.cl index dc7699a47802..f881b3a3caa6 100644 --- a/libclc/opencl/lib/generic/atomic/atom_inc.cl +++ b/libclc/opencl/lib/generic/atomic/atom_inc.cl @@ -10,33 +10,33 @@ #include #include -#define IMPL(AS, TYPE) \ +#define __CLC_IMPL(AS, TYPE) \ _CLC_OVERLOAD _CLC_DEF TYPE atom_inc(volatile AS TYPE *p) { \ return atomic_inc(p); \ } #ifdef cl_khr_global_int32_base_atomics -IMPL(global, int) -IMPL(global, unsigned int) +__CLC_IMPL(global, int) +__CLC_IMPL(global, unsigned int) #endif // cl_khr_global_int32_base_atomics #ifdef cl_khr_local_int32_base_atomics -IMPL(local, int) -IMPL(local, unsigned int) +__CLC_IMPL(local, int) +__CLC_IMPL(local, unsigned int) #endif // cl_khr_local_int32_base_atomics -#undef IMPL +#undef __CLC_IMPL #ifdef cl_khr_int64_base_atomics -#define IMPL(AS, TYPE) \ +#define __CLC_IMPL(AS, TYPE) \ _CLC_OVERLOAD _CLC_DEF TYPE atom_inc(volatile AS TYPE *p) { \ return atom_add(p, (TYPE)1); \ } -IMPL(global, long) -IMPL(global, unsigned long) -IMPL(local, long) -IMPL(local, 
unsigned long) -#undef IMPL +__CLC_IMPL(global, long) +__CLC_IMPL(global, unsigned long) +__CLC_IMPL(local, long) +__CLC_IMPL(local, unsigned long) +#undef __CLC_IMPL #endif // cl_khr_int64_base_atomics diff --git a/libclc/opencl/lib/generic/atomic/atom_max.cl b/libclc/opencl/lib/generic/atomic/atom_max.cl index 2542191d04f5..83b532ac19a1 100644 --- a/libclc/opencl/lib/generic/atomic/atom_max.cl +++ b/libclc/opencl/lib/generic/atomic/atom_max.cl @@ -31,15 +31,15 @@ unsigned long __clc__sync_fetch_and_umax_global_8(volatile global unsigned long *, unsigned long); -#define IMPL(AS, TYPE, OP) \ +#define __CLC_IMPL(AS, TYPE, OP) \ _CLC_OVERLOAD _CLC_DEF TYPE atom_max(volatile AS TYPE *p, TYPE val) { \ return __clc__sync_fetch_and_##OP##_##AS##_8(p, val); \ } -IMPL(global, long, max) -IMPL(global, unsigned long, umax) -IMPL(local, long, max) -IMPL(local, unsigned long, umax) -#undef IMPL +__CLC_IMPL(global, long, max) +__CLC_IMPL(global, unsigned long, umax) +__CLC_IMPL(local, long, max) +__CLC_IMPL(local, unsigned long, umax) +#undef __CLC_IMPL #endif // cl_khr_int64_extended_atomics diff --git a/libclc/opencl/lib/generic/atomic/atom_min.cl b/libclc/opencl/lib/generic/atomic/atom_min.cl index 4e62804824d8..b52e34769cdd 100644 --- a/libclc/opencl/lib/generic/atomic/atom_min.cl +++ b/libclc/opencl/lib/generic/atomic/atom_min.cl @@ -31,15 +31,15 @@ unsigned long __clc__sync_fetch_and_umin_global_8(volatile global unsigned long *, unsigned long); -#define IMPL(AS, TYPE, OP) \ +#define __CLC_IMPL(AS, TYPE, OP) \ _CLC_OVERLOAD _CLC_DEF TYPE atom_min(volatile AS TYPE *p, TYPE val) { \ return __clc__sync_fetch_and_##OP##_##AS##_8(p, val); \ } -IMPL(global, long, min) -IMPL(global, unsigned long, umin) -IMPL(local, long, min) -IMPL(local, unsigned long, umin) -#undef IMPL +__CLC_IMPL(global, long, min) +__CLC_IMPL(global, unsigned long, umin) +__CLC_IMPL(local, long, min) +__CLC_IMPL(local, unsigned long, umin) +#undef __CLC_IMPL #endif // cl_khr_int64_extended_atomics diff 
--git a/libclc/opencl/lib/generic/atomic/atom_or.cl b/libclc/opencl/lib/generic/atomic/atom_or.cl index 30ad8e52ca23..fa9737f5f28e 100644 --- a/libclc/opencl/lib/generic/atomic/atom_or.cl +++ b/libclc/opencl/lib/generic/atomic/atom_or.cl @@ -23,15 +23,15 @@ #ifdef cl_khr_int64_extended_atomics -#define IMPL(AS, TYPE) \ +#define __CLC_IMPL(AS, TYPE) \ _CLC_OVERLOAD _CLC_DEF TYPE atom_or(volatile AS TYPE *p, TYPE val) { \ return __sync_fetch_and_or_8(p, val); \ } -IMPL(global, long) -IMPL(global, unsigned long) -IMPL(local, long) -IMPL(local, unsigned long) -#undef IMPL +__CLC_IMPL(global, long) +__CLC_IMPL(global, unsigned long) +__CLC_IMPL(local, long) +__CLC_IMPL(local, unsigned long) +#undef __CLC_IMPL #endif // cl_khr_int64_extended_atomics diff --git a/libclc/opencl/lib/generic/atomic/atom_sub.cl b/libclc/opencl/lib/generic/atomic/atom_sub.cl index f6ee94db7469..9a8acfa9116b 100644 --- a/libclc/opencl/lib/generic/atomic/atom_sub.cl +++ b/libclc/opencl/lib/generic/atomic/atom_sub.cl @@ -23,15 +23,15 @@ #ifdef cl_khr_int64_base_atomics -#define IMPL(AS, TYPE) \ +#define __CLC_IMPL(AS, TYPE) \ _CLC_OVERLOAD _CLC_DEF TYPE atom_sub(volatile AS TYPE *p, TYPE val) { \ return __sync_fetch_and_sub_8(p, val); \ } -IMPL(global, long) -IMPL(global, unsigned long) -IMPL(local, long) -IMPL(local, unsigned long) -#undef IMPL +__CLC_IMPL(global, long) +__CLC_IMPL(global, unsigned long) +__CLC_IMPL(local, long) +__CLC_IMPL(local, unsigned long) +#undef __CLC_IMPL #endif // cl_khr_int64_base_atomics diff --git a/libclc/opencl/lib/generic/atomic/atom_xchg.cl b/libclc/opencl/lib/generic/atomic/atom_xchg.cl index 3fa17c950a82..03f8a9c466c5 100644 --- a/libclc/opencl/lib/generic/atomic/atom_xchg.cl +++ b/libclc/opencl/lib/generic/atomic/atom_xchg.cl @@ -23,15 +23,15 @@ #ifdef cl_khr_int64_base_atomics -#define IMPL(AS, TYPE) \ +#define __CLC_IMPL(AS, TYPE) \ _CLC_OVERLOAD _CLC_DEF TYPE atom_xchg(volatile AS TYPE *p, TYPE val) { \ return __sync_swap_8(p, val); \ } -IMPL(global, long) 
-IMPL(global, unsigned long) -IMPL(local, long) -IMPL(local, unsigned long) -#undef IMPL +__CLC_IMPL(global, long) +__CLC_IMPL(global, unsigned long) +__CLC_IMPL(local, long) +__CLC_IMPL(local, unsigned long) +#undef __CLC_IMPL #endif // cl_khr_int64_base_atomics diff --git a/libclc/opencl/lib/generic/atomic/atom_xor.cl b/libclc/opencl/lib/generic/atomic/atom_xor.cl index ac08ef63b1a4..392a4b794c69 100644 --- a/libclc/opencl/lib/generic/atomic/atom_xor.cl +++ b/libclc/opencl/lib/generic/atomic/atom_xor.cl @@ -23,15 +23,15 @@ #ifdef cl_khr_int64_extended_atomics -#define IMPL(AS, TYPE) \ +#define __CLC_IMPL(AS, TYPE) \ _CLC_OVERLOAD _CLC_DEF TYPE atom_xor(volatile AS TYPE *p, TYPE val) { \ return __sync_fetch_and_xor_8(p, val); \ } -IMPL(global, long) -IMPL(global, unsigned long) -IMPL(local, long) -IMPL(local, unsigned long) -#undef IMPL +__CLC_IMPL(global, long) +__CLC_IMPL(global, unsigned long) +__CLC_IMPL(local, long) +__CLC_IMPL(local, unsigned long) +#undef __CLC_IMPL #endif // cl_khr_int64_extended_atomics diff --git a/libclc/opencl/lib/generic/atomic/atomic_add.cl b/libclc/opencl/lib/generic/atomic/atomic_add.cl index a0effced7dc6..d005c1dd6ac5 100644 --- a/libclc/opencl/lib/generic/atomic/atomic_add.cl +++ b/libclc/opencl/lib/generic/atomic/atomic_add.cl @@ -8,13 +8,13 @@ #include -#define IMPL(TYPE, AS) \ +#define __CLC_IMPL(TYPE, AS) \ _CLC_OVERLOAD _CLC_DEF TYPE atomic_add(volatile AS TYPE *p, TYPE val) { \ return __sync_fetch_and_add(p, val); \ } -IMPL(int, global) -IMPL(unsigned int, global) -IMPL(int, local) -IMPL(unsigned int, local) -#undef IMPL +__CLC_IMPL(int, global) +__CLC_IMPL(unsigned int, global) +__CLC_IMPL(int, local) +__CLC_IMPL(unsigned int, local) +#undef __CLC_IMPL diff --git a/libclc/opencl/lib/generic/atomic/atomic_and.cl b/libclc/opencl/lib/generic/atomic/atomic_and.cl index 629e6638d3bc..12558568b0e4 100644 --- a/libclc/opencl/lib/generic/atomic/atomic_and.cl +++ b/libclc/opencl/lib/generic/atomic/atomic_and.cl @@ -8,13 +8,13 @@ 
#include -#define IMPL(TYPE, AS) \ +#define __CLC_IMPL(TYPE, AS) \ _CLC_OVERLOAD _CLC_DEF TYPE atomic_and(volatile AS TYPE *p, TYPE val) { \ return __sync_fetch_and_and(p, val); \ } -IMPL(int, global) -IMPL(unsigned int, global) -IMPL(int, local) -IMPL(unsigned int, local) -#undef IMPL +__CLC_IMPL(int, global) +__CLC_IMPL(unsigned int, global) +__CLC_IMPL(int, local) +__CLC_IMPL(unsigned int, local) +#undef __CLC_IMPL diff --git a/libclc/opencl/lib/generic/atomic/atomic_cmpxchg.cl b/libclc/opencl/lib/generic/atomic/atomic_cmpxchg.cl index db0495b004c1..1045020a553f 100644 --- a/libclc/opencl/lib/generic/atomic/atomic_cmpxchg.cl +++ b/libclc/opencl/lib/generic/atomic/atomic_cmpxchg.cl @@ -8,14 +8,14 @@ #include -#define IMPL(TYPE, AS) \ +#define __CLC_IMPL(TYPE, AS) \ _CLC_OVERLOAD _CLC_DEF TYPE atomic_cmpxchg(volatile AS TYPE *p, TYPE cmp, \ TYPE val) { \ return __sync_val_compare_and_swap(p, cmp, val); \ } -IMPL(int, global) -IMPL(unsigned int, global) -IMPL(int, local) -IMPL(unsigned int, local) -#undef IMPL +__CLC_IMPL(int, global) +__CLC_IMPL(unsigned int, global) +__CLC_IMPL(int, local) +__CLC_IMPL(unsigned int, local) +#undef __CLC_IMPL diff --git a/libclc/opencl/lib/generic/atomic/atomic_compare_exchange_strong.cl b/libclc/opencl/lib/generic/atomic/atomic_compare_exchange_strong.cl index 422c03f29207..2c1f07d8ca48 100644 --- a/libclc/opencl/lib/generic/atomic/atomic_compare_exchange_strong.cl +++ b/libclc/opencl/lib/generic/atomic/atomic_compare_exchange_strong.cl @@ -12,7 +12,7 @@ #include #include -#define FUNCTION atomic_compare_exchange_strong +#define __CLC_FUNCTION atomic_compare_exchange_strong #define __CLC_COMPARE_EXCHANGE #define __CLC_BODY diff --git a/libclc/opencl/lib/generic/atomic/atomic_compare_exchange_weak.cl b/libclc/opencl/lib/generic/atomic/atomic_compare_exchange_weak.cl index 8a6b3c4f0110..69bdf37250f7 100644 --- a/libclc/opencl/lib/generic/atomic/atomic_compare_exchange_weak.cl +++ 
b/libclc/opencl/lib/generic/atomic/atomic_compare_exchange_weak.cl @@ -12,7 +12,7 @@ #include #include -#define FUNCTION atomic_compare_exchange_weak +#define __CLC_FUNCTION atomic_compare_exchange_weak #define __CLC_COMPARE_EXCHANGE #define __CLC_BODY diff --git a/libclc/opencl/lib/generic/atomic/atomic_dec.cl b/libclc/opencl/lib/generic/atomic/atomic_dec.cl index 6de55bc0b984..ee3826772336 100644 --- a/libclc/opencl/lib/generic/atomic/atomic_dec.cl +++ b/libclc/opencl/lib/generic/atomic/atomic_dec.cl @@ -9,8 +9,8 @@ #include #include -#define FUNCTION atomic_dec -#define __IMPL_FUNCTION __clc_atomic_dec +#define __CLC_FUNCTION atomic_dec +#define __CLC_IMPL_FUNCTION __clc_atomic_dec #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/atomic/atomic_def.inc b/libclc/opencl/lib/generic/atomic/atomic_def.inc index ce192bf84493..a4ccab599088 100644 --- a/libclc/opencl/lib/generic/atomic/atomic_def.inc +++ b/libclc/opencl/lib/generic/atomic/atomic_def.inc @@ -10,37 +10,37 @@ #if defined(__opencl_c_fp64) && (defined(cl_khr_int64_base_atomics) && \ defined(cl_khr_int64_extended_atomics)) -#define HAVE_64_ATOMIC +#define __CLC_HAVE_64_ATOMIC #endif -#if defined(__CLC_FPSIZE) && (__CLC_FPSIZE < 64 || defined(HAVE_64_ATOMIC) -#define HAVE_FP_ATOMIC +#if defined(__CLC_FPSIZE) && (__CLC_FPSIZE < 64 || defined(__CLC_HAVE_64_ATOMIC) +#define __CLC_HAVE_FP_ATOMIC #endif #if defined(__CLC_GENSIZE) && \ ((__CLC_GENSIZE == 32) || \ - (__CLC_GENSIZE == 64 && defined(HAVE_64_ATOMIC))) -#define HAVE_INT_ATOMIC + (__CLC_GENSIZE == 64 && defined(__CLC_HAVE_64_ATOMIC))) +#define __CLC_HAVE_INT_ATOMIC #endif -#if defined(HAVE_FP_ATOMIC) || defined(HAVE_INT_ATOMIC) +#if defined(__CLC_HAVE_FP_ATOMIC) || defined(__CLC_HAVE_INT_ATOMIC) #define __CLC_ATOMIC_GENTYPE __CLC_XCONCAT(atomic_, __CLC_GENTYPE) #ifdef __CLC_NO_VALUE_ARG #define __CLC_DEFINE_ATOMIC(ADDRSPACE) \ - _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE FUNCTION( \ + _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __CLC_FUNCTION( \ 
volatile ADDRSPACE __CLC_ATOMIC_GENTYPE *Ptr) { \ - return __IMPL_FUNCTION((volatile ADDRSPACE __CLC_GENTYPE *)Ptr, \ - __ATOMIC_SEQ_CST, __MEMORY_SCOPE_DEVICE); \ + return __CLC_IMPL_FUNCTION((volatile ADDRSPACE __CLC_GENTYPE *)Ptr, \ + __ATOMIC_SEQ_CST, __MEMORY_SCOPE_DEVICE); \ } #elif defined(__CLC_RETURN_VOID) #define __CLC_DEFINE_ATOMIC(ADDRSPACE) \ - _CLC_OVERLOAD _CLC_DEF void FUNCTION( \ + _CLC_OVERLOAD _CLC_DEF void __CLC_FUNCTION( \ volatile ADDRSPACE __CLC_ATOMIC_GENTYPE *Ptr, __CLC_GENTYPE Value) { \ - __IMPL_FUNCTION((volatile ADDRSPACE __CLC_GENTYPE *)Ptr, Value, \ - __ATOMIC_SEQ_CST, __MEMORY_SCOPE_DEVICE); \ + __CLC_IMPL_FUNCTION((volatile ADDRSPACE __CLC_GENTYPE *)Ptr, Value, \ + __ATOMIC_SEQ_CST, __MEMORY_SCOPE_DEVICE); \ } #elif defined(__CLC_COMPARE_EXCHANGE) #define __CLC_DEFINE_ATOMIC(ADDRSPACE) \ - _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE FUNCTION( \ + _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __CLC_FUNCTION( \ volatile ADDRSPACE __CLC_ATOMIC_GENTYPE *Ptr, \ ADDRSPACE __CLC_GENTYPE *Expected, __CLC_GENTYPE Desired) { \ __CLC_GENTYPE Comparator = *Expected; \ @@ -55,10 +55,10 @@ } #else #define __CLC_DEFINE_ATOMIC(ADDRSPACE) \ - _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE FUNCTION( \ + _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __CLC_FUNCTION( \ volatile ADDRSPACE __CLC_ATOMIC_GENTYPE *Ptr, __CLC_GENTYPE Value) { \ - return __IMPL_FUNCTION((volatile ADDRSPACE __CLC_GENTYPE *)Ptr, Value, \ - __ATOMIC_SEQ_CST, __MEMORY_SCOPE_DEVICE); \ + return __CLC_IMPL_FUNCTION((volatile ADDRSPACE __CLC_GENTYPE *)Ptr, Value, \ + __ATOMIC_SEQ_CST, __MEMORY_SCOPE_DEVICE); \ } #endif @@ -70,10 +70,10 @@ __CLC_DEFINE_ATOMIC() #undef __CLC_DEFINE_ATOMIC -#endif // HAVE_FP_ATOMIC || HAVE_INT_ATOMIC +#endif // __CLC_HAVE_FP_ATOMIC || __CLC_HAVE_INT_ATOMIC -#undef HAVE_INT_ATOMIC -#undef HAVE_FP_ATOMIC -#undef HAVE_64_ATOMIC +#undef __CLC_HAVE_INT_ATOMIC +#undef __CLC_HAVE_FP_ATOMIC +#undef __CLC_HAVE_64_ATOMIC #endif // __CLC_SCALAR diff --git 
a/libclc/opencl/lib/generic/atomic/atomic_exchange.cl b/libclc/opencl/lib/generic/atomic/atomic_exchange.cl index 6dae6c0a7759..5f7e2fa593e3 100644 --- a/libclc/opencl/lib/generic/atomic/atomic_exchange.cl +++ b/libclc/opencl/lib/generic/atomic/atomic_exchange.cl @@ -12,8 +12,8 @@ #include #include -#define FUNCTION atomic_exchange -#define __IMPL_FUNCTION __clc_atomic_exchange +#define __CLC_FUNCTION atomic_exchange +#define __CLC_IMPL_FUNCTION __clc_atomic_exchange #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/atomic/atomic_fetch_add.cl b/libclc/opencl/lib/generic/atomic/atomic_fetch_add.cl index bbaa1c2b0dac..0362ff89d1d7 100644 --- a/libclc/opencl/lib/generic/atomic/atomic_fetch_add.cl +++ b/libclc/opencl/lib/generic/atomic/atomic_fetch_add.cl @@ -12,8 +12,8 @@ #include #include -#define FUNCTION atomic_fetch_add -#define __IMPL_FUNCTION __clc_atomic_fetch_add +#define __CLC_FUNCTION atomic_fetch_add +#define __CLC_IMPL_FUNCTION __clc_atomic_fetch_add #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/atomic/atomic_fetch_and.cl b/libclc/opencl/lib/generic/atomic/atomic_fetch_and.cl index 73925844c935..a1796f20c6e4 100644 --- a/libclc/opencl/lib/generic/atomic/atomic_fetch_and.cl +++ b/libclc/opencl/lib/generic/atomic/atomic_fetch_and.cl @@ -12,8 +12,8 @@ #include #include -#define FUNCTION atomic_fetch_and -#define __IMPL_FUNCTION __clc_atomic_fetch_and +#define __CLC_FUNCTION atomic_fetch_and +#define __CLC_IMPL_FUNCTION __clc_atomic_fetch_and #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/atomic/atomic_fetch_max.cl b/libclc/opencl/lib/generic/atomic/atomic_fetch_max.cl index 8c8ce11cc575..03b5d1d8ae7b 100644 --- a/libclc/opencl/lib/generic/atomic/atomic_fetch_max.cl +++ b/libclc/opencl/lib/generic/atomic/atomic_fetch_max.cl @@ -12,8 +12,8 @@ #include #include -#define FUNCTION atomic_fetch_max -#define __IMPL_FUNCTION __clc_atomic_fetch_max +#define __CLC_FUNCTION atomic_fetch_max +#define 
__CLC_IMPL_FUNCTION __clc_atomic_fetch_max #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/atomic/atomic_fetch_min.cl b/libclc/opencl/lib/generic/atomic/atomic_fetch_min.cl index 550459cee32d..60ffeff04cc6 100644 --- a/libclc/opencl/lib/generic/atomic/atomic_fetch_min.cl +++ b/libclc/opencl/lib/generic/atomic/atomic_fetch_min.cl @@ -12,8 +12,8 @@ #include #include -#define FUNCTION atomic_fetch_min -#define __IMPL_FUNCTION __clc_atomic_fetch_min +#define __CLC_FUNCTION atomic_fetch_min +#define __CLC_IMPL_FUNCTION __clc_atomic_fetch_min #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/atomic/atomic_fetch_or.cl b/libclc/opencl/lib/generic/atomic/atomic_fetch_or.cl index 2606ff3c9967..8f4100bb150e 100644 --- a/libclc/opencl/lib/generic/atomic/atomic_fetch_or.cl +++ b/libclc/opencl/lib/generic/atomic/atomic_fetch_or.cl @@ -12,8 +12,8 @@ #include #include -#define FUNCTION atomic_fetch_or -#define __IMPL_FUNCTION __clc_atomic_fetch_or +#define __CLC_FUNCTION atomic_fetch_or +#define __CLC_IMPL_FUNCTION __clc_atomic_fetch_or #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/atomic/atomic_fetch_sub.cl b/libclc/opencl/lib/generic/atomic/atomic_fetch_sub.cl index 33772233bebe..ecb5b4315ee8 100644 --- a/libclc/opencl/lib/generic/atomic/atomic_fetch_sub.cl +++ b/libclc/opencl/lib/generic/atomic/atomic_fetch_sub.cl @@ -12,8 +12,8 @@ #include #include -#define FUNCTION atomic_fetch_sub -#define __IMPL_FUNCTION __clc_atomic_fetch_sub +#define __CLC_FUNCTION atomic_fetch_sub +#define __CLC_IMPL_FUNCTION __clc_atomic_fetch_sub #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/atomic/atomic_fetch_xor.cl b/libclc/opencl/lib/generic/atomic/atomic_fetch_xor.cl index 6f6503e588b6..c49a55820c8d 100644 --- a/libclc/opencl/lib/generic/atomic/atomic_fetch_xor.cl +++ b/libclc/opencl/lib/generic/atomic/atomic_fetch_xor.cl @@ -12,8 +12,8 @@ #include #include -#define FUNCTION atomic_fetch_xor -#define __IMPL_FUNCTION 
__clc_atomic_fetch_xor +#define __CLC_FUNCTION atomic_fetch_xor +#define __CLC_IMPL_FUNCTION __clc_atomic_fetch_xor #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/atomic/atomic_inc.cl b/libclc/opencl/lib/generic/atomic/atomic_inc.cl index a160b2e2370f..7cf82b25a0d4 100644 --- a/libclc/opencl/lib/generic/atomic/atomic_inc.cl +++ b/libclc/opencl/lib/generic/atomic/atomic_inc.cl @@ -9,8 +9,8 @@ #include #include -#define FUNCTION atomic_inc -#define __IMPL_FUNCTION __clc_atomic_inc +#define __CLC_FUNCTION atomic_inc +#define __CLC_IMPL_FUNCTION __clc_atomic_inc #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/atomic/atomic_inc_dec.inc b/libclc/opencl/lib/generic/atomic/atomic_inc_dec.inc index 0bcf300dd284..03eb5d1b3305 100644 --- a/libclc/opencl/lib/generic/atomic/atomic_inc_dec.inc +++ b/libclc/opencl/lib/generic/atomic/atomic_inc_dec.inc @@ -11,9 +11,9 @@ #if __CLC_GENSIZE == 32 #define __CLC_DEFINE_ATOMIC(ADDRSPACE) \ - _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE FUNCTION( \ + _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __CLC_FUNCTION( \ volatile ADDRSPACE __CLC_GENTYPE *Ptr) { \ - return __IMPL_FUNCTION(Ptr, __ATOMIC_SEQ_CST, __MEMORY_SCOPE_DEVICE); \ + return __CLC_IMPL_FUNCTION(Ptr, __ATOMIC_SEQ_CST, __MEMORY_SCOPE_DEVICE); \ } __CLC_DEFINE_ATOMIC(global) diff --git a/libclc/opencl/lib/generic/atomic/atomic_load.cl b/libclc/opencl/lib/generic/atomic/atomic_load.cl index 459265473a8c..e904330be006 100644 --- a/libclc/opencl/lib/generic/atomic/atomic_load.cl +++ b/libclc/opencl/lib/generic/atomic/atomic_load.cl @@ -12,8 +12,8 @@ #include #include -#define FUNCTION atomic_load -#define __IMPL_FUNCTION __clc_atomic_load +#define __CLC_FUNCTION atomic_load +#define __CLC_IMPL_FUNCTION __clc_atomic_load #define __CLC_NO_VALUE_ARG #define __CLC_BODY diff --git a/libclc/opencl/lib/generic/atomic/atomic_max.cl b/libclc/opencl/lib/generic/atomic/atomic_max.cl index ae929f0523dc..aa482a8f4639 100644 --- 
a/libclc/opencl/lib/generic/atomic/atomic_max.cl +++ b/libclc/opencl/lib/generic/atomic/atomic_max.cl @@ -8,13 +8,13 @@ #include -#define IMPL(TYPE, AS, OP) \ +#define __CLC_IMPL(TYPE, AS, OP) \ _CLC_OVERLOAD _CLC_DEF TYPE atomic_max(volatile AS TYPE *p, TYPE val) { \ return __sync_fetch_and_##OP(p, val); \ } -IMPL(int, global, max) -IMPL(unsigned int, global, umax) -IMPL(int, local, max) -IMPL(unsigned int, local, umax) -#undef IMPL +__CLC_IMPL(int, global, max) +__CLC_IMPL(unsigned int, global, umax) +__CLC_IMPL(int, local, max) +__CLC_IMPL(unsigned int, local, umax) +#undef __CLC_IMPL diff --git a/libclc/opencl/lib/generic/atomic/atomic_min.cl b/libclc/opencl/lib/generic/atomic/atomic_min.cl index c7ebe71ee829..7f39e9431684 100644 --- a/libclc/opencl/lib/generic/atomic/atomic_min.cl +++ b/libclc/opencl/lib/generic/atomic/atomic_min.cl @@ -8,13 +8,13 @@ #include -#define IMPL(TYPE, AS, OP) \ +#define __CLC_IMPL(TYPE, AS, OP) \ _CLC_OVERLOAD _CLC_DEF TYPE atomic_min(volatile AS TYPE *p, TYPE val) { \ return __sync_fetch_and_##OP(p, val); \ } -IMPL(int, global, min) -IMPL(unsigned int, global, umin) -IMPL(int, local, min) -IMPL(unsigned int, local, umin) -#undef IMPL +__CLC_IMPL(int, global, min) +__CLC_IMPL(unsigned int, global, umin) +__CLC_IMPL(int, local, min) +__CLC_IMPL(unsigned int, local, umin) +#undef __CLC_IMPL diff --git a/libclc/opencl/lib/generic/atomic/atomic_or.cl b/libclc/opencl/lib/generic/atomic/atomic_or.cl index 45fb86568965..ad14cd217855 100644 --- a/libclc/opencl/lib/generic/atomic/atomic_or.cl +++ b/libclc/opencl/lib/generic/atomic/atomic_or.cl @@ -8,13 +8,13 @@ #include -#define IMPL(TYPE, AS) \ +#define __CLC_IMPL(TYPE, AS) \ _CLC_OVERLOAD _CLC_DEF TYPE atomic_or(volatile AS TYPE *p, TYPE val) { \ return __sync_fetch_and_or(p, val); \ } -IMPL(int, global) -IMPL(unsigned int, global) -IMPL(int, local) -IMPL(unsigned int, local) -#undef IMPL +__CLC_IMPL(int, global) +__CLC_IMPL(unsigned int, global) +__CLC_IMPL(int, local) 
+__CLC_IMPL(unsigned int, local) +#undef __CLC_IMPL diff --git a/libclc/opencl/lib/generic/atomic/atomic_store.cl b/libclc/opencl/lib/generic/atomic/atomic_store.cl index 67f2c8457fc1..584e29ef99a5 100644 --- a/libclc/opencl/lib/generic/atomic/atomic_store.cl +++ b/libclc/opencl/lib/generic/atomic/atomic_store.cl @@ -12,8 +12,8 @@ #include #include -#define FUNCTION atomic_store -#define __IMPL_FUNCTION __clc_atomic_store +#define __CLC_FUNCTION atomic_store +#define __CLC_IMPL_FUNCTION __clc_atomic_store #define __CLC_RETURN_VOID #define __CLC_BODY diff --git a/libclc/opencl/lib/generic/atomic/atomic_sub.cl b/libclc/opencl/lib/generic/atomic/atomic_sub.cl index 74977f15155f..2e51c4c2ce02 100644 --- a/libclc/opencl/lib/generic/atomic/atomic_sub.cl +++ b/libclc/opencl/lib/generic/atomic/atomic_sub.cl @@ -8,13 +8,13 @@ #include -#define IMPL(TYPE, AS) \ +#define __CLC_IMPL(TYPE, AS) \ _CLC_OVERLOAD _CLC_DEF TYPE atomic_sub(volatile AS TYPE *p, TYPE val) { \ return __sync_fetch_and_sub(p, val); \ } -IMPL(int, global) -IMPL(unsigned int, global) -IMPL(int, local) -IMPL(unsigned int, local) -#undef IMPL +__CLC_IMPL(int, global) +__CLC_IMPL(unsigned int, global) +__CLC_IMPL(int, local) +__CLC_IMPL(unsigned int, local) +#undef __CLC_IMPL diff --git a/libclc/opencl/lib/generic/atomic/atomic_xchg.cl b/libclc/opencl/lib/generic/atomic/atomic_xchg.cl index 883132caf9fb..2585a5427392 100644 --- a/libclc/opencl/lib/generic/atomic/atomic_xchg.cl +++ b/libclc/opencl/lib/generic/atomic/atomic_xchg.cl @@ -17,13 +17,13 @@ _CLC_OVERLOAD _CLC_DEF float atomic_xchg(volatile local float *p, float val) { return as_float(atomic_xchg((volatile local uint *)p, as_uint(val))); } -#define IMPL(TYPE, AS) \ +#define __CLC_IMPL(TYPE, AS) \ _CLC_OVERLOAD _CLC_DEF TYPE atomic_xchg(volatile AS TYPE *p, TYPE val) { \ return __sync_swap_4(p, val); \ } -IMPL(int, global) -IMPL(unsigned int, global) -IMPL(int, local) -IMPL(unsigned int, local) -#undef IMPL +__CLC_IMPL(int, global) +__CLC_IMPL(unsigned 
int, global) +__CLC_IMPL(int, local) +__CLC_IMPL(unsigned int, local) +#undef __CLC_IMPL diff --git a/libclc/opencl/lib/generic/atomic/atomic_xor.cl b/libclc/opencl/lib/generic/atomic/atomic_xor.cl index 246752c25837..022813439746 100644 --- a/libclc/opencl/lib/generic/atomic/atomic_xor.cl +++ b/libclc/opencl/lib/generic/atomic/atomic_xor.cl @@ -8,13 +8,13 @@ #include -#define IMPL(TYPE, AS) \ +#define __CLC_IMPL(TYPE, AS) \ _CLC_OVERLOAD _CLC_DEF TYPE atomic_xor(volatile AS TYPE *p, TYPE val) { \ return __sync_fetch_and_xor(p, val); \ } -IMPL(int, global) -IMPL(unsigned int, global) -IMPL(int, local) -IMPL(unsigned int, local) -#undef IMPL +__CLC_IMPL(int, global) +__CLC_IMPL(unsigned int, global) +__CLC_IMPL(int, local) +__CLC_IMPL(unsigned int, local) +#undef __CLC_IMPL diff --git a/libclc/opencl/lib/generic/common/degrees.cl b/libclc/opencl/lib/generic/common/degrees.cl index d989fa6382ed..15b85af997b4 100644 --- a/libclc/opencl/lib/generic/common/degrees.cl +++ b/libclc/opencl/lib/generic/common/degrees.cl @@ -9,7 +9,7 @@ #include #include -#define FUNCTION degrees +#define __CLC_FUNCTION degrees #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/common/radians.cl b/libclc/opencl/lib/generic/common/radians.cl index 1da38920d43f..6550cf458a7a 100644 --- a/libclc/opencl/lib/generic/common/radians.cl +++ b/libclc/opencl/lib/generic/common/radians.cl @@ -9,7 +9,7 @@ #include #include -#define FUNCTION radians +#define __CLC_FUNCTION radians #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/common/sign.cl b/libclc/opencl/lib/generic/common/sign.cl index 9afcb7375e96..7add739be6a9 100644 --- a/libclc/opencl/lib/generic/common/sign.cl +++ b/libclc/opencl/lib/generic/common/sign.cl @@ -10,7 +10,7 @@ #include #include -#define FUNCTION sign +#define __CLC_FUNCTION sign #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/geometric/distance.cl b/libclc/opencl/lib/generic/geometric/distance.cl index 
54c78d3609ba..4d9f5403f8c2 100644 --- a/libclc/opencl/lib/generic/geometric/distance.cl +++ b/libclc/opencl/lib/generic/geometric/distance.cl @@ -9,6 +9,6 @@ #include #include -#define FUNCTION distance +#define __CLC_FUNCTION distance #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/geometric/dot.cl b/libclc/opencl/lib/generic/geometric/dot.cl index 05425aafb040..fc207c8fc155 100644 --- a/libclc/opencl/lib/generic/geometric/dot.cl +++ b/libclc/opencl/lib/generic/geometric/dot.cl @@ -9,6 +9,6 @@ #include #include -#define FUNCTION dot +#define __CLC_FUNCTION dot #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/geometric/fast_distance.cl b/libclc/opencl/lib/generic/geometric/fast_distance.cl index bc30c298a2b9..a9c2c66dc9a1 100644 --- a/libclc/opencl/lib/generic/geometric/fast_distance.cl +++ b/libclc/opencl/lib/generic/geometric/fast_distance.cl @@ -9,7 +9,7 @@ #include #include -#define __FLOAT_ONLY -#define FUNCTION fast_distance +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION fast_distance #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/geometric/fast_length.cl b/libclc/opencl/lib/generic/geometric/fast_length.cl index 407080900cce..8e6ce1aa5c56 100644 --- a/libclc/opencl/lib/generic/geometric/fast_length.cl +++ b/libclc/opencl/lib/generic/geometric/fast_length.cl @@ -9,7 +9,7 @@ #include #include -#define __FLOAT_ONLY -#define FUNCTION fast_length +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION fast_length #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/geometric/fast_normalize.cl b/libclc/opencl/lib/generic/geometric/fast_normalize.cl index 195081371a5b..d2f7111516f4 100644 --- a/libclc/opencl/lib/generic/geometric/fast_normalize.cl +++ b/libclc/opencl/lib/generic/geometric/fast_normalize.cl @@ -9,8 +9,8 @@ #include #include -#define FUNCTION fast_normalize -#define __FLOAT_ONLY +#define __CLC_FUNCTION fast_normalize +#define __CLC_FLOAT_ONLY #define __CLC_GEOMETRIC_RET_GENTYPE 
#define __CLC_BODY diff --git a/libclc/opencl/lib/generic/geometric/length.cl b/libclc/opencl/lib/generic/geometric/length.cl index 0b0ebaa74d95..59e2055020d8 100644 --- a/libclc/opencl/lib/generic/geometric/length.cl +++ b/libclc/opencl/lib/generic/geometric/length.cl @@ -9,6 +9,6 @@ #include #include -#define FUNCTION length +#define __CLC_FUNCTION length #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/geometric/normalize.cl b/libclc/opencl/lib/generic/geometric/normalize.cl index eb84ffd38212..968d4001ae99 100644 --- a/libclc/opencl/lib/generic/geometric/normalize.cl +++ b/libclc/opencl/lib/generic/geometric/normalize.cl @@ -9,7 +9,7 @@ #include #include -#define FUNCTION normalize +#define __CLC_FUNCTION normalize #define __CLC_GEOMETRIC_RET_GENTYPE #define __CLC_BODY diff --git a/libclc/opencl/lib/generic/integer/add_sat.cl b/libclc/opencl/lib/generic/integer/add_sat.cl index 50d989a47211..6cde612e5fb0 100644 --- a/libclc/opencl/lib/generic/integer/add_sat.cl +++ b/libclc/opencl/lib/generic/integer/add_sat.cl @@ -9,7 +9,7 @@ #include #include -#define FUNCTION add_sat +#define __CLC_FUNCTION add_sat #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/integer/bit_reverse.cl b/libclc/opencl/lib/generic/integer/bit_reverse.cl index 23181b6b3eba..b35a5c473a1e 100644 --- a/libclc/opencl/lib/generic/integer/bit_reverse.cl +++ b/libclc/opencl/lib/generic/integer/bit_reverse.cl @@ -11,7 +11,7 @@ #include #include -#define FUNCTION bit_reverse +#define __CLC_FUNCTION bit_reverse #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/integer/bitfield_extract_def.inc b/libclc/opencl/lib/generic/integer/bitfield_extract_def.inc index 0262f67732af..d3b8badcc8c9 100644 --- a/libclc/opencl/lib/generic/integer/bitfield_extract_def.inc +++ b/libclc/opencl/lib/generic/integer/bitfield_extract_def.inc @@ -6,11 +6,11 @@ // //===----------------------------------------------------------------------===// -#ifndef __IMPL_FUNCTION 
-#define __IMPL_FUNCTION(x) __CLC_CONCAT(__clc_, x) +#ifndef __CLC_IMPL_FUNCTION +#define __CLC_IMPL_FUNCTION(x) __CLC_CONCAT(__clc_, x) #endif -_CLC_OVERLOAD _CLC_DEF __RETTYPE FUNCTION(__CLC_GENTYPE base, uint offset, - uint count) { - return __IMPL_FUNCTION(FUNCTION)(base, offset, count); +_CLC_OVERLOAD _CLC_DEF __CLC_RETTYPE __CLC_FUNCTION(__CLC_GENTYPE base, + uint offset, uint count) { + return __CLC_IMPL_FUNCTION(__CLC_FUNCTION)(base, offset, count); } diff --git a/libclc/opencl/lib/generic/integer/bitfield_extract_signed.cl b/libclc/opencl/lib/generic/integer/bitfield_extract_signed.cl index eaa4ac779cfd..ad49cd36b766 100644 --- a/libclc/opencl/lib/generic/integer/bitfield_extract_signed.cl +++ b/libclc/opencl/lib/generic/integer/bitfield_extract_signed.cl @@ -11,8 +11,8 @@ #include #include -#define FUNCTION bitfield_extract_signed -#define __RETTYPE __CLC_S_GENTYPE +#define __CLC_FUNCTION bitfield_extract_signed +#define __CLC_RETTYPE __CLC_S_GENTYPE #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/integer/bitfield_extract_unsigned.cl b/libclc/opencl/lib/generic/integer/bitfield_extract_unsigned.cl index fd63d5d6dee3..e2fe92cfcce8 100644 --- a/libclc/opencl/lib/generic/integer/bitfield_extract_unsigned.cl +++ b/libclc/opencl/lib/generic/integer/bitfield_extract_unsigned.cl @@ -11,8 +11,8 @@ #include #include -#define FUNCTION bitfield_extract_unsigned -#define __RETTYPE __CLC_U_GENTYPE +#define __CLC_FUNCTION bitfield_extract_unsigned +#define __CLC_RETTYPE __CLC_U_GENTYPE #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/integer/bitfield_insert.cl b/libclc/opencl/lib/generic/integer/bitfield_insert.cl index 6b441155f393..c165bd756ffe 100644 --- a/libclc/opencl/lib/generic/integer/bitfield_insert.cl +++ b/libclc/opencl/lib/generic/integer/bitfield_insert.cl @@ -11,7 +11,7 @@ #include #include -#define FUNCTION bitfield_insert +#define __CLC_FUNCTION bitfield_insert #define __CLC_BODY #include diff --git 
a/libclc/opencl/lib/generic/integer/clz.cl b/libclc/opencl/lib/generic/integer/clz.cl index 58110df1eec2..d0509cba5913 100644 --- a/libclc/opencl/lib/generic/integer/clz.cl +++ b/libclc/opencl/lib/generic/integer/clz.cl @@ -9,7 +9,7 @@ #include #include -#define FUNCTION clz +#define __CLC_FUNCTION clz #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/integer/ctz.cl b/libclc/opencl/lib/generic/integer/ctz.cl index cbb167275a4a..640ccfa5f311 100644 --- a/libclc/opencl/lib/generic/integer/ctz.cl +++ b/libclc/opencl/lib/generic/integer/ctz.cl @@ -11,7 +11,7 @@ #include #include -#define FUNCTION ctz +#define __CLC_FUNCTION ctz #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/integer/hadd.cl b/libclc/opencl/lib/generic/integer/hadd.cl index e9edc88bce02..0082524b55af 100644 --- a/libclc/opencl/lib/generic/integer/hadd.cl +++ b/libclc/opencl/lib/generic/integer/hadd.cl @@ -9,7 +9,7 @@ #include #include -#define FUNCTION hadd +#define __CLC_FUNCTION hadd #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/integer/mad24.cl b/libclc/opencl/lib/generic/integer/mad24.cl index 5e9dfb3b3031..fc8e0e317133 100644 --- a/libclc/opencl/lib/generic/integer/mad24.cl +++ b/libclc/opencl/lib/generic/integer/mad24.cl @@ -9,7 +9,7 @@ #include #include -#define FUNCTION mad24 +#define __CLC_FUNCTION mad24 #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/integer/mad_hi.cl b/libclc/opencl/lib/generic/integer/mad_hi.cl index 786614e42bc8..20d096c59650 100644 --- a/libclc/opencl/lib/generic/integer/mad_hi.cl +++ b/libclc/opencl/lib/generic/integer/mad_hi.cl @@ -9,7 +9,7 @@ #include #include -#define FUNCTION mad_hi +#define __CLC_FUNCTION mad_hi #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/integer/mad_sat.cl b/libclc/opencl/lib/generic/integer/mad_sat.cl index 54ddc0eed2d1..310b31a0ca23 100644 --- a/libclc/opencl/lib/generic/integer/mad_sat.cl +++ b/libclc/opencl/lib/generic/integer/mad_sat.cl @@ -9,7 
+9,7 @@ #include #include -#define FUNCTION mad_sat +#define __CLC_FUNCTION mad_sat #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/integer/mul24.cl b/libclc/opencl/lib/generic/integer/mul24.cl index e6db6b89c5d9..243bff77da91 100644 --- a/libclc/opencl/lib/generic/integer/mul24.cl +++ b/libclc/opencl/lib/generic/integer/mul24.cl @@ -9,7 +9,7 @@ #include #include -#define FUNCTION mul24 +#define __CLC_FUNCTION mul24 #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/integer/mul_hi.cl b/libclc/opencl/lib/generic/integer/mul_hi.cl index 4c7b8c2caa99..99d3051a0df9 100644 --- a/libclc/opencl/lib/generic/integer/mul_hi.cl +++ b/libclc/opencl/lib/generic/integer/mul_hi.cl @@ -9,7 +9,7 @@ #include #include -#define FUNCTION mul_hi +#define __CLC_FUNCTION mul_hi #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/integer/popcount.cl b/libclc/opencl/lib/generic/integer/popcount.cl index 3c33c82302c3..e4219b4ff9c3 100644 --- a/libclc/opencl/lib/generic/integer/popcount.cl +++ b/libclc/opencl/lib/generic/integer/popcount.cl @@ -9,7 +9,7 @@ #include #include -#define FUNCTION popcount +#define __CLC_FUNCTION popcount #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/integer/rhadd.cl b/libclc/opencl/lib/generic/integer/rhadd.cl index f83551189942..df9f1eff3e2d 100644 --- a/libclc/opencl/lib/generic/integer/rhadd.cl +++ b/libclc/opencl/lib/generic/integer/rhadd.cl @@ -9,7 +9,7 @@ #include #include -#define FUNCTION rhadd +#define __CLC_FUNCTION rhadd #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/integer/rotate.cl b/libclc/opencl/lib/generic/integer/rotate.cl index 282270f3b3ff..f5b6195275de 100644 --- a/libclc/opencl/lib/generic/integer/rotate.cl +++ b/libclc/opencl/lib/generic/integer/rotate.cl @@ -9,7 +9,7 @@ #include #include -#define FUNCTION rotate +#define __CLC_FUNCTION rotate #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/integer/sub_sat.cl 
b/libclc/opencl/lib/generic/integer/sub_sat.cl index 0b45df1118f1..81517022160a 100644 --- a/libclc/opencl/lib/generic/integer/sub_sat.cl +++ b/libclc/opencl/lib/generic/integer/sub_sat.cl @@ -9,7 +9,7 @@ #include #include -#define FUNCTION sub_sat +#define __CLC_FUNCTION sub_sat #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/acos.cl b/libclc/opencl/lib/generic/math/acos.cl index 040b2b82f6f9..bdaeedf02e90 100644 --- a/libclc/opencl/lib/generic/math/acos.cl +++ b/libclc/opencl/lib/generic/math/acos.cl @@ -9,7 +9,7 @@ #include #include -#define FUNCTION acos +#define __CLC_FUNCTION acos #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/acosh.cl b/libclc/opencl/lib/generic/math/acosh.cl index 55b60bc99614..c46532dbcbd9 100644 --- a/libclc/opencl/lib/generic/math/acosh.cl +++ b/libclc/opencl/lib/generic/math/acosh.cl @@ -9,7 +9,7 @@ #include #include -#define FUNCTION acosh +#define __CLC_FUNCTION acosh #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/acospi.cl b/libclc/opencl/lib/generic/math/acospi.cl index 2537c992e2ad..b6b0fb91250d 100644 --- a/libclc/opencl/lib/generic/math/acospi.cl +++ b/libclc/opencl/lib/generic/math/acospi.cl @@ -9,7 +9,7 @@ #include #include -#define FUNCTION acospi +#define __CLC_FUNCTION acospi #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/asin.cl b/libclc/opencl/lib/generic/math/asin.cl index 08dde1201392..31fc36d74601 100644 --- a/libclc/opencl/lib/generic/math/asin.cl +++ b/libclc/opencl/lib/generic/math/asin.cl @@ -9,7 +9,7 @@ #include #include -#define FUNCTION asin +#define __CLC_FUNCTION asin #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/asinh.cl b/libclc/opencl/lib/generic/math/asinh.cl index 189392f50b5e..3bdd8099f079 100644 --- a/libclc/opencl/lib/generic/math/asinh.cl +++ b/libclc/opencl/lib/generic/math/asinh.cl @@ -9,7 +9,7 @@ #include #include -#define FUNCTION asinh +#define __CLC_FUNCTION asinh #define 
__CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/asinpi.cl b/libclc/opencl/lib/generic/math/asinpi.cl index b9327faaccba..6df9047ced3c 100644 --- a/libclc/opencl/lib/generic/math/asinpi.cl +++ b/libclc/opencl/lib/generic/math/asinpi.cl @@ -9,7 +9,7 @@ #include #include -#define FUNCTION asinpi +#define __CLC_FUNCTION asinpi #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/atan.cl b/libclc/opencl/lib/generic/math/atan.cl index e2b1530f9e4c..157e50dd0e07 100644 --- a/libclc/opencl/lib/generic/math/atan.cl +++ b/libclc/opencl/lib/generic/math/atan.cl @@ -9,7 +9,7 @@ #include #include -#define FUNCTION atan +#define __CLC_FUNCTION atan #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/atan2.cl b/libclc/opencl/lib/generic/math/atan2.cl index 9f3d4a965e58..7db630608867 100644 --- a/libclc/opencl/lib/generic/math/atan2.cl +++ b/libclc/opencl/lib/generic/math/atan2.cl @@ -10,7 +10,7 @@ #include #include -#define FUNCTION atan2 +#define __CLC_FUNCTION atan2 #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/atan2pi.cl b/libclc/opencl/lib/generic/math/atan2pi.cl index 9b3fea163f68..ed57c920f1b2 100644 --- a/libclc/opencl/lib/generic/math/atan2pi.cl +++ b/libclc/opencl/lib/generic/math/atan2pi.cl @@ -10,7 +10,7 @@ #include #include -#define FUNCTION atan2pi +#define __CLC_FUNCTION atan2pi #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/atanh.cl b/libclc/opencl/lib/generic/math/atanh.cl index 5e4564d34702..d0337ff1ffac 100644 --- a/libclc/opencl/lib/generic/math/atanh.cl +++ b/libclc/opencl/lib/generic/math/atanh.cl @@ -9,7 +9,7 @@ #include #include -#define FUNCTION atanh +#define __CLC_FUNCTION atanh #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/atanpi.cl b/libclc/opencl/lib/generic/math/atanpi.cl index ddeb38e64e99..11630db430c7 100644 --- a/libclc/opencl/lib/generic/math/atanpi.cl +++ b/libclc/opencl/lib/generic/math/atanpi.cl @@ -9,7 +9,7 @@ 
#include #include -#define FUNCTION atanpi +#define __CLC_FUNCTION atanpi #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/cbrt.cl b/libclc/opencl/lib/generic/math/cbrt.cl index dccb49a47dc0..0d670150ed4c 100644 --- a/libclc/opencl/lib/generic/math/cbrt.cl +++ b/libclc/opencl/lib/generic/math/cbrt.cl @@ -9,7 +9,7 @@ #include #include -#define FUNCTION cbrt +#define __CLC_FUNCTION cbrt #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/ceil.cl b/libclc/opencl/lib/generic/math/ceil.cl index e312281b98a2..e1bffbcb6860 100644 --- a/libclc/opencl/lib/generic/math/ceil.cl +++ b/libclc/opencl/lib/generic/math/ceil.cl @@ -9,7 +9,7 @@ #include #include -#define FUNCTION ceil +#define __CLC_FUNCTION ceil #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/copysign.cl b/libclc/opencl/lib/generic/math/copysign.cl index 7aa4cb2da0ee..5234b534631c 100644 --- a/libclc/opencl/lib/generic/math/copysign.cl +++ b/libclc/opencl/lib/generic/math/copysign.cl @@ -9,7 +9,7 @@ #include #include -#define FUNCTION copysign +#define __CLC_FUNCTION copysign #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/cos.cl b/libclc/opencl/lib/generic/math/cos.cl index fb40b6c00e3c..69c9c37cf3e8 100644 --- a/libclc/opencl/lib/generic/math/cos.cl +++ b/libclc/opencl/lib/generic/math/cos.cl @@ -9,6 +9,6 @@ #include #include -#define FUNCTION cos +#define __CLC_FUNCTION cos #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/cosh.cl b/libclc/opencl/lib/generic/math/cosh.cl index 7106fc9f18fa..de9e750c1a7f 100644 --- a/libclc/opencl/lib/generic/math/cosh.cl +++ b/libclc/opencl/lib/generic/math/cosh.cl @@ -9,6 +9,6 @@ #include #include -#define FUNCTION cosh +#define __CLC_FUNCTION cosh #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/cospi.cl b/libclc/opencl/lib/generic/math/cospi.cl index 0e242ad0d477..d9afd05bbabf 100644 --- a/libclc/opencl/lib/generic/math/cospi.cl +++ 
b/libclc/opencl/lib/generic/math/cospi.cl @@ -9,7 +9,7 @@ #include #include -#define FUNCTION cospi +#define __CLC_FUNCTION cospi #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/erf.cl b/libclc/opencl/lib/generic/math/erf.cl index 83c38867e046..8d5a80ba10e4 100644 --- a/libclc/opencl/lib/generic/math/erf.cl +++ b/libclc/opencl/lib/generic/math/erf.cl @@ -9,6 +9,6 @@ #include #include -#define FUNCTION erf +#define __CLC_FUNCTION erf #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/erfc.cl b/libclc/opencl/lib/generic/math/erfc.cl index 1c473999bba9..a3fda46237d9 100644 --- a/libclc/opencl/lib/generic/math/erfc.cl +++ b/libclc/opencl/lib/generic/math/erfc.cl @@ -9,6 +9,6 @@ #include #include -#define FUNCTION erfc +#define __CLC_FUNCTION erfc #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/exp.cl b/libclc/opencl/lib/generic/math/exp.cl index 8125ce314e8e..a9fd07a26d0e 100644 --- a/libclc/opencl/lib/generic/math/exp.cl +++ b/libclc/opencl/lib/generic/math/exp.cl @@ -9,6 +9,6 @@ #include #include -#define FUNCTION exp +#define __CLC_FUNCTION exp #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/exp10.cl b/libclc/opencl/lib/generic/math/exp10.cl index f8c0f94a990b..7db7bafd226f 100644 --- a/libclc/opencl/lib/generic/math/exp10.cl +++ b/libclc/opencl/lib/generic/math/exp10.cl @@ -9,6 +9,6 @@ #include #include -#define FUNCTION exp10 +#define __CLC_FUNCTION exp10 #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/exp2.cl b/libclc/opencl/lib/generic/math/exp2.cl index dc75c2201a46..1ffc34ae13ce 100644 --- a/libclc/opencl/lib/generic/math/exp2.cl +++ b/libclc/opencl/lib/generic/math/exp2.cl @@ -9,6 +9,6 @@ #include #include -#define FUNCTION exp2 +#define __CLC_FUNCTION exp2 #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/expm1.cl b/libclc/opencl/lib/generic/math/expm1.cl index 7507786c2ad0..69c852fc7e79 100644 --- 
a/libclc/opencl/lib/generic/math/expm1.cl +++ b/libclc/opencl/lib/generic/math/expm1.cl @@ -9,6 +9,6 @@ #include #include -#define FUNCTION expm1 +#define __CLC_FUNCTION expm1 #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/fabs.cl b/libclc/opencl/lib/generic/math/fabs.cl index 0d3128f3b2a1..d88c6c84be6d 100644 --- a/libclc/opencl/lib/generic/math/fabs.cl +++ b/libclc/opencl/lib/generic/math/fabs.cl @@ -9,7 +9,7 @@ #include #include -#define FUNCTION fabs +#define __CLC_FUNCTION fabs #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/fdim.cl b/libclc/opencl/lib/generic/math/fdim.cl index ecdcd3aef3a7..edaa8a3daef4 100644 --- a/libclc/opencl/lib/generic/math/fdim.cl +++ b/libclc/opencl/lib/generic/math/fdim.cl @@ -9,6 +9,6 @@ #include #include -#define FUNCTION fdim +#define __CLC_FUNCTION fdim #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/floor.cl b/libclc/opencl/lib/generic/math/floor.cl index cda8363f9136..2edc1df21bc5 100644 --- a/libclc/opencl/lib/generic/math/floor.cl +++ b/libclc/opencl/lib/generic/math/floor.cl @@ -9,7 +9,7 @@ #include #include -#define FUNCTION floor +#define __CLC_FUNCTION floor #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/fma.cl b/libclc/opencl/lib/generic/math/fma.cl index 867b97506901..199d2683a898 100644 --- a/libclc/opencl/lib/generic/math/fma.cl +++ b/libclc/opencl/lib/generic/math/fma.cl @@ -10,7 +10,7 @@ #include #include -#define FUNCTION fma +#define __CLC_FUNCTION fma #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/fmax.cl b/libclc/opencl/lib/generic/math/fmax.cl index 7dbd6fe0bd7f..ae7a70a69fe3 100644 --- a/libclc/opencl/lib/generic/math/fmax.cl +++ b/libclc/opencl/lib/generic/math/fmax.cl @@ -9,6 +9,6 @@ #include #include -#define FUNCTION fmax +#define __CLC_FUNCTION fmax #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/fmin.cl b/libclc/opencl/lib/generic/math/fmin.cl index 
ee0dca480073..4b55a93bb70f 100644 --- a/libclc/opencl/lib/generic/math/fmin.cl +++ b/libclc/opencl/lib/generic/math/fmin.cl @@ -9,6 +9,6 @@ #include #include -#define FUNCTION fmin +#define __CLC_FUNCTION fmin #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/fmod.cl b/libclc/opencl/lib/generic/math/fmod.cl index ec1904d213d8..338b80334721 100644 --- a/libclc/opencl/lib/generic/math/fmod.cl +++ b/libclc/opencl/lib/generic/math/fmod.cl @@ -9,6 +9,6 @@ #include #include -#define FUNCTION fmod +#define __CLC_FUNCTION fmod #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/fract.cl b/libclc/opencl/lib/generic/math/fract.cl index 9567072bf4c3..00e8eb456cf6 100644 --- a/libclc/opencl/lib/generic/math/fract.cl +++ b/libclc/opencl/lib/generic/math/fract.cl @@ -9,6 +9,6 @@ #include #include -#define FUNCTION fract +#define __CLC_FUNCTION fract #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/frexp.cl b/libclc/opencl/lib/generic/math/frexp.cl index 3ac9be792ae6..207f9de1c99e 100644 --- a/libclc/opencl/lib/generic/math/frexp.cl +++ b/libclc/opencl/lib/generic/math/frexp.cl @@ -9,6 +9,6 @@ #include #include -#define FUNCTION frexp +#define __CLC_FUNCTION frexp #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/half_cos.cl b/libclc/opencl/lib/generic/math/half_cos.cl index 377c4d164259..d1a5352b5267 100644 --- a/libclc/opencl/lib/generic/math/half_cos.cl +++ b/libclc/opencl/lib/generic/math/half_cos.cl @@ -9,8 +9,8 @@ #include #include -#define __FLOAT_ONLY -#define FUNCTION half_cos +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION half_cos #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/half_divide.cl b/libclc/opencl/lib/generic/math/half_divide.cl index 730617ba67fb..10c418b9312d 100644 --- a/libclc/opencl/lib/generic/math/half_divide.cl +++ b/libclc/opencl/lib/generic/math/half_divide.cl @@ -9,8 +9,8 @@ #include #include -#define __FLOAT_ONLY -#define FUNCTION 
half_divide +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION half_divide #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/half_exp.cl b/libclc/opencl/lib/generic/math/half_exp.cl index 05b8753b2faf..fbbf564cb697 100644 --- a/libclc/opencl/lib/generic/math/half_exp.cl +++ b/libclc/opencl/lib/generic/math/half_exp.cl @@ -9,8 +9,8 @@ #include #include -#define __FLOAT_ONLY -#define FUNCTION half_exp +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION half_exp #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/half_exp10.cl b/libclc/opencl/lib/generic/math/half_exp10.cl index b4e46aaa771c..e655f5f89ff6 100644 --- a/libclc/opencl/lib/generic/math/half_exp10.cl +++ b/libclc/opencl/lib/generic/math/half_exp10.cl @@ -9,8 +9,8 @@ #include #include -#define __FLOAT_ONLY -#define FUNCTION half_exp10 +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION half_exp10 #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/half_exp2.cl b/libclc/opencl/lib/generic/math/half_exp2.cl index 51fe9f72477a..417d8b978e7d 100644 --- a/libclc/opencl/lib/generic/math/half_exp2.cl +++ b/libclc/opencl/lib/generic/math/half_exp2.cl @@ -9,8 +9,8 @@ #include #include -#define __FLOAT_ONLY -#define FUNCTION half_exp2 +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION half_exp2 #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/half_log.cl b/libclc/opencl/lib/generic/math/half_log.cl index b21835204a17..89482c263f83 100644 --- a/libclc/opencl/lib/generic/math/half_log.cl +++ b/libclc/opencl/lib/generic/math/half_log.cl @@ -9,8 +9,8 @@ #include #include -#define __FLOAT_ONLY -#define FUNCTION half_log +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION half_log #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/half_log10.cl b/libclc/opencl/lib/generic/math/half_log10.cl index acb50f99af19..b5ffe7edd363 100644 --- a/libclc/opencl/lib/generic/math/half_log10.cl +++ 
b/libclc/opencl/lib/generic/math/half_log10.cl @@ -9,8 +9,8 @@ #include #include -#define __FLOAT_ONLY -#define FUNCTION half_log10 +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION half_log10 #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/half_log2.cl b/libclc/opencl/lib/generic/math/half_log2.cl index c97e9e093878..cf13cf927c9c 100644 --- a/libclc/opencl/lib/generic/math/half_log2.cl +++ b/libclc/opencl/lib/generic/math/half_log2.cl @@ -9,8 +9,8 @@ #include #include -#define __FLOAT_ONLY -#define FUNCTION half_log2 +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION half_log2 #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/half_powr.cl b/libclc/opencl/lib/generic/math/half_powr.cl index d61a18ab9132..d676f353f864 100644 --- a/libclc/opencl/lib/generic/math/half_powr.cl +++ b/libclc/opencl/lib/generic/math/half_powr.cl @@ -9,8 +9,8 @@ #include #include -#define __FLOAT_ONLY -#define FUNCTION half_powr +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION half_powr #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/half_recip.cl b/libclc/opencl/lib/generic/math/half_recip.cl index 1917448c4e69..40a6c587dbaf 100644 --- a/libclc/opencl/lib/generic/math/half_recip.cl +++ b/libclc/opencl/lib/generic/math/half_recip.cl @@ -9,8 +9,8 @@ #include #include -#define __FLOAT_ONLY -#define FUNCTION half_recip +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION half_recip #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/half_rsqrt.cl b/libclc/opencl/lib/generic/math/half_rsqrt.cl index d6ffa651fbe6..944976855dac 100644 --- a/libclc/opencl/lib/generic/math/half_rsqrt.cl +++ b/libclc/opencl/lib/generic/math/half_rsqrt.cl @@ -9,8 +9,8 @@ #include #include -#define __FLOAT_ONLY -#define FUNCTION half_rsqrt +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION half_rsqrt #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/half_sin.cl b/libclc/opencl/lib/generic/math/half_sin.cl index 
baba0cca4984..3376f8344f39 100644 --- a/libclc/opencl/lib/generic/math/half_sin.cl +++ b/libclc/opencl/lib/generic/math/half_sin.cl @@ -9,8 +9,8 @@ #include #include -#define __FLOAT_ONLY -#define FUNCTION half_sin +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION half_sin #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/half_sqrt.cl b/libclc/opencl/lib/generic/math/half_sqrt.cl index 7a59744ab11d..4939d6bd199d 100644 --- a/libclc/opencl/lib/generic/math/half_sqrt.cl +++ b/libclc/opencl/lib/generic/math/half_sqrt.cl @@ -9,8 +9,8 @@ #include #include -#define __FLOAT_ONLY -#define FUNCTION half_sqrt +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION half_sqrt #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/half_tan.cl b/libclc/opencl/lib/generic/math/half_tan.cl index acd4b011173d..2b77b3ae0c62 100644 --- a/libclc/opencl/lib/generic/math/half_tan.cl +++ b/libclc/opencl/lib/generic/math/half_tan.cl @@ -9,8 +9,8 @@ #include #include -#define __FLOAT_ONLY -#define FUNCTION half_tan +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION half_tan #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/hypot.cl b/libclc/opencl/lib/generic/math/hypot.cl index 4c59e4696e5a..41aff6906987 100644 --- a/libclc/opencl/lib/generic/math/hypot.cl +++ b/libclc/opencl/lib/generic/math/hypot.cl @@ -9,6 +9,6 @@ #include #include -#define FUNCTION hypot +#define __CLC_FUNCTION hypot #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/ilogb.cl b/libclc/opencl/lib/generic/math/ilogb.cl index 3829ce68fb57..027d587a4d46 100644 --- a/libclc/opencl/lib/generic/math/ilogb.cl +++ b/libclc/opencl/lib/generic/math/ilogb.cl @@ -9,6 +9,6 @@ #include #include -#define FUNCTION ilogb +#define __CLC_FUNCTION ilogb #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/ldexp.cl b/libclc/opencl/lib/generic/math/ldexp.cl index 88c60716d782..3cdb9e1a760a 100644 --- a/libclc/opencl/lib/generic/math/ldexp.cl +++ 
b/libclc/opencl/lib/generic/math/ldexp.cl @@ -9,8 +9,8 @@ #include #include -#define FUNCTION ldexp -#define __IMPL_FUNCTION(x) __clc_ldexp +#define __CLC_FUNCTION ldexp +#define __CLC_IMPL_FUNCTION(x) __clc_ldexp #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/lgamma.cl b/libclc/opencl/lib/generic/math/lgamma.cl index 253ce31c1577..da8ceac986e4 100644 --- a/libclc/opencl/lib/generic/math/lgamma.cl +++ b/libclc/opencl/lib/generic/math/lgamma.cl @@ -9,6 +9,6 @@ #include #include -#define FUNCTION lgamma +#define __CLC_FUNCTION lgamma #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/lgamma_r.cl b/libclc/opencl/lib/generic/math/lgamma_r.cl index 8753f6c75d8b..bb2d9bd283df 100644 --- a/libclc/opencl/lib/generic/math/lgamma_r.cl +++ b/libclc/opencl/lib/generic/math/lgamma_r.cl @@ -9,6 +9,6 @@ #include #include -#define FUNCTION lgamma_r +#define __CLC_FUNCTION lgamma_r #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/log.cl b/libclc/opencl/lib/generic/math/log.cl index dbe3e3572703..06209999bf2f 100644 --- a/libclc/opencl/lib/generic/math/log.cl +++ b/libclc/opencl/lib/generic/math/log.cl @@ -10,7 +10,7 @@ #include #include -#define FUNCTION log +#define __CLC_FUNCTION log #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/log10.cl b/libclc/opencl/lib/generic/math/log10.cl index d5137a7cab7a..466b602e1896 100644 --- a/libclc/opencl/lib/generic/math/log10.cl +++ b/libclc/opencl/lib/generic/math/log10.cl @@ -10,7 +10,7 @@ #include #include -#define FUNCTION log10 +#define __CLC_FUNCTION log10 #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/log1p.cl b/libclc/opencl/lib/generic/math/log1p.cl index d389ab72908c..c1a8711e1185 100644 --- a/libclc/opencl/lib/generic/math/log1p.cl +++ b/libclc/opencl/lib/generic/math/log1p.cl @@ -9,7 +9,7 @@ #include #include -#define FUNCTION log1p +#define __CLC_FUNCTION log1p #define __CLC_BODY #include diff --git 
a/libclc/opencl/lib/generic/math/log2.cl b/libclc/opencl/lib/generic/math/log2.cl index d1433bc82581..a52a52e28f63 100644 --- a/libclc/opencl/lib/generic/math/log2.cl +++ b/libclc/opencl/lib/generic/math/log2.cl @@ -10,7 +10,7 @@ #include #include -#define FUNCTION log2 +#define __CLC_FUNCTION log2 #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/logb.cl b/libclc/opencl/lib/generic/math/logb.cl index 037380a26a72..09d43fac3ad2 100644 --- a/libclc/opencl/lib/generic/math/logb.cl +++ b/libclc/opencl/lib/generic/math/logb.cl @@ -9,6 +9,6 @@ #include #include -#define FUNCTION logb +#define __CLC_FUNCTION logb #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/mad.cl b/libclc/opencl/lib/generic/math/mad.cl index 3fe2dd9bed5f..05bd228251c1 100644 --- a/libclc/opencl/lib/generic/math/mad.cl +++ b/libclc/opencl/lib/generic/math/mad.cl @@ -9,7 +9,7 @@ #include #include -#define FUNCTION mad +#define __CLC_FUNCTION mad #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/maxmag.cl b/libclc/opencl/lib/generic/math/maxmag.cl index 4f9dbeb8d27e..30e7bb21302d 100644 --- a/libclc/opencl/lib/generic/math/maxmag.cl +++ b/libclc/opencl/lib/generic/math/maxmag.cl @@ -9,6 +9,6 @@ #include #include -#define FUNCTION maxmag +#define __CLC_FUNCTION maxmag #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/minmag.cl b/libclc/opencl/lib/generic/math/minmag.cl index efe72b6f545c..3732dbfe7af4 100644 --- a/libclc/opencl/lib/generic/math/minmag.cl +++ b/libclc/opencl/lib/generic/math/minmag.cl @@ -9,6 +9,6 @@ #include #include -#define FUNCTION minmag +#define __CLC_FUNCTION minmag #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/modf.cl b/libclc/opencl/lib/generic/math/modf.cl index de6524e02f73..306070bf8923 100644 --- a/libclc/opencl/lib/generic/math/modf.cl +++ b/libclc/opencl/lib/generic/math/modf.cl @@ -9,6 +9,6 @@ #include #include -#define FUNCTION modf +#define __CLC_FUNCTION 
modf #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/native_cos.cl b/libclc/opencl/lib/generic/math/native_cos.cl index 85944a03c546..8508df37a9b6 100644 --- a/libclc/opencl/lib/generic/math/native_cos.cl +++ b/libclc/opencl/lib/generic/math/native_cos.cl @@ -9,8 +9,8 @@ #include #include -#define __FLOAT_ONLY -#define FUNCTION native_cos +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION native_cos #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/native_divide.cl b/libclc/opencl/lib/generic/math/native_divide.cl index 8efd8cc21b5d..ea28bf6e195c 100644 --- a/libclc/opencl/lib/generic/math/native_divide.cl +++ b/libclc/opencl/lib/generic/math/native_divide.cl @@ -9,8 +9,8 @@ #include #include -#define __FLOAT_ONLY -#define FUNCTION native_divide +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION native_divide #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/native_exp.cl b/libclc/opencl/lib/generic/math/native_exp.cl index 42208371e195..aedb0953c534 100644 --- a/libclc/opencl/lib/generic/math/native_exp.cl +++ b/libclc/opencl/lib/generic/math/native_exp.cl @@ -9,8 +9,8 @@ #include #include -#define __FLOAT_ONLY -#define FUNCTION native_exp +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION native_exp #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/native_exp10.cl b/libclc/opencl/lib/generic/math/native_exp10.cl index aa2585ff4311..eb184647639f 100644 --- a/libclc/opencl/lib/generic/math/native_exp10.cl +++ b/libclc/opencl/lib/generic/math/native_exp10.cl @@ -9,8 +9,8 @@ #include #include -#define __FLOAT_ONLY -#define FUNCTION native_exp10 +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION native_exp10 #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/native_exp2.cl b/libclc/opencl/lib/generic/math/native_exp2.cl index 8955b28b367c..52e814f5ee53 100644 --- a/libclc/opencl/lib/generic/math/native_exp2.cl +++ b/libclc/opencl/lib/generic/math/native_exp2.cl 
@@ -9,8 +9,8 @@ #include #include -#define __FLOAT_ONLY -#define FUNCTION native_exp2 +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION native_exp2 #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/native_log.cl b/libclc/opencl/lib/generic/math/native_log.cl index 334f7c04e389..d1d4ae2f15f1 100644 --- a/libclc/opencl/lib/generic/math/native_log.cl +++ b/libclc/opencl/lib/generic/math/native_log.cl @@ -9,8 +9,8 @@ #include #include -#define __FLOAT_ONLY -#define FUNCTION native_log +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION native_log #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/native_log10.cl b/libclc/opencl/lib/generic/math/native_log10.cl index a65938ee4c6c..680b9cb341bf 100644 --- a/libclc/opencl/lib/generic/math/native_log10.cl +++ b/libclc/opencl/lib/generic/math/native_log10.cl @@ -9,8 +9,8 @@ #include #include -#define __FLOAT_ONLY -#define FUNCTION native_log10 +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION native_log10 #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/native_log2.cl b/libclc/opencl/lib/generic/math/native_log2.cl index f10b533f91f4..3a104357afa8 100644 --- a/libclc/opencl/lib/generic/math/native_log2.cl +++ b/libclc/opencl/lib/generic/math/native_log2.cl @@ -9,8 +9,8 @@ #include #include -#define __FLOAT_ONLY -#define FUNCTION native_log2 +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION native_log2 #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/native_powr.cl b/libclc/opencl/lib/generic/math/native_powr.cl index 8301443cfbcf..f325d53321b1 100644 --- a/libclc/opencl/lib/generic/math/native_powr.cl +++ b/libclc/opencl/lib/generic/math/native_powr.cl @@ -9,8 +9,8 @@ #include #include -#define __FLOAT_ONLY -#define FUNCTION native_powr +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION native_powr #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/native_recip.cl b/libclc/opencl/lib/generic/math/native_recip.cl index 
145845b45514..20d81ed9f1b7 100644 --- a/libclc/opencl/lib/generic/math/native_recip.cl +++ b/libclc/opencl/lib/generic/math/native_recip.cl @@ -9,8 +9,8 @@ #include #include -#define __FLOAT_ONLY -#define FUNCTION native_recip +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION native_recip #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/native_rsqrt.cl b/libclc/opencl/lib/generic/math/native_rsqrt.cl index cbb5a754fc96..8f308e74f334 100644 --- a/libclc/opencl/lib/generic/math/native_rsqrt.cl +++ b/libclc/opencl/lib/generic/math/native_rsqrt.cl @@ -9,8 +9,8 @@ #include #include -#define __FLOAT_ONLY -#define FUNCTION native_rsqrt +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION native_rsqrt #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/native_sin.cl b/libclc/opencl/lib/generic/math/native_sin.cl index dfcc882614a6..ea7167822a81 100644 --- a/libclc/opencl/lib/generic/math/native_sin.cl +++ b/libclc/opencl/lib/generic/math/native_sin.cl @@ -9,8 +9,8 @@ #include #include -#define __FLOAT_ONLY -#define FUNCTION native_sin +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION native_sin #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/native_sqrt.cl b/libclc/opencl/lib/generic/math/native_sqrt.cl index a0be41d33fa4..fa38bdb3301d 100644 --- a/libclc/opencl/lib/generic/math/native_sqrt.cl +++ b/libclc/opencl/lib/generic/math/native_sqrt.cl @@ -9,8 +9,8 @@ #include #include -#define __FLOAT_ONLY -#define FUNCTION native_sqrt +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION native_sqrt #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/native_tan.cl b/libclc/opencl/lib/generic/math/native_tan.cl index 86eee4f5913f..8cebf93bae91 100644 --- a/libclc/opencl/lib/generic/math/native_tan.cl +++ b/libclc/opencl/lib/generic/math/native_tan.cl @@ -9,8 +9,8 @@ #include #include -#define __FLOAT_ONLY -#define FUNCTION native_tan +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION native_tan #define 
__CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/nextafter.cl b/libclc/opencl/lib/generic/math/nextafter.cl index 9c30e5668a56..256f06bf5855 100644 --- a/libclc/opencl/lib/generic/math/nextafter.cl +++ b/libclc/opencl/lib/generic/math/nextafter.cl @@ -9,8 +9,8 @@ #include #include -#define FUNCTION nextafter -#define __IMPL_FUNCTION(x) __clc_nextafter +#define __CLC_FUNCTION nextafter +#define __CLC_IMPL_FUNCTION(x) __clc_nextafter #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/pow.cl b/libclc/opencl/lib/generic/math/pow.cl index 2fddb7e88964..bf4351926573 100644 --- a/libclc/opencl/lib/generic/math/pow.cl +++ b/libclc/opencl/lib/generic/math/pow.cl @@ -9,6 +9,6 @@ #include #include -#define FUNCTION pow +#define __CLC_FUNCTION pow #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/pown.cl b/libclc/opencl/lib/generic/math/pown.cl index 0a5ee893a45a..8c03be6bee16 100644 --- a/libclc/opencl/lib/generic/math/pown.cl +++ b/libclc/opencl/lib/generic/math/pown.cl @@ -9,6 +9,6 @@ #include #include -#define FUNCTION pown +#define __CLC_FUNCTION pown #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/powr.cl b/libclc/opencl/lib/generic/math/powr.cl index 13af03ff1327..680f9be894ab 100644 --- a/libclc/opencl/lib/generic/math/powr.cl +++ b/libclc/opencl/lib/generic/math/powr.cl @@ -9,6 +9,6 @@ #include #include -#define FUNCTION powr +#define __CLC_FUNCTION powr #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/remainder.cl b/libclc/opencl/lib/generic/math/remainder.cl index 224a66bc9b27..42b37f847f6d 100644 --- a/libclc/opencl/lib/generic/math/remainder.cl +++ b/libclc/opencl/lib/generic/math/remainder.cl @@ -9,6 +9,6 @@ #include #include -#define FUNCTION remainder +#define __CLC_FUNCTION remainder #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/rint.cl b/libclc/opencl/lib/generic/math/rint.cl index 489883c97c14..8beaa7ae064c 100644 --- 
a/libclc/opencl/lib/generic/math/rint.cl +++ b/libclc/opencl/lib/generic/math/rint.cl @@ -9,7 +9,7 @@ #include #include -#define FUNCTION rint +#define __CLC_FUNCTION rint #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/rootn.cl b/libclc/opencl/lib/generic/math/rootn.cl index e8dba17b9395..8f25ee1d31b4 100644 --- a/libclc/opencl/lib/generic/math/rootn.cl +++ b/libclc/opencl/lib/generic/math/rootn.cl @@ -9,6 +9,6 @@ #include #include -#define FUNCTION rootn +#define __CLC_FUNCTION rootn #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/round.cl b/libclc/opencl/lib/generic/math/round.cl index c45e681b1c22..79b04752e9de 100644 --- a/libclc/opencl/lib/generic/math/round.cl +++ b/libclc/opencl/lib/generic/math/round.cl @@ -9,7 +9,7 @@ #include #include -#define FUNCTION round +#define __CLC_FUNCTION round #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/rsqrt.cl b/libclc/opencl/lib/generic/math/rsqrt.cl index 47e4f450ed1a..f7137f5a007d 100644 --- a/libclc/opencl/lib/generic/math/rsqrt.cl +++ b/libclc/opencl/lib/generic/math/rsqrt.cl @@ -9,7 +9,7 @@ #include #include -#define FUNCTION rsqrt +#define __CLC_FUNCTION rsqrt #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/sin.cl b/libclc/opencl/lib/generic/math/sin.cl index ff7db2a5b3cb..e198198bcbe8 100644 --- a/libclc/opencl/lib/generic/math/sin.cl +++ b/libclc/opencl/lib/generic/math/sin.cl @@ -9,6 +9,6 @@ #include #include -#define FUNCTION sin +#define __CLC_FUNCTION sin #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/sincos.cl b/libclc/opencl/lib/generic/math/sincos.cl index c8871fd2b002..845e966966c5 100644 --- a/libclc/opencl/lib/generic/math/sincos.cl +++ b/libclc/opencl/lib/generic/math/sincos.cl @@ -9,6 +9,6 @@ #include #include -#define FUNCTION sincos +#define __CLC_FUNCTION sincos #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/sinh.cl 
b/libclc/opencl/lib/generic/math/sinh.cl index bd6ea20e7e89..6d5f1b1c269c 100644 --- a/libclc/opencl/lib/generic/math/sinh.cl +++ b/libclc/opencl/lib/generic/math/sinh.cl @@ -9,6 +9,6 @@ #include #include -#define FUNCTION sinh +#define __CLC_FUNCTION sinh #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/sinpi.cl b/libclc/opencl/lib/generic/math/sinpi.cl index badecd090985..33b04532e776 100644 --- a/libclc/opencl/lib/generic/math/sinpi.cl +++ b/libclc/opencl/lib/generic/math/sinpi.cl @@ -9,7 +9,7 @@ #include #include -#define FUNCTION sinpi +#define __CLC_FUNCTION sinpi #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/sqrt.cl b/libclc/opencl/lib/generic/math/sqrt.cl index 3b05b2586448..9984ddb9d5a4 100644 --- a/libclc/opencl/lib/generic/math/sqrt.cl +++ b/libclc/opencl/lib/generic/math/sqrt.cl @@ -9,7 +9,7 @@ #include #include -#define FUNCTION sqrt +#define __CLC_FUNCTION sqrt #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/tan.cl b/libclc/opencl/lib/generic/math/tan.cl index 95409fc5b092..89a525111167 100644 --- a/libclc/opencl/lib/generic/math/tan.cl +++ b/libclc/opencl/lib/generic/math/tan.cl @@ -9,6 +9,6 @@ #include #include -#define FUNCTION tan +#define __CLC_FUNCTION tan #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/tanh.cl b/libclc/opencl/lib/generic/math/tanh.cl index a77878bb7966..92307eafdd05 100644 --- a/libclc/opencl/lib/generic/math/tanh.cl +++ b/libclc/opencl/lib/generic/math/tanh.cl @@ -9,6 +9,6 @@ #include #include -#define FUNCTION tanh +#define __CLC_FUNCTION tanh #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/tanpi.cl b/libclc/opencl/lib/generic/math/tanpi.cl index 6b5805ec3cd0..47e943ba7741 100644 --- a/libclc/opencl/lib/generic/math/tanpi.cl +++ b/libclc/opencl/lib/generic/math/tanpi.cl @@ -9,7 +9,7 @@ #include #include -#define FUNCTION tanpi +#define __CLC_FUNCTION tanpi #define __CLC_BODY #include diff --git 
a/libclc/opencl/lib/generic/math/tgamma.cl b/libclc/opencl/lib/generic/math/tgamma.cl index 8f35d22426a7..2749cf3dfc0f 100644 --- a/libclc/opencl/lib/generic/math/tgamma.cl +++ b/libclc/opencl/lib/generic/math/tgamma.cl @@ -9,6 +9,6 @@ #include #include -#define FUNCTION tgamma +#define __CLC_FUNCTION tgamma #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/math/trunc.cl b/libclc/opencl/lib/generic/math/trunc.cl index ea56329bc8de..b27699699534 100644 --- a/libclc/opencl/lib/generic/math/trunc.cl +++ b/libclc/opencl/lib/generic/math/trunc.cl @@ -9,7 +9,7 @@ #include #include -#define FUNCTION trunc +#define __CLC_FUNCTION trunc #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/misc/shuffle.cl b/libclc/opencl/lib/generic/misc/shuffle.cl index 15295bf7d907..f9187be453e6 100644 --- a/libclc/opencl/lib/generic/misc/shuffle.cl +++ b/libclc/opencl/lib/generic/misc/shuffle.cl @@ -9,7 +9,7 @@ #include #include -#define FUNCTION shuffle +#define __CLC_FUNCTION shuffle #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/misc/shuffle2.cl b/libclc/opencl/lib/generic/misc/shuffle2.cl index 7e4c5f092268..0008340b622d 100644 --- a/libclc/opencl/lib/generic/misc/shuffle2.cl +++ b/libclc/opencl/lib/generic/misc/shuffle2.cl @@ -9,7 +9,7 @@ #include #include -#define FUNCTION shuffle2 +#define __CLC_FUNCTION shuffle2 #define __CLC_BODY #include diff --git a/libclc/opencl/lib/generic/relational/all.cl b/libclc/opencl/lib/generic/relational/all.cl index 5ac4a465365a..f72da2cb622c 100644 --- a/libclc/opencl/lib/generic/relational/all.cl +++ b/libclc/opencl/lib/generic/relational/all.cl @@ -9,17 +9,17 @@ #include #include -#define ALL_ID(TYPE) _CLC_OVERLOAD _CLC_DEF int all(TYPE v) +#define __CLC_ALL_ID(TYPE) _CLC_OVERLOAD _CLC_DEF int all(TYPE v) -#define ALL_VECTORIZE(TYPE) \ - ALL_ID(TYPE) { return __clc_all(v); } \ - ALL_ID(TYPE##2) { return __clc_all(v); } \ - ALL_ID(TYPE##3) { return __clc_all(v); } \ - ALL_ID(TYPE##4) { return 
__clc_all(v); } \ - ALL_ID(TYPE##8) { return __clc_all(v); } \ - ALL_ID(TYPE##16) { return __clc_all(v); } +#define __CLC_ALL_VECTORIZE(TYPE) \ + __CLC_ALL_ID(TYPE) { return __clc_all(v); } \ + __CLC_ALL_ID(TYPE##2) { return __clc_all(v); } \ + __CLC_ALL_ID(TYPE##3) { return __clc_all(v); } \ + __CLC_ALL_ID(TYPE##4) { return __clc_all(v); } \ + __CLC_ALL_ID(TYPE##8) { return __clc_all(v); } \ + __CLC_ALL_ID(TYPE##16) { return __clc_all(v); } -ALL_VECTORIZE(char) -ALL_VECTORIZE(short) -ALL_VECTORIZE(int) -ALL_VECTORIZE(long) +__CLC_ALL_VECTORIZE(char) +__CLC_ALL_VECTORIZE(short) +__CLC_ALL_VECTORIZE(int) +__CLC_ALL_VECTORIZE(long) diff --git a/libclc/opencl/lib/generic/relational/any.cl b/libclc/opencl/lib/generic/relational/any.cl index 507f2b170693..b2b48dbf1ab0 100644 --- a/libclc/opencl/lib/generic/relational/any.cl +++ b/libclc/opencl/lib/generic/relational/any.cl @@ -9,17 +9,17 @@ #include #include -#define ANY_ID(TYPE) _CLC_OVERLOAD _CLC_DEF int any(TYPE v) +#define __CLC_ANY_ID(TYPE) _CLC_OVERLOAD _CLC_DEF int any(TYPE v) -#define ANY_VECTORIZE(TYPE) \ - ANY_ID(TYPE) { return __clc_any(v); } \ - ANY_ID(TYPE##2) { return __clc_any(v); } \ - ANY_ID(TYPE##3) { return __clc_any(v); } \ - ANY_ID(TYPE##4) { return __clc_any(v); } \ - ANY_ID(TYPE##8) { return __clc_any(v); } \ - ANY_ID(TYPE##16) { return __clc_any(v); } +#define __CLC_ANY_VECTORIZE(TYPE) \ + __CLC_ANY_ID(TYPE) { return __clc_any(v); } \ + __CLC_ANY_ID(TYPE##2) { return __clc_any(v); } \ + __CLC_ANY_ID(TYPE##3) { return __clc_any(v); } \ + __CLC_ANY_ID(TYPE##4) { return __clc_any(v); } \ + __CLC_ANY_ID(TYPE##8) { return __clc_any(v); } \ + __CLC_ANY_ID(TYPE##16) { return __clc_any(v); } -ANY_VECTORIZE(char) -ANY_VECTORIZE(short) -ANY_VECTORIZE(int) -ANY_VECTORIZE(long) +__CLC_ANY_VECTORIZE(char) +__CLC_ANY_VECTORIZE(short) +__CLC_ANY_VECTORIZE(int) +__CLC_ANY_VECTORIZE(long) diff --git a/libclc/opencl/lib/generic/relational/binary_def.inc b/libclc/opencl/lib/generic/relational/binary_def.inc index 
8416da0475a2..b60d9bbe45e1 100644 --- a/libclc/opencl/lib/generic/relational/binary_def.inc +++ b/libclc/opencl/lib/generic/relational/binary_def.inc @@ -8,16 +8,17 @@ #include -#define __IMPL_FUNCTION(x) __CLC_CONCAT(__clc_, x) +#define __CLC_IMPL_FUNCTION(x) __CLC_CONCAT(__clc_, x) #if __CLC_VECSIZE_OR_1 == 1 -#define __RETTYPE __CLC_INTN +#define __CLC_RETTYPE __CLC_INTN #else -#define __RETTYPE __CLC_BIT_INTN +#define __CLC_RETTYPE __CLC_BIT_INTN #endif -_CLC_OVERLOAD _CLC_DEF __RETTYPE FUNCTION(__CLC_GENTYPE a, __CLC_GENTYPE b) { - return __IMPL_FUNCTION(FUNCTION)(a, b); +_CLC_OVERLOAD _CLC_DEF __CLC_RETTYPE __CLC_FUNCTION(__CLC_GENTYPE a, + __CLC_GENTYPE b) { + return __CLC_IMPL_FUNCTION(__CLC_FUNCTION)(a, b); } -#undef __RETTYPE +#undef __CLC_RETTYPE diff --git a/libclc/opencl/lib/generic/relational/isequal.cl b/libclc/opencl/lib/generic/relational/isequal.cl index 83002c28ceab..40718dbca9d0 100644 --- a/libclc/opencl/lib/generic/relational/isequal.cl +++ b/libclc/opencl/lib/generic/relational/isequal.cl @@ -9,7 +9,7 @@ #include #include -#define FUNCTION isequal +#define __CLC_FUNCTION isequal #define __CLC_BODY "binary_def.inc" #include diff --git a/libclc/opencl/lib/generic/relational/isfinite.cl b/libclc/opencl/lib/generic/relational/isfinite.cl index a2017133cead..f055e295416c 100644 --- a/libclc/opencl/lib/generic/relational/isfinite.cl +++ b/libclc/opencl/lib/generic/relational/isfinite.cl @@ -9,7 +9,7 @@ #include #include -#define FUNCTION isfinite +#define __CLC_FUNCTION isfinite #define __CLC_BODY "unary_def.inc" #include diff --git a/libclc/opencl/lib/generic/relational/isgreater.cl b/libclc/opencl/lib/generic/relational/isgreater.cl index 6eeb2b21c049..ae73769267dc 100644 --- a/libclc/opencl/lib/generic/relational/isgreater.cl +++ b/libclc/opencl/lib/generic/relational/isgreater.cl @@ -9,7 +9,7 @@ #include #include -#define FUNCTION isgreater +#define __CLC_FUNCTION isgreater #define __CLC_BODY "binary_def.inc" #include diff --git 
a/libclc/opencl/lib/generic/relational/isgreaterequal.cl b/libclc/opencl/lib/generic/relational/isgreaterequal.cl index e4e4535fd30d..725c3289bb05 100644 --- a/libclc/opencl/lib/generic/relational/isgreaterequal.cl +++ b/libclc/opencl/lib/generic/relational/isgreaterequal.cl @@ -9,7 +9,7 @@ #include #include -#define FUNCTION isgreaterequal +#define __CLC_FUNCTION isgreaterequal #define __CLC_BODY "binary_def.inc" #include diff --git a/libclc/opencl/lib/generic/relational/isinf.cl b/libclc/opencl/lib/generic/relational/isinf.cl index 2ab8c182e02a..9558b0f9eb5e 100644 --- a/libclc/opencl/lib/generic/relational/isinf.cl +++ b/libclc/opencl/lib/generic/relational/isinf.cl @@ -9,7 +9,7 @@ #include #include -#define FUNCTION isinf +#define __CLC_FUNCTION isinf #define __CLC_BODY "unary_def.inc" #include diff --git a/libclc/opencl/lib/generic/relational/isless.cl b/libclc/opencl/lib/generic/relational/isless.cl index 4212970e7671..6ed0857d354a 100644 --- a/libclc/opencl/lib/generic/relational/isless.cl +++ b/libclc/opencl/lib/generic/relational/isless.cl @@ -9,7 +9,7 @@ #include #include -#define FUNCTION isless +#define __CLC_FUNCTION isless #define __CLC_BODY "binary_def.inc" #include diff --git a/libclc/opencl/lib/generic/relational/islessequal.cl b/libclc/opencl/lib/generic/relational/islessequal.cl index e7aec262fc76..c4697ca56f98 100644 --- a/libclc/opencl/lib/generic/relational/islessequal.cl +++ b/libclc/opencl/lib/generic/relational/islessequal.cl @@ -9,7 +9,7 @@ #include #include -#define FUNCTION islessequal +#define __CLC_FUNCTION islessequal #define __CLC_BODY "binary_def.inc" #include diff --git a/libclc/opencl/lib/generic/relational/islessgreater.cl b/libclc/opencl/lib/generic/relational/islessgreater.cl index b775d2484550..eb4cdc24c067 100644 --- a/libclc/opencl/lib/generic/relational/islessgreater.cl +++ b/libclc/opencl/lib/generic/relational/islessgreater.cl @@ -9,7 +9,7 @@ #include #include -#define FUNCTION islessgreater +#define __CLC_FUNCTION 
islessgreater #define __CLC_BODY "binary_def.inc" #include diff --git a/libclc/opencl/lib/generic/relational/isnan.cl b/libclc/opencl/lib/generic/relational/isnan.cl index 4b7eeb5b919b..4bae2ae077c1 100644 --- a/libclc/opencl/lib/generic/relational/isnan.cl +++ b/libclc/opencl/lib/generic/relational/isnan.cl @@ -9,7 +9,7 @@ #include #include -#define FUNCTION isnan +#define __CLC_FUNCTION isnan #define __CLC_BODY "unary_def.inc" #include diff --git a/libclc/opencl/lib/generic/relational/isnormal.cl b/libclc/opencl/lib/generic/relational/isnormal.cl index 60ce9dccaeaf..d4ea20a27cdb 100644 --- a/libclc/opencl/lib/generic/relational/isnormal.cl +++ b/libclc/opencl/lib/generic/relational/isnormal.cl @@ -9,7 +9,7 @@ #include #include -#define FUNCTION isnormal +#define __CLC_FUNCTION isnormal #define __CLC_BODY "unary_def.inc" #include diff --git a/libclc/opencl/lib/generic/relational/isnotequal.cl b/libclc/opencl/lib/generic/relational/isnotequal.cl index abb4d3a85966..e24d57564cb8 100644 --- a/libclc/opencl/lib/generic/relational/isnotequal.cl +++ b/libclc/opencl/lib/generic/relational/isnotequal.cl @@ -9,7 +9,7 @@ #include #include -#define FUNCTION isnotequal +#define __CLC_FUNCTION isnotequal #define __CLC_BODY "binary_def.inc" #include diff --git a/libclc/opencl/lib/generic/relational/isordered.cl b/libclc/opencl/lib/generic/relational/isordered.cl index 684ee425e120..773cf1947e68 100644 --- a/libclc/opencl/lib/generic/relational/isordered.cl +++ b/libclc/opencl/lib/generic/relational/isordered.cl @@ -9,7 +9,7 @@ #include #include -#define FUNCTION isordered +#define __CLC_FUNCTION isordered #define __CLC_BODY "binary_def.inc" #include diff --git a/libclc/opencl/lib/generic/relational/isunordered.cl b/libclc/opencl/lib/generic/relational/isunordered.cl index 84aa8cafb111..44427ea5e604 100644 --- a/libclc/opencl/lib/generic/relational/isunordered.cl +++ b/libclc/opencl/lib/generic/relational/isunordered.cl @@ -9,7 +9,7 @@ #include #include -#define FUNCTION 
isunordered +#define __CLC_FUNCTION isunordered #define __CLC_BODY "binary_def.inc" #include diff --git a/libclc/opencl/lib/generic/relational/signbit.cl b/libclc/opencl/lib/generic/relational/signbit.cl index d30fea7b9f6f..b1f3ac89933f 100644 --- a/libclc/opencl/lib/generic/relational/signbit.cl +++ b/libclc/opencl/lib/generic/relational/signbit.cl @@ -9,7 +9,7 @@ #include #include -#define FUNCTION signbit +#define __CLC_FUNCTION signbit #define __CLC_BODY "unary_def.inc" #include diff --git a/libclc/opencl/lib/generic/relational/unary_def.inc b/libclc/opencl/lib/generic/relational/unary_def.inc index f184e3cf0be5..1655ace114aa 100644 --- a/libclc/opencl/lib/generic/relational/unary_def.inc +++ b/libclc/opencl/lib/generic/relational/unary_def.inc @@ -8,16 +8,16 @@ #include -#define __IMPL_FUNCTION(x) __CLC_CONCAT(__clc_, x) +#define __CLC_IMPL_FUNCTION(x) __CLC_CONCAT(__clc_, x) #if __CLC_VECSIZE_OR_1 == 1 -#define __RETTYPE __CLC_INTN +#define __CLC_RETTYPE __CLC_INTN #else -#define __RETTYPE __CLC_BIT_INTN +#define __CLC_RETTYPE __CLC_BIT_INTN #endif -_CLC_OVERLOAD _CLC_DEF __RETTYPE FUNCTION(__CLC_GENTYPE a) { - return __IMPL_FUNCTION(FUNCTION)(a); +_CLC_OVERLOAD _CLC_DEF __CLC_RETTYPE __CLC_FUNCTION(__CLC_GENTYPE a) { + return __CLC_IMPL_FUNCTION(__CLC_FUNCTION)(a); } -#undef __RETTYPE +#undef __CLC_RETTYPE diff --git a/libclc/opencl/lib/generic/shared/vload.inc b/libclc/opencl/lib/generic/shared/vload.inc index 62cb040aad18..955b77b48ec3 100644 --- a/libclc/opencl/lib/generic/shared/vload.inc +++ b/libclc/opencl/lib/generic/shared/vload.inc @@ -6,33 +6,34 @@ // //===----------------------------------------------------------------------===// -#define CLC_VLOAD_NAME(x) __CLC_XCONCAT(__CLC_XCONCAT(x, vload), __CLC_VECSIZE) -#define CLC_VLOAD_HALF_NAME(x) \ +#define __CLC_VLOAD_NAME(x) \ + __CLC_XCONCAT(__CLC_XCONCAT(x, vload), __CLC_VECSIZE) +#define __CLC_VLOAD_HALF_NAME(x) \ __CLC_XCONCAT(__CLC_XCONCAT(x, vload_half), __CLC_VECSIZE) -#define 
CLC_VLOADA_HALF_NAME(x) \ +#define __CLC_VLOADA_HALF_NAME(x) \ __CLC_XCONCAT(__CLC_XCONCAT(x, vloada_half), __CLC_VECSIZE) #ifndef __CLC_SCALAR -#define CLC_VLOAD_TY __CLC_XCONCAT(less_aligned_, __CLC_GENTYPE) +#define __CLC_VLOAD_TY __CLC_XCONCAT(less_aligned_, __CLC_GENTYPE) -#define VLOAD_DEF(ADDRSPACE) \ - _CLC_OVERLOAD _CLC_DEF CLC_VLOAD_TY CLC_VLOAD_NAME()( \ +#define __CLC_VLOAD_DEF(ADDRSPACE) \ + _CLC_OVERLOAD _CLC_DEF __CLC_VLOAD_TY __CLC_VLOAD_NAME()( \ size_t offset, const ADDRSPACE __CLC_SCALAR_GENTYPE *x) { \ - return CLC_VLOAD_NAME(__clc_)(offset, x); \ + return __CLC_VLOAD_NAME(__clc_)(offset, x); \ } -VLOAD_DEF(__private) -VLOAD_DEF(__local) -VLOAD_DEF(__constant) -VLOAD_DEF(__global) +__CLC_VLOAD_DEF(__private) +__CLC_VLOAD_DEF(__local) +__CLC_VLOAD_DEF(__constant) +__CLC_VLOAD_DEF(__global) #if _CLC_DISTINCT_GENERIC_AS_SUPPORTED -VLOAD_DEF(__generic) +__CLC_VLOAD_DEF(__generic) #endif -#undef VLOAD_DEF -#undef CLC_VLOAD_TY +#undef __CLC_VLOAD_DEF +#undef __CLC_VLOAD_TY #endif @@ -42,30 +43,30 @@ VLOAD_DEF(__generic) #ifdef __CLC_FPSIZE #if __CLC_FPSIZE == 32 -#define VLOAD_HALF_DEF(ADDRSPACE) \ - _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE CLC_VLOAD_HALF_NAME()( \ +#define __CLC_VLOAD_HALF_DEF(ADDRSPACE) \ + _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __CLC_VLOAD_HALF_NAME()( \ size_t offset, const ADDRSPACE half *mem) { \ - return CLC_VLOAD_HALF_NAME(__clc_)(offset, mem); \ + return __CLC_VLOAD_HALF_NAME(__clc_)(offset, mem); \ } \ \ - _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE CLC_VLOADA_HALF_NAME()( \ + _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __CLC_VLOADA_HALF_NAME()( \ size_t offset, const ADDRSPACE half *mem) { \ - return CLC_VLOADA_HALF_NAME(__clc_)(offset, mem); \ + return __CLC_VLOADA_HALF_NAME(__clc_)(offset, mem); \ } -VLOAD_HALF_DEF(__private) -VLOAD_HALF_DEF(__local) -VLOAD_HALF_DEF(__constant) -VLOAD_HALF_DEF(__global) +__CLC_VLOAD_HALF_DEF(__private) +__CLC_VLOAD_HALF_DEF(__local) +__CLC_VLOAD_HALF_DEF(__constant) +__CLC_VLOAD_HALF_DEF(__global) #if 
_CLC_DISTINCT_GENERIC_AS_SUPPORTED -VLOAD_HALF_DEF(__generic) +__CLC_VLOAD_HALF_DEF(__generic) #endif -#undef VLOAD_HALF_DEF +#undef __CLC_VLOAD_HALF_DEF #endif #endif -#undef CLC_VLOAD_NAME -#undef CLC_VLOAD_HALF_NAME -#undef CLC_VLOADA_HALF_NAME +#undef __CLC_VLOAD_NAME +#undef __CLC_VLOAD_HALF_NAME +#undef __CLC_VLOADA_HALF_NAME diff --git a/libclc/opencl/lib/generic/shared/vstore.inc b/libclc/opencl/lib/generic/shared/vstore.inc index 4bdce0719912..79ae9f9c9df2 100644 --- a/libclc/opencl/lib/generic/shared/vstore.inc +++ b/libclc/opencl/lib/generic/shared/vstore.inc @@ -6,31 +6,32 @@ // //===----------------------------------------------------------------------===// -#define CLC_VSTORE_TY __CLC_XCONCAT(less_aligned_, __CLC_GENTYPE) -#define CLC_VSTORE_NAME(x) \ +#define __CLC_VSTORE_TY __CLC_XCONCAT(less_aligned_, __CLC_GENTYPE) +#define __CLC_VSTORE_NAME(x) \ __CLC_XCONCAT(__CLC_XCONCAT(x, vstore), __CLC_VECSIZE) -#define CLC_VSTORE_HALF_NAME(x, y) \ +#define __CLC_VSTORE_HALF_NAME(x, y) \ __CLC_XCONCAT(__CLC_XCONCAT(__CLC_XCONCAT(x, vstore_half), __CLC_VECSIZE), y) -#define CLC_VSTOREA_HALF_NAME(x, y) \ +#define __CLC_VSTOREA_HALF_NAME(x, y) \ __CLC_XCONCAT(__CLC_XCONCAT(__CLC_XCONCAT(x, vstorea_half), __CLC_VECSIZE), y) #ifndef __CLC_SCALAR -#define CLC_VSTORE_DEF(ADDRSPACE) \ - _CLC_OVERLOAD _CLC_DEF void CLC_VSTORE_NAME()( \ - CLC_VSTORE_TY data, size_t offset, ADDRSPACE __CLC_SCALAR_GENTYPE *p) { \ - return CLC_VSTORE_NAME(__clc_)(data, offset, p); \ +#define __CLC_VSTORE_DEF(ADDRSPACE) \ + _CLC_OVERLOAD _CLC_DEF void __CLC_VSTORE_NAME()( \ + __CLC_VSTORE_TY data, size_t offset, \ + ADDRSPACE __CLC_SCALAR_GENTYPE *p) { \ + return __CLC_VSTORE_NAME(__clc_)(data, offset, p); \ } -CLC_VSTORE_DEF(__private) -CLC_VSTORE_DEF(__local) -CLC_VSTORE_DEF(__global) +__CLC_VSTORE_DEF(__private) +__CLC_VSTORE_DEF(__local) +__CLC_VSTORE_DEF(__global) #if _CLC_DISTINCT_GENERIC_AS_SUPPORTED -CLC_VSTORE_DEF(__generic) +__CLC_VSTORE_DEF(__generic) #endif -#undef 
CLC_VSTORE_DEF +#undef __CLC_VSTORE_DEF #endif // __CLC_SCALAR @@ -39,39 +40,39 @@ CLC_VSTORE_DEF(__generic) #ifdef __CLC_FPSIZE #if __CLC_FPSIZE == 32 || __CLC_FPSIZE == 64 -#define CLC_VSTORE_HALF_DEF(ADDRSPACE, SUFFIX) \ - _CLC_OVERLOAD _CLC_DEF void CLC_VSTORE_HALF_NAME(, SUFFIX)( \ - CLC_VSTORE_TY data, size_t offset, ADDRSPACE half *p) { \ - CLC_VSTORE_HALF_NAME(__clc_, SUFFIX)(data, offset, p); \ +#define __CLC_VSTORE_HALF_DEF(ADDRSPACE, SUFFIX) \ + _CLC_OVERLOAD _CLC_DEF void __CLC_VSTORE_HALF_NAME(, SUFFIX)( \ + __CLC_VSTORE_TY data, size_t offset, ADDRSPACE half *p) { \ + __CLC_VSTORE_HALF_NAME(__clc_, SUFFIX)(data, offset, p); \ } \ \ - _CLC_OVERLOAD _CLC_DEF void CLC_VSTOREA_HALF_NAME(, SUFFIX)( \ - CLC_VSTORE_TY data, size_t offset, ADDRSPACE half *p) { \ - CLC_VSTOREA_HALF_NAME(__clc_, SUFFIX)(data, offset, p); \ + _CLC_OVERLOAD _CLC_DEF void __CLC_VSTOREA_HALF_NAME(, SUFFIX)( \ + __CLC_VSTORE_TY data, size_t offset, ADDRSPACE half *p) { \ + __CLC_VSTOREA_HALF_NAME(__clc_, SUFFIX)(data, offset, p); \ } -#define CLC_VSTORE_HALF_DEF_ALL_MODES(ADDRSPACE) \ - CLC_VSTORE_HALF_DEF(ADDRSPACE, ) \ - CLC_VSTORE_HALF_DEF(ADDRSPACE, _rtz) \ - CLC_VSTORE_HALF_DEF(ADDRSPACE, _rtn) \ - CLC_VSTORE_HALF_DEF(ADDRSPACE, _rtp) \ - CLC_VSTORE_HALF_DEF(ADDRSPACE, _rte) +#define __CLC_VSTORE_HALF_DEF_ALL_MODES(ADDRSPACE) \ + __CLC_VSTORE_HALF_DEF(ADDRSPACE, ) \ + __CLC_VSTORE_HALF_DEF(ADDRSPACE, _rtz) \ + __CLC_VSTORE_HALF_DEF(ADDRSPACE, _rtn) \ + __CLC_VSTORE_HALF_DEF(ADDRSPACE, _rtp) \ + __CLC_VSTORE_HALF_DEF(ADDRSPACE, _rte) -CLC_VSTORE_HALF_DEF_ALL_MODES(__private) -CLC_VSTORE_HALF_DEF_ALL_MODES(__local) -CLC_VSTORE_HALF_DEF_ALL_MODES(__global) +__CLC_VSTORE_HALF_DEF_ALL_MODES(__private) +__CLC_VSTORE_HALF_DEF_ALL_MODES(__local) +__CLC_VSTORE_HALF_DEF_ALL_MODES(__global) #if _CLC_DISTINCT_GENERIC_AS_SUPPORTED -CLC_VSTORE_HALF_DEF_ALL_MODES(__generic) +__CLC_VSTORE_HALF_DEF_ALL_MODES(__generic) #endif -#undef CLC_VSTORE_HALF_DEF -#undef CLC_VSTORE_HALF_DEF_ALL_MODES 
+#undef __CLC_VSTORE_HALF_DEF +#undef __CLC_VSTORE_HALF_DEF_ALL_MODES #endif #endif -#undef CLC_VSTORE_TY -#undef CLC_VSTORE_NAME -#undef CLC_VSTORE_HALF_NAME -#undef CLC_VSTOREA_HALF_NAME +#undef __CLC_VSTORE_TY +#undef __CLC_VSTORE_NAME +#undef __CLC_VSTORE_HALF_NAME +#undef __CLC_VSTOREA_HALF_NAME diff --git a/libclc/opencl/lib/spirv/math/fma.cl b/libclc/opencl/lib/spirv/math/fma.cl index 83504f63772c..0e328903ba26 100644 --- a/libclc/opencl/lib/spirv/math/fma.cl +++ b/libclc/opencl/lib/spirv/math/fma.cl @@ -9,9 +9,9 @@ #include #include -#define __FLOAT_ONLY -#define FUNCTION fma -#define __IMPL_FUNCTION(x) __clc_sw_fma +#define __CLC_FLOAT_ONLY +#define __CLC_FUNCTION fma +#define __CLC_IMPL_FUNCTION(x) __clc_sw_fma #define __CLC_BODY #include diff --git a/libcxx/cmake/caches/Generic-hardening-mode-fast-with-abi-breaks.cmake b/libcxx/cmake/caches/Generic-hardening-mode-fast-with-abi-breaks.cmake index 699d3f886686..d4ce32ce5b17 100644 --- a/libcxx/cmake/caches/Generic-hardening-mode-fast-with-abi-breaks.cmake +++ b/libcxx/cmake/caches/Generic-hardening-mode-fast-with-abi-breaks.cmake @@ -5,5 +5,6 @@ set(_defines _LIBCPP_ABI_BOUNDED_ITERATORS_IN_VECTOR _LIBCPP_ABI_BOUNDED_UNIQUE_PTR _LIBCPP_ABI_BOUNDED_ITERATORS_IN_STD_ARRAY + _LIBCPP_ABI_BOUNDED_ITERATORS_IN_OPTIONAL ) set(LIBCXX_ABI_DEFINES "${_defines}" CACHE STRING "") diff --git a/libcxx/docs/FeatureTestMacroTable.rst b/libcxx/docs/FeatureTestMacroTable.rst index a36848ebd24b..358889d8dbc3 100644 --- a/libcxx/docs/FeatureTestMacroTable.rst +++ b/libcxx/docs/FeatureTestMacroTable.rst @@ -480,7 +480,7 @@ Status ---------------------------------------------------------- ----------------- ``__cpp_lib_not_fn`` ``202306L`` ---------------------------------------------------------- ----------------- - ``__cpp_lib_optional_range_support`` *unimplemented* + ``__cpp_lib_optional_range_support`` ``202406L`` ---------------------------------------------------------- ----------------- ``__cpp_lib_out_ptr`` ``202311L`` 
---------------------------------------------------------- ----------------- diff --git a/libcxx/docs/ReleaseNotes/22.rst b/libcxx/docs/ReleaseNotes/22.rst index 191dab6b7756..f28babf548fe 100644 --- a/libcxx/docs/ReleaseNotes/22.rst +++ b/libcxx/docs/ReleaseNotes/22.rst @@ -39,6 +39,7 @@ Implemented Papers ------------------ - P2321R2: ``zip`` (`Github `__) (The paper is partially implemented. ``zip_transform_view`` is implemented in this release) +- P3168R2: Give ``std::optional`` Range Support (`Github `__) Improvements and New Features ----------------------------- diff --git a/libcxx/docs/Status/Cxx23Issues.csv b/libcxx/docs/Status/Cxx23Issues.csv index 189f8452e067..d1546f4a452b 100644 --- a/libcxx/docs/Status/Cxx23Issues.csv +++ b/libcxx/docs/Status/Cxx23Issues.csv @@ -209,7 +209,7 @@ "`LWG3746 `__","``optional``'s spaceship with ``U`` with a type derived from optional causes infinite constraint meta-recursion","2022-11 (Kona)","|Complete|","17","" "`LWG3747 `__","``ranges::uninitialized_copy_n``, ``ranges::uninitialized_move_n``, and ``ranges::destroy_n`` should use ``std::move``","2022-11 (Kona)","","","" "`LWG3750 `__","Too many papers bump ``__cpp_lib_format``","2022-11 (Kona)","|Partial|","","Only ``__cpp_lib_format_ranges`` is fully implemented" -"`LWG3751 `__","Missing feature macro for ``flat_set``","2022-11 (Kona)","","","" +"`LWG3751 `__","Missing feature macro for ``flat_set``","2022-11 (Kona)","|Complete|","21","" "`LWG3753 `__","Clarify entity vs. 
freestanding entity","2022-11 (Kona)","","","" "`LWG3754 `__","Class template expected synopsis contains declarations that do not match the detailed description","2022-11 (Kona)","|Nothing To Do|","","" "`LWG3755 `__","``tuple-for-each`` can call ``user-defined`` ``operator,``","2022-11 (Kona)","|Complete|","17","" @@ -223,14 +223,14 @@ "`LWG3766 `__","``view_interface::cbegin`` is underconstrained","2022-11 (Kona)","","","" "`LWG3770 `__","``const_sentinel_t`` is missing","2022-11 (Kona)","","","" "`LWG3773 `__","``views::zip_transform`` still requires ``F`` to be ``copy_constructible`` when empty pack","2022-11 (Kona)","|Complete|","22","" -"`LWG3774 `__","```` should include ````","2022-11 (Kona)","","","" +"`LWG3774 `__","```` should include ````","2022-11 (Kona)","|Complete|","21","" "`LWG3775 `__","Broken dependencies in the ``Cpp17Allocator`` requirements","2022-11 (Kona)","","","" "`LWG3778 `__","``vector`` missing exception specifications","2022-11 (Kona)","|Complete|","3.7","" "`LWG3781 `__","The exposition-only alias templates ``cont-key-type`` and ``cont-mapped-type`` should be removed","2022-11 (Kona)","|Nothing To Do|","","" "`LWG3782 `__","Should ```` declare ``::lerp``?","2022-11 (Kona)","|Complete|","17","" "`LWG3784 `__","std.compat should not provide ``::byte`` and its friends","2022-11 (Kona)","|Complete|","19","" "`LWG3785 `__","``ranges::to`` is over-constrained on the destination type being a range","2022-11 (Kona)","","","" -"`LWG3788 `__","``jthread::operator=(jthread&&)`` postconditions are unimplementable under self-assignment","2022-11 (Kona)","","","" +"`LWG3788 `__","``jthread::operator=(jthread&&)`` postconditions are unimplementable under self-assignment","2022-11 (Kona)","|Complete|","18","" "`LWG3792 `__","``__cpp_lib_constexpr_algorithms`` should also be defined in ````","2022-11 (Kona)","|Complete|","16","" "`LWG3795 `__","Self-move-assignment of ``std::future`` and ``std::shared_future`` have unimplementable 
postconditions","2022-11 (Kona)","","","" "`LWG3796 `__","``movable-box`` as member should use ``default-initialization`` instead of ``copy-initialization``","2022-11 (Kona)","","","" diff --git a/libcxx/docs/Status/Cxx2cPapers.csv b/libcxx/docs/Status/Cxx2cPapers.csv index e8b0c9559f40..3b8b2b7ad0b3 100644 --- a/libcxx/docs/Status/Cxx2cPapers.csv +++ b/libcxx/docs/Status/Cxx2cPapers.csv @@ -66,7 +66,7 @@ "`P2747R2 `__","``constexpr`` placement new","2024-06 (St. Louis)","|Complete|","20","" "`P2997R1 `__","Removing the common reference requirement from the indirectly invocable concepts","2024-06 (St. Louis)","|Complete|","19","Implemented as a DR against C++20. (MSVC STL and libstdc++ will do the same.)" "`P2389R2 `__","``dextents`` Index Type Parameter","2024-06 (St. Louis)","|Complete|","19","" -"`P3168R2 `__","Give ``std::optional`` Range Support","2024-06 (St. Louis)","","","" +"`P3168R2 `__","Give ``std::optional`` Range Support","2024-06 (St. Louis)","|Complete|","22","" "`P3217R0 `__","Adjoints to 'Enabling list-initialization for algorithms': find_last","2024-06 (St. Louis)","","","" "`P2985R0 `__","A type trait for detecting virtual base classes","2024-06 (St. Louis)","|Complete|","20","" "`P0843R14 `__","``inplace_vector``","2024-06 (St. 
Louis)","","","" diff --git a/libcxx/include/__iterator/wrap_iter.h b/libcxx/include/__iterator/wrap_iter.h index 2b5bc489dd44..7610586ddecb 100644 --- a/libcxx/include/__iterator/wrap_iter.h +++ b/libcxx/include/__iterator/wrap_iter.h @@ -117,6 +117,8 @@ private: friend class span; template friend struct array; + template + friend class optional; }; template diff --git a/libcxx/include/iterator b/libcxx/include/iterator index d25fdfd2a8b3..fc8bdc5e6bcf 100644 --- a/libcxx/include/iterator +++ b/libcxx/include/iterator @@ -737,6 +737,16 @@ template constexpr const E* data(initializer_list il) noexcept; # include # include +// [range.access.general] +# if _LIBCPP_STD_VER >= 20 +# include <__ranges/access.h> +# include <__ranges/data.h> +# include <__ranges/empty.h> +# include <__ranges/rbegin.h> +# include <__ranges/rend.h> +# include <__ranges/size.h> +# endif + # if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header # endif diff --git a/libcxx/include/optional b/libcxx/include/optional index e81bff50daad..39fcaa2c2ec1 100644 --- a/libcxx/include/optional +++ b/libcxx/include/optional @@ -20,6 +20,11 @@ namespace std { template class optional; + template + constexpr bool ranges::enable_view> = true; + template + constexpr auto format_kind> = range_format::disabled; + template concept is-derived-from-optional = requires(const T& t) { // exposition only [](const optional&){ }(t); @@ -102,6 +107,8 @@ namespace std { class optional { public: using value_type = T; + using iterator = implementation-defined; // see [optional.iterators] + using const_iterator = implementation-defined; // see [optional.iterators] // [optional.ctor], constructors constexpr optional() noexcept; @@ -135,6 +142,12 @@ namespace std { // [optional.swap], swap void swap(optional &) noexcept(see below ); // constexpr in C++20 + // [optional.iterators], iterator support + constexpr iterator begin() noexcept; + constexpr const_iterator begin() const noexcept; + constexpr 
iterator end() noexcept; + constexpr const_iterator end() const noexcept; + // [optional.observe], observers constexpr T const *operator->() const noexcept; constexpr T *operator->() noexcept; @@ -186,13 +199,18 @@ namespace std { # include <__compare/three_way_comparable.h> # include <__concepts/invocable.h> # include <__config> +# include <__cstddef/ptrdiff_t.h> # include <__exception/exception.h> +# include <__format/range_format.h> # include <__functional/hash.h> # include <__functional/invoke.h> # include <__functional/unary_function.h> # include <__fwd/functional.h> +# include <__iterator/bounded_iter.h> +# include <__iterator/wrap_iter.h> # include <__memory/addressof.h> # include <__memory/construct_at.h> +# include <__ranges/enable_view.h> # include <__tuple/sfinae_helpers.h> # include <__type_traits/add_pointer.h> # include <__type_traits/conditional.h> @@ -207,6 +225,7 @@ namespace std { # include <__type_traits/is_convertible.h> # include <__type_traits/is_core_convertible.h> # include <__type_traits/is_destructible.h> +# include <__type_traits/is_function.h> # include <__type_traits/is_nothrow_assignable.h> # include <__type_traits/is_nothrow_constructible.h> # include <__type_traits/is_object.h> @@ -219,6 +238,7 @@ namespace std { # include <__type_traits/is_trivially_constructible.h> # include <__type_traits/is_trivially_destructible.h> # include <__type_traits/is_trivially_relocatable.h> +# include <__type_traits/is_unbounded_array.h> # include <__type_traits/negation.h> # include <__type_traits/remove_const.h> # include <__type_traits/remove_cv.h> @@ -567,6 +587,14 @@ using __optional_sfinae_assign_base_t _LIBCPP_NODEBUG = template class optional; +# if _LIBCPP_STD_VER >= 26 +template +constexpr bool ranges::enable_view> = true; + +template +constexpr range_format format_kind> = range_format::disabled; +# endif + # if _LIBCPP_STD_VER >= 20 template @@ -586,9 +614,21 @@ class _LIBCPP_DECLSPEC_EMPTY_BASES optional private 
__optional_sfinae_assign_base_t<_Tp> { using __base _LIBCPP_NODEBUG = __optional_move_assign_base<_Tp>; + using __pointer _LIBCPP_NODEBUG = std::add_pointer_t<_Tp>; + using __const_pointer _LIBCPP_NODEBUG = std::add_pointer_t; + public: using value_type = _Tp; +# if _LIBCPP_STD_VER >= 26 +# ifdef _LIBCPP_ABI_BOUNDED_ITERATORS_IN_OPTIONAL + using iterator = __bounded_iter<__wrap_iter<__pointer>>; + using const_iterator = __bounded_iter<__wrap_iter<__const_pointer>>; +# else + using iterator = __wrap_iter<__pointer>; + using const_iterator = __wrap_iter<__const_pointer>; +# endif +# endif using __trivially_relocatable _LIBCPP_NODEBUG = conditional_t<__libcpp_is_trivially_relocatable<_Tp>::value, optional, void>; using __replaceable _LIBCPP_NODEBUG = conditional_t<__is_replaceable_v<_Tp>, optional, void>; @@ -792,6 +832,34 @@ public: } } +# if _LIBCPP_STD_VER >= 26 + // [optional.iterators], iterator support + _LIBCPP_HIDE_FROM_ABI constexpr iterator begin() noexcept { +# ifdef _LIBCPP_ABI_BOUNDED_ITERATORS_IN_OPTIONAL + return std::__make_bounded_iter( + std::__wrap_iter<__pointer>(std::addressof(this->__get())), + std::__wrap_iter<__pointer>(std::addressof(this->__get())), + std::__wrap_iter<__pointer>(std::addressof(this->__get()) + (this->has_value() ? 1 : 0))); +# else + return iterator(std::addressof(this->__get())); +# endif + } + + _LIBCPP_HIDE_FROM_ABI constexpr const_iterator begin() const noexcept { +# ifdef _LIBCPP_ABI_BOUNDED_ITERATORS_IN_OPTIONAL + return std::__make_bounded_iter( + std::__wrap_iter<__const_pointer>(std::addressof(this->__get())), + std::__wrap_iter<__const_pointer>(std::addressof(this->__get())), + std::__wrap_iter<__const_pointer>(std::addressof(this->__get()) + (this->has_value() ? 1 : 0))); +# else + return const_iterator(std::addressof(this->__get())); +# endif + } + + _LIBCPP_HIDE_FROM_ABI constexpr iterator end() noexcept { return begin() + (this->has_value() ? 
1 : 0); } + _LIBCPP_HIDE_FROM_ABI constexpr const_iterator end() const noexcept { return begin() + (this->has_value() ? 1 : 0); } +# endif + _LIBCPP_HIDE_FROM_ABI constexpr add_pointer_t operator->() const noexcept { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(this->has_value(), "optional operator-> called on a disengaged value"); return std::addressof(this->__get()); diff --git a/libcxx/include/version b/libcxx/include/version index aae9277a7dfc..16917a3bd9dd 100644 --- a/libcxx/include/version +++ b/libcxx/include/version @@ -585,7 +585,7 @@ __cpp_lib_void_t 201411L # define __cpp_lib_mdspan 202406L # undef __cpp_lib_not_fn # define __cpp_lib_not_fn 202306L -// # define __cpp_lib_optional_range_support 202406L +# define __cpp_lib_optional_range_support 202406L # undef __cpp_lib_out_ptr # define __cpp_lib_out_ptr 202311L // # define __cpp_lib_philox_engine 202406L diff --git a/libcxx/modules/std/optional.inc b/libcxx/modules/std/optional.inc index 0f812bc0e24a..9ee51117277c 100644 --- a/libcxx/modules/std/optional.inc +++ b/libcxx/modules/std/optional.inc @@ -10,7 +10,12 @@ export namespace std { // [optional.optional], class template optional using std::optional; - +#if _LIBCPP_STD_VER >= 26 + // [optional.iterators], iterator support + namespace ranges { + using std::ranges::enable_view; + } +#endif // [optional.nullopt], no-value state indicator using std::nullopt; using std::nullopt_t; @@ -18,6 +23,10 @@ export namespace std { // [optional.bad.access], class bad_optional_access using std::bad_optional_access; +#if _LIBCPP_STD_VER >= 26 + using std::format_kind; +#endif + // [optional.relops], relational operators using std::operator==; using std::operator!=; diff --git a/libcxx/test/libcxx/utilities/optional/optional.iterator/iterator.compile.pass.cpp b/libcxx/test/libcxx/utilities/optional/optional.iterator/iterator.compile.pass.cpp new file mode 100644 index 000000000000..3cdd7553e2e5 --- /dev/null +++ 
b/libcxx/test/libcxx/utilities/optional/optional.iterator/iterator.compile.pass.cpp @@ -0,0 +1,30 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// REQUIRES: std-at-least-c++26 + +// + +// template class optional::iterator; +// template class optional::const_iterator; + +#include + +template +concept has_iterator_aliases = requires { + typename T::iterator; + typename T::const_iterator; +}; + +static_assert(has_iterator_aliases>); +static_assert(has_iterator_aliases>); + +// TODO: Uncomment these once P2988R12 is implemented, as they would be testing optional + +// static_assert(!has_iterator_aliases>); +// static_assert(!has_iterator_aliases>); diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/optional.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/optional.version.compile.pass.cpp index ccdb1a8c11a0..aca6290f5a4b 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/optional.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/optional.version.compile.pass.cpp @@ -146,17 +146,11 @@ # error "__cpp_lib_optional should have the value 202110L in c++26" # endif -# if !defined(_LIBCPP_VERSION) -# ifndef __cpp_lib_optional_range_support -# error "__cpp_lib_optional_range_support should be defined in c++26" -# endif -# if __cpp_lib_optional_range_support != 202406L -# error "__cpp_lib_optional_range_support should have the value 202406L in c++26" -# endif -# else -# ifdef __cpp_lib_optional_range_support -# error "__cpp_lib_optional_range_support should not be defined because it is 
unimplemented in libc++!" -# endif +# ifndef __cpp_lib_optional_range_support +# error "__cpp_lib_optional_range_support should be defined in c++26" +# endif +# if __cpp_lib_optional_range_support != 202406L +# error "__cpp_lib_optional_range_support should have the value 202406L in c++26" # endif #endif // TEST_STD_VER > 23 diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp index 7bd8e8979e6f..cde2f258b773 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp @@ -7437,17 +7437,11 @@ # error "__cpp_lib_optional should have the value 202110L in c++26" # endif -# if !defined(_LIBCPP_VERSION) -# ifndef __cpp_lib_optional_range_support -# error "__cpp_lib_optional_range_support should be defined in c++26" -# endif -# if __cpp_lib_optional_range_support != 202406L -# error "__cpp_lib_optional_range_support should have the value 202406L in c++26" -# endif -# else -# ifdef __cpp_lib_optional_range_support -# error "__cpp_lib_optional_range_support should not be defined because it is unimplemented in libc++!" 
-# endif +# ifndef __cpp_lib_optional_range_support +# error "__cpp_lib_optional_range_support should be defined in c++26" +# endif +# if __cpp_lib_optional_range_support != 202406L +# error "__cpp_lib_optional_range_support should have the value 202406L in c++26" # endif # ifndef __cpp_lib_out_ptr diff --git a/libcxx/test/std/library/description/conventions/customization.point.object/cpo.compile.pass.cpp b/libcxx/test/std/library/description/conventions/customization.point.object/cpo.compile.pass.cpp index 678483b9b2f2..49497875dcf9 100644 --- a/libcxx/test/std/library/description/conventions/customization.point.object/cpo.compile.pass.cpp +++ b/libcxx/test/std/library/description/conventions/customization.point.object/cpo.compile.pass.cpp @@ -81,6 +81,10 @@ static_assert(test(std::ranges::rend, a)); static_assert(test(std::ranges::size, a)); static_assert(test(std::ranges::ssize, a)); +#if TEST_STD_VER >= 26 +// static_assert(test(std::views::reserve_hint, a)); +#endif + // [range.factories] // views::empty is not a CPO static_assert(test(std::views::iota, 1)); diff --git a/libcxx/test/std/ranges/range.access/include.iterator.pass.cpp b/libcxx/test/std/ranges/range.access/include.iterator.pass.cpp new file mode 100644 index 000000000000..bb2cda0e4d90 --- /dev/null +++ b/libcxx/test/std/ranges/range.access/include.iterator.pass.cpp @@ -0,0 +1,68 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// REQUIRES: std-at-least-c++20 + +// [range.access.general]/1: +// In addition to being available via inclusion of the header, the customization point objects in +// [range.access] are available when the header is included. 
+ +#include +#include + +#include "test_macros.h" + +template +constexpr void test(CPO& o, Args&&... args) { + static_assert(std::is_const_v); + static_assert(std::is_class_v); + static_assert(std::is_trivially_copyable_v); + static_assert(std::is_trivially_default_constructible_v); + + auto p = o; + using T = decltype(p); + (void)o(args...); // to make sure the CPO can actually be used + + // The type of a customization point object, ignoring cv-qualifiers, shall model semiregular. + static_assert(std::semiregular); + + // The type T of a customization point object, ignoring cv-qualifiers, shall model... + static_assert(std::invocable); + static_assert(std::invocable); + static_assert(std::invocable); + static_assert(std::invocable); +} + +int a[10]; + +constexpr bool test() { + test(std::ranges::begin, a); + test(std::ranges::end, a); + test(std::ranges::cbegin, a); + test(std::ranges::cdata, a); + test(std::ranges::cend, a); + test(std::ranges::crbegin, a); + test(std::ranges::crend, a); + test(std::ranges::data, a); + test(std::ranges::empty, a); + test(std::ranges::rbegin, a); + test(std::ranges::rend, a); + test(std::ranges::size, a); + test(std::ranges::ssize, a); + +#if TEST_STD_VER >= 26 + // test(std::views::reserve_hint, a); +#endif + + return true; +} + +int main() { + test(); + static_assert(test()); +} diff --git a/libcxx/test/std/thread/thread.jthread/assign.move.pass.cpp b/libcxx/test/std/thread/thread.jthread/assign.move.pass.cpp index b714cc58cbd3..fd5a1705c56a 100644 --- a/libcxx/test/std/thread/thread.jthread/assign.move.pass.cpp +++ b/libcxx/test/std/thread/thread.jthread/assign.move.pass.cpp @@ -112,5 +112,14 @@ int main(int, char**) { assert(j1.get_id() == j2Id); } + // LWG3788: self-assignment + { + std::jthread j = support::make_test_jthread([] {}); + auto oldId = j.get_id(); + j = std::move(j); + + assert(j.get_id() == oldId); + } + return 0; } diff --git a/libcxx/test/std/utilities/optional/optional.iterator/begin.pass.cpp 
b/libcxx/test/std/utilities/optional/optional.iterator/begin.pass.cpp new file mode 100644 index 000000000000..df95a8df3793 --- /dev/null +++ b/libcxx/test/std/utilities/optional/optional.iterator/begin.pass.cpp @@ -0,0 +1,64 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// REQUIRES: std-at-least-c++26 + +// + +// constexpr iterator optional::begin() noexcept; +// constexpr const_iterator optional::begin() const noexcept; + +#include +#include +#include +#include +#include + +template +constexpr bool test() { + std::optional opt{T{}}; + + { // begin() is marked noexcept + static_assert(noexcept(opt.begin())); + static_assert(noexcept(std::as_const(opt).begin())); + } + + { // Dereferencing an iterator at the beginning == indexing the 0th element, and that calling begin() again return the same iterator. + auto iter1 = opt.begin(); + auto iter2 = std::as_const(opt).begin(); + assert(*iter1 == iter1[0]); + assert(*iter2 == iter2[0]); + assert(iter1 == opt.begin()); + assert(iter2 == std::as_const(opt).begin()); + } + + { // Calling begin() multiple times on a disengaged optional returns the same iterator. 
+ std::optional disengaged{std::nullopt}; + auto iter1 = disengaged.begin(); + auto iter2 = std::as_const(disengaged).begin(); + assert(iter1 == disengaged.begin()); + assert(iter2 == std::as_const(disengaged).begin()); + } + + return true; +} + +constexpr bool tests() { + assert(test()); + assert(test()); + assert(test()); + assert(test()); + return true; +} + +int main(int, char**) { + assert(tests()); + static_assert(tests()); + + return 0; +} diff --git a/libcxx/test/std/utilities/optional/optional.iterator/end.pass.cpp b/libcxx/test/std/utilities/optional/optional.iterator/end.pass.cpp new file mode 100644 index 000000000000..966c3e744188 --- /dev/null +++ b/libcxx/test/std/utilities/optional/optional.iterator/end.pass.cpp @@ -0,0 +1,74 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// REQUIRES: std-at-least-c++26 + +// + +// constexpr iterator optional::end() noexcept; +// constexpr const_iterator optional::end() const noexcept; + +#include +#include +#include +#include +#include + +template +constexpr bool test() { + std::optional disengaged{std::nullopt}; + + { // end() is marked noexcept + static_assert(noexcept(disengaged.end())); + static_assert(noexcept(std::as_const(disengaged).end())); + } + + { // end() == begin() and end() == end() if the optional is disengaged + auto it = disengaged.end(); + auto it2 = std::as_const(disengaged).end(); + + assert(it == disengaged.begin()); + assert(disengaged.begin() == it); + assert(it == disengaged.end()); + + assert(it2 == std::as_const(disengaged).begin()); + assert(std::as_const(disengaged).begin() == it2); + assert(it2 == std::as_const(disengaged).end()); + } + + std::optional 
engaged{T{}}; + + { // end() != begin() if the optional is engaged + auto it = engaged.end(); + auto it2 = std::as_const(engaged).end(); + + assert(it != engaged.begin()); + assert(engaged.begin() != it); + + assert(it2 != std::as_const(engaged).begin()); + assert(std::as_const(engaged).begin() != it2); + } + + return true; +} + +constexpr bool tests() { + assert(test()); + assert(test()); + assert(test()); + assert(test()); + + return true; +} + +int main(int, char**) { + assert(tests()); + static_assert(tests()); + + return 0; +} diff --git a/libcxx/test/std/utilities/optional/optional.iterator/iterator.pass.cpp b/libcxx/test/std/utilities/optional/optional.iterator/iterator.pass.cpp new file mode 100644 index 000000000000..1203290a0290 --- /dev/null +++ b/libcxx/test/std/utilities/optional/optional.iterator/iterator.pass.cpp @@ -0,0 +1,98 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// REQUIRES: std-at-least-c++26 + +// + +// template class optional::iterator; +// template class optional::const_iterator; + +#include +#include +#include +#include +#include +#include + +template +constexpr bool test() { + std::optional opt{__val}; + + { // Dereferencing an iterator of an engaged optional will return the same value that the optional holds. + auto it = opt.begin(); + auto it2 = std::as_const(opt).begin(); + assert(*it == *opt); + assert(*it2 == *std::as_const(opt)); + } + + { // optional::iterator and optional::const_iterator satisfy the Cpp17RandomAccessIterator and contiguous iterator. 
+ auto it = opt.begin(); + auto it2 = std::as_const(opt).begin(); + assert(std::contiguous_iterator); + assert(std::contiguous_iterator); + + assert(std::random_access_iterator); + assert(std::random_access_iterator); + } + + { // const_iterator::value_type == std::remove_cv_t, const_iterator::reference == const T&, iterator::value_type = std::remove_cv_t, iterator::reference == T& + auto it = opt.begin(); + auto it2 = std::as_const(opt).begin(); + assert((std::is_same_v>)); + assert((std::is_same_v)); + assert((std::is_same_v>)); + assert((std::is_same_v)); + } + + { // std::ranges::size for an engaged optional == 1, disengaged optional == 0 + const std::optional disengaged{std::nullopt}; + std::optional disengaged2{std::nullopt}; + assert(std::ranges::size(opt) == 1); + assert(std::ranges::size(std::as_const(opt)) == 1); + + assert(std::ranges::size(disengaged) == 0); + assert(std::ranges::size(disengaged2) == 0); + } + + { // std::ranges::enable_view> == true, and std::format_kind> == std::range_format::disabled + static_assert(std::ranges::enable_view> == true); + static_assert(std::format_kind> == std::range_format::disabled); + } + + // An optional with value that is reset will have a begin() == end(), then when it is reassigned a value, + // begin() != end(), and *begin() will contain the new value. 
+ { + std::optional val{__val}; + assert(val.begin() != val.end()); + val.reset(); + assert(val.begin() == val.end()); + val.emplace(__val); + assert(val.begin() != val.end()); + assert(*(val.begin()) == __val); + } + + return true; +} + +constexpr bool tests() { + assert((test())); + assert((test())); + assert((test())); + assert((test())); + assert((test())); + + return true; +} + +int main(int, char**) { + assert(tests()); + static_assert(tests()); + + return 0; +} diff --git a/libcxx/utils/generate_feature_test_macro_components.py b/libcxx/utils/generate_feature_test_macro_components.py index d9317e00e3f4..8d57a07b8836 100644 --- a/libcxx/utils/generate_feature_test_macro_components.py +++ b/libcxx/utils/generate_feature_test_macro_components.py @@ -1012,7 +1012,6 @@ feature_test_macros = [ "name": "__cpp_lib_optional_range_support", "values": {"c++26": 202406}, # P3168R2 Give std::optional Range Support "headers": ["optional"], - "unimplemented": True, }, { "name": "__cpp_lib_out_ptr", diff --git a/lldb/include/lldb/Target/StackFrame.h b/lldb/include/lldb/Target/StackFrame.h index 3f51c9a7f22f..d4104bfe49d2 100644 --- a/lldb/include/lldb/Target/StackFrame.h +++ b/lldb/include/lldb/Target/StackFrame.h @@ -241,8 +241,9 @@ public: return m_reg_context_sp; } - /// Retrieve the list of variables that are in scope at this StackFrame's - /// pc. + /// Retrieve the list of variables whose scope either: + /// * contains this StackFrame's pc, + /// * is a child of this StackFrame's current scope. /// /// A frame that is not live may return an empty VariableList for a given /// pc value even though variables would be available at this point if it @@ -274,6 +275,9 @@ public: /// that are visible to the entire compilation unit (e.g. file /// static in C, globals that are homed in this CU). /// + /// \param[in] must_have_valid_location + /// Whether to filter variables whose location is not available at this + /// StackFrame's pc. 
/// \return /// A pointer to a list of variables. lldb::VariableListSP diff --git a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp index 3118ff151d1c..b4207439f528 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp @@ -105,7 +105,9 @@ CPlusPlusLanguage::GetFunctionNameInfo(ConstString name) const { bool CPlusPlusLanguage::SymbolNameFitsToLanguage(Mangled mangled) const { const char *mangled_name = mangled.GetMangledName().GetCString(); - return mangled_name && Mangled::IsMangledName(mangled_name); + auto mangling_scheme = Mangled::GetManglingScheme(mangled_name); + return mangled_name && (mangling_scheme == Mangled::eManglingSchemeItanium || + mangling_scheme == Mangled::eManglingSchemeMSVC); } ConstString CPlusPlusLanguage::GetDemangledFunctionNameWithoutArguments( diff --git a/lldb/unittests/Language/CPlusPlus/CPlusPlusLanguageTest.cpp b/lldb/unittests/Language/CPlusPlus/CPlusPlusLanguageTest.cpp index 6eeb4f54952b..957fb3f60049 100644 --- a/lldb/unittests/Language/CPlusPlus/CPlusPlusLanguageTest.cpp +++ b/lldb/unittests/Language/CPlusPlus/CPlusPlusLanguageTest.cpp @@ -397,3 +397,33 @@ TEST(CPlusPlusLanguage, CPlusPlusNameParser) { // Don't crash. CPlusPlusNameParser((const char *)nullptr); } + +TEST(CPlusPlusLanguage, DoesNotMatchCxx) { + // Test that a symbol name that is NOT C++ does not match C++. + + SubsystemRAII lang; + Language *CPlusPlusLang = + Language::FindPlugin(lldb::eLanguageTypeC_plus_plus); + + EXPECT_TRUE(CPlusPlusLang != nullptr); + + Mangled swiftSymbol("$sS"); + EXPECT_FALSE(CPlusPlusLang->SymbolNameFitsToLanguage(swiftSymbol)); +} + +TEST(CPlusPlusLanguage, MatchesCxx) { + // Test that a symbol name that is C++ does match C++ (both Itanium and MSVC). 
+ + SubsystemRAII lang; + Language *CPlusPlusLang = + Language::FindPlugin(lldb::eLanguageTypeC_plus_plus); + + EXPECT_TRUE(CPlusPlusLang != nullptr); + + Mangled itaniumSymbol("_Z3Foo"); + EXPECT_TRUE(CPlusPlusLang->SymbolNameFitsToLanguage(itaniumSymbol)); + Mangled itaniumExtensionSymbol("___Z3Bar_block_invoke"); + EXPECT_TRUE(CPlusPlusLang->SymbolNameFitsToLanguage(itaniumExtensionSymbol)); + Mangled msvcSymbol("??x@@3AH"); + EXPECT_TRUE(CPlusPlusLang->SymbolNameFitsToLanguage(msvcSymbol)); +} diff --git a/llvm/benchmarks/CMakeLists.txt b/llvm/benchmarks/CMakeLists.txt index 9613678d2e0a..e32aba5ebff4 100644 --- a/llvm/benchmarks/CMakeLists.txt +++ b/llvm/benchmarks/CMakeLists.txt @@ -11,20 +11,6 @@ add_benchmark(FormatVariadicBM FormatVariadicBM.cpp PARTIAL_SOURCES_INTENDED) add_benchmark(GetIntrinsicInfoTableEntriesBM GetIntrinsicInfoTableEntriesBM.cpp PARTIAL_SOURCES_INTENDED) add_benchmark(SandboxIRBench SandboxIRBench.cpp PARTIAL_SOURCES_INTENDED) -# Extract the list of symbols in a random utility as sample data. 
-set(SYMBOL_TEST_DATA_FILE "sample_symbol_list.txt") -set(SYMBOL_TEST_DATA_SOURCE_BINARY $) - -add_custom_command(OUTPUT ${SYMBOL_TEST_DATA_FILE} - COMMAND $ --no-demangle --no-sort - --format=just-symbols - ${SYMBOL_TEST_DATA_SOURCE_BINARY} > ${SYMBOL_TEST_DATA_FILE} - DEPENDS "$" "$") - -add_custom_target(generate-runtime-libcalls-sample-symbol-list - DEPENDS ${SYMBOL_TEST_DATA_FILE}) add_benchmark(RuntimeLibcallsBench RuntimeLibcalls.cpp PARTIAL_SOURCES_INTENDED) -add_dependencies(RuntimeLibcallsBench generate-runtime-libcalls-sample-symbol-list) -target_compile_definitions(RuntimeLibcallsBench PRIVATE - -DSYMBOL_TEST_DATA_FILE="${CMAKE_CURRENT_BINARY_DIR}/${SYMBOL_TEST_DATA_FILE}") + diff --git a/llvm/benchmarks/RuntimeLibcalls.cpp b/llvm/benchmarks/RuntimeLibcalls.cpp index 81a5a24ec8f9..9ac77bb74a3d 100644 --- a/llvm/benchmarks/RuntimeLibcalls.cpp +++ b/llvm/benchmarks/RuntimeLibcalls.cpp @@ -43,6 +43,7 @@ static std::vector getRandomFuncNames() { return TestFuncNames; } +#ifdef SYMBOL_TEST_DATA_FILE static std::vector readSymbolsFromFile(StringRef InputFile) { auto BufOrError = MemoryBuffer::getFileOrSTDIN(InputFile, /*IsText=*/true); if (!BufOrError) { @@ -69,6 +70,7 @@ static std::vector readSymbolsFromFile(StringRef InputFile) { } return Lines; } +#endif static void BM_LookupRuntimeLibcallByNameKnownCalls(benchmark::State &State) { std::vector Names = getLibcallNameStringRefs(); @@ -93,6 +95,7 @@ static void BM_LookupRuntimeLibcallByNameRandomCalls(benchmark::State &State) { } } +#ifdef SYMBOL_TEST_DATA_FILE // This isn't fully representative, it doesn't include any anonymous functions. 
// nm -n --no-demangle --format=just-symbols sample-binary > sample.txt static void BM_LookupRuntimeLibcallByNameSampleData(benchmark::State &State) { @@ -106,9 +109,13 @@ static void BM_LookupRuntimeLibcallByNameSampleData(benchmark::State &State) { } } } +#endif BENCHMARK(BM_LookupRuntimeLibcallByNameKnownCalls); BENCHMARK(BM_LookupRuntimeLibcallByNameRandomCalls); + +#ifdef SYMBOL_TEST_DATA_FILE BENCHMARK(BM_LookupRuntimeLibcallByNameSampleData); +#endif BENCHMARK_MAIN(); diff --git a/llvm/cmake/modules/HandleLLVMStdlib.cmake b/llvm/cmake/modules/HandleLLVMStdlib.cmake index dda1caa846dc..a7e138aa0789 100644 --- a/llvm/cmake/modules/HandleLLVMStdlib.cmake +++ b/llvm/cmake/modules/HandleLLVMStdlib.cmake @@ -2,7 +2,6 @@ # if the user has requested it. include(DetermineGCCCompatible) -include(CheckIncludeFiles) if(NOT DEFINED LLVM_STDLIB_HANDLED) set(LLVM_STDLIB_HANDLED ON) @@ -20,17 +19,7 @@ if(NOT DEFINED LLVM_STDLIB_HANDLED) if(LLVM_COMPILER_IS_GCC_COMPATIBLE) check_cxx_compiler_flag("-stdlib=libc++" CXX_COMPILER_SUPPORTS_STDLIB) check_linker_flag(CXX "-stdlib=libc++" CXX_LINKER_SUPPORTS_STDLIB) - - # Check whether C++ include files are available - # runtimes/CMakeLists.txt adds -nostdlib++ and -nostdinc++ to - # CMAKE_REQUIRED_FLAGS, which are incompatible with -stdlib=libc++; use - # a fresh CMAKE_REQUIRED_FLAGS environment. 
- cmake_push_check_state(RESET) - set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -stdlib=libc++") - check_include_files("chrono" CXX_COMPILER_SUPPORTS_STDLIB_CHRONO LANGUAGE CXX) - cmake_pop_check_state() - - if(CXX_COMPILER_SUPPORTS_STDLIB AND CXX_LINKER_SUPPORTS_STDLIB AND CXX_COMPILER_SUPPORTS_STDLIB_CHRONO) + if(CXX_COMPILER_SUPPORTS_STDLIB AND CXX_LINKER_SUPPORTS_STDLIB) append("-stdlib=libc++" CMAKE_CXX_FLAGS CMAKE_EXE_LINKER_FLAGS CMAKE_SHARED_LINKER_FLAGS CMAKE_MODULE_LINKER_FLAGS) diff --git a/llvm/cmake/modules/LLVMConfig.cmake.in b/llvm/cmake/modules/LLVMConfig.cmake.in index c39c33f0c779..c15b9576cd5d 100644 --- a/llvm/cmake/modules/LLVMConfig.cmake.in +++ b/llvm/cmake/modules/LLVMConfig.cmake.in @@ -55,8 +55,6 @@ endif() set(LLVM_ENABLE_RTTI @LLVM_ENABLE_RTTI@) -set(LLVM_ENABLE_LIBCXX @LLVM_ENABLE_LIBCXX@) - set(LLVM_ENABLE_LIBEDIT @HAVE_LIBEDIT@) if(LLVM_ENABLE_LIBEDIT) find_package(LibEdit) diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 1aebcc443996..a71eefd1eb68 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -413,7 +413,7 @@ added in the future: - On AArch64 the callee preserves all general purpose registers, except X0-X8 and X16-X18. Not allowed with ``nest``. - - On RISC-V the callee preserve x5-x31 except x6, x7 and x28 registers. + - On RISC-V the callee preserves x5-x31 except x6, x7 and x28 registers. The idea behind this convention is to support calls to runtime functions that have a hot path and a cold path. The hot path is usually a small piece @@ -575,7 +575,7 @@ DLL storage classes: and the function or variable name. On XCOFF targets, ``dllexport`` indicates that the symbol will be made visible to other modules using "exported" visibility and thus placed by the linker in the loader section symbol table. 
- Since this storage class exists for defining a dll interface, the compiler, + Since this storage class exists for defining a DLL interface, the compiler, assembler and linker know it is externally referenced and must refrain from deleting the symbol. @@ -1887,7 +1887,7 @@ Attribute Groups Attribute groups are groups of attributes that are referenced by objects within the IR. They are important for keeping ``.ll`` files readable, because a lot of -functions will use the same set of attributes. In the degenerative case of a +functions will use the same set of attributes. In the degenerate case of a ``.ll`` file that corresponds to a single ``.c`` file, the single attribute group will capture the important command line flags used to build that file. @@ -1946,8 +1946,8 @@ For example: ``::operator::delete``. Matching malloc/realloc/free calls within a family can be optimized, but mismatched ones will be left alone. ``allockind("KIND")`` - Describes the behavior of an allocation function. The KIND string contains comma - separated entries from the following options: + Describes the behavior of an allocation function. The KIND string contains + comma-separated entries from the following options: * "alloc": the function returns a new block of memory or null. * "realloc": the function returns a new block of memory or null. If the @@ -2047,7 +2047,7 @@ For example: even if this attribute says the frame pointer can be eliminated. The allowed string values are: - * ``"none"`` (default) - the frame pointer can be eliminated, and it's + * ``"none"`` (default) - the frame pointer can be eliminated, and its register can be used for other purposes. * ``"reserved"`` - the frame pointer register must either be updated to point to a valid frame record for the current function, or not be @@ -2201,7 +2201,7 @@ For example: A ``nofree`` function is explicitly allowed to free memory which it allocated or (if not ``nosync``) arrange for another thread to free - memory on it's behalf. 
As a result, perhaps surprisingly, a ``nofree`` + memory on its behalf. As a result, perhaps surprisingly, a ``nofree`` function can return a pointer to a previously deallocated :ref:`allocated object`. ``noimplicitfloat`` @@ -2232,14 +2232,14 @@ For example: may make calls to the function faster, at the cost of extra program startup time if the function is not called during program startup. ``noprofile`` - This function attribute prevents instrumentation based profiling, used for + This function attribute prevents instrumentation-based profiling, used for coverage or profile based optimization, from being added to a function. It also blocks inlining if the caller and callee have different values of this attribute. ``skipprofile`` - This function attribute prevents instrumentation based profiling, used for + This function attribute prevents instrumentation-based profiling, used for coverage or profile based optimization, from being added to a function. This - attribute does not restrict inlining, so instrumented instruction could end + attribute does not restrict inlining, so instrumented instructions could end up in this function. ``noredzone`` This attribute indicates that the code generator should not use a @@ -2339,7 +2339,7 @@ For example: * ``"prologue-short-redirect"`` - This style of patchable function is intended to support patching a function prologue to - redirect control away from the function in a thread safe + redirect control away from the function in a thread-safe manner. It guarantees that the first instruction of the function will be large enough to accommodate a short jump instruction, and will be sufficiently aligned to allow being @@ -2584,7 +2584,7 @@ For example: ``uwtable[(sync|async)]`` This attribute indicates that the ABI being targeted requires that an unwind table entry be produced for this function even if we can - show that no exceptions passes by it. This is normally the case for + show that no exceptions pass by it. 
This is normally the case for the ELF x86-64 abi, but it can be disabled for some compilation units. The optional parameter describes what kind of unwind tables to generate: ``sync`` for normal unwind tables, ``async`` for asynchronous @@ -2599,7 +2599,7 @@ For example: ``shadowcallstack`` This attribute indicates that the ShadowCallStack checks are enabled for the function. The instrumentation checks that the return address for the - function has not changed between the function prolog and epilog. It is + function has not changed between the function prologue and epilogue. It is currently x86_64-specific. .. _langref_mustprogress: @@ -2807,7 +2807,7 @@ operand bundle tag. These operand bundles represent an alternate "safe" continuation for the call site they're attached to, and can be used by a suitable runtime to deoptimize the compiled frame at the specified call site. There can be at most one ``"deopt"`` operand -bundle attached to a call site. Exact details of deoptimization is +bundle attached to a call site. Exact details of deoptimization are out of scope for the language reference, but it usually involves rewriting a compiled frame into a set of interpreted frames. @@ -2896,7 +2896,7 @@ generated code. For more details, see :ref:`GC Transitions The bundle contains an arbitrary list of Values which need to be passed to GC transition code. They will be lowered and passed as operands to -the appropriate GC_TRANSITION nodes in the selection DAG. It is assumed +the appropriate ``GC_TRANSITION`` nodes in the selection DAG. It is assumed that these arguments must be available before and after (but not necessarily during) the execution of the callee. @@ -3334,7 +3334,7 @@ by the minus sign character ('-'). The canonical forms are: This information is passed along to the backend so that it generates code for the proper architecture. It's possible to override this on the -command line with the ``-mtriple`` command line option. 
+command line with the ``-mtriple`` command-line option. .. _allocatedobjects: @@ -4289,7 +4289,7 @@ X86_amx Type :Overview: The x86_amx type represents a value held in an AMX tile register on an x86 -machine. The operations allowed on it are quite limited. Only few intrinsics +machine. The operations allowed on it are quite limited. Only a few intrinsics are allowed: stride load and store, zero and dot product. No instruction is allowed for this type. There are no arguments, arrays, pointers, vectors or constants of this type. @@ -5058,14 +5058,14 @@ Addresses of Basic Blocks The '``blockaddress``' constant computes the address of the specified basic block in the specified function. -It always has an ``ptr addrspace(P)`` type, where ``P`` is the address space +It always has a ``ptr addrspace(P)`` type, where ``P`` is the address space of the function containing ``%block`` (usually ``addrspace(0)``). Taking the address of the entry block is illegal. This value only has defined behavior when used as an operand to the ':ref:`indirectbr `' or for comparisons against null. Pointer -equality tests between labels addresses results in undefined behavior --- +equality tests between label addresses result in undefined behavior --- though, again, comparison against null is ok, and no label is equal to the null pointer. This may be passed around as an opaque pointer sized value as long as the bits are not inspected. This allows ``ptrtoint`` and arithmetic to be @@ -5098,7 +5098,7 @@ The target function may not have ``extern_weak`` linkage. to the function. - ``dso_local_equivalent`` can be implemented with a stub that tail-calls the function. Many targets support relocations that resolve at link time to either - a function or a stub for it, depending on if the function is defined within the + a function or a stub for it, depending on whether the function is defined within the linkage unit; LLVM will use this when available. (This is commonly called a "PLT stub".) 
On other targets, the stub may need to be emitted explicitly. @@ -5320,7 +5320,7 @@ the '``unwind``' keyword, the behavior is undefined. If multiple keywords appear, the '``sideeffect``' keyword must come first, the '``alignstack``' keyword second, the '``inteldialect``' keyword -third and the '``unwind``' keyword last. +third, and the '``unwind``' keyword last. Inline Asm Constraint String ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -5483,7 +5483,7 @@ followed by two letters (e.g. "``^wc``"), or "``{``" register-name "``}``" The one and two letter constraint codes are typically chosen to be the same as GCC's constraint codes. -A single constraint may include one or more than constraint code in it, leaving +A single constraint may include one or more constraint codes in it, leaving it up to LLVM to choose which one to use. This is included mainly for compatibility with the translation of GCC inline asm coming from clang. @@ -6028,7 +6028,7 @@ Inline Asm Metadata The call instructions that wrap inline asm nodes may have a "``!srcloc``" MDNode attached to it that contains a list of constant integers. If present, the code generator will use the integer as the -location cookie value when report errors through the ``LLVMContext`` +location cookie value when reporting errors through the ``LLVMContext`` error reporting mechanisms. This allows a front-end to correlate backend errors that occur with inline asm back to the source code that produced it. For example: @@ -6209,7 +6209,7 @@ Unlike instructions, global objects (functions and global variables) may have multiple metadata attachments with the same identifier. A transformation is required to drop any metadata attachment that it -does not know or know it can't preserve. Currently there is an +does not recognize or cannot preserve. 
Currently there is an exception for metadata attachment to globals for ``!func_sanitize``, ``!type``, ``!absolute_symbol`` and ``!associated`` which can't be unconditionally dropped unless the global is itself deleted. @@ -6460,7 +6460,7 @@ pointer and pointee is called association. The optional array is currently associated. The optional ``allocated`` is a ``DIExpression`` that describes whether the allocatable array is currently allocated. The optional ``rank`` is a ``DIExpression`` that describes the -rank (number of dimensions) of fortran assumed rank array (rank is +rank (number of dimensions) of Fortran assumed rank array (rank is known at runtime). The optional ``bitStride`` is an unsigned constant that describes the number of bits occupied by an element of the array; this is only needed if it differs from the element type's natural @@ -6776,7 +6776,7 @@ The current supported opcode vocabulary is limited: - ``DW_OP_plus_uconst, 93`` adds ``93`` to the working expression. - ``DW_OP_LLVM_fragment, 16, 8`` specifies the offset and size (``16`` and ``8`` here, respectively) of the variable fragment from the working expression. Note - that contrary to DW_OP_bit_piece, the offset is describing the location + that contrary to ``DW_OP_bit_piece``, the offset is describing the location within the described source variable. - ``DW_OP_LLVM_convert, 16, DW_ATE_signed`` specifies a bit size and encoding (``16`` and ``DW_ATE_signed`` here, respectively) to which the top of the @@ -6844,9 +6844,9 @@ The current supported opcode vocabulary is limited: expression over two registers. - ``DW_OP_push_object_address`` pushes the address of the object which can then serve as a descriptor in subsequent calculation. This opcode can be used to - calculate bounds of fortran allocatable array which has array descriptors. + calculate bounds of a Fortran allocatable array which has array descriptors. 
- ``DW_OP_over`` duplicates the entry currently second in the stack at the top - of the stack. This opcode can be used to calculate bounds of fortran assumed + of the stack. This opcode can be used to calculate bounds of a Fortran assumed rank array which has rank known at run time and current dimension number is implicitly first element of the stack. - ``DW_OP_LLVM_implicit_pointer`` It specifies the dereferenced value. It can @@ -7447,7 +7447,7 @@ For example, in the code below, the call instruction may only target the ``callback`` metadata may be attached to a function declaration, or definition. (Call sites are excluded only due to the lack of a use case.) For ease of -exposition, we'll refer to the function annotated w/ metadata as a broker +exposition, we'll refer to the function annotated with metadata as a broker function. The metadata describes how the arguments of a call to the broker are in turn passed to the callback function specified by the metadata. Thus, the ``callback`` metadata provides a partial description of a call site inside the @@ -7616,7 +7616,7 @@ loop is transformed to a different loop before an explicitly requested other transformations impossible. Mandatory loop canonicalizations such as loop rotation are still applied. -It is recommended to use this metadata in addition to any llvm.loop.* +It is recommended to use this metadata in addition to any ``llvm.loop.*`` transformation directive. Also, any loop should have at most one directive applied to it (and a sequence of transformations built using followup-attributes). Otherwise, which transformation will be applied @@ -7962,7 +7962,7 @@ the non-distributed fallback version will have. See '``llvm.loop.distribute.followup_all``' Metadata ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The attributes in this metadata is added to all followup loops of the +The attributes in this metadata are added to all followup loops of the loop distribution pass. 
See :ref:`Transformation Metadata ` for details. @@ -8041,8 +8041,8 @@ undefined. Note that if not all memory access instructions belong to an access group referred to by ``llvm.loop.parallel_accesses``, then the loop must not be considered trivially parallel. Additional -memory dependence analysis is required to make that determination. As a fail -safe mechanism, this causes loops that were originally parallel to be considered +memory dependence analysis is required to make that determination. As a +fail-safe mechanism, this causes loops that were originally parallel to be considered sequential (if optimization passes that are unaware of the parallel semantics insert new memory instructions into the loop body). @@ -8211,7 +8211,7 @@ compatibility, globals carrying this metadata should: - Be in ``@llvm.compiler.used``. - If the referenced global variable is in a comdat, be in the same comdat. -``!associated`` can not express many-to-one relationship. A global variable with +``!associated`` can not express a many-to-one relationship. A global variable with the metadata should generally not be referenced by a function: the function may be inlined into other functions, leading to more references to the metadata. Ideally we would want to keep metadata alive as long as any inline location is @@ -8272,7 +8272,7 @@ VP VP (value profile) metadata can be attached to instructions that have value profile information. Currently this is indirect calls (where it -records the hottest callees) and calls to memory intrinsics such as memcpy, +records the hottest callees) and calls to memory intrinsics, such as memcpy, memmove, and memset (where it records the hottest byte lengths). Each VP metadata node contains "VP" string, then a ``uint32_t`` value for the value @@ -8476,8 +8476,8 @@ Example: This is intended for use on targets with a notion of generic address spaces, which at runtime resolve to different physical memory -spaces. 
The interpretation of the address space values is target -specific. The behavior is undefined if the runtime memory address does +spaces. The interpretation of the address space values is target specific. +The behavior is undefined if the runtime memory address does resolve to an object defined in one of the indicated address spaces. @@ -8488,7 +8488,7 @@ Information about the module as a whole is difficult to convey to LLVM's subsystems. The LLVM IR isn't sufficient to transmit this information. The ``llvm.module.flags`` named metadata exists in order to facilitate this. These flags are in the form of key / value pairs --- much like a -dictionary --- making it easy for any subsystem who cares about a flag to +dictionary --- making it easy for any subsystem that cares about a flag to look it up. The ``llvm.module.flags`` metadata contains a list of metadata triplets. @@ -8748,7 +8748,7 @@ Automatic Linker Flags Named Metadata Some targets support embedding of flags to the linker inside individual object files. Typically this is used in conjunction with language extensions which -allow source files to contain linker command line options, and have these +allow source files to contain linker command-line options, and have these automatically be transmitted to the linker via object files. These flags are encoded in the IR using named metadata with the name @@ -11739,7 +11739,7 @@ size of the '' type. Note that this default alignment assumption is different from the alignment used for the load/store instructions when align isn't specified. -A ``atomicrmw`` instruction can also take an optional +An ``atomicrmw`` instruction can also take an optional ":ref:`syncscope `" argument. 
Semantics: @@ -12510,7 +12510,7 @@ Semantics: """""""""" The '``ptrtoint``' instruction converts ``value`` to integer type -``ty2`` by interpreting the all pointer representation bits as an integer +``ty2`` by interpreting all the pointer representation bits as an integer (equivalent to a ``bitcast``) and either truncating or zero extending that value to the size of the integer type. If ``value`` is smaller than ``ty2`` then a zero extension is done. If @@ -13542,7 +13542,7 @@ ensures that each ``catchpad`` has exactly one predecessor block, and it always terminates in a ``catchswitch``. The ``args`` correspond to whatever information the personality routine -requires to know if this is an appropriate handler for the exception. Control +requires to determine if this is an appropriate handler for the exception. Control will transfer to the ``catchpad`` if this is the first appropriate handler for the exception. @@ -13886,7 +13886,7 @@ Semantics: The '``llvm.va_copy``' intrinsic works just like the ``va_copy`` macro available in C. In a target-dependent way, it copies the source ``va_list`` element into the destination ``va_list`` element. This -intrinsic is necessary because the `` llvm.va_start`` intrinsic may be +intrinsic is necessary because the ``llvm.va_start`` intrinsic may be arbitrarily complex and require, for example, memory allocation. Accurate Garbage Collection Intrinsics @@ -14077,7 +14077,7 @@ types of the 'call parameters' arguments. The '#call args' operand is the number of arguments to the actual call. It must exactly match the number of arguments passed in the -'call parameters' variable length section. +'call parameters' variable-length section. The 'flags' operand is used to specify extra information about the statepoint. This is currently only used to mark certain statepoints @@ -14198,7 +14198,7 @@ so constructed. The third argument is an index which specify the (potentially) derived pointer being relocated. 
It is legal for this index to be the same as the second -argument if-and-only-if a base pointer is being relocated. +argument if and only if a base pointer is being relocated. Semantics: """""""""" @@ -14894,7 +14894,7 @@ Overview: """"""""" The '``llvm.instrprof.increment``' intrinsic can be emitted by a -frontend for use with instrumentation based profiling. These will be +frontend for use with instrumentation-based profiling. These will be lowered by the ``-instrprof`` pass to generate execution counts of a program at runtime. @@ -15097,7 +15097,7 @@ Overview: """"""""" The '``llvm.instrprof.value.profile``' intrinsic can be emitted by a -frontend for use with instrumentation based profiling. This will be +frontend for use with instrumentation-based profiling. This will be lowered by the ``-instrprof`` pass to find out the target values, instrumented expressions take in a program at runtime. @@ -18514,7 +18514,7 @@ Overview: """"""""" The '``llvm.umul.with.overflow``' family of intrinsic functions perform -a unsigned multiplication of the two arguments, and indicate whether an +an unsigned multiplication of the two arguments, and indicate whether an overflow occurred during the unsigned multiplication. Arguments: @@ -20681,7 +20681,7 @@ Semantics: The '``llvm.experimental.vector.histogram.*``' intrinsics are used to perform updates on potentially overlapping values in memory. The intrinsics represent -the follow sequence of operations: +the following sequence of operations: 1. Gather load from the ``ptrs`` operand, with element type matching that of the ``inc`` operand. @@ -28541,7 +28541,7 @@ environment. The rounding mode argument is only intended as information to the compiler. If the runtime floating-point environment is using the default rounding mode -then the results will be the same as the ``llvm.llrint intrinsic``. +then the results will be the same as the ``llvm.llrint`` intrinsic. 
'``llvm.experimental.constrained.nearbyint``' Intrinsic @@ -29002,7 +29002,7 @@ was only valid within a single iteration. .. code-block:: llvm - ; This examples shows two possible positions for noalias.decl and how they impact the semantics: + ; This example shows two possible positions for noalias.decl and how they impact the semantics: ; If it is outside the loop (Version 1), then %a and %b are noalias across *all* iterations. ; If it is inside the loop (Version 2), then %a and %b are noalias only within *one* iteration. declare void @decl_in_loop(ptr %a.base, ptr %b.base) { diff --git a/llvm/docs/RISCVUsage.rst b/llvm/docs/RISCVUsage.rst index a29e06cdf838..f9f3e39727a5 100644 --- a/llvm/docs/RISCVUsage.rst +++ b/llvm/docs/RISCVUsage.rst @@ -531,6 +531,10 @@ The current vendor extensions supported are: ``XAndesVDot`` LLVM implements `version 5.0.0 of the Andes Vector Dot Product Extension specification `__ by Andes Technology. All instructions are prefixed with `nds.` as described in the specification. +``XSMTVDot`` + SpacemiT defines the `Intrinsic Matrix Extension (IME) specification `__. + LLVM implements the hardware-adapted subset for SpacemiT X60, defined in the `feature document `__ by SpacemiT. All instructions are prefixed with `smt.` as described in the implementation guide. Note that this implemented subset is `version 1.0.0 of the SpacemiT Vector Dot Product Extension specification`, which is strictly a subset of the full IME specification to reflect the capabilities of SpacemiT X60 hardware correctly. + Experimental C Intrinsics ========================= diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md index ef7586a6bab7..3b90c964ac53 100644 --- a/llvm/docs/ReleaseNotes.md +++ b/llvm/docs/ReleaseNotes.md @@ -118,6 +118,7 @@ Changes to the RISC-V Backend * `llvm-objdump` now has basic support for switching between disassembling code and data using mapping symbols such as `$x` and `$d`.
Switching architectures using `$x` with an architecture string suffix is not yet supported. +* Ssctr and Smctr extensions are no longer experimental. Changes to the WebAssembly Backend ---------------------------------- diff --git a/llvm/include/llvm/ADT/SetVector.h b/llvm/include/llvm/ADT/SetVector.h index 0692071adb39..85d4f2d5ee28 100644 --- a/llvm/include/llvm/ADT/SetVector.h +++ b/llvm/include/llvm/ADT/SetVector.h @@ -313,9 +313,8 @@ public: bool set_union(const STy &S) { bool Changed = false; - for (typename STy::const_iterator SI = S.begin(), SE = S.end(); SI != SE; - ++SI) - if (insert(*SI)) + for (const auto &Elem : S) + if (insert(Elem)) Changed = true; return Changed; @@ -326,9 +325,8 @@ public: /// SetVector interface is inconsistent with DenseSet. template void set_subtract(const STy &S) { - for (typename STy::const_iterator SI = S.begin(), SE = S.end(); SI != SE; - ++SI) - remove(*SI); + for (const auto &Elem : S) + remove(Elem); } void swap(SetVector &RHS) { diff --git a/llvm/include/llvm/ADT/SmallPtrSet.h b/llvm/include/llvm/ADT/SmallPtrSet.h index 73ec7c68f65c..f55627c6866f 100644 --- a/llvm/include/llvm/ADT/SmallPtrSet.h +++ b/llvm/include/llvm/ADT/SmallPtrSet.h @@ -62,10 +62,10 @@ protected: /// CurArraySize - The allocated size of CurArray, always a power of two. unsigned CurArraySize; - /// Number of elements in CurArray that contain a value or are a tombstone. + /// Number of elements in CurArray that contain a value. /// If small, all these elements are at the beginning of CurArray and the rest /// is uninitialized. - unsigned NumNonEmpty; + unsigned NumEntries; /// Number of tombstones in CurArray. unsigned NumTombstones; /// Whether the set is in small representation. 
@@ -79,7 +79,7 @@ protected: SmallPtrSetImplBase &&that); explicit SmallPtrSetImplBase(const void **SmallStorage, unsigned SmallSize) - : CurArray(SmallStorage), CurArraySize(SmallSize), NumNonEmpty(0), + : CurArray(SmallStorage), CurArraySize(SmallSize), NumEntries(0), NumTombstones(0), IsSmall(true) { assert(llvm::has_single_bit(SmallSize) && "Initial size must be a power of two!"); @@ -96,7 +96,7 @@ public: SmallPtrSetImplBase &operator=(const SmallPtrSetImplBase &) = delete; [[nodiscard]] bool empty() const { return size() == 0; } - size_type size() const { return NumNonEmpty - NumTombstones; } + size_type size() const { return NumEntries; } size_type capacity() const { return CurArraySize; } void clear() { @@ -110,25 +110,25 @@ public: memset(CurArray, -1, CurArraySize * sizeof(void *)); } - NumNonEmpty = 0; + NumEntries = 0; NumTombstones = 0; } - void reserve(size_type NumEntries) { + void reserve(size_type NewNumEntries) { incrementEpoch(); // Do nothing if we're given zero as a reservation size. - if (NumEntries == 0) + if (NewNumEntries == 0) return; - // No need to expand if we're small and NumEntries will fit in the space. - if (isSmall() && NumEntries <= CurArraySize) + // No need to expand if we're small and NewNumEntries will fit in the space. + if (isSmall() && NewNumEntries <= CurArraySize) return; // insert_imp_big will reallocate if stores is more than 75% full, on the // /final/ insertion. - if (!isSmall() && ((NumEntries - 1) * 4) < (CurArraySize * 3)) + if (!isSmall() && ((NewNumEntries - 1) * 4) < (CurArraySize * 3)) return; // We must Grow -- find the size where we'd be 75% full, then round up to // the next power of two. - size_type NewSize = NumEntries + (NumEntries / 3); + size_type NewSize = NewNumEntries + (NewNumEntries / 3); NewSize = llvm::bit_ceil(NewSize); // Like insert_imp_big, always allocate at least 128 elements. 
NewSize = std::max(128u, NewSize); @@ -145,21 +145,25 @@ protected: } const void **EndPointer() const { - return isSmall() ? CurArray + NumNonEmpty : CurArray + CurArraySize; + return isSmall() ? CurArray + NumEntries : CurArray + CurArraySize; } iterator_range small_buckets() { - return make_range(CurArray, CurArray + NumNonEmpty); + return make_range(CurArray, CurArray + NumEntries); } iterator_range small_buckets() const { - return {CurArray, CurArray + NumNonEmpty}; + return {CurArray, CurArray + NumEntries}; } iterator_range buckets() { return make_range(CurArray, EndPointer()); } + iterator_range buckets() const { + return make_range(CurArray, EndPointer()); + } + /// insert_imp - This returns true if the pointer was new to the set, false if /// it was already in the set. This is hidden from the client so that the /// derived class can check that the right type of pointer is passed in. @@ -172,10 +176,10 @@ protected: } // Nope, there isn't. If we stay small, just 'pushback' now. - if (NumNonEmpty < CurArraySize) { - CurArray[NumNonEmpty++] = Ptr; + if (NumEntries < CurArraySize) { + CurArray[NumEntries++] = Ptr; incrementEpoch(); - return std::make_pair(CurArray + (NumNonEmpty - 1), true); + return std::make_pair(CurArray + (NumEntries - 1), true); } // Otherwise, hit the big set case, which will call grow. } @@ -190,7 +194,7 @@ protected: if (isSmall()) { for (const void *&Bucket : small_buckets()) { if (Bucket == Ptr) { - Bucket = CurArray[--NumNonEmpty]; + Bucket = CurArray[--NumEntries]; incrementEpoch(); return true; } @@ -204,6 +208,7 @@ protected: *const_cast(Bucket) = getTombstoneMarker(); NumTombstones++; + --NumEntries; // Treat this consistently from an API perspective, even if we don't // actually invalidate iterators here. 
incrementEpoch(); @@ -430,12 +435,13 @@ public: bool remove_if(UnaryPredicate P) { bool Removed = false; if (isSmall()) { - const void **APtr = CurArray, **E = CurArray + NumNonEmpty; + auto Buckets = small_buckets(); + const void **APtr = Buckets.begin(), **E = Buckets.end(); while (APtr != E) { PtrType Ptr = PtrTraits::getFromVoidPointer(const_cast(*APtr)); if (P(Ptr)) { *APtr = *--E; - --NumNonEmpty; + --NumEntries; incrementEpoch(); Removed = true; } else { @@ -452,6 +458,7 @@ public: if (P(Ptr)) { Bucket = getTombstoneMarker(); ++NumTombstones; + --NumEntries; incrementEpoch(); Removed = true; } diff --git a/llvm/include/llvm/Analysis/GenericDomTreeUpdaterImpl.h b/llvm/include/llvm/Analysis/GenericDomTreeUpdaterImpl.h index 896b68c5021b..6bfad783b529 100644 --- a/llvm/include/llvm/Analysis/GenericDomTreeUpdaterImpl.h +++ b/llvm/include/llvm/Analysis/GenericDomTreeUpdaterImpl.h @@ -383,7 +383,7 @@ void GenericDomTreeUpdater:: // field of all the elements of Edges. // I.e., forall elt in Edges, it exists BB in NewBBs // such as BB == elt.NewBB. - SmallSet NewBBs; + SmallPtrSet NewBBs; for (auto &Edge : Edges) NewBBs.insert(Edge.NewBB); // For each element in Edges, remember whether or not element diff --git a/llvm/include/llvm/BinaryFormat/MsgPackDocument.h b/llvm/include/llvm/BinaryFormat/MsgPackDocument.h index 26fff8d5f8d3..f09feabb1028 100644 --- a/llvm/include/llvm/BinaryFormat/MsgPackDocument.h +++ b/llvm/include/llvm/BinaryFormat/MsgPackDocument.h @@ -213,6 +213,7 @@ public: LLVM_ABI DocNode &operator=(unsigned Val); LLVM_ABI DocNode &operator=(int64_t Val); LLVM_ABI DocNode &operator=(uint64_t Val); + LLVM_ABI DocNode &operator=(double Val); private: // Private constructor setting KindAndDoc, used by methods in Document. 
diff --git a/llvm/include/llvm/BinaryFormat/SFrame.h b/llvm/include/llvm/BinaryFormat/SFrame.h index 74e47ea8acca..095db18b9c25 100644 --- a/llvm/include/llvm/BinaryFormat/SFrame.h +++ b/llvm/include/llvm/BinaryFormat/SFrame.h @@ -104,14 +104,8 @@ template struct Header { detail::packed FREOff; }; -template struct FuncDescEntry { - detail::packed StartAddress; - detail::packed Size; - detail::packed StartFREOff; - detail::packed NumFREs; +template struct FDEInfo { detail::packed Info; - detail::packed RepSize; - detail::packed Padding2; uint8_t getPAuthKey() const { return (Info >> 5) & 1; } FDEType getFDEType() const { return static_cast((Info >> 4) & 1); } @@ -125,6 +119,16 @@ template struct FuncDescEntry { } }; +template struct FuncDescEntry { + detail::packed StartAddress; + detail::packed Size; + detail::packed StartFREOff; + detail::packed NumFREs; + FDEInfo Info; + detail::packed RepSize; + detail::packed Padding2; +}; + template struct FREInfo { detail::packed Info; diff --git a/llvm/include/llvm/CAS/MappedFileRegionBumpPtr.h b/llvm/include/llvm/CAS/MappedFileRegionBumpPtr.h index a085479b8523..0083d3dd1ffa 100644 --- a/llvm/include/llvm/CAS/MappedFileRegionBumpPtr.h +++ b/llvm/include/llvm/CAS/MappedFileRegionBumpPtr.h @@ -5,6 +5,12 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// +// +/// \file +/// This file declares the interface for MappedFileRegionBumpPtr, a bump pointer +/// allocator, backed by a memory-mapped file.
+/// +//===----------------------------------------------------------------------===// #ifndef LLVM_CAS_MAPPEDFILEREGIONBUMPPTR_H #define LLVM_CAS_MAPPEDFILEREGIONBUMPPTR_H diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LoadStoreOpt.h b/llvm/include/llvm/CodeGen/GlobalISel/LoadStoreOpt.h index cee779a5fd5d..4b7506e01376 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/LoadStoreOpt.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/LoadStoreOpt.h @@ -162,7 +162,7 @@ private: DenseMap LegalStoreSizes; bool IsPreLegalizer = false; /// Contains instructions to be erased at the end of a block scan. - SmallSet InstsToErase; + SmallPtrSet InstsToErase; public: LoadStoreOpt(); diff --git a/llvm/include/llvm/CodeGen/LiveVariables.h b/llvm/include/llvm/CodeGen/LiveVariables.h index 974bf9eaa037..dbf736ad65a9 100644 --- a/llvm/include/llvm/CodeGen/LiveVariables.h +++ b/llvm/include/llvm/CodeGen/LiveVariables.h @@ -165,10 +165,8 @@ private: // Intermediate data structures MachineInstr *FindLastRefOrPartRef(Register Reg); /// FindLastPartialDef - Return the last partial def of the specified - /// register. Also returns the sub-registers that're defined by the - /// instruction. - MachineInstr *FindLastPartialDef(Register Reg, - SmallSet &PartDefRegs); + /// register. + MachineInstr *FindLastPartialDef(Register Reg); /// analyzePHINodes - Gather information about the PHI nodes in here. 
In /// particular, we want to map the variable information of a virtual diff --git a/llvm/include/llvm/CodeGen/MachinePipeliner.h b/llvm/include/llvm/CodeGen/MachinePipeliner.h index e50443d25cc6..c90ff4f3daa4 100644 --- a/llvm/include/llvm/CodeGen/MachinePipeliner.h +++ b/llvm/include/llvm/CodeGen/MachinePipeliner.h @@ -830,7 +830,7 @@ public: return ScheduledInstrs[cycle]; } - SmallSet + SmallPtrSet computeUnpipelineableNodes(SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI); diff --git a/llvm/include/llvm/CodeGen/ScheduleDAG.h b/llvm/include/llvm/CodeGen/ScheduleDAG.h index 122b7be96b46..aee151458148 100644 --- a/llvm/include/llvm/CodeGen/ScheduleDAG.h +++ b/llvm/include/llvm/CodeGen/ScheduleDAG.h @@ -237,7 +237,7 @@ class TargetRegisterInfo; }; /// Keep record of which SUnit are in the same cluster group. - typedef SmallSet ClusterInfo; + typedef SmallPtrSet ClusterInfo; constexpr unsigned InvalidClusterId = ~0u; /// Return whether the input cluster ID's are the same and valid. diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 272d7dd5f45e..4480ced63745 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -301,6 +301,9 @@ public: public: Value *Val; SDValue Node; + /// Original unlegalized argument type. + Type *OrigTy; + /// Same as OrigTy, or partially legalized for soft float libcalls. 
Type *Ty; bool IsSExt : 1; bool IsZExt : 1; @@ -321,9 +324,9 @@ public: Type *IndirectType = nullptr; ArgListEntry(Value *Val, SDValue Node, Type *Ty) - : Val(Val), Node(Node), Ty(Ty), IsSExt(false), IsZExt(false), - IsNoExt(false), IsInReg(false), IsSRet(false), IsNest(false), - IsByVal(false), IsByRef(false), IsInAlloca(false), + : Val(Val), Node(Node), OrigTy(Ty), Ty(Ty), IsSExt(false), + IsZExt(false), IsNoExt(false), IsInReg(false), IsSRet(false), + IsNest(false), IsByVal(false), IsByRef(false), IsInAlloca(false), IsPreallocated(false), IsReturned(false), IsSwiftSelf(false), IsSwiftAsync(false), IsSwiftError(false), IsCFGuardTarget(false) {} @@ -3571,6 +3574,12 @@ public: return Libcalls.getMemcpyName().data(); } + /// Check if this is a valid libcall for the current module, otherwise + /// RTLIB::Unsupported. + RTLIB::LibcallImpl getSupportedLibcallImpl(StringRef FuncName) const { + return Libcalls.getSupportedLibcallImpl(FuncName); + } + /// Get the comparison predicate that's to be used to test the result of the /// comparison libcall against zero. This should only be used with /// floating-point compare libcalls. @@ -4677,6 +4686,9 @@ public: /// implementation. struct CallLoweringInfo { SDValue Chain; + /// Original unlegalized return type. + Type *OrigRetTy = nullptr; + /// Same as OrigRetTy, or partially legalized for soft float libcalls.
Type *RetTy = nullptr; bool RetSExt : 1; bool RetZExt : 1; @@ -4731,6 +4743,14 @@ public: // setCallee with target/module-specific attributes CallLoweringInfo &setLibCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList) { + return setLibCallee(CC, ResultType, ResultType, Target, + std::move(ArgsList)); + } + + CallLoweringInfo &setLibCallee(CallingConv::ID CC, Type *ResultType, + Type *OrigResultType, SDValue Target, + ArgListTy &&ArgsList) { + OrigRetTy = OrigResultType; RetTy = ResultType; Callee = Target; CallConv = CC; @@ -4745,7 +4765,7 @@ public: CallLoweringInfo &setCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList, AttributeSet ResultAttrs = {}) { - RetTy = ResultType; + RetTy = OrigRetTy = ResultType; IsInReg = ResultAttrs.hasAttribute(Attribute::InReg); RetSExt = ResultAttrs.hasAttribute(Attribute::SExt); RetZExt = ResultAttrs.hasAttribute(Attribute::ZExt); @@ -4761,7 +4781,7 @@ public: CallLoweringInfo &setCallee(Type *ResultType, FunctionType *FTy, SDValue Target, ArgListTy &&ArgsList, const CallBase &Call) { - RetTy = ResultType; + RetTy = OrigRetTy = ResultType; IsInReg = Call.hasRetAttr(Attribute::InReg); DoesNotReturn = diff --git a/llvm/include/llvm/Frontend/HLSL/HLSLRootSignature.h b/llvm/include/llvm/Frontend/HLSL/HLSLRootSignature.h index e44612af071b..87777fddc915 100644 --- a/llvm/include/llvm/Frontend/HLSL/HLSLRootSignature.h +++ b/llvm/include/llvm/Frontend/HLSL/HLSLRootSignature.h @@ -42,10 +42,9 @@ struct RootConstants { dxbc::ShaderVisibility Visibility = dxbc::ShaderVisibility::All; }; -enum class DescriptorType : uint8_t { SRV = 0, UAV, CBuffer }; // Models RootDescriptor : CBV | SRV | UAV, by collecting like parameters struct RootDescriptor { - DescriptorType Type; + dxil::ResourceClass Type; Register Reg; uint32_t Space = 0; dxbc::ShaderVisibility Visibility = dxbc::ShaderVisibility::All; @@ -60,13 +59,16 @@ struct RootDescriptor { assert(Version == 
llvm::dxbc::RootSignatureVersion::V1_1 && "Specified an invalid root signature version"); switch (Type) { - case DescriptorType::CBuffer: - case DescriptorType::SRV: + case dxil::ResourceClass::CBuffer: + case dxil::ResourceClass::SRV: Flags = dxbc::RootDescriptorFlags::DataStaticWhileSetAtExecute; break; - case DescriptorType::UAV: + case dxil::ResourceClass::UAV: Flags = dxbc::RootDescriptorFlags::DataVolatile; break; + case dxil::ResourceClass::Sampler: + llvm_unreachable( + "ResourceClass::Sampler is not valid for RootDescriptors"); } } }; @@ -82,9 +84,8 @@ struct DescriptorTable { static const uint32_t NumDescriptorsUnbounded = 0xffffffff; static const uint32_t DescriptorTableOffsetAppend = 0xffffffff; // Models DTClause : CBV | SRV | UAV | Sampler, by collecting like parameters -using ClauseType = llvm::dxil::ResourceClass; struct DescriptorTableClause { - ClauseType Type; + dxil::ResourceClass Type; Register Reg; uint32_t NumDescriptors = 1; uint32_t Space = 0; @@ -94,7 +95,7 @@ struct DescriptorTableClause { void setDefaultFlags(dxbc::RootSignatureVersion Version) { if (Version == dxbc::RootSignatureVersion::V1_0) { Flags = dxbc::DescriptorRangeFlags::DescriptorsVolatile; - if (Type != ClauseType::Sampler) + if (Type != dxil::ResourceClass::Sampler) Flags |= dxbc::DescriptorRangeFlags::DataVolatile; return; } @@ -102,14 +103,14 @@ struct DescriptorTableClause { assert(Version == dxbc::RootSignatureVersion::V1_1 && "Specified an invalid root signature version"); switch (Type) { - case ClauseType::CBuffer: - case ClauseType::SRV: + case dxil::ResourceClass::CBuffer: + case dxil::ResourceClass::SRV: Flags = dxbc::DescriptorRangeFlags::DataStaticWhileSetAtExecute; break; - case ClauseType::UAV: + case dxil::ResourceClass::UAV: Flags = dxbc::DescriptorRangeFlags::DataVolatile; break; - case ClauseType::Sampler: + case dxil::ResourceClass::Sampler: Flags = dxbc::DescriptorRangeFlags::None; break; } diff --git a/llvm/include/llvm/IR/BasicBlock.h 
b/llvm/include/llvm/IR/BasicBlock.h index c24f01fe26cc..533808e0666d 100644 --- a/llvm/include/llvm/IR/BasicBlock.h +++ b/llvm/include/llvm/IR/BasicBlock.h @@ -68,7 +68,7 @@ private: // Allow Function to renumber blocks. friend class Function; /// Per-function unique number. - unsigned Number = -1u; + unsigned Number = ~0u; friend class BlockAddress; friend class SymbolTableListTraits; diff --git a/llvm/include/llvm/IR/Instruction.h b/llvm/include/llvm/IR/Instruction.h index 5d25804a684a..2eb4fd36c5b7 100644 --- a/llvm/include/llvm/IR/Instruction.h +++ b/llvm/include/llvm/IR/Instruction.h @@ -584,9 +584,10 @@ public: dropUBImplyingAttrsAndUnknownMetadata(ArrayRef KnownIDs = {}); /// Drop any attributes or metadata that can cause immediate undefined - /// behavior. Retain other attributes/metadata on a best-effort basis. - /// This should be used when speculating instructions. - LLVM_ABI void dropUBImplyingAttrsAndMetadata(); + /// behavior. Retain other attributes/metadata on a best-effort basis, as well + /// as those passed in `Keep`. This should be used when speculating + /// instructions. + LLVM_ABI void dropUBImplyingAttrsAndMetadata(ArrayRef Keep = {}); /// Return true if this instruction has UB-implying attributes /// that can cause immediate undefined behavior. 
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 1bcc442a3f77..77ef79debac1 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -128,12 +128,12 @@ // * llvm.nvvm.swap.lo.hi.b64 --> llvm.fshl(x, x, 32) // * llvm.nvvm.atomic.load.inc.32 --> atomicrmw uinc_wrap // * llvm.nvvm.atomic.load.dec.32 --> atomicrmw udec_wrap -// * llvm.nvvm.barrier0 --> llvm.nvvm.barrier.cta.sync.aligned.all(0) -// * llvm.nvvm.barrier.n --> llvm.nvvm.barrier.cta.sync.aligned.all(x) -// * llvm.nvvm.bar.sync --> llvm.nvvm.barrier.cta.sync.aligned.all(x) -// * llvm.nvvm.barrier --> llvm.nvvm.barrier.cta.sync.aligned(x, y) -// * llvm.nvvm.barrier.sync --> llvm.nvvm.barrier.cta.sync.all(x) -// * llvm.nvvm.barrier.sync.cnt --> llvm.nvvm.barrier.cta.sync(x, y) +// * llvm.nvvm.barrier0 --> llvm.nvvm.barrier.cta.sync.aligned.all(0) +// * llvm.nvvm.barrier.n --> llvm.nvvm.barrier.cta.sync.aligned.all(x) +// * llvm.nvvm.bar.sync --> llvm.nvvm.barrier.cta.sync.aligned.all(x) +// * llvm.nvvm.barrier --> llvm.nvvm.barrier.cta.sync.aligned(x, y) +// * llvm.nvvm.barrier.sync --> llvm.nvvm.barrier.cta.sync.all(x) +// * llvm.nvvm.barrier.sync.cnt --> llvm.nvvm.barrier.cta.sync(x, y) def llvm_global_ptr_ty : LLVMQualPointerType<1>; // (global)ptr def llvm_shared_ptr_ty : LLVMQualPointerType<3>; // (shared)ptr @@ -793,38 +793,49 @@ class NVVMBuiltin : "NVVMBuiltin must be a NVVM intrinsic starting with 'int_nvvm_'"; } +class PureIntrinsic ret_types, + list param_types = [], + list intr_properties = [], + string name = ""> : + DefaultAttrsIntrinsic {} + let TargetPrefix = "nvvm" in { + // // PRMT - permute + // + def int_nvvm_prmt : NVVMBuiltin, + PureIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty]>; - let IntrProperties = [IntrNoMem, IntrSpeculatable] in { - def int_nvvm_prmt : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty]>; + foreach mode = ["f4e", 
"b4e"] in + def int_nvvm_prmt_ # mode : + PureIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty]>; - foreach mode = ["f4e", "b4e"] in - def int_nvvm_prmt_ # mode : - DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty]>; - - // Note: these variants also have 2 source operands but only one will ever - // be used so we eliminate the other operand in the IR (0 is used as the - // placeholder in the backend). - foreach mode = ["rc8", "ecl", "ecr", "rc16"] in - def int_nvvm_prmt_ # mode : - DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty]>; - } + // Note: these variants also have 2 source operands but only one will ever + // be used so we eliminate the other operand in the IR (0 is used as the + // placeholder in the backend). + foreach mode = ["rc8", "ecl", "ecr", "rc16"] in + def int_nvvm_prmt_ # mode : + PureIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty]>; + // + // Nanosleep + // def int_nvvm_nanosleep : NVVMBuiltin, DefaultAttrsIntrinsic<[], [llvm_i32_ty], [IntrConvergent, IntrNoMem, IntrHasSideEffects]>; + // // Performance Monitor Events (pm events) intrinsics + // def int_nvvm_pm_event_mask : NVVMBuiltin, DefaultAttrsIntrinsic<[], [llvm_i16_ty], [IntrConvergent, IntrNoMem, IntrHasSideEffects, ImmArg>]>; -// -// Min Max -// + // + // Min Max + // let IntrProperties = [IntrNoMem, IntrSpeculatable, Commutative] in { foreach operation = ["min", "max"] in { def int_nvvm_f # operation # _d : NVVMBuiltin, @@ -853,9 +864,9 @@ let TargetPrefix = "nvvm" in { } // operation } -// -// Multiplication -// + // + // Multiplication + // let IntrProperties = [IntrNoMem, IntrSpeculatable, Commutative] in { foreach sign = ["", "u"] in { def int_nvvm_mulhi_ # sign # s : NVVMBuiltin, @@ -881,9 +892,9 @@ let TargetPrefix = "nvvm" in { } } -// -// Div -// + // + // Div + // let IntrProperties = [IntrNoMem] in { foreach ftz = ["", "_ftz"] in { def int_nvvm_div_approx # ftz # _f : NVVMBuiltin, @@ -903,90 +914,79 @@ let 
TargetPrefix = "nvvm" in { } } -// -// Sad -// - let IntrProperties = [IntrNoMem, IntrSpeculatable] in { - foreach sign = ["", "u"] in { - def int_nvvm_sad_ # sign # s : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty]>; + // + // Sad - Sum of Absolute Differences + // + foreach sign = ["", "u"] in { + def int_nvvm_sad_ # sign # s : NVVMBuiltin, + PureIntrinsic<[llvm_i16_ty], [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty]>; - def int_nvvm_sad_ # sign # i : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty]>; + def int_nvvm_sad_ # sign # i : NVVMBuiltin, + PureIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty]>; - def int_nvvm_sad_ # sign # ll : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty, llvm_i64_ty]>; - } + def int_nvvm_sad_ # sign # ll : NVVMBuiltin, + PureIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty, llvm_i64_ty]>; } -// -// Floor Ceil -// - let IntrProperties = [IntrNoMem, IntrSpeculatable] in { - foreach op = ["floor", "ceil"] in { - foreach ftz = ["", "_ftz"] in - def int_nvvm_ # op # ftz # _f : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty]>; - def int_nvvm_ # op # _d : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty]>; - } + // + // Floor Ceil + // + foreach op = ["floor", "ceil"] in { + foreach ftz = ["", "_ftz"] in + def int_nvvm_ # op # ftz # _f : NVVMBuiltin, + PureIntrinsic<[llvm_float_ty], [llvm_float_ty]>; + def int_nvvm_ # op # _d : NVVMBuiltin, + PureIntrinsic<[llvm_double_ty], [llvm_double_ty]>; } -// -// Abs -// + // + // Abs + // foreach ftz = ["", "_ftz"] in def int_nvvm_fabs # ftz : - DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable]>; + PureIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; -// -// Abs, Neg bf16, bf16x2 -// + // + // Neg bf16, bf16x2 + // def int_nvvm_neg_bf16 : NVVMBuiltin, - 
DefaultAttrsIntrinsic<[llvm_bfloat_ty], [llvm_bfloat_ty], [IntrNoMem]>; + PureIntrinsic<[llvm_bfloat_ty], [llvm_bfloat_ty]>; def int_nvvm_neg_bf16x2 : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_v2bf16_ty], [llvm_v2bf16_ty], [IntrNoMem]>; + PureIntrinsic<[llvm_v2bf16_ty], [llvm_v2bf16_ty]>; -// -// Round -// - let IntrProperties = [IntrNoMem, IntrSpeculatable] in { - foreach ftz = ["", "_ftz"] in - def int_nvvm_round # ftz # _f : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty]>; + // + // Round + // + foreach ftz = ["", "_ftz"] in + def int_nvvm_round # ftz # _f : NVVMBuiltin, + PureIntrinsic<[llvm_float_ty], [llvm_float_ty]>; - def int_nvvm_round_d : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty]>; - } + def int_nvvm_round_d : NVVMBuiltin, + PureIntrinsic<[llvm_double_ty], [llvm_double_ty]>; -// -// Trunc -// - let IntrProperties = [IntrNoMem, IntrSpeculatable] in { - foreach ftz = ["", "_ftz"] in - def int_nvvm_trunc # ftz # _f : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty]>; + // + // Trunc + // + foreach ftz = ["", "_ftz"] in + def int_nvvm_trunc # ftz # _f : NVVMBuiltin, + PureIntrinsic<[llvm_float_ty], [llvm_float_ty]>; - def int_nvvm_trunc_d : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty]>; - } + def int_nvvm_trunc_d : NVVMBuiltin, + PureIntrinsic<[llvm_double_ty], [llvm_double_ty]>; -// -// Saturate -// - let IntrProperties = [IntrNoMem, IntrSpeculatable] in { - foreach ftz = ["", "_ftz"] in - def int_nvvm_saturate # ftz # _f : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty]>; + // + // Saturate + // + foreach ftz = ["", "_ftz"] in + def int_nvvm_saturate # ftz # _f : NVVMBuiltin, + PureIntrinsic<[llvm_float_ty], [llvm_float_ty]>; - def int_nvvm_saturate_d : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty]>; - } + def int_nvvm_saturate_d : NVVMBuiltin, + PureIntrinsic<[llvm_double_ty], [llvm_double_ty]>; -// -// Exp2 
Log2 -// + // + // Exp2 Log2 + // let IntrProperties = [IntrNoMem] in { foreach ftz = ["", "_ftz"] in def int_nvvm_ex2_approx # ftz # _f : NVVMBuiltin, @@ -1007,53 +1007,51 @@ let TargetPrefix = "nvvm" in { DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty]>; } -// -// Sin Cos -// + // + // Sin Cos + // foreach op = ["sin", "cos"] in foreach ftz = ["", "_ftz"] in def int_nvvm_ # op # _approx # ftz # _f : NVVMBuiltin, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; -// -// Fma -// - let IntrProperties = [IntrNoMem, IntrSpeculatable] in { - foreach variant = ["", "_sat", "_relu"] in { - foreach ftz = ["", "_ftz"] in { - def int_nvvm_fma_rn # ftz # variant # _f16 : - DefaultAttrsIntrinsic<[llvm_half_ty], - [llvm_half_ty, llvm_half_ty, llvm_half_ty]>; + // + // Fma + // + foreach variant = ["", "_sat", "_relu"] in { + foreach ftz = ["", "_ftz"] in { + def int_nvvm_fma_rn # ftz # variant # _f16 : + PureIntrinsic<[llvm_half_ty], + [llvm_half_ty, llvm_half_ty, llvm_half_ty]>; - def int_nvvm_fma_rn # ftz # variant # _f16x2 : - DefaultAttrsIntrinsic<[llvm_v2f16_ty], - [llvm_v2f16_ty, llvm_v2f16_ty, llvm_v2f16_ty]>; + def int_nvvm_fma_rn # ftz # variant # _f16x2 : + PureIntrinsic<[llvm_v2f16_ty], + [llvm_v2f16_ty, llvm_v2f16_ty, llvm_v2f16_ty]>; - def int_nvvm_fma_rn # ftz # variant # _bf16 : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_bfloat_ty], - [llvm_bfloat_ty, llvm_bfloat_ty, llvm_bfloat_ty]>; + def int_nvvm_fma_rn # ftz # variant # _bf16 : NVVMBuiltin, + PureIntrinsic<[llvm_bfloat_ty], + [llvm_bfloat_ty, llvm_bfloat_ty, llvm_bfloat_ty]>; - def int_nvvm_fma_rn # ftz # variant # _bf16x2 : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_v2bf16_ty], - [llvm_v2bf16_ty, llvm_v2bf16_ty, llvm_v2bf16_ty]>; - } // ftz - } // variant + def int_nvvm_fma_rn # ftz # variant # _bf16x2 : NVVMBuiltin, + PureIntrinsic<[llvm_v2bf16_ty], + [llvm_v2bf16_ty, llvm_v2bf16_ty, llvm_v2bf16_ty]>; + } // ftz + } // variant - foreach rnd = ["rn", "rz", "rm", "rp"] in { - 
foreach ftz = ["", "_ftz"] in - def int_nvvm_fma_ # rnd # ftz # _f : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_float_ty], - [llvm_float_ty, llvm_float_ty, llvm_float_ty]>; + foreach rnd = ["rn", "rz", "rm", "rp"] in { + foreach ftz = ["", "_ftz"] in + def int_nvvm_fma_ # rnd # ftz # _f : NVVMBuiltin, + PureIntrinsic<[llvm_float_ty], + [llvm_float_ty, llvm_float_ty, llvm_float_ty]>; - def int_nvvm_fma_ # rnd # _d : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_double_ty], - [llvm_double_ty, llvm_double_ty, llvm_double_ty]>; - } + def int_nvvm_fma_ # rnd # _d : NVVMBuiltin, + PureIntrinsic<[llvm_double_ty], + [llvm_double_ty, llvm_double_ty, llvm_double_ty]>; } -// -// Rcp -// + // + // Rcp + // let IntrProperties = [IntrNoMem] in { foreach rnd = ["rn", "rz", "rm", "rp"] in { foreach ftz = ["", "_ftz"] in @@ -1070,9 +1068,9 @@ let TargetPrefix = "nvvm" in { DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty]>; } -// -// Sqrt -// + // + // Sqrt + // let IntrProperties = [IntrNoMem] in { foreach rnd = ["rn", "rz", "rm", "rp"] in { foreach ftz = ["", "_ftz"] in @@ -1091,9 +1089,9 @@ let TargetPrefix = "nvvm" in { DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty]>; } -// -// Rsqrt -// + // + // Rsqrt + // let IntrProperties = [IntrNoMem] in { foreach ftz = ["", "_ftz"] in { def int_nvvm_rsqrt_approx # ftz # _f : NVVMBuiltin, @@ -1103,208 +1101,206 @@ let TargetPrefix = "nvvm" in { } } -// -// Add -// + // + // Add + // let IntrProperties = [IntrNoMem, IntrSpeculatable, Commutative] in { foreach rnd = ["rn", "rz", "rm", "rp"] in { foreach ftz = ["", "_ftz"] in def int_nvvm_add_ # rnd # ftz # _f : NVVMBuiltin, DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty]>; - def int_nvvm_add_ # rnd # _d : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty]>; + def int_nvvm_add_ # rnd # _d : NVVMBuiltin, + DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty, llvm_double_ty]>; } } -// -// Dot Product -// + // + // Dot 
Product + // foreach a_type = ["s", "u"] in { foreach b_type = ["s", "u"] in { def int_nvvm_idp4a_ # a_type # _ # b_type : - DefaultAttrsIntrinsic<[llvm_i32_ty], - [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [IntrNoMem, IntrSpeculatable]>; + PureIntrinsic<[llvm_i32_ty], + [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty]>; def int_nvvm_idp2a_ # a_type # _ # b_type : - DefaultAttrsIntrinsic<[llvm_i32_ty], + PureIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i32_ty], - [IntrNoMem, IntrSpeculatable, ImmArg>]>; + [ImmArg>]>; } } -// -// Funnel-shift -// + // + // Funnel-shift + // foreach direction = ["l", "r"] in def int_nvvm_fsh # direction # _clamp : - DefaultAttrsIntrinsic<[llvm_anyint_ty], - [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable]>; + PureIntrinsic<[llvm_anyint_ty], + [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>]>; -// -// FLO - Find Leading One -// + // + // FLO - Find Leading One + // foreach sign = ["s", "u"] in def int_nvvm_flo_ # sign : - DefaultAttrsIntrinsic<[llvm_i32_ty], - [llvm_anyint_ty, llvm_i1_ty], - [IntrNoMem, IntrSpeculatable, ImmArg>]>; + PureIntrinsic<[llvm_i32_ty], [llvm_anyint_ty, llvm_i1_ty], + [ImmArg>]>; -// -// szext -// + // + // szext + // foreach ext = ["sext", "zext"] in foreach mode = ["wrap", "clamp"] in def int_nvvm_ # ext # _ # mode : - DefaultAttrsIntrinsic<[llvm_i32_ty], - [llvm_i32_ty, llvm_i32_ty], - [IntrNoMem, IntrSpeculatable]>; + PureIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty]>; -// -// BMSK - bit mask -// + // + // BMSK - bit mask + // foreach mode = ["wrap", "clamp"] in def int_nvvm_bmsk_ # mode : - DefaultAttrsIntrinsic<[llvm_i32_ty], - [llvm_i32_ty, llvm_i32_ty], - [IntrNoMem, IntrSpeculatable]>; + PureIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty]>; -// -// Convert -// - let IntrProperties = [IntrNoMem, IntrSpeculatable] in { - def int_nvvm_lohi_i2d : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_i32_ty, llvm_i32_ty]>; - - 
def int_nvvm_d2i_lo : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_double_ty]>; - def int_nvvm_d2i_hi : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_double_ty]>; - - foreach rnd = ["rn", "rz", "rm", "rp"] in { - foreach ftz = ["", "_ftz"] in - def int_nvvm_d2f_ # rnd # ftz : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_double_ty]>; - - foreach sign = ["", "u"] in { - - def int_nvvm_d2 # sign # i_ # rnd : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_double_ty]>; - - def int_nvvm_ # sign # i2d_ # rnd : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_i32_ty]>; - - foreach ftz = ["", "_ftz"] in - def int_nvvm_f2 # sign # i_ # rnd # ftz : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_float_ty]>; - - def int_nvvm_ # sign # i2f_ # rnd : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_i32_ty]>; - - foreach ftz = ["", "_ftz"] in - def int_nvvm_f2 # sign # ll_ # rnd # ftz : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_float_ty]>; - - def int_nvvm_d2 # sign # ll_ # rnd : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_double_ty]>; - - def int_nvvm_ # sign # ll2f_ # rnd : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_i64_ty]>; - - def int_nvvm_ # sign # ll2d_ # rnd : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_i64_ty]>; - - } // sign - } // rnd - - foreach ftz = ["", "_ftz"] in { - def int_nvvm_f2h_rn # ftz : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_float_ty]>; - - def int_nvvm_bf2h_rn # ftz : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_bfloat_ty]>; - } - - foreach rnd = ["rn", "rz"] in { - foreach relu = ["", "_relu"] in { - def int_nvvm_ff2bf16x2_ # rnd # relu : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_v2bf16_ty], [llvm_float_ty, llvm_float_ty]>; - - def int_nvvm_ff2f16x2_ # rnd # relu : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_v2f16_ty], [llvm_float_ty, llvm_float_ty]>; - - def int_nvvm_f2bf16_ # 
rnd # relu : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_bfloat_ty], [llvm_float_ty]>; - } - } - - foreach satfinite = ["", "_satfinite"] in { - def int_nvvm_f2tf32_rna # satfinite : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_float_ty]>; - - foreach rnd = ["rn", "rz"] in - foreach relu = ["", "_relu"] in - def int_nvvm_f2tf32_ # rnd # relu # satfinite : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_float_ty]>; - } - - foreach type = ["e4m3x2", "e5m2x2"] in { - foreach relu = ["", "_relu"] in { - def int_nvvm_ff_to_ # type # _rn # relu : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty]>; - - def int_nvvm_f16x2_to_ # type # _rn # relu : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_v2f16_ty]>; - - def int_nvvm_ # type # _to_f16x2_rn # relu : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_v2f16_ty], [llvm_i16_ty]>; - } - } - - // FP4 conversions. - foreach relu = ["", "_relu"] in { - def int_nvvm_ff_to_e2m1x2_rn # relu # _satfinite : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty]>; - - def int_nvvm_e2m1x2_to_f16x2_rn # relu : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_v2f16_ty], [llvm_i16_ty]>; - } - - // FP6 conversions. - foreach type = ["e2m3x2", "e3m2x2"] in { - foreach relu = ["", "_relu"] in { - def int_nvvm_ff_to_ # type # _rn # relu # _satfinite : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty]>; - - def int_nvvm_ # type # _to_f16x2_rn # relu : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_v2f16_ty], [llvm_i16_ty]>; - } - } - - // UE8M0x2 conversions. 
- foreach rmode = ["_rz", "_rp"] in { - foreach satmode = ["", "_satfinite"] in { - defvar suffix = rmode # satmode; - def int_nvvm_ff_to_ue8m0x2 # suffix : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty]>; - - def int_nvvm_bf16x2_to_ue8m0x2 # suffix : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i16_ty], [llvm_v2bf16_ty]>; - - } - } - - def int_nvvm_ue8m0x2_to_bf16x2 : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_v2bf16_ty], [llvm_i16_ty]>; - - } // IntrProperties = [IntrNoMem, IntrSpeculatable] - -// FNS + // + // FNS - Find the n-th set bit + // def int_nvvm_fns : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [IntrNoMem]>; + PureIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty]>; + // + // Convert + // + // TODO: All these intrinsics are defined as PureIntrinsic, this attaches the + // IntrSpeculatable property to them. Consider if some of these should + // have this attribute removed as they may be too expensive. 
+ // + def int_nvvm_lohi_i2d : NVVMBuiltin, + PureIntrinsic<[llvm_double_ty], [llvm_i32_ty, llvm_i32_ty]>; + + def int_nvvm_d2i_lo : NVVMBuiltin, + PureIntrinsic<[llvm_i32_ty], [llvm_double_ty]>; + def int_nvvm_d2i_hi : NVVMBuiltin, + PureIntrinsic<[llvm_i32_ty], [llvm_double_ty]>; + + foreach rnd = ["rn", "rz", "rm", "rp"] in { + foreach ftz = ["", "_ftz"] in + def int_nvvm_d2f_ # rnd # ftz : NVVMBuiltin, + PureIntrinsic<[llvm_float_ty], [llvm_double_ty]>; + + foreach sign = ["", "u"] in { + + def int_nvvm_d2 # sign # i_ # rnd : NVVMBuiltin, + PureIntrinsic<[llvm_i32_ty], [llvm_double_ty]>; + + def int_nvvm_ # sign # i2d_ # rnd : NVVMBuiltin, + PureIntrinsic<[llvm_double_ty], [llvm_i32_ty]>; + + foreach ftz = ["", "_ftz"] in + def int_nvvm_f2 # sign # i_ # rnd # ftz : NVVMBuiltin, + PureIntrinsic<[llvm_i32_ty], [llvm_float_ty]>; + + def int_nvvm_ # sign # i2f_ # rnd : NVVMBuiltin, + PureIntrinsic<[llvm_float_ty], [llvm_i32_ty]>; + + foreach ftz = ["", "_ftz"] in + def int_nvvm_f2 # sign # ll_ # rnd # ftz : NVVMBuiltin, + PureIntrinsic<[llvm_i64_ty], [llvm_float_ty]>; + + def int_nvvm_d2 # sign # ll_ # rnd : NVVMBuiltin, + PureIntrinsic<[llvm_i64_ty], [llvm_double_ty]>; + + def int_nvvm_ # sign # ll2f_ # rnd : NVVMBuiltin, + PureIntrinsic<[llvm_float_ty], [llvm_i64_ty]>; + + def int_nvvm_ # sign # ll2d_ # rnd : NVVMBuiltin, + PureIntrinsic<[llvm_double_ty], [llvm_i64_ty]>; + + } // sign + } // rnd + + foreach ftz = ["", "_ftz"] in { + def int_nvvm_f2h_rn # ftz : NVVMBuiltin, + PureIntrinsic<[llvm_i16_ty], [llvm_float_ty]>; + + def int_nvvm_bf2h_rn # ftz : NVVMBuiltin, + PureIntrinsic<[llvm_i16_ty], [llvm_bfloat_ty]>; + } + + foreach rnd = ["rn", "rz"] in { + foreach relu = ["", "_relu"] in { + def int_nvvm_ff2bf16x2_ # rnd # relu : NVVMBuiltin, + PureIntrinsic<[llvm_v2bf16_ty], [llvm_float_ty, llvm_float_ty]>; + + def int_nvvm_ff2f16x2_ # rnd # relu : NVVMBuiltin, + PureIntrinsic<[llvm_v2f16_ty], [llvm_float_ty, llvm_float_ty]>; + + def int_nvvm_f2bf16_ # rnd # 
relu : NVVMBuiltin, + PureIntrinsic<[llvm_bfloat_ty], [llvm_float_ty]>; + } + } + + foreach satfinite = ["", "_satfinite"] in { + def int_nvvm_f2tf32_rna # satfinite : NVVMBuiltin, + PureIntrinsic<[llvm_i32_ty], [llvm_float_ty]>; + + foreach rnd = ["rn", "rz"] in + foreach relu = ["", "_relu"] in + def int_nvvm_f2tf32_ # rnd # relu # satfinite : NVVMBuiltin, + PureIntrinsic<[llvm_i32_ty], [llvm_float_ty]>; + } + + foreach type = ["e4m3x2", "e5m2x2"] in { + foreach relu = ["", "_relu"] in { + def int_nvvm_ff_to_ # type # _rn # relu : NVVMBuiltin, + PureIntrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty]>; + + def int_nvvm_f16x2_to_ # type # _rn # relu : NVVMBuiltin, + PureIntrinsic<[llvm_i16_ty], [llvm_v2f16_ty]>; + + def int_nvvm_ # type # _to_f16x2_rn # relu : NVVMBuiltin, + PureIntrinsic<[llvm_v2f16_ty], [llvm_i16_ty]>; + } + } + + // FP4 conversions. + foreach relu = ["", "_relu"] in { + def int_nvvm_ff_to_e2m1x2_rn # relu # _satfinite : NVVMBuiltin, + PureIntrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty]>; + + def int_nvvm_e2m1x2_to_f16x2_rn # relu : NVVMBuiltin, + PureIntrinsic<[llvm_v2f16_ty], [llvm_i16_ty]>; + } + + // FP6 conversions. + foreach type = ["e2m3x2", "e3m2x2"] in { + foreach relu = ["", "_relu"] in { + def int_nvvm_ff_to_ # type # _rn # relu # _satfinite : NVVMBuiltin, + PureIntrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty]>; + + def int_nvvm_ # type # _to_f16x2_rn # relu : NVVMBuiltin, + PureIntrinsic<[llvm_v2f16_ty], [llvm_i16_ty]>; + } + } + + // UE8M0x2 conversions. 
+ foreach rmode = ["_rz", "_rp"] in { + foreach satmode = ["", "_satfinite"] in { + defvar suffix = rmode # satmode; + def int_nvvm_ff_to_ue8m0x2 # suffix : NVVMBuiltin, + PureIntrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty]>; + + def int_nvvm_bf16x2_to_ue8m0x2 # suffix : NVVMBuiltin, + PureIntrinsic<[llvm_i16_ty], [llvm_v2bf16_ty]>; + + } + } + + def int_nvvm_ue8m0x2_to_bf16x2 : NVVMBuiltin, + PureIntrinsic<[llvm_v2bf16_ty], [llvm_i16_ty]>; + + // + // Atomic operations + // class SCOPED_ATOMIC2_impl : Intrinsic<[elty], [llvm_anyptr_ty, LLVMMatchType<0>], @@ -1337,7 +1333,9 @@ let TargetPrefix = "nvvm" in { defm int_nvvm_atomic_and_gen_i : PTXAtomicWithScope2; defm int_nvvm_atomic_cas_gen_i : PTXAtomicWithScope3; -// Bar.Sync + // + // Bar.Sync + // def int_nvvm_barrier0_popc : ClangBuiltin<"__nvvm_bar0_popc">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrConvergent, IntrNoCallback]>; def int_nvvm_barrier0_and : ClangBuiltin<"__nvvm_bar0_and">, @@ -1361,62 +1359,65 @@ let TargetPrefix = "nvvm" in { } } - // barrier.cluster.[wait, arrive, arrive.relaxed] - def int_nvvm_barrier_cluster_arrive : - Intrinsic<[], [], [IntrConvergent, IntrNoCallback]>; - def int_nvvm_barrier_cluster_arrive_relaxed : - Intrinsic<[], [], [IntrConvergent, IntrNoCallback]>; - def int_nvvm_barrier_cluster_wait : - Intrinsic<[], [], [IntrConvergent, IntrNoCallback]>; + let IntrProperties = [IntrConvergent, IntrNoCallback] in { + // barrier.cluster.[wait, arrive, arrive.relaxed] + def int_nvvm_barrier_cluster_arrive : Intrinsic<[]>; + def int_nvvm_barrier_cluster_arrive_relaxed : Intrinsic<[]>; + def int_nvvm_barrier_cluster_wait : Intrinsic<[]>; - // 'aligned' versions of the above barrier.cluster.* intrinsics - def int_nvvm_barrier_cluster_arrive_aligned : - Intrinsic<[], [], [IntrConvergent, IntrNoCallback]>; - def int_nvvm_barrier_cluster_arrive_relaxed_aligned : - Intrinsic<[], [], [IntrConvergent, IntrNoCallback]>; - def int_nvvm_barrier_cluster_wait_aligned : - Intrinsic<[], [], 
[IntrConvergent, IntrNoCallback]>; + // 'aligned' versions of the above barrier.cluster.* intrinsics + def int_nvvm_barrier_cluster_arrive_aligned : Intrinsic<[]>; + def int_nvvm_barrier_cluster_arrive_relaxed_aligned : Intrinsic<[]>; + def int_nvvm_barrier_cluster_wait_aligned : Intrinsic<[]>; + } + // // Membar - def int_nvvm_membar_cta : NVVMBuiltin, Intrinsic<[], [], [IntrNoCallback]>; - def int_nvvm_membar_gl : NVVMBuiltin, Intrinsic<[], [], [IntrNoCallback]>; - def int_nvvm_membar_sys : NVVMBuiltin, Intrinsic<[], [], [IntrNoCallback]>; - def int_nvvm_fence_sc_cluster : Intrinsic<[], [], [IntrNoCallback]>; + // + let IntrProperties = [IntrNoCallback] in { + def int_nvvm_membar_cta : NVVMBuiltin, Intrinsic<[]>; + def int_nvvm_membar_gl : NVVMBuiltin, Intrinsic<[]>; + def int_nvvm_membar_sys : NVVMBuiltin, Intrinsic<[]>; + def int_nvvm_fence_sc_cluster : Intrinsic<[]>; + } -// Proxy fence (uni-directional) -foreach scope = ["cta", "cluster", "gpu", "sys"] in { + // + // Proxy fence (uni-directional) + // + foreach scope = ["cta", "cluster", "gpu", "sys"] in { - def int_nvvm_fence_proxy_tensormap_generic_release_ # scope : - Intrinsic<[], [], [IntrNoCallback], - "llvm.nvvm.fence.proxy.tensormap_generic.release." # scope>; + def int_nvvm_fence_proxy_tensormap_generic_release_ # scope : + Intrinsic<[], [], [IntrNoCallback], + "llvm.nvvm.fence.proxy.tensormap_generic.release." # scope>; - // The imm-arg 'size' can only be 128. - def int_nvvm_fence_proxy_tensormap_generic_acquire_ # scope : - Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty], - [IntrNoCallback, IntrArgMemOnly, ImmArg>, - Range, 128, 129>], - "llvm.nvvm.fence.proxy.tensormap_generic.acquire." # scope>; -} + // The imm-arg 'size' can only be 128. + def int_nvvm_fence_proxy_tensormap_generic_acquire_ # scope : + Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty], + [IntrNoCallback, IntrArgMemOnly, ImmArg>, + Range, 128, 129>], + "llvm.nvvm.fence.proxy.tensormap_generic.acquire." 
# scope>; + } +// // Async Copy +// let IntrProperties = [IntrConvergent, IntrNoCallback] in { def int_nvvm_cp_async_mbarrier_arrive : NVVMBuiltin, - Intrinsic<[],[llvm_ptr_ty]>; + Intrinsic<[], [llvm_ptr_ty]>; def int_nvvm_cp_async_mbarrier_arrive_shared : NVVMBuiltin, - Intrinsic<[],[llvm_shared_ptr_ty]>; + Intrinsic<[], [llvm_shared_ptr_ty]>; def int_nvvm_cp_async_mbarrier_arrive_noinc : NVVMBuiltin, - Intrinsic<[],[llvm_ptr_ty]>; + Intrinsic<[], [llvm_ptr_ty]>; def int_nvvm_cp_async_mbarrier_arrive_noinc_shared : NVVMBuiltin, - Intrinsic<[],[llvm_shared_ptr_ty]>; + Intrinsic<[], [llvm_shared_ptr_ty]>; } multiclass CP_ASYNC_SHARED_GLOBAL { - def NAME : Intrinsic<[], [llvm_shared_ptr_ty, llvm_global_ptr_ty], - [IntrArgMemOnly, IntrNoCallback, NoAlias>, NoAlias>, - WriteOnly>, ReadOnly>]>; - def _s : Intrinsic<[], [llvm_shared_ptr_ty, llvm_global_ptr_ty, llvm_i32_ty], - [IntrArgMemOnly, IntrNoCallback, NoAlias>, NoAlias>, - WriteOnly>, ReadOnly>]>; + let IntrProperties = [IntrArgMemOnly, IntrNoCallback, NoAlias>, + NoAlias>, WriteOnly>, ReadOnly>] in { + def NAME : Intrinsic<[], [llvm_shared_ptr_ty, llvm_global_ptr_ty]>; + def _s : Intrinsic<[], [llvm_shared_ptr_ty, llvm_global_ptr_ty, llvm_i32_ty]>; + } } defm int_nvvm_cp_async_ca_shared_global_4 : CP_ASYNC_SHARED_GLOBAL; @@ -1424,17 +1425,15 @@ defm int_nvvm_cp_async_ca_shared_global_8 : CP_ASYNC_SHARED_GLOBAL; defm int_nvvm_cp_async_ca_shared_global_16 : CP_ASYNC_SHARED_GLOBAL; defm int_nvvm_cp_async_cg_shared_global_16 : CP_ASYNC_SHARED_GLOBAL; -def int_nvvm_cp_async_commit_group : NVVMBuiltin, Intrinsic<[], [], []>; +def int_nvvm_cp_async_commit_group : NVVMBuiltin, Intrinsic<[]>; def int_nvvm_cp_async_wait_group : NVVMBuiltin, Intrinsic<[], [llvm_i32_ty], [ImmArg>]>; -def int_nvvm_cp_async_wait_all : NVVMBuiltin, - Intrinsic<[], [], []>; +def int_nvvm_cp_async_wait_all : NVVMBuiltin, Intrinsic<[]>; // cp.async.bulk variants of the commit/wait group -def int_nvvm_cp_async_bulk_commit_group : - Intrinsic<[], 
[], []>; +def int_nvvm_cp_async_bulk_commit_group : Intrinsic<[]>; def int_nvvm_cp_async_bulk_wait_group : Intrinsic<[], [llvm_i32_ty], [ImmArg>]>; @@ -1457,29 +1456,30 @@ def int_nvvm_mbarrier_inval_shared : NVVMBuiltin, [IntrConvergent, IntrWriteMem, IntrArgMemOnly, IntrNoCallback, WriteOnly>, NoCapture>]>; -def int_nvvm_mbarrier_arrive : NVVMBuiltin, - Intrinsic<[llvm_i64_ty], [llvm_ptr_ty], [IntrConvergent, IntrNoCallback]>; -def int_nvvm_mbarrier_arrive_shared : NVVMBuiltin, - Intrinsic<[llvm_i64_ty], [llvm_shared_ptr_ty], [IntrConvergent, IntrNoCallback]>; -def int_nvvm_mbarrier_arrive_noComplete : NVVMBuiltin, - Intrinsic<[llvm_i64_ty], [llvm_ptr_ty, llvm_i32_ty], [IntrConvergent, IntrNoCallback]>; -def int_nvvm_mbarrier_arrive_noComplete_shared : NVVMBuiltin, - Intrinsic<[llvm_i64_ty], [llvm_shared_ptr_ty, - llvm_i32_ty], [IntrConvergent, IntrNoCallback]>; +let IntrProperties = [IntrConvergent, IntrNoCallback] in { + def int_nvvm_mbarrier_arrive : NVVMBuiltin, + Intrinsic<[llvm_i64_ty], [llvm_ptr_ty]>; + def int_nvvm_mbarrier_arrive_shared : NVVMBuiltin, + Intrinsic<[llvm_i64_ty], [llvm_shared_ptr_ty]>; + def int_nvvm_mbarrier_arrive_noComplete : NVVMBuiltin, + Intrinsic<[llvm_i64_ty], [llvm_ptr_ty, llvm_i32_ty]>; + def int_nvvm_mbarrier_arrive_noComplete_shared : NVVMBuiltin, + Intrinsic<[llvm_i64_ty], [llvm_shared_ptr_ty, llvm_i32_ty]>; -def int_nvvm_mbarrier_arrive_drop : NVVMBuiltin, - Intrinsic<[llvm_i64_ty], [llvm_ptr_ty], [IntrConvergent, IntrNoCallback]>; -def int_nvvm_mbarrier_arrive_drop_shared : NVVMBuiltin, - Intrinsic<[llvm_i64_ty], [llvm_shared_ptr_ty], [IntrConvergent, IntrNoCallback]>; -def int_nvvm_mbarrier_arrive_drop_noComplete : NVVMBuiltin, - Intrinsic<[llvm_i64_ty], [llvm_ptr_ty, llvm_i32_ty], [IntrConvergent, IntrNoCallback]>; -def int_nvvm_mbarrier_arrive_drop_noComplete_shared : NVVMBuiltin, - Intrinsic<[llvm_i64_ty], [llvm_shared_ptr_ty, llvm_i32_ty], [IntrConvergent, IntrNoCallback]>; + def int_nvvm_mbarrier_arrive_drop : 
NVVMBuiltin, + Intrinsic<[llvm_i64_ty], [llvm_ptr_ty]>; + def int_nvvm_mbarrier_arrive_drop_shared : NVVMBuiltin, + Intrinsic<[llvm_i64_ty], [llvm_shared_ptr_ty]>; + def int_nvvm_mbarrier_arrive_drop_noComplete : NVVMBuiltin, + Intrinsic<[llvm_i64_ty], [llvm_ptr_ty, llvm_i32_ty]>; + def int_nvvm_mbarrier_arrive_drop_noComplete_shared : NVVMBuiltin, + Intrinsic<[llvm_i64_ty], [llvm_shared_ptr_ty, llvm_i32_ty]>; -def int_nvvm_mbarrier_test_wait : NVVMBuiltin, - Intrinsic<[llvm_i1_ty], [llvm_ptr_ty, llvm_i64_ty], [IntrConvergent, IntrNoCallback]>; -def int_nvvm_mbarrier_test_wait_shared : NVVMBuiltin, - Intrinsic<[llvm_i1_ty], [llvm_shared_ptr_ty, llvm_i64_ty], [IntrConvergent, IntrNoCallback]>; + def int_nvvm_mbarrier_test_wait : NVVMBuiltin, + Intrinsic<[llvm_i1_ty], [llvm_ptr_ty, llvm_i64_ty]>; + def int_nvvm_mbarrier_test_wait_shared : NVVMBuiltin, + Intrinsic<[llvm_i1_ty], [llvm_shared_ptr_ty, llvm_i64_ty]>; +} def int_nvvm_mbarrier_pending_count : NVVMBuiltin, Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem, IntrConvergent, IntrNoCallback]>; @@ -1504,9 +1504,8 @@ let IntrProperties = [IntrReadMem, IntrArgMemOnly, IntrNoCallback, IntrWillRetur // space when lowered during ISel. 
// def int_nvvm_internal_addrspace_wrap : - DefaultAttrsIntrinsic<[llvm_anyptr_ty], [llvm_anyptr_ty], - [IntrNoMem, IntrSpeculatable, NoUndef>, - NoUndef]>; + PureIntrinsic<[llvm_anyptr_ty], [llvm_anyptr_ty], + [NoUndef>, NoUndef]>; // Move intrinsics, used in nvvm internally @@ -1520,36 +1519,26 @@ let IntrProperties = [IntrNoMem] in { } // For getting the handle from a texture or surface variable -let IntrProperties = [IntrNoMem, IntrSpeculatable] in { - def int_nvvm_texsurf_handle - : DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_metadata_ty, llvm_anyptr_ty]>; - def int_nvvm_texsurf_handle_internal - : DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_anyptr_ty]>; -} +def int_nvvm_texsurf_handle + : PureIntrinsic<[llvm_i64_ty], [llvm_metadata_ty, llvm_anyptr_ty]>; +def int_nvvm_texsurf_handle_internal + : PureIntrinsic<[llvm_i64_ty], [llvm_anyptr_ty]>; /// Error / Warn def int_nvvm_compiler_error : Intrinsic<[], [llvm_anyptr_ty]>; def int_nvvm_compiler_warn : Intrinsic<[], [llvm_anyptr_ty]>; -def int_nvvm_reflect : NVVMBuiltin, - Intrinsic<[llvm_i32_ty], [llvm_ptr_ty], [IntrNoMem]>; +def int_nvvm_reflect : NVVMBuiltin, PureIntrinsic<[llvm_i32_ty], [llvm_ptr_ty]>; // isspacep.{const, global, local, shared} foreach space = ["const", "global", "local", "shared", "shared_cluster"] in def int_nvvm_isspacep_ # space : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_ptr_ty], - [IntrNoMem, IntrSpeculatable, NoCapture>]>; - -// Environment register read -foreach i = 0...31 in - def int_nvvm_read_ptx_sreg_envreg # i : NVVMBuiltin, - DefaultAttrsIntrinsic<[llvm_i32_ty], [], - [IntrNoMem, IntrSpeculatable, NoUndef]>; + PureIntrinsic<[llvm_i1_ty], [llvm_ptr_ty], [NoCapture>]>; // // Texture Fetch // -let IntrProperties = [IntrReadMem] in { +let IntrProperties = [IntrReadMem, IntrNoCallback, IntrNoFree, IntrWillReturn] in { foreach is_unified = [true, false] in { defvar mode = !if(is_unified, "_unified", ""); defvar addr_args = !if(is_unified, [llvm_i64_ty], [llvm_i64_ty, 
llvm_i64_ty]); @@ -1558,76 +1547,63 @@ let IntrProperties = [IntrReadMem] in { foreach is_array = [true, false] in { defvar array = !if(is_array, "_array", ""); defvar array_args = !if(is_array, [llvm_i32_ty], []); + defvar base_args = !listconcat(addr_args, array_args); def int_nvvm_tex # mode # _1d # array # _ # vec.Name # _s32 - : Intrinsic; + : Intrinsic; def int_nvvm_tex # mode # _1d # array # _ # vec.Name # _f32 - : Intrinsic; + : Intrinsic; def int_nvvm_tex # mode # _1d # array # _level_ # vec.Name # _f32 - : Intrinsic; + : Intrinsic; def int_nvvm_tex # mode # _1d # array # _grad_ # vec.Name # _f32 - : Intrinsic; + : Intrinsic; def int_nvvm_tex # mode # _2d # array # _ # vec.Name # _s32 - : Intrinsic; + : Intrinsic; def int_nvvm_tex # mode # _2d # array # _ # vec.Name # _f32 - : Intrinsic; + : Intrinsic; def int_nvvm_tex # mode # _2d # array # _level_ # vec.Name # _f32 - : Intrinsic; + : Intrinsic; def int_nvvm_tex # mode # _2d # array # _grad_ # vec.Name # _f32 - : Intrinsic; + : Intrinsic; if !not(is_array) then { def int_nvvm_tex # mode # _3d_ # vec.Name # _s32 - : Intrinsic; + : Intrinsic; def int_nvvm_tex # mode # _3d_ # vec.Name # _f32 - : Intrinsic; + : Intrinsic; def int_nvvm_tex # mode # _3d_level_ # vec.Name # _f32 - : Intrinsic; + : Intrinsic; def int_nvvm_tex # mode # _3d_grad_ # vec.Name # _f32 - : Intrinsic; + : Intrinsic; } def int_nvvm_tex # mode # _cube # array # _ # vec.Name # _f32 - : Intrinsic; + : Intrinsic; def int_nvvm_tex # mode # _cube # array # _level_ # vec.Name # _f32 - : Intrinsic; + : Intrinsic; if is_unified then def int_nvvm_tex # mode # _cube # array # _grad_ # vec.Name # _f32 - : Intrinsic; + : Intrinsic; } // is_array foreach comp = ["r", "g", "b", "a"] in { def int_nvvm_tld4 # mode # _ # comp # _2d_ # vec.Name # _f32 - : Intrinsic; + : Intrinsic; } // comp } // vec } // is_unified } // IntrProperties = [IntrReadMem] //=== Surface Load -let IntrProperties = [IntrReadMem] in { - foreach clamp = ["clamp", "trap", "zero"] in { 
- foreach vec = [TV_I8, TV_I16, TV_I32, TV_I64, - TV_V2I8, TV_V2I16, TV_V2I32, TV_V2I64, - TV_V4I8, TV_V4I16, TV_V4I32] in { +foreach clamp = ["clamp", "trap", "zero"] in { + foreach vec = [TV_I8, TV_I16, TV_I32, TV_I64, + TV_V2I8, TV_V2I16, TV_V2I32, TV_V2I64, + TV_V4I8, TV_V4I16, TV_V4I32] in { + + let IntrProperties = [IntrNoCallback, IntrNoFree, IntrReadMem] + # !if(!ne(clamp, "trap"), [IntrWillReturn], []) in { def int_nvvm_suld_1d_ # vec.Name # _ # clamp : Intrinsic; - } // vec - } // clamp -} // IntrProperties = [IntrReadMem] + } + } // vec +} // clamp //===- Texture Query ------------------------------------------------------===// foreach query = ["channel_order", "channel_data_type", "width", "height", "depth", "array_size", "num_samples", "num_mipmap_levels"] in def int_nvvm_txq_ # query : NVVMBuiltin, - Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>; + DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>; //===- Surface Query ------------------------------------------------------===// foreach query = ["channel_order", "channel_data_type", "width", "height", "depth", "array_size"] in def int_nvvm_suq_ # query : NVVMBuiltin, - Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>; + DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>; //===- Handle Query -------------------------------------------------------===// foreach type = ["sampler", "surface", "texture"] in def int_nvvm_istypep_ # type : NVVMBuiltin, - Intrinsic<[llvm_i1_ty], [llvm_i64_ty], [IntrNoMem]>; + DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_i64_ty], [IntrNoMem]>; //===- Surface Stores -----------------------------------------------------===// multiclass SurfaceStoreIntrinsics { - def _1d_ # vec.Name # _ # clamp : NVVMBuiltin, - Intrinsic<[], !listconcat([llvm_i64_ty, llvm_i32_ty], vec.Types)>; + let IntrProperties = [IntrNoCallback, IntrNoFree, IntrWriteMem] # + !if(!ne(clamp, "trap"), [IntrWillReturn], []) in { + def _1d_ # vec.Name # _ # clamp : NVVMBuiltin, + 
Intrinsic<[], [llvm_i64_ty, llvm_i32_ty] # vec.Types>; - def _1d_array_ # vec.Name # _ # clamp : NVVMBuiltin, - Intrinsic<[], !listconcat([llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], vec.Types)>; + def _1d_array_ # vec.Name # _ # clamp : NVVMBuiltin, + Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty] # vec.Types>; - def _2d_ # vec.Name # _ # clamp : NVVMBuiltin, - Intrinsic<[], !listconcat([llvm_i64_ty, llvm_i32_ty, llvm_i32_ty], vec.Types)>; + def _2d_ # vec.Name # _ # clamp : NVVMBuiltin, + Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty] # vec.Types>; - def _2d_array_ # vec.Name # _ # clamp : NVVMBuiltin, - Intrinsic<[], !listconcat([llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], vec.Types)>; + def _2d_array_ # vec.Name # _ # clamp : NVVMBuiltin, + Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty] # vec.Types>; - def _3d_ # vec.Name # _ # clamp : NVVMBuiltin, - Intrinsic<[], !listconcat([llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], vec.Types)>; + def _3d_ # vec.Name # _ # clamp : NVVMBuiltin, + Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty] # vec.Types>; + } } // Unformatted @@ -1704,23 +1683,17 @@ foreach vec = [TV_I8, TV_I16, TV_I32, TV_V4I8, TV_V4I16, TV_V4I32] in defm int_nvvm_sust_p : SurfaceStoreIntrinsics<"trap", vec>; +// // Accessing special registers. - +// class PTXReadSRegIntrinsicNB_r32 properties = []> - : DefaultAttrsIntrinsic<[llvm_i32_ty], [], - !listconcat([IntrNoMem, IntrSpeculatable, NoUndef], properties)>; + : PureIntrinsic<[llvm_i32_ty], [], [NoUndef] # properties>; class PTXReadSRegIntrinsic_r32 properties = []> - : PTXReadSRegIntrinsicNB_r32, - NVVMBuiltin; + : PTXReadSRegIntrinsicNB_r32, NVVMBuiltin; multiclass PTXReadSRegIntrinsic_v4i32> properties = [[], [], [], []]> { assert !eq(!size(properties), 4), "properties must be a list of 4 lists"; -// FIXME: Do we need the 128-bit integer type version? 
-// def _r64 : Intrinsic<[llvm_i128_ty], [], [IntrNoMem, IntrSpeculatable]>; - -// FIXME: Enable this once v4i32 support is enabled in back-end. -// def _v4i16 : Intrinsic<[llvm_v4i32_ty], [], [IntrNoMem, IntrSpeculatable]>; defvar suffixes = ["_x", "_y", "_z", "_w"]; foreach i = !range(suffixes) in def suffixes[i] : PTXReadSRegIntrinsic_r32; @@ -1737,30 +1710,20 @@ multiclass PTXReadSRegIntrinsicNB_v4i32> properties // Intrinsics to read registers with non-constant values. E.g. the values that // do change over the kernel lifetime. Such reads should not be CSE'd. -class PTXReadNCSRegIntrinsic_r32 - : Intrinsic<[llvm_i32_ty], [], [IntrInaccessibleMemOnly, IntrNoCallback, NoUndef]>, - NVVMBuiltin; -class PTXReadNCSRegIntrinsic_r64 - : Intrinsic<[llvm_i64_ty], [], [IntrInaccessibleMemOnly, IntrNoCallback, NoUndef]>, +class PTXReadNCSRegIntrinsic + : Intrinsic<[ty], [], [IntrInaccessibleMemOnly, IntrNoCallback, + IntrNoFree, IntrWillReturn, NoUndef]>, NVVMBuiltin; -defm int_nvvm_read_ptx_sreg_tid - : PTXReadSRegIntrinsic_v4i32<[[Range], - [Range], - [Range], - [Range]]>; +defvar MAX_BLOCK_ID_RANGE = [[Range], + [Range], + [Range], + [Range]]; -defm int_nvvm_read_ptx_sreg_ntid - : PTXReadSRegIntrinsic_v4i32<[[Range], - [Range], - [Range], - [Range]]>; - -def int_nvvm_read_ptx_sreg_laneid - : PTXReadSRegIntrinsic_r32<[Range]>; - -def int_nvvm_read_ptx_sreg_warpid : PTXReadSRegIntrinsic_r32; -def int_nvvm_read_ptx_sreg_nwarpid : PTXReadSRegIntrinsic_r32; +defvar MAX_BLOCK_NID_RANGE = [[Range], + [Range], + [Range], + [Range]]; defvar MAX_GRID_ID_RANGE = [[Range], [Range], @@ -1772,11 +1735,17 @@ defvar MAX_GRID_NID_RANGE = [[Range], [Range], [Range]]; -defm int_nvvm_read_ptx_sreg_ctaid - : PTXReadSRegIntrinsic_v4i32; +defm int_nvvm_read_ptx_sreg_tid : PTXReadSRegIntrinsic_v4i32; +defm int_nvvm_read_ptx_sreg_ntid : PTXReadSRegIntrinsic_v4i32; -defm int_nvvm_read_ptx_sreg_nctaid - : PTXReadSRegIntrinsic_v4i32; +def int_nvvm_read_ptx_sreg_laneid + : 
PTXReadSRegIntrinsic_r32<[Range]>; + +def int_nvvm_read_ptx_sreg_warpid : PTXReadSRegIntrinsic_r32; +def int_nvvm_read_ptx_sreg_nwarpid : PTXReadSRegIntrinsic_r32; + +defm int_nvvm_read_ptx_sreg_ctaid : PTXReadSRegIntrinsic_v4i32; +defm int_nvvm_read_ptx_sreg_nctaid : PTXReadSRegIntrinsic_v4i32; def int_nvvm_read_ptx_sreg_smid : PTXReadSRegIntrinsic_r32; def int_nvvm_read_ptx_sreg_nsmid : PTXReadSRegIntrinsic_r32; @@ -1788,19 +1757,22 @@ def int_nvvm_read_ptx_sreg_lanemask_lt : PTXReadSRegIntrinsic_r32; def int_nvvm_read_ptx_sreg_lanemask_ge : PTXReadSRegIntrinsic_r32; def int_nvvm_read_ptx_sreg_lanemask_gt : PTXReadSRegIntrinsic_r32; -def int_nvvm_read_ptx_sreg_clock : PTXReadNCSRegIntrinsic_r32; -def int_nvvm_read_ptx_sreg_clock64 : PTXReadNCSRegIntrinsic_r64; +def int_nvvm_read_ptx_sreg_clock : PTXReadNCSRegIntrinsic; +def int_nvvm_read_ptx_sreg_clock64 : PTXReadNCSRegIntrinsic; -def int_nvvm_read_ptx_sreg_globaltimer : PTXReadNCSRegIntrinsic_r64; +def int_nvvm_read_ptx_sreg_globaltimer : PTXReadNCSRegIntrinsic; -def int_nvvm_read_ptx_sreg_pm0 : PTXReadNCSRegIntrinsic_r32; -def int_nvvm_read_ptx_sreg_pm1 : PTXReadNCSRegIntrinsic_r32; -def int_nvvm_read_ptx_sreg_pm2 : PTXReadNCSRegIntrinsic_r32; -def int_nvvm_read_ptx_sreg_pm3 : PTXReadNCSRegIntrinsic_r32; +def int_nvvm_read_ptx_sreg_pm0 : PTXReadNCSRegIntrinsic; +def int_nvvm_read_ptx_sreg_pm1 : PTXReadNCSRegIntrinsic; +def int_nvvm_read_ptx_sreg_pm2 : PTXReadNCSRegIntrinsic; +def int_nvvm_read_ptx_sreg_pm3 : PTXReadNCSRegIntrinsic; def int_nvvm_read_ptx_sreg_warpsize : PTXReadSRegIntrinsic_r32<[Range]>; +foreach i = 0...31 in + def int_nvvm_read_ptx_sreg_envreg # i : PTXReadSRegIntrinsic_r32; + // sm90+, PTX7.8+ // Note: Since clusters are subdivisions of the grid, we conservatively use the @@ -1808,14 +1780,10 @@ def int_nvvm_read_ptx_sreg_warpsize // practice, the clusterid will likely be much smaller. The CUDA programming // guide recommends 8 as a maximum portable value and H100s support 16. 
-defm int_nvvm_read_ptx_sreg_clusterid - : PTXReadSRegIntrinsicNB_v4i32; -defm int_nvvm_read_ptx_sreg_nclusterid - : PTXReadSRegIntrinsicNB_v4i32; -defm int_nvvm_read_ptx_sreg_cluster_ctaid - : PTXReadSRegIntrinsicNB_v4i32; -defm int_nvvm_read_ptx_sreg_cluster_nctaid - : PTXReadSRegIntrinsicNB_v4i32; +defm int_nvvm_read_ptx_sreg_clusterid : PTXReadSRegIntrinsicNB_v4i32; +defm int_nvvm_read_ptx_sreg_nclusterid : PTXReadSRegIntrinsicNB_v4i32; +defm int_nvvm_read_ptx_sreg_cluster_ctaid : PTXReadSRegIntrinsicNB_v4i32; +defm int_nvvm_read_ptx_sreg_cluster_nctaid : PTXReadSRegIntrinsicNB_v4i32; def int_nvvm_read_ptx_sreg_cluster_ctarank : PTXReadSRegIntrinsicNB_r32; def int_nvvm_read_ptx_sreg_cluster_nctarank : PTXReadSRegIntrinsicNB_r32; @@ -1843,13 +1811,13 @@ let IntrProperties = [IntrInaccessibleMemOnly, IntrConvergent, IntrNoCallback] i // // VOTE // - let IntrProperties = [IntrInaccessibleMemOnly, IntrConvergent, IntrNoCallback] in { def int_nvvm_vote_all : NVVMBuiltin, Intrinsic<[llvm_i1_ty], [llvm_i1_ty]>; def int_nvvm_vote_any : NVVMBuiltin, Intrinsic<[llvm_i1_ty], [llvm_i1_ty]>; def int_nvvm_vote_uni : NVVMBuiltin, Intrinsic<[llvm_i1_ty], [llvm_i1_ty]>; def int_nvvm_vote_ballot : NVVMBuiltin, Intrinsic<[llvm_i32_ty], [llvm_i1_ty]>; } + // // VOTE.SYNC // @@ -2052,8 +2020,7 @@ let IntrProperties = [IntrNoMem, IntrSpeculatable, NoCapture>] in { } def int_nvvm_is_explicit_cluster - : DefaultAttrsIntrinsic<[llvm_i1_ty], [], - [IntrNoMem, IntrSpeculatable, NoUndef], + : PureIntrinsic<[llvm_i1_ty], [], [NoUndef], "llvm.nvvm.is_explicit_cluster">; // Setmaxnreg inc/dec intrinsics @@ -2458,13 +2425,12 @@ def int_nvvm_clusterlaunchcontrol_try_cancel_async_multicast_shared // clusterlaunchcontrol.query_cancel.is_canceled def int_nvvm_clusterlaunchcontrol_query_cancel_is_canceled - : DefaultAttrsIntrinsic<[llvm_i1_ty], [llvm_i128_ty], [IntrNoMem, IntrSpeculatable], - "llvm.nvvm.clusterlaunchcontrol.query_cancel.is_canceled">; + : PureIntrinsic<[llvm_i1_ty], 
[llvm_i128_ty], [], + "llvm.nvvm.clusterlaunchcontrol.query_cancel.is_canceled">; -foreach dim = ["x", "y", "z"] in { -def int_nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_ # dim - : DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i128_ty], [IntrNoMem, IntrSpeculatable], - "llvm.nvvm.clusterlaunchcontrol.query_cancel.get_first_ctaid." # dim>; -} +foreach dim = ["x", "y", "z"] in + def int_nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_ # dim + : PureIntrinsic<[llvm_i32_ty], [llvm_i128_ty], [], + "llvm.nvvm.clusterlaunchcontrol.query_cancel.get_first_ctaid." # dim>; } // let TargetPrefix = "nvvm" diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.td b/llvm/include/llvm/IR/RuntimeLibcalls.td index 9072a0aa1531..9626004cbed4 100644 --- a/llvm/include/llvm/IR/RuntimeLibcalls.td +++ b/llvm/include/llvm/IR/RuntimeLibcalls.td @@ -406,6 +406,17 @@ multiclass LibmLongDoubleLibCall AArch64LibcallImpls = { def __arm_sc_memcpy : RuntimeLibcallImpl; def __arm_sc_memmove : RuntimeLibcallImpl; def __arm_sc_memset : RuntimeLibcallImpl; + def __arm_sc_memchr : RuntimeLibcallImpl; } // End AArch64LibcallImpls +def __arm_sme_state : RuntimeLibcallImpl; +def __arm_tpidr2_save : RuntimeLibcallImpl; +def __arm_za_disable : RuntimeLibcallImpl; +def __arm_tpidr2_restore : RuntimeLibcallImpl; +def __arm_get_current_vg : RuntimeLibcallImpl; +def __arm_sme_state_size : RuntimeLibcallImpl; +def __arm_sme_save : RuntimeLibcallImpl; +def __arm_sme_restore : RuntimeLibcallImpl; + +def SMEABI_LibCalls_PreserveMost_From_X0 : LibcallsWithCC<(add + __arm_tpidr2_save, + __arm_za_disable, + __arm_tpidr2_restore), + SMEABI_PreserveMost_From_X0>; + +def SMEABI_LibCalls_PreserveMost_From_X1 : LibcallsWithCC<(add + __arm_get_current_vg, + __arm_sme_state_size, + __arm_sme_save, + __arm_sme_restore), + SMEABI_PreserveMost_From_X1>; + +def SMEABI_LibCalls_PreserveMost_From_X2 : LibcallsWithCC<(add + __arm_sme_state), + SMEABI_PreserveMost_From_X2>; + def isAArch64_ExceptArm64EC : 
RuntimeLibcallPredicate<"(TT.isAArch64() && !TT.isWindowsArm64EC())">; def isWindowsArm64EC : RuntimeLibcallPredicate<"TT.isWindowsArm64EC()">; @@ -1244,7 +1282,10 @@ def AArch64SystemLibrary : SystemRuntimeLibrary< LibmHasSinCosF32, LibmHasSinCosF64, LibmHasSinCosF128, DefaultLibmExp10, DefaultStackProtector, - SecurityCheckCookieIfWinMSVC) + SecurityCheckCookieIfWinMSVC, + SMEABI_LibCalls_PreserveMost_From_X0, + SMEABI_LibCalls_PreserveMost_From_X1, + SMEABI_LibCalls_PreserveMost_From_X2) >; // Prepend a # to every name diff --git a/llvm/include/llvm/IR/RuntimeLibcallsImpl.td b/llvm/include/llvm/IR/RuntimeLibcallsImpl.td index 601c291daf89..b5752c1b69ad 100644 --- a/llvm/include/llvm/IR/RuntimeLibcallsImpl.td +++ b/llvm/include/llvm/IR/RuntimeLibcallsImpl.td @@ -36,6 +36,9 @@ def ARM_AAPCS : LibcallCallingConv<[{CallingConv::ARM_AAPCS}]>; def ARM_AAPCS_VFP : LibcallCallingConv<[{CallingConv::ARM_AAPCS_VFP}]>; def X86_STDCALL : LibcallCallingConv<[{CallingConv::X86_StdCall}]>; def AVR_BUILTIN : LibcallCallingConv<[{CallingConv::AVR_BUILTIN}]>; +def SMEABI_PreserveMost_From_X0 : LibcallCallingConv<[{CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0}]>; +def SMEABI_PreserveMost_From_X1 : LibcallCallingConv<[{CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1}]>; +def SMEABI_PreserveMost_From_X2 : LibcallCallingConv<[{CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2}]>; /// Abstract definition for functionality the compiler may need to /// emit a call to. 
Emits the RTLIB::Libcall enum - This enum defines diff --git a/llvm/include/llvm/IR/VPIntrinsics.def b/llvm/include/llvm/IR/VPIntrinsics.def index 4a71097226f1..cd7343ff8df5 100644 --- a/llvm/include/llvm/IR/VPIntrinsics.def +++ b/llvm/include/llvm/IR/VPIntrinsics.def @@ -278,24 +278,28 @@ END_REGISTER_VP(vp_fshr, VP_FSHR) // llvm.vp.sadd.sat(x,y,mask,vlen) BEGIN_REGISTER_VP(vp_sadd_sat, 2, 3, VP_SADDSAT, -1) +VP_PROPERTY_BINARYOP VP_PROPERTY_FUNCTIONAL_INTRINSIC(sadd_sat) VP_PROPERTY_FUNCTIONAL_SDOPC(SADDSAT) END_REGISTER_VP(vp_sadd_sat, VP_SADDSAT) // llvm.vp.uadd.sat(x,y,mask,vlen) BEGIN_REGISTER_VP(vp_uadd_sat, 2, 3, VP_UADDSAT, -1) +VP_PROPERTY_BINARYOP VP_PROPERTY_FUNCTIONAL_INTRINSIC(uadd_sat) VP_PROPERTY_FUNCTIONAL_SDOPC(UADDSAT) END_REGISTER_VP(vp_uadd_sat, VP_UADDSAT) // llvm.vp.ssub.sat(x,y,mask,vlen) BEGIN_REGISTER_VP(vp_ssub_sat, 2, 3, VP_SSUBSAT, -1) +VP_PROPERTY_BINARYOP VP_PROPERTY_FUNCTIONAL_INTRINSIC(ssub_sat) VP_PROPERTY_FUNCTIONAL_SDOPC(SSUBSAT) END_REGISTER_VP(vp_ssub_sat, VP_SSUBSAT) // llvm.vp.usub.sat(x,y,mask,vlen) BEGIN_REGISTER_VP(vp_usub_sat, 2, 3, VP_USUBSAT, -1) +VP_PROPERTY_BINARYOP VP_PROPERTY_FUNCTIONAL_INTRINSIC(usub_sat) VP_PROPERTY_FUNCTIONAL_SDOPC(USUBSAT) END_REGISTER_VP(vp_usub_sat, VP_USUBSAT) diff --git a/llvm/include/llvm/MC/MCSymbol.h b/llvm/include/llvm/MC/MCSymbol.h index ddc560ec5250..cbacd15bf602 100644 --- a/llvm/include/llvm/MC/MCSymbol.h +++ b/llvm/include/llvm/MC/MCSymbol.h @@ -42,11 +42,10 @@ class raw_ostream; class MCSymbol { protected: // A symbol can be regular, equated to an expression, or a common symbol. - enum Contents : uint8_t { - SymContentsUnset, - SymContentsVariable, - SymContentsCommon, - SymContentsTargetCommon, // Index stores the section index + enum Kind : uint8_t { + Regular, + Equated, + Common, }; // Special sentinel value for the absolute pseudo fragment. @@ -65,6 +64,10 @@ protected: /// relative to, if any. mutable MCFragment *Fragment = nullptr; + /// The symbol kind. 
Use an unsigned bitfield to achieve better bitpacking + /// with MSVC. + unsigned kind : 2; + /// True if this symbol is named. A named symbol will have a pointer to the /// name allocated in the bytes immediately prior to the MCSymbol. unsigned HasName : 1; @@ -95,10 +98,6 @@ protected: /// Used to detect cyclic dependency like `a = a + 1` and `a = b; b = a`. unsigned IsResolving : 1; - /// This is actually a Contents enumerator, but is unsigned to avoid sign - /// extension and achieve better bitpacking with MSVC. - unsigned SymbolContents : 3; - /// The alignment of the symbol if it is 'common'. /// /// Internally, this is stored as log2(align) + 1. @@ -145,9 +144,9 @@ protected: }; MCSymbol(const MCSymbolTableEntry *Name, bool isTemporary) - : IsTemporary(isTemporary), IsRedefinable(false), IsRegistered(false), - IsExternal(false), IsPrivateExtern(false), IsWeakExternal(false), - IsUsedInReloc(false), IsResolving(0), SymbolContents(SymContentsUnset), + : kind(Kind::Regular), IsTemporary(isTemporary), IsRedefinable(false), + IsRegistered(false), IsExternal(false), IsPrivateExtern(false), + IsWeakExternal(false), IsUsedInReloc(false), IsResolving(0), CommonAlignLog2(0), Flags(0) { Offset = 0; HasName = !!Name; @@ -212,11 +211,11 @@ public: /// Prepare this symbol to be redefined. void redefineIfPossible() { if (IsRedefinable) { - if (SymbolContents == SymContentsVariable) { + if (kind == Kind::Equated) { Value = nullptr; - SymbolContents = SymContentsUnset; + kind = Kind::Regular; } - setUndefined(); + Fragment = nullptr; IsRedefinable = false; } } @@ -260,17 +259,12 @@ public: Fragment = F; } - /// Mark the symbol as undefined. - void setUndefined() { Fragment = nullptr; } - /// @} /// \name Variable Symbols /// @{ /// isVariable - Check if this is a variable symbol. - bool isVariable() const { - return SymbolContents == SymContentsVariable; - } + bool isVariable() const { return kind == Equated; } /// Get the expression of the variable symbol. 
const MCExpr *getVariableValue() const { @@ -293,12 +287,12 @@ public: } uint64_t getOffset() const { - assert(SymbolContents == SymContentsUnset && + assert(kind == Kind::Regular && "Cannot get offset for a common/variable symbol"); return Offset; } void setOffset(uint64_t Value) { - assert(SymbolContents == SymContentsUnset && + assert(kind == Kind::Regular && "Cannot set offset for a common/variable symbol"); Offset = Value; } @@ -314,10 +308,10 @@ public: /// \param Size - The size of the symbol. /// \param Alignment - The alignment of the symbol. /// \param Target - Is the symbol a target-specific common-like symbol. - void setCommon(uint64_t Size, Align Alignment, bool Target = false) { + void setCommon(uint64_t Size, Align Alignment) { assert(getOffset() == 0); CommonSize = Size; - SymbolContents = Target ? SymContentsTargetCommon : SymContentsCommon; + kind = Kind::Common; unsigned Log2Align = encode(Alignment); assert(Log2Align < (1U << NumCommonAlignmentBits) && @@ -335,29 +329,19 @@ public: /// /// \param Size - The size of the symbol. /// \param Alignment - The alignment of the symbol. - /// \param Target - Is the symbol a target-specific common-like symbol. /// \return True if symbol was already declared as a different type - bool declareCommon(uint64_t Size, Align Alignment, bool Target = false) { + bool declareCommon(uint64_t Size, Align Alignment) { assert(isCommon() || getOffset() == 0); if(isCommon()) { - if (CommonSize != Size || getCommonAlignment() != Alignment || - isTargetCommon() != Target) + if (CommonSize != Size || getCommonAlignment() != Alignment) return true; } else - setCommon(Size, Alignment, Target); + setCommon(Size, Alignment); return false; } /// Is this a 'common' symbol. - bool isCommon() const { - return SymbolContents == SymContentsCommon || - SymbolContents == SymContentsTargetCommon; - } - - /// Is this a target-specific common-like symbol. 
- bool isTargetCommon() const { - return SymbolContents == SymContentsTargetCommon; - } + bool isCommon() const { return kind == Kind::Common; } MCFragment *getFragment() const { if (Fragment || !isVariable() || isWeakExternal()) diff --git a/llvm/include/llvm/Object/DXContainer.h b/llvm/include/llvm/Object/DXContainer.h index ad1b2361ff06..93d39dabae4b 100644 --- a/llvm/include/llvm/Object/DXContainer.h +++ b/llvm/include/llvm/Object/DXContainer.h @@ -603,6 +603,8 @@ private: } public: + const DXContainer &getDXContainer() const { return Container; } + static bool classof(const Binary *v) { return v->isDXContainer(); } Expected getSymbolName(DataRefImpl) const override; diff --git a/llvm/include/llvm/Support/DXILABI.h b/llvm/include/llvm/Support/DXILABI.h index 2dcdd73415be..b25b3632f6c3 100644 --- a/llvm/include/llvm/Support/DXILABI.h +++ b/llvm/include/llvm/Support/DXILABI.h @@ -18,7 +18,6 @@ #define LLVM_SUPPORT_DXILABI_H #include "llvm/ADT/StringRef.h" -#include "llvm/Support/ScopedPrinter.h" #include namespace llvm { @@ -101,8 +100,6 @@ enum class SamplerFeedbackType : uint32_t { const unsigned MinWaveSize = 4; const unsigned MaxWaveSize = 128; -LLVM_ABI ArrayRef> getResourceClasses(); - LLVM_ABI StringRef getResourceClassName(ResourceClass RC); } // namespace dxil diff --git a/llvm/include/llvm/Transforms/Utils/SSAUpdaterImpl.h b/llvm/include/llvm/Transforms/Utils/SSAUpdaterImpl.h index 746926e5bee3..52fe3a6f4baf 100644 --- a/llvm/include/llvm/Transforms/Utils/SSAUpdaterImpl.h +++ b/llvm/include/llvm/Transforms/Utils/SSAUpdaterImpl.h @@ -366,7 +366,7 @@ public: continue; // Look for an existing PHI. - FindExistingPHI(Info->BB, BlockList); + FindExistingPHI(Info->BB); if (Info->AvailableVal) continue; @@ -412,11 +412,11 @@ public: /// FindExistingPHI - Look through the PHI nodes in a block to see if any of /// them match what is needed. 
- void FindExistingPHI(BlkT *BB, BlockListTy *BlockList) { + void FindExistingPHI(BlkT *BB) { SmallVector TaggedBlocks; for (auto &SomePHI : BB->phis()) { if (CheckIfPHIMatches(&SomePHI, TaggedBlocks)) { - RecordMatchingPHIs(BlockList); + RecordMatchingPHIs(TaggedBlocks); break; } } @@ -424,7 +424,7 @@ public: /// CheckIfPHIMatches - Check if a PHI node matches the placement and values /// in the BBMap. - bool CheckIfPHIMatches(PhiT *PHI, SmallVectorImpl &TaggedBlocks) { + bool CheckIfPHIMatches(PhiT *PHI, BlockListTy &TaggedBlocks) { // Match failed: clear all the PHITag values. Only need to clear visited // blocks. auto Cleanup = make_scope_exit([&]() { @@ -484,15 +484,15 @@ public: /// RecordMatchingPHIs - For each PHI node that matches, record it in both /// the BBMap and the AvailableVals mapping. - void RecordMatchingPHIs(BlockListTy *BlockList) { - for (typename BlockListTy::iterator I = BlockList->begin(), - E = BlockList->end(); I != E; ++I) - if (PhiT *PHI = (*I)->PHITag) { - BlkT *BB = PHI->getParent(); - ValT PHIVal = Traits::GetPHIValue(PHI); - (*AvailableVals)[BB] = PHIVal; - BBMap[BB]->AvailableVal = PHIVal; - } + void RecordMatchingPHIs(BlockListTy &TaggedBlocks) { + for (BBInfo *Block : TaggedBlocks) { + PhiT *PHI = Block->PHITag; + assert(PHI && "PHITag didn't set?"); + BlkT *BB = PHI->getParent(); + ValT PHIVal = Traits::GetPHIValue(PHI); + (*AvailableVals)[BB] = PHIVal; + BBMap[BB]->AvailableVal = PHIVal; + } } }; diff --git a/llvm/lib/Analysis/CallPrinter.cpp b/llvm/lib/Analysis/CallPrinter.cpp index 672dae1642cb..99d8b11f0c4b 100644 --- a/llvm/lib/Analysis/CallPrinter.cpp +++ b/llvm/lib/Analysis/CallPrinter.cpp @@ -70,7 +70,7 @@ public: for (Function &F : M->getFunctionList()) { uint64_t localSumFreq = 0; - SmallSet Callers; + SmallPtrSet Callers; for (User *U : F.users()) if (isa(U)) Callers.insert(cast(U)->getFunction()); @@ -99,7 +99,7 @@ private: bool FoundParallelEdge = true; while (FoundParallelEdge) { - SmallSet Visited; + SmallPtrSet 
Visited; FoundParallelEdge = false; for (auto CI = Node->begin(), CE = Node->end(); CI != CE; CI++) { if (!(Visited.insert(CI->second->getFunction())).second) { diff --git a/llvm/lib/Analysis/CaptureTracking.cpp b/llvm/lib/Analysis/CaptureTracking.cpp index bd0d417b1ed3..b6acda3a9f25 100644 --- a/llvm/lib/Analysis/CaptureTracking.cpp +++ b/llvm/lib/Analysis/CaptureTracking.cpp @@ -405,7 +405,7 @@ void llvm::PointerMayBeCaptured(const Value *V, CaptureTracker *Tracker, SmallVector Worklist; Worklist.reserve(getDefaultMaxUsesToExploreForCaptureTracking()); - SmallSet Visited; + SmallPtrSet Visited; auto AddUses = [&](const Value *V) { for (const Use &U : V->uses()) { diff --git a/llvm/lib/Analysis/LazyValueInfo.cpp b/llvm/lib/Analysis/LazyValueInfo.cpp index 922f25de54e9..c7b0ca97a8e4 100644 --- a/llvm/lib/Analysis/LazyValueInfo.cpp +++ b/llvm/lib/Analysis/LazyValueInfo.cpp @@ -927,8 +927,13 @@ LazyValueInfoImpl::solveBlockValueCast(CastInst *CI, BasicBlock *BB) { // NOTE: We're currently limited by the set of operations that ConstantRange // can evaluate symbolically. Enhancing that set will allows us to analyze // more definitions. - return ValueLatticeElement::getRange(LHSRange.castOp(CI->getOpcode(), - ResultBitWidth)); + ConstantRange Res = ConstantRange::getEmpty(ResultBitWidth); + if (auto *Trunc = dyn_cast(CI)) + Res = LHSRange.truncate(ResultBitWidth, Trunc->getNoWrapKind()); + else + Res = LHSRange.castOp(CI->getOpcode(), ResultBitWidth); + + return ValueLatticeElement::getRange(Res); } std::optional diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index ce4d4ad7a0ab..d2c445f1ffaa 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -7284,7 +7284,7 @@ ScalarEvolution::getDefiningScopeBound(ArrayRef Ops, bool &Precise) { Precise = true; // Do a bounded search of the def relation of the requested SCEVs. 
- SmallSet Visited; + SmallPtrSet Visited; SmallVector Worklist; auto pushOp = [&](const SCEV *S) { if (!Visited.insert(S).second) diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index b0e4b009f350..50e43a53def6 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -7785,7 +7785,7 @@ bool llvm::mustExecuteUBIfPoisonOnPathTo(Instruction *Root, // The set of all recursive users we've visited (which are assumed to all be // poison because of said visit) - SmallSet KnownPoison; + SmallPtrSet KnownPoison; SmallVector Worklist; Worklist.push_back(Root); while (!Worklist.empty()) { @@ -8140,8 +8140,8 @@ static bool programUndefinedIfUndefOrPoison(const Value *V, // Set of instructions that we have proved will yield poison if Inst // does. - SmallSet YieldsPoison; - SmallSet Visited; + SmallPtrSet YieldsPoison; + SmallPtrSet Visited; YieldsPoison.insert(V); Visited.insert(BB); diff --git a/llvm/lib/BinaryFormat/MsgPackDocument.cpp b/llvm/lib/BinaryFormat/MsgPackDocument.cpp index 11598ee24d6f..b52f02912244 100644 --- a/llvm/lib/BinaryFormat/MsgPackDocument.cpp +++ b/llvm/lib/BinaryFormat/MsgPackDocument.cpp @@ -104,6 +104,10 @@ DocNode &DocNode::operator=(uint64_t Val) { *this = getDocument()->getNode(Val); return *this; } +DocNode &DocNode::operator=(double Val) { + *this = getDocument()->getNode(Val); + return *this; +} // A level in the document reading stack. 
struct StackLevel { @@ -293,6 +297,9 @@ void Document::writeToBlob(std::string &Blob) { case Type::Binary: MPWriter.write(Node.getBinary()); break; + case Type::Float: + MPWriter.write(Node.getFloat()); + break; case Type::Empty: llvm_unreachable("unhandled empty msgpack node"); default: diff --git a/llvm/lib/CAS/MappedFileRegionBumpPtr.cpp b/llvm/lib/CAS/MappedFileRegionBumpPtr.cpp index e38068f3bd3c..8ceb1147ee6c 100644 --- a/llvm/lib/CAS/MappedFileRegionBumpPtr.cpp +++ b/llvm/lib/CAS/MappedFileRegionBumpPtr.cpp @@ -1,11 +1,11 @@ -//===- MappedFileRegionBumpPtr.cpp ------------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -/// \file +/// \file Implements MappedFileRegionBumpPtr. /// /// A bump pointer allocator, backed by a memory-mapped file. /// @@ -35,7 +35,7 @@ /// which typically loses sparseness. These mitigations only work while the file /// is not in use. /// -/// FIXME: we assume that all concurrent users of the file will use the same +/// TODO: we assume that all concurrent users of the file will use the same /// value for Capacity. Otherwise a process with a larger capacity can write /// data that is "out of bounds" for processes with smaller capacity. Currently /// this is true in the CAS. @@ -152,7 +152,7 @@ Expected MappedFileRegionBumpPtr::create( // Retrieve the current size now that we have exclusive access. 
FileSize = FileSizeInfo::get(File); if (!FileSize) - return createFileError(Result.Path, FileSize.getError()); + return createFileError(Result.Path, FileSize.getError()); } // At this point either the file is still under-sized, or we have the size for @@ -282,7 +282,8 @@ Expected MappedFileRegionBumpPtr::allocateOffset(uint64_t AllocSize) { int64_t NewSize; // The minimum increment is a page, but allocate more to amortize the cost. constexpr int64_t Increment = 1 * 1024 * 1024; // 1 MB - if (Error E = preallocateFileTail(*FD, DiskSize, DiskSize + Increment).moveInto(NewSize)) + if (Error E = preallocateFileTail(*FD, DiskSize, DiskSize + Increment) + .moveInto(NewSize)) return std::move(E); assert(NewSize >= DiskSize + Increment); // FIXME: on Darwin this can under-count the size if there is a race to diff --git a/llvm/lib/CAS/OnDiskCommon.cpp b/llvm/lib/CAS/OnDiskCommon.cpp index 079954bc0a3c..d17ab0c15b92 100644 --- a/llvm/lib/CAS/OnDiskCommon.cpp +++ b/llvm/lib/CAS/OnDiskCommon.cpp @@ -79,21 +79,22 @@ cas::ondisk::tryLockFileThreadSafe(int FD, std::chrono::milliseconds Timeout, #endif } -Expected cas::ondisk::preallocateFileTail(int FD, size_t CurrentSize, size_t NewSize) { +Expected cas::ondisk::preallocateFileTail(int FD, size_t CurrentSize, + size_t NewSize) { auto CreateError = [&](std::error_code EC) -> Expected { if (EC == std::errc::not_supported) // Ignore ENOTSUP in case the filesystem cannot preallocate. return NewSize; #if defined(HAVE_POSIX_FALLOCATE) - if (EC == std::errc::invalid_argument && - CurrentSize < NewSize && // len > 0 + if (EC == std::errc::invalid_argument && CurrentSize < NewSize && // len > 0 NewSize < std::numeric_limits::max()) // 0 <= offset, len < max // Prior to 2024, POSIX required EINVAL for cases that should be ENOTSUP, // so handle it the same as above if it is not one of the other ways to // get EINVAL. 
return NewSize; #endif - return createStringError(EC, "failed to allocate to CAS file: " + EC.message()); + return createStringError(EC, + "failed to allocate to CAS file: " + EC.message()); }; #if defined(HAVE_POSIX_FALLOCATE) // Note: posix_fallocate returns its error directly, not via errno. diff --git a/llvm/lib/CAS/OnDiskCommon.h b/llvm/lib/CAS/OnDiskCommon.h index 3fef23ace646..9c78cde6b197 100644 --- a/llvm/lib/CAS/OnDiskCommon.h +++ b/llvm/lib/CAS/OnDiskCommon.h @@ -37,7 +37,8 @@ std::error_code tryLockFileThreadSafe( /// \c std::errc::no_space_on_device are detected before we write data. /// /// \returns the new size of the file, or an \c Error. -Expected preallocateFileTail(int FD, size_t CurrentSize, size_t NewSize); +Expected preallocateFileTail(int FD, size_t CurrentSize, + size_t NewSize); } // namespace llvm::cas::ondisk diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index 9223739fc009..0e40a92fd8d6 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -377,7 +377,7 @@ public: /// to be optimized again. /// Note: Consider building time in this pass, when a BB updated, we need /// to insert such BB into FreshBBs for huge function. - SmallSet FreshBBs; + SmallPtrSet FreshBBs; void releaseMemory() { // Clear per function information. @@ -1105,7 +1105,7 @@ bool CodeGenPrepare::canMergeBlocks(const BasicBlock *BB, /// Replace all old uses with new ones, and push the updated BBs into FreshBBs. static void replaceAllUsesWith(Value *Old, Value *New, - SmallSet &FreshBBs, + SmallPtrSet &FreshBBs, bool IsHuge) { auto *OldI = dyn_cast(Old); if (OldI) { @@ -2135,7 +2135,7 @@ static bool isRemOfLoopIncrementWithLoopInvariant( // Rem = rem == RemAmtLoopInvariant ? 
0 : Rem; static bool foldURemOfLoopIncrement(Instruction *Rem, const DataLayout *DL, const LoopInfo *LI, - SmallSet &FreshBBs, + SmallPtrSet &FreshBBs, bool IsHuge) { Value *AddOffset, *RemAmt, *AddInst; PHINode *LoopIncrPN; @@ -2534,11 +2534,10 @@ static bool OptimizeExtractBits(BinaryOperator *ShiftI, ConstantInt *CI, /// %ctz = phi i64 [ 64, %entry ], [ %z, %cond.false ] /// /// If the transform is performed, return true and set ModifiedDT to true. -static bool despeculateCountZeros(IntrinsicInst *CountZeros, - LoopInfo &LI, +static bool despeculateCountZeros(IntrinsicInst *CountZeros, LoopInfo &LI, const TargetLowering *TLI, const DataLayout *DL, ModifyDT &ModifiedDT, - SmallSet &FreshBBs, + SmallPtrSet &FreshBBs, bool IsHugeFunc) { // If a zero input is undefined, it doesn't make sense to despeculate that. if (match(CountZeros->getOperand(1), m_One())) @@ -4351,7 +4350,7 @@ private: PhiNodeSet &PhiNodesToMatch) { SmallVector WorkList; Matcher.insert({PHI, Candidate}); - SmallSet MatchedPHIs; + SmallPtrSet MatchedPHIs; MatchedPHIs.insert(PHI); WorkList.push_back({PHI, Candidate}); SmallSet Visited; @@ -8635,7 +8634,7 @@ static bool tryUnmergingGEPsAcrossIndirectBr(GetElementPtrInst *GEPI, } static bool optimizeBranch(BranchInst *Branch, const TargetLowering &TLI, - SmallSet &FreshBBs, + SmallPtrSet &FreshBBs, bool IsHugeFunc) { // Try and convert // %c = icmp ult %x, 8 diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index 64c19fab1a02..8424a8108d76 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -2522,6 +2522,9 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, Opc = ID == Intrinsic::vector_reduce_fadd ? TargetOpcode::G_VECREDUCE_SEQ_FADD : TargetOpcode::G_VECREDUCE_SEQ_FMUL; + if (!MRI->getType(VecSrc).isVector()) + Opc = ID == Intrinsic::vector_reduce_fadd ? 
TargetOpcode::G_FADD + : TargetOpcode::G_FMUL; MIRBuilder.buildInstr(Opc, {Dst}, {ScalarSrc, VecSrc}, MachineInstr::copyFlagsFromInstruction(CI)); return true; @@ -3517,7 +3520,7 @@ void IRTranslator::finishPendingPhis() { Verifier.setCurrentInst(PI); #endif // ifndef NDEBUG - SmallSet SeenPreds; + SmallPtrSet SeenPreds; for (unsigned i = 0; i < PI->getNumIncomingValues(); ++i) { auto IRPred = PI->getIncomingBlock(i); ArrayRef ValRegs = getOrCreateVRegs(*PI->getIncomingValue(i)); diff --git a/llvm/lib/CodeGen/LiveVariables.cpp b/llvm/lib/CodeGen/LiveVariables.cpp index 1f23418642bc..c5dfddaa21e6 100644 --- a/llvm/lib/CodeGen/LiveVariables.cpp +++ b/llvm/lib/CodeGen/LiveVariables.cpp @@ -213,11 +213,7 @@ void LiveVariables::HandleVirtRegDef(Register Reg, MachineInstr &MI) { } /// FindLastPartialDef - Return the last partial def of the specified register. -/// Also returns the sub-registers that're defined by the instruction. -MachineInstr * -LiveVariables::FindLastPartialDef(Register Reg, - SmallSet &PartDefRegs) { - Register LastDefReg = 0; +MachineInstr *LiveVariables::FindLastPartialDef(Register Reg) { unsigned LastDefDist = 0; MachineInstr *LastDef = nullptr; for (MCPhysReg SubReg : TRI->subregs(Reg)) { @@ -226,7 +222,6 @@ LiveVariables::FindLastPartialDef(Register Reg, continue; unsigned Dist = DistanceMap[Def]; if (Dist > LastDefDist) { - LastDefReg = SubReg; LastDef = Def; LastDefDist = Dist; } @@ -235,14 +230,6 @@ LiveVariables::FindLastPartialDef(Register Reg, if (!LastDef) return nullptr; - PartDefRegs.insert(LastDefReg); - for (MachineOperand &MO : LastDef->all_defs()) { - if (MO.getReg() == 0) - continue; - Register DefReg = MO.getReg(); - if (TRI->isSubRegister(Reg, DefReg)) - PartDefRegs.insert_range(TRI->subregs_inclusive(DefReg)); - } return LastDef; } @@ -261,27 +248,11 @@ void LiveVariables::HandlePhysRegUse(Register Reg, MachineInstr &MI) { // ... // = EAX // All of the sub-registers must have been defined before the use of Reg! 
- SmallSet PartDefRegs; - MachineInstr *LastPartialDef = FindLastPartialDef(Reg, PartDefRegs); + MachineInstr *LastPartialDef = FindLastPartialDef(Reg); // If LastPartialDef is NULL, it must be using a livein register. if (LastPartialDef) { - LastPartialDef->addOperand(MachineOperand::CreateReg(Reg, true/*IsDef*/, - true/*IsImp*/)); - PhysRegDef[Reg.id()] = LastPartialDef; - SmallSet Processed; - for (MCPhysReg SubReg : TRI->subregs(Reg)) { - if (Processed.count(SubReg)) - continue; - if (PartDefRegs.count(SubReg)) - continue; - // This part of Reg was defined before the last partial def. It's killed - // here. - LastPartialDef->addOperand(MachineOperand::CreateReg(SubReg, - false/*IsDef*/, - true/*IsImp*/)); - PhysRegDef[SubReg] = LastPartialDef; - Processed.insert_range(TRI->subregs(SubReg)); - } + LastPartialDef->addOperand( + MachineOperand::CreateReg(Reg, /*IsDef=*/true, /*IsImp=*/true)); } } else if (LastDef && !PhysRegUse[Reg.id()] && !LastDef->findRegisterDefOperand(Reg, /*TRI=*/nullptr)) diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp index 742de1101faa..e35983138550 100644 --- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp +++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp @@ -490,7 +490,7 @@ private: SmallSetVector MaybeDeadCopies; /// Multimap tracking debug users in current BB - DenseMap> CopyDbgUsers; + DenseMap> CopyDbgUsers; CopyTracker Tracker; diff --git a/llvm/lib/CodeGen/MachineDebugify.cpp b/llvm/lib/CodeGen/MachineDebugify.cpp index 1a20fe586e95..307f49468eb3 100644 --- a/llvm/lib/CodeGen/MachineDebugify.cpp +++ b/llvm/lib/CodeGen/MachineDebugify.cpp @@ -87,7 +87,7 @@ bool applyDebugifyMetadataToMachineFunction(MachineModuleInfo &MMI, // Do this by introducing debug uses of each register definition. If that is // not possible (e.g. we have a phi or a meta instruction), emit a constant. 
uint64_t NextImm = 0; - SmallSet VarSet; + SmallPtrSet VarSet; const MCInstrDesc &DbgValDesc = TII.get(TargetOpcode::DBG_VALUE); for (MachineBasicBlock &MBB : MF) { MachineBasicBlock::iterator FirstNonPHIIt = MBB.getFirstNonPHI(); diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp index 90005bd181f3..3a9651c5cee0 100644 --- a/llvm/lib/CodeGen/MachinePipeliner.cpp +++ b/llvm/lib/CodeGen/MachinePipeliner.cpp @@ -3466,9 +3466,9 @@ bool SMSchedule::onlyHasLoopCarriedOutputOrOrderPreds( } /// Determine transitive dependences of unpipelineable instructions -SmallSet SMSchedule::computeUnpipelineableNodes( +SmallPtrSet SMSchedule::computeUnpipelineableNodes( SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI) { - SmallSet DoNotPipeline; + SmallPtrSet DoNotPipeline; SmallVector Worklist; for (auto &SU : SSD->SUnits) @@ -3498,7 +3498,7 @@ SmallSet SMSchedule::computeUnpipelineableNodes( // and ensure that they are in stage 0. If unable to do so, return false. 
bool SMSchedule::normalizeNonPipelinedInstructions( SwingSchedulerDAG *SSD, TargetInstrInfo::PipelinerLoopInfo *PLI) { - SmallSet DNP = computeUnpipelineableNodes(SSD, PLI); + SmallPtrSet DNP = computeUnpipelineableNodes(SSD, PLI); int NewLastCycle = INT_MIN; for (SUnit &SU : SSD->SUnits) { diff --git a/llvm/lib/CodeGen/MacroFusion.cpp b/llvm/lib/CodeGen/MacroFusion.cpp index 975a3fe71aba..1db53017e6ce 100644 --- a/llvm/lib/CodeGen/MacroFusion.cpp +++ b/llvm/lib/CodeGen/MacroFusion.cpp @@ -79,7 +79,7 @@ bool llvm::fuseInstructionPair(ScheduleDAGInstrs &DAG, SUnit &FirstSU, FirstSU.ParentClusterIdx = Clusters.size(); SecondSU.ParentClusterIdx = Clusters.size(); - SmallSet Cluster{{&FirstSU, &SecondSU}}; + SmallPtrSet Cluster{{&FirstSU, &SecondSU}}; Clusters.push_back(Cluster); // TODO - If we want to chain more than two instructions, we need to create diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 6eb8468e2573..c16ccaf926bc 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -16279,6 +16279,40 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { // because targets may prefer a wider type during later combines and invert // this transform. 
switch (N0.getOpcode()) { + case ISD::AVGCEILU: + case ISD::AVGFLOORU: + if (!LegalOperations && N0.hasOneUse() && + TLI.isOperationLegal(N0.getOpcode(), VT)) { + SDValue X = N0.getOperand(0); + SDValue Y = N0.getOperand(1); + unsigned SrcBits = X.getScalarValueSizeInBits(); + unsigned DstBits = VT.getScalarSizeInBits(); + APInt UpperBits = APInt::getBitsSetFrom(SrcBits, DstBits); + if (DAG.MaskedValueIsZero(X, UpperBits) && + DAG.MaskedValueIsZero(Y, UpperBits)) { + SDValue Tx = DAG.getNode(ISD::TRUNCATE, DL, VT, X); + SDValue Ty = DAG.getNode(ISD::TRUNCATE, DL, VT, Y); + return DAG.getNode(N0.getOpcode(), DL, VT, Tx, Ty); + } + } + break; + case ISD::AVGCEILS: + case ISD::AVGFLOORS: + if (!LegalOperations && N0.hasOneUse() && + TLI.isOperationLegal(N0.getOpcode(), VT)) { + SDValue X = N0.getOperand(0); + SDValue Y = N0.getOperand(1); + unsigned SrcBits = X.getScalarValueSizeInBits(); + unsigned DstBits = VT.getScalarSizeInBits(); + unsigned NeededSignBits = SrcBits - DstBits + 1; + if (DAG.ComputeNumSignBits(X) >= NeededSignBits && + DAG.ComputeNumSignBits(Y) >= NeededSignBits) { + SDValue Tx = DAG.getNode(ISD::TRUNCATE, DL, VT, X); + SDValue Ty = DAG.getNode(ISD::TRUNCATE, DL, VT, Y); + return DAG.getNode(N0.getOpcode(), DL, VT, Tx, Ty); + } + } + break; case ISD::ADD: case ISD::SUB: case ISD::MUL: @@ -16329,30 +16363,28 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { break; case ISD::ABDU: case ISD::ABDS: - // (trunc (abdu/abds a, b)) → (abdu/abds (trunc a), (trunc b)) - if (!LegalOperations || N0.hasOneUse()) { - EVT SrcVT = N0.getValueType(); + // (trunc (abdu/abds a, b)) -> (abdu/abds (trunc a), (trunc b)) + if ((!LegalOperations || N0.hasOneUse()) && + TLI.isOperationLegal(N0.getOpcode(), VT)) { EVT TruncVT = VT; unsigned SrcBits = SrcVT.getScalarSizeInBits(); unsigned TruncBits = TruncVT.getScalarSizeInBits(); - unsigned NeededBits = SrcBits - TruncBits; SDValue A = N0.getOperand(0); SDValue B = N0.getOperand(1); bool CanFold = false; if (N0.getOpcode() 
== ISD::ABDU) { - KnownBits KnownA = DAG.computeKnownBits(A); - KnownBits KnownB = DAG.computeKnownBits(B); - CanFold = KnownA.countMinLeadingZeros() >= NeededBits && - KnownB.countMinLeadingZeros() >= NeededBits; + APInt UpperBits = APInt::getBitsSetFrom(SrcBits, TruncBits); + CanFold = DAG.MaskedValueIsZero(B, UpperBits) && + DAG.MaskedValueIsZero(A, UpperBits); } else { - unsigned SignBitsA = DAG.ComputeNumSignBits(A); - unsigned SignBitsB = DAG.ComputeNumSignBits(B); - CanFold = SignBitsA > NeededBits && SignBitsB > NeededBits; + unsigned NeededBits = SrcBits - TruncBits; + CanFold = DAG.ComputeNumSignBits(B) > NeededBits && + DAG.ComputeNumSignBits(A) > NeededBits; } - if (CanFold && TLI.isOperationLegal(N0.getOpcode(), VT)) { + if (CanFold) { SDValue NewA = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, A); SDValue NewB = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, B); return DAG.getNode(N0.getOpcode(), DL, TruncVT, NewA, NewB); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 2a1ef2b980ac..901f10d1256d 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -1837,11 +1837,8 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) { getValue(CPA->getDiscriminator())); } - if (isa(C)) { - unsigned AS = V->getType()->getPointerAddressSpace(); - return DAG.getConstant(0, getCurSDLoc(), - TLI.getPointerTy(DAG.getDataLayout(), AS)); - } + if (isa(C)) + return DAG.getConstant(0, getCurSDLoc(), VT); if (match(C, m_VScale())) return DAG.getVScale(getCurSDLoc(), VT, APInt(VT.getSizeInBits(), 1)); @@ -3576,7 +3573,7 @@ void SelectionDAGBuilder::visitIndirectBr(const IndirectBrInst &I) { MachineBasicBlock *IndirectBrMBB = FuncInfo.MBB; // Update machine-CFG edges with unique successors. 
- SmallSet Done; + SmallPtrSet Done; for (unsigned i = 0, e = I.getNumSuccessors(); i != e; ++i) { BasicBlock *BB = I.getSuccessor(i); bool Inserted = Done.insert(BB).second; @@ -10998,11 +10995,17 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { SmallVector RetOrigTys; SmallVector Offsets; auto &DL = CLI.DAG.getDataLayout(); - ComputeValueTypes(DL, CLI.RetTy, RetOrigTys, &Offsets); + ComputeValueTypes(DL, CLI.OrigRetTy, RetOrigTys, &Offsets); SmallVector RetVTs; - for (Type *Ty : RetOrigTys) - RetVTs.push_back(getValueType(DL, Ty)); + if (CLI.RetTy != CLI.OrigRetTy) { + assert(RetOrigTys.size() == 1 && + "Only supported for non-aggregate returns"); + RetVTs.push_back(getValueType(DL, CLI.RetTy)); + } else { + for (Type *Ty : RetOrigTys) + RetVTs.push_back(getValueType(DL, Ty)); + } if (CLI.IsPostTypeLegalization) { // If we are lowering a libcall after legalization, split the return type. @@ -11053,7 +11056,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { CLI.getArgs().insert(CLI.getArgs().begin(), Entry); CLI.NumFixedArgs += 1; CLI.getArgs()[0].IndirectType = CLI.RetTy; - CLI.RetTy = Type::getVoidTy(Context); + CLI.RetTy = CLI.OrigRetTy = Type::getVoidTy(Context); // sret demotion isn't compatible with tail-calls, since the sret argument // points into the callers stack frame. 
@@ -11110,17 +11113,23 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { CLI.Outs.clear(); CLI.OutVals.clear(); for (unsigned i = 0, e = Args.size(); i != e; ++i) { - SmallVector ArgTys; - ComputeValueTypes(DL, Args[i].Ty, ArgTys); + SmallVector OrigArgTys; + ComputeValueTypes(DL, Args[i].OrigTy, OrigArgTys); // FIXME: Split arguments if CLI.IsPostTypeLegalization Type *FinalType = Args[i].Ty; if (Args[i].IsByVal) FinalType = Args[i].IndirectType; bool NeedsRegBlock = functionArgumentNeedsConsecutiveRegisters( FinalType, CLI.CallConv, CLI.IsVarArg, DL); - for (unsigned Value = 0, NumValues = ArgTys.size(); Value != NumValues; + for (unsigned Value = 0, NumValues = OrigArgTys.size(); Value != NumValues; ++Value) { - Type *ArgTy = ArgTys[Value]; + Type *OrigArgTy = OrigArgTys[Value]; + Type *ArgTy = OrigArgTy; + if (Args[i].Ty != Args[i].OrigTy) { + assert(Value == 0 && "Only supported for non-aggregate arguments"); + ArgTy = Args[i].Ty; + } + EVT VT = getValueType(DL, ArgTy); SDValue Op = SDValue(Args[i].Node.getNode(), Args[i].Node.getResNo() + Value); @@ -11254,7 +11263,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const { // For scalable vectors the scalable part is currently handled // by individual targets, so we just use the known minimum size here. ISD::OutputArg MyFlags( - Flags, Parts[j].getValueType().getSimpleVT(), VT, ArgTy, i, + Flags, Parts[j].getValueType().getSimpleVT(), VT, OrigArgTy, i, j * Parts[j].getValueType().getStoreSize().getKnownMinValue()); if (NumParts > 1 && j == 0) MyFlags.Flags.setSplit(); diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index ca10a6ecb456..402a012e8e55 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -169,6 +169,10 @@ TargetLowering::makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT, ? 
OpsTypeOverrides[i] : NewOp.getValueType().getTypeForEVT(*DAG.getContext()); TargetLowering::ArgListEntry Entry(NewOp, Ty); + if (CallOptions.IsSoften) + Entry.OrigTy = + CallOptions.OpsVTBeforeSoften[i].getTypeForEVT(*DAG.getContext()); + Entry.IsSExt = shouldSignExtendTypeInLibCall(Entry.Ty, CallOptions.IsSigned); Entry.IsZExt = !Entry.IsSExt; @@ -188,18 +192,21 @@ TargetLowering::makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT, DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout())); Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext()); + Type *OrigRetTy = RetTy; TargetLowering::CallLoweringInfo CLI(DAG); bool signExtend = shouldSignExtendTypeInLibCall(RetTy, CallOptions.IsSigned); bool zeroExtend = !signExtend; - if (CallOptions.IsSoften && - !shouldExtendTypeInLibCall(CallOptions.RetVTBeforeSoften)) { - signExtend = zeroExtend = false; + if (CallOptions.IsSoften) { + OrigRetTy = CallOptions.RetVTBeforeSoften.getTypeForEVT(*DAG.getContext()); + if (!shouldExtendTypeInLibCall(CallOptions.RetVTBeforeSoften)) + signExtend = zeroExtend = false; } CLI.setDebugLoc(dl) .setChain(InChain) - .setLibCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)) + .setLibCallee(getLibcallCallingConv(LC), RetTy, OrigRetTy, Callee, + std::move(Args)) .setNoReturn(CallOptions.DoesNotReturn) .setDiscardResult(!CallOptions.IsReturnValueUsed) .setIsPostTypeLegalization(CallOptions.IsPostTypeLegalization) diff --git a/llvm/lib/CodeGen/SwiftErrorValueTracking.cpp b/llvm/lib/CodeGen/SwiftErrorValueTracking.cpp index decffdc7dfe4..ff4b568b5ee2 100644 --- a/llvm/lib/CodeGen/SwiftErrorValueTracking.cpp +++ b/llvm/lib/CodeGen/SwiftErrorValueTracking.cpp @@ -179,7 +179,7 @@ void SwiftErrorValueTracking::propagateVRegs() { // Check whether we have a single vreg def from all predecessors. // Otherwise we need a phi. 
SmallVector, 4> VRegs; - SmallSet Visited; + SmallPtrSet Visited; for (auto *Pred : MBB->predecessors()) { if (!Visited.insert(Pred).second) continue; diff --git a/llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.cpp b/llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.cpp index 1bafed79d696..ba27aa87b7c7 100644 --- a/llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.cpp +++ b/llvm/lib/ExecutionEngine/Orc/Debugging/DebuggerSupportPlugin.cpp @@ -64,7 +64,7 @@ public: LLVM_DEBUG({ dbgs() << " Preserving debug section " << Sec.getName() << "\n"; }); - SmallSet PreservedBlocks; + SmallPtrSet PreservedBlocks; for (auto *Sym : Sec.symbols()) { bool NewPreservedBlock = PreservedBlocks.insert(&Sym->getBlock()).second; diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp index 08d6c78bd1eb..d6268037dea8 100644 --- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp +++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp @@ -654,11 +654,10 @@ bool RuntimeDyldELF::resolveLoongArch64ShortBranch( if (Loc == GlobalSymbolTable.end()) return false; const auto &SymInfo = Loc->second; - Address = - uint64_t(Sections[SymInfo.getSectionID()].getLoadAddressWithOffset( - SymInfo.getOffset())); + Address = Sections[SymInfo.getSectionID()].getLoadAddressWithOffset( + SymInfo.getOffset()); } else { - Address = uint64_t(Sections[Value.SectionID].getLoadAddress()); + Address = Sections[Value.SectionID].getLoadAddress(); } uint64_t Offset = RelI->getOffset(); uint64_t SourceAddress = Sections[SectionID].getLoadAddressWithOffset(Offset); diff --git a/llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp b/llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp index 050cc46e8c9b..92c62b83fadb 100644 --- a/llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp +++ b/llvm/lib/Frontend/HLSL/HLSLRootSignature.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// 
#include "llvm/Frontend/HLSL/HLSLRootSignature.h" +#include "llvm/Support/DXILABI.h" #include "llvm/Support/ScopedPrinter.h" namespace llvm { @@ -92,10 +93,9 @@ static raw_ostream &operator<<(raw_ostream &OS, return OS; } -static raw_ostream &operator<<(raw_ostream &OS, const ClauseType &Type) { - OS << enumToStringRef(dxil::ResourceClass(llvm::to_underlying(Type)), - dxil::getResourceClasses()); - +static raw_ostream &operator<<(raw_ostream &OS, + const dxil::ResourceClass &Type) { + OS << dxil::getResourceClassName(Type); return OS; } @@ -153,8 +153,7 @@ raw_ostream &operator<<(raw_ostream &OS, const DescriptorTableClause &Clause) { } raw_ostream &operator<<(raw_ostream &OS, const RootDescriptor &Descriptor) { - ClauseType Type = ClauseType(llvm::to_underlying(Descriptor.Type)); - OS << "Root" << Type << "(" << Descriptor.Reg + OS << "Root" << Descriptor.Type << "(" << Descriptor.Reg << ", space = " << Descriptor.Space << ", visibility = " << Descriptor.Visibility << ", flags = " << Descriptor.Flags << ")"; diff --git a/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp b/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp index 157bfc665b20..dece8f197aaf 100644 --- a/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp +++ b/llvm/lib/Frontend/HLSL/RootSignatureMetadata.cpp @@ -15,6 +15,7 @@ #include "llvm/Frontend/HLSL/RootSignatureValidations.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Metadata.h" +#include "llvm/Support/DXILABI.h" #include "llvm/Support/ScopedPrinter.h" using namespace llvm; @@ -119,9 +120,7 @@ MDNode *MetadataBuilder::BuildRootConstants(const RootConstants &Constants) { MDNode *MetadataBuilder::BuildRootDescriptor(const RootDescriptor &Descriptor) { IRBuilder<> Builder(Ctx); - StringRef ResName = - enumToStringRef(dxil::ResourceClass(to_underlying(Descriptor.Type)), - dxil::getResourceClasses()); + StringRef ResName = dxil::getResourceClassName(Descriptor.Type); assert(!ResName.empty() && "Provided an invalid Resource Class"); SmallString<7> 
Name({"Root", ResName}); Metadata *Operands[] = { @@ -161,9 +160,7 @@ MDNode *MetadataBuilder::BuildDescriptorTable(const DescriptorTable &Table) { MDNode *MetadataBuilder::BuildDescriptorTableClause( const DescriptorTableClause &Clause) { IRBuilder<> Builder(Ctx); - StringRef ResName = - enumToStringRef(dxil::ResourceClass(to_underlying(Clause.Type)), - dxil::getResourceClasses()); + StringRef ResName = dxil::getResourceClassName(Clause.Type); assert(!ResName.empty() && "Provided an invalid Resource Class"); Metadata *Operands[] = { MDString::get(Ctx, ResName), diff --git a/llvm/lib/Frontend/OpenMP/OMP.cpp b/llvm/lib/Frontend/OpenMP/OMP.cpp index 555e2a61e411..9e625b809de9 100644 --- a/llvm/lib/Frontend/OpenMP/OMP.cpp +++ b/llvm/lib/Frontend/OpenMP/OMP.cpp @@ -190,7 +190,7 @@ bool isCombinedConstruct(Directive D) { } ArrayRef getOpenMPVersions() { - static unsigned Versions[]{31, 40, 45, 50, 51, 52, 60}; + static unsigned Versions[]{31, 40, 45, 50, 51, 52, 60, 61}; return Versions; } diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index c16b0dde1a3d..e9147a42452d 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -5930,7 +5930,7 @@ void OpenMPIRBuilder::applySimd(CanonicalLoopInfo *CanonicalLoop, createIfVersion(CanonicalLoop, IfCond, VMap, LIA, LI, L, "simd"); } - SmallSet Reachable; + SmallPtrSet Reachable; // Get the basic blocks from the loop in which memref instructions // can be found. 
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index b91fd70bd946..e200f3626e69 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -5391,7 +5391,7 @@ void llvm::UpgradeNVVMAnnotations(Module &M) { return; SmallVector NewNodes; - SmallSet SeenNodes; + SmallPtrSet SeenNodes; for (MDNode *MD : NamedMD->operands()) { if (!SeenNodes.insert(MD).second) continue; diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp index 0b7923248aa7..5e87b5ff941a 100644 --- a/llvm/lib/IR/Instruction.cpp +++ b/llvm/lib/IR/Instruction.cpp @@ -552,14 +552,19 @@ void Instruction::dropUBImplyingAttrsAndUnknownMetadata( CB->removeRetAttrs(UBImplyingAttributes); } -void Instruction::dropUBImplyingAttrsAndMetadata() { +void Instruction::dropUBImplyingAttrsAndMetadata(ArrayRef Keep) { // !annotation metadata does not impact semantics. // !range, !nonnull and !align produce poison, so they are safe to speculate. // !noundef and various AA metadata must be dropped, as it generally produces // immediate undefined behavior. - unsigned KnownIDs[] = {LLVMContext::MD_annotation, LLVMContext::MD_range, - LLVMContext::MD_nonnull, LLVMContext::MD_align}; - dropUBImplyingAttrsAndUnknownMetadata(KnownIDs); + static const unsigned KnownIDs[] = { + LLVMContext::MD_annotation, LLVMContext::MD_range, + LLVMContext::MD_nonnull, LLVMContext::MD_align}; + SmallVector KeepIDs; + KeepIDs.reserve(Keep.size() + std::size(KnownIDs)); + append_range(KeepIDs, KnownIDs); + append_range(KeepIDs, Keep); + dropUBImplyingAttrsAndUnknownMetadata(KeepIDs); } bool Instruction::hasUBImplyingAttrs() const { diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 5a93228faa3a..9d9b51db9870 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -4636,7 +4636,7 @@ void Verifier::visitEHPadPredecessors(Instruction &I) { } // The edge may exit from zero or more nested pads. 
- SmallSet Seen; + SmallPtrSet Seen; for (;; FromPad = getParentPad(FromPad)) { Check(FromPad != ToPad, "EH pad cannot handle exceptions raised within it", FromPad, TI); @@ -4764,7 +4764,7 @@ void Verifier::visitFuncletPadInst(FuncletPadInst &FPI) { User *FirstUser = nullptr; Value *FirstUnwindPad = nullptr; SmallVector Worklist({&FPI}); - SmallSet Seen; + SmallPtrSet Seen; while (!Worklist.empty()) { FuncletPadInst *CurrentPad = Worklist.pop_back_val(); diff --git a/llvm/lib/MC/ELFObjectWriter.cpp b/llvm/lib/MC/ELFObjectWriter.cpp index 8f3814a1dd62..759d3e0e1429 100644 --- a/llvm/lib/MC/ELFObjectWriter.cpp +++ b/llvm/lib/MC/ELFObjectWriter.cpp @@ -541,12 +541,12 @@ void ELFWriter::computeSymbolTable(const RevGroupMapTy &RevGroupMap) { if (Symbol.isAbsolute()) { MSD.SectionIndex = ELF::SHN_ABS; } else if (Symbol.isCommon()) { - if (Symbol.isTargetCommon()) { - MSD.SectionIndex = Symbol.getIndex(); - } else { + auto Shndx = Symbol.getIndex(); + if (!Shndx) { assert(!Local); - MSD.SectionIndex = ELF::SHN_COMMON; + Shndx = ELF::SHN_COMMON; } + MSD.SectionIndex = Shndx; } else if (Symbol.isUndefined()) { if (Symbol.isSignature() && !Symbol.isUsedInReloc()) { MSD.SectionIndex = RevGroupMap.lookup(&Symbol); diff --git a/llvm/lib/MC/GOFFObjectWriter.cpp b/llvm/lib/MC/GOFFObjectWriter.cpp index 3b629cdb87fc..d68f4af86752 100644 --- a/llvm/lib/MC/GOFFObjectWriter.cpp +++ b/llvm/lib/MC/GOFFObjectWriter.cpp @@ -17,7 +17,6 @@ #include "llvm/MC/MCSectionGOFF.h" #include "llvm/MC/MCSymbolGOFF.h" #include "llvm/MC/MCValue.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/ConvertEBCDIC.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Endian.h" diff --git a/llvm/lib/MC/MCAsmStreamer.cpp b/llvm/lib/MC/MCAsmStreamer.cpp index 9a5e07095fa5..89e541ac0185 100644 --- a/llvm/lib/MC/MCAsmStreamer.cpp +++ b/llvm/lib/MC/MCAsmStreamer.cpp @@ -28,7 +28,6 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbolXCOFF.h" #include "llvm/MC/TargetRegistry.h" -#include 
"llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Format.h" #include "llvm/Support/FormattedStream.h" diff --git a/llvm/lib/MC/MCContext.cpp b/llvm/lib/MC/MCContext.cpp index 5e364e9ad50c..1d211a19acdd 100644 --- a/llvm/lib/MC/MCContext.cpp +++ b/llvm/lib/MC/MCContext.cpp @@ -42,7 +42,6 @@ #include "llvm/MC/MCSymbolXCOFF.h" #include "llvm/MC/MCTargetOptions.h" #include "llvm/MC/SectionKind.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/EndianStream.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MemoryBuffer.h" diff --git a/llvm/lib/MC/MCELFStreamer.cpp b/llvm/lib/MC/MCELFStreamer.cpp index 275e76e37f9f..2881d7cfab4b 100644 --- a/llvm/lib/MC/MCELFStreamer.cpp +++ b/llvm/lib/MC/MCELFStreamer.cpp @@ -29,7 +29,6 @@ #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCSymbolELF.h" #include "llvm/MC/TargetRegistry.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/LEB128.h" #include diff --git a/llvm/lib/MC/MCObjectFileInfo.cpp b/llvm/lib/MC/MCObjectFileInfo.cpp index d274c88b1992..d505ac6dd4bf 100644 --- a/llvm/lib/MC/MCObjectFileInfo.cpp +++ b/llvm/lib/MC/MCObjectFileInfo.cpp @@ -24,7 +24,6 @@ #include "llvm/MC/MCSectionSPIRV.h" #include "llvm/MC/MCSectionWasm.h" #include "llvm/MC/MCSectionXCOFF.h" -#include "llvm/Support/Casting.h" #include "llvm/TargetParser/Triple.h" using namespace llvm; diff --git a/llvm/lib/MC/MCParser/COFFMasmParser.cpp b/llvm/lib/MC/MCParser/COFFMasmParser.cpp index 229b0b8d5389..1bb617b327f1 100644 --- a/llvm/lib/MC/MCParser/COFFMasmParser.cpp +++ b/llvm/lib/MC/MCParser/COFFMasmParser.cpp @@ -18,7 +18,6 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbolCOFF.h" #include "llvm/MC/SectionKind.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/SMLoc.h" #include #include diff --git a/llvm/lib/MC/MCParser/ELFAsmParser.cpp b/llvm/lib/MC/MCParser/ELFAsmParser.cpp index 6782c4b61863..513f3b3da781 100644 --- 
a/llvm/lib/MC/MCParser/ELFAsmParser.cpp +++ b/llvm/lib/MC/MCParser/ELFAsmParser.cpp @@ -22,7 +22,6 @@ #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCSymbolELF.h" #include "llvm/MC/SectionKind.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/SMLoc.h" #include #include diff --git a/llvm/lib/MC/MCParser/WasmAsmParser.cpp b/llvm/lib/MC/MCParser/WasmAsmParser.cpp index 6c2d2411bc28..ddfe1e10d9d0 100644 --- a/llvm/lib/MC/MCParser/WasmAsmParser.cpp +++ b/llvm/lib/MC/MCParser/WasmAsmParser.cpp @@ -26,7 +26,6 @@ #include "llvm/MC/MCSectionWasm.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbolWasm.h" -#include "llvm/Support/Casting.h" #include using namespace llvm; diff --git a/llvm/lib/MC/MCSymbol.cpp b/llvm/lib/MC/MCSymbol.cpp index 8192896eeb6b..b86873824cb0 100644 --- a/llvm/lib/MC/MCSymbol.cpp +++ b/llvm/lib/MC/MCSymbol.cpp @@ -20,6 +20,10 @@ using namespace llvm; +// There are numerous MCSymbol objects, so keeping sizeof(MCSymbol) small is +// crucial for minimizing peak memory usage. +static_assert(sizeof(MCSymbol) <= 24, "Keep the base symbol small"); + // Only the address of this fragment is ever actually used. 
static MCFragment SentinelFragment; @@ -44,13 +48,12 @@ void *MCSymbol::operator new(size_t s, const MCSymbolTableEntry *Name, } void MCSymbol::setVariableValue(const MCExpr *Value) { - assert(Value && "Invalid variable value!"); - assert((SymbolContents == SymContentsUnset || - SymbolContents == SymContentsVariable) && - "Cannot give common/offset symbol a variable value"); + assert(Value && "Invalid equated expression"); + assert((kind == Kind::Regular || kind == Kind::Equated) && + "Cannot equate a common symbol"); this->Value = Value; - SymbolContents = SymContentsVariable; - setUndefined(); + kind = Kind::Equated; + Fragment = nullptr; } void MCSymbol::print(raw_ostream &OS, const MCAsmInfo *MAI) const { diff --git a/llvm/lib/MC/MCWasmStreamer.cpp b/llvm/lib/MC/MCWasmStreamer.cpp index 9c8b22480fb7..070b3d9f8d2c 100644 --- a/llvm/lib/MC/MCWasmStreamer.cpp +++ b/llvm/lib/MC/MCWasmStreamer.cpp @@ -22,7 +22,6 @@ #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCSymbolWasm.h" #include "llvm/MC/TargetRegistry.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" namespace llvm { diff --git a/llvm/lib/MC/MCWinCOFFStreamer.cpp b/llvm/lib/MC/MCWinCOFFStreamer.cpp index a45936bebf0c..2e632deaf327 100644 --- a/llvm/lib/MC/MCWinCOFFStreamer.cpp +++ b/llvm/lib/MC/MCWinCOFFStreamer.cpp @@ -30,7 +30,6 @@ #include "llvm/MC/MCTargetOptions.h" #include "llvm/MC/MCValue.h" #include "llvm/MC/MCWinCOFFObjectWriter.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/SMLoc.h" diff --git a/llvm/lib/MC/MCXCOFFStreamer.cpp b/llvm/lib/MC/MCXCOFFStreamer.cpp index a0e3dbaca3db..684e05a9be09 100644 --- a/llvm/lib/MC/MCXCOFFStreamer.cpp +++ b/llvm/lib/MC/MCXCOFFStreamer.cpp @@ -21,7 +21,6 @@ #include "llvm/MC/MCSymbolXCOFF.h" #include "llvm/MC/MCXCOFFObjectWriter.h" #include "llvm/MC/TargetRegistry.h" -#include "llvm/Support/Casting.h" using namespace llvm; diff --git 
a/llvm/lib/MC/XCOFFObjectWriter.cpp b/llvm/lib/MC/XCOFFObjectWriter.cpp index 13917ba53933..fce6b2ac5cf8 100644 --- a/llvm/lib/MC/XCOFFObjectWriter.cpp +++ b/llvm/lib/MC/XCOFFObjectWriter.cpp @@ -20,7 +20,6 @@ #include "llvm/MC/MCValue.h" #include "llvm/MC/MCXCOFFObjectWriter.h" #include "llvm/MC/StringTableBuilder.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/EndianStream.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" diff --git a/llvm/lib/Object/SFrameParser.cpp b/llvm/lib/Object/SFrameParser.cpp index 0c5638d776ef..759b579230d9 100644 --- a/llvm/lib/Object/SFrameParser.cpp +++ b/llvm/lib/Object/SFrameParser.cpp @@ -176,10 +176,10 @@ iterator_range::fre_iterator> SFrameParser::fres(const sframe::FuncDescEntry &FDE, Error &Err) const { uint64_t Offset = getFREBase() + FDE.StartFREOff; fre_iterator BeforeBegin = make_fallible_itr( - FallibleFREIterator(Data, FDE.getFREType(), -1, FDE.NumFREs, Offset), + FallibleFREIterator(Data, FDE.Info.getFREType(), -1, FDE.NumFREs, Offset), Err); fre_iterator End = make_fallible_end( - FallibleFREIterator(Data, FDE.getFREType(), FDE.NumFREs, FDE.NumFREs, + FallibleFREIterator(Data, FDE.Info.getFREType(), FDE.NumFREs, FDE.NumFREs, /*Offset=*/0)); return {++BeforeBegin, End}; } diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index f810368a8494..b7edeea08276 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -124,7 +124,6 @@ #include "llvm/CodeGen/MachineCopyPropagation.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionAnalysis.h" -#include "llvm/CodeGen/MachineInstrBundle.h" #include "llvm/CodeGen/MachineLICM.h" #include "llvm/CodeGen/MachineLateInstrsCleanup.h" #include "llvm/CodeGen/MachinePassManager.h" diff --git a/llvm/lib/ProfileData/InstrProfWriter.cpp b/llvm/lib/ProfileData/InstrProfWriter.cpp index df807fc02b91..a3473514d463 100644 --- 
a/llvm/lib/ProfileData/InstrProfWriter.cpp +++ b/llvm/lib/ProfileData/InstrProfWriter.cpp @@ -13,7 +13,6 @@ #include "llvm/ProfileData/InstrProfWriter.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SetVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/IR/ProfileSummary.h" #include "llvm/ProfileData/DataAccessProf.h" diff --git a/llvm/lib/Support/APInt.cpp b/llvm/lib/Support/APInt.cpp index 1547f48bc7ac..0c0e1d042e75 100644 --- a/llvm/lib/Support/APInt.cpp +++ b/llvm/lib/Support/APInt.cpp @@ -1377,7 +1377,7 @@ static void KnuthDiv(uint32_t *u, uint32_t *v, uint32_t *q, uint32_t* r, // the true value, and a "borrow" to the left should be remembered. int64_t borrow = 0; for (unsigned i = 0; i < n; ++i) { - uint64_t p = uint64_t(qp) * uint64_t(v[i]); + uint64_t p = qp * uint64_t(v[i]); int64_t subres = int64_t(u[j+i]) - borrow - Lo_32(p); u[j+i] = Lo_32(subres); borrow = Hi_32(p) - Hi_32(subres); diff --git a/llvm/lib/Support/DXILABI.cpp b/llvm/lib/Support/DXILABI.cpp index 261fe1ef9827..082e32061bd4 100644 --- a/llvm/lib/Support/DXILABI.cpp +++ b/llvm/lib/Support/DXILABI.cpp @@ -15,20 +15,19 @@ //===----------------------------------------------------------------------===// #include "llvm/Support/DXILABI.h" -#include "llvm/Support/ScopedPrinter.h" +#include "llvm/Support/ErrorHandling.h" using namespace llvm; -static const EnumEntry ResourceClassNames[] = { - {"SRV", llvm::dxil::ResourceClass::SRV}, - {"UAV", llvm::dxil::ResourceClass::UAV}, - {"CBV", llvm::dxil::ResourceClass::CBuffer}, - {"Sampler", llvm::dxil::ResourceClass::Sampler}, -}; - -ArrayRef> dxil::getResourceClasses() { - return ArrayRef(ResourceClassNames); -} - StringRef dxil::getResourceClassName(dxil::ResourceClass RC) { - return enumToStringRef(RC, getResourceClasses()); + switch (RC) { + case dxil::ResourceClass::SRV: + return "SRV"; + case dxil::ResourceClass::UAV: + return "UAV"; + case dxil::ResourceClass::CBuffer: + return "CBV"; + case dxil::ResourceClass::Sampler: + return 
"Sampler"; + } + llvm_unreachable("Invalid ResourceClass enum value"); } diff --git a/llvm/lib/Support/SmallPtrSet.cpp b/llvm/lib/Support/SmallPtrSet.cpp index 0c226970906d..39fe1715d19b 100644 --- a/llvm/lib/Support/SmallPtrSet.cpp +++ b/llvm/lib/Support/SmallPtrSet.cpp @@ -13,6 +13,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/DenseMapInfo.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/MemAlloc.h" #include @@ -28,7 +29,7 @@ void SmallPtrSetImplBase::shrink_and_clear() { // Reduce the number of buckets. unsigned Size = size(); CurArraySize = Size > 16 ? 1 << (Log2_32_Ceil(Size) + 1) : 32; - NumNonEmpty = NumTombstones = 0; + NumEntries = NumTombstones = 0; // Install the new array. Clear all the buckets to empty. CurArray = (const void**)safe_malloc(sizeof(void*) * CurArraySize); @@ -41,7 +42,8 @@ SmallPtrSetImplBase::insert_imp_big(const void *Ptr) { if (LLVM_UNLIKELY(size() * 4 >= CurArraySize * 3)) { // If more than 3/4 of the array is full, grow. Grow(CurArraySize < 64 ? 128 : CurArraySize * 2); - } else if (LLVM_UNLIKELY(CurArraySize - NumNonEmpty < CurArraySize / 8)) { + } else if (LLVM_UNLIKELY(CurArraySize - NumEntries - NumTombstones < + CurArraySize / 8)) { // If fewer of 1/8 of the array is empty (meaning that many are filled with // tombstones), rehash. Grow(CurArraySize); @@ -55,8 +57,7 @@ SmallPtrSetImplBase::insert_imp_big(const void *Ptr) { // Otherwise, insert it! if (*Bucket == getTombstoneMarker()) --NumTombstones; - else - ++NumNonEmpty; // Track density. 
+ ++NumEntries; *Bucket = Ptr; incrementEpoch(); return std::make_pair(Bucket, true); @@ -130,7 +131,6 @@ void SmallPtrSetImplBase::Grow(unsigned NewSize) { if (!WasSmall) free(OldBuckets.begin()); - NumNonEmpty -= NumTombstones; NumTombstones = 0; IsSmall = false; } @@ -191,9 +191,9 @@ void SmallPtrSetImplBase::copyHelper(const SmallPtrSetImplBase &RHS) { CurArraySize = RHS.CurArraySize; // Copy over the contents from the other set - std::copy(RHS.CurArray, RHS.EndPointer(), CurArray); + llvm::copy(RHS.buckets(), CurArray); - NumNonEmpty = RHS.NumNonEmpty; + NumEntries = RHS.NumEntries; NumTombstones = RHS.NumTombstones; } @@ -215,7 +215,7 @@ void SmallPtrSetImplBase::moveHelper(const void **SmallStorage, if (RHS.isSmall()) { // Copy a small RHS rather than moving. CurArray = SmallStorage; - std::copy(RHS.CurArray, RHS.CurArray + RHS.NumNonEmpty, CurArray); + llvm::copy(RHS.small_buckets(), CurArray); } else { CurArray = RHS.CurArray; RHS.CurArray = RHSSmallStorage; @@ -223,13 +223,13 @@ void SmallPtrSetImplBase::moveHelper(const void **SmallStorage, // Copy the rest of the trivial members. CurArraySize = RHS.CurArraySize; - NumNonEmpty = RHS.NumNonEmpty; + NumEntries = RHS.NumEntries; NumTombstones = RHS.NumTombstones; IsSmall = RHS.IsSmall; // Make the RHS small and empty. RHS.CurArraySize = SmallSize; - RHS.NumNonEmpty = 0; + RHS.NumEntries = 0; RHS.NumTombstones = 0; RHS.IsSmall = true; } @@ -243,7 +243,7 @@ void SmallPtrSetImplBase::swap(const void **SmallStorage, if (!this->isSmall() && !RHS.isSmall()) { std::swap(this->CurArray, RHS.CurArray); std::swap(this->CurArraySize, RHS.CurArraySize); - std::swap(this->NumNonEmpty, RHS.NumNonEmpty); + std::swap(this->NumEntries, RHS.NumEntries); std::swap(this->NumTombstones, RHS.NumTombstones); return; } @@ -253,9 +253,9 @@ void SmallPtrSetImplBase::swap(const void **SmallStorage, // If only RHS is small, copy the small elements into LHS and move the pointer // from LHS to RHS. 
if (!this->isSmall() && RHS.isSmall()) { - std::copy(RHS.CurArray, RHS.CurArray + RHS.NumNonEmpty, SmallStorage); + llvm::copy(RHS.small_buckets(), SmallStorage); std::swap(RHS.CurArraySize, this->CurArraySize); - std::swap(this->NumNonEmpty, RHS.NumNonEmpty); + std::swap(this->NumEntries, RHS.NumEntries); std::swap(this->NumTombstones, RHS.NumTombstones); RHS.CurArray = this->CurArray; RHS.IsSmall = false; @@ -267,10 +267,9 @@ void SmallPtrSetImplBase::swap(const void **SmallStorage, // If only LHS is small, copy the small elements into RHS and move the pointer // from RHS to LHS. if (this->isSmall() && !RHS.isSmall()) { - std::copy(this->CurArray, this->CurArray + this->NumNonEmpty, - RHSSmallStorage); + llvm::copy(this->small_buckets(), RHSSmallStorage); std::swap(RHS.CurArraySize, this->CurArraySize); - std::swap(RHS.NumNonEmpty, this->NumNonEmpty); + std::swap(RHS.NumEntries, this->NumEntries); std::swap(RHS.NumTombstones, this->NumTombstones); this->CurArray = RHS.CurArray; this->IsSmall = false; @@ -281,16 +280,16 @@ void SmallPtrSetImplBase::swap(const void **SmallStorage, // Both a small, just swap the small elements. 
assert(this->isSmall() && RHS.isSmall()); - unsigned MinNonEmpty = std::min(this->NumNonEmpty, RHS.NumNonEmpty); - std::swap_ranges(this->CurArray, this->CurArray + MinNonEmpty, RHS.CurArray); - if (this->NumNonEmpty > MinNonEmpty) { - std::copy(this->CurArray + MinNonEmpty, this->CurArray + this->NumNonEmpty, - RHS.CurArray + MinNonEmpty); + unsigned MinEntries = std::min(this->NumEntries, RHS.NumEntries); + std::swap_ranges(this->CurArray, this->CurArray + MinEntries, RHS.CurArray); + if (this->NumEntries > MinEntries) { + std::copy(this->CurArray + MinEntries, this->CurArray + this->NumEntries, + RHS.CurArray + MinEntries); } else { - std::copy(RHS.CurArray + MinNonEmpty, RHS.CurArray + RHS.NumNonEmpty, - this->CurArray + MinNonEmpty); + std::copy(RHS.CurArray + MinEntries, RHS.CurArray + RHS.NumEntries, + this->CurArray + MinEntries); } assert(this->CurArraySize == RHS.CurArraySize); - std::swap(this->NumNonEmpty, RHS.NumNonEmpty); + std::swap(this->NumEntries, RHS.NumEntries); std::swap(this->NumTombstones, RHS.NumTombstones); } diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 885f2a94f85f..fddde668b7f1 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -1475,24 +1475,26 @@ static bool requiresSaveVG(const MachineFunction &MF) { return true; } -bool isVGInstruction(MachineBasicBlock::iterator MBBI) { +static bool matchLibcall(const TargetLowering &TLI, const MachineOperand &MO, + RTLIB::Libcall LC) { + return MO.isSymbol() && + StringRef(TLI.getLibcallName(LC)) == MO.getSymbolName(); +} + +bool isVGInstruction(MachineBasicBlock::iterator MBBI, + const TargetLowering &TLI) { unsigned Opc = MBBI->getOpcode(); if (Opc == AArch64::CNTD_XPiI || Opc == AArch64::RDSVLI_XI || Opc == AArch64::UBFMXri) return true; - if (requiresGetVGCall(*MBBI->getMF())) { - if (Opc == AArch64::ORRXrr) - return true; + if 
(!requiresGetVGCall(*MBBI->getMF())) + return false; - if (Opc == AArch64::BL) { - auto Op1 = MBBI->getOperand(0); - return Op1.isSymbol() && - (StringRef(Op1.getSymbolName()) == "__arm_get_current_vg"); - } - } + if (Opc == AArch64::BL) + return matchLibcall(TLI, MBBI->getOperand(0), RTLIB::SMEABI_GET_CURRENT_VG); - return false; + return Opc == AArch64::ORRXrr; } // Convert callee-save register save/restore instruction to do stack pointer @@ -1511,9 +1513,11 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec( // functions, we need to do this for both the streaming and non-streaming // vector length. Move past these instructions if necessary. MachineFunction &MF = *MBB.getParent(); - if (requiresSaveVG(MF)) - while (isVGInstruction(MBBI)) + if (requiresSaveVG(MF)) { + auto &TLI = *MF.getSubtarget().getTargetLowering(); + while (isVGInstruction(MBBI, TLI)) ++MBBI; + } switch (MBBI->getOpcode()) { default: @@ -2097,11 +2101,12 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // Move past the saves of the callee-saved registers, fixing up the offsets // and pre-inc if we decided to combine the callee-save and local stack // pointer bump above. + auto &TLI = *MF.getSubtarget().getTargetLowering(); while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup) && !IsSVECalleeSave(MBBI)) { if (CombineSPBump && // Only fix-up frame-setup load/store instructions. 
- (!requiresSaveVG(MF) || !isVGInstruction(MBBI))) + (!requiresSaveVG(MF) || !isVGInstruction(MBBI, TLI))) fixupCalleeSaveRestoreStackOffset(*MBBI, AFI->getLocalStackSize(), NeedsWinCFI, &HasWinCFI); ++MBBI; @@ -3468,6 +3473,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, ArrayRef CSI, const TargetRegisterInfo *TRI) const { MachineFunction &MF = *MBB.getParent(); + auto &TLI = *MF.getSubtarget().getTargetLowering(); const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); AArch64FunctionInfo *AFI = MF.getInfo(); bool NeedsWinCFI = needsWinCFI(MF); @@ -3581,11 +3587,11 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( .addReg(AArch64::X0, RegState::Implicit) .setMIFlag(MachineInstr::FrameSetup); - const uint32_t *RegMask = TRI->getCallPreservedMask( - MF, - CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1); + RTLIB::Libcall LC = RTLIB::SMEABI_GET_CURRENT_VG; + const uint32_t *RegMask = + TRI->getCallPreservedMask(MF, TLI.getLibcallCallingConv(LC)); BuildMI(MBB, MI, DL, TII.get(AArch64::BL)) - .addExternalSymbol("__arm_get_current_vg") + .addExternalSymbol(TLI.getLibcallName(LC)) .addRegMask(RegMask) .addReg(AArch64::X0, RegState::ImplicitDefine) .setMIFlag(MachineInstr::FrameSetup); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index aefbbe2534be..63a85faf344c 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -3083,13 +3083,12 @@ AArch64TargetLowering::EmitGetSMESaveSize(MachineInstr &MI, AArch64FunctionInfo *FuncInfo = MF->getInfo(); const TargetInstrInfo *TII = Subtarget->getInstrInfo(); if (FuncInfo->isSMESaveBufferUsed()) { + RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE_SIZE; const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::BL)) - 
.addExternalSymbol("__arm_sme_state_size") + .addExternalSymbol(getLibcallName(LC)) .addReg(AArch64::X0, RegState::ImplicitDefine) - .addRegMask(TRI->getCallPreservedMask( - *MF, CallingConv:: - AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1)); + .addRegMask(TRI->getCallPreservedMask(*MF, getLibcallCallingConv(LC))); BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg()) .addReg(AArch64::X0); @@ -3109,13 +3108,12 @@ AArch64TargetLowering::EmitEntryPStateSM(MachineInstr &MI, const TargetInstrInfo *TII = Subtarget->getInstrInfo(); Register ResultReg = MI.getOperand(0).getReg(); if (FuncInfo->isPStateSMRegUsed()) { + RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE; const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::BL)) - .addExternalSymbol("__arm_sme_state") + .addExternalSymbol(getLibcallName(LC)) .addReg(AArch64::X0, RegState::ImplicitDefine) - .addRegMask(TRI->getCallPreservedMask( - *MF, CallingConv:: - AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2)); + .addRegMask(TRI->getCallPreservedMask(*MF, getLibcallCallingConv(LC))); BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), ResultReg) .addReg(AArch64::X0); } else { @@ -3520,6 +3518,13 @@ bool isLegalCmpImmed(APInt C) { return isLegalArithImmed(C.abs().getZExtValue()); } +unsigned numberOfInstrToLoadImm(APInt C) { + uint64_t Imm = C.getZExtValue(); + SmallVector Insn; + AArch64_IMM::expandMOVImm(Imm, 32, Insn); + return Insn.size(); +} + static bool isSafeSignedCMN(SDValue Op, SelectionDAG &DAG) { // 0 - INT_MIN sign wraps, so no signed wrap means cmn is safe. if (Op->getFlags().hasNoSignedWrap()) @@ -3989,6 +3994,7 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, // CC has already been adjusted. RHS = DAG.getConstant(0, DL, VT); } else if (!isLegalCmpImmed(C)) { + unsigned NumImmForC = numberOfInstrToLoadImm(C); // Constant does not fit, try adjusting it by one? 
switch (CC) { default: @@ -3997,43 +4003,49 @@ static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, case ISD::SETGE: if (!C.isMinSignedValue()) { APInt CMinusOne = C - 1; - if (isLegalCmpImmed(CMinusOne)) { + if (isLegalCmpImmed(CMinusOne) || + (NumImmForC > numberOfInstrToLoadImm(CMinusOne))) { CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT; RHS = DAG.getConstant(CMinusOne, DL, VT); } } break; case ISD::SETULT: - case ISD::SETUGE: - if (!C.isZero()) { - APInt CMinusOne = C - 1; - if (isLegalCmpImmed(CMinusOne)) { - CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT; - RHS = DAG.getConstant(CMinusOne, DL, VT); - } + case ISD::SETUGE: { + // C is not 0 because it is a legal immediate. + assert(!C.isZero() && "C should not be zero here"); + APInt CMinusOne = C - 1; + if (isLegalCmpImmed(CMinusOne) || + (NumImmForC > numberOfInstrToLoadImm(CMinusOne))) { + CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT; + RHS = DAG.getConstant(CMinusOne, DL, VT); } break; + } case ISD::SETLE: case ISD::SETGT: if (!C.isMaxSignedValue()) { APInt CPlusOne = C + 1; - if (isLegalCmpImmed(CPlusOne)) { + if (isLegalCmpImmed(CPlusOne) || + (NumImmForC > numberOfInstrToLoadImm(CPlusOne))) { CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE; RHS = DAG.getConstant(CPlusOne, DL, VT); } } break; case ISD::SETULE: - case ISD::SETUGT: + case ISD::SETUGT: { if (!C.isAllOnes()) { APInt CPlusOne = C + 1; - if (isLegalCmpImmed(CPlusOne)) { + if (isLegalCmpImmed(CPlusOne) || + (NumImmForC > numberOfInstrToLoadImm(CPlusOne))) { CC = (CC == ISD::SETULE) ? 
ISD::SETULT : ISD::SETUGE; RHS = DAG.getConstant(CPlusOne, DL, VT); } } break; } + } } } @@ -5733,15 +5745,15 @@ static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) { SDValue AArch64TargetLowering::getRuntimePStateSM(SelectionDAG &DAG, SDValue Chain, SDLoc DL, EVT VT) const { - SDValue Callee = DAG.getExternalSymbol("__arm_sme_state", + RTLIB::Libcall LC = RTLIB::SMEABI_SME_STATE; + SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), getPointerTy(DAG.getDataLayout())); Type *Int64Ty = Type::getInt64Ty(*DAG.getContext()); Type *RetTy = StructType::get(Int64Ty, Int64Ty); TargetLowering::CallLoweringInfo CLI(DAG); ArgListTy Args; CLI.setDebugLoc(DL).setChain(Chain).setLibCallee( - CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2, - RetTy, Callee, std::move(Args)); + getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)); std::pair CallResult = LowerCallTo(CLI); SDValue Mask = DAG.getConstant(/*PSTATE.SM*/ 1, DL, MVT::i64); return DAG.getNode(ISD::AND, DL, MVT::i64, CallResult.first.getOperand(0), @@ -8594,12 +8606,12 @@ static void analyzeCallOperands(const AArch64TargetLowering &TLI, } static SMECallAttrs -getSMECallAttrs(const Function &Caller, +getSMECallAttrs(const Function &Caller, const AArch64TargetLowering &TLI, const TargetLowering::CallLoweringInfo &CLI) { if (CLI.CB) - return SMECallAttrs(*CLI.CB); + return SMECallAttrs(*CLI.CB, &TLI); if (auto *ES = dyn_cast(CLI.Callee)) - return SMECallAttrs(SMEAttrs(Caller), SMEAttrs(ES->getSymbol())); + return SMECallAttrs(SMEAttrs(Caller), SMEAttrs(ES->getSymbol(), TLI)); return SMECallAttrs(SMEAttrs(Caller), SMEAttrs(SMEAttrs::Normal)); } @@ -8621,7 +8633,7 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization( // SME Streaming functions are not eligible for TCO as they may require // the streaming mode or ZA to be restored after returning from the call. 
- SMECallAttrs CallAttrs = getSMECallAttrs(CallerF, CLI); + SMECallAttrs CallAttrs = getSMECallAttrs(CallerF, *this, CLI); if (CallAttrs.requiresSMChange() || CallAttrs.requiresLazySave() || CallAttrs.requiresPreservingAllZAState() || CallAttrs.caller().hasStreamingBody()) @@ -8913,14 +8925,14 @@ static SDValue emitSMEStateSaveRestore(const AArch64TargetLowering &TLI, DAG.getCopyFromReg(Chain, DL, Info->getSMESaveBufferAddr(), MVT::i64), PointerType::getUnqual(*DAG.getContext())); - SDValue Callee = - DAG.getExternalSymbol(IsSave ? "__arm_sme_save" : "__arm_sme_restore", - TLI.getPointerTy(DAG.getDataLayout())); + RTLIB::Libcall LC = + IsSave ? RTLIB::SMEABI_SME_SAVE : RTLIB::SMEABI_SME_RESTORE; + SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC), + TLI.getPointerTy(DAG.getDataLayout())); auto *RetTy = Type::getVoidTy(*DAG.getContext()); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(DL).setChain(Chain).setLibCallee( - CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1, RetTy, - Callee, std::move(Args)); + TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)); return TLI.LowerCallTo(CLI).second; } @@ -9108,7 +9120,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, } // Determine whether we need any streaming mode changes. - SMECallAttrs CallAttrs = getSMECallAttrs(MF.getFunction(), CLI); + SMECallAttrs CallAttrs = getSMECallAttrs(MF.getFunction(), *this, CLI); auto DescribeCallsite = [&](OptimizationRemarkAnalysis &R) -> OptimizationRemarkAnalysis & { @@ -9685,11 +9697,12 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, if (RequiresLazySave) { // Conditionally restore the lazy save using a pseudo node. 
+ RTLIB::Libcall LC = RTLIB::SMEABI_TPIDR2_RESTORE; TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj(); SDValue RegMask = DAG.getRegisterMask( - TRI->SMEABISupportRoutinesCallPreservedMaskFromX0()); + TRI->getCallPreservedMask(MF, getLibcallCallingConv(LC))); SDValue RestoreRoutine = DAG.getTargetExternalSymbol( - "__arm_tpidr2_restore", getPointerTy(DAG.getDataLayout())); + getLibcallName(LC), getPointerTy(DAG.getDataLayout())); SDValue TPIDR2_EL0 = DAG.getNode( ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Result, DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32)); @@ -29028,7 +29041,7 @@ bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const { // Checks to allow the use of SME instructions if (auto *Base = dyn_cast(&Inst)) { - auto CallAttrs = SMECallAttrs(*Base); + auto CallAttrs = SMECallAttrs(*Base, this); if (CallAttrs.requiresSMChange() || CallAttrs.requiresLazySave() || CallAttrs.requiresPreservingZT0() || CallAttrs.requiresPreservingAllZAState()) diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index 456b21a70e90..178dab689739 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -7405,88 +7405,6 @@ multiclass SIMDLongThreeVectorHS opc, string asm, (extract_high_v4i32 (v4i32 V128:$Rm))))]>; } -let isCommutable = 1 in -multiclass SIMDLongThreeVectorBHSabdl opc, string asm, - SDPatternOperator OpNode = null_frag> { - def v8i8_v8i16 : BaseSIMDDifferentThreeVector; - def v16i8_v8i16 : BaseSIMDDifferentThreeVector; - def v4i16_v4i32 : BaseSIMDDifferentThreeVector; - def v8i16_v4i32 : BaseSIMDDifferentThreeVector; - def v2i32_v2i64 : BaseSIMDDifferentThreeVector; - def v4i32_v2i64 : BaseSIMDDifferentThreeVector; -} - -multiclass SIMDLongThreeVectorTiedBHSabal opc, - string asm, - SDPatternOperator OpNode> { - def v8i8_v8i16 : BaseSIMDDifferentThreeVectorTied; - def v16i8_v8i16 : BaseSIMDDifferentThreeVectorTied; - 
def v4i16_v4i32 : BaseSIMDDifferentThreeVectorTied; - def v8i16_v4i32 : BaseSIMDDifferentThreeVectorTied; - def v2i32_v2i64 : BaseSIMDDifferentThreeVectorTied; - def v4i32_v2i64 : BaseSIMDDifferentThreeVectorTied; -} - let isCommutable = 1 in multiclass SIMDLongThreeVectorBHS opc, string asm, SDPatternOperator OpNode = null_frag> { diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index a55f103bff38..6a8e7a472bf5 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -20,7 +20,9 @@ #include "Utils/AArch64BaseInfo.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/CFIInstBuilder.h" #include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/MachineBasicBlock.h" @@ -83,6 +85,11 @@ static cl::opt BDisplacementBits("aarch64-b-offset-bits", cl::Hidden, cl::init(26), cl::desc("Restrict range of B instructions (DEBUG)")); +static cl::opt GatherOptSearchLimit( + "aarch64-search-limit", cl::Hidden, cl::init(2048), + cl::desc("Restrict range of instructions to search for the " + "machine-combiner gather pattern optimization")); + AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI) : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP, AArch64::CATCHRET), @@ -7412,11 +7419,319 @@ static bool getMiscPatterns(MachineInstr &Root, return false; } +/// Check if the given instruction forms a gather load pattern that can be +/// optimized for better Memory-Level Parallelism (MLP). This function +/// identifies chains of NEON lane load instructions that load data from +/// different memory addresses into individual lanes of a 128-bit vector +/// register, then attempts to split the pattern into parallel loads to break +/// the serial dependency between instructions. 
+/// +/// Pattern Matched: +/// Initial scalar load -> SUBREG_TO_REG (lane 0) -> LD1i* (lane 1) -> +/// LD1i* (lane 2) -> ... -> LD1i* (lane N-1, Root) +/// +/// Transformed Into: +/// Two parallel vector loads using fewer lanes each, followed by ZIP1v2i64 +/// to combine the results, enabling better memory-level parallelism. +/// +/// Supported Element Types: +/// - 32-bit elements (LD1i32, 4 lanes total) +/// - 16-bit elements (LD1i16, 8 lanes total) +/// - 8-bit elements (LD1i8, 16 lanes total) +static bool getGatherLanePattern(MachineInstr &Root, + SmallVectorImpl &Patterns, + unsigned LoadLaneOpCode, unsigned NumLanes) { + const MachineFunction *MF = Root.getMF(); + + // Early exit if optimizing for size. + if (MF->getFunction().hasMinSize()) + return false; + + const MachineRegisterInfo &MRI = MF->getRegInfo(); + const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); + + // The root of the pattern must load into the last lane of the vector. + if (Root.getOperand(2).getImm() != NumLanes - 1) + return false; + + // Check that we have load into all lanes except lane 0. + // For each load we also want to check that: + // 1. It has a single non-debug use (since we will be replacing the virtual + // register) + // 2. That the addressing mode only uses a single pointer operand + auto *CurrInstr = MRI.getUniqueVRegDef(Root.getOperand(1).getReg()); + auto Range = llvm::seq(1, NumLanes - 1); + SmallSet RemainingLanes(Range.begin(), Range.end()); + SmallVector LoadInstrs; + while (!RemainingLanes.empty() && CurrInstr && + CurrInstr->getOpcode() == LoadLaneOpCode && + MRI.hasOneNonDBGUse(CurrInstr->getOperand(0).getReg()) && + CurrInstr->getNumOperands() == 4) { + RemainingLanes.erase(CurrInstr->getOperand(2).getImm()); + LoadInstrs.push_back(CurrInstr); + CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg()); + } + + // Check that we have found a match for lanes N-1.. 1. 
+ if (!RemainingLanes.empty()) + return false; + + // Match the SUBREG_TO_REG sequence. + if (CurrInstr->getOpcode() != TargetOpcode::SUBREG_TO_REG) + return false; + + // Verify that the subreg to reg loads an integer into the first lane. + auto Lane0LoadReg = CurrInstr->getOperand(2).getReg(); + unsigned SingleLaneSizeInBits = 128 / NumLanes; + if (TRI->getRegSizeInBits(Lane0LoadReg, MRI) != SingleLaneSizeInBits) + return false; + + // Verify that it also has a single non debug use. + if (!MRI.hasOneNonDBGUse(Lane0LoadReg)) + return false; + + LoadInstrs.push_back(MRI.getUniqueVRegDef(Lane0LoadReg)); + + // If there is any chance of aliasing, do not apply the pattern. + // Walk backward through the MBB starting from Root. + // Exit early if we've encountered all load instructions or hit the search + // limit. + auto MBBItr = Root.getIterator(); + unsigned RemainingSteps = GatherOptSearchLimit; + SmallSet RemainingLoadInstrs; + RemainingLoadInstrs.insert(LoadInstrs.begin(), LoadInstrs.end()); + const MachineBasicBlock *MBB = Root.getParent(); + + for (; MBBItr != MBB->begin() && RemainingSteps > 0 && + !RemainingLoadInstrs.empty(); + --MBBItr, --RemainingSteps) { + const MachineInstr &CurrInstr = *MBBItr; + + // Remove this instruction from remaining loads if it's one we're tracking. + RemainingLoadInstrs.erase(&CurrInstr); + + // Check for potential aliasing with any of the load instructions to + // optimize. + if (CurrInstr.isLoadFoldBarrier()) + return false; + } + + // If we hit the search limit without finding all load instructions, + // don't match the pattern. 
+ if (RemainingSteps == 0 && !RemainingLoadInstrs.empty()) + return false; + + switch (NumLanes) { + case 4: + Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i32); + break; + case 8: + Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i16); + break; + case 16: + Patterns.push_back(AArch64MachineCombinerPattern::GATHER_LANE_i8); + break; + default: + llvm_unreachable("Got bad number of lanes for gather pattern."); + } + + return true; +} + +/// Search for patterns of LD instructions we can optimize. +static bool getLoadPatterns(MachineInstr &Root, + SmallVectorImpl &Patterns) { + + // The pattern searches for loads into single lanes. + switch (Root.getOpcode()) { + case AArch64::LD1i32: + return getGatherLanePattern(Root, Patterns, Root.getOpcode(), 4); + case AArch64::LD1i16: + return getGatherLanePattern(Root, Patterns, Root.getOpcode(), 8); + case AArch64::LD1i8: + return getGatherLanePattern(Root, Patterns, Root.getOpcode(), 16); + default: + return false; + } +} + +/// Generate optimized instruction sequence for gather load patterns to improve +/// Memory-Level Parallelism (MLP). This function transforms a chain of +/// sequential NEON lane loads into parallel vector loads that can execute +/// concurrently. +static void +generateGatherLanePattern(MachineInstr &Root, + SmallVectorImpl &InsInstrs, + SmallVectorImpl &DelInstrs, + DenseMap &InstrIdxForVirtReg, + unsigned Pattern, unsigned NumLanes) { + MachineFunction &MF = *Root.getParent()->getParent(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); + + // Gather the initial load instructions to build the pattern. + SmallVector LoadToLaneInstrs; + MachineInstr *CurrInstr = &Root; + for (unsigned i = 0; i < NumLanes - 1; ++i) { + LoadToLaneInstrs.push_back(CurrInstr); + CurrInstr = MRI.getUniqueVRegDef(CurrInstr->getOperand(1).getReg()); + } + + // Sort the load instructions according to the lane. 
+ llvm::sort(LoadToLaneInstrs, + [](const MachineInstr *A, const MachineInstr *B) { + return A->getOperand(2).getImm() > B->getOperand(2).getImm(); + }); + + MachineInstr *SubregToReg = CurrInstr; + LoadToLaneInstrs.push_back( + MRI.getUniqueVRegDef(SubregToReg->getOperand(2).getReg())); + auto LoadToLaneInstrsAscending = llvm::reverse(LoadToLaneInstrs); + + const TargetRegisterClass *FPR128RegClass = + MRI.getRegClass(Root.getOperand(0).getReg()); + + // Helper lambda to create a LD1 instruction. + auto CreateLD1Instruction = [&](MachineInstr *OriginalInstr, + Register SrcRegister, unsigned Lane, + Register OffsetRegister, + bool OffsetRegisterKillState) { + auto NewRegister = MRI.createVirtualRegister(FPR128RegClass); + MachineInstrBuilder LoadIndexIntoRegister = + BuildMI(MF, MIMetadata(*OriginalInstr), TII->get(Root.getOpcode()), + NewRegister) + .addReg(SrcRegister) + .addImm(Lane) + .addReg(OffsetRegister, getKillRegState(OffsetRegisterKillState)); + InstrIdxForVirtReg.insert(std::make_pair(NewRegister, InsInstrs.size())); + InsInstrs.push_back(LoadIndexIntoRegister); + return NewRegister; + }; + + // Helper to create load instruction based on the NumLanes in the NEON + // register we are rewriting. + auto CreateLDRInstruction = [&](unsigned NumLanes, Register DestReg, + Register OffsetReg, + bool KillState) -> MachineInstrBuilder { + unsigned Opcode; + switch (NumLanes) { + case 4: + Opcode = AArch64::LDRSui; + break; + case 8: + Opcode = AArch64::LDRHui; + break; + case 16: + Opcode = AArch64::LDRBui; + break; + default: + llvm_unreachable( + "Got unsupported number of lanes in machine-combiner gather pattern"); + } + // Immediate offset load + return BuildMI(MF, MIMetadata(Root), TII->get(Opcode), DestReg) + .addReg(OffsetReg) + .addImm(0); + }; + + // Load the remaining lanes into register 0. 
+ auto LanesToLoadToReg0 = + llvm::make_range(LoadToLaneInstrsAscending.begin() + 1, + LoadToLaneInstrsAscending.begin() + NumLanes / 2); + Register PrevReg = SubregToReg->getOperand(0).getReg(); + for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg0)) { + const MachineOperand &OffsetRegOperand = LoadInstr->getOperand(3); + PrevReg = CreateLD1Instruction(LoadInstr, PrevReg, Index + 1, + OffsetRegOperand.getReg(), + OffsetRegOperand.isKill()); + DelInstrs.push_back(LoadInstr); + } + Register LastLoadReg0 = PrevReg; + + // First load into register 1. Perform an integer load to zero out the upper + // lanes in a single instruction. + MachineInstr *Lane0Load = *LoadToLaneInstrsAscending.begin(); + MachineInstr *OriginalSplitLoad = + *std::next(LoadToLaneInstrsAscending.begin(), NumLanes / 2); + Register DestRegForMiddleIndex = MRI.createVirtualRegister( + MRI.getRegClass(Lane0Load->getOperand(0).getReg())); + + const MachineOperand &OriginalSplitToLoadOffsetOperand = + OriginalSplitLoad->getOperand(3); + MachineInstrBuilder MiddleIndexLoadInstr = + CreateLDRInstruction(NumLanes, DestRegForMiddleIndex, + OriginalSplitToLoadOffsetOperand.getReg(), + OriginalSplitToLoadOffsetOperand.isKill()); + + InstrIdxForVirtReg.insert( + std::make_pair(DestRegForMiddleIndex, InsInstrs.size())); + InsInstrs.push_back(MiddleIndexLoadInstr); + DelInstrs.push_back(OriginalSplitLoad); + + // Subreg To Reg instruction for register 1. 
+ Register DestRegForSubregToReg = MRI.createVirtualRegister(FPR128RegClass); + unsigned SubregType; + switch (NumLanes) { + case 4: + SubregType = AArch64::ssub; + break; + case 8: + SubregType = AArch64::hsub; + break; + case 16: + SubregType = AArch64::bsub; + break; + default: + llvm_unreachable( + "Got invalid NumLanes for machine-combiner gather pattern"); + } + + auto SubRegToRegInstr = + BuildMI(MF, MIMetadata(Root), TII->get(SubregToReg->getOpcode()), + DestRegForSubregToReg) + .addImm(0) + .addReg(DestRegForMiddleIndex, getKillRegState(true)) + .addImm(SubregType); + InstrIdxForVirtReg.insert( + std::make_pair(DestRegForSubregToReg, InsInstrs.size())); + InsInstrs.push_back(SubRegToRegInstr); + + // Load remaining lanes into register 1. + auto LanesToLoadToReg1 = + llvm::make_range(LoadToLaneInstrsAscending.begin() + NumLanes / 2 + 1, + LoadToLaneInstrsAscending.end()); + PrevReg = SubRegToRegInstr->getOperand(0).getReg(); + for (auto [Index, LoadInstr] : llvm::enumerate(LanesToLoadToReg1)) { + const MachineOperand &OffsetRegOperand = LoadInstr->getOperand(3); + PrevReg = CreateLD1Instruction(LoadInstr, PrevReg, Index + 1, + OffsetRegOperand.getReg(), + OffsetRegOperand.isKill()); + + // Do not add the last reg to DelInstrs - it will be removed later. + if (Index == NumLanes / 2 - 2) { + break; + } + DelInstrs.push_back(LoadInstr); + } + Register LastLoadReg1 = PrevReg; + + // Create the final zip instruction to combine the results. 
+ MachineInstrBuilder ZipInstr = + BuildMI(MF, MIMetadata(Root), TII->get(AArch64::ZIP1v2i64), + Root.getOperand(0).getReg()) + .addReg(LastLoadReg0) + .addReg(LastLoadReg1); + InsInstrs.push_back(ZipInstr); +} + CombinerObjective AArch64InstrInfo::getCombinerObjective(unsigned Pattern) const { switch (Pattern) { case AArch64MachineCombinerPattern::SUBADD_OP1: case AArch64MachineCombinerPattern::SUBADD_OP2: + case AArch64MachineCombinerPattern::GATHER_LANE_i32: + case AArch64MachineCombinerPattern::GATHER_LANE_i16: + case AArch64MachineCombinerPattern::GATHER_LANE_i8: return CombinerObjective::MustReduceDepth; default: return TargetInstrInfo::getCombinerObjective(Pattern); @@ -7446,6 +7761,10 @@ bool AArch64InstrInfo::getMachineCombinerPatterns( if (getMiscPatterns(Root, Patterns)) return true; + // Load patterns + if (getLoadPatterns(Root, Patterns)) + return true; + return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns, DoRegPressureReduce); } @@ -8701,6 +9020,21 @@ void AArch64InstrInfo::genAlternativeCodeSequence( MUL = genFNegatedMAD(MF, MRI, TII, Root, InsInstrs); break; } + case AArch64MachineCombinerPattern::GATHER_LANE_i32: { + generateGatherLanePattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg, + Pattern, 4); + break; + } + case AArch64MachineCombinerPattern::GATHER_LANE_i16: { + generateGatherLanePattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg, + Pattern, 8); + break; + } + case AArch64MachineCombinerPattern::GATHER_LANE_i8: { + generateGatherLanePattern(Root, InsInstrs, DelInstrs, InstrIdxForVirtReg, + Pattern, 16); + break; + } } // end switch (Pattern) // Record MUL and ADD/SUB for deletion diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h index b903cd90c1e7..70c814a3a48c 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -172,6 +172,10 @@ enum AArch64MachineCombinerPattern : unsigned { FMULv8i16_indexed_OP2, FNMADD, + + 
GATHER_LANE_i32, + GATHER_LANE_i16, + GATHER_LANE_i8 }; class AArch64InstrInfo final : public AArch64GenInstrInfo { const AArch64RegisterInfo RI; diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 8cfbff938a39..4fa91a4dc827 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -5707,27 +5707,6 @@ let Predicates = [HasFullFP16] in { // Advanced SIMD two vector instructions. //===----------------------------------------------------------------------===// -defm UABDL : SIMDLongThreeVectorBHSabdl<1, 0b0111, "uabdl", abdu>; -// Match UABDL in log2-shuffle patterns. -def : Pat<(abs (v8i16 (sub (zext (v8i8 V64:$opA)), - (zext (v8i8 V64:$opB))))), - (UABDLv8i8_v8i16 V64:$opA, V64:$opB)>; -def : Pat<(abs (v8i16 (sub (zext (extract_high_v16i8 (v16i8 V128:$opA))), - (zext (extract_high_v16i8 (v16i8 V128:$opB)))))), - (UABDLv16i8_v8i16 V128:$opA, V128:$opB)>; -def : Pat<(abs (v4i32 (sub (zext (v4i16 V64:$opA)), - (zext (v4i16 V64:$opB))))), - (UABDLv4i16_v4i32 V64:$opA, V64:$opB)>; -def : Pat<(abs (v4i32 (sub (zext (extract_high_v8i16 (v8i16 V128:$opA))), - (zext (extract_high_v8i16 (v8i16 V128:$opB)))))), - (UABDLv8i16_v4i32 V128:$opA, V128:$opB)>; -def : Pat<(abs (v2i64 (sub (zext (v2i32 V64:$opA)), - (zext (v2i32 V64:$opB))))), - (UABDLv2i32_v2i64 V64:$opA, V64:$opB)>; -def : Pat<(abs (v2i64 (sub (zext (extract_high_v4i32 (v4i32 V128:$opA))), - (zext (extract_high_v4i32 (v4i32 V128:$opB)))))), - (UABDLv4i32_v2i64 V128:$opA, V128:$opB)>; - defm ABS : SIMDTwoVectorBHSD<0, 0b01011, "abs", abs>; defm CLS : SIMDTwoVectorBHS<0, 0b00100, "cls", int_aarch64_neon_cls>; defm CLZ : SIMDTwoVectorBHS<1, 0b00100, "clz", ctlz>; @@ -6803,41 +6782,45 @@ def : Pat <(f64 (uint_to_fp (i32 // Advanced SIMD three different-sized vector instructions. 
//===----------------------------------------------------------------------===// -defm ADDHN : SIMDNarrowThreeVectorBHS<0,0b0100,"addhn", int_aarch64_neon_addhn>; -defm SUBHN : SIMDNarrowThreeVectorBHS<0,0b0110,"subhn", int_aarch64_neon_subhn>; -defm RADDHN : SIMDNarrowThreeVectorBHS<1,0b0100,"raddhn",int_aarch64_neon_raddhn>; -defm RSUBHN : SIMDNarrowThreeVectorBHS<1,0b0110,"rsubhn",int_aarch64_neon_rsubhn>; +defm ADDHN : SIMDNarrowThreeVectorBHS<0,0b0100,"addhn", int_aarch64_neon_addhn>; +defm SUBHN : SIMDNarrowThreeVectorBHS<0,0b0110,"subhn", int_aarch64_neon_subhn>; +defm RADDHN : SIMDNarrowThreeVectorBHS<1,0b0100,"raddhn",int_aarch64_neon_raddhn>; +defm RSUBHN : SIMDNarrowThreeVectorBHS<1,0b0110,"rsubhn",int_aarch64_neon_rsubhn>; let isCommutable = 1 in -defm PMULL : SIMDDifferentThreeVectorBD<0,0b1110,"pmull", AArch64pmull>; -defm SABAL : SIMDLongThreeVectorTiedBHSabal<0,0b0101,"sabal", abds>; -defm SABDL : SIMDLongThreeVectorBHSabdl<0, 0b0111, "sabdl", abds>; +defm PMULL : SIMDDifferentThreeVectorBD<0,0b1110,"pmull", AArch64pmull>; +defm SABAL : SIMDLongThreeVectorTiedBHS<0,0b0101,"sabal", + TriOpFrag<(add node:$LHS, (zext (abds node:$MHS, node:$RHS)))>>; +defm SABDL : SIMDLongThreeVectorBHS<0, 0b0111, "sabdl", + BinOpFrag<(zext (abds node:$LHS, node:$RHS))>>; defm SADDL : SIMDLongThreeVectorBHS< 0, 0b0000, "saddl", - BinOpFrag<(add (sext node:$LHS), (sext node:$RHS))>>; + BinOpFrag<(add (sext node:$LHS), (sext node:$RHS))>>; defm SADDW : SIMDWideThreeVectorBHS< 0, 0b0001, "saddw", BinOpFrag<(add node:$LHS, (sext node:$RHS))>>; defm SMLAL : SIMDLongThreeVectorTiedBHS<0, 0b1000, "smlal", - TriOpFrag<(add node:$LHS, (AArch64smull node:$MHS, node:$RHS))>>; + TriOpFrag<(add node:$LHS, (AArch64smull node:$MHS, node:$RHS))>>; defm SMLSL : SIMDLongThreeVectorTiedBHS<0, 0b1010, "smlsl", - TriOpFrag<(sub node:$LHS, (AArch64smull node:$MHS, node:$RHS))>>; + TriOpFrag<(sub node:$LHS, (AArch64smull node:$MHS, node:$RHS))>>; defm SMULL : SIMDLongThreeVectorBHS<0, 0b1100, 
"smull", AArch64smull>; defm SQDMLAL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1001, "sqdmlal", saddsat>; defm SQDMLSL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1011, "sqdmlsl", ssubsat>; -defm SQDMULL : SIMDLongThreeVectorHS<0, 0b1101, "sqdmull", - int_aarch64_neon_sqdmull>; +defm SQDMULL : SIMDLongThreeVectorHS<0, 0b1101, "sqdmull", int_aarch64_neon_sqdmull>; let isCommutable = 0 in defm SSUBL : SIMDLongThreeVectorBHS<0, 0b0010, "ssubl", BinOpFrag<(sub (sext node:$LHS), (sext node:$RHS))>>; defm SSUBW : SIMDWideThreeVectorBHS<0, 0b0011, "ssubw", BinOpFrag<(sub node:$LHS, (sext node:$RHS))>>; -defm UABAL : SIMDLongThreeVectorTiedBHSabal<1, 0b0101, "uabal", abdu>; +defm UABAL : SIMDLongThreeVectorTiedBHS<1, 0b0101, "uabal", + TriOpFrag<(add node:$LHS, (zext (abdu node:$MHS, node:$RHS)))>>; +defm UABDL : SIMDLongThreeVectorBHS<1, 0b0111, "uabdl", + BinOpFrag<(zext (abdu node:$LHS, node:$RHS))>>; defm UADDL : SIMDLongThreeVectorBHS<1, 0b0000, "uaddl", BinOpFrag<(add (zanyext node:$LHS), (zanyext node:$RHS))>>; defm UADDW : SIMDWideThreeVectorBHS<1, 0b0001, "uaddw", BinOpFrag<(add node:$LHS, (zanyext node:$RHS))>>; defm UMLAL : SIMDLongThreeVectorTiedBHS<1, 0b1000, "umlal", - TriOpFrag<(add node:$LHS, (AArch64umull node:$MHS, node:$RHS))>>; + TriOpFrag<(add node:$LHS, (AArch64umull node:$MHS, node:$RHS))>>; defm UMLSL : SIMDLongThreeVectorTiedBHS<1, 0b1010, "umlsl", - TriOpFrag<(sub node:$LHS, (AArch64umull node:$MHS, node:$RHS))>>; + TriOpFrag<(sub node:$LHS, (AArch64umull node:$MHS, node:$RHS))>>; defm UMULL : SIMDLongThreeVectorBHS<1, 0b1100, "umull", AArch64umull>; let isCommutable = 0 in defm USUBL : SIMDLongThreeVectorBHS<1, 0b0010, "usubl", diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 3042251cf754..17f0028e43fc 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -220,20 +220,17 @@ static cl::opt 
EnableFixedwidthAutovecInStreamingMode( static cl::opt EnableScalableAutovecInStreamingMode( "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden); -static bool isSMEABIRoutineCall(const CallInst &CI) { +static bool isSMEABIRoutineCall(const CallInst &CI, + const AArch64TargetLowering &TLI) { const auto *F = CI.getCalledFunction(); - return F && StringSwitch(F->getName()) - .Case("__arm_sme_state", true) - .Case("__arm_tpidr2_save", true) - .Case("__arm_tpidr2_restore", true) - .Case("__arm_za_disable", true) - .Default(false); + return F && SMEAttrs(F->getName(), TLI).isSMEABIRoutine(); } /// Returns true if the function has explicit operations that can only be /// lowered using incompatible instructions for the selected mode. This also /// returns true if the function F may use or modify ZA state. -static bool hasPossibleIncompatibleOps(const Function *F) { +static bool hasPossibleIncompatibleOps(const Function *F, + const AArch64TargetLowering &TLI) { for (const BasicBlock &BB : *F) { for (const Instruction &I : BB) { // Be conservative for now and assume that any call to inline asm or to @@ -242,7 +239,7 @@ static bool hasPossibleIncompatibleOps(const Function *F) { // all native LLVM instructions can be lowered to compatible instructions. if (isa(I) && !I.isDebugOrPseudoInst() && (cast(I).isInlineAsm() || isa(I) || - isSMEABIRoutineCall(cast(I)))) + isSMEABIRoutineCall(cast(I), TLI))) return true; } } @@ -290,7 +287,7 @@ bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, if (CallAttrs.requiresLazySave() || CallAttrs.requiresSMChange() || CallAttrs.requiresPreservingZT0() || CallAttrs.requiresPreservingAllZAState()) { - if (hasPossibleIncompatibleOps(Callee)) + if (hasPossibleIncompatibleOps(Callee, *getTLI())) return false; } @@ -357,7 +354,7 @@ AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call, // change only once and avoid inlining of G into F. 
SMEAttrs FAttrs(*F); - SMECallAttrs CallAttrs(Call); + SMECallAttrs CallAttrs(Call, getTLI()); if (SMECallAttrs(FAttrs, CallAttrs.callee()).requiresSMChange()) { if (F == Call.getCaller()) // (1) @@ -4912,13 +4909,35 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE, // load/store dependencies, to expose more parallel memory access streams, // or if they do little work inside a block (i.e. load -> X -> store pattern). BasicBlock *Header = L->getHeader(); - if (Header == L->getLoopLatch()) { + BasicBlock *Latch = L->getLoopLatch(); + if (Header == Latch) { // Estimate the size of the loop. unsigned Size; unsigned Width = 10; if (!isLoopSizeWithinBudget(L, TTI, Width, &Size)) return; + // Try to find an unroll count that maximizes the use of the instruction + // window, i.e. trying to fetch as many instructions per cycle as possible. + unsigned MaxInstsPerLine = 16; + unsigned UC = 1; + unsigned BestUC = 1; + unsigned SizeWithBestUC = BestUC * Size; + while (UC <= 8) { + unsigned SizeWithUC = UC * Size; + if (SizeWithUC > 48) + break; + if ((SizeWithUC % MaxInstsPerLine) == 0 || + (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) { + BestUC = UC; + SizeWithBestUC = BestUC * Size; + } + UC++; + } + + if (BestUC == 1) + return; + SmallPtrSet LoadedValuesPlus; SmallVector Stores; for (auto *BB : L->blocks()) { @@ -4940,25 +4959,7 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE, } } - // Try to find an unroll count that maximizes the use of the instruction - // window, i.e. trying to fetch as many instructions per cycle as possible. 
- unsigned MaxInstsPerLine = 16; - unsigned UC = 1; - unsigned BestUC = 1; - unsigned SizeWithBestUC = BestUC * Size; - while (UC <= 8) { - unsigned SizeWithUC = UC * Size; - if (SizeWithUC > 48) - break; - if ((SizeWithUC % MaxInstsPerLine) == 0 || - (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) { - BestUC = UC; - SizeWithBestUC = BestUC * Size; - } - UC++; - } - - if (BestUC == 1 || none_of(Stores, [&LoadedValuesPlus](StoreInst *SI) { + if (none_of(Stores, [&LoadedValuesPlus](StoreInst *SI) { return LoadedValuesPlus.contains(SI->getOperand(0)); })) return; @@ -4971,7 +4972,6 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE, // Try to runtime-unroll loops with early-continues depending on loop-varying // loads; this helps with branch-prediction for the early-continues. auto *Term = dyn_cast(Header->getTerminator()); - auto *Latch = L->getLoopLatch(); SmallVector Preds(predecessors(Latch)); if (!Term || !Term->isConditional() || Preds.size() == 1 || !llvm::is_contained(Preds, Header) || diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp index 3ba08c8c1d98..6025f1c9f5f4 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp @@ -614,8 +614,7 @@ tryAdjustICmpImmAndPred(Register RHS, CmpInst::Predicate P, // x uge c => x ugt c - 1 // // When c is not zero. - if (C == 0) - return std::nullopt; + assert(C != 0 && "C should not be zero here!"); P = (P == CmpInst::ICMP_ULT) ? 
CmpInst::ICMP_ULE : CmpInst::ICMP_UGT; C -= 1; break; @@ -656,14 +655,13 @@ tryAdjustICmpImmAndPred(Register RHS, CmpInst::Predicate P, if (isLegalArithImmed(C)) return {{C, P}}; - auto IsMaterializableInSingleInstruction = [=](uint64_t Imm) { + auto NumberOfInstrToLoadImm = [=](uint64_t Imm) { SmallVector Insn; AArch64_IMM::expandMOVImm(Imm, 32, Insn); - return Insn.size() == 1; + return Insn.size(); }; - if (!IsMaterializableInSingleInstruction(OriginalC) && - IsMaterializableInSingleInstruction(C)) + if (NumberOfInstrToLoadImm(OriginalC) > NumberOfInstrToLoadImm(C)) return {{C, P}}; return std::nullopt; diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp index 14547e3c4580..917dbdfbbc9e 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp @@ -35,7 +35,6 @@ #include "llvm/MC/MCTargetOptions.h" #include "llvm/MC/MCWinCOFFStreamer.h" #include "llvm/Support/AArch64BuildAttributes.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Support/raw_ostream.h" diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp index 3c8b5712c1f0..54b58e948daf 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp @@ -1017,14 +1017,22 @@ bool AArch64InstPrinter::printSysAlias(const MCInst *MI, else return false; + StringRef Reg = getRegisterName(MI->getOperand(4).getReg()); + bool NotXZR = Reg != "xzr"; + + // If a mandatory is not specified in the TableGen + // (i.e. no register operand should be present), and the register value + // is not xzr/x31, then disassemble to a SYS alias instead. 
+ if (NotXZR && !NeedsReg) + return false; + std::string Str = Ins + Name; llvm::transform(Str, Str.begin(), ::tolower); O << '\t' << Str; - if (NeedsReg) { - O << ", "; - printRegName(O, MI->getOperand(4).getReg()); - } + + if (NeedsReg) + O << ", " << Reg; return true; } diff --git a/llvm/lib/Target/AArch64/SMEABIPass.cpp b/llvm/lib/Target/AArch64/SMEABIPass.cpp index 4af4d4930662..2008516885c3 100644 --- a/llvm/lib/Target/AArch64/SMEABIPass.cpp +++ b/llvm/lib/Target/AArch64/SMEABIPass.cpp @@ -15,11 +15,16 @@ #include "AArch64.h" #include "Utils/AArch64SMEAttributes.h" #include "llvm/ADT/StringRef.h" +#include "llvm/CodeGen/TargetLowering.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicsAArch64.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" +#include "llvm/IR/RuntimeLibcalls.h" +#include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/Utils/Cloning.h" using namespace llvm; @@ -33,9 +38,13 @@ struct SMEABI : public FunctionPass { bool runOnFunction(Function &F) override; + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + } + private: bool updateNewStateFunctions(Module *M, Function *F, IRBuilder<> &Builder, - SMEAttrs FnAttrs); + SMEAttrs FnAttrs, const TargetLowering &TLI); }; } // end anonymous namespace @@ -51,14 +60,16 @@ FunctionPass *llvm::createSMEABIPass() { return new SMEABI(); } //===----------------------------------------------------------------------===// // Utility function to emit a call to __arm_tpidr2_save and clear TPIDR2_EL0. 
-void emitTPIDR2Save(Module *M, IRBuilder<> &Builder, bool ZT0IsUndef = false) { +void emitTPIDR2Save(Module *M, IRBuilder<> &Builder, const TargetLowering &TLI, + bool ZT0IsUndef = false) { auto &Ctx = M->getContext(); auto *TPIDR2SaveTy = FunctionType::get(Builder.getVoidTy(), {}, /*IsVarArgs=*/false); auto Attrs = AttributeList().addFnAttribute(Ctx, "aarch64_pstate_sm_compatible"); + RTLIB::Libcall LC = RTLIB::SMEABI_TPIDR2_SAVE; FunctionCallee Callee = - M->getOrInsertFunction("__arm_tpidr2_save", TPIDR2SaveTy, Attrs); + M->getOrInsertFunction(TLI.getLibcallName(LC), TPIDR2SaveTy, Attrs); CallInst *Call = Builder.CreateCall(Callee); // If ZT0 is undefined (i.e. we're at the entry of a "new_zt0" function), mark @@ -67,8 +78,7 @@ void emitTPIDR2Save(Module *M, IRBuilder<> &Builder, bool ZT0IsUndef = false) { if (ZT0IsUndef) Call->addFnAttr(Attribute::get(Ctx, "aarch64_zt0_undef")); - Call->setCallingConv( - CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0); + Call->setCallingConv(TLI.getLibcallCallingConv(LC)); // A save to TPIDR2 should be followed by clearing TPIDR2_EL0. Function *WriteIntr = @@ -98,7 +108,8 @@ void emitTPIDR2Save(Module *M, IRBuilder<> &Builder, bool ZT0IsUndef = false) { /// interface if it does not share ZA or ZT0. /// bool SMEABI::updateNewStateFunctions(Module *M, Function *F, - IRBuilder<> &Builder, SMEAttrs FnAttrs) { + IRBuilder<> &Builder, SMEAttrs FnAttrs, + const TargetLowering &TLI) { LLVMContext &Context = F->getContext(); BasicBlock *OrigBB = &F->getEntryBlock(); Builder.SetInsertPoint(&OrigBB->front()); @@ -124,7 +135,7 @@ bool SMEABI::updateNewStateFunctions(Module *M, Function *F, // Create a call __arm_tpidr2_save, which commits the lazy save. Builder.SetInsertPoint(&SaveBB->back()); - emitTPIDR2Save(M, Builder, /*ZT0IsUndef=*/FnAttrs.isNewZT0()); + emitTPIDR2Save(M, Builder, TLI, /*ZT0IsUndef=*/FnAttrs.isNewZT0()); // Enable pstate.za at the start of the function. 
Builder.SetInsertPoint(&OrigBB->front()); @@ -172,10 +183,14 @@ bool SMEABI::runOnFunction(Function &F) { if (F.isDeclaration() || F.hasFnAttribute("aarch64_expanded_pstate_za")) return false; + const TargetMachine &TM = + getAnalysis().getTM(); + const TargetLowering &TLI = *TM.getSubtargetImpl(F)->getTargetLowering(); + bool Changed = false; SMEAttrs FnAttrs(F); if (FnAttrs.isNewZA() || FnAttrs.isNewZT0()) - Changed |= updateNewStateFunctions(M, &F, Builder, FnAttrs); + Changed |= updateNewStateFunctions(M, &F, Builder, FnAttrs, TLI); return Changed; } diff --git a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp index 271094f935e0..dd6fa167c6f4 100644 --- a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp +++ b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp @@ -7,17 +7,14 @@ //===----------------------------------------------------------------------===// #include "AArch64SMEAttributes.h" +#include "AArch64ISelLowering.h" #include "llvm/IR/InstrTypes.h" +#include "llvm/IR/RuntimeLibcalls.h" #include using namespace llvm; -void SMEAttrs::set(unsigned M, bool Enable) { - if (Enable) - Bitmask |= M; - else - Bitmask &= ~M; - +void SMEAttrs::validate() const { // Streaming Mode Attrs assert(!(hasStreamingInterface() && hasStreamingCompatibleInterface()) && "SM_Enabled and SM_Compatible are mutually exclusive"); @@ -77,19 +74,36 @@ SMEAttrs::SMEAttrs(const AttributeList &Attrs) { Bitmask |= encodeZT0State(StateValue::New); } -void SMEAttrs::addKnownFunctionAttrs(StringRef FuncName) { +void SMEAttrs::addKnownFunctionAttrs(StringRef FuncName, + const AArch64TargetLowering &TLI) { + RTLIB::LibcallImpl Impl = TLI.getSupportedLibcallImpl(FuncName); + if (Impl == RTLIB::Unsupported) + return; unsigned KnownAttrs = SMEAttrs::Normal; - if (FuncName == "__arm_tpidr2_save" || FuncName == "__arm_sme_state") - KnownAttrs |= (SMEAttrs::SM_Compatible | SMEAttrs::SME_ABI_Routine); - if (FuncName == 
"__arm_tpidr2_restore") + RTLIB::Libcall LC = RTLIB::RuntimeLibcallsInfo::getLibcallFromImpl(Impl); + switch (LC) { + case RTLIB::SMEABI_SME_STATE: + case RTLIB::SMEABI_TPIDR2_SAVE: + case RTLIB::SMEABI_GET_CURRENT_VG: + case RTLIB::SMEABI_SME_STATE_SIZE: + case RTLIB::SMEABI_SME_SAVE: + case RTLIB::SMEABI_SME_RESTORE: + KnownAttrs |= SMEAttrs::SM_Compatible | SMEAttrs::SME_ABI_Routine; + break; + case RTLIB::SMEABI_ZA_DISABLE: + case RTLIB::SMEABI_TPIDR2_RESTORE: KnownAttrs |= SMEAttrs::SM_Compatible | encodeZAState(StateValue::In) | SMEAttrs::SME_ABI_Routine; - if (FuncName == "__arm_sc_memcpy" || FuncName == "__arm_sc_memset" || - FuncName == "__arm_sc_memmove" || FuncName == "__arm_sc_memchr") + break; + case RTLIB::SC_MEMCPY: + case RTLIB::SC_MEMMOVE: + case RTLIB::SC_MEMSET: + case RTLIB::SC_MEMCHR: KnownAttrs |= SMEAttrs::SM_Compatible; - if (FuncName == "__arm_sme_save" || FuncName == "__arm_sme_restore" || - FuncName == "__arm_sme_state_size") - KnownAttrs |= SMEAttrs::SM_Compatible | SMEAttrs::SME_ABI_Routine; + break; + default: + break; + } set(KnownAttrs); } @@ -110,11 +124,11 @@ bool SMECallAttrs::requiresSMChange() const { return true; } -SMECallAttrs::SMECallAttrs(const CallBase &CB) +SMECallAttrs::SMECallAttrs(const CallBase &CB, const AArch64TargetLowering *TLI) : CallerFn(*CB.getFunction()), CalledFn(SMEAttrs::Normal), Callsite(CB.getAttributes()), IsIndirect(CB.isIndirectCall()) { if (auto *CalledFunction = CB.getCalledFunction()) - CalledFn = SMEAttrs(*CalledFunction, SMEAttrs::InferAttrsFromName::Yes); + CalledFn = SMEAttrs(*CalledFunction, TLI); // FIXME: We probably should not allow SME attributes on direct calls but // clang duplicates streaming mode attributes at each callsite. 
diff --git a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h index f1be0ecbee7e..48f9da02d318 100644 --- a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h +++ b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h @@ -13,6 +13,8 @@ namespace llvm { +class AArch64TargetLowering; + class Function; class CallBase; class AttributeList; @@ -48,19 +50,27 @@ public: CallSiteFlags_Mask = ZT0_Undef }; - enum class InferAttrsFromName { No, Yes }; - SMEAttrs() = default; SMEAttrs(unsigned Mask) { set(Mask); } - SMEAttrs(const Function &F, InferAttrsFromName Infer = InferAttrsFromName::No) + SMEAttrs(const Function &F, const AArch64TargetLowering *TLI = nullptr) : SMEAttrs(F.getAttributes()) { - if (Infer == InferAttrsFromName::Yes) - addKnownFunctionAttrs(F.getName()); + if (TLI) + addKnownFunctionAttrs(F.getName(), *TLI); } SMEAttrs(const AttributeList &L); - SMEAttrs(StringRef FuncName) { addKnownFunctionAttrs(FuncName); }; + SMEAttrs(StringRef FuncName, const AArch64TargetLowering &TLI) { + addKnownFunctionAttrs(FuncName, TLI); + }; - void set(unsigned M, bool Enable = true); + void set(unsigned M, bool Enable = true) { + if (Enable) + Bitmask |= M; + else + Bitmask &= ~M; +#ifndef NDEBUG + validate(); +#endif + } // Interfaces to query PSTATE.SM bool hasStreamingBody() const { return Bitmask & SM_Body; } @@ -146,7 +156,9 @@ public: } private: - void addKnownFunctionAttrs(StringRef FuncName); + void addKnownFunctionAttrs(StringRef FuncName, + const AArch64TargetLowering &TLI); + void validate() const; }; /// SMECallAttrs is a utility class to hold the SMEAttrs for a callsite. It has @@ -163,7 +175,7 @@ public: SMEAttrs Callsite = SMEAttrs::Normal) : CallerFn(Caller), CalledFn(Callee), Callsite(Callsite) {} - SMECallAttrs(const CallBase &CB); + SMECallAttrs(const CallBase &CB, const AArch64TargetLowering *TLI); SMEAttrs &caller() { return CallerFn; } SMEAttrs &callee() { return IsIndirect ? 
Callsite : CalledFn; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 6a02995fc9cb..04c4d000547a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1196,18 +1196,25 @@ void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) { void AMDGPUDAGToDAGISel::SelectMUL_LOHI(SDNode *N) { SDLoc SL(N); bool Signed = N->getOpcode() == ISD::SMUL_LOHI; + SDVTList VTList; unsigned Opc; - if (Subtarget->hasMADIntraFwdBug()) - Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64 - : AMDGPU::V_MAD_U64_U32_gfx11_e64; - else - Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64; + if (Subtarget->hasMadU64U32NoCarry()) { + VTList = CurDAG->getVTList(MVT::i64); + Opc = Signed ? AMDGPU::V_MAD_NC_I64_I32_e64 : AMDGPU::V_MAD_NC_U64_U32_e64; + } else { + VTList = CurDAG->getVTList(MVT::i64, MVT::i1); + if (Subtarget->hasMADIntraFwdBug()) { + Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64 + : AMDGPU::V_MAD_U64_U32_gfx11_e64; + } else { + Opc = Signed ? 
AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64; + } + } SDValue Zero = CurDAG->getTargetConstant(0, SL, MVT::i64); SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1); SDValue Ops[] = {N->getOperand(0), N->getOperand(1), Zero, Clamp}; - SDNode *Mad = CurDAG->getMachineNode( - Opc, SL, CurDAG->getVTList(MVT::i64, MVT::i1), Ops); + SDNode *Mad = CurDAG->getMachineNode(Opc, SL, VTList, Ops); if (!SDValue(N, 0).use_empty()) { SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32); SDNode *Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL, diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index b88891ac4894..600a13096f55 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -26,6 +26,7 @@ #include "llvm/ADT/ScopeExit.h" #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" +#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/GlobalISel/Utils.h" @@ -137,6 +138,14 @@ static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { }; } +// Retrieves the scalar type that's the same size as the mem desc +static LegalizeMutation getScalarTypeFromMemDesc(unsigned TypeIdx) { + return [=](const LegalityQuery &Query) { + unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); + return std::make_pair(TypeIdx, LLT::scalar(MemSize)); + }; +} + // Increase the number of vector elements to reach the next legal RegClass. 
static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) { return [=](const LegalityQuery &Query) { @@ -384,6 +393,16 @@ static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) { }; } +// If we have a truncating store or an extending load with a data size larger +// than 32-bits and mem location is a power of 2 +static LegalityPredicate isTruncStoreToSizePowerOf2(unsigned TypeIdx) { + return [=](const LegalityQuery &Query) { + unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits(); + return isWideScalarExtLoadTruncStore(TypeIdx)(Query) && + isPowerOf2_64(MemSize); + }; +} + // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we // handle some operations by just promoting the register during // selection. There are also d16 loads on GFX9+ which preserve the high bits. @@ -1635,11 +1654,12 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, // May need relegalization for the scalars. return std::pair(0, EltTy); }) - .minScalar(0, S32) - .narrowScalarIf(isWideScalarExtLoadTruncStore(0), changeTo(0, S32)) - .widenScalarToNextPow2(0) - .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)) - .lower(); + .minScalar(0, S32) + .narrowScalarIf(isTruncStoreToSizePowerOf2(0), + getScalarTypeFromMemDesc(0)) + .widenScalarToNextPow2(0) + .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)) + .lower(); } // FIXME: Unaligned accesses not lowered. 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp index e65dd1b04cc4..dfe7c53aaca0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMemoryUtils.cpp @@ -384,7 +384,7 @@ bool isClobberedInFunction(const LoadInst *Load, MemorySSA *MSSA, AAResults *AA) { MemorySSAWalker *Walker = MSSA->getWalker(); SmallVector WorkList{Walker->getClobberingMemoryAccess(Load)}; - SmallSet Visited; + SmallPtrSet Visited; MemoryLocation Loc(MemoryLocation::get(Load)); LLVM_DEBUG(dbgs() << "Checking clobbering of: " << *Load << '\n'); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp index 3a3751892c8b..28d5400fd180 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp @@ -134,8 +134,8 @@ static std::pair getMemoryInstrPtrAndType( bool AMDGPUPerfHint::isIndirectAccess(const Instruction *Inst) const { LLVM_DEBUG(dbgs() << "[isIndirectAccess] " << *Inst << '\n'); - SmallSet WorkSet; - SmallSet Visited; + SmallPtrSet WorkSet; + SmallPtrSet Visited; if (const Value *MO = getMemoryInstrPtrAndType(Inst).first) { if (isGlobalAddr(MO)) WorkSet.insert(MO); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp index 984c1ee89309..a386fe621a55 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPreloadKernelArguments.cpp @@ -37,6 +37,11 @@ static cl::opt KernargPreloadCount( "amdgpu-kernarg-preload-count", cl::desc("How many kernel arguments to preload onto SGPRs"), cl::init(0)); +static cl::opt + EnableKernargPreload("amdgpu-kernarg-preload", + cl::desc("Enable preload kernel arguments to SGPRs"), + cl::init(true)); + namespace { class AMDGPUPreloadKernelArgumentsLegacy : public ModulePass { @@ -275,6 +280,9 @@ 
AMDGPUPreloadKernelArgumentsLegacy::AMDGPUPreloadKernelArgumentsLegacy( : ModulePass(ID), TM(TM) {} static bool markKernelArgsAsInreg(Module &M, const TargetMachine &TM) { + if (!EnableKernargPreload) + return false; + SmallVector FunctionsToErase; bool Changed = false; for (auto &F : M) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp index bab83483f3de..20b5fd94aba9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteAGPRCopyMFMA.cpp @@ -147,6 +147,9 @@ bool AMDGPURewriteAGPRCopyMFMAImpl::run(MachineFunction &MF) const { // TODO: Test multiple uses for (VNInfo *VNI : LI.vnis()) { + if (VNI->isPHIDef() || VNI->isUnused()) + continue; + MachineInstr *DefMI = LIS.getInstructionFromIndex(VNI->def); // TODO: Handle SplitKit produced copy bundles for partially defined diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp index b60ded33a4ac..56aa3f6db83a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp @@ -195,7 +195,7 @@ bool AMDGPUSetWavePriority::run(MachineFunction &MF) { // Lower the priority on edges where control leaves blocks from which // the VMEM loads are reachable. 
- SmallSet PriorityLoweringBlocks; + SmallPtrSet PriorityLoweringBlocks; for (MachineBasicBlock &MBB : MF) { if (MBBInfos[&MBB].MayReachVMEMLoad) { if (MBB.succ_empty()) diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 1f291ce5c534..a3b64aee297b 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -1202,6 +1202,12 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) { fixRequiredExportPriority(MI); if (ST.requiresWaitIdleBeforeGetReg()) fixGetRegWaitIdle(MI); + if (ST.hasDsAtomicAsyncBarrierArriveB64PipeBug()) + fixDsAtomicAsyncBarrierArriveB64(MI); + if (ST.hasScratchBaseForwardingHazard()) + fixScratchBaseForwardingHazard(MI); + if (ST.setRegModeNeedsVNOPs()) + fixSetRegMode(MI); } static bool isVCmpXWritesExec(const SIInstrInfo &TII, const SIRegisterInfo &TRI, @@ -1352,6 +1358,9 @@ bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) { return (Decoded.DsCnt == 0); } default: + assert((!SIInstrInfo::isWaitcnt(MI.getOpcode()) || + MI.getOpcode() == AMDGPU::S_WAIT_IDLE) && + "unexpected wait count instruction"); // SOPP instructions cannot mitigate the hazard. 
if (TII->isSOPP(MI)) return false; @@ -1733,7 +1742,7 @@ bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) { BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_WAITCNT_DEPCTR)) - .addImm(0x0fff); + .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0)); return true; } @@ -1783,7 +1792,7 @@ bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) { if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) || (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && - I.getOperand(0).getImm() == 0x0fff)) + AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0)) return HazardExpired; // Track registers writes @@ -2241,19 +2250,7 @@ int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) { if (WaitStates >= 3 || SIInstrInfo::isVALU(MI)) return true; - switch (MI.getOpcode()) { - case AMDGPU::S_WAITCNT: - case AMDGPU::S_WAITCNT_VSCNT: - case AMDGPU::S_WAITCNT_VMCNT: - case AMDGPU::S_WAITCNT_EXPCNT: - case AMDGPU::S_WAITCNT_LGKMCNT: - case AMDGPU::S_WAIT_IDLE: - return true; - default: - break; - } - - return false; + return SIInstrInfo::isWaitcnt(MI.getOpcode()); }; return FPAtomicToDenormModeWaitStates - @@ -3451,3 +3448,104 @@ bool GCNHazardRecognizer::fixGetRegWaitIdle(MachineInstr *MI) { .addImm(0); return true; } + +bool GCNHazardRecognizer::fixDsAtomicAsyncBarrierArriveB64(MachineInstr *MI) { + if (MI->getOpcode() != AMDGPU::DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64) + return false; + + const SIInstrInfo *TII = ST.getInstrInfo(); + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + TII->get(AMDGPU::S_WAITCNT_DEPCTR)) + .addImm(0xFFE3); + BuildMI(*MI->getParent(), std::next(MI->getIterator()), MI->getDebugLoc(), + TII->get(AMDGPU::S_WAITCNT_DEPCTR)) + .addImm(0xFFE3); + + return true; +} + +bool GCNHazardRecognizer::fixScratchBaseForwardingHazard(MachineInstr *MI) { + // No reason to check this in pre-RA scheduling, SGPRs have to be allocated + // for hazard to trigger. 
+ if (!IsHazardRecognizerMode) + return false; + + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + const SIInstrInfo *TII = ST.getInstrInfo(); + // Hazard expires after 10 SGPR writes by SALU or 8 SGPR writes by VALU. + const int FlatScrBaseWaitStates = 10; + + bool ReadsFlatScrLo = + MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, TRI); + bool ReadsFlatScrHi = + MI->readsRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, TRI); + if (isSGetReg(MI->getOpcode())) { + switch (getHWReg(TII, *MI)) { + default: + break; + case AMDGPU::Hwreg::ID_FLAT_SCR_LO: + ReadsFlatScrLo = true; + break; + case AMDGPU::Hwreg::ID_FLAT_SCR_HI: + ReadsFlatScrHi = true; + break; + } + } + + const MachineRegisterInfo &MRI = MF.getRegInfo(); + + auto IsRegDefHazard = [&](Register Reg) -> bool { + DenseSet Visited; + auto IsHazardFn = [TRI, Reg](const MachineInstr &MI) { + return MI.modifiesRegister(Reg, TRI); + }; + + // This literally abuses the idea of waitstates. Instead of waitstates it + // returns 1 for SGPR written and 0 otherwise. 
+ auto IsSGPRDef = [TII, TRI, &MRI](const MachineInstr &MI) -> unsigned { + if (!TII->isSALU(MI) && !TII->isVALU(MI)) + return 0; + for (const MachineOperand &MO : MI.all_defs()) { + if (TRI->isSGPRReg(MRI, MO.getReg())) + return 1; + } + return 0; + }; + + auto IsExpiredFn = [=](const MachineInstr &MI, int SgprWrites) { + if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR) { + unsigned Wait = MI.getOperand(0).getImm(); + if (AMDGPU::DepCtr::decodeFieldSaSdst(Wait) == 0 && + AMDGPU::DepCtr::decodeFieldVaSdst(Wait) == 0) + return true; + } + return SgprWrites >= FlatScrBaseWaitStates; + }; + + return ::getWaitStatesSince( + IsHazardFn, MI->getParent(), std::next(MI->getReverseIterator()), + 0, IsExpiredFn, Visited, IsSGPRDef) < FlatScrBaseWaitStates; + }; + + if ((!ReadsFlatScrLo || MRI.isConstantPhysReg(AMDGPU::SGPR102) || + !IsRegDefHazard(AMDGPU::SGPR102)) && + (!ReadsFlatScrHi || MRI.isConstantPhysReg(AMDGPU::SGPR103) || + !IsRegDefHazard(AMDGPU::SGPR103))) + return false; + + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + TII->get(AMDGPU::S_WAITCNT_DEPCTR)) + .addImm(AMDGPU::DepCtr::encodeFieldVaSdst( + AMDGPU::DepCtr::encodeFieldSaSdst(0), 0)); + return true; +} + +bool GCNHazardRecognizer::fixSetRegMode(MachineInstr *MI) { + if (!isSSetReg(MI->getOpcode()) || + MI->getOperand(1).getImm() != AMDGPU::Hwreg::ID_MODE) + return false; + + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32)); + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::V_NOP_e32)); + return true; +} diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h index a078f50219c3..67beffadc091 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h @@ -111,6 +111,9 @@ private: bool fixVALUMaskWriteHazard(MachineInstr *MI); bool fixRequiredExportPriority(MachineInstr *MI); bool fixGetRegWaitIdle(MachineInstr *MI); + bool 
fixDsAtomicAsyncBarrierArriveB64(MachineInstr *MI); + bool fixScratchBaseForwardingHazard(MachineInstr *MI); + bool fixSetRegMode(MachineInstr *MI); int checkMAIHazards(MachineInstr *MI); int checkMAIHazards908(MachineInstr *MI); diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 92de024cc6fc..2a8385df3f93 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -1345,6 +1345,10 @@ public: bool hasVALUReadSGPRHazard() const { return GFX12Insts && !GFX1250Insts; } + bool setRegModeNeedsVNOPs() const { + return GFX1250Insts && getGeneration() == GFX12; + } + /// Return if operations acting on VGPR tuples require even alignment. bool needsAlignedVGPRs() const { return GFX90AInsts || GFX1250Insts; } @@ -1815,6 +1819,18 @@ public: // to the same register. return false; } + + // DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 shall not be claused with anything + // and must be surrounded by S_WAIT_ALU(0xFFE3). + bool hasDsAtomicAsyncBarrierArriveB64PipeBug() const { + return getGeneration() == GFX12; + } + + // Requires s_wait_alu(0) after s102/s103 write and src_flat_scratch_base + // read. 
+ bool hasScratchBaseForwardingHazard() const { + return GFX1250Insts && getGeneration() == GFX12; + } }; class GCNUserSGPRUsageInfo { diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index 1f35e92151bf..e20581d76fcd 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -26,7 +26,6 @@ #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/AMDGPUMetadata.h" #include "llvm/Support/AMDHSAKernelDescriptor.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/FormattedStream.h" #include "llvm/TargetParser/TargetParser.h" @@ -886,7 +885,7 @@ void AMDGPUTargetELFStreamer::emitAMDGPULDS(MCSymbol *Symbol, unsigned Size, if (!SymbolELF->isBindingSet()) SymbolELF->setBinding(ELF::STB_GLOBAL); - if (SymbolELF->declareCommon(Size, Alignment, true)) { + if (SymbolELF->declareCommon(Size, Alignment)) { report_fatal_error("Symbol: " + Symbol->getName() + " redeclared as different type"); } diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index f018f77bc83e..dce4e6f99300 100644 --- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -460,7 +460,7 @@ static bool hoistAndMergeSGPRInits(unsigned Reg, // List of clobbering instructions. SmallVector Clobbers; // List of instructions marked for deletion. 
- SmallSet MergedInstrs; + SmallPtrSet MergedInstrs; bool Changed = false; @@ -808,7 +808,7 @@ bool SIFixSGPRCopies::run(MachineFunction &MF) { void SIFixSGPRCopies::processPHINode(MachineInstr &MI) { bool AllAGPRUses = true; SetVector worklist; - SmallSet Visited; + SmallPtrSet Visited; SetVector PHIOperands; worklist.insert(&MI); Visited.insert(&MI); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 18f0e5b9b56b..5cbf6f5ab045 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1056,7 +1056,7 @@ public: } } - bool isWaitcnt(unsigned Opcode) const { + static bool isWaitcnt(unsigned Opcode) { switch (getNonSoftWaitcntOpcode(Opcode)) { case AMDGPU::S_WAITCNT: case AMDGPU::S_WAITCNT_VSCNT: diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index bd5dfa92a8e4..6488fa3dacfb 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -3056,6 +3056,8 @@ def : GCNPat< } } // AddedComplexity = 1 +foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in +let True16Predicate = p in { def : GCNPat< (i32 (DivergentUnaryFrag i16:$src)), (V_AND_B32_e64 (S_MOV_B32 (i32 0xffff)), $src) @@ -3071,6 +3073,26 @@ def : GCNPat< def : GCNPat< (i32 (zext (i16 (bitconvert fp16_zeros_high_16bits:$src)))), (COPY VSrc_b16:$src)>; +} + +let True16Predicate = UseRealTrue16Insts in { +def : GCNPat< + (i32 (DivergentUnaryFrag i16:$src)), + (REG_SEQUENCE VGPR_32, $src, lo16, (V_MOV_B16_t16_e64 0, (i16 0), 0), hi16) +>; + +def : GCNPat< + (i64 (DivergentUnaryFrag i16:$src)), + (REG_SEQUENCE VReg_64, + (REG_SEQUENCE VGPR_32, $src, lo16, (V_MOV_B16_t16_e64 0, (i16 0), 0), hi16), sub0, + (S_MOV_B32 (i32 0)), sub1) +>; + +def : GCNPat< + (i32 (zext (i16 (bitconvert fp16_zeros_high_16bits:$src)))), + (REG_SEQUENCE VGPR_32, $src, lo16, (V_MOV_B16_t16_e64 0, (i16 0), 0), hi16) +>; +} def : GCNPat < (i32 (trunc i64:$a)), diff 
--git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp index f7a9a584a6b5..e97536d36bab 100644 --- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -81,7 +81,7 @@ private: MachineRegisterInfo *MRI = nullptr; SetVector LoweredEndCf; DenseSet LoweredIf; - SmallSet KillBlocks; + SmallPtrSet KillBlocks; SmallSet RecomputeRegs; const TargetRegisterClass *BoolRC = nullptr; @@ -460,7 +460,7 @@ MachineBasicBlock::iterator SILowerControlFlow::skipIgnoreExecInstsTrivialSucc( MachineBasicBlock &MBB, MachineBasicBlock::iterator It) const { - SmallSet Visited; + SmallPtrSet Visited; MachineBasicBlock *B = &MBB; do { if (!Visited.insert(B).second) diff --git a/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp b/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp index ef690838f0f3..c53e2158f4c7 100644 --- a/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp +++ b/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp @@ -109,7 +109,7 @@ namespace { /// NewWaterList - The subset of WaterList that was created since the /// previous iteration by inserting unconditional branches. 
- SmallSet NewWaterList; + SmallPtrSet NewWaterList; using water_iterator = std::vector::iterator; diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp index 8ee3a2d26a59..a5266a9df318 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp @@ -20,7 +20,6 @@ #include "llvm/MC/MCSection.h" #include "llvm/MC/MCSymbolMachO.h" #include "llvm/MC/MCValue.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" using namespace llvm; diff --git a/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp b/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp index 0b4e7dfebe36..5eeb4fe99548 100644 --- a/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp +++ b/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp @@ -922,7 +922,7 @@ bool MVETPAndVPTOptimisations::ReplaceConstByVPNOTs(MachineBasicBlock &MBB, // the function. unsigned LastVPTImm = 0; Register LastVPTReg = 0; - SmallSet DeadInstructions; + SmallPtrSet DeadInstructions; for (MachineInstr &Instr : MBB.instrs()) { // Look for predicated MVE instructions. 
diff --git a/llvm/lib/Target/AVR/AVRTargetTransformInfo.cpp b/llvm/lib/Target/AVR/AVRTargetTransformInfo.cpp index 4dd8660589cf..b1ef38047c07 100644 --- a/llvm/lib/Target/AVR/AVRTargetTransformInfo.cpp +++ b/llvm/lib/Target/AVR/AVRTargetTransformInfo.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "AVRTargetTransformInfo.h" -#include "llvm/CodeGen/CostTable.h" using namespace llvm; diff --git a/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp b/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp index e55d9b227d1c..7885d93cbad9 100644 --- a/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp +++ b/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp @@ -116,7 +116,7 @@ class CSKYConstantIslands : public MachineFunctionPass { /// NewWaterList - The subset of WaterList that was created since the /// previous iteration by inserting unconditional branches. - SmallSet NewWaterList; + SmallPtrSet NewWaterList; using water_iterator = std::vector::iterator; diff --git a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp index 22cff7c80fa0..bcddb540d35d 100644 --- a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp +++ b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp @@ -526,6 +526,9 @@ DecodeStatus HexagonDisassembler::getSingleInstruction(MCInst &MI, MCInst &MCB, MI.insert(MI.begin() + 1, MCOperand::createExpr(MCConstantExpr::create(-1, getContext()))); break; + case Hexagon::Y4_crswap10: + MI.addOperand(MCOperand::createReg(Hexagon::SGP1_0)); + break; default: break; } diff --git a/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp b/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp index a9201460d8e2..b2218abcaaa3 100644 --- a/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp +++ b/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp @@ -1273,7 +1273,7 @@ void HexagonGenInsert::selectCandidates() { for (unsigned R = AllRMs.find_first(); R; R = 
AllRMs.find_next(R)) { using use_iterator = MachineRegisterInfo::use_nodbg_iterator; - using InstrSet = SmallSet; + using InstrSet = SmallPtrSet; InstrSet UIs; // Count as the number of instructions in which R is used, not the diff --git a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp index c34eecd3fcb0..a3717bb97d14 100644 --- a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp +++ b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp @@ -2289,7 +2289,7 @@ CleanupAndExit: // the instructions in Insts are removed. bool HexagonLoopIdiomRecognize::coverLoop(Loop *L, SmallVectorImpl &Insts) const { - SmallSet LoopBlocks; + SmallPtrSet LoopBlocks; LoopBlocks.insert_range(L->blocks()); SetVector Worklist(llvm::from_range, Insts); diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp b/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp index ecc1b5d2ebe3..6a05b5ab2c21 100644 --- a/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp +++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp @@ -445,8 +445,8 @@ void HexagonSubtarget::adjustSchedDependency( const HexagonInstrInfo *QII = getInstrInfo(); // Instructions with .new operands have zero latency. - SmallSet ExclSrc; - SmallSet ExclDst; + SmallPtrSet ExclSrc; + SmallPtrSet ExclDst; if (QII->canExecuteInBundle(*SrcInst, *DstInst) && isBestZeroLatency(Src, Dst, QII, ExclSrc, ExclDst)) { Dep.setLatency(0); @@ -630,9 +630,9 @@ static SUnit *getZeroLatency(SUnit *N, SmallVector &Deps) { // together with a zero latency. Only one dependence should have a zero // latency. If there are multiple choices, choose the best, and change // the others, if needed. 
-bool HexagonSubtarget::isBestZeroLatency(SUnit *Src, SUnit *Dst, - const HexagonInstrInfo *TII, SmallSet &ExclSrc, - SmallSet &ExclDst) const { +bool HexagonSubtarget::isBestZeroLatency( + SUnit *Src, SUnit *Dst, const HexagonInstrInfo *TII, + SmallPtrSet &ExclSrc, SmallPtrSet &ExclDst) const { MachineInstr &SrcInst = *Src->getInstr(); MachineInstr &DstInst = *Dst->getInstr(); diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.h b/llvm/lib/Target/Hexagon/HexagonSubtarget.h index 41555db4ac66..b111471a9696 100644 --- a/llvm/lib/Target/Hexagon/HexagonSubtarget.h +++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.h @@ -366,7 +366,8 @@ private: void restoreLatency(SUnit *Src, SUnit *Dst) const; void changeLatency(SUnit *Src, SUnit *Dst, unsigned Lat) const; bool isBestZeroLatency(SUnit *Src, SUnit *Dst, const HexagonInstrInfo *TII, - SmallSet &ExclSrc, SmallSet &ExclDst) const; + SmallPtrSet &ExclSrc, + SmallPtrSet &ExclDst) const; }; } // end namespace llvm diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp index 039ef4f543a1..6b8d7f172d7f 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp @@ -32,7 +32,6 @@ #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCSymbolELF.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/HexagonAttributes.h" diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp index ca5d27d54bb8..338134ffcde6 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp @@ -143,8 +143,6 @@ static void fixupLeb128(MCContext &Ctx, const MCFixup &Fixup, uint8_t *Data, void 
LoongArchAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, const MCValue &Target, uint8_t *Data, uint64_t Value, bool IsResolved) { - if (IsResolved && shouldForceRelocation(Fixup, Target)) - IsResolved = false; IsResolved = addReloc(F, Fixup, Target, Value, IsResolved); if (!Value) return; // Doesn't change encoding. @@ -176,20 +174,6 @@ void LoongArchAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup, } } -bool LoongArchAsmBackend::shouldForceRelocation(const MCFixup &Fixup, - const MCValue &Target) { - switch (Fixup.getKind()) { - default: - return STI.hasFeature(LoongArch::FeatureRelax); - case FK_Data_1: - case FK_Data_2: - case FK_Data_4: - case FK_Data_8: - case FK_Data_leb128: - return !Target.isAbsolute(); - } -} - static inline std::pair getRelocPairForSize(unsigned Size) { switch (Size) { @@ -216,10 +200,19 @@ getRelocPairForSize(unsigned Size) { // size, the fixup encodes MaxBytesToEmit in the higher bits and references a // per-section marker symbol. bool LoongArchAsmBackend::relaxAlign(MCFragment &F, unsigned &Size) { + // Alignments before the first linker-relaxable instruction have fixed sizes + // and do not require relocations. Alignments after a linker-relaxable + // instruction require a relocation, even if the STI specifies norelax. + // + // firstLinkerRelaxable is the layout order within the subsection, which may + // be smaller than the section's order. Therefore, alignments in a + // lower-numbered subsection may be unnecessarily treated as linker-relaxable. + auto *Sec = F.getParent(); + if (F.getLayoutOrder() <= Sec->firstLinkerRelaxable()) + return false; + // Use default handling unless linker relaxation is enabled and the // MaxBytesToEmit >= the nop size. 
- if (!F.getSubtargetInfo()->hasFeature(LoongArch::FeatureRelax)) - return false; const unsigned MinNopLen = 4; unsigned MaxBytesToEmit = F.getAlignMaxBytesToEmit(); if (MaxBytesToEmit < MinNopLen) @@ -254,8 +247,6 @@ bool LoongArchAsmBackend::relaxAlign(MCFragment &F, unsigned &Size) { MCFixup::create(0, Expr, FirstLiteralRelocationKind + ELF::R_LARCH_ALIGN); F.setVarFixups({Fixup}); F.setLinkerRelaxable(); - if (!F.getParent()->isLinkerRelaxable()) - F.getParent()->setFirstLinkerRelaxable(F.getLayoutOrder()); return true; } @@ -484,9 +475,16 @@ bool LoongArchAsmBackend::addReloc(const MCFragment &F, const MCFixup &Fixup, return false; } - IsResolved = Fallback(); // If linker relaxation is enabled and supported by the current relocation, - // append a RELAX relocation. + // generate a relocation and then append a RELAX. + if (Fixup.isLinkerRelaxable()) + IsResolved = false; + if (IsResolved && Fixup.isPCRel()) + IsResolved = isPCRelFixupResolved(Target.getAddSym(), F); + + if (!IsResolved) + Asm->getWriter().recordRelocation(F, Fixup, Target, FixedValue); + if (Fixup.isLinkerRelaxable()) { auto FA = MCFixup::create(Fixup.getOffset(), nullptr, ELF::R_LARCH_RELAX); Asm->getWriter().recordRelocation(F, FA, MCValue::get(nullptr), @@ -498,8 +496,7 @@ bool LoongArchAsmBackend::addReloc(const MCFragment &F, const MCFixup &Fixup, std::unique_ptr LoongArchAsmBackend::createObjectTargetWriter() const { - return createLoongArchELFObjectWriter( - OSABI, Is64Bit, STI.hasFeature(LoongArch::FeatureRelax)); + return createLoongArchELFObjectWriter(OSABI, Is64Bit); } MCAsmBackend *llvm::createLoongArchAsmBackend(const Target &T, diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h index 1f1360119edb..f79d3aa48c54 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h @@ -44,8 +44,6 @@ public: void applyFixup(const 
MCFragment &, const MCFixup &, const MCValue &Target, uint8_t *Data, uint64_t Value, bool IsResolved) override; - bool shouldForceRelocation(const MCFixup &Fixup, const MCValue &Target); - std::optional getFixupKind(StringRef Name) const override; MCFixupKindInfo getFixupKindInfo(MCFixupKind Kind) const override; diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp index 7e021e486836..7d5456555045 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchELFObjectWriter.cpp @@ -21,26 +21,23 @@ using namespace llvm; namespace { class LoongArchELFObjectWriter : public MCELFObjectTargetWriter { public: - LoongArchELFObjectWriter(uint8_t OSABI, bool Is64Bit, bool EnableRelax); + LoongArchELFObjectWriter(uint8_t OSABI, bool Is64Bit); ~LoongArchELFObjectWriter() override; bool needsRelocateWithSymbol(const MCValue &, unsigned Type) const override { - return EnableRelax; + return true; } protected: unsigned getRelocType(const MCFixup &, const MCValue &, bool IsPCRel) const override; - bool EnableRelax; }; } // end namespace -LoongArchELFObjectWriter::LoongArchELFObjectWriter(uint8_t OSABI, bool Is64Bit, - bool EnableRelax) +LoongArchELFObjectWriter::LoongArchELFObjectWriter(uint8_t OSABI, bool Is64Bit) : MCELFObjectTargetWriter(Is64Bit, OSABI, ELF::EM_LOONGARCH, - /*HasRelocationAddend=*/true), - EnableRelax(EnableRelax) {} + /*HasRelocationAddend=*/true) {} LoongArchELFObjectWriter::~LoongArchELFObjectWriter() {} @@ -103,6 +100,6 @@ unsigned LoongArchELFObjectWriter::getRelocType(const MCFixup &Fixup, } std::unique_ptr -llvm::createLoongArchELFObjectWriter(uint8_t OSABI, bool Is64Bit, bool Relax) { - return std::make_unique(OSABI, Is64Bit, Relax); +llvm::createLoongArchELFObjectWriter(uint8_t OSABI, bool Is64Bit) { + return std::make_unique(OSABI, Is64Bit); } diff --git 
a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.h index bb05baa9b717..ab35a0096c8a 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.h +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchMCTargetDesc.h @@ -36,7 +36,7 @@ MCAsmBackend *createLoongArchAsmBackend(const Target &T, const MCTargetOptions &Options); std::unique_ptr -createLoongArchELFObjectWriter(uint8_t OSABI, bool Is64Bit, bool Relax); +createLoongArchELFObjectWriter(uint8_t OSABI, bool Is64Bit); } // end namespace llvm diff --git a/llvm/lib/Target/M68k/M68kInstrData.td b/llvm/lib/Target/M68k/M68kInstrData.td index f4ed62720ff9..c5b7ae332822 100644 --- a/llvm/lib/Target/M68k/M68kInstrData.td +++ b/llvm/lib/Target/M68k/M68kInstrData.td @@ -701,18 +701,22 @@ def: Pat<(MxExtLoadi16i8 MxCP_ARID:$src), (EXTRACT_SUBREG (MOVZXd32p8 MxARID8:$src), MxSubRegIndex16Lo)>; def: Pat<(MxExtLoadi16i8 MxCP_ARII:$src), (EXTRACT_SUBREG (MOVZXd32f8 MxARII8:$src), MxSubRegIndex16Lo)>; +def: Pat<(MxExtLoadi16i8 MxCP_PCD:$src), + (EXTRACT_SUBREG (MOVZXd32q8 MxPCD8:$src), MxSubRegIndex16Lo)>; // i32 <- anyext i8 def: Pat<(i32 (anyext i8:$src)), (MOVZXd32d8 MxDRD8:$src)>; def: Pat<(MxExtLoadi32i8 MxCP_ARI :$src), (MOVZXd32j8 MxARI8 :$src)>; def: Pat<(MxExtLoadi32i8 MxCP_ARID:$src), (MOVZXd32p8 MxARID8:$src)>; def: Pat<(MxExtLoadi32i8 MxCP_ARII:$src), (MOVZXd32f8 MxARII8:$src)>; +def: Pat<(MxExtLoadi32i8 MxCP_PCD:$src), (MOVZXd32q8 MxPCD8:$src)>; // i32 <- anyext i16 def: Pat<(i32 (anyext i16:$src)), (MOVZXd32d16 MxDRD16:$src)>; def: Pat<(MxExtLoadi32i16 MxCP_ARI :$src), (MOVZXd32j16 MxARI16 :$src)>; def: Pat<(MxExtLoadi32i16 MxCP_ARID:$src), (MOVZXd32p16 MxARID16:$src)>; def: Pat<(MxExtLoadi32i16 MxCP_ARII:$src), (MOVZXd32f16 MxARII16:$src)>; +def: Pat<(MxExtLoadi32i16 MxCP_PCD:$src), (MOVZXd32q16 MxPCD16:$src)>; // trunc patterns def : Pat<(i16 (trunc i32:$src)), diff --git 
a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp index 16247bd73268..680d27927ba3 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp @@ -17,7 +17,6 @@ #include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCSymbolELF.h" #include "llvm/MC/MCValue.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp index feeadc5e2297..a8b7c9e57470 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp @@ -18,7 +18,6 @@ #include "llvm/MC/MCInst.h" #include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCSymbolELF.h" -#include "llvm/Support/Casting.h" using namespace llvm; diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp index 5df70c4675c0..1e1b9703d806 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp @@ -26,7 +26,6 @@ #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCSymbolELF.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FormattedStream.h" diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp index 4530fc60e5e2..ae91c97e2a80 100644 --- a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp +++ b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp @@ -51,7 +51,6 @@ #include "llvm/MC/MCObjectFileInfo.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCSymbol.h" -#include "llvm/MC/MCSymbolELF.h" #include "llvm/MC/TargetRegistry.h" #include 
"llvm/Support/Casting.h" #include "llvm/Support/Compiler.h" diff --git a/llvm/lib/Target/Mips/MipsCCState.cpp b/llvm/lib/Target/Mips/MipsCCState.cpp index d600343860b0..d7b5633d7077 100644 --- a/llvm/lib/Target/Mips/MipsCCState.cpp +++ b/llvm/lib/Target/Mips/MipsCCState.cpp @@ -12,31 +12,9 @@ using namespace llvm; -bool MipsCCState::isF128SoftLibCall(const char *CallSym) { - const char *const LibCalls[] = { - "__addtf3", "__divtf3", "__eqtf2", "__extenddftf2", - "__extendsftf2", "__fixtfdi", "__fixtfsi", "__fixtfti", - "__fixunstfdi", "__fixunstfsi", "__fixunstfti", "__floatditf", - "__floatsitf", "__floattitf", "__floatunditf", "__floatunsitf", - "__floatuntitf", "__getf2", "__gttf2", "__letf2", - "__lttf2", "__multf3", "__netf2", "__powitf2", - "__subtf3", "__trunctfdf2", "__trunctfsf2", "__unordtf2", - "ceill", "copysignl", "cosl", "exp2l", - "expl", "floorl", "fmal", "fmaxl", - "fmodl", "frexpl", "log10l", "log2l", - "logl", "nearbyintl", "powl", "rintl", - "roundl", "sincosl", "sinl", "sqrtl", - "truncl"}; - - // Check that LibCalls is sorted alphabetically. - auto Comp = [](const char *S1, const char *S2) { return strcmp(S1, S2) < 0; }; - assert(llvm::is_sorted(LibCalls, Comp)); - return llvm::binary_search(LibCalls, CallSym, Comp); -} - /// This function returns true if Ty is fp128, {f128} or i128 which was /// originally a fp128. -bool MipsCCState::originalTypeIsF128(const Type *Ty, const char *Func) { +bool MipsCCState::originalTypeIsF128(const Type *Ty) { if (Ty->isFP128Ty()) return true; @@ -44,10 +22,7 @@ bool MipsCCState::originalTypeIsF128(const Type *Ty, const char *Func) { Ty->getStructElementType(0)->isFP128Ty()) return true; - // If the Ty is i128 and the function being called is a long double emulation - // routine, then the original type is f128. 
- // FIXME: This is unsound because these functions could be indirectly called - return (Func && Ty->isIntegerTy(128) && isF128SoftLibCall(Func)); + return false; } /// Return true if the original type was vXfXX. @@ -84,11 +59,9 @@ MipsCCState::getSpecialCallingConvForCallee(const SDNode *Callee, } void MipsCCState::PreAnalyzeCallResultForF128( - const SmallVectorImpl &Ins, - const Type *RetTy, const char *Call) { + const SmallVectorImpl &Ins, const Type *RetTy) { for (unsigned i = 0; i < Ins.size(); ++i) { - OriginalArgWasF128.push_back( - originalTypeIsF128(RetTy, Call)); + OriginalArgWasF128.push_back(originalTypeIsF128(RetTy)); OriginalArgWasFloat.push_back(RetTy->isFloatingPointTy()); } } @@ -98,8 +71,7 @@ void MipsCCState::PreAnalyzeCallResultForF128( void MipsCCState::PreAnalyzeCallReturnForF128( const SmallVectorImpl &Outs, const Type *RetTy) { for (unsigned i = 0; i < Outs.size(); ++i) { - OriginalArgWasF128.push_back( - originalTypeIsF128(RetTy, nullptr)); + OriginalArgWasF128.push_back(originalTypeIsF128(RetTy)); OriginalArgWasFloat.push_back( RetTy->isFloatingPointTy()); } @@ -129,8 +101,8 @@ void MipsCCState::PreAnalyzeReturnValue(EVT ArgVT) { OriginalRetWasFloatVector.push_back(originalEVTTypeIsVectorFloat(ArgVT)); } -void MipsCCState::PreAnalyzeCallOperand(const Type *ArgTy, const char *Func) { - OriginalArgWasF128.push_back(originalTypeIsF128(ArgTy, Func)); +void MipsCCState::PreAnalyzeCallOperand(const Type *ArgTy) { + OriginalArgWasF128.push_back(originalTypeIsF128(ArgTy)); OriginalArgWasFloat.push_back(ArgTy->isFloatingPointTy()); OriginalArgWasFloatVector.push_back(ArgTy->isVectorTy()); } @@ -139,14 +111,13 @@ void MipsCCState::PreAnalyzeCallOperand(const Type *ArgTy, const char *Func) { /// arguments and record this. 
void MipsCCState::PreAnalyzeCallOperands( const SmallVectorImpl &Outs, - std::vector &FuncArgs, - const char *Func) { + std::vector &FuncArgs) { for (unsigned i = 0; i < Outs.size(); ++i) { TargetLowering::ArgListEntry FuncArg = FuncArgs[Outs[i].OrigArgIndex]; - OriginalArgWasF128.push_back(originalTypeIsF128(FuncArg.Ty, Func)); - OriginalArgWasFloat.push_back(FuncArg.Ty->isFloatingPointTy()); - OriginalArgWasFloatVector.push_back(FuncArg.Ty->isVectorTy()); + OriginalArgWasF128.push_back(originalTypeIsF128(FuncArg.OrigTy)); + OriginalArgWasFloat.push_back(FuncArg.OrigTy->isFloatingPointTy()); + OriginalArgWasFloatVector.push_back(FuncArg.OrigTy->isVectorTy()); } } @@ -162,7 +133,7 @@ void MipsCCState::PreAnalyzeFormalArgument(const Type *ArgTy, return; } - OriginalArgWasF128.push_back(originalTypeIsF128(ArgTy, nullptr)); + OriginalArgWasF128.push_back(originalTypeIsF128(ArgTy)); OriginalArgWasFloat.push_back(ArgTy->isFloatingPointTy()); // The MIPS vector ABI exhibits a corner case of sorts or quirk; if the @@ -192,8 +163,7 @@ void MipsCCState::PreAnalyzeFormalArgumentsForF128( assert(Ins[i].getOrigArgIndex() < MF.getFunction().arg_size()); std::advance(FuncArg, Ins[i].getOrigArgIndex()); - OriginalArgWasF128.push_back( - originalTypeIsF128(FuncArg->getType(), nullptr)); + OriginalArgWasF128.push_back(originalTypeIsF128(FuncArg->getType())); OriginalArgWasFloat.push_back(FuncArg->getType()->isFloatingPointTy()); // The MIPS vector ABI exhibits a corner case of sorts or quirk; if the diff --git a/llvm/lib/Target/Mips/MipsCCState.h b/llvm/lib/Target/Mips/MipsCCState.h index 30b68e8a9c96..4d985518ce7c 100644 --- a/llvm/lib/Target/Mips/MipsCCState.h +++ b/llvm/lib/Target/Mips/MipsCCState.h @@ -26,17 +26,11 @@ public: getSpecialCallingConvForCallee(const SDNode *Callee, const MipsSubtarget &Subtarget); - /// This function returns true if CallSym is a long double emulation routine. - /// - /// FIXME: Changing the ABI based on the callee name is unsound. 
The lib func - /// address could be captured. - static bool isF128SoftLibCall(const char *CallSym); - - static bool originalTypeIsF128(const Type *Ty, const char *Func); + static bool originalTypeIsF128(const Type *Ty); static bool originalEVTTypeIsVectorFloat(EVT Ty); static bool originalTypeIsVectorFloat(const Type *Ty); - void PreAnalyzeCallOperand(const Type *ArgTy, const char *Func); + void PreAnalyzeCallOperand(const Type *ArgTy); void PreAnalyzeFormalArgument(const Type *ArgTy, ISD::ArgFlagsTy Flags); void PreAnalyzeReturnValue(EVT ArgVT); @@ -45,7 +39,7 @@ private: /// Identify lowered values that originated from f128 arguments and record /// this for use by RetCC_MipsN. void PreAnalyzeCallResultForF128(const SmallVectorImpl &Ins, - const Type *RetTy, const char * Func); + const Type *RetTy); /// Identify lowered values that originated from f128 arguments and record /// this for use by RetCC_MipsN. @@ -55,8 +49,7 @@ private: /// this. void PreAnalyzeCallOperands(const SmallVectorImpl &Outs, - std::vector &FuncArgs, - const char *Func); + std::vector &FuncArgs); /// Identify lowered values that originated from f128 arguments and record /// this for use by RetCC_MipsN. 
@@ -96,21 +89,21 @@ public: SpecialCallingConvType SpecialCC = NoSpecialCallingConv) : CCState(CC, isVarArg, MF, locs, C), SpecialCallingConv(SpecialCC) {} - void PreAnalyzeCallOperands( - const SmallVectorImpl &Outs, CCAssignFn Fn, - std::vector &FuncArgs, const char *Func) { + void + PreAnalyzeCallOperands(const SmallVectorImpl &Outs, + CCAssignFn Fn, + std::vector &FuncArgs) { OriginalArgWasF128.clear(); OriginalArgWasFloat.clear(); OriginalArgWasFloatVector.clear(); - PreAnalyzeCallOperands(Outs, FuncArgs, Func); + PreAnalyzeCallOperands(Outs, FuncArgs); } void AnalyzeCallOperands(const SmallVectorImpl &Outs, CCAssignFn Fn, - std::vector &FuncArgs, - const char *Func) { - PreAnalyzeCallOperands(Outs, Fn, FuncArgs, Func); + std::vector &FuncArgs) { + PreAnalyzeCallOperands(Outs, Fn, FuncArgs); CCState::AnalyzeCallOperands(Outs, Fn); } @@ -137,26 +130,24 @@ public: CCState::AnalyzeFormalArguments(Ins, Fn); } - void PreAnalyzeCallResult(const Type *RetTy, const char *Func) { - OriginalArgWasF128.push_back(originalTypeIsF128(RetTy, Func)); + void PreAnalyzeCallResult(const Type *RetTy) { + OriginalArgWasF128.push_back(originalTypeIsF128(RetTy)); OriginalArgWasFloat.push_back(RetTy->isFloatingPointTy()); OriginalRetWasFloatVector.push_back(originalTypeIsVectorFloat(RetTy)); } void PreAnalyzeCallResult(const SmallVectorImpl &Ins, - CCAssignFn Fn, const Type *RetTy, - const char *Func) { + CCAssignFn Fn, const Type *RetTy) { OriginalArgWasFloat.clear(); OriginalArgWasF128.clear(); OriginalArgWasFloatVector.clear(); - PreAnalyzeCallResultForF128(Ins, RetTy, Func); + PreAnalyzeCallResultForF128(Ins, RetTy); PreAnalyzeCallResultForVectorFloat(Ins, RetTy); } void AnalyzeCallResult(const SmallVectorImpl &Ins, - CCAssignFn Fn, const Type *RetTy, - const char *Func) { - PreAnalyzeCallResult(Ins, Fn, RetTy, Func); + CCAssignFn Fn, const Type *RetTy) { + PreAnalyzeCallResult(Ins, Fn, RetTy); CCState::AnalyzeCallResult(Ins, Fn); } diff --git 
a/llvm/lib/Target/Mips/MipsCallLowering.cpp b/llvm/lib/Target/Mips/MipsCallLowering.cpp index fa491086b0ac..5b6734620973 100644 --- a/llvm/lib/Target/Mips/MipsCallLowering.cpp +++ b/llvm/lib/Target/Mips/MipsCallLowering.cpp @@ -27,16 +27,11 @@ MipsCallLowering::MipsCallLowering(const MipsTargetLowering &TLI) namespace { struct MipsOutgoingValueAssigner : public CallLowering::OutgoingValueAssigner { - /// This is the name of the function being called - /// FIXME: Relying on this is unsound - const char *Func = nullptr; - /// Is this a return value, or an outgoing call operand. bool IsReturn; - MipsOutgoingValueAssigner(CCAssignFn *AssignFn_, const char *Func, - bool IsReturn) - : OutgoingValueAssigner(AssignFn_), Func(Func), IsReturn(IsReturn) {} + MipsOutgoingValueAssigner(CCAssignFn *AssignFn_, bool IsReturn) + : OutgoingValueAssigner(AssignFn_), IsReturn(IsReturn) {} bool assignArg(unsigned ValNo, EVT OrigVT, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, @@ -47,7 +42,7 @@ struct MipsOutgoingValueAssigner : public CallLowering::OutgoingValueAssigner { if (IsReturn) State.PreAnalyzeReturnValue(EVT::getEVT(Info.Ty)); else - State.PreAnalyzeCallOperand(Info.Ty, Func); + State.PreAnalyzeCallOperand(Info.Ty); return CallLowering::OutgoingValueAssigner::assignArg( ValNo, OrigVT, ValVT, LocVT, LocInfo, Info, Flags, State); @@ -55,16 +50,11 @@ struct MipsOutgoingValueAssigner : public CallLowering::OutgoingValueAssigner { }; struct MipsIncomingValueAssigner : public CallLowering::IncomingValueAssigner { - /// This is the name of the function being called - /// FIXME: Relying on this is unsound - const char *Func = nullptr; - /// Is this a call return value, or an incoming function argument. 
bool IsReturn; - MipsIncomingValueAssigner(CCAssignFn *AssignFn_, const char *Func, - bool IsReturn) - : IncomingValueAssigner(AssignFn_), Func(Func), IsReturn(IsReturn) {} + MipsIncomingValueAssigner(CCAssignFn *AssignFn_, bool IsReturn) + : IncomingValueAssigner(AssignFn_), IsReturn(IsReturn) {} bool assignArg(unsigned ValNo, EVT OrigVT, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, @@ -73,7 +63,7 @@ struct MipsIncomingValueAssigner : public CallLowering::IncomingValueAssigner { MipsCCState &State = static_cast(State_); if (IsReturn) - State.PreAnalyzeCallResult(Info.Ty, Func); + State.PreAnalyzeCallResult(Info.Ty); else State.PreAnalyzeFormalArgument(Info.Ty, Flags); @@ -339,9 +329,8 @@ bool MipsCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, F.getContext()); MipsOutgoingValueHandler RetHandler(MIRBuilder, MF.getRegInfo(), Ret); - std::string FuncName = F.getName().str(); MipsOutgoingValueAssigner Assigner(TLI.CCAssignFnForReturn(), - FuncName.c_str(), /*IsReturn*/ true); + /*IsReturn*/ true); if (!determineAssignments(Assigner, RetInfos, CCInfo)) return false; @@ -392,8 +381,7 @@ bool MipsCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, CCInfo.AllocateStack(ABI.GetCalleeAllocdArgSizeInBytes(F.getCallingConv()), Align(1)); - const std::string FuncName = F.getName().str(); - MipsIncomingValueAssigner Assigner(TLI.CCAssignFnForCall(), FuncName.c_str(), + MipsIncomingValueAssigner Assigner(TLI.CCAssignFnForCall(), /*IsReturn*/ false); if (!determineAssignments(Assigner, ArgInfos, CCInfo)) return false; @@ -510,10 +498,7 @@ bool MipsCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, CCInfo.AllocateStack(ABI.GetCalleeAllocdArgSizeInBytes(Info.CallConv), Align(1)); - const char *Call = - Info.Callee.isSymbol() ? 
Info.Callee.getSymbolName() : nullptr; - - MipsOutgoingValueAssigner Assigner(TLI.CCAssignFnForCall(), Call, + MipsOutgoingValueAssigner Assigner(TLI.CCAssignFnForCall(), /*IsReturn*/ false); if (!determineAssignments(Assigner, ArgInfos, CCInfo)) return false; @@ -550,10 +535,8 @@ bool MipsCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, CallLowering::splitToValueTypes(Info.OrigRet, ArgInfos, DL, F.getCallingConv()); - const std::string FuncName = F.getName().str(); SmallVector ArgLocs; MipsIncomingValueAssigner Assigner(TLI.CCAssignFnForReturn(), - FuncName.c_str(), /*IsReturn*/ true); CallReturnHandler RetHandler(MIRBuilder, MF.getRegInfo(), MIB); diff --git a/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp b/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp index 8067dbc54170..2a2ccf7d43b8 100644 --- a/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp +++ b/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp @@ -232,7 +232,7 @@ namespace { /// NewWaterList - The subset of WaterList that was created since the /// previous iteration by inserting unconditional branches. - SmallSet NewWaterList; + SmallPtrSet NewWaterList; using water_iterator = std::vector::iterator; diff --git a/llvm/lib/Target/Mips/MipsFastISel.cpp b/llvm/lib/Target/Mips/MipsFastISel.cpp index a9ac0eae5dac..94fb3cc35681 100644 --- a/llvm/lib/Target/Mips/MipsFastISel.cpp +++ b/llvm/lib/Target/Mips/MipsFastISel.cpp @@ -1293,9 +1293,7 @@ bool MipsFastISel::finishCall(CallLoweringInfo &CLI, MVT RetVT, SmallVector RVLocs; MipsCCState CCInfo(CC, false, *FuncInfo.MF, RVLocs, *Context); - CCInfo.AnalyzeCallResult(CLI.Ins, RetCC_Mips, CLI.RetTy, - CLI.Symbol ? CLI.Symbol->getName().data() - : nullptr); + CCInfo.AnalyzeCallResult(CLI.Ins, RetCC_Mips, CLI.RetTy); // Only handle a single return value. 
if (RVLocs.size() != 1) diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp index ed626f2d7478..466c13e78fbd 100644 --- a/llvm/lib/Target/Mips/MipsISelLowering.cpp +++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp @@ -3391,8 +3391,7 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, MemcpyInByVal ? 0 : ABI.GetCalleeAllocdArgSizeInBytes(CallConv); CCInfo.AllocateStack(ReservedArgArea, Align(1)); - CCInfo.AnalyzeCallOperands(Outs, CC_Mips, CLI.getArgs(), - ES ? ES->getSymbol() : nullptr); + CCInfo.AnalyzeCallOperands(Outs, CC_Mips, CLI.getArgs()); // Get a count of how many bytes are to be pushed on the stack. unsigned StackSize = CCInfo.getStackSize(); @@ -3687,10 +3686,7 @@ SDValue MipsTargetLowering::LowerCallResult( MipsCCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs, *DAG.getContext()); - const ExternalSymbolSDNode *ES = - dyn_cast_or_null(CLI.Callee.getNode()); - CCInfo.AnalyzeCallResult(Ins, RetCC_Mips, CLI.RetTy, - ES ? ES->getSymbol() : nullptr); + CCInfo.AnalyzeCallResult(Ins, RetCC_Mips, CLI.OrigRetTy); // Copy all of the result registers out of their specified physreg. 
for (unsigned i = 0; i != RVLocs.size(); ++i) { diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index b94cbd0bd9c1..74e6c139c610 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -28,7 +28,6 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineMemOperand.h" -#include "llvm/CodeGen/Register.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/TargetCallingConv.h" diff --git a/llvm/lib/Target/PowerPC/PPCCTRLoopsVerify.cpp b/llvm/lib/Target/PowerPC/PPCCTRLoopsVerify.cpp index 46aa27e1450a..c8e576f976f6 100644 --- a/llvm/lib/Target/PowerPC/PPCCTRLoopsVerify.cpp +++ b/llvm/lib/Target/PowerPC/PPCCTRLoopsVerify.cpp @@ -93,7 +93,7 @@ static bool clobbersCTR(const MachineInstr &MI) { static bool verifyCTRBranch(MachineBasicBlock *MBB, MachineBasicBlock::iterator I) { MachineBasicBlock::iterator BI = I; - SmallSet Visited; + SmallPtrSet Visited; SmallVector Preds; bool CheckPreds; diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 4ab9461fc0af..652edd4e04c6 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -1787,11 +1787,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::ADDI_DTPREL_L: return "PPCISD::ADDI_DTPREL_L"; case PPCISD::PADDI_DTPREL: return "PPCISD::PADDI_DTPREL"; - case PPCISD::VADD_SPLAT: return "PPCISD::VADD_SPLAT"; - case PPCISD::SC: return "PPCISD::SC"; - case PPCISD::CLRBHRB: return "PPCISD::CLRBHRB"; - case PPCISD::MFBHRBE: return "PPCISD::MFBHRBE"; - case PPCISD::RFEBB: return "PPCISD::RFEBB"; + case PPCISD::VADD_SPLAT: + return "PPCISD::VADD_SPLAT"; case PPCISD::XXSWAPD: return "PPCISD::XXSWAPD"; case PPCISD::SWAP_NO_CHAIN: return "PPCISD::SWAP_NO_CHAIN"; case 
PPCISD::BUILD_FP128: return "PPCISD::BUILD_FP128"; @@ -14817,9 +14814,9 @@ static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) { SDValue Chain = LD->getChain(); EVT VT = LD->getMemoryVT(); - SmallSet LoadRoots; + SmallPtrSet LoadRoots; SmallVector Queue(1, Chain.getNode()); - SmallSet Visited; + SmallPtrSet Visited; // First, search up the chain, branching to follow all token-factor operands. // If we find a consecutive load, then we're done, otherwise, record all diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h index 9755f0e272d1..5e0d6bf184f2 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -430,20 +430,6 @@ namespace llvm { /// optimizations due to constant folding. VADD_SPLAT, - /// CHAIN = SC CHAIN, Imm128 - System call. The 7-bit unsigned - /// operand identifies the operating system entry point. - SC, - - /// CHAIN = CLRBHRB CHAIN - Clear branch history rolling buffer. - CLRBHRB, - - /// GPRC, CHAIN = MFBHRBE CHAIN, Entry, Dummy - Move from branch - /// history rolling buffer entry. - MFBHRBE, - - /// CHAIN = RFEBB CHAIN, State - Return from event-based branch. - RFEBB, - /// VSRC, CHAIN = XXSWAPD CHAIN, VSRC - Occurs only for little /// endian. Maps to an xxswapd instruction that corrects an lxvd2x /// or stxvd2x instruction. 
The chain is necessary because the diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td index 99ef89a7fdc0..c2f91ce8e6b9 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -365,16 +365,6 @@ def PPCeh_sjlj_longjmp : SDNode<"PPCISD::EH_SJLJ_LONGJMP", SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>, [SDNPHasChain, SDNPSideEffect]>; -def SDT_PPCsc : SDTypeProfile<0, 1, [SDTCisInt<0>]>; -def PPCsc : SDNode<"PPCISD::SC", SDT_PPCsc, - [SDNPHasChain, SDNPSideEffect]>; - -def PPCclrbhrb : SDNode<"PPCISD::CLRBHRB", SDTNone, - [SDNPHasChain, SDNPSideEffect]>; -def PPCmfbhrbe : SDNode<"PPCISD::MFBHRBE", SDTIntBinOp, [SDNPHasChain]>; -def PPCrfebb : SDNode<"PPCISD::RFEBB", SDT_PPCsc, - [SDNPHasChain, SDNPSideEffect]>; - def PPCvcmp : SDNode<"PPCISD::VCMP" , SDT_PPCvcmp, []>; def PPCvcmp_rec : SDNode<"PPCISD::VCMP_rec", SDT_PPCvcmp, [SDNPOutGlue]>; @@ -1673,7 +1663,7 @@ let isBranch = 1, isTerminator = 1, Size = 0 in { // System call. let PPC970_Unit = 7 in { def SC : SCForm<17, 1, 0, (outs), (ins i32imm:$LEV), - "sc $LEV", IIC_BrB, [(PPCsc (i32 imm:$LEV))]>; + "sc $LEV", IIC_BrB, []>; } // We mark SCV as having no scheduling model since it is only meant to be used @@ -1685,21 +1675,14 @@ let Predicates = [IsISA3_0], hasNoSchedulingInfo = 1 in { } // Branch history rolling buffer. -def CLRBHRB : XForm_0<31, 430, (outs), (ins), "clrbhrb", IIC_BrB, - [(PPCclrbhrb)]>, +def CLRBHRB : XForm_0<31, 430, (outs), (ins), "clrbhrb", IIC_BrB, []>, PPC970_DGroup_Single; -// The $dmy argument used for MFBHRBE is not needed; however, including -// it avoids automatic generation of PPCFastISel::fastEmit_i(), which -// interferes with necessary special handling (see PPCFastISel.cpp). 
-def MFBHRBE : XFXForm_3p<31, 302, (outs gprc:$RT), - (ins u10imm:$imm, u10imm:$dmy), - "mfbhrbe $RT, $imm", IIC_BrB, - [(set i32:$RT, - (PPCmfbhrbe imm:$imm, imm:$dmy))]>, + +def MFBHRBE : XFXForm_3p<31, 302, (outs gprc:$RT), (ins u10imm:$imm), + "mfbhrbe $RT, $imm", IIC_BrB, []>, PPC970_DGroup_First; -def RFEBB : XLForm_S<19, 146, (outs), (ins u1imm:$S), "rfebb $S", - IIC_BrB, [(PPCrfebb (i32 imm:$S))]>, +def RFEBB : XLForm_S<19, 146, (outs), (ins u1imm:$S), "rfebb $S", IIC_BrB, []>, PPC970_DGroup_Single; def : InstAlias<"rfebb", (RFEBB 1)>; diff --git a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp index 709d7e7e9b47..adf9436b34cc 100644 --- a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp +++ b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp @@ -264,9 +264,8 @@ namespace { bool prepareBasesForCommoningChains(Bucket &BucketChain); /// Rewrite load/store according to the common chains. - bool - rewriteLoadStoresForCommoningChains(Loop *L, Bucket &Bucket, - SmallSet &BBChanged); + bool rewriteLoadStoresForCommoningChains( + Loop *L, Bucket &Bucket, SmallPtrSet &BBChanged); /// Collect condition matched(\p isValidCandidate() returns true) /// candidates in Loop \p L. @@ -309,7 +308,7 @@ namespace { /// Rewrite load/store instructions in \p BucketChain according to /// preparation. bool rewriteLoadStores(Loop *L, Bucket &BucketChain, - SmallSet &BBChanged, + SmallPtrSet &BBChanged, PrepForm Form); /// Rewrite for the base load/store of a chain. 
@@ -523,7 +522,7 @@ bool PPCLoopInstrFormPrep::chainCommoning(Loop *L, if (Buckets.empty()) return MadeChange; - SmallSet BBChanged; + SmallPtrSet BBChanged; for (auto &Bucket : Buckets) { if (prepareBasesForCommoningChains(Bucket)) @@ -537,7 +536,7 @@ bool PPCLoopInstrFormPrep::chainCommoning(Loop *L, } bool PPCLoopInstrFormPrep::rewriteLoadStoresForCommoningChains( - Loop *L, Bucket &Bucket, SmallSet &BBChanged) { + Loop *L, Bucket &Bucket, SmallPtrSet &BBChanged) { bool MadeChange = false; assert(Bucket.Elements.size() == @@ -1006,7 +1005,7 @@ bool PPCLoopInstrFormPrep::prepareBaseForUpdateFormChain(Bucket &BucketChain) { } bool PPCLoopInstrFormPrep::rewriteLoadStores( - Loop *L, Bucket &BucketChain, SmallSet &BBChanged, + Loop *L, Bucket &BucketChain, SmallPtrSet &BBChanged, PrepForm Form) { bool MadeChange = false; @@ -1089,7 +1088,7 @@ bool PPCLoopInstrFormPrep::updateFormPrep(Loop *L, bool MadeChange = false; if (Buckets.empty()) return MadeChange; - SmallSet BBChanged; + SmallPtrSet BBChanged; for (auto &Bucket : Buckets) // The base address of each bucket is transformed into a phi and the others // are rewritten based on new base. 
@@ -1110,7 +1109,7 @@ bool PPCLoopInstrFormPrep::dispFormPrep(Loop *L, if (Buckets.empty()) return MadeChange; - SmallSet BBChanged; + SmallPtrSet BBChanged; for (auto &Bucket : Buckets) { if (Bucket.Elements.size() < DispFormPrepMinThreshold) continue; diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index d71c42c0a5fc..d37ae2ffb7e3 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -903,6 +903,7 @@ public: VK == RISCV::S_QC_ABS20; } + bool isSImm8Unsigned() const { return isSImm<8>() || isUImm<8>(); } bool isSImm10Unsigned() const { return isSImm<10>() || isUImm<10>(); } bool isUImm20LUI() const { @@ -1199,6 +1200,14 @@ public: addExpr(Inst, getImm(), isRV64Imm()); } + void addSImm8UnsignedOperands(MCInst &Inst, unsigned N) const { + assert(N == 1 && "Invalid number of operands!"); + int64_t Imm; + [[maybe_unused]] bool IsConstant = evaluateConstantImm(getImm(), Imm); + assert(IsConstant); + Inst.addOperand(MCOperand::createImm(SignExtend64<8>(Imm))); + } + void addSImm10UnsignedOperands(MCInst &Inst, unsigned N) const { assert(N == 1 && "Invalid number of operands!"); int64_t Imm; @@ -1547,6 +1556,9 @@ bool RISCVAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return generateImmOutOfRangeError( Operands, ErrorInfo, 0, (1 << 9) - 8, "immediate must be a multiple of 8 bytes in the range"); + case Match_InvalidSImm8Unsigned: + return generateImmOutOfRangeError(Operands, ErrorInfo, -(1 << 7), + (1 << 8) - 1); case Match_InvalidSImm10: return generateImmOutOfRangeError(Operands, ErrorInfo, -(1 << 9), (1 << 9) - 1); diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp index e0ac59141695..78be55b3a51d 100644 --- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp +++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp @@ 
-672,6 +672,8 @@ static constexpr FeatureBitset XAndesGroup = { RISCV::FeatureVendorXAndesVSIntLoad, RISCV::FeatureVendorXAndesVPackFPH, RISCV::FeatureVendorXAndesVDot}; +static constexpr FeatureBitset XSMTGroup = {RISCV::FeatureVendorXSMTVDot}; + static constexpr DecoderListEntry DecoderList32[]{ // Vendor Extensions {DecoderTableXCV32, XCVFeatureGroup, "CORE-V extensions"}, @@ -692,6 +694,7 @@ static constexpr DecoderListEntry DecoderList32[]{ {RISCV::FeatureVendorXMIPSCBOP}, "MIPS mips.pref"}, {DecoderTableXAndes32, XAndesGroup, "Andes extensions"}, + {DecoderTableXSMT32, XSMTGroup, "SpacemiT extensions"}, // Standard Extensions {DecoderTable32, {}, "standard 32-bit instructions"}, {DecoderTableRV32Only32, {}, "RV32-only standard 32-bit instructions"}, diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h index 9d26fc01bf37..083ac056ac72 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h @@ -346,6 +346,7 @@ enum OperandType : unsigned { OPERAND_SIMM5_PLUS1, OPERAND_SIMM6, OPERAND_SIMM6_NONZERO, + OPERAND_SIMM8, OPERAND_SIMM10, OPERAND_SIMM10_LSB0000_NONZERO, OPERAND_SIMM11, diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp index 8c9ab8effa71..b0c27ce6010f 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp @@ -75,7 +75,7 @@ void RISCVInstPrinter::printInst(const MCInst *MI, uint64_t Address, if (PrintAliases && !NoAliases) Res = RISCVRVC::uncompress(UncompressedMI, *MI, STI); if (Res) - NewMI = const_cast(&UncompressedMI); + NewMI = &UncompressedMI; if (!PrintAliases || NoAliases || !printAliasInstr(NewMI, Address, STI, O)) printInstruction(NewMI, Address, STI, O); printAnnotation(O, Annot); diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td 
b/llvm/lib/Target/RISCV/RISCVFeatures.td index a7329d201f88..d4ac3c60e2d4 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -1055,13 +1055,13 @@ def FeatureStdExtSupm "Indicates User-mode Pointer Masking">; def FeatureStdExtSmctr - : RISCVExperimentalExtension<1, 0, - "Control Transfer Records Machine Level", - [FeatureStdExtSscsrind]>; + : RISCVExtension<1, 0, + "Control Transfer Records Machine Level", + [FeatureStdExtSscsrind]>; def FeatureStdExtSsctr - : RISCVExperimentalExtension<1, 0, - "Control Transfer Records Supervisor Level", - [FeatureStdExtSscsrind]>; + : RISCVExtension<1, 0, + "Control Transfer Records Supervisor Level", + [FeatureStdExtSscsrind]>; def HasStdExtSmctrOrSsctr : Predicate<"Subtarget->hasStdExtSmctrOrSsctr()">, AssemblerPredicate<(any_of FeatureStdExtSmctr, FeatureStdExtSsctr), "'Smctr' (Control Transfer Records Machine Level) or " @@ -1642,6 +1642,14 @@ def HasVendorXAndesVDot AssemblerPredicate<(all_of FeatureVendorXAndesVDot), "'XAndesVDot' (Andes Vector Dot Product Extension)">; +def FeatureVendorXSMTVDot + : RISCVExtension<1, 0, "SpacemiT Vector Dot Product Extension", + [FeatureStdExtZve32f]>; +def HasVendorXSMTVDot + : Predicate<"Subtarget->hasVendorXSMTVDot()">, + AssemblerPredicate<(all_of FeatureVendorXSMTVDot), + "'XSMTVDot' (SpacemiT Vector Dot Product Extension)">; + //===----------------------------------------------------------------------===// // LLVM specific features and extensions //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index c5a706ae2b76..ce03818b4950 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -16660,6 +16660,13 @@ performSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG, return DAG.getNode(RISCVISD::SLLW, SDLoc(N), VT, Src.getOperand(0), Src.getOperand(1)); 
+ // Fold (sext_inreg (xor (setcc), -1), i1) -> (add (setcc), -1) + if (Opc == ISD::XOR && SrcVT == MVT::i1 && + isAllOnesConstant(Src.getOperand(1)) && + Src.getOperand(0).getOpcode() == ISD::SETCC) + return DAG.getNode(ISD::ADD, SDLoc(N), VT, Src.getOperand(0), + DAG.getAllOnesConstant(SDLoc(N), VT)); + return SDValue(); } @@ -17518,7 +17525,7 @@ static SDValue combineOp_VLToVWOp_VL(SDNode *N, return SDValue(); SmallVector Worklist; - SmallSet Inserted; + SmallPtrSet Inserted; Worklist.push_back(N); Inserted.insert(N); SmallVector CombinesToApply; @@ -23401,6 +23408,12 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, &RISCV::VRN2M4RegClass}) { if (TRI->isTypeLegalForClass(*RC, VT.SimpleTy)) return std::make_pair(0U, RC); + + if (VT.isFixedLengthVector() && useRVVForFixedLengthVectorVT(VT)) { + MVT ContainerVT = getContainerForFixedLengthVector(VT); + if (TRI->isTypeLegalForClass(*RC, ContainerVT)) + return std::make_pair(0U, RC); + } } } else if (Constraint == "vd") { for (const auto *RC : @@ -23414,10 +23427,24 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, &RISCV::VRN2M4NoV0RegClass}) { if (TRI->isTypeLegalForClass(*RC, VT.SimpleTy)) return std::make_pair(0U, RC); + + if (VT.isFixedLengthVector() && useRVVForFixedLengthVectorVT(VT)) { + MVT ContainerVT = getContainerForFixedLengthVector(VT); + if (TRI->isTypeLegalForClass(*RC, ContainerVT)) + return std::make_pair(0U, RC); + } } } else if (Constraint == "vm") { if (TRI->isTypeLegalForClass(RISCV::VMV0RegClass, VT.SimpleTy)) return std::make_pair(0U, &RISCV::VMV0RegClass); + + if (VT.isFixedLengthVector() && useRVVForFixedLengthVectorVT(VT)) { + MVT ContainerVT = getContainerForFixedLengthVector(VT); + // VT here might be coerced to vector with i8 elements, so we need to + // check if this is a M1 register here instead of checking VMV0RegClass. 
+ if (TRI->isTypeLegalForClass(RISCV::VRRegClass, ContainerVT)) + return std::make_pair(0U, &RISCV::VMV0RegClass); + } } else if (Constraint == "cr") { if (VT == MVT::f16 && Subtarget.hasStdExtZhinxmin()) return std::make_pair(0U, &RISCV::GPRF16CRegClass); @@ -24295,7 +24322,12 @@ bool RISCVTargetLowering::splitValueIntoRegisterParts( return true; } - if (ValueVT.isScalableVector() && PartVT.isScalableVector()) { + if ((ValueVT.isScalableVector() || ValueVT.isFixedLengthVector()) && + PartVT.isScalableVector()) { + if (ValueVT.isFixedLengthVector()) { + ValueVT = getContainerForFixedLengthVector(ValueVT.getSimpleVT()); + Val = convertToScalableVector(ValueVT, Val, DAG, Subtarget); + } LLVMContext &Context = *DAG.getContext(); EVT ValueEltVT = ValueVT.getVectorElementType(); EVT PartEltVT = PartVT.getVectorElementType(); @@ -24365,12 +24397,17 @@ SDValue RISCVTargetLowering::joinRegisterPartsIntoValue( return Val; } - if (ValueVT.isScalableVector() && PartVT.isScalableVector()) { + if ((ValueVT.isScalableVector() || ValueVT.isFixedLengthVector()) && + PartVT.isScalableVector()) { LLVMContext &Context = *DAG.getContext(); SDValue Val = Parts[0]; EVT ValueEltVT = ValueVT.getVectorElementType(); EVT PartEltVT = PartVT.getVectorElementType(); unsigned ValueVTBitSize = ValueVT.getSizeInBits().getKnownMinValue(); + if (ValueVT.isFixedLengthVector()) + ValueVTBitSize = getContainerForFixedLengthVector(ValueVT.getSimpleVT()) + .getSizeInBits() + .getKnownMinValue(); unsigned PartVTBitSize = PartVT.getSizeInBits().getKnownMinValue(); if (PartVTBitSize % ValueVTBitSize == 0) { assert(PartVTBitSize >= ValueVTBitSize); @@ -24388,7 +24425,10 @@ SDValue RISCVTargetLowering::joinRegisterPartsIntoValue( EVT::getVectorVT(Context, ValueEltVT, Count, /*IsScalable=*/true); Val = DAG.getNode(ISD::BITCAST, DL, SameEltTypeVT, Val); } - Val = DAG.getExtractSubvector(DL, ValueVT, Val, 0); + if (ValueVT.isFixedLengthVector()) + Val = convertFromScalableVector(ValueVT, Val, DAG, Subtarget); + 
+ else + Val = DAG.getExtractSubvector(DL, ValueVT, Val, 0); return Val; } } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td index 2a34a24a6ae2..8cbdf0ec7fa3 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -2381,6 +2381,7 @@ include "RISCVInstrInfoXqccmp.td" include "RISCVInstrInfoXMips.td" include "RISCVInstrInfoXRivos.td" include "RISCVInstrInfoXAndes.td" +include "RISCVInstrInfoXSpacemiT.td" //===----------------------------------------------------------------------===// // Global ISel diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td index e67417035963..1e22c2d35510 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td @@ -18,7 +18,26 @@ // Operand and SDNode transformation definitions. //===----------------------------------------------------------------------===// -def simm10 : RISCVSImmLeafOp<10>; +def simm10 : RISCVSImmOp<10>; + +def SImm8UnsignedAsmOperand : SImmAsmOperand<8, "Unsigned"> { + let RenderMethod = "addSImm8UnsignedOperands"; +} + +// An 8-bit signed immediate allowing range [-128, 255] +// but represented as [-128, 127].
+def simm8_unsigned : RISCVOp { + let ParserMatchClass = SImm8UnsignedAsmOperand; + let EncoderMethod = "getImmOpValue"; + let DecoderMethod = "decodeSImmOperand<8>"; + let OperandType = "OPERAND_SIMM8"; + let MCOperandPredicate = [{ + int64_t Imm; + if (!MCOp.evaluateAsConstantImm(Imm)) + return false; + return isInt<8>(Imm); + }]; +} def SImm10UnsignedAsmOperand : SImmAsmOperand<10, "Unsigned"> { let RenderMethod = "addSImm10UnsignedOperands"; @@ -43,49 +62,40 @@ def simm10_unsigned : RISCVOp { // Instruction class templates //===----------------------------------------------------------------------===// -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -class PLI_i funct7, string opcodestr> - : RVInst<(outs GPR:$rd), (ins simm10:$imm10), opcodestr, "$rd, $imm10", [], +// Common base for pli.b/h/w and plui.h/w +class RVPLoadImm_i funct7, dag ins, string opcodestr, + string argstr> + : RVInst<(outs GPR:$rd), ins, opcodestr, argstr, [], InstFormatOther> { - bits<10> imm10; bits<5> rd; let Inst{31-25} = funct7; + let Inst{14-12} = 0b010; + let Inst{11-7} = rd; + let Inst{6-0} = OPC_OP_IMM_32.Value; + + let hasSideEffects = 0; + let mayLoad = 0; + let mayStore = 0; +} + +// Base for pli.h/w. +class PLI_i funct7, string opcodestr> + : RVPLoadImm_i { + bits<10> imm10; + let Inst{24-16} = imm10{8-0}; let Inst{15} = imm10{9}; - let Inst{14-12} = 0b010; - let Inst{11-7} = rd; - let Inst{6-0} = OPC_OP_IMM_32.Value; } -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in +// Base for plui.h/w.
class PLUI_i funct7, string opcodestr> - : RVInst<(outs GPR:$rd), (ins simm10_unsigned:$imm10), opcodestr, - "$rd, $imm10", [], InstFormatOther> { + : RVPLoadImm_i { bits<10> imm10; - bits<5> rd; - let Inst{31-25} = funct7; let Inst{24} = imm10{0}; let Inst{23-15} = imm10{9-1}; - let Inst{14-12} = 0b010; - let Inst{11-7} = rd; - let Inst{6-0} = OPC_OP_IMM_32.Value; -} - -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -class PLI_B_i funct8, string opcodestr> - : RVInst<(outs GPR:$rd), (ins uimm8:$uimm8), opcodestr, "$rd, $uimm8", [], - InstFormatOther> { - bits<8> uimm8; - bits<5> rd; - - let Inst{31-24} = funct8; - let Inst{23-16} = uimm8; - let Inst{15} = 0b0; - let Inst{14-12} = 0b010; - let Inst{11-7} = rd; - let Inst{6-0} = OPC_OP_IMM_32.Value; } let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in @@ -161,7 +171,8 @@ class RVPBinary_rr f, bits<2> w, bits<3> funct3, string opcodestr> let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in class RVPTernary_rrr f, bits<2> w, bits<3> funct3, string opcodestr> : RVInstRBase { + (ins GPR:$rd, GPR:$rs1, GPR:$rs2), opcodestr, + "$rd, $rs1, $rs2"> { let Inst{31} = 0b1; let Inst{30-27} = f; let Inst{26-25} = w; @@ -169,6 +180,24 @@ class RVPTernary_rrr f, bits<2> w, bits<3> funct3, string opcodestr> let Constraints = "$rd = $rd_wb"; } +// Common base for pli.db/h/w and plui.dh/w +class RVPPairLoadImm_i funct7, dag ins, string opcodestr, + string argstr> + : RVInst<(outs GPRPairRV32:$rd), ins, opcodestr, argstr, [], + InstFormatOther> { + bits<5> rd; + + let Inst{31-25} = funct7; + let Inst{14-12} = 0b010; + let Inst{11-8} = rd{4-1}; + let Inst{7} = 0b0; + let Inst{6-0} = OPC_OP_IMM_32.Value; + + let hasSideEffects = 0; + let mayLoad = 0; + let mayStore = 0; +} + //===----------------------------------------------------------------------===// // Instructions //===----------------------------------------------------------------------===// @@ -210,8 +239,16 @@ let Predicates = [HasStdExtP] in def PLI_H : 
PLI_i<0b1011000, "pli.h">; let Predicates = [HasStdExtP, IsRV64] in def PLI_W : PLI_i<0b1011001, "pli.w">; -let Predicates = [HasStdExtP] in -def PLI_B : PLI_B_i<0b10110100, "pli.b">; +let Predicates = [HasStdExtP] in { + def PLI_B : RVPLoadImm_i<0b1011010, (ins simm8_unsigned:$imm8), "pli.b", + "$rd, $imm8"> { + bits<8> imm8; + + let Inst{24} = 0b0; + let Inst{23-16} = imm8; + let Inst{15} = 0b0; + } +} let Predicates = [HasStdExtP] in { def PSEXT_H_B : RVPUnary_ri<0b00, 0b00100, "psext.h.b">; @@ -559,3 +596,30 @@ let Predicates = [HasStdExtP, IsRV64] in { def PPACKT_W : RVPBinary_rr<0b0110, 0b01, 0b100, "ppackt.w">; def PACKT_RV64 : RVPBinary_rr<0b0110, 0b11, 0b100, "packt">; } // Predicates = [HasStdExtP, IsRV64] + +let Predicates = [HasStdExtP, IsRV32] in { + def PLI_DH : RVPPairLoadImm_i<0b0011000, (ins simm10:$imm10), "pli.dh", + "$rd, $imm10"> { + bits<10> imm10; + + let Inst{24-16} = imm10{8-0}; + let Inst{15} = imm10{9}; + } + + def PLI_DB : RVPPairLoadImm_i<0b0011010, (ins simm8_unsigned:$imm8), "pli.db", + "$rd, $imm8"> { + bits<8> imm8; + + let Inst{24} = 0b0; + let Inst{23-16} = imm8; + let Inst{15} = 0b0; + } + + def PLUI_DH : RVPPairLoadImm_i<0b0111000, (ins simm10_unsigned:$imm10), + "plui.dh", "$rd, $imm10"> { + bits<10> imm10; + + let Inst{24} = imm10{0}; + let Inst{23-15} = imm10{9-1}; + } +} diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td index c75addd95b14..1fb30a0b73d9 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXAndes.td @@ -420,7 +420,7 @@ class NDSRVInstVD4DOT funct6, string opcodestr> } class NDSRVInstVBFHCvt vs1, string opcodestr> - : RVInst<(outs VR:$vd), (ins VR:$vs2, VMaskOp:$vm), + : RVInst<(outs VR:$vd), (ins VR:$vs2), opcodestr, "$vd, $vs2", [], InstFormatR> { bits<5> vs2; bits<5> vd; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXMips.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXMips.td index 
0c8487c2f5db..889ea9802257 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXMips.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXMips.td @@ -129,20 +129,20 @@ class Mips_prefetch_ri // MIPS extensions //===----------------------------------------------------------------------===// let Predicates = [HasVendorXMIPSCBOP] ,DecoderNamespace = "Xmipscbop" in { - def MIPS_PREFETCH : Mips_prefetch_ri<(outs), (ins GPR:$rs1, uimm9:$imm9, uimm5:$hint), - "mips.pref", "$hint, ${imm9}(${rs1})">, - Sched<[]>; + def MIPS_PREF : Mips_prefetch_ri<(outs), (ins GPR:$rs1, uimm9:$imm9, uimm5:$hint), + "mips.pref", "$hint, ${imm9}(${rs1})">, + Sched<[]>; } let Predicates = [HasVendorXMIPSCBOP] in { // Prefetch Data Write. def : Pat<(prefetch (AddrRegImm9 (XLenVT GPR:$rs1), uimm9:$imm9), (i32 1), timm, (i32 1)), - (MIPS_PREFETCH GPR:$rs1, uimm9:$imm9, 9)>; + (MIPS_PREF GPR:$rs1, uimm9:$imm9, 9)>; // Prefetch Data Read. def : Pat<(prefetch (AddrRegImm9 (XLenVT GPR:$rs1), uimm9:$imm9), (i32 0), timm, (i32 1)), - (MIPS_PREFETCH GPR:$rs1, uimm9:$imm9, 8)>; + (MIPS_PREF GPR:$rs1, uimm9:$imm9, 8)>; } let Predicates = [HasVendorXMIPSCMov], hasSideEffects = 0, mayLoad = 0, mayStore = 0, diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXSpacemiT.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSpacemiT.td new file mode 100644 index 000000000000..980931ea12a4 --- /dev/null +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSpacemiT.td @@ -0,0 +1,139 @@ +//===-- RISCVInstrInfoXSpacemiT.td -------------------------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file describes the vendor extensions defined by SpacemiT. 
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Operand definitions. +//===----------------------------------------------------------------------===// + +class SMTVDotOpcode val> { + bits<7> Value = val; +} + +class SMTVEncoding2 val> { + bits<2> Value = val; +} + +def OPMMA : SMTVDotOpcode<0b1110001>; +def OPMMA_SLIDE : SMTVDotOpcode<0b1110011>; + +//===----------------------------------------------------------------------===// +// Vector Dot-Product Sign Encoding +// Defines the signed/unsigned mixing modes for vector dot-product operations. +// Encoding format: [1:0] bits +// 00: UU (Unsigned x Unsigned) +// 01: US (Unsigned x Signed) +// 10: SU (Signed x Unsigned) +// 11: SS (Signed x Signed) +//===----------------------------------------------------------------------===// +def SMT_VDot_UU : SMTVEncoding2<0b00>; +def SMT_VDot_US : SMTVEncoding2<0b01>; +def SMT_VDot_SU : SMTVEncoding2<0b10>; +def SMT_VDot_SS : SMTVEncoding2<0b11>; + +//===----------------------------------------------------------------------===// +// Vector Dot-Product Sliding Window Modes +// Encoding format: [1:0] bits +// 00: Slide1 (1-element sliding stride) +// 01: Slide2 (2-element sliding stride) +// 10: Slide3 (3-element sliding stride) +// 11: Reserved +// +// Used in sliding-window dot-product operations: +// vd = vs1 • vs2.slide{1|2|3} // • = dot product +//===----------------------------------------------------------------------===// +def SMT_VDot_Slide1 : SMTVEncoding2<0b00>; +def SMT_VDot_Slide2 : SMTVEncoding2<0b01>; +def SMT_VDot_Slide3 : SMTVEncoding2<0b10>; + +//===----------------------------------------------------------------------===// +// Instruction formats +//===----------------------------------------------------------------------===// + +let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in { +// Base vector dot product (no slide) format. 
+class RVInstSMTVDot + : RVInst<(outs VRM2:$vd), (ins VR:$vs1, VR:$vs2), opcodestr, argstr, [], InstFormatR> { + bits<5> vd; + bits<5> vs1; + bits<5> vs2; + + let Inst{31-25} = OPMMA.Value; + let Inst{24-20} = vs2; + let Inst{19-15} = vs1; + let Inst{14} = 0b0; + let Inst{13-12} = sign.Value; + let Inst{11-8} = vd{4-1}; + let Inst{7} = 0b0; + let Inst{6-0} = OPC_CUSTOM_1.Value; +} + +// Sliding-window vector dot product format. +class RVInstSMTVDotSlide + : RVInst<(outs VRM2:$vd), (ins VRM2:$vs1, VR:$vs2), opcodestr, argstr, [], InstFormatR> { + bits<5> vd; + bits<5> vs1; + bits<5> vs2; + + let Inst{31-25} = OPMMA_SLIDE.Value; + let Inst{24-20} = vs2; + let Inst{19-16} = vs1{4-1}; + let Inst{15-14} = funct2.Value; + let Inst{13-12} = sign.Value; + let Inst{11-8} = vd{4-1}; + let Inst{7} = 0b0; + let Inst{6-0} = OPC_CUSTOM_1.Value; +} +} + +//===----------------------------------------------------------------------===// +// Instructions +//===----------------------------------------------------------------------===// + +let DecoderNamespace = "XSMT" in { + +let Predicates = [HasVendorXSMTVDot], ElementsDependOn = EltDepsVL in { +// Base vector dot product (no slide) instructions +// NOTE: Destination registers (vd) MUST be even-numbered (v0, v2, ..., v30) +// due to hardware alignment constraints. Using odd registers may cause undefined behavior. 
+def VMADOT : RVInstSMTVDot; +def VMADOTU : RVInstSMTVDot; +def VMADOTSU : RVInstSMTVDot; +def VMADOTUS : RVInstSMTVDot; + +//===----------------------------------------------------------------------===// +// Sliding-window Vector Dot Product Instructions +// +// The numeric suffix (1, 2, 3) specifies the stride of the sliding window: +// 1: Window slides by 1 element per operation +// 2: Window slides by 2 elements per operation +// 3: Window slides by 3 elements per operation +// +// These instructions compute dot products with overlapping operand windows +// where the window position increments by elements between computations. +//===----------------------------------------------------------------------===// +// NOTE: Destination registers (vd) and first source register (vs1) MUST be +// even-numbered (v0, v2, ..., v30) due to hardware alignment constraints. +// Using odd registers may cause undefined behavior. +def VMADOT1 : RVInstSMTVDotSlide; +def VMADOT1U : RVInstSMTVDotSlide; +def VMADOT1SU : RVInstSMTVDotSlide; +def VMADOT1US : RVInstSMTVDotSlide; +def VMADOT2 : RVInstSMTVDotSlide; +def VMADOT2U : RVInstSMTVDotSlide; +def VMADOT2SU : RVInstSMTVDotSlide; +def VMADOT2US : RVInstSMTVDotSlide; +def VMADOT3 : RVInstSMTVDotSlide; +def VMADOT3U : RVInstSMTVDotSlide; +def VMADOT3SU : RVInstSMTVDotSlide; +def VMADOT3US : RVInstSMTVDotSlide; +} +} \ No newline at end of file diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td index 31d2b3a10db5..f89d94f41b69 100644 --- a/llvm/lib/Target/RISCV/RISCVProcessors.td +++ b/llvm/lib/Target/RISCV/RISCVProcessors.td @@ -673,6 +673,7 @@ def SPACEMIT_X60 : RISCVProcessorModel<"spacemit-x60", FeatureStdExtZvfh, FeatureStdExtZvkt, FeatureStdExtZvl256b, + FeatureVendorXSMTVDot, FeatureUnalignedScalarMem]), [TuneDLenFactor2, TuneOptimizedNF2SegmentLoadStore, diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp index 
7e58b6f34268..8a3c8e2a1c1c 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp @@ -589,7 +589,7 @@ bool RISCVRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, (Lo12 & 0b11111) != 0) { // Prefetch instructions require the offset to be 32 byte aligned. MI.getOperand(FIOperandNum + 1).ChangeToImmediate(0); - } else if (Opc == RISCV::MIPS_PREFETCH && !isUInt<9>(Val)) { + } else if (Opc == RISCV::MIPS_PREF && !isUInt<9>(Val)) { // MIPS Prefetch instructions require the offset to be 9 bits encoded. MI.getOperand(FIOperandNum + 1).ChangeToImmediate(0); } else if ((Opc == RISCV::PseudoRV32ZdinxLD || diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp index 66ce13428267..c70571c1dacb 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -38,7 +38,6 @@ #include "llvm/Target/TargetOptions.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Vectorize/EVLIndVarSimplify.h" #include "llvm/Transforms/Vectorize/LoopIdiomVectorize.h" #include using namespace llvm; diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index af78b3cc2c7f..85b3059d87da 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -1431,7 +1431,7 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, case Intrinsic::ctlz: case Intrinsic::ctpop: { auto LT = getTypeLegalizationCost(RetTy); - if (ST->hasVInstructions() && ST->hasStdExtZvbb() && LT.second.isVector()) { + if (ST->hasStdExtZvbb() && LT.second.isVector()) { unsigned Op; switch (ICA.getID()) { case Intrinsic::cttz: diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp index 0398e5289579..aea3397ad2fd 100644 
--- a/llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVLegalizeImplicitBinding.cpp @@ -15,7 +15,6 @@ #include "SPIRV.h" #include "llvm/ADT/BitVector.h" -#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp index 5ee66e3dfa7a..dcefff99db25 100644 --- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp @@ -20,7 +20,6 @@ #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/Function.h" -#include "llvm/IR/Module.h" #include "llvm/Target/TargetMachine.h" using namespace llvm; diff --git a/llvm/lib/Target/TargetLoweringObjectFile.cpp b/llvm/lib/Target/TargetLoweringObjectFile.cpp index 9b03e85ca45b..28495e7c5719 100644 --- a/llvm/lib/Target/TargetLoweringObjectFile.cpp +++ b/llvm/lib/Target/TargetLoweringObjectFile.cpp @@ -151,17 +151,17 @@ void TargetLoweringObjectFile::emitCGProfileMetadata(MCStreamer &Streamer, SmallVector ModuleFlags; M.getModuleFlagsMetadata(ModuleFlags); - MDNode *CFGProfile = nullptr; + MDNode *CGProfile = nullptr; for (const auto &MFE : ModuleFlags) { StringRef Key = MFE.Key->getString(); if (Key == "CG Profile") { - CFGProfile = cast(MFE.Val); + CGProfile = cast(MFE.Val); break; } } - if (!CFGProfile) + if (!CGProfile) return; auto GetSym = [this](const MDOperand &MDO) -> MCSymbol * { @@ -174,7 +174,7 @@ void TargetLoweringObjectFile::emitCGProfileMetadata(MCStreamer &Streamer, return TM->getSymbol(F); }; - for (const auto &Edge : CFGProfile->operands()) { + for (const auto &Edge : CGProfile->operands()) { MDNode *E = cast(Edge); const MCSymbol *From = GetSym(E->getOperand(0)); const MCSymbol *To = GetSym(E->getOperand(1)); diff --git a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp 
b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp index 80df4ed2563b..45bbf128ce0b 100644 --- a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp +++ b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp @@ -220,7 +220,6 @@ static MCSymbolWasm *getOrCreateFunctionTableSymbol(MCContext &Ctx, Sym = static_cast(Ctx.getOrCreateSymbol(Name)); Sym->setFunctionTable(Is64); // The default function table is synthesized by the linker. - Sym->setUndefined(); } return Sym; } diff --git a/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp b/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp index 2a398d4e6333..fa6086c7db07 100644 --- a/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp +++ b/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp @@ -26,7 +26,6 @@ #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCSymbolWasm.h" #include "llvm/MC/TargetRegistry.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Endian.h" #include "llvm/Support/LEB128.h" diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 8baca2ad3133..35d5c3ed90c9 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -3386,8 +3386,56 @@ static SDValue TryMatchTrue(SDNode *N, EVT VecVT, SelectionDAG &DAG) { return DAG.getZExtOrTrunc(Ret, DL, N->getValueType(0)); } +/// Try to convert a i128 comparison to a v16i8 comparison before type +/// legalization splits it up into chunks +static SDValue +combineVectorSizedSetCCEquality(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, + const WebAssemblySubtarget *Subtarget) { + + SDLoc DL(N); + SDValue X = N->getOperand(0); + SDValue Y = N->getOperand(1); + EVT VT = N->getValueType(0); + EVT OpVT = X.getValueType(); + + SelectionDAG &DAG = DCI.DAG; + if 
(DCI.DAG.getMachineFunction().getFunction().hasFnAttribute( + Attribute::NoImplicitFloat)) + return SDValue(); + + ISD::CondCode CC = cast(N->getOperand(2))->get(); + // We're looking for an oversized integer equality comparison with SIMD + if (!OpVT.isScalarInteger() || !OpVT.isByteSized() || OpVT != MVT::i128 || + !Subtarget->hasSIMD128() || !isIntEqualitySetCC(CC)) + return SDValue(); + + // Don't perform this combine if constructing the vector will be expensive. + auto IsVectorBitCastCheap = [](SDValue X) { + X = peekThroughBitcasts(X); + return isa(X) || X.getOpcode() == ISD::LOAD; + }; + + if (!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) + return SDValue(); + + SDValue VecX = DAG.getBitcast(MVT::v16i8, X); + SDValue VecY = DAG.getBitcast(MVT::v16i8, Y); + SDValue Cmp = DAG.getSetCC(DL, MVT::v16i8, VecX, VecY, CC); + + SDValue Intr = + DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32, + {DAG.getConstant(CC == ISD::SETEQ ? Intrinsic::wasm_alltrue + : Intrinsic::wasm_anytrue, + DL, MVT::i32), + Cmp}); + + return DAG.getSetCC(DL, VT, Intr, DAG.getConstant(0, DL, MVT::i32), + ISD::SETNE); +} + static SDValue performSETCCCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI) { + TargetLowering::DAGCombinerInfo &DCI, + const WebAssemblySubtarget *Subtarget) { if (!DCI.isBeforeLegalize()) return SDValue(); @@ -3395,6 +3443,9 @@ static SDValue performSETCCCombine(SDNode *N, if (!VT.isScalarInteger()) return SDValue(); + if (SDValue V = combineVectorSizedSetCCEquality(N, DCI, Subtarget)) + return V; + SDValue LHS = N->getOperand(0); if (LHS->getOpcode() != ISD::BITCAST) return SDValue(); @@ -3574,7 +3625,7 @@ WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N, case ISD::BITCAST: return performBitcastCombine(N, DCI); case ISD::SETCC: - return performSETCCCombine(N, DCI); + return performSETCCCombine(N, DCI, Subtarget); case ISD::VECTOR_SHUFFLE: return performVECTOR_SHUFFLECombine(N, DCI); case ISD::SIGN_EXTEND: diff --git 
a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp index 52e706514226..08fb7586d215 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp @@ -147,7 +147,8 @@ WebAssemblyTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { Options.AllowOverlappingLoads = true; - // TODO: Teach WebAssembly backend about load v128. + if (ST->hasSIMD128()) + Options.LoadSizes.push_back(16); Options.LoadSizes.append({8, 4, 2, 1}); Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp index 42d1271f29c4..890486778e70 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp @@ -113,7 +113,6 @@ MCSymbolWasm *WebAssembly::getOrCreateFunctionTableSymbol( Sym = static_cast(Ctx.getOrCreateSymbol(Name)); Sym->setFunctionTable(is64); // The default function table is synthesized by the linker. - Sym->setUndefined(); } // MVP object files can't have symtab entries for tables. if (!(Subtarget && Subtarget->hasCallIndirectOverlong())) diff --git a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp index 817e88d8a0bc..e2a1bbf383b3 100644 --- a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp +++ b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp @@ -36,11 +36,31 @@ void X86InstrPostProcess::setMemBarriers(std::unique_ptr &Inst, } } +void X86InstrPostProcess::useStackEngine(std::unique_ptr &Inst, + const MCInst &MCI) { + // TODO(boomanaiden154): We currently do not handle PUSHF/POPF because we + // have not done the necessary benchmarking to see if they are also + // optimized by the stack engine. + // TODO: We currently just remove all RSP writes from stack operations. 
This + // is not fully correct because we do not model sync uops which will + // delay subsequent rsp using non-stack instructions. + if (X86::isPOP(MCI.getOpcode()) || X86::isPUSH(MCI.getOpcode())) { + auto *StackRegisterDef = + llvm::find_if(Inst->getDefs(), [](const WriteState &State) { + return State.getRegisterID() == X86::RSP; + }); + assert( + StackRegisterDef != Inst->getDefs().end() && + "Expected push instruction to implicitly use stack pointer register."); + Inst->getDefs().erase(StackRegisterDef); + } +} + void X86InstrPostProcess::postProcessInstruction( std::unique_ptr &Inst, const MCInst &MCI) { - // Currently, we only modify certain instructions' IsALoadBarrier and - // IsAStoreBarrier flags. + // Set IsALoadBarrier and IsAStoreBarrier flags. setMemBarriers(Inst, MCI); + useStackEngine(Inst, MCI); } } // namespace mca diff --git a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h index 4a83ba848dd8..c5459e42dfc9 100644 --- a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h +++ b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h @@ -28,6 +28,11 @@ class X86InstrPostProcess : public InstrPostProcess { /// as load and store barriers. void setMemBarriers(std::unique_ptr &Inst, const MCInst &MCI); + /// Called within X86InstrPostProcess to remove the rsp write operand + /// on stack instructions to better simulate the stack engine. We currently + /// do not model features of the stack engine like sync uops.
+ void useStackEngine(std::unique_ptr &Inst, const MCInst &MCI); + public: X86InstrPostProcess(const MCSubtargetInfo &STI, const MCInstrInfo &MCII) : InstrPostProcess(STI, MCII) {} diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index 990b381341f0..3d34ea3bed31 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -1291,7 +1291,9 @@ def ProcessorFeatures { list ADLAdditionalTuning = [TuningPERMFalseDeps, TuningPreferMovmskOverVTest, TuningFastImmVectorShift]; - list ADLTuning = !listconcat(SKLTuning, ADLAdditionalTuning); + list ADLRemoveTuning = [TuningPOPCNTFalseDeps]; + list ADLTuning = + !listremove(!listconcat(SKLTuning, ADLAdditionalTuning), ADLRemoveTuning); list ADLFeatures = !listconcat(TRMFeatures, ADLAdditionalFeatures); diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 7a816de53dbd..52e0bb8a9b83 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -2756,8 +2756,10 @@ X86TargetLowering::getPreferredVectorAction(MVT VT) const { !Subtarget.hasBWI()) return TypeSplitVector; + // Since v8f16 is legal, widen anything over v4f16. 
if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 && - !Subtarget.hasF16C() && VT.getVectorElementType() == MVT::f16) + VT.getVectorNumElements() <= 4 && !Subtarget.hasF16C() && + VT.getVectorElementType() == MVT::f16) return TypeSplitVector; if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 && diff --git a/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp b/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp index cf055cf3be0a..090060eaa65e 100644 --- a/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp +++ b/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp @@ -491,7 +491,7 @@ X86LoadValueInjectionLoadHardeningPass::getGadgetGraph( NumGadgets += GadgetCount; // Traverse CFG to build the rest of the graph - SmallSet BlocksVisited; + SmallPtrSet BlocksVisited; std::function TraverseCFG = [&](MachineBasicBlock *MBB, GraphIter GI, unsigned ParentDepth) { unsigned LoopDepth = MLI.getLoopDepth(MBB); diff --git a/llvm/lib/Target/X86/X86PreTileConfig.cpp b/llvm/lib/Target/X86/X86PreTileConfig.cpp index 3b4e531f2538..2a1c49957bf7 100644 --- a/llvm/lib/Target/X86/X86PreTileConfig.cpp +++ b/llvm/lib/Target/X86/X86PreTileConfig.cpp @@ -100,7 +100,7 @@ struct BBInfo { class X86PreTileConfig : public MachineFunctionPass { MachineRegisterInfo *MRI = nullptr; const MachineLoopInfo *MLI = nullptr; - SmallSet DefVisited; + SmallPtrSet DefVisited; DenseMap BBVisitedInfo; DenseMap> ShapeBBs; diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp index ef917f72f39c..595ad3290eed 100644 --- a/llvm/lib/Target/X86/X86RegisterInfo.cpp +++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -13,7 +13,6 @@ //===----------------------------------------------------------------------===// #include "X86RegisterInfo.h" -#include "MCTargetDesc/X86BaseInfo.h" #include "X86FrameLowering.h" #include "X86MachineFunctionInfo.h" #include "X86Subtarget.h" diff --git a/llvm/lib/Target/X86/X86WinEHUnwindV2.cpp 
b/llvm/lib/Target/X86/X86WinEHUnwindV2.cpp index 7640d7090949..7fa77ee8204a 100644 --- a/llvm/lib/Target/X86/X86WinEHUnwindV2.cpp +++ b/llvm/lib/Target/X86/X86WinEHUnwindV2.cpp @@ -235,7 +235,7 @@ bool X86WinEHUnwindV2::runOnMachineFunction(MachineFunction &MF) { MF, Mode, "The epilog is popping more registers than the prolog " "pushed"); - if (PushedRegs[PushedRegs.size() - PoppedRegCount] != Reg) + if (PushedRegs[PushedRegs.size() - PoppedRegCount] != Reg.id()) return rejectCurrentFunctionInternalError( MF, Mode, "The epilog is popping a registers in " diff --git a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp index 6ed3b62872dd..042578d26818 100644 --- a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp +++ b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp @@ -53,7 +53,6 @@ #include "llvm/Transforms/IPO/ExpandVariadics.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/IR/Constants.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" diff --git a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp index 8262c8c3a90f..44394f6deb9a 100644 --- a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp @@ -273,7 +273,7 @@ MemoryEffects llvm::computeFunctionBodyMemoryAccess(Function &F, /// Deduce readonly/readnone/writeonly attributes for the SCC. template static void addMemoryAttrs(const SCCNodeSet &SCCNodes, AARGetterT &&AARGetter, - SmallSet &Changed) { + SmallPtrSet &Changed) { MemoryEffects ME = MemoryEffects::none(); MemoryEffects RecursiveArgME = MemoryEffects::none(); for (Function *F : SCCNodes) { @@ -1002,7 +1002,7 @@ determinePointerAccessAttrs(Argument *A, /// Deduce returned attributes for the SCC. static void addArgumentReturnedAttrs(const SCCNodeSet &SCCNodes, - SmallSet &Changed) { + SmallPtrSet &Changed) { // Check each function in turn, determining if an argument is always returned. 
for (Function *F : SCCNodes) { // We can infer and propagate function attributes only when we know that the @@ -1238,7 +1238,7 @@ static bool inferInitializes(Argument &A, Function &F) { /// Deduce nocapture attributes for the SCC. static void addArgumentAttrs(const SCCNodeSet &SCCNodes, - SmallSet &Changed, + SmallPtrSet &Changed, bool SkipInitializes) { ArgumentGraph AG; @@ -1510,7 +1510,7 @@ static bool isFunctionMallocLike(Function *F, const SCCNodeSet &SCCNodes) { /// Deduce noalias attributes for the SCC. static void addNoAliasAttrs(const SCCNodeSet &SCCNodes, - SmallSet &Changed) { + SmallPtrSet &Changed) { // Check each function in turn, determining which functions return noalias // pointers. for (Function *F : SCCNodes) { @@ -1623,7 +1623,7 @@ static bool isReturnNonNull(Function *F, const SCCNodeSet &SCCNodes, /// Deduce nonnull attributes for the SCC. static void addNonNullAttrs(const SCCNodeSet &SCCNodes, - SmallSet &Changed) { + SmallPtrSet &Changed) { // Speculative that all functions in the SCC return only nonnull // pointers. We may refute this as we analyze functions. bool SCCReturnsNonNull = true; @@ -1680,7 +1680,7 @@ static void addNonNullAttrs(const SCCNodeSet &SCCNodes, /// Deduce noundef attributes for the SCC. static void addNoUndefAttrs(const SCCNodeSet &SCCNodes, - SmallSet &Changed) { + SmallPtrSet &Changed) { // Check each function in turn, determining which functions return noundef // values. for (Function *F : SCCNodes) { @@ -1788,13 +1788,13 @@ public: InferenceDescriptors.push_back(AttrInference); } - void run(const SCCNodeSet &SCCNodes, SmallSet &Changed); + void run(const SCCNodeSet &SCCNodes, SmallPtrSet &Changed); }; /// Perform all the requested attribute inference actions according to the /// attribute predicates stored before. 
void AttributeInferer::run(const SCCNodeSet &SCCNodes, - SmallSet &Changed) { + SmallPtrSet &Changed) { SmallVector InferInSCC = InferenceDescriptors; // Go through all the functions in SCC and check corresponding attribute // assumptions for each of them. Attributes that are invalid for this SCC @@ -1969,7 +1969,7 @@ static bool InstrBreaksNoSync(Instruction &I, const SCCNodeSet &SCCNodes) { /// /// Returns true if any changes to function attributes were made. static void inferConvergent(const SCCNodeSet &SCCNodes, - SmallSet &Changed) { + SmallPtrSet &Changed) { AttributeInferer AI; // Request to remove the convergent attribute from all functions in the SCC @@ -2000,7 +2000,7 @@ static void inferConvergent(const SCCNodeSet &SCCNodes, /// /// Returns true if any changes to function attributes were made. static void inferAttrsFromFunctionBodies(const SCCNodeSet &SCCNodes, - SmallSet &Changed) { + SmallPtrSet &Changed) { AttributeInferer AI; if (!DisableNoUnwindInference) @@ -2069,7 +2069,7 @@ static void inferAttrsFromFunctionBodies(const SCCNodeSet &SCCNodes, } static void addNoRecurseAttrs(const SCCNodeSet &SCCNodes, - SmallSet &Changed) { + SmallPtrSet &Changed) { // Try and identify functions that do not recurse. // If the SCC contains multiple nodes we know for sure there is recursion. @@ -2105,7 +2105,7 @@ static void addNoRecurseAttrs(const SCCNodeSet &SCCNodes, // Set the noreturn function attribute if possible. static void addNoReturnAttrs(const SCCNodeSet &SCCNodes, - SmallSet &Changed) { + SmallPtrSet &Changed) { for (Function *F : SCCNodes) { if (!F || !F->hasExactDefinition() || F->hasFnAttribute(Attribute::Naked) || F->doesNotReturn()) @@ -2166,7 +2166,7 @@ static bool allPathsGoThroughCold(Function &F) { // Set the cold function attribute if possible. 
static void addColdAttrs(const SCCNodeSet &SCCNodes, - SmallSet &Changed) { + SmallPtrSet &Changed) { for (Function *F : SCCNodes) { if (!F || !F->hasExactDefinition() || F->hasFnAttribute(Attribute::Naked) || F->hasFnAttribute(Attribute::Cold) || F->hasFnAttribute(Attribute::Hot)) @@ -2213,7 +2213,7 @@ static bool functionWillReturn(const Function &F) { // Set the willreturn function attribute if possible. static void addWillReturn(const SCCNodeSet &SCCNodes, - SmallSet &Changed) { + SmallPtrSet &Changed) { for (Function *F : SCCNodes) { if (!F || F->willReturn() || !functionWillReturn(*F)) continue; @@ -2239,7 +2239,7 @@ static SCCNodesResult createSCCNodeSet(ArrayRef Functions) { } template -static SmallSet +static SmallPtrSet deriveAttrsInPostOrder(ArrayRef Functions, AARGetterT &&AARGetter, bool ArgAttrsOnly) { SCCNodesResult Nodes = createSCCNodeSet(Functions); @@ -2248,7 +2248,7 @@ deriveAttrsInPostOrder(ArrayRef Functions, AARGetterT &&AARGetter, if (Nodes.SCCNodes.empty()) return {}; - SmallSet Changed; + SmallPtrSet Changed; if (ArgAttrsOnly) { // ArgAttrsOnly means to only infer attributes that may aid optimizations // on the *current* function. 
"initializes" attribute is to aid diff --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp index c876a47ef212..9196a0147c43 100644 --- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp +++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp @@ -16,7 +16,6 @@ #include "llvm/Analysis/ValueLattice.h" #include "llvm/Analysis/ValueLatticeUtils.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/IR/IntrinsicInst.h" #include "llvm/Transforms/Scalar/SCCP.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/SCCPSolver.h" diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index bdda4980c100..d7edd1288309 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -1133,9 +1133,6 @@ static bool optimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal, const DataLayout &DL, function_ref GetTLI) { - // Ignore no-op GEPs and bitcasts. - StoredOnceVal = StoredOnceVal->stripPointerCasts(); - // If we are dealing with a pointer global that is initialized to null and // only has one (non-null) value stored into it, then we can optimize any // users of the loaded value (often calls and loads) that would trap if the diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 21bd4164385a..6b394f533868 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -3237,6 +3237,8 @@ struct MemorySanitizerVisitor : public InstVisitor { /// /// TODO: "horizontal"/"pairwise" intrinsics are often incorrectly matched by /// by this handler. See horizontalReduce(). + /// + /// TODO: permutation intrinsics are also often incorrectly matched. 
[[maybe_unused]] bool maybeHandleSimpleNomemIntrinsic(IntrinsicInst &I, unsigned int trailingFlags) { @@ -3641,9 +3643,10 @@ struct MemorySanitizerVisitor : public InstVisitor { setOriginForNaryOp(I); } - // Get an MMX-sized vector type. - Type *getMMXVectorTy(unsigned EltSizeInBits) { - const unsigned X86_MMXSizeInBits = 64; + // Get an MMX-sized (64-bit) vector type, or optionally, other sized + // vectors. + Type *getMMXVectorTy(unsigned EltSizeInBits, + unsigned X86_MMXSizeInBits = 64) { assert(EltSizeInBits != 0 && (X86_MMXSizeInBits % EltSizeInBits) == 0 && "Illegal MMX vector element size"); return FixedVectorType::get(IntegerType::get(*MS.C, EltSizeInBits), @@ -3843,20 +3846,109 @@ struct MemorySanitizerVisitor : public InstVisitor { setOriginForNaryOp(I); } - // Instrument multiply-add intrinsic. - void handleVectorPmaddIntrinsic(IntrinsicInst &I, - unsigned MMXEltSizeInBits = 0) { - Type *ResTy = - MMXEltSizeInBits ? getMMXVectorTy(MMXEltSizeInBits * 2) : I.getType(); + // Instrument multiply-add intrinsics. 
+ // + // e.g., Two operands: + // <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a, <8 x i16> %b) + // + // Two operands which require an EltSizeInBits override: + // <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> %a, <1 x i64> %b) + // + // Three operands are not implemented yet: + // <4 x i32> @llvm.x86.avx512.vpdpbusd.128 + // (<4 x i32> %s, <4 x i32> %a, <4 x i32> %b) + // (the result of multiply-add'ing %a and %b is accumulated with %s) + void handleVectorPmaddIntrinsic(IntrinsicInst &I, unsigned ReductionFactor, + unsigned EltSizeInBits = 0) { IRBuilder<> IRB(&I); - auto *Shadow0 = getShadow(&I, 0); - auto *Shadow1 = getShadow(&I, 1); - Value *S = IRB.CreateOr(Shadow0, Shadow1); - S = IRB.CreateBitCast(S, ResTy); - S = IRB.CreateSExt(IRB.CreateICmpNE(S, Constant::getNullValue(ResTy)), - ResTy); - S = IRB.CreateBitCast(S, getShadowTy(&I)); - setShadow(&I, S); + + [[maybe_unused]] FixedVectorType *ReturnType = + cast(I.getType()); + assert(isa(ReturnType)); + + assert(I.arg_size() == 2); + + // Vectors A and B, and shadows + Value *Va = I.getOperand(0); + Value *Vb = I.getOperand(1); + + Value *Sa = getShadow(&I, 0); + Value *Sb = getShadow(&I, 1); + + FixedVectorType *ParamType = + cast(I.getArgOperand(0)->getType()); + assert(ParamType == I.getArgOperand(1)->getType()); + + assert(ParamType->getPrimitiveSizeInBits() == + ReturnType->getPrimitiveSizeInBits()); + + FixedVectorType *ImplicitReturnType = ReturnType; + // Step 1: instrument multiplication of corresponding vector elements + if (EltSizeInBits) { + ImplicitReturnType = cast(getMMXVectorTy( + EltSizeInBits * 2, ParamType->getPrimitiveSizeInBits())); + ParamType = cast( + getMMXVectorTy(EltSizeInBits, ParamType->getPrimitiveSizeInBits())); + + Va = IRB.CreateBitCast(Va, ParamType); + Vb = IRB.CreateBitCast(Vb, ParamType); + + Sa = IRB.CreateBitCast(Sa, getShadowTy(ParamType)); + Sb = IRB.CreateBitCast(Sb, getShadowTy(ParamType)); + } else { + assert(ParamType->getNumElements() == + 
ReturnType->getNumElements() * ReductionFactor); + } + + // Multiplying an *initialized* zero by an uninitialized element results in + // an initialized zero element. + // + // This is analogous to bitwise AND, where "AND" of 0 and a poisoned value + // results in an unpoisoned value. We can therefore adapt the visitAnd() + // instrumentation: + // OutShadow = (SaNonZero & SbNonZero) + // | (VaNonZero & SbNonZero) + // | (SaNonZero & VbNonZero) + // where non-zero is checked on a per-element basis (not per bit). + Value *SZero = Constant::getNullValue(Va->getType()); + Value *VZero = Constant::getNullValue(Sa->getType()); + Value *SaNonZero = IRB.CreateICmpNE(Sa, SZero); + Value *SbNonZero = IRB.CreateICmpNE(Sb, SZero); + Value *VaNonZero = IRB.CreateICmpNE(Va, VZero); + Value *VbNonZero = IRB.CreateICmpNE(Vb, VZero); + + Value *SaAndSbNonZero = IRB.CreateAnd(SaNonZero, SbNonZero); + Value *VaAndSbNonZero = IRB.CreateAnd(VaNonZero, SbNonZero); + Value *SaAndVbNonZero = IRB.CreateAnd(SaNonZero, VbNonZero); + + // Each element of the vector is represented by a single bit (poisoned or + // not) e.g., <8 x i1>. + Value *And = IRB.CreateOr({SaAndSbNonZero, VaAndSbNonZero, SaAndVbNonZero}); + + // Extend <8 x i1> to <8 x i16>. + // (The real pmadd intrinsic would have computed intermediate values of + // <8 x i32>, but that is irrelevant for our shadow purposes because we + // consider each element to be either fully initialized or fully + // uninitialized.) + And = IRB.CreateSExt(And, Sa->getType()); + + // Step 2: instrument horizontal add + // We don't need bit-precise horizontalReduce because we only want to check + // if each pair of elements is fully zero. + // Cast to <4 x i32>. + Value *Horizontal = IRB.CreateBitCast(And, ImplicitReturnType); + + // Compute <4 x i1>, then extend back to <4 x i32>. 
+ Value *OutShadow = IRB.CreateSExt( + IRB.CreateICmpNE(Horizontal, + Constant::getNullValue(Horizontal->getType())), + ImplicitReturnType); + + // For MMX, cast it back to the required fake return type (<1 x i64>). + if (EltSizeInBits) + OutShadow = CreateShadowCast(IRB, OutShadow, getShadowTy(&I)); + + setShadow(&I, OutShadow); setOriginForNaryOp(I); } @@ -5391,19 +5483,28 @@ struct MemorySanitizerVisitor : public InstVisitor { handleVectorSadIntrinsic(I); break; + // Multiply and Add Packed Words + // < 4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>) + // < 8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>) + // + // Multiply and Add Packed Signed and Unsigned Bytes + // < 8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8>, <16 x i8>) + // <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>) case Intrinsic::x86_sse2_pmadd_wd: case Intrinsic::x86_avx2_pmadd_wd: case Intrinsic::x86_ssse3_pmadd_ub_sw_128: case Intrinsic::x86_avx2_pmadd_ub_sw: - handleVectorPmaddIntrinsic(I); + handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2); break; + // <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64>, <1 x i64>) case Intrinsic::x86_ssse3_pmadd_ub_sw: - handleVectorPmaddIntrinsic(I, 8); + handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/8); break; + // <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64>, <1 x i64>) case Intrinsic::x86_mmx_pmadd_wd: - handleVectorPmaddIntrinsic(I, 16); + handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*EltSize=*/16); break; case Intrinsic::x86_sse_cmp_ss: @@ -5620,6 +5721,26 @@ struct MemorySanitizerVisitor : public InstVisitor { handleAVXVpermi2var(I); break; + // Packed Shuffle + // llvm.x86.sse.pshuf.w(<1 x i64>, i8) + // llvm.x86.ssse3.pshuf.b(<1 x i64>, <1 x i64>) + // llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>) + // llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>) + // llvm.x86.avx512.pshuf.b.512(<64 x i8>, <64 x i8>) + // + // The following intrinsics are auto-upgraded: + // llvm.x86.sse2.pshuf.d(<4 x 
i32>, i8) + // llvm.x86.sse2.gpshufh.w(<8 x i16>, i8) + // llvm.x86.sse2.pshufl.w(<8 x i16>, i8) + case Intrinsic::x86_avx2_pshuf_b: + case Intrinsic::x86_sse_pshuf_w: + case Intrinsic::x86_ssse3_pshuf_b_128: + case Intrinsic::x86_ssse3_pshuf_b: + case Intrinsic::x86_avx512_pshuf_b_512: + handleIntrinsicByApplyingToShadow(I, I.getIntrinsicID(), + /*trailingVerbatimArgs=*/1); + break; + case Intrinsic::x86_avx512_mask_cvtps2dq_512: { handleAVX512VectorConvertFPToInt(I); break; diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp index 938aab587904..ac59ae182896 100644 --- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp @@ -447,7 +447,7 @@ private: /// Also, collect select instructions to unfold. bool isCandidate(const SwitchInst *SI) { std::deque> Q; - SmallSet SeenValues; + SmallPtrSet SeenValues; SelectInsts.clear(); Value *SICond = SI->getCondition(); @@ -511,7 +511,7 @@ private: void addToQueue(Value *Val, BasicBlock *BB, std::deque> &Q, - SmallSet &SeenValues) { + SmallPtrSet &SeenValues) { if (SeenValues.insert(Val).second) Q.push_back({Val, BB}); } @@ -713,7 +713,7 @@ private: // Some blocks have multiple edges to the same successor, and this set // is used to prevent a duplicate path from being generated - SmallSet Successors; + SmallPtrSet Successors; for (BasicBlock *Succ : successors(BB)) { if (!Successors.insert(Succ).second) continue; @@ -762,7 +762,7 @@ private: SmallVector Stack; Stack.push_back(FirstDef); - SmallSet SeenValues; + SmallPtrSet SeenValues; while (!Stack.empty()) { PHINode *CurPhi = Stack.pop_back_val(); @@ -955,7 +955,7 @@ private: DuplicateBlockMap DuplicateMap; DefMap NewDefs; - SmallSet BlocksToClean; + SmallPtrSet BlocksToClean; BlocksToClean.insert_range(successors(SwitchBlock)); for (ThreadingPath &TPath : SwitchPaths->getThreadingPaths()) { @@ -984,7 +984,7 @@ private: /// the predecessors, and phis in the successor 
blocks. void createExitPath(DefMap &NewDefs, ThreadingPath &Path, DuplicateBlockMap &DuplicateMap, - SmallSet &BlocksToClean, + SmallPtrSet &BlocksToClean, DomTreeUpdater *DTU) { APInt NextState = Path.getExitValue(); const BasicBlock *Determinator = Path.getDeterminatorBB(); diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp index 7704e49c499d..4baa3b3eb824 100644 --- a/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/llvm/lib/Transforms/Scalar/GVN.cpp @@ -978,7 +978,7 @@ static bool IsValueFullyAvailableInBlock( unsigned NumNewNewSpeculativelyAvailableBBs = 0; #ifndef NDEBUG - SmallSet NewSpeculativelyAvailableBBs; + SmallPtrSet NewSpeculativelyAvailableBBs; SmallVector AvailableBBs; #endif @@ -1222,7 +1222,7 @@ static bool liesBetween(const Instruction *From, Instruction *Between, const Instruction *To, const DominatorTree *DT) { if (From->getParent() == Between->getParent()) return DT->dominates(From, Between); - SmallSet Exclusion; + SmallPtrSet Exclusion; Exclusion.insert(Between->getParent()); return !isPotentiallyReachable(From, To, &Exclusion, DT); } diff --git a/llvm/lib/Transforms/Scalar/GuardWidening.cpp b/llvm/lib/Transforms/Scalar/GuardWidening.cpp index 3ba5b79293bc..d99f1eb9c93c 100644 --- a/llvm/lib/Transforms/Scalar/GuardWidening.cpp +++ b/llvm/lib/Transforms/Scalar/GuardWidening.cpp @@ -642,9 +642,9 @@ Value *GuardWideningImpl::freezeAndPush(Value *Orig, return FI; } - SmallSet Visited; + SmallPtrSet Visited; SmallVector Worklist; - SmallSet DropPoisonFlags; + SmallPtrSet DropPoisonFlags; SmallVector NeedFreeze; DenseMap CacheOfFreezes; diff --git a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp index 334c911191cb..6720cb1ef899 100644 --- a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -1613,7 +1613,7 @@ bool IndVarSimplify::optimizeLoopExits(Loop *L, SCEVExpander &Rewriter) { if (CurrMaxExit == MaxBECount) SkipLastIter 
= true; }; - SmallSet DominatingExactExitCounts; + SmallPtrSet DominatingExactExitCounts; for (BasicBlock *ExitingBB : ExitingBlocks) { const SCEV *ExactExitCount = SE->getExitCount(L, ExitingBB); const SCEV *MaxExitCount = SE->getExitCount( diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp index 4c035a2464c8..8d61779a428e 100644 --- a/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/llvm/lib/Transforms/Scalar/LICM.cpp @@ -1699,8 +1699,12 @@ static void hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop, // The check on hasMetadataOtherThanDebugLoc is to prevent us from burning // time in isGuaranteedToExecute if we don't actually have anything to // drop. It is a compile time optimization, not required for correctness. - !SafetyInfo->isGuaranteedToExecute(I, DT, CurLoop)) - I.dropUBImplyingAttrsAndMetadata(); + !SafetyInfo->isGuaranteedToExecute(I, DT, CurLoop)) { + if (ProfcheckDisableMetadataFixes) + I.dropUBImplyingAttrsAndMetadata(); + else + I.dropUBImplyingAttrsAndMetadata({LLVMContext::MD_prof}); + } if (isa(I)) // Move the new node to the end of the phi list in the destination block. diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp index c68149b78080..5795c761b3be 100644 --- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp +++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp @@ -1209,7 +1209,7 @@ public: // // For verification, we keep track of where we changed uses to poison in // PoisonedInsts and then check that we in fact remove them. 
- SmallSet PoisonedInsts; + SmallPtrSet PoisonedInsts; for (auto *Inst : reverse(ToRemove)) { for (Use &U : llvm::make_early_inc_range(Inst->uses())) { if (auto *Poisoned = dyn_cast(U.getUser())) diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index f237322f9045..e043d072a763 100644 --- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -1530,7 +1530,7 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store, // to remove them. SmallVector LifetimeMarkers; - SmallSet AAMetadataInstrs; + SmallPtrSet AAMetadataInstrs; bool SrcNotDom = false; auto CaptureTrackingWithModRef = @@ -1540,7 +1540,7 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store, Worklist.push_back(AI); unsigned MaxUsesToExplore = getDefaultMaxUsesToExploreForCaptureTracking(); Worklist.reserve(MaxUsesToExplore); - SmallSet Visited; + SmallPtrSet Visited; while (!Worklist.empty()) { Instruction *I = Worklist.pop_back_val(); for (const Use &U : I->uses()) { diff --git a/llvm/lib/Transforms/Scalar/Reassociate.cpp b/llvm/lib/Transforms/Scalar/Reassociate.cpp index 343da5b2e470..ba58b8e4eda5 100644 --- a/llvm/lib/Transforms/Scalar/Reassociate.cpp +++ b/llvm/lib/Transforms/Scalar/Reassociate.cpp @@ -878,7 +878,7 @@ static Value *NegateValue(Value *V, Instruction *BI, // only that it mostly looks like one. 
static bool isLoadCombineCandidate(Instruction *Or) { SmallVector Worklist; - SmallSet Visited; + SmallPtrSet Visited; auto Enqueue = [&](Value *V) { auto *I = dyn_cast(V); diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index f6959ca209fd..9b40fc03da6b 100644 --- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -2144,23 +2144,9 @@ void visitDomSubTree(DominatorTree &DT, BasicBlock *BB, CallableT Callable) { void postUnswitch(Loop &L, LPMUpdater &U, StringRef LoopName, bool CurrentLoopValid, bool PartiallyInvariant, bool InjectedCondition, ArrayRef NewLoops) { - auto RecordLoopAsUnswitched = [&](Loop *TargetLoop, StringRef Tag, - StringRef DisableTag) { - auto &Ctx = TargetLoop->getHeader()->getContext(); - MDNode *DisableMD = MDNode::get(Ctx, MDString::get(Ctx, DisableTag)); - MDNode *NewLoopID = makePostTransformationMetadata( - Ctx, TargetLoop->getLoopID(), {Tag}, {DisableMD}); - TargetLoop->setLoopID(NewLoopID); - }; - - // If we performed a non-trivial unswitch, we have added new cloned loops. - // Mark such newly-created loops as visited. - if (!NewLoops.empty()) { - for (Loop *NL : NewLoops) - RecordLoopAsUnswitched(NL, "llvm.loop.unswitch.nontrivial", - "llvm.loop.unswitch.nontrivial.disable"); + // If we did a non-trivial unswitch, we have added new (cloned) loops. + if (!NewLoops.empty()) U.addSiblingLoops(NewLoops); - } // If the current loop remains valid, we should revisit it to catch any // other unswitch opportunities. Otherwise, we need to mark it as deleted. @@ -2168,12 +2154,24 @@ void postUnswitch(Loop &L, LPMUpdater &U, StringRef LoopName, if (PartiallyInvariant) { // Mark the new loop as partially unswitched, to avoid unswitching on // the same condition again. 
- RecordLoopAsUnswitched(&L, "llvm.loop.unswitch.partial", - "llvm.loop.unswitch.partial.disable"); + auto &Context = L.getHeader()->getContext(); + MDNode *DisableUnswitchMD = MDNode::get( + Context, + MDString::get(Context, "llvm.loop.unswitch.partial.disable")); + MDNode *NewLoopID = makePostTransformationMetadata( + Context, L.getLoopID(), {"llvm.loop.unswitch.partial"}, + {DisableUnswitchMD}); + L.setLoopID(NewLoopID); } else if (InjectedCondition) { // Do the same for injection of invariant conditions. - RecordLoopAsUnswitched(&L, "llvm.loop.unswitch.injection", - "llvm.loop.unswitch.injection.disable"); + auto &Context = L.getHeader()->getContext(); + MDNode *DisableUnswitchMD = MDNode::get( + Context, + MDString::get(Context, "llvm.loop.unswitch.injection.disable")); + MDNode *NewLoopID = makePostTransformationMetadata( + Context, L.getLoopID(), {"llvm.loop.unswitch.injection"}, + {DisableUnswitchMD}); + L.setLoopID(NewLoopID); } else U.revisitCurrentLoop(); } else @@ -2811,9 +2809,9 @@ static BranchInst *turnGuardIntoBranch(IntrinsicInst *GI, Loop &L, } /// Cost multiplier is a way to limit potentially exponential behavior -/// of loop-unswitch. Cost is multiplied in proportion of 2^number of unswitch -/// candidates available. Also consider the number of "sibling" loops with -/// the idea of accounting for previous unswitches that already happened on this +/// of loop-unswitch. Cost is multipied in proportion of 2^number of unswitch +/// candidates available. Also accounting for the number of "sibling" loops with +/// the idea to account for previous unswitches that already happened on this /// cluster of loops. There was an attempt to keep this formula simple, /// just enough to limit the worst case behavior. 
Even if it is not that simple /// now it is still not an attempt to provide a detailed heuristic size @@ -3509,9 +3507,8 @@ static bool unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI, SmallVector UnswitchCandidates; IVConditionInfo PartialIVInfo; Instruction *PartialIVCondBranch = nullptr; - if (!findOptionMDForLoop(&L, "llvm.loop.unswitch.nontrivial.disable")) - collectUnswitchCandidates(UnswitchCandidates, PartialIVInfo, - PartialIVCondBranch, L, LI, AA, MSSAU); + collectUnswitchCandidates(UnswitchCandidates, PartialIVInfo, + PartialIVCondBranch, L, LI, AA, MSSAU); if (!findOptionMDForLoop(&L, "llvm.loop.unswitch.injection.disable")) collectUnswitchCandidatesWithInjections(UnswitchCandidates, PartialIVInfo, PartialIVCondBranch, L, DT, LI, AA, diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp index 44e63a0583d1..b17dcb786942 100644 --- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -328,7 +328,7 @@ class StructurizeCFG { void addPhiValues(BasicBlock *From, BasicBlock *To); void findUndefBlocks(BasicBlock *PHIBlock, - const SmallSet &Incomings, + const SmallPtrSet &Incomings, SmallVector &UndefBlks) const; void mergeIfCompatible(EquivalenceClasses &PhiClasses, PHINode *A, @@ -762,7 +762,7 @@ void StructurizeCFG::addPhiValues(BasicBlock *From, BasicBlock *To) { /// from some blocks as undefined. The function will find out all such blocks /// and return in \p UndefBlks. void StructurizeCFG::findUndefBlocks( - BasicBlock *PHIBlock, const SmallSet &Incomings, + BasicBlock *PHIBlock, const SmallPtrSet &Incomings, SmallVector &UndefBlks) const { // We may get a post-structured CFG like below: // @@ -788,7 +788,7 @@ void StructurizeCFG::findUndefBlocks( // path N->F2->F3->B. For example, the threads take the branch F1->N may // always take the branch F2->P2. 
So, when we are reconstructing a PHI // originally in B, we can safely say the incoming value from N is undefined. - SmallSet VisitedBlock; + SmallPtrSet VisitedBlock; SmallVector Stack; if (PHIBlock == ParentRegion->getExit()) { for (auto P : predecessors(PHIBlock)) { @@ -884,7 +884,7 @@ void StructurizeCFG::setPhiValues() { PhiMap &BlkPhis = OldPhiIt->second; SmallVector &UndefBlks = UndefBlksMap[To]; - SmallSet Incomings; + SmallPtrSet Incomings; // Get the undefined blocks shared by all the phi nodes. if (!BlkPhis.empty()) { diff --git a/llvm/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp b/llvm/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp index 40010aee9c11..8044f611e89f 100644 --- a/llvm/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp +++ b/llvm/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp @@ -193,7 +193,7 @@ bool CanonicalizeFreezeInLoopsImpl::run() { if (Candidates.empty()) return false; - SmallSet ProcessedPHIs; + SmallPtrSet ProcessedPHIs; for (const auto &Info : Candidates) { PHINode *PHI = Info.PHI; if (!ProcessedPHIs.insert(Info.PHI).second) diff --git a/llvm/lib/Transforms/Utils/ControlFlowUtils.cpp b/llvm/lib/Transforms/Utils/ControlFlowUtils.cpp index 4b0065d0030c..8954de618bc2 100644 --- a/llvm/lib/Transforms/Utils/ControlFlowUtils.cpp +++ b/llvm/lib/Transforms/Utils/ControlFlowUtils.cpp @@ -276,7 +276,7 @@ std::pair ControlFlowHub::finalize( DomTreeUpdater *DTU, SmallVectorImpl &GuardBlocks, const StringRef Prefix, std::optional MaxControlFlowBooleans) { #ifndef NDEBUG - SmallSet Incoming; + SmallPtrSet Incoming; #endif SetVector Outgoing; diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp index fa3c467dd12b..f49fbf8807ba 100644 --- a/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -77,7 +77,6 @@ #include #include #include -#include #include #include #include diff --git a/llvm/lib/Transforms/Utils/Local.cpp 
b/llvm/lib/Transforms/Utils/Local.cpp index b559212de71d..ac344904f90f 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -275,7 +275,7 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions, Builder.CreateBr(TheOnlyDest); BasicBlock *BB = SI->getParent(); - SmallSet RemovedSuccessors; + SmallPtrSet RemovedSuccessors; // Remove entries from PHI nodes which we no longer branch to... BasicBlock *SuccToKeep = TheOnlyDest; @@ -343,7 +343,7 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions, if (auto *BA = dyn_cast(IBI->getAddress()->stripPointerCasts())) { BasicBlock *TheOnlyDest = BA->getBasicBlock(); - SmallSet RemovedSuccessors; + SmallPtrSet RemovedSuccessors; // Insert the new branch. Builder.CreateBr(TheOnlyDest); @@ -2518,7 +2518,7 @@ unsigned llvm::changeToUnreachable(Instruction *I, bool PreserveLCSSA, if (MSSAU) MSSAU->changeToUnreachable(I); - SmallSet UniqueSuccessors; + SmallPtrSet UniqueSuccessors; // Loop over all of the successors, removing BB's entry from any PHI // nodes. 
diff --git a/llvm/lib/Transforms/Utils/PredicateInfo.cpp b/llvm/lib/Transforms/Utils/PredicateInfo.cpp index 13c7ad2927d1..978d5a25a57c 100644 --- a/llvm/lib/Transforms/Utils/PredicateInfo.cpp +++ b/llvm/lib/Transforms/Utils/PredicateInfo.cpp @@ -20,7 +20,6 @@ #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" diff --git a/llvm/lib/Transforms/Utils/ProfileVerify.cpp b/llvm/lib/Transforms/Utils/ProfileVerify.cpp index 0ffea3f53fef..41647f7717a4 100644 --- a/llvm/lib/Transforms/Utils/ProfileVerify.cpp +++ b/llvm/lib/Transforms/Utils/ProfileVerify.cpp @@ -8,10 +8,8 @@ #include "llvm/Transforms/Utils/ProfileVerify.h" #include "llvm/ADT/DynamicAPInt.h" -#include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/BranchProbabilityInfo.h" -#include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/Analysis.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" diff --git a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp index d96f1d6c23d4..10c162bc6463 100644 --- a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp +++ b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp @@ -136,7 +136,7 @@ public: /// \p ToDelete that stores to this alloca. void updateForDeletedStore( StoreInst *ToDelete, DIBuilder &DIB, - SmallSet *DVRAssignsToDelete) const { + SmallPtrSet *DVRAssignsToDelete) const { // There's nothing to do if the alloca doesn't have any variables using // assignment tracking. if (DVRAssigns.empty()) @@ -382,7 +382,7 @@ struct PromoteMem2Reg { SmallVector AllocaATInfo; /// A set of dbg.assigns to delete because they've been demoted to /// dbg.values. Call cleanUpDbgAssigns to delete them. 
- SmallSet DVRAssignsToDelete; + SmallPtrSet DVRAssignsToDelete; /// The set of basic blocks the renamer has already visited. BitVector Visited; @@ -533,11 +533,10 @@ static void removeIntrinsicUsers(AllocaInst *AI) { /// false there were some loads which were not dominated by the single store /// and thus must be phi-ed with undef. We fall back to the standard alloca /// promotion algorithm in that case. -static bool -rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info, LargeBlockInfo &LBI, - const DataLayout &DL, DominatorTree &DT, - AssumptionCache *AC, - SmallSet *DVRAssignsToDelete) { +static bool rewriteSingleStoreAlloca( + AllocaInst *AI, AllocaInfo &Info, LargeBlockInfo &LBI, const DataLayout &DL, + DominatorTree &DT, AssumptionCache *AC, + SmallPtrSet *DVRAssignsToDelete) { StoreInst *OnlyStore = Info.OnlyStore; Value *ReplVal = OnlyStore->getOperand(0); // Loads may either load the stored value or uninitialized memory (undef). @@ -647,11 +646,10 @@ rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info, LargeBlockInfo &LBI, /// use(t); /// *A = 42; /// } -static bool -promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info, - LargeBlockInfo &LBI, const DataLayout &DL, - DominatorTree &DT, AssumptionCache *AC, - SmallSet *DVRAssignsToDelete) { +static bool promoteSingleBlockAlloca( + AllocaInst *AI, const AllocaInfo &Info, LargeBlockInfo &LBI, + const DataLayout &DL, DominatorTree &DT, AssumptionCache *AC, + SmallPtrSet *DVRAssignsToDelete) { // The trickiest case to handle is when we have large blocks. Because of this, // this code is optimized assuming that large blocks happen. This does not // significantly pessimize the small block case. 
This uses LargeBlockInfo to diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp index 1eb8996fca03..e218db30d92b 100644 --- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp +++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp @@ -1346,7 +1346,7 @@ Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) { CanonicalIV->insertBefore(Header->begin()); rememberInstruction(CanonicalIV); - SmallSet PredSeen; + SmallPtrSet PredSeen; Constant *One = ConstantInt::get(Ty, 1); for (pred_iterator HPI = HPB; HPI != HPE; ++HPI) { BasicBlock *HP = *HPI; diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 1436e479ba09..0ca7188470d8 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -291,6 +291,7 @@ class SimplifyCFGOpt { bool simplifyBranch(BranchInst *Branch, IRBuilder<> &Builder); bool simplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder); bool simplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder); + bool foldCondBranchOnValueKnownInPredecessor(BranchInst *BI); bool tryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI, IRBuilder<> &Builder); @@ -810,11 +811,15 @@ Value *SimplifyCFGOpt::isValueEqualityComparison(Instruction *TI) { if (!SI->getParent()->hasNPredecessorsOrMore(128 / SI->getNumSuccessors())) CV = SI->getCondition(); } else if (BranchInst *BI = dyn_cast(TI)) - if (BI->isConditional() && BI->getCondition()->hasOneUse()) + if (BI->isConditional() && BI->getCondition()->hasOneUse()) { if (ICmpInst *ICI = dyn_cast(BI->getCondition())) { if (ICI->isEquality() && getConstantInt(ICI->getOperand(1), DL)) CV = ICI->getOperand(0); + } else if (auto *Trunc = dyn_cast(BI->getCondition())) { + if (Trunc->hasNoUnsignedWrap()) + CV = Trunc->getOperand(0); } + } // Unwrap any lossless ptrtoint cast. 
if (CV) { @@ -840,11 +845,20 @@ BasicBlock *SimplifyCFGOpt::getValueEqualityComparisonCases( } BranchInst *BI = cast(TI); - ICmpInst *ICI = cast(BI->getCondition()); - BasicBlock *Succ = BI->getSuccessor(ICI->getPredicate() == ICmpInst::ICMP_NE); - Cases.push_back(ValueEqualityComparisonCase( - getConstantInt(ICI->getOperand(1), DL), Succ)); - return BI->getSuccessor(ICI->getPredicate() == ICmpInst::ICMP_EQ); + Value *Cond = BI->getCondition(); + ICmpInst::Predicate Pred; + ConstantInt *C; + if (auto *ICI = dyn_cast(Cond)) { + Pred = ICI->getPredicate(); + C = getConstantInt(ICI->getOperand(1), DL); + } else { + Pred = ICmpInst::ICMP_NE; + auto *Trunc = cast(Cond); + C = ConstantInt::get(cast(Trunc->getOperand(0)->getType()), 0); + } + BasicBlock *Succ = BI->getSuccessor(Pred == ICmpInst::ICMP_NE); + Cases.push_back(ValueEqualityComparisonCase(C, Succ)); + return BI->getSuccessor(Pred == ICmpInst::ICMP_EQ); } /// Given a vector of bb/value pairs, remove any entries @@ -1106,7 +1120,10 @@ static void getBranchWeights(Instruction *TI, // default weight to be the first entry. if (BranchInst *BI = dyn_cast(TI)) { assert(Weights.size() == 2); - ICmpInst *ICI = cast(BI->getCondition()); + auto *ICI = dyn_cast(BI->getCondition()); + if (!ICI) + return; + if (ICI->getPredicate() == ICmpInst::ICMP_EQ) std::swap(Weights.front(), Weights.back()); } @@ -3653,15 +3670,19 @@ foldCondBranchOnValueKnownInPredecessorImpl(BranchInst *BI, DomTreeUpdater *DTU, return false; } -static bool foldCondBranchOnValueKnownInPredecessor(BranchInst *BI, - DomTreeUpdater *DTU, - const DataLayout &DL, - AssumptionCache *AC) { +bool SimplifyCFGOpt::foldCondBranchOnValueKnownInPredecessor(BranchInst *BI) { + // Note: If BB is a loop header then there is a risk that threading introduces + // a non-canonical loop by moving a back edge. So we avoid this optimization + // for loop headers if NeedCanonicalLoop is set. 
+ if (Options.NeedCanonicalLoop && is_contained(LoopHeaders, BI->getParent())) + return false; + std::optional Result; bool EverChanged = false; do { // Note that None means "we changed things, but recurse further." - Result = foldCondBranchOnValueKnownInPredecessorImpl(BI, DTU, DL, AC); + Result = + foldCondBranchOnValueKnownInPredecessorImpl(BI, DTU, DL, Options.AC); EverChanged |= Result == std::nullopt || *Result; } while (Result == std::nullopt); return EverChanged; @@ -8083,7 +8104,7 @@ bool SimplifyCFGOpt::simplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) { // If this is a branch on something for which we know the constant value in // predecessors (e.g. a phi node in the current block), thread control // through this block. - if (foldCondBranchOnValueKnownInPredecessor(BI, DTU, DL, Options.AC)) + if (foldCondBranchOnValueKnownInPredecessor(BI)) return requestResimplify(); // Scan predecessor blocks for conditional branches. diff --git a/llvm/lib/Transforms/Utils/SplitModuleByCategory.cpp b/llvm/lib/Transforms/Utils/SplitModuleByCategory.cpp index 6b18ecee98ec..c3ac39e5c287 100644 --- a/llvm/lib/Transforms/Utils/SplitModuleByCategory.cpp +++ b/llvm/lib/Transforms/Utils/SplitModuleByCategory.cpp @@ -12,7 +12,6 @@ #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/StringExtras.h" -#include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" @@ -21,7 +20,6 @@ #include "llvm/Transforms/Utils/Cloning.h" #include -#include #include using namespace llvm; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 09c9e63ff6a2..e009b81afd0e 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2633,19 +2633,6 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) { // Fix widened non-induction PHIs by setting up the PHI 
operands. fixNonInductionPHIs(State); - // After vectorization, the exit blocks of the original loop will have - // additional predecessors. Invalidate SCEVs for the exit phis in case SE - // looked through single-entry phis. - SmallVector ExitBlocks; - OrigLoop->getExitBlocks(ExitBlocks); - for (BasicBlock *Exit : ExitBlocks) - for (PHINode &PN : Exit->phis()) - PSE.getSE()->forgetLcssaPhiWithNewPredecessor(OrigLoop, &PN); - - // Forget the original basic block. - PSE.getSE()->forgetLoop(OrigLoop); - PSE.getSE()->forgetBlockAndLoopDispositions(); - // Don't apply optimizations below when no (vector) loop remains, as they all // require one at the moment. VPBasicBlock *HeaderVPBB = @@ -7305,6 +7292,7 @@ DenseMap LoopVectorizationPlanner::executePlan( BestVPlan, VectorPH, CM.foldTailByMasking(), CM.requiresScalarEpilogue(BestVF.isVector())); VPlanTransforms::materializeVFAndVFxUF(BestVPlan, VectorPH, BestVF); + VPlanTransforms::simplifyRecipes(BestVPlan); // Perform the actual loop transformation. VPTransformState State(&TTI, BestVF, LI, DT, ILV.AC, ILV.Builder, &BestVPlan, @@ -7351,6 +7339,21 @@ DenseMap LoopVectorizationPlanner::executePlan( assert(verifyVPlanIsValid(BestVPlan, true /*VerifyLate*/) && "final VPlan is invalid"); + // After vectorization, the exit blocks of the original loop will have + // additional predecessors. Invalidate SCEVs for the exit phis in case SE + // looked through single-entry phis. + ScalarEvolution &SE = *PSE.getSE(); + for (VPIRBasicBlock *Exit : BestVPlan.getExitBlocks()) { + if (Exit->getNumPredecessors() == 0) + continue; + for (VPRecipeBase &PhiR : Exit->phis()) + SE.forgetLcssaPhiWithNewPredecessor( + OrigLoop, cast(&cast(PhiR).getInstruction())); + } + // Forget the original loop and block dispositions. 
+ SE.forgetLoop(OrigLoop); + SE.forgetBlockAndLoopDispositions(); + ILV.printDebugTracesAtStart(); //===------------------------------------------------===// @@ -8108,7 +8111,7 @@ void VPRecipeBuilder::collectScaledReductions(VFRange &Range) { // extends are intended to be lowered along with the reduction itself. // Build up a set of partial reduction ops for efficient use checking. - SmallSet PartialReductionOps; + SmallPtrSet PartialReductionOps; for (const auto &[PartialRdx, _] : PartialReductionChains) PartialReductionOps.insert(PartialRdx.ExtendUser); diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 49331702d145..37dc41413966 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -20932,6 +20932,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef VL, BoUpSLP *SLP, } ScheduledBundlesList.pop_back(); SmallVector ControlDependentMembers; + SmallPtrSet Visited; for (Value *V : VL) { if (S.isNonSchedulable(V)) continue; @@ -20949,6 +20950,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef VL, BoUpSLP *SLP, ArrayRef Op = EI.UserTE->getOperand(EI.EdgeIdx); const auto *It = find(Op, I); assert(It != Op.end() && "Lane not set"); + SmallPtrSet Visited; do { int Lane = std::distance(Op.begin(), It); assert(Lane >= 0 && "Lane not set"); @@ -20958,6 +20960,10 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef VL, BoUpSLP *SLP, assert(Lane < static_cast(EI.UserTE->Scalars.size()) && "Couldn't find extract lane"); auto *In = cast(EI.UserTE->Scalars[Lane]); + if (!Visited.insert(In).second) { + It = find(make_range(std::next(It), Op.end()), I); + break; + } ScheduleCopyableDataMapByInstUser [std::make_pair(std::make_pair(In, EI.EdgeIdx), I)] .pop_back(); @@ -24471,7 +24477,7 @@ public: // correct, replace internal uses with undef, and mark for eventual // deletion. 
#ifndef NDEBUG - SmallSet IgnoreSet; + SmallPtrSet IgnoreSet; for (ArrayRef RdxOps : ReductionOps) IgnoreSet.insert_range(RdxOps); #endif diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 3682bd0e1720..724a38e56530 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -295,27 +295,11 @@ Value *VPTransformState::get(const VPValue *Def, bool NeedsScalar) { if (hasVectorValue(Def)) return Data.VPV2Vector[Def]; - auto GetBroadcastInstrs = [this, Def](Value *V) { - bool SafeToHoist = - !Def->hasDefiningRecipe() || - VPDT.properlyDominates(Def->getDefiningRecipe()->getParent(), - Plan->getVectorPreheader()); - + auto GetBroadcastInstrs = [this](Value *V) { if (VF.isScalar()) return V; - // Place the code for broadcasting invariant variables in the new preheader. - IRBuilder<>::InsertPointGuard Guard(Builder); - if (SafeToHoist) { - BasicBlock *LoopVectorPreHeader = - CFG.VPBB2IRBB[Plan->getVectorPreheader()]; - if (LoopVectorPreHeader) - Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); - } - - // Place the code for broadcasting invariant variables in the new preheader. // Broadcast the scalar into all locations in the vector. Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast"); - return Shuf; }; diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index fa62547d374c..0609510ac821 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -2964,7 +2964,6 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, // transform, avoid computing their cost multiple times for now. 
Ctx.SkipCostComputation.insert(UI); - TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; Type *ResultTy = Ctx.Types.inferScalarType(this); switch (UI->getOpcode()) { case Instruction::GetElementPtr: @@ -2990,7 +2989,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF, auto Op2Info = Ctx.getOperandInfo(getOperand(1)); SmallVector Operands(UI->operand_values()); return Ctx.TTI.getArithmeticInstrCost( - UI->getOpcode(), ResultTy, CostKind, + UI->getOpcode(), ResultTy, Ctx.CostKind, {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, Op2Info, Operands, UI, &Ctx.TLI) * (isSingleScalar() ? 1 : VF.getFixedValue()); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index de0c1e4d177b..05c12b7a1adc 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -2291,7 +2291,7 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { } // Remove dead EVL mask. if (EVLMask->getNumUsers() == 0) - EVLMask->getDefiningRecipe()->eraseFromParent(); + ToErase.push_back(EVLMask->getDefiningRecipe()); for (VPRecipeBase *R : reverse(ToErase)) { SmallVector PossiblyDead(R->operands()); diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index ef9ea73ba994..e25ffe135418 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -183,6 +183,7 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const { case Instruction::ZExt: case Instruction::Mul: case Instruction::FMul: + case VPInstruction::Broadcast: // Opcodes above can only use EVL after wide inductions have been // expanded. 
if (!VerifyLate) { diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index d5f1018a2c05..4e2a5c78e0ac 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -39,7 +39,6 @@ #include #include #include -#include #define DEBUG_TYPE "vector-combine" #include "llvm/Transforms/Utils/InstructionWorklist.h" @@ -112,10 +111,8 @@ private: const Instruction &I, ExtractElementInst *&ConvertToShuffle, unsigned PreferredExtractIndex); - void foldExtExtCmp(ExtractElementInst *Ext0, ExtractElementInst *Ext1, - Instruction &I); - void foldExtExtBinop(ExtractElementInst *Ext0, ExtractElementInst *Ext1, - Instruction &I); + Value *foldExtExtCmp(Value *V0, Value *V1, Value *ExtIndex, Instruction &I); + Value *foldExtExtBinop(Value *V0, Value *V1, Value *ExtIndex, Instruction &I); bool foldExtractExtract(Instruction &I); bool foldInsExtFNeg(Instruction &I); bool foldInsExtBinop(Instruction &I); @@ -145,7 +142,7 @@ private: bool shrinkLoadForShuffles(Instruction &I); bool shrinkPhiOfShuffles(Instruction &I); - void replaceValue(Value &Old, Value &New) { + void replaceValue(Instruction &Old, Value &New, bool Erase = true) { LLVM_DEBUG(dbgs() << "VC: Replacing: " << Old << '\n'); LLVM_DEBUG(dbgs() << " With: " << New << '\n'); Old.replaceAllUsesWith(&New); @@ -154,7 +151,11 @@ private: Worklist.pushUsersToWorkList(*NewI); Worklist.pushValue(NewI); } - Worklist.pushValue(&Old); + if (Erase && isInstructionTriviallyDead(&Old)) { + eraseInstruction(Old); + } else { + Worklist.push(&Old); + } } void eraseInstruction(Instruction &I) { @@ -165,11 +166,23 @@ private: // Push remaining users of the operands and then the operand itself - allows // further folds that were hindered by OneUse limits. 
- for (Value *Op : Ops) - if (auto *OpI = dyn_cast(Op)) { - Worklist.pushUsersToWorkList(*OpI); - Worklist.pushValue(OpI); + SmallPtrSet Visited; + for (Value *Op : Ops) { + if (Visited.insert(Op).second) { + if (auto *OpI = dyn_cast(Op)) { + if (RecursivelyDeleteTriviallyDeadInstructions( + OpI, nullptr, nullptr, [this](Value *V) { + if (auto I = dyn_cast(V)) { + LLVM_DEBUG(dbgs() << "VC: Erased: " << *I << '\n'); + Worklist.remove(I); + } + })) + continue; + Worklist.pushUsersToWorkList(*OpI); + Worklist.pushValue(OpI); + } } + } } }; } // namespace @@ -553,9 +566,8 @@ static Value *createShiftShuffle(Value *Vec, unsigned OldIndex, /// the source vector (shift the scalar element) to a NewIndex for extraction. /// Return null if the input can be constant folded, so that we are not creating /// unnecessary instructions. -static ExtractElementInst *translateExtract(ExtractElementInst *ExtElt, - unsigned NewIndex, - IRBuilderBase &Builder) { +static Value *translateExtract(ExtractElementInst *ExtElt, unsigned NewIndex, + IRBuilderBase &Builder) { // Shufflevectors can only be created for fixed-width vectors. Value *X = ExtElt->getVectorOperand(); if (!isa(X->getType())) @@ -570,52 +582,43 @@ static ExtractElementInst *translateExtract(ExtractElementInst *ExtElt, Value *Shuf = createShiftShuffle(X, cast(C)->getZExtValue(), NewIndex, Builder); - return dyn_cast( - Builder.CreateExtractElement(Shuf, NewIndex)); + return Shuf; } /// Try to reduce extract element costs by converting scalar compares to vector /// compares followed by extract. 
-/// cmp (ext0 V0, C), (ext1 V1, C) -void VectorCombine::foldExtExtCmp(ExtractElementInst *Ext0, - ExtractElementInst *Ext1, Instruction &I) { +/// cmp (ext0 V0, ExtIndex), (ext1 V1, ExtIndex) +Value *VectorCombine::foldExtExtCmp(Value *V0, Value *V1, Value *ExtIndex, + Instruction &I) { assert(isa(&I) && "Expected a compare"); - assert(cast(Ext0->getIndexOperand())->getZExtValue() == - cast(Ext1->getIndexOperand())->getZExtValue() && - "Expected matching constant extract indexes"); - // cmp Pred (extelt V0, C), (extelt V1, C) --> extelt (cmp Pred V0, V1), C + // cmp Pred (extelt V0, ExtIndex), (extelt V1, ExtIndex) + // --> extelt (cmp Pred V0, V1), ExtIndex ++NumVecCmp; CmpInst::Predicate Pred = cast(&I)->getPredicate(); - Value *V0 = Ext0->getVectorOperand(), *V1 = Ext1->getVectorOperand(); Value *VecCmp = Builder.CreateCmp(Pred, V0, V1); - Value *NewExt = Builder.CreateExtractElement(VecCmp, Ext0->getIndexOperand()); - replaceValue(I, *NewExt); + return Builder.CreateExtractElement(VecCmp, ExtIndex, "foldExtExtCmp"); } /// Try to reduce extract element costs by converting scalar binops to vector /// binops followed by extract. 
-/// bo (ext0 V0, C), (ext1 V1, C) -void VectorCombine::foldExtExtBinop(ExtractElementInst *Ext0, - ExtractElementInst *Ext1, Instruction &I) { +/// bo (ext0 V0, ExtIndex), (ext1 V1, ExtIndex) +Value *VectorCombine::foldExtExtBinop(Value *V0, Value *V1, Value *ExtIndex, + Instruction &I) { assert(isa(&I) && "Expected a binary operator"); - assert(cast(Ext0->getIndexOperand())->getZExtValue() == - cast(Ext1->getIndexOperand())->getZExtValue() && - "Expected matching constant extract indexes"); - // bo (extelt V0, C), (extelt V1, C) --> extelt (bo V0, V1), C + // bo (extelt V0, ExtIndex), (extelt V1, ExtIndex) + // --> extelt (bo V0, V1), ExtIndex ++NumVecBO; - Value *V0 = Ext0->getVectorOperand(), *V1 = Ext1->getVectorOperand(); - Value *VecBO = - Builder.CreateBinOp(cast(&I)->getOpcode(), V0, V1); + Value *VecBO = Builder.CreateBinOp(cast(&I)->getOpcode(), V0, + V1, "foldExtExtBinop"); // All IR flags are safe to back-propagate because any potential poison // created in unused vector elements is discarded by the extract. if (auto *VecBOInst = dyn_cast(VecBO)) VecBOInst->copyIRFlags(&I); - Value *NewExt = Builder.CreateExtractElement(VecBO, Ext0->getIndexOperand()); - replaceValue(I, *NewExt); + return Builder.CreateExtractElement(VecBO, ExtIndex, "foldExtExtBinop"); } /// Match an instruction with extracted vector operands. @@ -654,25 +657,29 @@ bool VectorCombine::foldExtractExtract(Instruction &I) { if (isExtractExtractCheap(Ext0, Ext1, I, ExtractToChange, InsertIndex)) return false; + Value *ExtOp0 = Ext0->getVectorOperand(); + Value *ExtOp1 = Ext1->getVectorOperand(); + if (ExtractToChange) { unsigned CheapExtractIdx = ExtractToChange == Ext0 ? 
C1 : C0; - ExtractElementInst *NewExtract = + Value *NewExtOp = translateExtract(ExtractToChange, CheapExtractIdx, Builder); - if (!NewExtract) + if (!NewExtOp) return false; if (ExtractToChange == Ext0) - Ext0 = NewExtract; + ExtOp0 = NewExtOp; else - Ext1 = NewExtract; + ExtOp1 = NewExtOp; } - if (Pred != CmpInst::BAD_ICMP_PREDICATE) - foldExtExtCmp(Ext0, Ext1, I); - else - foldExtExtBinop(Ext0, Ext1, I); - + Value *ExtIndex = ExtractToChange == Ext0 ? Ext1->getIndexOperand() + : Ext0->getIndexOperand(); + Value *NewExt = Pred != CmpInst::BAD_ICMP_PREDICATE + ? foldExtExtCmp(ExtOp0, ExtOp1, ExtIndex, I) + : foldExtExtBinop(ExtOp0, ExtOp1, ExtIndex, I); Worklist.push(Ext0); Worklist.push(Ext1); + replaceValue(I, *NewExt); return true; } @@ -1825,7 +1832,7 @@ bool VectorCombine::scalarizeLoadExtract(Instruction &I) { LI->getAlign(), VecTy->getElementType(), Idx, *DL); NewLoad->setAlignment(ScalarOpAlignment); - replaceValue(*EI, *NewLoad); + replaceValue(*EI, *NewLoad, false); } FailureGuard.release(); @@ -3113,7 +3120,7 @@ bool VectorCombine::foldShuffleFromReductions(Instruction &I) { Shuffle->getOperand(0), Shuffle->getOperand(1), ConcatMask); LLVM_DEBUG(dbgs() << "Created new shuffle: " << *NewShuffle << "\n"); replaceValue(*Shuffle, *NewShuffle); - MadeChanges = true; + return true; } // See if we can re-use foldSelectShuffle, getting it to reduce the size of @@ -3609,7 +3616,7 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) { for (int S = 0, E = ReconstructMasks.size(); S != E; S++) { Builder.SetInsertPoint(Shuffles[S]); Value *NSV = Builder.CreateShuffleVector(NOp0, NOp1, ReconstructMasks[S]); - replaceValue(*Shuffles[S], *NSV); + replaceValue(*Shuffles[S], *NSV, false); } Worklist.pushValue(NSV0A); @@ -3980,7 +3987,7 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) { Value *NewShuffle = Builder.CreateShuffleVector( NewLoad, PoisonValue::get(NewLoadTy), NewMask); - replaceValue(*Shuffle, *NewShuffle); + 
replaceValue(*Shuffle, *NewShuffle, false); } return true; @@ -4096,8 +4103,7 @@ bool VectorCombine::run() { LLVM_DEBUG(dbgs() << "\n\nVECTORCOMBINE on " << F.getName() << "\n"); - bool MadeChange = false; - auto FoldInst = [this, &MadeChange](Instruction &I) { + auto FoldInst = [this](Instruction &I) { Builder.SetInsertPoint(&I); bool IsVectorType = isa(I.getType()); bool IsFixedVectorType = isa(I.getType()); @@ -4112,10 +4118,12 @@ bool VectorCombine::run() { if (IsFixedVectorType) { switch (Opcode) { case Instruction::InsertElement: - MadeChange |= vectorizeLoadInsert(I); + if (vectorizeLoadInsert(I)) + return true; break; case Instruction::ShuffleVector: - MadeChange |= widenSubvectorLoad(I); + if (widenSubvectorLoad(I)) + return true; break; default: break; @@ -4125,19 +4133,25 @@ bool VectorCombine::run() { // This transform works with scalable and fixed vectors // TODO: Identify and allow other scalable transforms if (IsVectorType) { - MadeChange |= scalarizeOpOrCmp(I); - MadeChange |= scalarizeLoadExtract(I); - MadeChange |= scalarizeExtExtract(I); - MadeChange |= scalarizeVPIntrinsic(I); - MadeChange |= foldInterleaveIntrinsics(I); + if (scalarizeOpOrCmp(I)) + return true; + if (scalarizeLoadExtract(I)) + return true; + if (scalarizeExtExtract(I)) + return true; + if (scalarizeVPIntrinsic(I)) + return true; + if (foldInterleaveIntrinsics(I)) + return true; } if (Opcode == Instruction::Store) - MadeChange |= foldSingleElementStore(I); + if (foldSingleElementStore(I)) + return true; // If this is an early pipeline invocation of this pass, we are done. if (TryEarlyFoldsOnly) - return; + return false; // Otherwise, try folds that improve codegen but may interfere with // early IR canonicalizations. 
@@ -4146,62 +4160,87 @@ bool VectorCombine::run() { if (IsFixedVectorType) { switch (Opcode) { case Instruction::InsertElement: - MadeChange |= foldInsExtFNeg(I); - MadeChange |= foldInsExtBinop(I); - MadeChange |= foldInsExtVectorToShuffle(I); + if (foldInsExtFNeg(I)) + return true; + if (foldInsExtBinop(I)) + return true; + if (foldInsExtVectorToShuffle(I)) + return true; break; case Instruction::ShuffleVector: - MadeChange |= foldPermuteOfBinops(I); - MadeChange |= foldShuffleOfBinops(I); - MadeChange |= foldShuffleOfSelects(I); - MadeChange |= foldShuffleOfCastops(I); - MadeChange |= foldShuffleOfShuffles(I); - MadeChange |= foldShuffleOfIntrinsics(I); - MadeChange |= foldSelectShuffle(I); - MadeChange |= foldShuffleToIdentity(I); + if (foldPermuteOfBinops(I)) + return true; + if (foldShuffleOfBinops(I)) + return true; + if (foldShuffleOfSelects(I)) + return true; + if (foldShuffleOfCastops(I)) + return true; + if (foldShuffleOfShuffles(I)) + return true; + if (foldShuffleOfIntrinsics(I)) + return true; + if (foldSelectShuffle(I)) + return true; + if (foldShuffleToIdentity(I)) + return true; break; case Instruction::Load: - MadeChange |= shrinkLoadForShuffles(I); + if (shrinkLoadForShuffles(I)) + return true; break; case Instruction::BitCast: - MadeChange |= foldBitcastShuffle(I); + if (foldBitcastShuffle(I)) + return true; break; case Instruction::And: case Instruction::Or: case Instruction::Xor: - MadeChange |= foldBitOpOfCastops(I); + if (foldBitOpOfCastops(I)) + return true; break; case Instruction::PHI: - MadeChange |= shrinkPhiOfShuffles(I); + if (shrinkPhiOfShuffles(I)) + return true; break; default: - MadeChange |= shrinkType(I); + if (shrinkType(I)) + return true; break; } } else { switch (Opcode) { case Instruction::Call: - MadeChange |= foldShuffleFromReductions(I); - MadeChange |= foldCastFromReductions(I); + if (foldShuffleFromReductions(I)) + return true; + if (foldCastFromReductions(I)) + return true; break; case Instruction::ICmp: case 
Instruction::FCmp: - MadeChange |= foldExtractExtract(I); + if (foldExtractExtract(I)) + return true; break; case Instruction::Or: - MadeChange |= foldConcatOfBoolMasks(I); + if (foldConcatOfBoolMasks(I)) + return true; [[fallthrough]]; default: if (Instruction::isBinaryOp(Opcode)) { - MadeChange |= foldExtractExtract(I); - MadeChange |= foldExtractedCmps(I); - MadeChange |= foldBinopOfReductions(I); + if (foldExtractExtract(I)) + return true; + if (foldExtractedCmps(I)) + return true; + if (foldBinopOfReductions(I)) + return true; } break; } } + return false; }; + bool MadeChange = false; for (BasicBlock &BB : F) { // Ignore unreachable basic blocks. if (!DT.isReachableFromEntry(&BB)) @@ -4210,7 +4249,7 @@ bool VectorCombine::run() { for (Instruction &I : make_early_inc_range(BB)) { if (I.isDebugOrPseudoInst()) continue; - FoldInst(I); + MadeChange |= FoldInst(I); } } @@ -4224,7 +4263,7 @@ bool VectorCombine::run() { continue; } - FoldInst(*I); + MadeChange |= FoldInst(*I); } return MadeChange; diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-reductions.ll b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-reductions.ll index 16762dc4fd3f..c791e35946f7 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-reductions.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-reductions.ll @@ -1,34 +1,48 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py ; RUN: llc -O0 -mtriple=aarch64-apple-ios -global-isel -disable-expand-reductions -stop-after=irtranslator %s -o - | FileCheck %s -declare float @llvm.vector.reduce.fadd.v4f32(float, <4 x float>) -declare double @llvm.vector.reduce.fmul.v4f64(double, <4 x double>) - define float @fadd_seq(float %start, <4 x float> %vec) { ; CHECK-LABEL: name: fadd_seq ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $q1, $s0 - ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 - ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1 - ; CHECK: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST 
[[COPY1]](<2 x s64>) - ; CHECK: [[VECREDUCE_SEQ_FADD:%[0-9]+]]:_(s32) = G_VECREDUCE_SEQ_FADD [[COPY]](s32), [[BITCAST]](<4 x s32>) - ; CHECK: $s0 = COPY [[VECREDUCE_SEQ_FADD]](s32) - ; CHECK: RET_ReallyLR implicit $s0 + ; CHECK-NEXT: liveins: $q1, $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1 + ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY1]](<2 x s64>) + ; CHECK-NEXT: [[VECREDUCE_SEQ_FADD:%[0-9]+]]:_(s32) = G_VECREDUCE_SEQ_FADD [[COPY]](s32), [[BITCAST]](<4 x s32>) + ; CHECK-NEXT: $s0 = COPY [[VECREDUCE_SEQ_FADD]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $s0 %res = call float @llvm.vector.reduce.fadd.v4f32(float %start, <4 x float> %vec) ret float %res } +define float @fadd_seq_scalar(float %start, <1 x float> %vec) { + ; CHECK-LABEL: name: fadd_seq_scalar + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $d1, $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s32>) = COPY $d1 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](<2 x s32>) + ; CHECK-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[COPY]], [[UV]] + ; CHECK-NEXT: $s0 = COPY [[FADD]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $s0 + %res = call float @llvm.vector.reduce.fadd.v1f32(float %start, <1 x float> %vec) + ret float %res +} + define float @fadd_fast(float %start, <4 x float> %vec) { ; CHECK-LABEL: name: fadd_fast ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $q1, $s0 - ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 - ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1 - ; CHECK: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY1]](<2 x s64>) - ; CHECK: [[VECREDUCE_FADD:%[0-9]+]]:_(s32) = reassoc G_VECREDUCE_FADD [[BITCAST]](<4 x s32>) - ; CHECK: [[FADD:%[0-9]+]]:_(s32) = reassoc G_FADD [[COPY]], [[VECREDUCE_FADD]] - ; CHECK: $s0 = COPY [[FADD]](s32) - ; CHECK: RET_ReallyLR 
implicit $s0 + ; CHECK-NEXT: liveins: $q1, $s0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $s0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1 + ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY1]](<2 x s64>) + ; CHECK-NEXT: [[VECREDUCE_FADD:%[0-9]+]]:_(s32) = reassoc G_VECREDUCE_FADD [[BITCAST]](<4 x s32>) + ; CHECK-NEXT: [[FADD:%[0-9]+]]:_(s32) = reassoc G_FADD [[COPY]], [[VECREDUCE_FADD]] + ; CHECK-NEXT: $s0 = COPY [[FADD]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $s0 %res = call reassoc float @llvm.vector.reduce.fadd.v4f32(float %start, <4 x float> %vec) ret float %res } @@ -36,48 +50,60 @@ define float @fadd_fast(float %start, <4 x float> %vec) { define double @fmul_seq(double %start, <4 x double> %vec) { ; CHECK-LABEL: name: fmul_seq ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $d0, $q1, $q2 - ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $d0 - ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1 - ; CHECK: [[COPY2:%[0-9]+]]:_(<2 x s64>) = COPY $q2 - ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[COPY1]](<2 x s64>), [[COPY2]](<2 x s64>) - ; CHECK: [[VECREDUCE_SEQ_FMUL:%[0-9]+]]:_(s64) = G_VECREDUCE_SEQ_FMUL [[COPY]](s64), [[CONCAT_VECTORS]](<4 x s64>) - ; CHECK: $d0 = COPY [[VECREDUCE_SEQ_FMUL]](s64) - ; CHECK: RET_ReallyLR implicit $d0 + ; CHECK-NEXT: liveins: $d0, $q1, $q2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $d0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s64>) = COPY $q2 + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[COPY1]](<2 x s64>), [[COPY2]](<2 x s64>) + ; CHECK-NEXT: [[VECREDUCE_SEQ_FMUL:%[0-9]+]]:_(s64) = G_VECREDUCE_SEQ_FMUL [[COPY]](s64), [[CONCAT_VECTORS]](<4 x s64>) + ; CHECK-NEXT: $d0 = COPY [[VECREDUCE_SEQ_FMUL]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $d0 %res = call double @llvm.vector.reduce.fmul.v4f64(double %start, <4 x double> %vec) ret double 
%res } +define double @fmul_seq_scalar(double %start, <1 x double> %vec) { + ; CHECK-LABEL: name: fmul_seq_scalar + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $d0, $d1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $d0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $d1 + ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(s64) = G_FMUL [[COPY]], [[COPY1]] + ; CHECK-NEXT: $d0 = COPY [[FMUL]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $d0 + %res = call double @llvm.vector.reduce.fmul.v1f64(double %start, <1 x double> %vec) + ret double %res +} + define double @fmul_fast(double %start, <4 x double> %vec) { ; CHECK-LABEL: name: fmul_fast ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $d0, $q1, $q2 - ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $d0 - ; CHECK: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1 - ; CHECK: [[COPY2:%[0-9]+]]:_(<2 x s64>) = COPY $q2 - ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[COPY1]](<2 x s64>), [[COPY2]](<2 x s64>) - ; CHECK: [[VECREDUCE_FMUL:%[0-9]+]]:_(s64) = reassoc G_VECREDUCE_FMUL [[CONCAT_VECTORS]](<4 x s64>) - ; CHECK: [[FMUL:%[0-9]+]]:_(s64) = reassoc G_FMUL [[COPY]], [[VECREDUCE_FMUL]] - ; CHECK: $d0 = COPY [[FMUL]](s64) - ; CHECK: RET_ReallyLR implicit $d0 + ; CHECK-NEXT: liveins: $d0, $q1, $q2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $d0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s64>) = COPY $q2 + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[COPY1]](<2 x s64>), [[COPY2]](<2 x s64>) + ; CHECK-NEXT: [[VECREDUCE_FMUL:%[0-9]+]]:_(s64) = reassoc G_VECREDUCE_FMUL [[CONCAT_VECTORS]](<4 x s64>) + ; CHECK-NEXT: [[FMUL:%[0-9]+]]:_(s64) = reassoc G_FMUL [[COPY]], [[VECREDUCE_FMUL]] + ; CHECK-NEXT: $d0 = COPY [[FMUL]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $d0 %res = call reassoc double @llvm.vector.reduce.fmul.v4f64(double %start, <4 x double> %vec) ret double %res } -declare float 
@llvm.vector.reduce.fmax.v4f32(<4 x float>) -declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>) -declare float @llvm.vector.reduce.fmaximum.v4f32(<4 x float>) -declare float @llvm.vector.reduce.fminimum.v4f32(<4 x float>) - define float @fmax(<4 x float> %vec) { ; CHECK-LABEL: name: fmax ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $q0 - ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 - ; CHECK: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY]](<2 x s64>) - ; CHECK: [[VECREDUCE_FMAX:%[0-9]+]]:_(s32) = G_VECREDUCE_FMAX [[BITCAST]](<4 x s32>) - ; CHECK: $s0 = COPY [[VECREDUCE_FMAX]](s32) - ; CHECK: RET_ReallyLR implicit $s0 + ; CHECK-NEXT: liveins: $q0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 + ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY]](<2 x s64>) + ; CHECK-NEXT: [[VECREDUCE_FMAX:%[0-9]+]]:_(s32) = G_VECREDUCE_FMAX [[BITCAST]](<4 x s32>) + ; CHECK-NEXT: $s0 = COPY [[VECREDUCE_FMAX]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $s0 %res = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %vec) ret float %res } @@ -85,12 +111,13 @@ define float @fmax(<4 x float> %vec) { define float @fmin(<4 x float> %vec) { ; CHECK-LABEL: name: fmin ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $q0 - ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 - ; CHECK: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY]](<2 x s64>) - ; CHECK: [[VECREDUCE_FMIN:%[0-9]+]]:_(s32) = G_VECREDUCE_FMIN [[BITCAST]](<4 x s32>) - ; CHECK: $s0 = COPY [[VECREDUCE_FMIN]](s32) - ; CHECK: RET_ReallyLR implicit $s0 + ; CHECK-NEXT: liveins: $q0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 + ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY]](<2 x s64>) + ; CHECK-NEXT: [[VECREDUCE_FMIN:%[0-9]+]]:_(s32) = G_VECREDUCE_FMIN [[BITCAST]](<4 x s32>) + ; CHECK-NEXT: $s0 = COPY [[VECREDUCE_FMIN]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $s0 %res = call float 
@llvm.vector.reduce.fmin.v4f32(<4 x float> %vec) ret float %res } @@ -98,12 +125,13 @@ define float @fmin(<4 x float> %vec) { define float @fmin_nnan(<4 x float> %vec) { ; CHECK-LABEL: name: fmin_nnan ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $q0 - ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 - ; CHECK: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY]](<2 x s64>) - ; CHECK: [[VECREDUCE_FMIN:%[0-9]+]]:_(s32) = nnan G_VECREDUCE_FMIN [[BITCAST]](<4 x s32>) - ; CHECK: $s0 = COPY [[VECREDUCE_FMIN]](s32) - ; CHECK: RET_ReallyLR implicit $s0 + ; CHECK-NEXT: liveins: $q0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 + ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY]](<2 x s64>) + ; CHECK-NEXT: [[VECREDUCE_FMIN:%[0-9]+]]:_(s32) = nnan G_VECREDUCE_FMIN [[BITCAST]](<4 x s32>) + ; CHECK-NEXT: $s0 = COPY [[VECREDUCE_FMIN]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $s0 %res = call nnan float @llvm.vector.reduce.fmin.v4f32(<4 x float> %vec) ret float %res } @@ -111,12 +139,13 @@ define float @fmin_nnan(<4 x float> %vec) { define float @fmaximum(<4 x float> %vec) { ; CHECK-LABEL: name: fmaximum ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $q0 - ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 - ; CHECK: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY]](<2 x s64>) - ; CHECK: [[VECREDUCE_FMAX:%[0-9]+]]:_(s32) = G_VECREDUCE_FMAXIMUM [[BITCAST]](<4 x s32>) - ; CHECK: $s0 = COPY [[VECREDUCE_FMAX]](s32) - ; CHECK: RET_ReallyLR implicit $s0 + ; CHECK-NEXT: liveins: $q0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 + ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY]](<2 x s64>) + ; CHECK-NEXT: [[VECREDUCE_FMAXIMUM:%[0-9]+]]:_(s32) = G_VECREDUCE_FMAXIMUM [[BITCAST]](<4 x s32>) + ; CHECK-NEXT: $s0 = COPY [[VECREDUCE_FMAXIMUM]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $s0 %res = call float @llvm.vector.reduce.fmaximum.v4f32(<4 x float> %vec) ret float %res } @@ 
-124,12 +153,13 @@ define float @fmaximum(<4 x float> %vec) { define float @fminimum(<4 x float> %vec) { ; CHECK-LABEL: name: fminimum ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $q0 - ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 - ; CHECK: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY]](<2 x s64>) - ; CHECK: [[VECREDUCE_FMIN:%[0-9]+]]:_(s32) = G_VECREDUCE_FMINIMUM [[BITCAST]](<4 x s32>) - ; CHECK: $s0 = COPY [[VECREDUCE_FMIN]](s32) - ; CHECK: RET_ReallyLR implicit $s0 + ; CHECK-NEXT: liveins: $q0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 + ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY]](<2 x s64>) + ; CHECK-NEXT: [[VECREDUCE_FMINIMUM:%[0-9]+]]:_(s32) = G_VECREDUCE_FMINIMUM [[BITCAST]](<4 x s32>) + ; CHECK-NEXT: $s0 = COPY [[VECREDUCE_FMINIMUM]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $s0 %res = call float @llvm.vector.reduce.fminimum.v4f32(<4 x float> %vec) ret float %res } @@ -137,99 +167,91 @@ define float @fminimum(<4 x float> %vec) { define float @fminimum_nnan(<4 x float> %vec) { ; CHECK-LABEL: name: fminimum_nnan ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $q0 - ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 - ; CHECK: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY]](<2 x s64>) - ; CHECK: [[VECREDUCE_FMIN:%[0-9]+]]:_(s32) = nnan G_VECREDUCE_FMINIMUM [[BITCAST]](<4 x s32>) - ; CHECK: $s0 = COPY [[VECREDUCE_FMIN]](s32) - ; CHECK: RET_ReallyLR implicit $s0 + ; CHECK-NEXT: liveins: $q0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 + ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY]](<2 x s64>) + ; CHECK-NEXT: [[VECREDUCE_FMINIMUM:%[0-9]+]]:_(s32) = nnan G_VECREDUCE_FMINIMUM [[BITCAST]](<4 x s32>) + ; CHECK-NEXT: $s0 = COPY [[VECREDUCE_FMINIMUM]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $s0 %res = call nnan float @llvm.vector.reduce.fminimum.v4f32(<4 x float> %vec) ret float %res } -declare i32 
@llvm.vector.reduce.add.v4i32(<4 x i32>) - define i32 @add(<4 x i32> %vec) { ; CHECK-LABEL: name: add ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $q0 - ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 - ; CHECK: [[VECREDUCE_ADD:%[0-9]+]]:_(s32) = G_VECREDUCE_ADD [[COPY]](<4 x s32>) - ; CHECK: $w0 = COPY [[VECREDUCE_ADD]](s32) - ; CHECK: RET_ReallyLR implicit $w0 + ; CHECK-NEXT: liveins: $q0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 + ; CHECK-NEXT: [[VECREDUCE_ADD:%[0-9]+]]:_(s32) = G_VECREDUCE_ADD [[COPY]](<4 x s32>) + ; CHECK-NEXT: $w0 = COPY [[VECREDUCE_ADD]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 %res = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %vec) ret i32 %res } -declare i32 @llvm.vector.reduce.mul.v4i32(<4 x i32>) - define i32 @mul(<4 x i32> %vec) { ; CHECK-LABEL: name: mul ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $q0 - ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 - ; CHECK: [[VECREDUCE_MUL:%[0-9]+]]:_(s32) = G_VECREDUCE_MUL [[COPY]](<4 x s32>) - ; CHECK: $w0 = COPY [[VECREDUCE_MUL]](s32) - ; CHECK: RET_ReallyLR implicit $w0 + ; CHECK-NEXT: liveins: $q0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 + ; CHECK-NEXT: [[VECREDUCE_MUL:%[0-9]+]]:_(s32) = G_VECREDUCE_MUL [[COPY]](<4 x s32>) + ; CHECK-NEXT: $w0 = COPY [[VECREDUCE_MUL]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 %res = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %vec) ret i32 %res } -declare i32 @llvm.vector.reduce.and.v4i32(<4 x i32>) - define i32 @and(<4 x i32> %vec) { ; CHECK-LABEL: name: and ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $q0 - ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 - ; CHECK: [[VECREDUCE_AND:%[0-9]+]]:_(s32) = G_VECREDUCE_AND [[COPY]](<4 x s32>) - ; CHECK: $w0 = COPY [[VECREDUCE_AND]](s32) - ; CHECK: RET_ReallyLR implicit $w0 + ; CHECK-NEXT: liveins: $q0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 + ; CHECK-NEXT: 
[[VECREDUCE_AND:%[0-9]+]]:_(s32) = G_VECREDUCE_AND [[COPY]](<4 x s32>) + ; CHECK-NEXT: $w0 = COPY [[VECREDUCE_AND]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 %res = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %vec) ret i32 %res } -declare i32 @llvm.vector.reduce.or.v4i32(<4 x i32>) - define i32 @or(<4 x i32> %vec) { ; CHECK-LABEL: name: or ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $q0 - ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 - ; CHECK: [[VECREDUCE_OR:%[0-9]+]]:_(s32) = G_VECREDUCE_OR [[COPY]](<4 x s32>) - ; CHECK: $w0 = COPY [[VECREDUCE_OR]](s32) - ; CHECK: RET_ReallyLR implicit $w0 + ; CHECK-NEXT: liveins: $q0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 + ; CHECK-NEXT: [[VECREDUCE_OR:%[0-9]+]]:_(s32) = G_VECREDUCE_OR [[COPY]](<4 x s32>) + ; CHECK-NEXT: $w0 = COPY [[VECREDUCE_OR]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 %res = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %vec) ret i32 %res } -declare i32 @llvm.vector.reduce.xor.v4i32(<4 x i32>) - define i32 @xor(<4 x i32> %vec) { ; CHECK-LABEL: name: xor ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $q0 - ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 - ; CHECK: [[VECREDUCE_XOR:%[0-9]+]]:_(s32) = G_VECREDUCE_XOR [[COPY]](<4 x s32>) - ; CHECK: $w0 = COPY [[VECREDUCE_XOR]](s32) - ; CHECK: RET_ReallyLR implicit $w0 + ; CHECK-NEXT: liveins: $q0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 + ; CHECK-NEXT: [[VECREDUCE_XOR:%[0-9]+]]:_(s32) = G_VECREDUCE_XOR [[COPY]](<4 x s32>) + ; CHECK-NEXT: $w0 = COPY [[VECREDUCE_XOR]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 %res = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %vec) ret i32 %res } -declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>) -declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>) -declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>) -declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32>) - define i32 @smax(<4 x i32> %vec) { ; CHECK-LABEL: 
name: smax ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $q0 - ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 - ; CHECK: [[VECREDUCE_SMAX:%[0-9]+]]:_(s32) = G_VECREDUCE_SMAX [[COPY]](<4 x s32>) - ; CHECK: $w0 = COPY [[VECREDUCE_SMAX]](s32) - ; CHECK: RET_ReallyLR implicit $w0 + ; CHECK-NEXT: liveins: $q0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 + ; CHECK-NEXT: [[VECREDUCE_SMAX:%[0-9]+]]:_(s32) = G_VECREDUCE_SMAX [[COPY]](<4 x s32>) + ; CHECK-NEXT: $w0 = COPY [[VECREDUCE_SMAX]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 %res = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %vec) ret i32 %res } @@ -237,11 +259,12 @@ define i32 @smax(<4 x i32> %vec) { define i32 @smin(<4 x i32> %vec) { ; CHECK-LABEL: name: smin ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $q0 - ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 - ; CHECK: [[VECREDUCE_SMIN:%[0-9]+]]:_(s32) = G_VECREDUCE_SMIN [[COPY]](<4 x s32>) - ; CHECK: $w0 = COPY [[VECREDUCE_SMIN]](s32) - ; CHECK: RET_ReallyLR implicit $w0 + ; CHECK-NEXT: liveins: $q0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 + ; CHECK-NEXT: [[VECREDUCE_SMIN:%[0-9]+]]:_(s32) = G_VECREDUCE_SMIN [[COPY]](<4 x s32>) + ; CHECK-NEXT: $w0 = COPY [[VECREDUCE_SMIN]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 %res = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %vec) ret i32 %res } @@ -249,11 +272,12 @@ define i32 @smin(<4 x i32> %vec) { define i32 @umax(<4 x i32> %vec) { ; CHECK-LABEL: name: umax ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $q0 - ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 - ; CHECK: [[VECREDUCE_UMAX:%[0-9]+]]:_(s32) = G_VECREDUCE_UMAX [[COPY]](<4 x s32>) - ; CHECK: $w0 = COPY [[VECREDUCE_UMAX]](s32) - ; CHECK: RET_ReallyLR implicit $w0 + ; CHECK-NEXT: liveins: $q0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 + ; CHECK-NEXT: [[VECREDUCE_UMAX:%[0-9]+]]:_(s32) = G_VECREDUCE_UMAX [[COPY]](<4 x s32>) 
+ ; CHECK-NEXT: $w0 = COPY [[VECREDUCE_UMAX]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 %res = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %vec) ret i32 %res } @@ -261,11 +285,12 @@ define i32 @umax(<4 x i32> %vec) { define i32 @umin(<4 x i32> %vec) { ; CHECK-LABEL: name: umin ; CHECK: bb.1 (%ir-block.0): - ; CHECK: liveins: $q0 - ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 - ; CHECK: [[VECREDUCE_UMIN:%[0-9]+]]:_(s32) = G_VECREDUCE_UMIN [[COPY]](<4 x s32>) - ; CHECK: $w0 = COPY [[VECREDUCE_UMIN]](s32) - ; CHECK: RET_ReallyLR implicit $w0 + ; CHECK-NEXT: liveins: $q0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 + ; CHECK-NEXT: [[VECREDUCE_UMIN:%[0-9]+]]:_(s32) = G_VECREDUCE_UMIN [[COPY]](<4 x s32>) + ; CHECK-NEXT: $w0 = COPY [[VECREDUCE_UMIN]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 %res = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %vec) ret i32 %res } diff --git a/llvm/test/CodeGen/AArch64/aarch64-combine-gather-lanes-limit-size.mir b/llvm/test/CodeGen/AArch64/aarch64-combine-gather-lanes-limit-size.mir new file mode 100644 index 000000000000..17c15124e787 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/aarch64-combine-gather-lanes-limit-size.mir @@ -0,0 +1,33 @@ +# RUN: llc -run-pass=machine-combiner -aarch64-search-limit=2 -mcpu=neoverse-n2 -mtriple=aarch64-none-linux-gnu -verify-machineinstrs %s -o - | FileCheck %s + +--- +name: negative_pattern_mbb_too_large +body: | + bb.0.entry: + liveins: $x0, $x1, $x2, $x3, $x4 + + ; CHECK-LABEL: name: negative_pattern_mbb_too_large + ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4 + ; CHECK-NEXT: [[LD_i32:%[0-9]+]]:fpr32 = LDRSroX [[COPY]], killed [[COPY1]], 0, 1 + ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed 
[[LD_i32]], %subreg.ssub + ; CHECK-NEXT: [[LD_LANE_1:%[0-9]+]]:fpr128 = LD1i32 [[FIRST_REG]], 1, killed [[COPY2]] + ; CHECK-NEXT: [[LD_LANE_2:%[0-9]+]]:fpr128 = LD1i32 [[LD_LANE_1]], 2, killed [[COPY3]] + ; CHECK-NEXT: [[LD_LANE_3:%[0-9]+]]:fpr128 = LD1i32 [[LD_LANE_2]], 3, killed [[COPY4]] + ; CHECK-NEXT: $q0 = COPY [[LD_LANE_3]] + ; CHECK-NEXT: RET_ReallyLR implicit $q0 + %0:gpr64common = COPY $x0 + %1:gpr64common = COPY $x1 + %2:gpr64common = COPY $x2 + %3:gpr64common = COPY $x3 + %4:gpr64common = COPY $x4 + %5:fpr32 = LDRSroX %0, killed %1, 0, 1 + %6:fpr128 = SUBREG_TO_REG 0, killed %5, %subreg.ssub + %7:fpr128 = LD1i32 %6, 1, killed %2 + %8:fpr128 = LD1i32 %7, 2, killed %3 + %9:fpr128 = LD1i32 %8, 3, killed %4 + $q0 = COPY %9 + RET_ReallyLR implicit $q0 \ No newline at end of file diff --git a/llvm/test/CodeGen/AArch64/aarch64-combine-gather-lanes-with-call.mir b/llvm/test/CodeGen/AArch64/aarch64-combine-gather-lanes-with-call.mir new file mode 100644 index 000000000000..6b338d98afb5 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/aarch64-combine-gather-lanes-with-call.mir @@ -0,0 +1,45 @@ +# RUN: llc -run-pass=machine-combiner -mcpu=neoverse-n2 -mtriple=aarch64-none-linux-gnu -verify-machineinstrs %s -o - | FileCheck %s + + +--- | + @external_func = external global i32 + define void @negative_pattern_offset_reg_copied_to_physical(i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4) { + entry: + ret void + } +... 
+--- +name: negative_pattern_offset_reg_copied_to_physical +body: | + bb.0.entry: + liveins: $x0, $x1, $x2, $x3 + + ; CHECK-LABEL: name: negative_pattern_offset_reg_copied_to_physical + ; CHECK: [[BASE_REG:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-NEXT: [[PTR_1:%[0-9]+]]:gpr64common = COPY $x1 + ; CHECK-NEXT: [[PTR_2:%[0-9]+]]:gpr64common = COPY $x2 + ; CHECK-NEXT: [[PTR_3:%[0-9]+]]:gpr64common = COPY $x3 + ; CHECK-NEXT: [[LD_i32:%[0-9]+]]:fpr32 = LDRSroX [[BASE_REG]], killed [[PTR_1]], 0, 1 + ; CHECK-NEXT: [[LD_LANE_0:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i32]], %subreg.ssub + ; CHECK-NEXT: [[LD_LANE_1:%[0-9]+]]:fpr128 = LD1i32 [[LD_LANE_0]], 1, [[PTR_2]] + ; CHECK-NEXT: $x0 = COPY [[PTR_2]] + ; CHECK-NEXT: BL @external_func, csr_aarch64_aapcs, implicit-def $lr, implicit $x0, implicit-def $x0 + ; CHECK-NEXT: [[LD_LANE_2:%[0-9]+]]:fpr128 = LD1i32 [[LD_LANE_1]], 2, killed [[PTR_2]] + ; CHECK-NEXT: [[LD_LANE_3:%[0-9]+]]:fpr128 = LD1i32 [[LD_LANE_2]], 3, killed [[PTR_3]] + ; CHECK-NEXT: [[RESULT:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-NEXT: $q0 = COPY [[LD_LANE_3]] + ; CHECK-NEXT: RET_ReallyLR implicit $q0 + %0:gpr64common = COPY $x0 + %1:gpr64common = COPY $x1 + %2:gpr64common = COPY $x2 + %3:gpr64common = COPY $x3 + %5:fpr32 = LDRSroX %0, killed %1, 0, 1 + %6:fpr128 = SUBREG_TO_REG 0, killed %5, %subreg.ssub + %7:fpr128 = LD1i32 %6, 1, %2 + $x0 = COPY %2 + BL @external_func, csr_aarch64_aapcs, implicit-def $lr, implicit $x0, implicit-def $x0 + %8:fpr128 = LD1i32 %7, 2, killed %2 + %9:fpr128 = LD1i32 %8, 3, killed %3 + %10:gpr64common = COPY $x0 + $q0 = COPY %9 + RET_ReallyLR implicit $q0 \ No newline at end of file diff --git a/llvm/test/CodeGen/AArch64/aarch64-combine-gather-lanes.mir b/llvm/test/CodeGen/AArch64/aarch64-combine-gather-lanes.mir new file mode 100644 index 000000000000..a7570d2293f8 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/aarch64-combine-gather-lanes.mir @@ -0,0 +1,400 @@ +# NOTE: Assertions have been autogenerated by 
utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -run-pass=machine-combiner -mcpu=neoverse-n2 -mtriple=aarch64-none-linux-gnu -verify-machineinstrs %s -o - | FileCheck %s + +--- +name: split_loads_to_fpr128 +body: | + bb.0.entry: + liveins: $x0, $x1, $x2, $x3, $x4 + + ; CHECK-LABEL: name: split_loads_to_fpr128 + ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4 + ; CHECK-NEXT: [[LD_i32:%[0-9]+]]:fpr32 = LDRSroX [[COPY]], [[COPY1]], 0, 1 + ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, [[LD_i32]], %subreg.ssub + ; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i32 [[FIRST_REG]], 1, [[COPY2]] + ; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr32 = LDRSui [[COPY3]], 0 + ; CHECK-NEXT: [[SECOND_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD1_0]], %subreg.ssub + ; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i32 [[SECOND_REG]], 1, [[COPY4]] + ; CHECK-NEXT: [[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_1]], [[LD1_1]] + ; CHECK-NEXT: $q0 = COPY [[ZIP]] + ; CHECK-NEXT: RET_ReallyLR implicit $q0 + %0:gpr64common = COPY $x0 + %1:gpr64common = COPY $x1 + %2:gpr64common = COPY $x2 + %3:gpr64common = COPY $x3 + %4:gpr64common = COPY $x4 + %5:fpr32 = LDRSroX %0, %1, 0, 1 + %6:fpr128 = SUBREG_TO_REG 0, %5, %subreg.ssub + %7:fpr128 = LD1i32 %6, 1, %2 + %8:fpr128 = LD1i32 %7, 2, %3 + %9:fpr128 = LD1i32 %8, 3, %4 + $q0 = COPY %9 + RET_ReallyLR implicit $q0 + +--- +name: split_loads_to_fpr128_ui +body: | + bb.0.entry: + liveins: $x0, $x1, $x2, $x3, $x4 + + ; CHECK-LABEL: name: split_loads_to_fpr128_ui + ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3 + ; CHECK-NEXT: 
[[COPY4:%[0-9]+]]:gpr64common = COPY $x4 + ; CHECK-NEXT: [[LD_i32:%[0-9]+]]:fpr32 = LDRSui [[COPY]], 0 + ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i32]], %subreg.ssub + ; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i32 [[FIRST_REG]], 1, killed [[COPY1]] + ; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr32 = LDRSui [[COPY2]], 0 + ; CHECK-NEXT: [[SECOND_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD1_0]], %subreg.ssub + ; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i32 [[SECOND_REG]], 1, killed [[COPY3]] + ; CHECK-NEXT: [[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_1]], [[LD1_1]] + ; CHECK-NEXT: $q0 = COPY [[ZIP]] + ; CHECK-NEXT: RET_ReallyLR implicit $q0 + %0:gpr64common = COPY $x0 + %1:gpr64common = COPY $x1 + %2:gpr64common = COPY $x2 + %3:gpr64common = COPY $x3 + %4:gpr64common = COPY $x4 + %5:fpr32 = LDRSui %0, 0 + %6:fpr128 = SUBREG_TO_REG 0, killed %5, %subreg.ssub + %7:fpr128 = LD1i32 %6, 1, killed %1 + %8:fpr128 = LD1i32 %7, 2, killed %2 + %9:fpr128 = LD1i32 %8, 3, killed %3 + $q0 = COPY %9 + RET_ReallyLR implicit $q0 + +--- +name: split_loads_to_fpr128_i16 +body: | + bb.0.entry: + liveins: $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8 + + ; CHECK-LABEL: name: split_loads_to_fpr128_i16 + ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr64common = COPY $x5 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr64common = COPY $x6 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gpr64common = COPY $x7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:gpr64common = COPY $x8 + ; CHECK-NEXT: [[LD_i16:%[0-9]+]]:fpr16 = LDRHroX [[COPY]], killed [[COPY1]], 0, 1 + ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i16]], %subreg.hsub + ; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i16 [[FIRST_REG]], 1, killed [[COPY2]] + 
; CHECK-NEXT: [[LD0_2:%[0-9]+]]:fpr128 = LD1i16 [[LD0_1]], 2, killed [[COPY3]] + ; CHECK-NEXT: [[LD0_3:%[0-9]+]]:fpr128 = LD1i16 [[LD0_2]], 3, killed [[COPY4]] + ; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr16 = LDRHui [[COPY5]], 0 + ; CHECK-NEXT: [[SECOND_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD1_0]], %subreg.hsub + ; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i16 [[SECOND_REG]], 1, killed [[COPY6]] + ; CHECK-NEXT: [[LD1_2:%[0-9]+]]:fpr128 = LD1i16 [[LD1_1]], 2, killed [[COPY7]] + ; CHECK-NEXT: [[LD1_3:%[0-9]+]]:fpr128 = LD1i16 [[LD1_2]], 3, killed [[COPY8]] + ; CHECK-NEXT: [[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_3]], [[LD1_3]] + ; CHECK-NEXT: $q0 = COPY [[ZIP]] + ; CHECK-NEXT: RET_ReallyLR implicit $q0 + %0:gpr64common = COPY $x0 + %1:gpr64common = COPY $x1 + %2:gpr64common = COPY $x2 + %3:gpr64common = COPY $x3 + %4:gpr64common = COPY $x4 + %5:gpr64common = COPY $x5 + %6:gpr64common = COPY $x6 + %7:gpr64common = COPY $x7 + %8:gpr64common = COPY $x8 + %9:fpr16 = LDRHroX %0, killed %1, 0, 1 + %10:fpr128 = SUBREG_TO_REG 0, killed %9, %subreg.hsub + %11:fpr128 = LD1i16 %10, 1, killed %2 + %12:fpr128 = LD1i16 %11, 2, killed %3 + %13:fpr128 = LD1i16 %12, 3, killed %4 + %14:fpr128 = LD1i16 %13, 4, killed %5 + %15:fpr128 = LD1i16 %14, 5, killed %6 + %16:fpr128 = LD1i16 %15, 6, killed %7 + %17:fpr128 = LD1i16 %16, 7, killed %8 + $q0 = COPY %17 + RET_ReallyLR implicit $q0 + +--- +name: split_loads_to_fpr128_i16_ui +body: | + bb.0.entry: + liveins: $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8 + + ; CHECK-LABEL: name: split_loads_to_fpr128_i16_ui + ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr64common = COPY $x5 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr64common = COPY $x6 + ; CHECK-NEXT: 
[[COPY7:%[0-9]+]]:gpr64common = COPY $x7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:gpr64common = COPY $x8 + ; CHECK-NEXT: [[LD_i16:%[0-9]+]]:fpr16 = LDRHui [[COPY]], 0 + ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i16]], %subreg.hsub + ; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i16 [[FIRST_REG]], 1, killed [[COPY1]] + ; CHECK-NEXT: [[LD0_2:%[0-9]+]]:fpr128 = LD1i16 [[LD0_1]], 2, killed [[COPY2]] + ; CHECK-NEXT: [[LD0_3:%[0-9]+]]:fpr128 = LD1i16 [[LD0_2]], 3, killed [[COPY3]] + ; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr16 = LDRHui [[COPY4]], 0 + ; CHECK-NEXT: [[SECOND_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD1_0]], %subreg.hsub + ; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i16 [[SECOND_REG]], 1, killed [[COPY5]] + ; CHECK-NEXT: [[LD1_2:%[0-9]+]]:fpr128 = LD1i16 [[LD1_1]], 2, killed [[COPY6]] + ; CHECK-NEXT: [[LD1_3:%[0-9]+]]:fpr128 = LD1i16 [[LD1_2]], 3, killed [[COPY7]] + ; CHECK-NEXT: [[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_3]], [[LD1_3]] + ; CHECK-NEXT: $q0 = COPY [[ZIP]] + ; CHECK-NEXT: RET_ReallyLR implicit $q0 + %0:gpr64common = COPY $x0 + %1:gpr64common = COPY $x1 + %2:gpr64common = COPY $x2 + %3:gpr64common = COPY $x3 + %4:gpr64common = COPY $x4 + %5:gpr64common = COPY $x5 + %6:gpr64common = COPY $x6 + %7:gpr64common = COPY $x7 + %8:gpr64common = COPY $x8 + %9:fpr16 = LDRHui %0, 0 + %10:fpr128 = SUBREG_TO_REG 0, killed %9, %subreg.hsub + %11:fpr128 = LD1i16 %10, 1, killed %1 + %12:fpr128 = LD1i16 %11, 2, killed %2 + %13:fpr128 = LD1i16 %12, 3, killed %3 + %14:fpr128 = LD1i16 %13, 4, killed %4 + %15:fpr128 = LD1i16 %14, 5, killed %5 + %16:fpr128 = LD1i16 %15, 6, killed %6 + %17:fpr128 = LD1i16 %16, 7, killed %7 + $q0 = COPY %17 + RET_ReallyLR implicit $q0 + +--- +name: split_loads_to_fpr128_i8 +body: | + bb.0.entry: + liveins: $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7, $x8, $x9, $x10, $x11, $x12, $x13, $x14, $x15, $x16 + + ; CHECK-LABEL: name: split_loads_to_fpr128_i8 + ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-NEXT: 
[[COPY1:%[0-9]+]]:gpr64common = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr64common = COPY $x5 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr64common = COPY $x6 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gpr64common = COPY $x7 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:gpr64common = COPY $x8 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:gpr64common = COPY $x9 + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:gpr64common = COPY $x10 + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:gpr64common = COPY $x11 + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:gpr64common = COPY $x12 + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:gpr64common = COPY $x13 + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:gpr64common = COPY $x14 + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:gpr64common = COPY $x15 + ; CHECK-NEXT: [[COPY16:%[0-9]+]]:gpr64common = COPY $x16 + ; CHECK-NEXT: [[LD_i8:%[0-9]+]]:fpr8 = LDRBroX [[COPY]], killed [[COPY1]], 0, 0 + ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i8]], %subreg.bsub + ; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i8 [[FIRST_REG]], 1, killed [[COPY2]] + ; CHECK-NEXT: [[LD0_2:%[0-9]+]]:fpr128 = LD1i8 [[LD0_1]], 2, killed [[COPY3]] + ; CHECK-NEXT: [[LD0_3:%[0-9]+]]:fpr128 = LD1i8 [[LD0_2]], 3, killed [[COPY4]] + ; CHECK-NEXT: [[LD0_4:%[0-9]+]]:fpr128 = LD1i8 [[LD0_3]], 4, killed [[COPY5]] + ; CHECK-NEXT: [[LD0_5:%[0-9]+]]:fpr128 = LD1i8 [[LD0_4]], 5, killed [[COPY6]] + ; CHECK-NEXT: [[LD0_6:%[0-9]+]]:fpr128 = LD1i8 [[LD0_5]], 6, killed [[COPY7]] + ; CHECK-NEXT: [[LD0_7:%[0-9]+]]:fpr128 = LD1i8 [[LD0_6]], 7, killed [[COPY8]] + ; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr8 = LDRBui [[COPY9]], 0 + ; CHECK-NEXT: [[SECOND_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD1_0]], %subreg.bsub + ; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i8 [[SECOND_REG]], 1, killed [[COPY10]] + ; CHECK-NEXT: [[LD1_2:%[0-9]+]]:fpr128 = LD1i8 [[LD1_1]], 2, killed [[COPY11]] + ; CHECK-NEXT: 
[[LD1_3:%[0-9]+]]:fpr128 = LD1i8 [[LD1_2]], 3, killed [[COPY12]] + ; CHECK-NEXT: [[LD1_4:%[0-9]+]]:fpr128 = LD1i8 [[LD1_3]], 4, killed [[COPY13]] + ; CHECK-NEXT: [[LD1_5:%[0-9]+]]:fpr128 = LD1i8 [[LD1_4]], 5, killed [[COPY14]] + ; CHECK-NEXT: [[LD1_6:%[0-9]+]]:fpr128 = LD1i8 [[LD1_5]], 6, killed [[COPY15]] + ; CHECK-NEXT: [[LD1_7:%[0-9]+]]:fpr128 = LD1i8 [[LD1_6]], 7, killed [[COPY16]] + ; CHECK-NEXT: [[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_7]], [[LD1_7]] + ; CHECK-NEXT: $q0 = COPY [[ZIP]] + ; CHECK-NEXT: RET_ReallyLR implicit $q0 + %0:gpr64common = COPY $x0 + %1:gpr64common = COPY $x1 + %2:gpr64common = COPY $x2 + %3:gpr64common = COPY $x3 + %4:gpr64common = COPY $x4 + %5:gpr64common = COPY $x5 + %6:gpr64common = COPY $x6 + %7:gpr64common = COPY $x7 + %8:gpr64common = COPY $x8 + %9:gpr64common = COPY $x9 + %10:gpr64common = COPY $x10 + %11:gpr64common = COPY $x11 + %12:gpr64common = COPY $x12 + %13:gpr64common = COPY $x13 + %14:gpr64common = COPY $x14 + %15:gpr64common = COPY $x15 + %16:gpr64common = COPY $x16 + %17:fpr8 = LDRBroX %0, killed %1, 0, 0 + %18:fpr128 = SUBREG_TO_REG 0, killed %17, %subreg.bsub + %19:fpr128 = LD1i8 %18, 1, killed %2 + %20:fpr128 = LD1i8 %19, 2, killed %3 + %21:fpr128 = LD1i8 %20, 3, killed %4 + %22:fpr128 = LD1i8 %21, 4, killed %5 + %23:fpr128 = LD1i8 %22, 5, killed %6 + %24:fpr128 = LD1i8 %23, 6, killed %7 + %25:fpr128 = LD1i8 %24, 7, killed %8 + %26:fpr128 = LD1i8 %25, 8, killed %9 + %27:fpr128 = LD1i8 %26, 9, killed %10 + %28:fpr128 = LD1i8 %27, 10, killed %11 + %29:fpr128 = LD1i8 %28, 11, killed %12 + %30:fpr128 = LD1i8 %29, 12, killed %13 + %31:fpr128 = LD1i8 %30, 13, killed %14 + %32:fpr128 = LD1i8 %31, 14, killed %15 + %33:fpr128 = LD1i8 %32, 15, killed %16 + $q0 = COPY %33 + RET_ReallyLR implicit $q0 + +--- +name: negative_pattern_missing_lanes +body: | + bb.0.entry: + liveins: $x0, $x1 + + ; CHECK-LABEL: name: negative_pattern_missing_lanes + ; CHECK: [[LD1:%.*]]:fpr128 = LDRQui $x1, 0 + ; CHECK-NEXT: [[LD2:%.*]]:fpr128 = 
LD1i32 [[LD1]] + + %0:gpr64common = COPY $x0 + %1:fpr128 = LDRQui $x1, 0 + %2:fpr128 = LD1i32 %1, 3, %0 + $q0 = COPY %2 + RET_ReallyLR implicit $q0 + +--- +name: out_of_order_lanes +body: | + bb.0.entry: + liveins: $x0, $x1, $x2, $x3, $x4 + + ; CHECK-LABEL: name: out_of_order_lanes + ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4 + ; CHECK-NEXT: [[LD_i32:%[0-9]+]]:fpr32 = LDRSroX [[COPY]], killed [[COPY1]], 0, 1 + ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i32]], %subreg.ssub + ; CHECK-NEXT: [[LD0_1:%[0-9]+]]:fpr128 = LD1i32 [[FIRST_REG]], 1, killed [[COPY3]] + ; CHECK-NEXT: [[LD1_0:%[0-9]+]]:fpr32 = LDRSui [[COPY2]], 0 + ; CHECK-NEXT: [[SECOND_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD1_0]], %subreg.ssub + ; CHECK-NEXT: [[LD1_1:%[0-9]+]]:fpr128 = LD1i32 [[SECOND_REG]], 1, killed [[COPY4]] + ; CHECK-NEXT: [[ZIP:%[0-9]+]]:fpr128 = ZIP1v2i64 [[LD0_1]], [[LD1_1]] + ; CHECK-NEXT: $q0 = COPY [[ZIP]] + ; CHECK-NEXT: RET_ReallyLR implicit $q0 + %0:gpr64common = COPY $x0 + %1:gpr64common = COPY $x1 + %2:gpr64common = COPY $x2 + %3:gpr64common = COPY $x3 + %4:gpr64common = COPY $x4 + %5:fpr32 = LDRSroX %0, killed %1, 0, 1 + %6:fpr128 = SUBREG_TO_REG 0, killed %5, %subreg.ssub + %7:fpr128 = LD1i32 %6, 2, killed %2 + %8:fpr128 = LD1i32 %7, 1, killed %3 + %9:fpr128 = LD1i32 %8, 3, killed %4 + $q0 = COPY %9 + RET_ReallyLR implicit $q0 + +--- +name: negative_pattern_no_subreg_to_reg +body: | + bb.0.entry: + liveins: $x0, $x1, $x2, $x3 + + ; CHECK-LABEL: name: negative_pattern_no_subreg_to_reg + ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3 + ; 
CHECK-NEXT: [[INITIAL_VEC:%[0-9]+]]:fpr128 = LDRQui [[COPY]], 0 + ; CHECK-NEXT: [[LD_LANE_1:%[0-9]+]]:fpr128 = LD1i32 [[INITIAL_VEC]], 1, killed [[COPY1]] + ; CHECK-NEXT: [[LD_LANE_2:%[0-9]+]]:fpr128 = LD1i32 [[LD_LANE_1]], 2, killed [[COPY2]] + ; CHECK-NEXT: [[LD_LANE_3:%[0-9]+]]:fpr128 = LD1i32 [[LD_LANE_2]], 3, killed [[COPY3]] + ; CHECK-NEXT: $q0 = COPY [[LD_LANE_3]] + ; CHECK-NEXT: RET_ReallyLR implicit $q0 + %0:gpr64common = COPY $x0 + %1:gpr64common = COPY $x1 + %2:gpr64common = COPY $x2 + %3:gpr64common = COPY $x3 + %4:fpr128 = LDRQui %0, 0 + %5:fpr128 = LD1i32 %4, 1, killed %1 + %6:fpr128 = LD1i32 %5, 2, killed %2 + %7:fpr128 = LD1i32 %6, 3, killed %3 + $q0 = COPY %7 + RET_ReallyLR implicit $q0 + +--- +name: negative_pattern_multiple_users +body: | + bb.0.entry: + liveins: $x0, $x1, $x2, $x3, $x4 + + ; CHECK-LABEL: name: negative_pattern_multiple_users + ; CHECK: [[COPY:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64common = COPY $x1 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64common = COPY $x2 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64common = COPY $x3 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64common = COPY $x4 + ; CHECK-NEXT: [[LD_i32:%[0-9]+]]:fpr32 = LDRSroX [[COPY]], killed [[COPY1]], 0, 1 + ; CHECK-NEXT: [[FIRST_REG:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LD_i32]], %subreg.ssub + ; CHECK-NEXT: [[LD_LANE_1:%[0-9]+]]:fpr128 = LD1i32 [[FIRST_REG]], 1, killed [[COPY2]] + ; CHECK-NEXT: [[LD_LANE_2:%[0-9]+]]:fpr128 = LD1i32 [[LD_LANE_1]], 2, killed [[COPY3]] + ; CHECK-NEXT: [[LD_LANE_3:%[0-9]+]]:fpr128 = LD1i32 [[LD_LANE_2]], 3, killed [[COPY4]] + ; CHECK-NEXT: $q0 = COPY [[LD_LANE_3]] + ; CHECK-NEXT: $q1 = COPY [[LD_LANE_2]] + ; CHECK-NEXT: RET_ReallyLR implicit $q0, implicit $q1 + %0:gpr64common = COPY $x0 + %1:gpr64common = COPY $x1 + %2:gpr64common = COPY $x2 + %3:gpr64common = COPY $x3 + %4:gpr64common = COPY $x4 + %5:fpr32 = LDRSroX %0, killed %1, 0, 1 + %6:fpr128 = SUBREG_TO_REG 0, killed %5, %subreg.ssub + %7:fpr128 = LD1i32 %6, 
1, killed %2 + %8:fpr128 = LD1i32 %7, 2, killed %3 + %9:fpr128 = LD1i32 %8, 3, killed %4 + $q0 = COPY %9 + $q1 = COPY %8 + RET_ReallyLR implicit $q0, implicit $q1 + +--- +name: aliasing_store_between_vector_loads +alignment: 4 +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x0, $x1, $x2, $x3 + + ; CHECK-LABEL: name: aliasing_store_between_vector_loads + ; CHECK: [[BASE_PTR:%[0-9]+]]:gpr64common = COPY $x0 + ; CHECK-NEXT: [[OFFSET_PTR:%[0-9]+]]:gpr64common = COPY $x1 + ; CHECK-NEXT: [[ALIAS_ADDR:%[0-9]+]]:gpr64common = COPY $x2 + ; CHECK-NEXT: [[OTHER_ADDR:%[0-9]+]]:gpr64common = COPY $x3 + ; CHECK-NEXT: [[LOAD0:%[0-9]+]]:fpr32 = LDRSroX [[BASE_PTR]], killed [[OFFSET_PTR]], 0, 1 + ; CHECK-NEXT: [[VEC0:%[0-9]+]]:fpr128 = SUBREG_TO_REG 0, killed [[LOAD0]], %subreg.ssub + ; CHECK-NEXT: [[VEC1:%[0-9]+]]:fpr128 = LD1i32 [[VEC0]], 1, [[ALIAS_ADDR]] + ; CHECK-NEXT: [[CONST:%[0-9]+]]:gpr32 = MOVi32imm 99 + ; CHECK-NEXT: STRWui [[CONST]], [[ALIAS_ADDR]], 0 + ; CHECK-NEXT: [[VEC2:%[0-9]+]]:fpr128 = LD1i32 [[VEC1]], 2, killed [[ALIAS_ADDR]] + ; CHECK-NEXT: [[VEC3:%[0-9]+]]:fpr128 = LD1i32 [[VEC2]], 3, killed [[OTHER_ADDR]] + ; CHECK-NEXT: $q0 = COPY [[VEC3]] + ; CHECK-NEXT: RET_ReallyLR implicit $q0 + %0:gpr64common = COPY $x0 + %1:gpr64common = COPY $x1 + %2:gpr64common = COPY $x2 + %3:gpr64common = COPY $x3 + %5:fpr32 = LDRSroX %0, killed %1, 0, 1 + %6:fpr128 = SUBREG_TO_REG 0, killed %5, %subreg.ssub + %7:fpr128 = LD1i32 %6, 1, %2 + %10:gpr32 = MOVi32imm 99 + STRWui %10, %2, 0 + %8:fpr128 = LD1i32 %7, 2, killed %2 + %9:fpr128 = LD1i32 %8, 3, killed %3 + $q0 = COPY %9 + RET_ReallyLR implicit $q0 diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll index 7686740aec30..13434fabefa7 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll @@ -203,89 +203,93 @@ define <12 x float> 
@abp90c12(<12 x float> %a, <12 x float> %b, <12 x float> %c) ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1 ; CHECK-NEXT: // kill: def $s3 killed $s3 def $q3 -; CHECK-NEXT: ldr s17, [sp, #40] -; CHECK-NEXT: add x10, sp, #56 ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 -; CHECK-NEXT: add x9, sp, #48 -; CHECK-NEXT: mov v1.s[1], v3.s[0] -; CHECK-NEXT: ldr s3, [sp, #32] ; CHECK-NEXT: // kill: def $s2 killed $s2 def $q2 -; CHECK-NEXT: mov v0.s[1], v2.s[0] -; CHECK-NEXT: ld1 { v17.s }[1], [x10] +; CHECK-NEXT: ldr s17, [sp, #32] ; CHECK-NEXT: // kill: def $s5 killed $s5 def $q5 -; CHECK-NEXT: ldr s16, [sp, #8] +; CHECK-NEXT: add x9, sp, #48 +; CHECK-NEXT: add x10, sp, #64 +; CHECK-NEXT: mov v1.s[1], v3.s[0] +; CHECK-NEXT: mov v0.s[1], v2.s[0] ; CHECK-NEXT: // kill: def $s4 killed $s4 def $q4 -; CHECK-NEXT: add x10, sp, #24 -; CHECK-NEXT: ld1 { v3.s }[1], [x9] -; CHECK-NEXT: add x9, sp, #72 -; CHECK-NEXT: // kill: def $s7 killed $s7 def $q7 +; CHECK-NEXT: add x11, sp, #72 +; CHECK-NEXT: ld1 { v17.s }[1], [x9] +; CHECK-NEXT: ldr s18, [x10] +; CHECK-NEXT: add x9, sp, #80 +; CHECK-NEXT: add x10, sp, #56 ; CHECK-NEXT: // kill: def $s6 killed $s6 def $q6 -; CHECK-NEXT: ldr s2, [sp] -; CHECK-NEXT: ld1 { v16.s }[1], [x10] -; CHECK-NEXT: add x10, sp, #112 -; CHECK-NEXT: ldr s20, [sp, #136] -; CHECK-NEXT: mov v1.s[2], v5.s[0] -; CHECK-NEXT: ld1 { v17.s }[2], [x9] -; CHECK-NEXT: add x9, sp, #64 -; CHECK-NEXT: ldr s5, [sp, #96] -; CHECK-NEXT: ld1 { v3.s }[2], [x9] -; CHECK-NEXT: mov v0.s[2], v4.s[0] +; CHECK-NEXT: // kill: def $s7 killed $s7 def $q7 +; CHECK-NEXT: ldr s16, [sp, #8] +; CHECK-NEXT: ldr s3, [sp, #96] +; CHECK-NEXT: ld1 { v18.s }[1], [x9] ; CHECK-NEXT: add x9, sp, #88 -; CHECK-NEXT: ldr s4, [sp, #104] -; CHECK-NEXT: ldr s19, [sp, #192] +; CHECK-NEXT: ldr s2, [sp] +; CHECK-NEXT: mov v1.s[2], v5.s[0] +; CHECK-NEXT: ldr s5, [sp, #40] +; CHECK-NEXT: mov v0.s[2], v4.s[0] ; CHECK-NEXT: ld1 { v5.s }[1], [x10] -; CHECK-NEXT: add x10, sp, #80 
-; CHECK-NEXT: ld1 { v17.s }[3], [x9] -; CHECK-NEXT: mov v1.s[3], v7.s[0] -; CHECK-NEXT: add x9, sp, #120 -; CHECK-NEXT: ld1 { v3.s }[3], [x10] -; CHECK-NEXT: ld1 { v4.s }[1], [x9] -; CHECK-NEXT: ldr s7, [sp, #128] +; CHECK-NEXT: ldr s19, [x11] ; CHECK-NEXT: add x10, sp, #144 +; CHECK-NEXT: zip1 v4.2d, v17.2d, v18.2d +; CHECK-NEXT: add x11, sp, #160 +; CHECK-NEXT: ldr s18, [sp, #136] +; CHECK-NEXT: ld1 { v19.s }[1], [x9] ; CHECK-NEXT: mov v0.s[3], v6.s[0] -; CHECK-NEXT: add x9, sp, #16 +; CHECK-NEXT: ldr s6, [sp, #128] +; CHECK-NEXT: mov v1.s[3], v7.s[0] +; CHECK-NEXT: add x9, sp, #24 +; CHECK-NEXT: ldr s7, [sp, #104] +; CHECK-NEXT: ld1 { v16.s }[1], [x9] +; CHECK-NEXT: add x9, sp, #112 +; CHECK-NEXT: ld1 { v6.s }[1], [x10] +; CHECK-NEXT: zip1 v5.2d, v5.2d, v19.2d +; CHECK-NEXT: add x10, sp, #120 +; CHECK-NEXT: ld1 { v3.s }[1], [x9] ; CHECK-NEXT: ld1 { v7.s }[1], [x10] -; CHECK-NEXT: ld1 { v2.s }[1], [x9] -; CHECK-NEXT: add x9, sp, #160 -; CHECK-NEXT: fmul v6.4s, v17.4s, v1.4s -; CHECK-NEXT: fmul v18.4s, v4.4s, v16.4s -; CHECK-NEXT: fmul v16.4s, v5.4s, v16.4s -; CHECK-NEXT: fmul v1.4s, v3.4s, v1.4s -; CHECK-NEXT: add x10, sp, #208 -; CHECK-NEXT: ld1 { v7.s }[2], [x9] -; CHECK-NEXT: add x9, sp, #152 -; CHECK-NEXT: ld1 { v19.s }[1], [x10] -; CHECK-NEXT: ld1 { v20.s }[1], [x9] +; CHECK-NEXT: ldr s17, [x11] ; CHECK-NEXT: add x9, sp, #176 -; CHECK-NEXT: add x10, sp, #184 -; CHECK-NEXT: fneg v6.4s, v6.4s -; CHECK-NEXT: fneg v18.4s, v18.4s -; CHECK-NEXT: fmla v16.4s, v2.4s, v4.4s -; CHECK-NEXT: fmla v1.4s, v0.4s, v17.4s -; CHECK-NEXT: ld1 { v7.s }[3], [x9] -; CHECK-NEXT: add x9, sp, #168 -; CHECK-NEXT: ld1 { v20.s }[2], [x9] -; CHECK-NEXT: ldr s4, [sp, #200] +; CHECK-NEXT: add x10, sp, #16 +; CHECK-NEXT: add x11, sp, #168 +; CHECK-NEXT: ld1 { v17.s }[1], [x9] +; CHECK-NEXT: ld1 { v2.s }[1], [x10] +; CHECK-NEXT: add x9, sp, #152 +; CHECK-NEXT: fmul v19.4s, v5.4s, v1.4s +; CHECK-NEXT: fmul v20.4s, v7.4s, v16.4s +; CHECK-NEXT: fmul v16.4s, v3.4s, v16.4s +; CHECK-NEXT: fmul 
v1.4s, v4.4s, v1.4s +; CHECK-NEXT: ld1 { v18.s }[1], [x9] +; CHECK-NEXT: ldr s21, [x11] +; CHECK-NEXT: zip1 v6.2d, v6.2d, v17.2d +; CHECK-NEXT: ldr s17, [sp, #192] +; CHECK-NEXT: add x9, sp, #184 +; CHECK-NEXT: add x10, sp, #208 +; CHECK-NEXT: ld1 { v21.s }[1], [x9] ; CHECK-NEXT: add x9, sp, #216 -; CHECK-NEXT: fmla v6.4s, v0.4s, v3.4s -; CHECK-NEXT: fmla v18.4s, v2.4s, v5.4s -; CHECK-NEXT: ld1 { v4.s }[1], [x9] -; CHECK-NEXT: fsub v0.4s, v7.4s, v1.4s -; CHECK-NEXT: fsub v1.4s, v19.4s, v16.4s -; CHECK-NEXT: ld1 { v20.s }[3], [x10] -; CHECK-NEXT: fadd v2.4s, v4.4s, v18.4s -; CHECK-NEXT: fadd v3.4s, v20.4s, v6.4s +; CHECK-NEXT: fneg v19.4s, v19.4s +; CHECK-NEXT: fneg v20.4s, v20.4s +; CHECK-NEXT: fmla v16.4s, v2.4s, v7.4s +; CHECK-NEXT: fmla v1.4s, v0.4s, v5.4s +; CHECK-NEXT: ld1 { v17.s }[1], [x10] +; CHECK-NEXT: ldr s5, [sp, #200] +; CHECK-NEXT: zip1 v7.2d, v18.2d, v21.2d +; CHECK-NEXT: ld1 { v5.s }[1], [x9] +; CHECK-NEXT: fmla v19.4s, v0.4s, v4.4s +; CHECK-NEXT: fmla v20.4s, v2.4s, v3.4s +; CHECK-NEXT: fsub v0.4s, v6.4s, v1.4s +; CHECK-NEXT: fsub v1.4s, v17.4s, v16.4s +; CHECK-NEXT: fadd v2.4s, v7.4s, v19.4s +; CHECK-NEXT: fadd v3.4s, v5.4s, v20.4s ; CHECK-NEXT: ext v4.16b, v0.16b, v1.16b, #12 -; CHECK-NEXT: ext v5.16b, v3.16b, v2.16b, #12 -; CHECK-NEXT: trn2 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ext v5.16b, v2.16b, v3.16b, #12 +; CHECK-NEXT: trn2 v1.4s, v1.4s, v3.4s ; CHECK-NEXT: ext v4.16b, v0.16b, v4.16b, #12 -; CHECK-NEXT: ext v5.16b, v3.16b, v5.16b, #8 +; CHECK-NEXT: ext v5.16b, v2.16b, v5.16b, #8 ; CHECK-NEXT: rev64 v4.4s, v4.4s -; CHECK-NEXT: trn2 v2.4s, v4.4s, v5.4s -; CHECK-NEXT: zip2 v4.4s, v0.4s, v3.4s -; CHECK-NEXT: zip1 v0.4s, v0.4s, v3.4s -; CHECK-NEXT: ext v1.16b, v2.16b, v1.16b, #8 -; CHECK-NEXT: mov v4.d[1], v2.d[0] +; CHECK-NEXT: trn2 v3.4s, v4.4s, v5.4s +; CHECK-NEXT: zip2 v4.4s, v0.4s, v2.4s +; CHECK-NEXT: zip1 v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ext v1.16b, v3.16b, v1.16b, #8 +; CHECK-NEXT: mov v4.d[1], v3.d[0] ; CHECK-NEXT: str q0, [x8] ; 
CHECK-NEXT: stp q4, q1, [x8, #16] ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/concat-vector.ll b/llvm/test/CodeGen/AArch64/concat-vector.ll index acf15f1bd117..e6f27b95d92c 100644 --- a/llvm/test/CodeGen/AArch64/concat-vector.ll +++ b/llvm/test/CodeGen/AArch64/concat-vector.ll @@ -186,8 +186,9 @@ define <16 x i8> @concat_v16s8_v4s8_load(ptr %ptrA, ptr %ptrB, ptr %ptrC, ptr %p ; CHECK: // %bb.0: ; CHECK-NEXT: ldr s0, [x0] ; CHECK-NEXT: ld1 { v0.s }[1], [x1] -; CHECK-NEXT: ld1 { v0.s }[2], [x2] -; CHECK-NEXT: ld1 { v0.s }[3], [x3] +; CHECK-NEXT: ldr s1, [x2] +; CHECK-NEXT: ld1 { v1.s }[1], [x3] +; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %A = load <4 x i8>, ptr %ptrA %B = load <4 x i8>, ptr %ptrB diff --git a/llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll b/llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll index c6b8e41f9bdf..4906e2e15e51 100644 --- a/llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll +++ b/llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll @@ -1431,6 +1431,7 @@ define <9 x half> @max_v9f16(<9 x half> %a, <9 x half> %b) { ; FULLFP16-NEXT: add x9, sp, #16 ; FULLFP16-NEXT: // kill: def $h3 killed $h3 def $q3 ; FULLFP16-NEXT: // kill: def $h4 killed $h4 def $q4 +; FULLFP16-NEXT: add x10, sp, #40 ; FULLFP16-NEXT: // kill: def $h5 killed $h5 def $q5 ; FULLFP16-NEXT: // kill: def $h6 killed $h6 def $q6 ; FULLFP16-NEXT: // kill: def $h7 killed $h7 def $q7 @@ -1439,30 +1440,30 @@ define <9 x half> @max_v9f16(<9 x half> %a, <9 x half> %b) { ; FULLFP16-NEXT: ld1 { v1.h }[1], [x9] ; FULLFP16-NEXT: add x9, sp, #24 ; FULLFP16-NEXT: mov v0.h[2], v2.h[0] -; FULLFP16-NEXT: ldr h2, [sp] ; FULLFP16-NEXT: ld1 { v1.h }[2], [x9] ; FULLFP16-NEXT: add x9, sp, #32 -; FULLFP16-NEXT: fminnm v2.8h, v2.8h, v2.8h ; FULLFP16-NEXT: mov v0.h[3], v3.h[0] ; FULLFP16-NEXT: ld1 { v1.h }[3], [x9] -; FULLFP16-NEXT: add x9, sp, #40 -; FULLFP16-NEXT: ldr h3, [sp, #72] -; FULLFP16-NEXT: ld1 { v1.h }[4], [x9] +; FULLFP16-NEXT: ldr h2, [x10] ; 
FULLFP16-NEXT: add x9, sp, #48 +; FULLFP16-NEXT: ldr h3, [sp, #72] +; FULLFP16-NEXT: ld1 { v2.h }[1], [x9] +; FULLFP16-NEXT: add x9, sp, #56 ; FULLFP16-NEXT: fminnm v3.8h, v3.8h, v3.8h ; FULLFP16-NEXT: mov v0.h[4], v4.h[0] -; FULLFP16-NEXT: ld1 { v1.h }[5], [x9] -; FULLFP16-NEXT: add x9, sp, #56 -; FULLFP16-NEXT: fmaxnm v2.8h, v2.8h, v3.8h -; FULLFP16-NEXT: mov v0.h[5], v5.h[0] -; FULLFP16-NEXT: ld1 { v1.h }[6], [x9] +; FULLFP16-NEXT: ld1 { v2.h }[2], [x9] ; FULLFP16-NEXT: add x9, sp, #64 -; FULLFP16-NEXT: str h2, [x8, #16] +; FULLFP16-NEXT: mov v0.h[5], v5.h[0] +; FULLFP16-NEXT: ld1 { v2.h }[3], [x9] +; FULLFP16-NEXT: zip1 v1.2d, v1.2d, v2.2d +; FULLFP16-NEXT: ldr h2, [sp] ; FULLFP16-NEXT: mov v0.h[6], v6.h[0] -; FULLFP16-NEXT: ld1 { v1.h }[7], [x9] +; FULLFP16-NEXT: fminnm v2.8h, v2.8h, v2.8h ; FULLFP16-NEXT: fminnm v1.8h, v1.8h, v1.8h ; FULLFP16-NEXT: mov v0.h[7], v7.h[0] +; FULLFP16-NEXT: fmaxnm v2.8h, v2.8h, v3.8h ; FULLFP16-NEXT: fminnm v0.8h, v0.8h, v0.8h +; FULLFP16-NEXT: str h2, [x8, #16] ; FULLFP16-NEXT: fmaxnm v0.8h, v0.8h, v1.8h ; FULLFP16-NEXT: str q0, [x8] ; FULLFP16-NEXT: ret @@ -2012,6 +2013,7 @@ define <9 x half> @min_v9f16(<9 x half> %a, <9 x half> %b) { ; FULLFP16-NEXT: add x9, sp, #16 ; FULLFP16-NEXT: // kill: def $h3 killed $h3 def $q3 ; FULLFP16-NEXT: // kill: def $h4 killed $h4 def $q4 +; FULLFP16-NEXT: add x10, sp, #40 ; FULLFP16-NEXT: // kill: def $h5 killed $h5 def $q5 ; FULLFP16-NEXT: // kill: def $h6 killed $h6 def $q6 ; FULLFP16-NEXT: // kill: def $h7 killed $h7 def $q7 @@ -2020,30 +2022,30 @@ define <9 x half> @min_v9f16(<9 x half> %a, <9 x half> %b) { ; FULLFP16-NEXT: ld1 { v1.h }[1], [x9] ; FULLFP16-NEXT: add x9, sp, #24 ; FULLFP16-NEXT: mov v0.h[2], v2.h[0] -; FULLFP16-NEXT: ldr h2, [sp] ; FULLFP16-NEXT: ld1 { v1.h }[2], [x9] ; FULLFP16-NEXT: add x9, sp, #32 -; FULLFP16-NEXT: fminnm v2.8h, v2.8h, v2.8h ; FULLFP16-NEXT: mov v0.h[3], v3.h[0] ; FULLFP16-NEXT: ld1 { v1.h }[3], [x9] -; FULLFP16-NEXT: add x9, sp, #40 -; FULLFP16-NEXT: ldr 
h3, [sp, #72] -; FULLFP16-NEXT: ld1 { v1.h }[4], [x9] +; FULLFP16-NEXT: ldr h2, [x10] ; FULLFP16-NEXT: add x9, sp, #48 +; FULLFP16-NEXT: ldr h3, [sp, #72] +; FULLFP16-NEXT: ld1 { v2.h }[1], [x9] +; FULLFP16-NEXT: add x9, sp, #56 ; FULLFP16-NEXT: fminnm v3.8h, v3.8h, v3.8h ; FULLFP16-NEXT: mov v0.h[4], v4.h[0] -; FULLFP16-NEXT: ld1 { v1.h }[5], [x9] -; FULLFP16-NEXT: add x9, sp, #56 -; FULLFP16-NEXT: fminnm v2.8h, v2.8h, v3.8h -; FULLFP16-NEXT: mov v0.h[5], v5.h[0] -; FULLFP16-NEXT: ld1 { v1.h }[6], [x9] +; FULLFP16-NEXT: ld1 { v2.h }[2], [x9] ; FULLFP16-NEXT: add x9, sp, #64 -; FULLFP16-NEXT: str h2, [x8, #16] +; FULLFP16-NEXT: mov v0.h[5], v5.h[0] +; FULLFP16-NEXT: ld1 { v2.h }[3], [x9] +; FULLFP16-NEXT: zip1 v1.2d, v1.2d, v2.2d +; FULLFP16-NEXT: ldr h2, [sp] ; FULLFP16-NEXT: mov v0.h[6], v6.h[0] -; FULLFP16-NEXT: ld1 { v1.h }[7], [x9] +; FULLFP16-NEXT: fminnm v2.8h, v2.8h, v2.8h ; FULLFP16-NEXT: fminnm v1.8h, v1.8h, v1.8h ; FULLFP16-NEXT: mov v0.h[7], v7.h[0] +; FULLFP16-NEXT: fminnm v2.8h, v2.8h, v3.8h ; FULLFP16-NEXT: fminnm v0.8h, v0.8h, v0.8h +; FULLFP16-NEXT: str h2, [x8, #16] ; FULLFP16-NEXT: fminnm v0.8h, v0.8h, v1.8h ; FULLFP16-NEXT: str q0, [x8] ; FULLFP16-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/fsh.ll b/llvm/test/CodeGen/AArch64/fsh.ll index 4c28c9082402..ae2ef2649102 100644 --- a/llvm/test/CodeGen/AArch64/fsh.ll +++ b/llvm/test/CodeGen/AArch64/fsh.ll @@ -2509,87 +2509,88 @@ define <7 x i32> @fshl_v7i32(<7 x i32> %a, <7 x i32> %b, <7 x i32> %c) { ; ; CHECK-GI-LABEL: fshl_v7i32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ldr s3, [sp, #48] -; CHECK-GI-NEXT: ldr s20, [sp, #56] -; CHECK-GI-NEXT: add x9, sp, #56 +; CHECK-GI-NEXT: ldr s17, [sp, #48] +; CHECK-GI-NEXT: add x8, sp, #56 +; CHECK-GI-NEXT: add x9, sp, #64 ; CHECK-GI-NEXT: ldr s4, [sp, #48] -; CHECK-GI-NEXT: ldr s7, [sp, #80] -; CHECK-GI-NEXT: mov w12, #-1 // =0xffffffff -; CHECK-GI-NEXT: ldr s21, [sp, #88] -; CHECK-GI-NEXT: mov v3.s[1], v20.s[0] -; CHECK-GI-NEXT: fmov s20, w12 -; 
CHECK-GI-NEXT: ld1 { v4.s }[1], [x9] -; CHECK-GI-NEXT: ldr s17, [sp] -; CHECK-GI-NEXT: add x13, sp, #64 -; CHECK-GI-NEXT: mov v7.s[1], v21.s[0] +; CHECK-GI-NEXT: ldr s21, [sp, #56] +; CHECK-GI-NEXT: mov w10, #-1 // =0xffffffff +; CHECK-GI-NEXT: ld1 { v17.s }[1], [x8] +; CHECK-GI-NEXT: ldr s20, [x9] +; CHECK-GI-NEXT: add x8, sp, #72 +; CHECK-GI-NEXT: mov v4.s[1], v21.s[0] ; CHECK-GI-NEXT: fmov s21, w7 +; CHECK-GI-NEXT: ldr s6, [sp] +; CHECK-GI-NEXT: ld1 { v20.s }[1], [x8] ; CHECK-GI-NEXT: ldr s19, [sp, #64] -; CHECK-GI-NEXT: mov w11, #31 // =0x1f -; CHECK-GI-NEXT: mov v20.s[1], w12 +; CHECK-GI-NEXT: ldr s7, [sp, #80] +; CHECK-GI-NEXT: ldr s22, [sp, #88] +; CHECK-GI-NEXT: mov w9, #31 // =0x1f +; CHECK-GI-NEXT: mov w11, #1 // =0x1 +; CHECK-GI-NEXT: mov v21.s[1], v6.s[0] +; CHECK-GI-NEXT: fmov s6, w9 ; CHECK-GI-NEXT: ldr s18, [sp, #96] -; CHECK-GI-NEXT: ld1 { v4.s }[2], [x13] -; CHECK-GI-NEXT: mov w13, #1 // =0x1 -; CHECK-GI-NEXT: mov v3.s[2], v19.s[0] -; CHECK-GI-NEXT: mov v21.s[1], v17.s[0] -; CHECK-GI-NEXT: fmov s17, w11 -; CHECK-GI-NEXT: fmov s19, w13 +; CHECK-GI-NEXT: zip1 v17.2d, v17.2d, v20.2d +; CHECK-GI-NEXT: fmov s20, w10 +; CHECK-GI-NEXT: mov v7.s[1], v22.s[0] +; CHECK-GI-NEXT: mov v4.s[2], v19.s[0] +; CHECK-GI-NEXT: fmov s19, w11 ; CHECK-GI-NEXT: fmov s23, w0 -; CHECK-GI-NEXT: fmov s24, w11 -; CHECK-GI-NEXT: ldr s6, [sp, #8] +; CHECK-GI-NEXT: mov v6.s[1], w9 +; CHECK-GI-NEXT: fmov s24, w9 +; CHECK-GI-NEXT: ldr s2, [sp, #8] +; CHECK-GI-NEXT: mov v20.s[1], w10 ; CHECK-GI-NEXT: ldr s0, [sp, #24] ; CHECK-GI-NEXT: ldr s5, [sp, #32] +; CHECK-GI-NEXT: mov v19.s[1], w11 ; CHECK-GI-NEXT: mov v7.s[2], v18.s[0] -; CHECK-GI-NEXT: mov v17.s[1], w11 -; CHECK-GI-NEXT: mov v19.s[1], w13 -; CHECK-GI-NEXT: mov v20.s[2], w12 ; CHECK-GI-NEXT: ldr s16, [sp, #72] ; CHECK-GI-NEXT: mov v23.s[1], w1 ; CHECK-GI-NEXT: ldr s18, [sp, #80] -; CHECK-GI-NEXT: mov v21.s[2], v6.s[0] -; CHECK-GI-NEXT: mov v24.s[1], w11 +; CHECK-GI-NEXT: mov v21.s[2], v2.s[0] +; CHECK-GI-NEXT: mov v24.s[1], 
w9 ; CHECK-GI-NEXT: mov v0.s[1], v5.s[0] -; CHECK-GI-NEXT: fmov s6, w4 -; CHECK-GI-NEXT: add x10, sp, #88 +; CHECK-GI-NEXT: fmov s5, w4 +; CHECK-GI-NEXT: mov v20.s[2], w10 +; CHECK-GI-NEXT: add x8, sp, #88 ; CHECK-GI-NEXT: movi v22.4s, #31 -; CHECK-GI-NEXT: mov v3.s[3], v16.s[0] -; CHECK-GI-NEXT: mov v17.s[2], w11 -; CHECK-GI-NEXT: mov v19.s[2], w13 -; CHECK-GI-NEXT: ldr s2, [sp, #16] -; CHECK-GI-NEXT: ldr s1, [sp, #40] -; CHECK-GI-NEXT: ld1 { v18.s }[1], [x10] -; CHECK-GI-NEXT: eor v5.16b, v7.16b, v20.16b +; CHECK-GI-NEXT: mov v4.s[3], v16.s[0] +; CHECK-GI-NEXT: mov v6.s[2], w9 +; CHECK-GI-NEXT: mov v19.s[2], w11 +; CHECK-GI-NEXT: ldr s1, [sp, #16] +; CHECK-GI-NEXT: ldr s3, [sp, #40] +; CHECK-GI-NEXT: ld1 { v18.s }[1], [x8] ; CHECK-GI-NEXT: mov v23.s[2], w2 -; CHECK-GI-NEXT: mov v6.s[1], w5 -; CHECK-GI-NEXT: add x8, sp, #72 -; CHECK-GI-NEXT: add x9, sp, #96 -; CHECK-GI-NEXT: mov v21.s[3], v2.s[0] -; CHECK-GI-NEXT: mov v24.s[2], w11 -; CHECK-GI-NEXT: mov v0.s[2], v1.s[0] -; CHECK-GI-NEXT: ld1 { v4.s }[3], [x8] -; CHECK-GI-NEXT: bic v2.16b, v22.16b, v3.16b -; CHECK-GI-NEXT: ld1 { v18.s }[2], [x9] -; CHECK-GI-NEXT: and v1.16b, v5.16b, v17.16b +; CHECK-GI-NEXT: mov v5.s[1], w5 +; CHECK-GI-NEXT: add x8, sp, #96 +; CHECK-GI-NEXT: eor v2.16b, v7.16b, v20.16b +; CHECK-GI-NEXT: mov v21.s[3], v1.s[0] +; CHECK-GI-NEXT: mov v24.s[2], w9 +; CHECK-GI-NEXT: mov v0.s[2], v3.s[0] +; CHECK-GI-NEXT: bic v1.16b, v22.16b, v4.16b +; CHECK-GI-NEXT: ld1 { v18.s }[2], [x8] ; CHECK-GI-NEXT: neg v3.4s, v19.4s +; CHECK-GI-NEXT: and v4.16b, v17.16b, v22.16b +; CHECK-GI-NEXT: and v2.16b, v2.16b, v6.16b ; CHECK-GI-NEXT: mov v23.s[3], w3 -; CHECK-GI-NEXT: mov v6.s[2], w6 -; CHECK-GI-NEXT: and v4.16b, v4.16b, v22.16b -; CHECK-GI-NEXT: ushr v5.4s, v21.4s, #1 -; CHECK-GI-NEXT: neg v2.4s, v2.4s -; CHECK-GI-NEXT: and v7.16b, v18.16b, v24.16b +; CHECK-GI-NEXT: mov v5.s[2], w6 +; CHECK-GI-NEXT: ushr v6.4s, v21.4s, #1 ; CHECK-GI-NEXT: neg v1.4s, v1.4s +; CHECK-GI-NEXT: and v7.16b, v18.16b, v24.16b ; 
CHECK-GI-NEXT: ushl v0.4s, v0.4s, v3.4s +; CHECK-GI-NEXT: neg v2.4s, v2.4s ; CHECK-GI-NEXT: ushl v3.4s, v23.4s, v4.4s -; CHECK-GI-NEXT: ushl v2.4s, v5.4s, v2.4s -; CHECK-GI-NEXT: ushl v4.4s, v6.4s, v7.4s -; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v1.4s -; CHECK-GI-NEXT: orr v1.16b, v3.16b, v2.16b +; CHECK-GI-NEXT: ushl v1.4s, v6.4s, v1.4s +; CHECK-GI-NEXT: ushl v4.4s, v5.4s, v7.4s +; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v2.4s +; CHECK-GI-NEXT: orr v1.16b, v3.16b, v1.16b ; CHECK-GI-NEXT: orr v0.16b, v4.16b, v0.16b ; CHECK-GI-NEXT: mov s2, v1.s[1] ; CHECK-GI-NEXT: mov s3, v1.s[2] ; CHECK-GI-NEXT: mov s4, v1.s[3] +; CHECK-GI-NEXT: fmov w0, s1 ; CHECK-GI-NEXT: mov s5, v0.s[1] ; CHECK-GI-NEXT: mov s6, v0.s[2] -; CHECK-GI-NEXT: fmov w0, s1 ; CHECK-GI-NEXT: fmov w4, s0 ; CHECK-GI-NEXT: fmov w1, s2 ; CHECK-GI-NEXT: fmov w2, s3 diff --git a/llvm/test/CodeGen/AArch64/icmp-cst.ll b/llvm/test/CodeGen/AArch64/icmp-cst.ll index b6f452bb42ce..b75e3535bf82 100644 --- a/llvm/test/CodeGen/AArch64/icmp-cst.ll +++ b/llvm/test/CodeGen/AArch64/icmp-cst.ll @@ -1,687 +1,415 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -mtriple=aarch64-linux-gnu -global-isel=0 < %s | FileCheck %s --check-prefix=CHECK-SD -; RUN: llc -mtriple=aarch64-linux-gnu -global-isel=1 < %s | FileCheck %s --check-prefix=CHECK-GI +; RUN: llc -mtriple=aarch64-linux-gnu -global-isel=0 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc -mtriple=aarch64-linux-gnu -global-isel=1 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI define i1 @ule_11111111(i32 noundef %in) { -; CHECK-SD-LABEL: ule_11111111: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: mov w8, #4370 // =0x1112 -; CHECK-SD-NEXT: movk w8, #4369, lsl #16 -; CHECK-SD-NEXT: cmp w0, w8 -; CHECK-SD-NEXT: cset w0, lo -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: ule_11111111: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mov w8, #286331153 // =0x11111111 -; CHECK-GI-NEXT: cmp w0, w8 -; CHECK-GI-NEXT: cset 
w0, ls -; CHECK-GI-NEXT: ret +; CHECK-LABEL: ule_11111111: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #286331153 // =0x11111111 +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: cset w0, ls +; CHECK-NEXT: ret %out = icmp ult i32 %in, 286331154 ret i1 %out } define i1 @ule_22222222(i32 noundef %in) { -; CHECK-SD-LABEL: ule_22222222: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: mov w8, #8739 // =0x2223 -; CHECK-SD-NEXT: movk w8, #8738, lsl #16 -; CHECK-SD-NEXT: cmp w0, w8 -; CHECK-SD-NEXT: cset w0, lo -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: ule_22222222: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mov w8, #572662306 // =0x22222222 -; CHECK-GI-NEXT: cmp w0, w8 -; CHECK-GI-NEXT: cset w0, ls -; CHECK-GI-NEXT: ret +; CHECK-LABEL: ule_22222222: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #572662306 // =0x22222222 +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: cset w0, ls +; CHECK-NEXT: ret %out = icmp ult i32 %in, 572662307 ret i1 %out } define i1 @ule_33333333(i32 noundef %in) { -; CHECK-SD-LABEL: ule_33333333: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: mov w8, #13108 // =0x3334 -; CHECK-SD-NEXT: movk w8, #13107, lsl #16 -; CHECK-SD-NEXT: cmp w0, w8 -; CHECK-SD-NEXT: cset w0, lo -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: ule_33333333: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mov w8, #858993459 // =0x33333333 -; CHECK-GI-NEXT: cmp w0, w8 -; CHECK-GI-NEXT: cset w0, ls -; CHECK-GI-NEXT: ret +; CHECK-LABEL: ule_33333333: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #858993459 // =0x33333333 +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: cset w0, ls +; CHECK-NEXT: ret %out = icmp ult i32 %in, 858993460 ret i1 %out } define i1 @ule_44444444(i32 noundef %in) { -; CHECK-SD-LABEL: ule_44444444: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: mov w8, #17477 // =0x4445 -; CHECK-SD-NEXT: movk w8, #17476, lsl #16 -; CHECK-SD-NEXT: cmp w0, w8 -; CHECK-SD-NEXT: cset w0, lo -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: ule_44444444: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mov w8, #1145324612 // =0x44444444 -; 
CHECK-GI-NEXT: cmp w0, w8 -; CHECK-GI-NEXT: cset w0, ls -; CHECK-GI-NEXT: ret +; CHECK-LABEL: ule_44444444: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #1145324612 // =0x44444444 +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: cset w0, ls +; CHECK-NEXT: ret %out = icmp ult i32 %in, 1145324613 ret i1 %out } define i1 @ule_55555555(i32 noundef %in) { -; CHECK-SD-LABEL: ule_55555555: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: mov w8, #21846 // =0x5556 -; CHECK-SD-NEXT: movk w8, #21845, lsl #16 -; CHECK-SD-NEXT: cmp w0, w8 -; CHECK-SD-NEXT: cset w0, lo -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: ule_55555555: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mov w8, #1431655765 // =0x55555555 -; CHECK-GI-NEXT: cmp w0, w8 -; CHECK-GI-NEXT: cset w0, ls -; CHECK-GI-NEXT: ret +; CHECK-LABEL: ule_55555555: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #1431655765 // =0x55555555 +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: cset w0, ls +; CHECK-NEXT: ret %out = icmp ult i32 %in, 1431655766 ret i1 %out } define i1 @ule_66666666(i32 noundef %in) { -; CHECK-SD-LABEL: ule_66666666: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: mov w8, #26215 // =0x6667 -; CHECK-SD-NEXT: movk w8, #26214, lsl #16 -; CHECK-SD-NEXT: cmp w0, w8 -; CHECK-SD-NEXT: cset w0, lo -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: ule_66666666: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mov w8, #1717986918 // =0x66666666 -; CHECK-GI-NEXT: cmp w0, w8 -; CHECK-GI-NEXT: cset w0, ls -; CHECK-GI-NEXT: ret +; CHECK-LABEL: ule_66666666: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #1717986918 // =0x66666666 +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: cset w0, ls +; CHECK-NEXT: ret %out = icmp ult i32 %in, 1717986919 ret i1 %out } define i1 @ule_77777777(i32 noundef %in) { -; CHECK-SD-LABEL: ule_77777777: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: mov w8, #30584 // =0x7778 -; CHECK-SD-NEXT: movk w8, #30583, lsl #16 -; CHECK-SD-NEXT: cmp w0, w8 -; CHECK-SD-NEXT: cset w0, lo -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: ule_77777777: -; CHECK-GI: // %bb.0: -; 
CHECK-GI-NEXT: mov w8, #2004318071 // =0x77777777 -; CHECK-GI-NEXT: cmp w0, w8 -; CHECK-GI-NEXT: cset w0, ls -; CHECK-GI-NEXT: ret +; CHECK-LABEL: ule_77777777: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #2004318071 // =0x77777777 +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: cset w0, ls +; CHECK-NEXT: ret %out = icmp ult i32 %in, 2004318072 ret i1 %out } define i1 @ule_88888888(i32 noundef %in) { -; CHECK-SD-LABEL: ule_88888888: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: mov w8, #34953 // =0x8889 -; CHECK-SD-NEXT: movk w8, #34952, lsl #16 -; CHECK-SD-NEXT: cmp w0, w8 -; CHECK-SD-NEXT: cset w0, lo -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: ule_88888888: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mov w8, #-2004318072 // =0x88888888 -; CHECK-GI-NEXT: cmp w0, w8 -; CHECK-GI-NEXT: cset w0, ls -; CHECK-GI-NEXT: ret +; CHECK-LABEL: ule_88888888: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #-2004318072 // =0x88888888 +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: cset w0, ls +; CHECK-NEXT: ret %out = icmp ult i32 %in, -2004318071 ret i1 %out } define i1 @ule_99999999(i32 noundef %in) { -; CHECK-SD-LABEL: ule_99999999: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: mov w8, #39322 // =0x999a -; CHECK-SD-NEXT: movk w8, #39321, lsl #16 -; CHECK-SD-NEXT: cmp w0, w8 -; CHECK-SD-NEXT: cset w0, lo -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: ule_99999999: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mov w8, #-1717986919 // =0x99999999 -; CHECK-GI-NEXT: cmp w0, w8 -; CHECK-GI-NEXT: cset w0, ls -; CHECK-GI-NEXT: ret +; CHECK-LABEL: ule_99999999: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #-1717986919 // =0x99999999 +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: cset w0, ls +; CHECK-NEXT: ret %out = icmp ult i32 %in, -1717986918 ret i1 %out } define i1 @uge_11111111(i32 noundef %in) { -; CHECK-SD-LABEL: uge_11111111: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: mov w8, #4368 // =0x1110 -; CHECK-SD-NEXT: movk w8, #4369, lsl #16 -; CHECK-SD-NEXT: cmp w0, w8 -; CHECK-SD-NEXT: cset w0, hi -; CHECK-SD-NEXT: ret -; -; 
CHECK-GI-LABEL: uge_11111111: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mov w8, #286331153 // =0x11111111 -; CHECK-GI-NEXT: cmp w0, w8 -; CHECK-GI-NEXT: cset w0, hs -; CHECK-GI-NEXT: ret +; CHECK-LABEL: uge_11111111: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #286331153 // =0x11111111 +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: cset w0, hs +; CHECK-NEXT: ret %out = icmp ugt i32 %in, 286331152 ret i1 %out } define i1 @uge_22222222(i32 noundef %in) { -; CHECK-SD-LABEL: uge_22222222: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: mov w8, #8737 // =0x2221 -; CHECK-SD-NEXT: movk w8, #8738, lsl #16 -; CHECK-SD-NEXT: cmp w0, w8 -; CHECK-SD-NEXT: cset w0, hi -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: uge_22222222: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mov w8, #572662306 // =0x22222222 -; CHECK-GI-NEXT: cmp w0, w8 -; CHECK-GI-NEXT: cset w0, hs -; CHECK-GI-NEXT: ret +; CHECK-LABEL: uge_22222222: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #572662306 // =0x22222222 +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: cset w0, hs +; CHECK-NEXT: ret %out = icmp ugt i32 %in, 572662305 ret i1 %out } define i1 @uge_33333333(i32 noundef %in) { -; CHECK-SD-LABEL: uge_33333333: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: mov w8, #13106 // =0x3332 -; CHECK-SD-NEXT: movk w8, #13107, lsl #16 -; CHECK-SD-NEXT: cmp w0, w8 -; CHECK-SD-NEXT: cset w0, hi -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: uge_33333333: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mov w8, #858993459 // =0x33333333 -; CHECK-GI-NEXT: cmp w0, w8 -; CHECK-GI-NEXT: cset w0, hs -; CHECK-GI-NEXT: ret +; CHECK-LABEL: uge_33333333: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #858993459 // =0x33333333 +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: cset w0, hs +; CHECK-NEXT: ret %out = icmp ugt i32 %in, 858993458 ret i1 %out } define i1 @uge_44444444(i32 noundef %in) { -; CHECK-SD-LABEL: uge_44444444: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: mov w8, #17475 // =0x4443 -; CHECK-SD-NEXT: movk w8, #17476, lsl #16 -; CHECK-SD-NEXT: cmp w0, w8 -; CHECK-SD-NEXT: cset 
w0, hi -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: uge_44444444: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mov w8, #1145324612 // =0x44444444 -; CHECK-GI-NEXT: cmp w0, w8 -; CHECK-GI-NEXT: cset w0, hs -; CHECK-GI-NEXT: ret +; CHECK-LABEL: uge_44444444: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #1145324612 // =0x44444444 +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: cset w0, hs +; CHECK-NEXT: ret %out = icmp ugt i32 %in, 1145324611 ret i1 %out } define i1 @uge_55555555(i32 noundef %in) { -; CHECK-SD-LABEL: uge_55555555: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: mov w8, #21844 // =0x5554 -; CHECK-SD-NEXT: movk w8, #21845, lsl #16 -; CHECK-SD-NEXT: cmp w0, w8 -; CHECK-SD-NEXT: cset w0, hi -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: uge_55555555: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mov w8, #1431655765 // =0x55555555 -; CHECK-GI-NEXT: cmp w0, w8 -; CHECK-GI-NEXT: cset w0, hs -; CHECK-GI-NEXT: ret +; CHECK-LABEL: uge_55555555: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #1431655765 // =0x55555555 +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: cset w0, hs +; CHECK-NEXT: ret %out = icmp ugt i32 %in, 1431655764 ret i1 %out } define i1 @uge_66666666(i32 noundef %in) { -; CHECK-SD-LABEL: uge_66666666: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: mov w8, #26213 // =0x6665 -; CHECK-SD-NEXT: movk w8, #26214, lsl #16 -; CHECK-SD-NEXT: cmp w0, w8 -; CHECK-SD-NEXT: cset w0, hi -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: uge_66666666: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mov w8, #1717986918 // =0x66666666 -; CHECK-GI-NEXT: cmp w0, w8 -; CHECK-GI-NEXT: cset w0, hs -; CHECK-GI-NEXT: ret +; CHECK-LABEL: uge_66666666: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #1717986918 // =0x66666666 +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: cset w0, hs +; CHECK-NEXT: ret %out = icmp ugt i32 %in, 1717986917 ret i1 %out } define i1 @uge_77777777(i32 noundef %in) { -; CHECK-SD-LABEL: uge_77777777: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: mov w8, #30582 // =0x7776 -; CHECK-SD-NEXT: movk w8, #30583, lsl #16 -; 
CHECK-SD-NEXT: cmp w0, w8 -; CHECK-SD-NEXT: cset w0, hi -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: uge_77777777: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mov w8, #2004318071 // =0x77777777 -; CHECK-GI-NEXT: cmp w0, w8 -; CHECK-GI-NEXT: cset w0, hs -; CHECK-GI-NEXT: ret +; CHECK-LABEL: uge_77777777: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #2004318071 // =0x77777777 +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: cset w0, hs +; CHECK-NEXT: ret %out = icmp ugt i32 %in, 2004318070 ret i1 %out } define i1 @uge_88888888(i32 noundef %in) { -; CHECK-SD-LABEL: uge_88888888: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: mov w8, #34951 // =0x8887 -; CHECK-SD-NEXT: movk w8, #34952, lsl #16 -; CHECK-SD-NEXT: cmp w0, w8 -; CHECK-SD-NEXT: cset w0, hi -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: uge_88888888: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mov w8, #-2004318072 // =0x88888888 -; CHECK-GI-NEXT: cmp w0, w8 -; CHECK-GI-NEXT: cset w0, hs -; CHECK-GI-NEXT: ret +; CHECK-LABEL: uge_88888888: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #-2004318072 // =0x88888888 +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: cset w0, hs +; CHECK-NEXT: ret %out = icmp ugt i32 %in, -2004318073 ret i1 %out } define i1 @uge_99999999(i32 noundef %in) { -; CHECK-SD-LABEL: uge_99999999: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: mov w8, #39320 // =0x9998 -; CHECK-SD-NEXT: movk w8, #39321, lsl #16 -; CHECK-SD-NEXT: cmp w0, w8 -; CHECK-SD-NEXT: cset w0, hi -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: uge_99999999: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mov w8, #-1717986919 // =0x99999999 -; CHECK-GI-NEXT: cmp w0, w8 -; CHECK-GI-NEXT: cset w0, hs -; CHECK-GI-NEXT: ret +; CHECK-LABEL: uge_99999999: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #-1717986919 // =0x99999999 +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: cset w0, hs +; CHECK-NEXT: ret %out = icmp ugt i32 %in, -1717986920 ret i1 %out } define i1 @sle_11111111(i32 noundef %in) { -; CHECK-SD-LABEL: sle_11111111: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: mov w8, #4370 // 
=0x1112 -; CHECK-SD-NEXT: movk w8, #4369, lsl #16 -; CHECK-SD-NEXT: cmp w0, w8 -; CHECK-SD-NEXT: cset w0, lt -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: sle_11111111: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mov w8, #286331153 // =0x11111111 -; CHECK-GI-NEXT: cmp w0, w8 -; CHECK-GI-NEXT: cset w0, le -; CHECK-GI-NEXT: ret +; CHECK-LABEL: sle_11111111: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #286331153 // =0x11111111 +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: cset w0, le +; CHECK-NEXT: ret %out = icmp slt i32 %in, 286331154 ret i1 %out } define i1 @sle_22222222(i32 noundef %in) { -; CHECK-SD-LABEL: sle_22222222: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: mov w8, #8739 // =0x2223 -; CHECK-SD-NEXT: movk w8, #8738, lsl #16 -; CHECK-SD-NEXT: cmp w0, w8 -; CHECK-SD-NEXT: cset w0, lt -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: sle_22222222: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mov w8, #572662306 // =0x22222222 -; CHECK-GI-NEXT: cmp w0, w8 -; CHECK-GI-NEXT: cset w0, le -; CHECK-GI-NEXT: ret +; CHECK-LABEL: sle_22222222: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #572662306 // =0x22222222 +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: cset w0, le +; CHECK-NEXT: ret %out = icmp slt i32 %in, 572662307 ret i1 %out } define i1 @sle_33333333(i32 noundef %in) { -; CHECK-SD-LABEL: sle_33333333: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: mov w8, #13108 // =0x3334 -; CHECK-SD-NEXT: movk w8, #13107, lsl #16 -; CHECK-SD-NEXT: cmp w0, w8 -; CHECK-SD-NEXT: cset w0, lt -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: sle_33333333: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mov w8, #858993459 // =0x33333333 -; CHECK-GI-NEXT: cmp w0, w8 -; CHECK-GI-NEXT: cset w0, le -; CHECK-GI-NEXT: ret +; CHECK-LABEL: sle_33333333: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #858993459 // =0x33333333 +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: cset w0, le +; CHECK-NEXT: ret %out = icmp slt i32 %in, 858993460 ret i1 %out } define i1 @sle_44444444(i32 noundef %in) { -; CHECK-SD-LABEL: sle_44444444: -; CHECK-SD: // %bb.0: 
-; CHECK-SD-NEXT: mov w8, #17477 // =0x4445 -; CHECK-SD-NEXT: movk w8, #17476, lsl #16 -; CHECK-SD-NEXT: cmp w0, w8 -; CHECK-SD-NEXT: cset w0, lt -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: sle_44444444: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mov w8, #1145324612 // =0x44444444 -; CHECK-GI-NEXT: cmp w0, w8 -; CHECK-GI-NEXT: cset w0, le -; CHECK-GI-NEXT: ret +; CHECK-LABEL: sle_44444444: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #1145324612 // =0x44444444 +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: cset w0, le +; CHECK-NEXT: ret %out = icmp slt i32 %in, 1145324613 ret i1 %out } define i1 @sle_55555555(i32 noundef %in) { -; CHECK-SD-LABEL: sle_55555555: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: mov w8, #21846 // =0x5556 -; CHECK-SD-NEXT: movk w8, #21845, lsl #16 -; CHECK-SD-NEXT: cmp w0, w8 -; CHECK-SD-NEXT: cset w0, lt -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: sle_55555555: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mov w8, #1431655765 // =0x55555555 -; CHECK-GI-NEXT: cmp w0, w8 -; CHECK-GI-NEXT: cset w0, le -; CHECK-GI-NEXT: ret +; CHECK-LABEL: sle_55555555: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #1431655765 // =0x55555555 +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: cset w0, le +; CHECK-NEXT: ret %out = icmp slt i32 %in, 1431655766 ret i1 %out } define i1 @sle_66666666(i32 noundef %in) { -; CHECK-SD-LABEL: sle_66666666: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: mov w8, #26215 // =0x6667 -; CHECK-SD-NEXT: movk w8, #26214, lsl #16 -; CHECK-SD-NEXT: cmp w0, w8 -; CHECK-SD-NEXT: cset w0, lt -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: sle_66666666: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mov w8, #1717986918 // =0x66666666 -; CHECK-GI-NEXT: cmp w0, w8 -; CHECK-GI-NEXT: cset w0, le -; CHECK-GI-NEXT: ret +; CHECK-LABEL: sle_66666666: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #1717986918 // =0x66666666 +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: cset w0, le +; CHECK-NEXT: ret %out = icmp slt i32 %in, 1717986919 ret i1 %out } define i1 @sle_77777777(i32 noundef %in) { -; 
CHECK-SD-LABEL: sle_77777777: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: mov w8, #30584 // =0x7778 -; CHECK-SD-NEXT: movk w8, #30583, lsl #16 -; CHECK-SD-NEXT: cmp w0, w8 -; CHECK-SD-NEXT: cset w0, lt -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: sle_77777777: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mov w8, #2004318071 // =0x77777777 -; CHECK-GI-NEXT: cmp w0, w8 -; CHECK-GI-NEXT: cset w0, le -; CHECK-GI-NEXT: ret +; CHECK-LABEL: sle_77777777: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #2004318071 // =0x77777777 +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: cset w0, le +; CHECK-NEXT: ret %out = icmp slt i32 %in, 2004318072 ret i1 %out } define i1 @sle_88888888(i32 noundef %in) { -; CHECK-SD-LABEL: sle_88888888: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: mov w8, #34953 // =0x8889 -; CHECK-SD-NEXT: movk w8, #34952, lsl #16 -; CHECK-SD-NEXT: cmp w0, w8 -; CHECK-SD-NEXT: cset w0, lo -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: sle_88888888: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mov w8, #-2004318072 // =0x88888888 -; CHECK-GI-NEXT: cmp w0, w8 -; CHECK-GI-NEXT: cset w0, ls -; CHECK-GI-NEXT: ret +; CHECK-LABEL: sle_88888888: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #-2004318072 // =0x88888888 +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: cset w0, ls +; CHECK-NEXT: ret %out = icmp ult i32 %in, -2004318071 ret i1 %out } define i1 @sle_99999999(i32 noundef %in) { -; CHECK-SD-LABEL: sle_99999999: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: mov w8, #39322 // =0x999a -; CHECK-SD-NEXT: movk w8, #39321, lsl #16 -; CHECK-SD-NEXT: cmp w0, w8 -; CHECK-SD-NEXT: cset w0, lo -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: sle_99999999: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mov w8, #-1717986919 // =0x99999999 -; CHECK-GI-NEXT: cmp w0, w8 -; CHECK-GI-NEXT: cset w0, ls -; CHECK-GI-NEXT: ret +; CHECK-LABEL: sle_99999999: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #-1717986919 // =0x99999999 +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: cset w0, ls +; CHECK-NEXT: ret %out = icmp ult i32 %in, -1717986918 ret i1 
%out } define i1 @sge_11111111(i32 noundef %in) { -; CHECK-SD-LABEL: sge_11111111: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: mov w8, #4368 // =0x1110 -; CHECK-SD-NEXT: movk w8, #4369, lsl #16 -; CHECK-SD-NEXT: cmp w0, w8 -; CHECK-SD-NEXT: cset w0, gt -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: sge_11111111: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mov w8, #286331153 // =0x11111111 -; CHECK-GI-NEXT: cmp w0, w8 -; CHECK-GI-NEXT: cset w0, ge -; CHECK-GI-NEXT: ret +; CHECK-LABEL: sge_11111111: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #286331153 // =0x11111111 +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: cset w0, ge +; CHECK-NEXT: ret %out = icmp sgt i32 %in, 286331152 ret i1 %out } define i1 @sge_22222222(i32 noundef %in) { -; CHECK-SD-LABEL: sge_22222222: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: mov w8, #8737 // =0x2221 -; CHECK-SD-NEXT: movk w8, #8738, lsl #16 -; CHECK-SD-NEXT: cmp w0, w8 -; CHECK-SD-NEXT: cset w0, gt -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: sge_22222222: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mov w8, #572662306 // =0x22222222 -; CHECK-GI-NEXT: cmp w0, w8 -; CHECK-GI-NEXT: cset w0, ge -; CHECK-GI-NEXT: ret +; CHECK-LABEL: sge_22222222: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #572662306 // =0x22222222 +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: cset w0, ge +; CHECK-NEXT: ret %out = icmp sgt i32 %in, 572662305 ret i1 %out } define i1 @sge_33333333(i32 noundef %in) { -; CHECK-SD-LABEL: sge_33333333: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: mov w8, #13106 // =0x3332 -; CHECK-SD-NEXT: movk w8, #13107, lsl #16 -; CHECK-SD-NEXT: cmp w0, w8 -; CHECK-SD-NEXT: cset w0, gt -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: sge_33333333: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mov w8, #858993459 // =0x33333333 -; CHECK-GI-NEXT: cmp w0, w8 -; CHECK-GI-NEXT: cset w0, ge -; CHECK-GI-NEXT: ret +; CHECK-LABEL: sge_33333333: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #858993459 // =0x33333333 +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: cset w0, ge +; CHECK-NEXT: ret %out = 
icmp sgt i32 %in, 858993458 ret i1 %out } define i1 @sge_44444444(i32 noundef %in) { -; CHECK-SD-LABEL: sge_44444444: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: mov w8, #17475 // =0x4443 -; CHECK-SD-NEXT: movk w8, #17476, lsl #16 -; CHECK-SD-NEXT: cmp w0, w8 -; CHECK-SD-NEXT: cset w0, gt -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: sge_44444444: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mov w8, #1145324612 // =0x44444444 -; CHECK-GI-NEXT: cmp w0, w8 -; CHECK-GI-NEXT: cset w0, ge -; CHECK-GI-NEXT: ret +; CHECK-LABEL: sge_44444444: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #1145324612 // =0x44444444 +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: cset w0, ge +; CHECK-NEXT: ret %out = icmp sgt i32 %in, 1145324611 ret i1 %out } define i1 @sge_55555555(i32 noundef %in) { -; CHECK-SD-LABEL: sge_55555555: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: mov w8, #21844 // =0x5554 -; CHECK-SD-NEXT: movk w8, #21845, lsl #16 -; CHECK-SD-NEXT: cmp w0, w8 -; CHECK-SD-NEXT: cset w0, gt -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: sge_55555555: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mov w8, #1431655765 // =0x55555555 -; CHECK-GI-NEXT: cmp w0, w8 -; CHECK-GI-NEXT: cset w0, ge -; CHECK-GI-NEXT: ret +; CHECK-LABEL: sge_55555555: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #1431655765 // =0x55555555 +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: cset w0, ge +; CHECK-NEXT: ret %out = icmp sgt i32 %in, 1431655764 ret i1 %out } define i1 @sge_66666666(i32 noundef %in) { -; CHECK-SD-LABEL: sge_66666666: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: mov w8, #26213 // =0x6665 -; CHECK-SD-NEXT: movk w8, #26214, lsl #16 -; CHECK-SD-NEXT: cmp w0, w8 -; CHECK-SD-NEXT: cset w0, gt -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: sge_66666666: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mov w8, #1717986918 // =0x66666666 -; CHECK-GI-NEXT: cmp w0, w8 -; CHECK-GI-NEXT: cset w0, ge -; CHECK-GI-NEXT: ret +; CHECK-LABEL: sge_66666666: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #1717986918 // =0x66666666 +; CHECK-NEXT: cmp w0, w8 +; 
CHECK-NEXT: cset w0, ge +; CHECK-NEXT: ret %out = icmp sgt i32 %in, 1717986917 ret i1 %out } define i1 @sge_77777777(i32 noundef %in) { -; CHECK-SD-LABEL: sge_77777777: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: mov w8, #30582 // =0x7776 -; CHECK-SD-NEXT: movk w8, #30583, lsl #16 -; CHECK-SD-NEXT: cmp w0, w8 -; CHECK-SD-NEXT: cset w0, gt -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: sge_77777777: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mov w8, #2004318071 // =0x77777777 -; CHECK-GI-NEXT: cmp w0, w8 -; CHECK-GI-NEXT: cset w0, ge -; CHECK-GI-NEXT: ret +; CHECK-LABEL: sge_77777777: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #2004318071 // =0x77777777 +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: cset w0, ge +; CHECK-NEXT: ret %out = icmp sgt i32 %in, 2004318070 ret i1 %out } define i1 @sge_88888888(i32 noundef %in) { -; CHECK-SD-LABEL: sge_88888888: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: mov w8, #34951 // =0x8887 -; CHECK-SD-NEXT: movk w8, #34952, lsl #16 -; CHECK-SD-NEXT: cmp w0, w8 -; CHECK-SD-NEXT: cset w0, hi -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: sge_88888888: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mov w8, #-2004318072 // =0x88888888 -; CHECK-GI-NEXT: cmp w0, w8 -; CHECK-GI-NEXT: cset w0, hs -; CHECK-GI-NEXT: ret +; CHECK-LABEL: sge_88888888: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, #-2004318072 // =0x88888888 +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: cset w0, hs +; CHECK-NEXT: ret %out = icmp ugt i32 %in, -2004318073 ret i1 %out } define i1 @sge_99999999(i32 noundef %in) { -; CHECK-SD-LABEL: sge_99999999: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: mov w8, #39320 // =0x9998 -; CHECK-SD-NEXT: movk w8, #39321, lsl #16 -; CHECK-SD-NEXT: cmp w0, w8 -; CHECK-SD-NEXT: cset w0, hi -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: sge_99999999: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mov w8, #-1717986919 // =0x99999999 -; CHECK-GI-NEXT: cmp w0, w8 -; CHECK-GI-NEXT: cset w0, hs -; CHECK-GI-NEXT: ret +; CHECK-LABEL: sge_99999999: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, 
#-1717986919 // =0x99999999 +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: cset w0, hs +; CHECK-NEXT: ret %out = icmp ugt i32 %in, -1717986920 ret i1 %out } + +define i1 @ult_20014852997121(i64 noundef %in) { +; CHECK-LABEL: ult_20014852997121: +; CHECK: // %bb.0: +; CHECK-NEXT: mov x8, #305397760 // =0x12340000 +; CHECK-NEXT: movk x8, #4660, lsl #32 +; CHECK-NEXT: cmp x0, x8 +; CHECK-NEXT: cset w0, ls +; CHECK-NEXT: ret + %out = icmp ult i64 %in, 20014852997121 + ret i1 %out +} + +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK-GI: {{.*}} +; CHECK-SD: {{.*}} diff --git a/llvm/test/CodeGen/AArch64/ldrpre-ldr-merge.mir b/llvm/test/CodeGen/AArch64/ldrpre-ldr-merge.mir index a10d7588cb44..8a5e0f6aa843 100644 --- a/llvm/test/CodeGen/AArch64/ldrpre-ldr-merge.mir +++ b/llvm/test/CodeGen/AArch64/ldrpre-ldr-merge.mir @@ -756,7 +756,7 @@ body: | ; CHECK: liveins: $x0, $x1, $x2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: early-clobber renamable $x1, renamable $x0 = LDRSWpre renamable $x1, 40, implicit $w1, implicit $w1_hi :: (load (s32)) - ; CHECK-NEXT: renamable $w2 = LDRWui renamable $x1, 1, implicit-def $x2, implicit $w2_hi :: (load (s32)) + ; CHECK-NEXT: renamable $w2 = LDRWui renamable $x1, 1, implicit-def $x2 :: (load (s32)) ; CHECK-NEXT: STPXi renamable $x0, renamable $x2, renamable $x1, 0 :: (store (s64)) ; CHECK-NEXT: RET undef $lr early-clobber renamable $x1, renamable $x0 = LDRSWpre killed renamable $x1, 40 :: (load (s32)) diff --git a/llvm/test/CodeGen/AArch64/llvm.frexp.ll b/llvm/test/CodeGen/AArch64/llvm.frexp.ll index 2213aa1429db..4e1876db772e 100644 --- a/llvm/test/CodeGen/AArch64/llvm.frexp.ll +++ b/llvm/test/CodeGen/AArch64/llvm.frexp.ll @@ -700,13 +700,14 @@ define { <4 x float>, <4 x i32> } @test_frexp_v4f32_v4i32(<4 x float> %a) nounwi ; CHECK-NEXT: ldr s1, [sp, #44] ; CHECK-NEXT: ldr q2, [sp] // 16-byte Folded Reload ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 -; CHECK-NEXT: ld1 { v1.s }[1], [x19] ; 
CHECK-NEXT: mov v2.s[3], v0.s[0] -; CHECK-NEXT: ld1 { v1.s }[2], [x20] +; CHECK-NEXT: ld1 { v1.s }[1], [x19] +; CHECK-NEXT: ldr s0, [x20] +; CHECK-NEXT: ld1 { v0.s }[1], [x21] ; CHECK-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: mov v0.16b, v2.16b -; CHECK-NEXT: ld1 { v1.s }[3], [x21] ; CHECK-NEXT: ldp x30, x21, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: zip1 v1.2d, v1.2d, v0.2d +; CHECK-NEXT: mov v0.16b, v2.16b ; CHECK-NEXT: add sp, sp, #80 ; CHECK-NEXT: ret ; @@ -872,10 +873,11 @@ define <4 x i32> @test_frexp_v4f32_v4i32_only_use_exp(<4 x float> %a) nounwind { ; CHECK-NEXT: bl frexpf ; CHECK-NEXT: ldr s0, [sp, #28] ; CHECK-NEXT: ld1 { v0.s }[1], [x19] -; CHECK-NEXT: ld1 { v0.s }[2], [x20] +; CHECK-NEXT: ldr s1, [x20] +; CHECK-NEXT: ld1 { v1.s }[1], [x21] ; CHECK-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ld1 { v0.s }[3], [x21] ; CHECK-NEXT: ldp x30, x21, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d ; CHECK-NEXT: add sp, sp, #64 ; CHECK-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll index 048e988b6c49..88b6f6c40bac 100644 --- a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll +++ b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll @@ -8062,195 +8062,200 @@ define i32 @test_sdot_v48i8_double_nomla(<48 x i8> %a, <48 x i8> %b, <48 x i8> % ; CHECK-SD-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 ; CHECK-SD-NEXT: .cfi_offset w29, -16 -; CHECK-SD-NEXT: ldr b5, [sp, #208] +; CHECK-SD-NEXT: ldr b0, [sp, #208] ; CHECK-SD-NEXT: add x8, sp, #216 -; CHECK-SD-NEXT: fmov s0, w0 +; CHECK-SD-NEXT: add x9, sp, #272 +; CHECK-SD-NEXT: ldr b2, [sp, #80] ; CHECK-SD-NEXT: ldr b4, [sp, #976] -; CHECK-SD-NEXT: add x9, sp, #984 -; CHECK-SD-NEXT: add x12, sp, #328 -; CHECK-SD-NEXT: ld1 { v5.b }[1], [x8] -; CHECK-SD-NEXT: add x8, sp, #224 -; CHECK-SD-NEXT: movi v1.16b, #1 -; CHECK-SD-NEXT: mov v0.b[1], w1 -; CHECK-SD-NEXT: ld1 { v4.b }[1], [x9] -; CHECK-SD-NEXT: movi v3.2d, #0000000000000000 -; CHECK-SD-NEXT: add x11, sp, #992 ; CHECK-SD-NEXT: ldr b6, [sp, #720] -; CHECK-SD-NEXT: ldr b7, [sp, #80] -; CHECK-SD-NEXT: ld1 { v5.b }[2], [x8] +; CHECK-SD-NEXT: ld1 { v0.b }[1], [x8] +; CHECK-SD-NEXT: add x8, sp, #224 +; CHECK-SD-NEXT: fmov s16, w0 +; CHECK-SD-NEXT: ldr b17, [sp, #848] +; CHECK-SD-NEXT: add x10, sp, #24 +; CHECK-SD-NEXT: movi v19.2d, #0000000000000000 +; CHECK-SD-NEXT: ld1 { v0.b }[2], [x8] ; CHECK-SD-NEXT: add x8, sp, #232 -; CHECK-SD-NEXT: add x13, sp, #88 -; CHECK-SD-NEXT: ld1 { v4.b }[2], [x11] -; CHECK-SD-NEXT: ld1 { v7.b }[1], [x13] -; CHECK-SD-NEXT: add x13, sp, #856 -; CHECK-SD-NEXT: mov v0.b[2], w2 -; CHECK-SD-NEXT: add x14, sp, #1008 -; CHECK-SD-NEXT: add x15, sp, #872 -; CHECK-SD-NEXT: ld1 { v5.b }[3], [x8] +; CHECK-SD-NEXT: mov v16.b[1], w1 +; CHECK-SD-NEXT: ld1 { v0.b }[3], [x8] ; CHECK-SD-NEXT: add x8, sp, #240 -; CHECK-SD-NEXT: add x16, sp, #888 -; CHECK-SD-NEXT: add x10, sp, #16 -; CHECK-SD-NEXT: add x9, sp, #24 -; CHECK-SD-NEXT: add x11, sp, #40 -; CHECK-SD-NEXT: movi v2.2d, #0000000000000000 -; CHECK-SD-NEXT: ld1 { v5.b }[4], [x8] +; CHECK-SD-NEXT: mov v16.b[2], w2 +; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] ; CHECK-SD-NEXT: add x8, sp, #248 -; CHECK-SD-NEXT: mov v0.b[3], w3 -; CHECK-SD-NEXT: ld1 { v5.b }[5], [x8] +; CHECK-SD-NEXT: mov v16.b[3], w3 +; CHECK-SD-NEXT: ld1 { v0.b }[5], [x8] ; 
CHECK-SD-NEXT: add x8, sp, #256 -; CHECK-SD-NEXT: mov v0.b[4], w4 -; CHECK-SD-NEXT: ld1 { v5.b }[6], [x8] +; CHECK-SD-NEXT: ld1 { v0.b }[6], [x8] ; CHECK-SD-NEXT: add x8, sp, #264 -; CHECK-SD-NEXT: mov v0.b[5], w5 -; CHECK-SD-NEXT: ld1 { v5.b }[7], [x8] -; CHECK-SD-NEXT: add x8, sp, #272 -; CHECK-SD-NEXT: ld1 { v5.b }[8], [x8] +; CHECK-SD-NEXT: mov v16.b[4], w4 +; CHECK-SD-NEXT: ld1 { v0.b }[7], [x8] +; CHECK-SD-NEXT: ldr b1, [x9] ; CHECK-SD-NEXT: add x8, sp, #280 -; CHECK-SD-NEXT: mov v0.b[6], w6 -; CHECK-SD-NEXT: ld1 { v5.b }[9], [x8] +; CHECK-SD-NEXT: add x9, sp, #88 +; CHECK-SD-NEXT: mov v16.b[5], w5 +; CHECK-SD-NEXT: ld1 { v1.b }[1], [x8] ; CHECK-SD-NEXT: add x8, sp, #288 -; CHECK-SD-NEXT: mov v0.b[7], w7 -; CHECK-SD-NEXT: ld1 { v5.b }[10], [x8] +; CHECK-SD-NEXT: ld1 { v1.b }[2], [x8] ; CHECK-SD-NEXT: add x8, sp, #296 -; CHECK-SD-NEXT: ld1 { v0.b }[8], [x10] -; CHECK-SD-NEXT: add x10, sp, #128 -; CHECK-SD-NEXT: ld1 { v5.b }[11], [x8] +; CHECK-SD-NEXT: mov v16.b[6], w6 +; CHECK-SD-NEXT: ld1 { v1.b }[3], [x8] ; CHECK-SD-NEXT: add x8, sp, #304 -; CHECK-SD-NEXT: ld1 { v0.b }[9], [x9] -; CHECK-SD-NEXT: add x9, sp, #136 -; CHECK-SD-NEXT: ld1 { v5.b }[12], [x8] +; CHECK-SD-NEXT: mov v16.b[7], w7 +; CHECK-SD-NEXT: ld1 { v1.b }[4], [x8] ; CHECK-SD-NEXT: add x8, sp, #312 -; CHECK-SD-NEXT: ld1 { v5.b }[13], [x8] +; CHECK-SD-NEXT: ld1 { v1.b }[5], [x8] ; CHECK-SD-NEXT: add x8, sp, #320 -; CHECK-SD-NEXT: ld1 { v5.b }[14], [x8] -; CHECK-SD-NEXT: add x8, sp, #32 -; CHECK-SD-NEXT: ld1 { v0.b }[10], [x8] -; CHECK-SD-NEXT: add x8, sp, #144 -; CHECK-SD-NEXT: ld1 { v5.b }[15], [x12] -; CHECK-SD-NEXT: add x12, sp, #728 -; CHECK-SD-NEXT: ld1 { v6.b }[1], [x12] -; CHECK-SD-NEXT: add x12, sp, #1000 -; CHECK-SD-NEXT: ld1 { v0.b }[11], [x11] -; CHECK-SD-NEXT: ld1 { v4.b }[3], [x12] -; CHECK-SD-NEXT: add x12, sp, #736 -; CHECK-SD-NEXT: add x11, sp, #920 -; CHECK-SD-NEXT: sdot v3.4s, v5.16b, v1.16b -; CHECK-SD-NEXT: ldr b5, [sp, #848] -; CHECK-SD-NEXT: ld1 { v6.b }[2], [x12] -; 
CHECK-SD-NEXT: add x12, sp, #48 -; CHECK-SD-NEXT: ld1 { v5.b }[1], [x13] -; CHECK-SD-NEXT: add x13, sp, #744 -; CHECK-SD-NEXT: ld1 { v4.b }[4], [x14] -; CHECK-SD-NEXT: add x14, sp, #96 -; CHECK-SD-NEXT: ld1 { v0.b }[12], [x12] -; CHECK-SD-NEXT: ld1 { v6.b }[3], [x13] -; CHECK-SD-NEXT: add x13, sp, #864 -; CHECK-SD-NEXT: ld1 { v7.b }[2], [x14] -; CHECK-SD-NEXT: add x14, sp, #1016 -; CHECK-SD-NEXT: ld1 { v5.b }[2], [x13] -; CHECK-SD-NEXT: add x13, sp, #752 -; CHECK-SD-NEXT: ld1 { v4.b }[5], [x14] -; CHECK-SD-NEXT: add x14, sp, #104 -; CHECK-SD-NEXT: ld1 { v6.b }[4], [x13] -; CHECK-SD-NEXT: add x13, sp, #1024 -; CHECK-SD-NEXT: ld1 { v7.b }[3], [x14] -; CHECK-SD-NEXT: ld1 { v5.b }[3], [x15] -; CHECK-SD-NEXT: add x15, sp, #760 -; CHECK-SD-NEXT: add x14, sp, #112 -; CHECK-SD-NEXT: ld1 { v4.b }[6], [x13] -; CHECK-SD-NEXT: add x13, sp, #880 -; CHECK-SD-NEXT: ld1 { v6.b }[5], [x15] -; CHECK-SD-NEXT: add x15, sp, #1032 -; CHECK-SD-NEXT: ld1 { v7.b }[4], [x14] -; CHECK-SD-NEXT: ld1 { v5.b }[4], [x13] -; CHECK-SD-NEXT: add x14, sp, #768 -; CHECK-SD-NEXT: add x13, sp, #120 -; CHECK-SD-NEXT: ld1 { v4.b }[7], [x15] -; CHECK-SD-NEXT: add x15, sp, #1040 -; CHECK-SD-NEXT: ld1 { v6.b }[6], [x14] -; CHECK-SD-NEXT: ld1 { v7.b }[5], [x13] -; CHECK-SD-NEXT: add x13, sp, #776 -; CHECK-SD-NEXT: ld1 { v5.b }[5], [x16] -; CHECK-SD-NEXT: add x14, sp, #1048 -; CHECK-SD-NEXT: ld1 { v4.b }[8], [x15] -; CHECK-SD-NEXT: add x15, sp, #896 -; CHECK-SD-NEXT: ld1 { v6.b }[7], [x13] -; CHECK-SD-NEXT: ld1 { v7.b }[6], [x10] -; CHECK-SD-NEXT: add x10, sp, #784 -; CHECK-SD-NEXT: ld1 { v5.b }[6], [x15] -; CHECK-SD-NEXT: add x13, sp, #1056 -; CHECK-SD-NEXT: ld1 { v4.b }[9], [x14] -; CHECK-SD-NEXT: add x14, sp, #904 -; CHECK-SD-NEXT: ld1 { v6.b }[8], [x10] -; CHECK-SD-NEXT: ld1 { v7.b }[7], [x9] -; CHECK-SD-NEXT: add x9, sp, #792 -; CHECK-SD-NEXT: ld1 { v5.b }[7], [x14] -; CHECK-SD-NEXT: add x10, sp, #1064 -; CHECK-SD-NEXT: ld1 { v4.b }[10], [x13] -; CHECK-SD-NEXT: add x13, sp, #912 -; CHECK-SD-NEXT: ld1 { 
v6.b }[9], [x9] -; CHECK-SD-NEXT: ld1 { v7.b }[8], [x8] -; CHECK-SD-NEXT: add x9, sp, #800 -; CHECK-SD-NEXT: ld1 { v5.b }[8], [x13] +; CHECK-SD-NEXT: ld1 { v1.b }[6], [x8] +; CHECK-SD-NEXT: add x8, sp, #328 +; CHECK-SD-NEXT: ld1 { v1.b }[7], [x8] +; CHECK-SD-NEXT: ld1 { v2.b }[1], [x9] +; CHECK-SD-NEXT: add x8, sp, #96 +; CHECK-SD-NEXT: add x9, sp, #144 +; CHECK-SD-NEXT: ld1 { v2.b }[2], [x8] +; CHECK-SD-NEXT: add x8, sp, #104 +; CHECK-SD-NEXT: zip1 v0.2d, v0.2d, v1.2d +; CHECK-SD-NEXT: movi v1.16b, #1 +; CHECK-SD-NEXT: ld1 { v2.b }[3], [x8] +; CHECK-SD-NEXT: add x8, sp, #112 +; CHECK-SD-NEXT: ld1 { v2.b }[4], [x8] +; CHECK-SD-NEXT: add x8, sp, #120 +; CHECK-SD-NEXT: ld1 { v2.b }[5], [x8] +; CHECK-SD-NEXT: add x8, sp, #128 +; CHECK-SD-NEXT: ld1 { v2.b }[6], [x8] +; CHECK-SD-NEXT: add x8, sp, #136 +; CHECK-SD-NEXT: ld1 { v2.b }[7], [x8] +; CHECK-SD-NEXT: ldr b3, [x9] ; CHECK-SD-NEXT: add x8, sp, #152 -; CHECK-SD-NEXT: ld1 { v4.b }[11], [x10] -; CHECK-SD-NEXT: add x10, sp, #1072 -; CHECK-SD-NEXT: ld1 { v6.b }[10], [x9] -; CHECK-SD-NEXT: ld1 { v7.b }[9], [x8] -; CHECK-SD-NEXT: add x9, sp, #808 -; CHECK-SD-NEXT: ld1 { v5.b }[9], [x11] -; CHECK-SD-NEXT: add x8, sp, #56 -; CHECK-SD-NEXT: ld1 { v4.b }[12], [x10] -; CHECK-SD-NEXT: add x10, sp, #160 -; CHECK-SD-NEXT: ld1 { v0.b }[13], [x8] -; CHECK-SD-NEXT: ld1 { v6.b }[11], [x9] -; CHECK-SD-NEXT: add x9, sp, #928 -; CHECK-SD-NEXT: ld1 { v7.b }[10], [x10] -; CHECK-SD-NEXT: add x10, sp, #1080 -; CHECK-SD-NEXT: ld1 { v5.b }[10], [x9] -; CHECK-SD-NEXT: add x8, sp, #816 -; CHECK-SD-NEXT: ld1 { v4.b }[13], [x10] -; CHECK-SD-NEXT: add x9, sp, #168 -; CHECK-SD-NEXT: add x10, sp, #176 -; CHECK-SD-NEXT: ld1 { v6.b }[12], [x8] -; CHECK-SD-NEXT: add x8, sp, #936 -; CHECK-SD-NEXT: ld1 { v7.b }[11], [x9] -; CHECK-SD-NEXT: add x9, sp, #1088 -; CHECK-SD-NEXT: ld1 { v5.b }[11], [x8] -; CHECK-SD-NEXT: add x8, sp, #64 -; CHECK-SD-NEXT: ld1 { v4.b }[14], [x9] -; CHECK-SD-NEXT: add x9, sp, #824 -; CHECK-SD-NEXT: ld1 { v0.b }[14], [x8] -; 
CHECK-SD-NEXT: ld1 { v6.b }[13], [x9] -; CHECK-SD-NEXT: add x9, sp, #944 -; CHECK-SD-NEXT: ld1 { v7.b }[12], [x10] -; CHECK-SD-NEXT: add x10, sp, #1096 -; CHECK-SD-NEXT: ld1 { v5.b }[12], [x9] -; CHECK-SD-NEXT: add x8, sp, #832 -; CHECK-SD-NEXT: ld1 { v4.b }[15], [x10] -; CHECK-SD-NEXT: add x9, sp, #184 -; CHECK-SD-NEXT: add x10, sp, #72 -; CHECK-SD-NEXT: ld1 { v6.b }[14], [x8] -; CHECK-SD-NEXT: add x8, sp, #952 -; CHECK-SD-NEXT: ld1 { v7.b }[13], [x9] -; CHECK-SD-NEXT: ld1 { v5.b }[13], [x8] -; CHECK-SD-NEXT: add x8, sp, #840 -; CHECK-SD-NEXT: ld1 { v0.b }[15], [x10] -; CHECK-SD-NEXT: sdot v2.4s, v4.16b, v1.16b -; CHECK-SD-NEXT: add x9, sp, #192 -; CHECK-SD-NEXT: ld1 { v6.b }[15], [x8] -; CHECK-SD-NEXT: add x8, sp, #960 -; CHECK-SD-NEXT: ld1 { v7.b }[14], [x9] -; CHECK-SD-NEXT: ld1 { v5.b }[14], [x8] -; CHECK-SD-NEXT: sdot v3.4s, v0.16b, v1.16b +; CHECK-SD-NEXT: add x9, sp, #984 +; CHECK-SD-NEXT: ld1 { v3.b }[1], [x8] +; CHECK-SD-NEXT: add x8, sp, #160 +; CHECK-SD-NEXT: ld1 { v3.b }[2], [x8] +; CHECK-SD-NEXT: add x8, sp, #168 +; CHECK-SD-NEXT: ld1 { v3.b }[3], [x8] +; CHECK-SD-NEXT: add x8, sp, #176 +; CHECK-SD-NEXT: ld1 { v3.b }[4], [x8] +; CHECK-SD-NEXT: add x8, sp, #184 +; CHECK-SD-NEXT: ld1 { v3.b }[5], [x8] +; CHECK-SD-NEXT: add x8, sp, #192 +; CHECK-SD-NEXT: ld1 { v3.b }[6], [x8] ; CHECK-SD-NEXT: add x8, sp, #200 -; CHECK-SD-NEXT: add x9, sp, #968 -; CHECK-SD-NEXT: sdot v2.4s, v6.16b, v1.16b -; CHECK-SD-NEXT: ld1 { v7.b }[15], [x8] -; CHECK-SD-NEXT: ld1 { v5.b }[15], [x9] -; CHECK-SD-NEXT: sdot v3.4s, v7.16b, v1.16b -; CHECK-SD-NEXT: sdot v2.4s, v5.16b, v1.16b -; CHECK-SD-NEXT: add v0.4s, v3.4s, v2.4s +; CHECK-SD-NEXT: ld1 { v3.b }[7], [x8] +; CHECK-SD-NEXT: ld1 { v4.b }[1], [x9] +; CHECK-SD-NEXT: add x8, sp, #992 +; CHECK-SD-NEXT: add x9, sp, #1040 +; CHECK-SD-NEXT: ld1 { v4.b }[2], [x8] +; CHECK-SD-NEXT: add x8, sp, #1000 +; CHECK-SD-NEXT: zip1 v2.2d, v2.2d, v3.2d +; CHECK-SD-NEXT: ld1 { v4.b }[3], [x8] +; CHECK-SD-NEXT: add x8, sp, #1008 +; CHECK-SD-NEXT: 
ld1 { v4.b }[4], [x8] +; CHECK-SD-NEXT: add x8, sp, #1016 +; CHECK-SD-NEXT: ld1 { v4.b }[5], [x8] +; CHECK-SD-NEXT: add x8, sp, #1024 +; CHECK-SD-NEXT: ld1 { v4.b }[6], [x8] +; CHECK-SD-NEXT: add x8, sp, #1032 +; CHECK-SD-NEXT: ld1 { v4.b }[7], [x8] +; CHECK-SD-NEXT: ldr b5, [x9] +; CHECK-SD-NEXT: add x8, sp, #1048 +; CHECK-SD-NEXT: add x9, sp, #728 +; CHECK-SD-NEXT: ld1 { v5.b }[1], [x8] +; CHECK-SD-NEXT: add x8, sp, #1056 +; CHECK-SD-NEXT: ld1 { v5.b }[2], [x8] +; CHECK-SD-NEXT: add x8, sp, #1064 +; CHECK-SD-NEXT: ld1 { v5.b }[3], [x8] +; CHECK-SD-NEXT: add x8, sp, #1072 +; CHECK-SD-NEXT: ld1 { v5.b }[4], [x8] +; CHECK-SD-NEXT: add x8, sp, #1080 +; CHECK-SD-NEXT: ld1 { v5.b }[5], [x8] +; CHECK-SD-NEXT: add x8, sp, #1088 +; CHECK-SD-NEXT: ld1 { v5.b }[6], [x8] +; CHECK-SD-NEXT: add x8, sp, #1096 +; CHECK-SD-NEXT: ld1 { v5.b }[7], [x8] +; CHECK-SD-NEXT: ld1 { v6.b }[1], [x9] +; CHECK-SD-NEXT: add x8, sp, #736 +; CHECK-SD-NEXT: add x9, sp, #784 +; CHECK-SD-NEXT: ld1 { v6.b }[2], [x8] +; CHECK-SD-NEXT: add x8, sp, #744 +; CHECK-SD-NEXT: zip1 v4.2d, v4.2d, v5.2d +; CHECK-SD-NEXT: movi v5.2d, #0000000000000000 +; CHECK-SD-NEXT: ld1 { v6.b }[3], [x8] +; CHECK-SD-NEXT: add x8, sp, #752 +; CHECK-SD-NEXT: sdot v19.4s, v4.16b, v1.16b +; CHECK-SD-NEXT: sdot v5.4s, v0.16b, v1.16b +; CHECK-SD-NEXT: ld1 { v6.b }[4], [x8] +; CHECK-SD-NEXT: add x8, sp, #760 +; CHECK-SD-NEXT: ld1 { v6.b }[5], [x8] +; CHECK-SD-NEXT: add x8, sp, #768 +; CHECK-SD-NEXT: ld1 { v6.b }[6], [x8] +; CHECK-SD-NEXT: add x8, sp, #776 +; CHECK-SD-NEXT: ld1 { v6.b }[7], [x8] +; CHECK-SD-NEXT: ldr b7, [x9] +; CHECK-SD-NEXT: add x8, sp, #792 +; CHECK-SD-NEXT: add x9, sp, #856 +; CHECK-SD-NEXT: ld1 { v7.b }[1], [x8] +; CHECK-SD-NEXT: add x8, sp, #800 +; CHECK-SD-NEXT: ld1 { v7.b }[2], [x8] +; CHECK-SD-NEXT: add x8, sp, #808 +; CHECK-SD-NEXT: ld1 { v7.b }[3], [x8] +; CHECK-SD-NEXT: add x8, sp, #816 +; CHECK-SD-NEXT: ld1 { v7.b }[4], [x8] +; CHECK-SD-NEXT: add x8, sp, #824 +; CHECK-SD-NEXT: ld1 { v7.b }[5], [x8] +; 
CHECK-SD-NEXT: add x8, sp, #832 +; CHECK-SD-NEXT: ld1 { v7.b }[6], [x8] +; CHECK-SD-NEXT: add x8, sp, #840 +; CHECK-SD-NEXT: ld1 { v7.b }[7], [x8] +; CHECK-SD-NEXT: ld1 { v17.b }[1], [x9] +; CHECK-SD-NEXT: add x8, sp, #864 +; CHECK-SD-NEXT: add x9, sp, #16 +; CHECK-SD-NEXT: ld1 { v16.b }[8], [x9] +; CHECK-SD-NEXT: add x9, sp, #912 +; CHECK-SD-NEXT: ld1 { v17.b }[2], [x8] +; CHECK-SD-NEXT: add x8, sp, #872 +; CHECK-SD-NEXT: zip1 v0.2d, v6.2d, v7.2d +; CHECK-SD-NEXT: ld1 { v16.b }[9], [x10] +; CHECK-SD-NEXT: ld1 { v17.b }[3], [x8] +; CHECK-SD-NEXT: add x8, sp, #880 +; CHECK-SD-NEXT: sdot v19.4s, v0.16b, v1.16b +; CHECK-SD-NEXT: ld1 { v17.b }[4], [x8] +; CHECK-SD-NEXT: add x8, sp, #888 +; CHECK-SD-NEXT: ld1 { v17.b }[5], [x8] +; CHECK-SD-NEXT: add x8, sp, #896 +; CHECK-SD-NEXT: ld1 { v17.b }[6], [x8] +; CHECK-SD-NEXT: add x8, sp, #904 +; CHECK-SD-NEXT: ld1 { v17.b }[7], [x8] +; CHECK-SD-NEXT: ldr b18, [x9] +; CHECK-SD-NEXT: add x8, sp, #920 +; CHECK-SD-NEXT: ld1 { v18.b }[1], [x8] +; CHECK-SD-NEXT: add x8, sp, #32 +; CHECK-SD-NEXT: ld1 { v16.b }[10], [x8] +; CHECK-SD-NEXT: add x8, sp, #928 +; CHECK-SD-NEXT: ld1 { v18.b }[2], [x8] +; CHECK-SD-NEXT: add x8, sp, #40 +; CHECK-SD-NEXT: ld1 { v16.b }[11], [x8] +; CHECK-SD-NEXT: add x8, sp, #936 +; CHECK-SD-NEXT: ld1 { v18.b }[3], [x8] +; CHECK-SD-NEXT: add x8, sp, #48 +; CHECK-SD-NEXT: ld1 { v16.b }[12], [x8] +; CHECK-SD-NEXT: add x8, sp, #944 +; CHECK-SD-NEXT: ld1 { v18.b }[4], [x8] +; CHECK-SD-NEXT: add x8, sp, #56 +; CHECK-SD-NEXT: ld1 { v16.b }[13], [x8] +; CHECK-SD-NEXT: add x8, sp, #952 +; CHECK-SD-NEXT: ld1 { v18.b }[5], [x8] +; CHECK-SD-NEXT: add x8, sp, #64 +; CHECK-SD-NEXT: ld1 { v16.b }[14], [x8] +; CHECK-SD-NEXT: add x8, sp, #960 +; CHECK-SD-NEXT: ld1 { v18.b }[6], [x8] +; CHECK-SD-NEXT: add x8, sp, #72 +; CHECK-SD-NEXT: ld1 { v16.b }[15], [x8] +; CHECK-SD-NEXT: add x8, sp, #968 +; CHECK-SD-NEXT: ld1 { v18.b }[7], [x8] +; CHECK-SD-NEXT: sdot v5.4s, v16.16b, v1.16b +; CHECK-SD-NEXT: zip1 v0.2d, v17.2d, v18.2d +; 
CHECK-SD-NEXT: sdot v5.4s, v2.16b, v1.16b +; CHECK-SD-NEXT: sdot v19.4s, v0.16b, v1.16b +; CHECK-SD-NEXT: add v0.4s, v5.4s, v19.4s ; CHECK-SD-NEXT: addv s0, v0.4s ; CHECK-SD-NEXT: fmov w0, s0 ; CHECK-SD-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/nontemporal.ll b/llvm/test/CodeGen/AArch64/nontemporal.ll index f8ba150a0405..f7a87ae340a7 100644 --- a/llvm/test/CodeGen/AArch64/nontemporal.ll +++ b/llvm/test/CodeGen/AArch64/nontemporal.ll @@ -683,41 +683,43 @@ define void @test_stnp_v17f32(<17 x float> %v, ptr %ptr) { ; ; CHECK-BE-LABEL: test_stnp_v17f32: ; CHECK-BE: // %bb.0: // %entry -; CHECK-BE-NEXT: // kill: def $s4 killed $s4 def $q4 -; CHECK-BE-NEXT: // kill: def $s0 killed $s0 def $q0 -; CHECK-BE-NEXT: ldr s16, [sp, #36] -; CHECK-BE-NEXT: // kill: def $s5 killed $s5 def $q5 ; CHECK-BE-NEXT: // kill: def $s1 killed $s1 def $q1 -; CHECK-BE-NEXT: ldr s17, [sp, #4] -; CHECK-BE-NEXT: add x8, sp, #44 -; CHECK-BE-NEXT: mov v4.s[1], v5.s[0] +; CHECK-BE-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-BE-NEXT: // kill: def $s4 killed $s4 def $q4 +; CHECK-BE-NEXT: // kill: def $s5 killed $s5 def $q5 +; CHECK-BE-NEXT: add x8, sp, #12 +; CHECK-BE-NEXT: add x9, sp, #20 +; CHECK-BE-NEXT: ldr s16, [sp, #36] ; CHECK-BE-NEXT: mov v0.s[1], v1.s[0] +; CHECK-BE-NEXT: ldr s1, [sp, #4] +; CHECK-BE-NEXT: mov v4.s[1], v5.s[0] +; CHECK-BE-NEXT: add x10, sp, #52 ; CHECK-BE-NEXT: // kill: def $s6 killed $s6 def $q6 ; CHECK-BE-NEXT: // kill: def $s2 killed $s2 def $q2 ; CHECK-BE-NEXT: // kill: def $s7 killed $s7 def $q7 ; CHECK-BE-NEXT: // kill: def $s3 killed $s3 def $q3 -; CHECK-BE-NEXT: ldr s1, [sp, #68] -; CHECK-BE-NEXT: ld1 { v16.s }[1], [x8] -; CHECK-BE-NEXT: add x8, sp, #12 -; CHECK-BE-NEXT: ld1 { v17.s }[1], [x8] -; CHECK-BE-NEXT: add x8, sp, #52 -; CHECK-BE-NEXT: str s1, [x0, #64] -; CHECK-BE-NEXT: ld1 { v16.s }[2], [x8] -; CHECK-BE-NEXT: add x8, sp, #20 +; CHECK-BE-NEXT: ld1 { v1.s }[1], [x8] +; CHECK-BE-NEXT: ldr s5, [x9] +; 
CHECK-BE-NEXT: add x8, sp, #28 +; CHECK-BE-NEXT: add x9, sp, #44 +; CHECK-BE-NEXT: ld1 { v5.s }[1], [x8] +; CHECK-BE-NEXT: ld1 { v16.s }[1], [x9] +; CHECK-BE-NEXT: ldr s17, [x10] +; CHECK-BE-NEXT: add x8, sp, #60 ; CHECK-BE-NEXT: mov v4.s[2], v6.s[0] ; CHECK-BE-NEXT: mov v0.s[2], v2.s[0] -; CHECK-BE-NEXT: ld1 { v17.s }[2], [x8] -; CHECK-BE-NEXT: add x8, sp, #60 -; CHECK-BE-NEXT: ld1 { v16.s }[3], [x8] -; CHECK-BE-NEXT: add x8, sp, #28 -; CHECK-BE-NEXT: ld1 { v17.s }[3], [x8] -; CHECK-BE-NEXT: mov v4.s[3], v7.s[0] -; CHECK-BE-NEXT: add x8, x0, #48 -; CHECK-BE-NEXT: mov v0.s[3], v3.s[0] -; CHECK-BE-NEXT: st1 { v16.4s }, [x8] +; CHECK-BE-NEXT: ld1 { v17.s }[1], [x8] +; CHECK-BE-NEXT: ldr s2, [sp, #68] ; CHECK-BE-NEXT: add x8, x0, #32 -; CHECK-BE-NEXT: st1 { v17.4s }, [x8] +; CHECK-BE-NEXT: zip1 v1.2d, v1.2d, v5.2d +; CHECK-BE-NEXT: add x9, x0, #48 +; CHECK-BE-NEXT: str s2, [x0, #64] +; CHECK-BE-NEXT: zip1 v5.2d, v16.2d, v17.2d +; CHECK-BE-NEXT: mov v4.s[3], v7.s[0] +; CHECK-BE-NEXT: mov v0.s[3], v3.s[0] +; CHECK-BE-NEXT: st1 { v1.4s }, [x8] ; CHECK-BE-NEXT: add x8, x0, #16 +; CHECK-BE-NEXT: st1 { v5.4s }, [x9] ; CHECK-BE-NEXT: st1 { v4.4s }, [x8] ; CHECK-BE-NEXT: st1 { v0.4s }, [x0] ; CHECK-BE-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/srem-seteq.ll b/llvm/test/CodeGen/AArch64/srem-seteq.ll index 4b8cbc46a610..3b344feebb58 100644 --- a/llvm/test/CodeGen/AArch64/srem-seteq.ll +++ b/llvm/test/CodeGen/AArch64/srem-seteq.ll @@ -166,10 +166,9 @@ define i32 @test_srem_odd_setne(i32 %X) nounwind { ; CHECK-NEXT: movk w8, #52428, lsl #16 ; CHECK-NEXT: movk w9, #6553, lsl #16 ; CHECK-NEXT: madd w8, w0, w8, w9 -; CHECK-NEXT: mov w9, #13106 // =0x3332 -; CHECK-NEXT: movk w9, #13107, lsl #16 +; CHECK-NEXT: mov w9, #858993459 // =0x33333333 ; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: cset w0, hi +; CHECK-NEXT: cset w0, hs ; CHECK-NEXT: ret %srem = srem i32 %X, 5 %cmp = icmp ne i32 %srem, 0 @@ -186,10 +185,9 @@ define i32 @test_srem_negative_odd(i32 %X) nounwind { ; CHECK-NEXT: 
movk w8, #52428, lsl #16 ; CHECK-NEXT: movk w9, #6553, lsl #16 ; CHECK-NEXT: madd w8, w0, w8, w9 -; CHECK-NEXT: mov w9, #13106 // =0x3332 -; CHECK-NEXT: movk w9, #13107, lsl #16 +; CHECK-NEXT: mov w9, #858993459 // =0x33333333 ; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: cset w0, hi +; CHECK-NEXT: cset w0, hs ; CHECK-NEXT: ret %srem = srem i32 %X, -5 %cmp = icmp ne i32 %srem, 0 diff --git a/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll b/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll new file mode 100644 index 000000000000..54fcae4ba28b --- /dev/null +++ b/llvm/test/CodeGen/AArch64/trunc-avg-fold.ll @@ -0,0 +1,53 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-- -O2 -mattr=+neon < %s | FileCheck %s + +define <8 x i8> @avgceil_u_i8_to_i16(<8 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: avgceil_u_i8_to_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: urhadd v0.8b, v0.8b, v1.8b +; CHECK-NEXT: ret + %a16 = zext <8 x i8> %a to <8 x i16> + %b16 = zext <8 x i8> %b to <8 x i16> + %avg16 = call <8 x i16> @llvm.aarch64.neon.urhadd.v8i16(<8 x i16> %a16, <8 x i16> %b16) + %r = trunc <8 x i16> %avg16 to <8 x i8> + ret <8 x i8> %r +} + + +define <8 x i8> @test_avgceil_s(<8 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: test_avgceil_s: +; CHECK: // %bb.0: +; CHECK-NEXT: srhadd v0.8b, v0.8b, v1.8b +; CHECK-NEXT: ret + %a16 = sext <8 x i8> %a to <8 x i16> + %b16 = sext <8 x i8> %b to <8 x i16> + %avg16 = call <8 x i16> @llvm.aarch64.neon.srhadd.v8i16(<8 x i16> %a16, <8 x i16> %b16) + %res = trunc <8 x i16> %avg16 to <8 x i8> + ret <8 x i8> %res +} + +define <8 x i8> @avgfloor_u_i8_to_i16(<8 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: avgfloor_u_i8_to_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: uhadd v0.8b, v0.8b, v1.8b +; CHECK-NEXT: ret + %a16 = zext <8 x i8> %a to <8 x i16> + %b16 = zext <8 x i8> %b to <8 x i16> + %avg16 = call <8 x i16> @llvm.aarch64.neon.uhadd.v8i16(<8 x i16> %a16, <8 x i16> %b16) + %res = trunc <8 x i16> %avg16 
to <8 x i8> + ret <8 x i8> %res +} + +define <8 x i8> @test_avgfloor_s(<8 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: test_avgfloor_s: +; CHECK: // %bb.0: +; CHECK-NEXT: shadd v0.8b, v0.8b, v1.8b +; CHECK-NEXT: ret + %a16 = sext <8 x i8> %a to <8 x i16> + %b16 = sext <8 x i8> %b to <8 x i16> + %avg16 = call <8 x i16> @llvm.aarch64.neon.shadd.v8i16(<8 x i16> %a16, <8 x i16> %b16) + %res = trunc <8 x i16> %avg16 to <8 x i8> + ret <8 x i8> %res +} + + diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-optsize.ll b/llvm/test/CodeGen/AArch64/urem-seteq-optsize.ll index 45726e92463b..bb5aa1fd0684 100644 --- a/llvm/test/CodeGen/AArch64/urem-seteq-optsize.ll +++ b/llvm/test/CodeGen/AArch64/urem-seteq-optsize.ll @@ -22,14 +22,13 @@ define i32 @test_optsize(i32 %X) optsize nounwind readnone { ; CHECK-LABEL: test_optsize: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #52429 // =0xcccd -; CHECK-NEXT: mov w9, #13108 // =0x3334 +; CHECK-NEXT: mov w9, #858993459 // =0x33333333 ; CHECK-NEXT: movk w8, #52428, lsl #16 -; CHECK-NEXT: movk w9, #13107, lsl #16 ; CHECK-NEXT: mul w8, w0, w8 ; CHECK-NEXT: cmp w8, w9 ; CHECK-NEXT: mov w8, #-10 // =0xfffffff6 ; CHECK-NEXT: mov w9, #42 // =0x2a -; CHECK-NEXT: csel w0, w9, w8, lo +; CHECK-NEXT: csel w0, w9, w8, ls ; CHECK-NEXT: ret %rem = urem i32 %X, 5 %cmp = icmp eq i32 %rem, 0 diff --git a/llvm/test/CodeGen/AArch64/urem-seteq.ll b/llvm/test/CodeGen/AArch64/urem-seteq.ll index df87e60c4f8d..5473991e77c3 100644 --- a/llvm/test/CodeGen/AArch64/urem-seteq.ll +++ b/llvm/test/CodeGen/AArch64/urem-seteq.ll @@ -9,12 +9,11 @@ define i32 @test_urem_odd(i32 %X) nounwind { ; CHECK-LABEL: test_urem_odd: ; CHECK: // %bb.0: ; CHECK-NEXT: mov w8, #52429 // =0xcccd -; CHECK-NEXT: mov w9, #13108 // =0x3334 +; CHECK-NEXT: mov w9, #858993459 // =0x33333333 ; CHECK-NEXT: movk w8, #52428, lsl #16 -; CHECK-NEXT: movk w9, #13107, lsl #16 ; CHECK-NEXT: mul w8, w0, w8 ; CHECK-NEXT: cmp w8, w9 -; CHECK-NEXT: cset w0, lo +; CHECK-NEXT: cset w0, ls ; CHECK-NEXT: ret %urem = 
urem i32 %X, 5 %cmp = icmp eq i32 %urem, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-store-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-store-global.mir index 2b84c6bcba7b..acbcb098e836 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-store-global.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-store-global.mir @@ -886,33 +886,34 @@ body: | ; SI-NEXT: {{ $}} ; SI-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; SI-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3 - ; SI-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64) + ; SI-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[COPY1]](s64) ; SI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; SI-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[COPY1]], [[C]](s32) + ; SI-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[COPY2]], [[C]](s32) ; SI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; SI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = nuw inbounds G_PTR_ADD [[COPY]], [[C1]](s64) - ; SI-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[TRUNC]](s32) + ; SI-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY2]](s64) + ; SI-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[TRUNC]](s32) ; SI-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; SI-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY2]], [[C2]](s32) + ; SI-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY3]], [[C2]](s32) ; SI-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; SI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p1) = nuw inbounds G_PTR_ADD [[COPY]], [[C3]](s64) ; SI-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 ; SI-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 - ; SI-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C5]] + ; SI-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C5]] ; SI-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[AND]], [[C4]](s32) ; SI-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 ; SI-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p1) = nuw inbounds G_PTR_ADD [[COPY]], [[C6]](s64) - ; SI-NEXT: G_STORE [[COPY2]](s32), [[COPY]](p1) :: (store (s8), addrspace 1) + ; 
SI-NEXT: G_STORE [[COPY3]](s32), [[COPY]](p1) :: (store (s8), addrspace 1) ; SI-NEXT: G_STORE [[LSHR2]](s32), [[PTR_ADD2]](p1) :: (store (s8) into unknown-address + 1, addrspace 1) - ; SI-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C4]](s32) - ; SI-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[LSHR1]], [[COPY3]](s32) + ; SI-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C4]](s32) + ; SI-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[LSHR1]], [[COPY4]](s32) ; SI-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p1) = nuw inbounds G_PTR_ADD [[PTR_ADD1]], [[C6]](s64) ; SI-NEXT: G_STORE [[LSHR1]](s32), [[PTR_ADD1]](p1) :: (store (s8) into unknown-address + 2, addrspace 1) ; SI-NEXT: G_STORE [[LSHR3]](s32), [[PTR_ADD3]](p1) :: (store (s8) into unknown-address + 3, addrspace 1) ; SI-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[LSHR]](s64) - ; SI-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY [[C4]](s32) + ; SI-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY [[C4]](s32) ; SI-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[TRUNC1]], [[C5]] - ; SI-NEXT: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[AND1]], [[COPY4]](s32) + ; SI-NEXT: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[AND1]], [[COPY5]](s32) ; SI-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p1) = nuw inbounds G_PTR_ADD [[PTR_ADD]], [[C6]](s64) ; SI-NEXT: G_STORE [[TRUNC1]](s32), [[PTR_ADD]](p1) :: (store (s8) into unknown-address + 4, addrspace 1) ; SI-NEXT: G_STORE [[LSHR4]](s32), [[PTR_ADD4]](p1) :: (store (s8) into unknown-address + 5, addrspace 1) @@ -922,11 +923,12 @@ body: | ; CI-NEXT: {{ $}} ; CI-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; CI-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3 - ; CI-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64) + ; CI-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[COPY1]](s64) ; CI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; CI-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[COPY1]], [[C]](s32) + ; CI-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[COPY2]], [[C]](s32) ; CI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; CI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = nuw inbounds 
G_PTR_ADD [[COPY]], [[C1]](s64) + ; CI-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY2]](s64) ; CI-NEXT: G_STORE [[TRUNC]](s32), [[COPY]](p1) :: (store (s32), align 1, addrspace 1) ; CI-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[LSHR]](s64) ; CI-NEXT: G_STORE [[TRUNC1]](s32), [[PTR_ADD]](p1) :: (store (s16) into unknown-address + 4, align 1, addrspace 1) @@ -936,22 +938,23 @@ body: | ; VI-NEXT: {{ $}} ; VI-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; VI-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3 - ; VI-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64) + ; VI-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[COPY1]](s64) ; VI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; VI-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[COPY1]], [[C]](s32) + ; VI-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[COPY2]], [[C]](s32) ; VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = nuw inbounds G_PTR_ADD [[COPY]], [[C1]](s64) - ; VI-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[TRUNC]](s32) + ; VI-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY2]](s64) + ; VI-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[TRUNC]](s32) ; VI-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; VI-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY2]], [[C2]](s32) + ; VI-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY3]], [[C2]](s32) ; VI-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p1) = nuw inbounds G_PTR_ADD [[COPY]], [[C3]](s64) - ; VI-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s64) + ; VI-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY2]](s64) ; VI-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 ; VI-NEXT: [[LSHR2:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC1]], [[C4]](s16) ; VI-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 ; VI-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p1) = nuw inbounds G_PTR_ADD [[COPY]], [[C5]](s64) - ; VI-NEXT: G_STORE [[COPY2]](s32), [[COPY]](p1) :: (store (s8), addrspace 1) + ; VI-NEXT: G_STORE [[COPY3]](s32), [[COPY]](p1) :: 
(store (s8), addrspace 1) ; VI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR2]](s16) ; VI-NEXT: G_STORE [[ANYEXT]](s32), [[PTR_ADD2]](p1) :: (store (s8) into unknown-address + 1, addrspace 1) ; VI-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) @@ -960,11 +963,11 @@ body: | ; VI-NEXT: G_STORE [[LSHR1]](s32), [[PTR_ADD1]](p1) :: (store (s8) into unknown-address + 2, addrspace 1) ; VI-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR3]](s16) ; VI-NEXT: G_STORE [[ANYEXT1]](s32), [[PTR_ADD3]](p1) :: (store (s8) into unknown-address + 3, addrspace 1) - ; VI-NEXT: [[TRUNC3:%[0-9]+]]:_(s32) = G_TRUNC [[LSHR]](s64) - ; VI-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s64) - ; VI-NEXT: [[LSHR4:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC4]], [[C4]](s16) + ; VI-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s64) + ; VI-NEXT: [[TRUNC4:%[0-9]+]]:_(s32) = G_TRUNC [[LSHR]](s64) + ; VI-NEXT: [[LSHR4:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC3]], [[C4]](s16) ; VI-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p1) = nuw inbounds G_PTR_ADD [[PTR_ADD]], [[C5]](s64) - ; VI-NEXT: G_STORE [[TRUNC3]](s32), [[PTR_ADD]](p1) :: (store (s8) into unknown-address + 4, addrspace 1) + ; VI-NEXT: G_STORE [[TRUNC4]](s32), [[PTR_ADD]](p1) :: (store (s8) into unknown-address + 4, addrspace 1) ; VI-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR4]](s16) ; VI-NEXT: G_STORE [[ANYEXT2]](s32), [[PTR_ADD4]](p1) :: (store (s8) into unknown-address + 5, addrspace 1) ; @@ -973,11 +976,12 @@ body: | ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3 - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64) + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[COPY1]](s64) ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[COPY1]], [[C]](s32) + ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[COPY2]], [[C]](s32) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9-NEXT: 
[[PTR_ADD:%[0-9]+]]:_(p1) = nuw inbounds G_PTR_ADD [[COPY]], [[C1]](s64) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY2]](s64) ; GFX9-NEXT: G_STORE [[TRUNC]](s32), [[COPY]](p1) :: (store (s32), align 1, addrspace 1) ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[LSHR]](s64) ; GFX9-NEXT: G_STORE [[TRUNC1]](s32), [[PTR_ADD]](p1) :: (store (s16) into unknown-address + 4, align 1, addrspace 1) @@ -998,17 +1002,18 @@ body: | ; SI-NEXT: {{ $}} ; SI-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; SI-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3 - ; SI-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64) + ; SI-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[COPY1]](s64) ; SI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; SI-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[COPY1]], [[C]](s32) + ; SI-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[COPY2]], [[C]](s32) ; SI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; SI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = nuw inbounds G_PTR_ADD [[COPY]], [[C1]](s64) - ; SI-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[TRUNC]](s32) + ; SI-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY2]](s64) + ; SI-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[TRUNC]](s32) ; SI-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; SI-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY2]], [[C2]](s32) + ; SI-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY3]], [[C2]](s32) ; SI-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; SI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p1) = nuw inbounds G_PTR_ADD [[COPY]], [[C3]](s64) - ; SI-NEXT: G_STORE [[COPY2]](s32), [[COPY]](p1) :: (store (s16), addrspace 1) + ; SI-NEXT: G_STORE [[COPY3]](s32), [[COPY]](p1) :: (store (s16), addrspace 1) ; SI-NEXT: G_STORE [[LSHR1]](s32), [[PTR_ADD1]](p1) :: (store (s16) into unknown-address + 2, addrspace 1) ; SI-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[LSHR]](s64) ; SI-NEXT: G_STORE [[TRUNC1]](s32), [[PTR_ADD]](p1) :: (store (s16) into unknown-address + 4, addrspace 1) @@ -1018,11 +1023,12 @@ body: | ; 
CI-NEXT: {{ $}} ; CI-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; CI-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3 - ; CI-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64) + ; CI-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[COPY1]](s64) ; CI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; CI-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[COPY1]], [[C]](s32) + ; CI-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[COPY2]], [[C]](s32) ; CI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; CI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = nuw inbounds G_PTR_ADD [[COPY]], [[C1]](s64) + ; CI-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY2]](s64) ; CI-NEXT: G_STORE [[TRUNC]](s32), [[COPY]](p1) :: (store (s32), align 2, addrspace 1) ; CI-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[LSHR]](s64) ; CI-NEXT: G_STORE [[TRUNC1]](s32), [[PTR_ADD]](p1) :: (store (s16) into unknown-address + 4, addrspace 1) @@ -1032,17 +1038,18 @@ body: | ; VI-NEXT: {{ $}} ; VI-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; VI-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3 - ; VI-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64) + ; VI-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[COPY1]](s64) ; VI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; VI-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[COPY1]], [[C]](s32) + ; VI-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[COPY2]], [[C]](s32) ; VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = nuw inbounds G_PTR_ADD [[COPY]], [[C1]](s64) - ; VI-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[TRUNC]](s32) + ; VI-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY2]](s64) + ; VI-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[TRUNC]](s32) ; VI-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; VI-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY2]], [[C2]](s32) + ; VI-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[COPY3]], [[C2]](s32) ; VI-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p1) = nuw inbounds G_PTR_ADD [[COPY]], 
[[C3]](s64) - ; VI-NEXT: G_STORE [[COPY2]](s32), [[COPY]](p1) :: (store (s16), addrspace 1) + ; VI-NEXT: G_STORE [[COPY3]](s32), [[COPY]](p1) :: (store (s16), addrspace 1) ; VI-NEXT: G_STORE [[LSHR1]](s32), [[PTR_ADD1]](p1) :: (store (s16) into unknown-address + 2, addrspace 1) ; VI-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[LSHR]](s64) ; VI-NEXT: G_STORE [[TRUNC1]](s32), [[PTR_ADD]](p1) :: (store (s16) into unknown-address + 4, addrspace 1) @@ -1052,11 +1059,12 @@ body: | ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3 - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64) + ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[COPY1]](s64) ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[COPY1]], [[C]](s32) + ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[COPY2]], [[C]](s32) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = nuw inbounds G_PTR_ADD [[COPY]], [[C1]](s64) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY2]](s64) ; GFX9-NEXT: G_STORE [[TRUNC]](s32), [[COPY]](p1) :: (store (s32), align 2, addrspace 1) ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[LSHR]](s64) ; GFX9-NEXT: G_STORE [[TRUNC1]](s32), [[PTR_ADD]](p1) :: (store (s16) into unknown-address + 4, addrspace 1) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-store.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-store.mir index a931c6366c40..7fd23197a5dd 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-store.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-store.mir @@ -285,13 +285,13 @@ body: | ; VI-NEXT: {{ $}} ; VI-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 ; VI-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3 - ; VI-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64) - ; VI-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s64) + ; VI-NEXT: 
[[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s64) + ; VI-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64) ; VI-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 - ; VI-NEXT: [[LSHR:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC1]], [[C]](s16) + ; VI-NEXT: [[LSHR:%[0-9]+]]:_(s16) = G_LSHR [[TRUNC]], [[C]](s16) ; VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 ; VI-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = nuw inbounds G_PTR_ADD [[COPY]], [[C1]](s64) - ; VI-NEXT: G_STORE [[TRUNC]](s32), [[COPY]](p1) :: (store (s8), addrspace 1) + ; VI-NEXT: G_STORE [[TRUNC1]](s32), [[COPY]](p1) :: (store (s8), addrspace 1) ; VI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LSHR]](s16) ; VI-NEXT: G_STORE [[ANYEXT]](s32), [[PTR_ADD]](p1) :: (store (s8) into unknown-address + 1, addrspace 1) %0:_(p1) = COPY $vgpr0_vgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll index 01854c8560ce..637aaf752936 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -164,7 +164,7 @@ define zeroext i16 @v_mul_i16_zeroext(i16 zeroext %num, i16 zeroext %den) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: v_mul_i16_zeroext: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/store-weird-size.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-weird-size.ll new file mode 100644 index 000000000000..0aa08cc2b1d6 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/store-weird-size.ll @@ -0,0 +1,224 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -O0 -global-isel=true -stop-after=legalizer -o - %s | FileCheck -check-prefix=UNPACKED %s + +define void @store_i48(ptr addrspace(1) 
%ptr, i48 %arg) #0 { + ; UNPACKED-LABEL: name: store_i48 + ; UNPACKED: bb.1 (%ir-block.0): + ; UNPACKED-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; UNPACKED-NEXT: {{ $}} + ; UNPACKED-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; UNPACKED-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; UNPACKED-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; UNPACKED-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:_(s64) = COPY [[MV1]](s64) + ; UNPACKED-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; UNPACKED-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[COPY4]], [[C]](s32) + ; UNPACKED-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; UNPACKED-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = nuw inbounds G_PTR_ADD [[MV]], [[C1]](s64) + ; UNPACKED-NEXT: G_STORE [[COPY2]](s32), [[MV]](p1) :: (store (s32) into %ir.ptr, addrspace 1) + ; UNPACKED-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[LSHR]](s64) + ; UNPACKED-NEXT: G_STORE [[TRUNC]](s32), [[PTR_ADD]](p1) :: (store (s16) into %ir.ptr + 4, align 4, addrspace 1) + ; UNPACKED-NEXT: SI_RETURN + store i48 %arg, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @store_i55(ptr addrspace(1) %ptr, i55 %arg) #0 { + ; UNPACKED-LABEL: name: store_i55 + ; UNPACKED: bb.1 (%ir-block.0): + ; UNPACKED-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; UNPACKED-NEXT: {{ $}} + ; UNPACKED-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; UNPACKED-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; UNPACKED-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; UNPACKED-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; UNPACKED-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 
36028797018963967 + ; UNPACKED-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[MV1]], [[C]] + ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:_(s64) = COPY [[AND]](s64) + ; UNPACKED-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; UNPACKED-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[COPY4]], [[C1]](s32) + ; UNPACKED-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; UNPACKED-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = nuw inbounds G_PTR_ADD [[MV]], [[C2]](s64) + ; UNPACKED-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY4]](s64) + ; UNPACKED-NEXT: G_STORE [[TRUNC]](s32), [[MV]](p1) :: (store (s32) into %ir.ptr, addrspace 1) + ; UNPACKED-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[LSHR]](s64) + ; UNPACKED-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNPACKED-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[TRUNC1]], [[C3]](s32) + ; UNPACKED-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 + ; UNPACKED-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p1) = nuw inbounds G_PTR_ADD [[PTR_ADD]], [[C4]](s64) + ; UNPACKED-NEXT: G_STORE [[TRUNC1]](s32), [[PTR_ADD]](p1) :: (store (s16) into %ir.ptr + 4, align 4, addrspace 1) + ; UNPACKED-NEXT: G_STORE [[LSHR1]](s32), [[PTR_ADD1]](p1) :: (store (s8) into %ir.ptr + 6, align 2, basealign 4, addrspace 1) + ; UNPACKED-NEXT: SI_RETURN + store i55 %arg, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @store_i56(ptr addrspace(1) %ptr, i56 %arg) #0 { + ; UNPACKED-LABEL: name: store_i56 + ; UNPACKED: bb.1 (%ir-block.0): + ; UNPACKED-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; UNPACKED-NEXT: {{ $}} + ; UNPACKED-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; UNPACKED-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; UNPACKED-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; UNPACKED-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:_(s64) = COPY [[MV1]](s64) + ; 
UNPACKED-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; UNPACKED-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[COPY4]], [[C]](s32) + ; UNPACKED-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; UNPACKED-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = nuw inbounds G_PTR_ADD [[MV]], [[C1]](s64) + ; UNPACKED-NEXT: G_STORE [[COPY2]](s32), [[MV]](p1) :: (store (s32) into %ir.ptr, addrspace 1) + ; UNPACKED-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[LSHR]](s64) + ; UNPACKED-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNPACKED-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[TRUNC]], [[C2]](s32) + ; UNPACKED-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 + ; UNPACKED-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p1) = nuw inbounds G_PTR_ADD [[PTR_ADD]], [[C3]](s64) + ; UNPACKED-NEXT: G_STORE [[TRUNC]](s32), [[PTR_ADD]](p1) :: (store (s16) into %ir.ptr + 4, align 4, addrspace 1) + ; UNPACKED-NEXT: G_STORE [[LSHR1]](s32), [[PTR_ADD1]](p1) :: (store (s8) into %ir.ptr + 6, align 2, basealign 4, addrspace 1) + ; UNPACKED-NEXT: SI_RETURN + store i56 %arg, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @store_i65(ptr addrspace(1) %ptr, i65 %arg) #0 { + ; UNPACKED-LABEL: name: store_i65 + ; UNPACKED: bb.1 (%ir-block.0): + ; UNPACKED-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; UNPACKED-NEXT: {{ $}} + ; UNPACKED-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; UNPACKED-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; UNPACKED-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; UNPACKED-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; UNPACKED-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; UNPACKED-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[DEF]](s32) + ; UNPACKED-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1 + ; UNPACKED-NEXT: 
[[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; UNPACKED-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[MV1]], [[C]] + ; UNPACKED-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[MV2]], [[C1]] + ; UNPACKED-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; UNPACKED-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = nuw inbounds G_PTR_ADD [[MV]], [[C2]](s64) + ; UNPACKED-NEXT: G_STORE [[AND]](s64), [[MV]](p1) :: (store (s64) into %ir.ptr, align 4, addrspace 1) + ; UNPACKED-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[AND1]](s64) + ; UNPACKED-NEXT: G_STORE [[TRUNC]](s32), [[PTR_ADD]](p1) :: (store (s8) into %ir.ptr + 8, align 4, addrspace 1) + ; UNPACKED-NEXT: SI_RETURN + store i65 %arg, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @store_i95(ptr addrspace(1) %ptr, i95 %arg) #0 { + ; UNPACKED-LABEL: name: store_i95 + ; UNPACKED: bb.1 (%ir-block.0): + ; UNPACKED-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; UNPACKED-NEXT: {{ $}} + ; UNPACKED-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; UNPACKED-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; UNPACKED-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; UNPACKED-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; UNPACKED-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; UNPACKED-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[DEF]](s32) + ; UNPACKED-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1 + ; UNPACKED-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 2147483647 + ; UNPACKED-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[MV1]], [[C]] + ; UNPACKED-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[MV2]], [[C1]] + ; UNPACKED-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; UNPACKED-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = nuw inbounds G_PTR_ADD [[MV]], [[C2]](s64) + ; UNPACKED-NEXT: G_STORE [[AND]](s64), 
[[MV]](p1) :: (store (s64) into %ir.ptr, align 4, addrspace 1) + ; UNPACKED-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[AND1]](s64) + ; UNPACKED-NEXT: G_STORE [[TRUNC]](s32), [[PTR_ADD]](p1) :: (store (s32) into %ir.ptr + 8, addrspace 1) + ; UNPACKED-NEXT: SI_RETURN + store i95 %arg, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @store_i96(ptr addrspace(1) %ptr, i96 %arg) #0 { + ; UNPACKED-LABEL: name: store_i96 + ; UNPACKED: bb.1 (%ir-block.0): + ; UNPACKED-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; UNPACKED-NEXT: {{ $}} + ; UNPACKED-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; UNPACKED-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; UNPACKED-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; UNPACKED-NEXT: [[MV1:%[0-9]+]]:_(s96) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32) + ; UNPACKED-NEXT: [[BITCAST:%[0-9]+]]:_(<3 x s32>) = G_BITCAST [[MV1]](s96) + ; UNPACKED-NEXT: G_STORE [[BITCAST]](<3 x s32>), [[MV]](p1) :: (store (<3 x s32>) into %ir.ptr, align 4, addrspace 1) + ; UNPACKED-NEXT: SI_RETURN + store i96 %arg, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @store_i97(ptr addrspace(1) %ptr, i97 %arg) #0 { + ; UNPACKED-LABEL: name: store_i97 + ; UNPACKED: bb.1 (%ir-block.0): + ; UNPACKED-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; UNPACKED-NEXT: {{ $}} + ; UNPACKED-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; UNPACKED-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; UNPACKED-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; UNPACKED-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; 
UNPACKED-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1 + ; UNPACKED-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8589934591 + ; UNPACKED-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; UNPACKED-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) + ; UNPACKED-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[MV1]], [[C]] + ; UNPACKED-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[MV2]], [[C1]] + ; UNPACKED-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; UNPACKED-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = nuw inbounds G_PTR_ADD [[MV]], [[C2]](s64) + ; UNPACKED-NEXT: G_STORE [[AND]](s64), [[MV]](p1) :: (store (s64) into %ir.ptr, align 4, addrspace 1) + ; UNPACKED-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; UNPACKED-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[AND1]], [[C3]](s32) + ; UNPACKED-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; UNPACKED-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p1) = nuw inbounds G_PTR_ADD [[PTR_ADD]], [[C4]](s64) + ; UNPACKED-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[AND1]](s64) + ; UNPACKED-NEXT: G_STORE [[TRUNC]](s32), [[PTR_ADD]](p1) :: (store (s32) into %ir.ptr + 8, addrspace 1) + ; UNPACKED-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[LSHR]](s64) + ; UNPACKED-NEXT: G_STORE [[TRUNC1]](s32), [[PTR_ADD1]](p1) :: (store (s8) into %ir.ptr + 12, align 4, addrspace 1) + ; UNPACKED-NEXT: SI_RETURN + store i97 %arg, ptr addrspace(1) %ptr, align 4 + ret void +} + +define void @store_i127(ptr addrspace(1) %ptr, i127 %arg) #0 { + ; UNPACKED-LABEL: name: store_i127 + ; UNPACKED: bb.1 (%ir-block.0): + ; UNPACKED-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 + ; UNPACKED-NEXT: {{ $}} + ; UNPACKED-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; UNPACKED-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; UNPACKED-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; UNPACKED-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; UNPACKED-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; 
UNPACKED-NEXT: [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; UNPACKED-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 + ; UNPACKED-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -1 + ; UNPACKED-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 9223372036854775807 + ; UNPACKED-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; UNPACKED-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) + ; UNPACKED-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[MV1]], [[C]] + ; UNPACKED-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[MV2]], [[C1]] + ; UNPACKED-NEXT: [[MV3:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[AND]](s64), [[AND1]](s64) + ; UNPACKED-NEXT: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[MV3]](s128) + ; UNPACKED-NEXT: G_STORE [[BITCAST]](<4 x s32>), [[MV]](p1) :: (store (<4 x s32>) into %ir.ptr, align 4, addrspace 1) + ; UNPACKED-NEXT: SI_RETURN + store i127 %arg, ptr addrspace(1) %ptr, align 4 + ret void +} + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll index 0d5f538215f1..d03d6a8940b2 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll @@ -6309,64 +6309,64 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) @@ -6394,50 +6394,50 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[3:4] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 24, v32 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 8, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v31 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 24, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v29 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v28 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v27 ; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v85, 24, v26 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v25 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v24 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v23 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v22 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v22 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v21 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v20 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 24, v18 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v17 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v15 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v10 ; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v9 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] @@ -6498,50 +6498,50 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 24, v32 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 8, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v31 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 24, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v29 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v28 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v27 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v26 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v26 -; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v96, 8, v25 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v24 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v23 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v22 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v22 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v21 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v20 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 24, v18 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v17 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v15 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v9 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v8 ; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v1 ; GFX11-TRUE16-NEXT: .LBB12_4: ; %end @@ -6549,307 +6549,266 @@ define <128 x i8> @bitcast_v32i32_to_v128i8(<32 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v39.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v162.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, 0 -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v66.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v34.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v66.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v1.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v162.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v39.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v161.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 
0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v39, v1 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v2.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v160.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v65.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v161.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v1.h, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v55 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v160.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v39, v2 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v3.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v39.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v66 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v55, v39 -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v2.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v65.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v150.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v150.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v66, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v55 -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v3.l, v3.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v149.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v55, v39 
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v65 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v64.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v4.l, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v148.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v55, v39 +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v39, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v4.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v149.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v64.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v4 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v5.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v148.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v147.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v5.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v147.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.h, v6.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v65 +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v39, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v6.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v146.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v54.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v146.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v39, v6 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v7.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v39.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v145.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v55, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v64 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v6.l, v6.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v54.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v33.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v64 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v55, v39 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v144.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v7.l, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v135.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v54, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v55 -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v8.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v144.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v39, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v8.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v135.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v53.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v39, v8 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v9.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v39.h ; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v134.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v8.h, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v133.l ; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v39, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v10.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v132.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v39, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v11.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v131.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v130.l ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v54, v39 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v133.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v10.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v55 -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v132.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v39, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v12.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v129.l +; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v34.h, 8, v51.l ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v53, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v54 -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v10.l, v10.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v131.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v53, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v54 -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v11.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v130.l -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v11.h, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v52, v39 -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v129.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v39, v12 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v13.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v39.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v128.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v53 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v13.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v12.l, v12.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v33.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v53 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v52, v39 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v119.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v13.l, v13.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 
v14.l, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v118.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v51, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v52 -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v14.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v119.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v39, v13 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v14.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v118.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v50.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v39, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v15.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v39.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v14.h, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v116.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v39, v15 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v16.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v115.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v39, v16 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v17.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v114.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 
8, v113.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v39, v17 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v18.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v112.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v48.l ; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v51, v39 -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v50.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v116.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v16.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v52 -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v15.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v115.l -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v49.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v50, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v51 -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v16.l, v16.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v114.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v50, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v51 -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v17.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v113.l -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v17.h, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l -; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v49, v39 
-; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v112.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v39, v18 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v19.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v39.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v50 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v19.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v18.l, v18.h -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v48.l -; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v33.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v50 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v49, v39 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v102.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v19.l, v19.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v20.l -; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v101.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v48, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v49 -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v20.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v102.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v39, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v20.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v101.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v38.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v39, v20 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v21.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v39.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v100.l -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v20.h, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v99.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v39, v21 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v22.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v98.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v37.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v39, v22 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v23.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v97.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v96.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v39, v23 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v24.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v87.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v36.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v39, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v25.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v86.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v85.l +; GFX11-TRUE16-NEXT: 
v_and_b16 v27.l, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v39, v25 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v26.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v84.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v39, v26 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v27.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v83.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v82.l ; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v39, v27 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v28.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v81.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.l +; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v39, v28 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v29.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v34.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v80.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v71.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v48, v39 -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v38.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v99.l -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v22.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v49 -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v21.l, v21.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v98.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v37.l +; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v31.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v29 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v30.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v34.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v70.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v33.l ; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v38, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v48 -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v22.l, v22.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v23.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v97.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v38, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v48 -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v23.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v96.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v23.h, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v32.h, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v39, v30 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v31.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v33.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v69.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v68.l ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v37, v39 -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v25.l, v33.h -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v87.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v86.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v38 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v25.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v24.l, v24.h -; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v36.l -; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v26.l, v33.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v38 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v37, v39 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v85.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v25.l, v25.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v84.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v36, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v37 -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v26.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v83.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v26.h, v27.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v36, v39 -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v33.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v27.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v35.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v82.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v28.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v37 -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v27.l, v27.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v81.l -; GFX11-TRUE16-NEXT: 
v_or_b16 v29.l, v29.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v34.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v36 -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v28.l, v28.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.h -; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.l, 8, v80.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v36 -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v29.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v71.l -; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v29.h, v30.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v39 -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v31.l, v33.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v30.l -; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v69.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v31.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v30.l, v30.h -; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v31.h, 8, v33.l -; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v32.l, v33.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v35 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v39 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v31.l, v31.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v32.l -; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.h, 8, v68.l 
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v34 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v32.l, v32.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v33, v39 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v39, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v32.l, v33.l +; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v32.h, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v39.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v39, v32 ; GFX11-TRUE16-NEXT: s_clause 0x5 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48 @@ -15413,63 +15372,63 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:380 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:372 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:368 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:364 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:360 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:356 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:352 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:348 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:344 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:372 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:368 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:364 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:360 +; 
GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:356 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:352 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:348 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:344 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:340 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:336 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:332 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:328 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:324 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:320 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:316 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:316 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:308 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:304 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:300 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:296 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:292 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:288 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:284 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:280 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:276 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:272 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:268 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:292 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:284 
+; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:276 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:264 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:260 ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:252 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:244 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:240 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:236 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:232 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:228 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:224 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:220 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:216 -; GFX11-TRUE16-NEXT: scratch_load_b32 v103, off, s32 offset:388 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_b32 v114, off, s32 offset:388 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, 
off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:16 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:40 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:48 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:64 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:72 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:80 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:96 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v102, off, s32 offset:112 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v160, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:136 @@ -15483,144 +15442,143 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:192 ; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v165, off, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 offset:208 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:212 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:204 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:196 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:188 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:180 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:172 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:164 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:156 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:148 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:140 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:124 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:108 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:100 +; 
GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:100 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:92 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:84 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:76 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:68 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:60 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:52 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v102, off, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v112, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v113, off, s32 offset:20 ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:12 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v116, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v115, off, s32 offset:4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v117.l, v30.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.h, v28.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, v26.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.h, v24.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.h, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.h, v18.l -; GFX11-TRUE16-NEXT: 
v_mov_b16_e64 v133.h, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.l, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.h, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.h, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v12.l ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v147.h, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.h, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v150.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v151.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.h, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.l, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.h, 8, v29.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v50.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.h, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, v0.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, 
v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v29.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54) -; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v103 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v80.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v81.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v82.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v82.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v82.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v83.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v83.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v83.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v84.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v85.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.h, 8, v85.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v85.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v86.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v87.l +; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v116.l, 8, v87.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v96.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v97.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.h, 8, v97.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v97.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v98.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v98.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v99.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v99.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v99.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v101.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v101.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v101.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v102.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.h, 8, v160.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v160.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v160.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v161.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v161.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v161.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v161.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.l, 8, v162.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v162.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v162.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v162.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32) -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.h, 8, v163.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v163.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.h, 8, v164.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v164.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v164.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v164.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.h, 8, v165.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v165.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v165.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.h, 8, v80.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.l, 8, v70.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v68.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v68.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v66.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.h, 8, v66.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v65.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v165.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v71.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v71.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v70.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.h, 8, v70.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v68.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v67.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.h, 8, v66.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v66.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v64.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v64.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v55.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.l, 8, v55.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v54.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v54.l -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v53.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v52.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v50.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v49.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v55.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.h, 8, v54.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v53.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v52.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v51.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v50.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v31.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -15634,746 +15592,660 @@ define <32 x i32> @bitcast_v128i8_to_v32i32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB14_3: ; %cmp.false -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v151.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v148.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v150.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v146.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, 0 -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v151.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v150.h -; GFX11-TRUE16-NEXT: v_and_b16 
v1.h, 0xff, v147.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v0.h, v149.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v149.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v144.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v133.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v148.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v144.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v145.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v1.h, v147.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v149.h +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v149.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v146.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v146.l +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v0.l, v151.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v151.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v0.l, v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v150.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v145.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v144.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v149, v0 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v1.h, v150.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v132.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v131.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v148.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v130.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v149, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v2.l, v148.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v2.l, v149.h ; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v134.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v131.l -; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v1, v7, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v2.l, v146.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v131.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v135.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v3.l, v145.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v128.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v118.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v134.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v145.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v130.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v149, v2 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v3.l, v147.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v147.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v135.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v119.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v119.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v118.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v149, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v4.l, v144.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v4.l, v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v133.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v117.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v116.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v4.l, v135.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v129.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v134.l -; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v5.l, v133.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v119.l -; GFX11-TRUE16-NEXT: v_or_b16 
v7.l, v5.h, v132.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v114.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v6.l, v132.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v117.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v130.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v112.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v7.l, v130.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v115.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v129.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v102.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v8.l, v128.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v112.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v8.h, v119.h -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v100.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v9.l, v118.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v102.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v9.h, v117.h -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v97.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v10.l, v116.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v100.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v10.h, v115.h -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v87.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v11.l, v114.h -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v98.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v11.h, v113.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v85.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v12.l, v113.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v96.l -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v12.h, v103.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v83.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v13.l, v103.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v13.h, v101.h -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v81.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v16, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v14.l, v101.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v84.l -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v14.h, v99.h -; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v71.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v15.l, v99.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v81.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v15.h, v98.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v69.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v18, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v16.l, v97.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v71.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v16.h, v96.h -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v67.l -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v19, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v17.l, v87.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v115.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v115.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v149, v4 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v5.l, v135.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v5.l, v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v132.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v129.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v113.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v112.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v149, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v6.l, v133.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v6.l, v149.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v103.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v128.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v103.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v101.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v149, v6 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v7.l, v131.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v7.l, v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v118.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v100.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v99.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v98.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v149, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v8.l, v129.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v8.l, v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v116.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v114.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v96.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v96.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v149, v8 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v9.l, v128.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v9.l, v149.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v113.l +; GFX11-TRUE16-NEXT: 
v_and_b16 v14.h, 0xff, v86.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v84.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v149, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v10.l, v117.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v10.l, v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v102.h +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v84.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v82.h +; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v81.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v149, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v11.l, v116.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v11.l, v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v101.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v99.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v80.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v80.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v149, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v12.l, v114.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v12.l, v149.h ; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v69.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v17.h, v86.h -; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v64.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v20, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v18.l, v85.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v19.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v67.h -; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v18.h, v84.h -; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v51.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v21, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v19.l, v83.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v97.h +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v69.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v68.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v149, v12 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v13.l, v112.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v13.l, v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, 
v18.h, v87.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v67.l ; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v65.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v19.h, v82.h -; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v48.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v22, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v20.l, v82.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v52.l -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v20.h, v80.h -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v39.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v23, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v21.l, v80.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v49.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v21.h, v70.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v39.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v24, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v22.l, v70.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v23.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v22.h, v68.h -; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v38.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v25, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v23.l, v68.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v38.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v23.h, v66.h -; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v36.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v26, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v24.l, v66.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v25.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v37.h -; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v24.h, v65.l -; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v36.l -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v27, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v25.l, v64.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v37.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v25.h, v55.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v28, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v26.l, v55.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v27.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v35.h -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v26.h, v54.h -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v34.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v29, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v30 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v27.l, v54.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.l -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16 +; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v65.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v149, v13 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v14.l, v102.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v14.l, v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v85.h +; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v83.h +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v55.h +; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v50.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v149, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v15.l, v100.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v15.l, v149.h +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v49.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v82.l +; GFX11-TRUE16-NEXT: v_and_b16 
v22.h, 0xff, v49.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v149, v15 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v16.l, v98.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v16.l, v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v71.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v48.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v39.h +; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v39.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v149, v16 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v17.l, v97.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v17.l, v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v70.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v68.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v38.h +; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v38.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v149, v17 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v18.l, v87.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v18.l, v149.h +; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v37.h +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v66.h +; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v37.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v149, v18 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v19.l, v85.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v19.l, v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v64.h +; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v36.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.h +; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v35.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v149, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v20.l, v83.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v20.l, v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v55.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v54.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v34.h +; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v34.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v149, v20 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v21.l, v81.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v21.l, 
v149.h +; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v53.l +; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v33.l +; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v149, v21 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v22.l, v71.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v22.l, v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v52.l +; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v32.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16 +; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr103_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr48_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr131_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 ; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr85_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v29, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v28.h, v53.h -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v34.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v30 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v28.l, v53.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v33.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v28.h, v52.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v33.l, v29.h, v51.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, 
v29.l, v50.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v30.l -; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v149, v22 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v23.l, v70.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v23.l, v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v51.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v30.l, v50.l -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v32.l, v49.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v149, v23 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v24.l, v67.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v24.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 
v24, v149, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v25.l, v66.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v25.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v149, v25 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v26.l, v64.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v26.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v149, v26 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v27.l, v54.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v27.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v149, v27 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v28.l, v53.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v28.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v149, v28 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v29.l, v52.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v29.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v149, v29 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v30.l, v51.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v30.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v149, v30 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v31.l, v50.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v31.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v31 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v149, v31 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 +; GFX11-TRUE16-NEXT: 
; implicit-def: $vgpr149_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2 ; GFX11-TRUE16-NEXT: .LBB14_4: ; %cmp.true -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v151.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v148.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v150.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v146.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v147.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v149.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v149.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v146.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v146.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v144.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v151.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v150.h, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v144.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v149.l, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v151.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v151.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v150.h, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v150.l, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v145.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v144.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v134.h, 3 +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v31, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v134.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v148.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v148.h, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v149.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v147.l, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v134.h, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v133.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v31 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.h, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v148.l, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v145.l, v3.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v31, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v31.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v147.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v147.h, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v132.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v131.l, 
3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v145.h, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v131.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v31, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v130.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v130.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v144.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v145.l, v3.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v128.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v135.h, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v135.l, v4.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v129.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v32.l, v32.l, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v4.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v134.l, v5.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v118.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v133.l, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v119.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v31, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v135.l, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v135.h, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v119.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v119.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v31, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v118.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v117.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v133.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v133.h, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v132.h, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v31, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v131.h, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v132.l, v6.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v116.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v132.l, v6.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v117.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v6.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v6.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v130.h, v7.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v114.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v130.l, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v115.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v31, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v113.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v112.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v129.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v129.h, v7.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v129.l, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v112.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v128.l, v8.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v112.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, 
v8.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v8.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v119.h, v9.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v102.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v10.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v118.l, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v102.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v9.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v31, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v128.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v128.h, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v103.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v103.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v31, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v101.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v100.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v117.h, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v118.l, v9.h ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v117.h, v10.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v100.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v11.l -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v116.l, v10.l -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v100.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v10.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v10.h -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v115.h, v11.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v97.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v114.h, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v98.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v31, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v11.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v116.l, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v116.h, v10.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v99.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v98.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v31, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v11.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v96.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v96.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v114.l, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v114.h, v11.h ; 
GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v113.h, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v87.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v13.l -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v113.l, v12.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v96.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v12.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v12.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v103.h, v13.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v85.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v103.l, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v15 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v31, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v112.h, v12.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v113.l, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v86.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v31, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l +; 
GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v84.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v84.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v102.l, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v102.h, v13.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v101.h, v14.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, v83.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v16, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v15.l -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v101.l, v14.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v84.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v14.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v14.h -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v99.h, v15.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v31, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v100.h, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v101.l, v14.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v82.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v81.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v16.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v99.l, v15.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v81.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v15.l -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v15.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v31, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v15.h +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v16.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v80.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v80.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v98.h, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v99.l, v15.h ; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v98.l, v16.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v71.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v18, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v17.l -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v97.l, v16.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v71.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v18 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v16.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v16.h -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v96.h, v17.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v69.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v19, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v87.l, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v31, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, 0x300, v17.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v97.l, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v97.h, v16.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v69.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h -; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v17.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v17.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v69.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v31, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v68.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v67.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v87.l, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v87.h, v17.h ; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v86.h, v18.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, v67.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v20, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v19.l -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v85.l, v18.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v67.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v20 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v18.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v18.h -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v84.h, v19.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v64.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v21, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v20.l -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v83.h, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v31, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v19.h +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v85.l, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v85.h, v18.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v65.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v21 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v19.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v19.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v65.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v31, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, 0x300, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v55.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v50.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v83.l, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v83.h, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v82.h, v20.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, v51.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v22, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v21.l -; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v82.l, v20.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, v52.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v22 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v20.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v20.h -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l -; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v80.h, v21.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v48.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v23, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v22.l -; 
GFX11-TRUE16-NEXT: v_or_b16 v21.l, v80.l, v21.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v23 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v21.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v21.h +; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v31, v22 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v21.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v81.h, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v82.l, v20.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v31, v23 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, 0x300, v21.h +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v48.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v48.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v71.l, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v71.h, v21.h ; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v70.h, v22.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v39.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v24, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v23.l -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v70.l, v22.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v24 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v22.l -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v22.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v68.h, v23.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v38.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v25, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v24.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v68.l, v23.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v25 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v23.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v23.h +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v31, v24 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, 0x300, v23.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v70.l, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v70.h, v22.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v39.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v31, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, 0x300, v23.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v24.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v38.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v38.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v67.h, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v68.l, v23.h ; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v66.l, v24.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, v37.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v26, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v25.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v66.h, v24.l -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, v38.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v24.h -; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v64.l, v25.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v24.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v26, v31 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v65.l, v25.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v36.h, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v25.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v25.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v31, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v25.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v66.l, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v66.h, v24.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v37.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, 0x300, v25.h +; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v26.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v36.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v36.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v64.l, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v64.h, v25.h ; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l ; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, 
v26.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v25.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v27, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v55.l, v26.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v55.h, v26.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v35.h, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v28 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v26.h -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v34.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v29, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v54.h, v27.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v54.l, v27.h -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v53.h, v28.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v31, v28 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, 0x300, v27.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v54.h, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v55.l, v26.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v31, v29 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, 0x300, v27.h +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v34.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 
v28.h, v34.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v53.h, v27.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v54.l, v27.h ; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v27.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v29, v31 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, v33.h, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v30 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v28.h +; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v31, v30 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v34.h, 0x300, v29.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v52.h, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v53.l, v28.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v33.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v33.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v53.l, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v30.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v31, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v29.h +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v30.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v32.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v52.h, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v50.h, v29.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v32.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v51.h, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v52.l, 
v29.h ; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l -; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v51.l, v30.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v33 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v29.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v29.l -; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v50.l, v30.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.l, 0x300, v30.h -; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v30.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 -; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v49.h, v30.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v31 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v32.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v31 +; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v31, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v32.h +; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v50.h, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v51.l, v30.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v31, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v32.h, 0x300, v32.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v32 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -42156,64 +42028,64 @@ define <128 x i8> 
@bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) @@ -42241,50 +42113,50 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 
24, v[3:4] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 24, v32 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 8, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v31 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 24, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v29 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v28 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v27 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v26 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v25 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v24 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v23 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v22 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v22 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v21 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v20 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 24, v18 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v17 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, 
v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v15 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v9 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] @@ -42328,50 +42200,50 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 24, v32 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 8, v32 
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v31 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 24, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v29 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v28 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v27 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v26 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v25 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v24 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v23 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v22 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v22 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v21 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v20 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 24, v18 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v17 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v15 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v14 ; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v9 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v1 ; GFX11-TRUE16-NEXT: .LBB36_4: ; %end @@ -42379,307 +42251,266 @@ define <128 x i8> @bitcast_v32f32_to_v128i8(<32 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v39.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v162.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 
0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, 0 -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v66.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v34.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v66.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v1.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v162.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v39.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v161.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v39, v1 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v2.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v160.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v65.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v161.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v1.h, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v55 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v160.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v39, v2 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v3.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v39.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v66 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v55, v39 -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v2.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v3.l -; 
GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v65.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v150.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v150.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v66, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v55 -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v3.l, v3.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v149.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v55, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v65 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v64.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v4.l, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v148.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v55, v39 +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v39, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v4.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v149.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v64.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v4 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v5.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v148.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v147.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v5.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v147.l -; 
GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.h, v6.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v65 +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v39, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v6.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v146.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v54.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v146.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v39, v6 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v7.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v39.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v145.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v55, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v64 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v6.l, v6.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v54.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v33.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v64 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v55, v39 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v144.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v7.l, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v135.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v54, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v55 -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v8.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v144.l +; 
GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v39, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v8.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v135.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v53.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v39, v8 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v9.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v39.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v134.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v8.h, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v133.l ; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v39, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v10.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v132.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v39, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v11.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v131.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v130.l ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v54, v39 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, 
v133.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v10.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v55 -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v132.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v39, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v12.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v129.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v51.l ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v53, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v54 -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v10.l, v10.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v131.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v53, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v54 -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v11.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v130.l -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v11.h, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v52, v39 -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v129.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v39, v12 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v13.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v39.h ; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v128.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v53 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v13.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v12.l, v12.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v33.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v53 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v52, v39 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v119.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v13.l, v13.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v118.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v51, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v52 -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v14.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v119.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v39, v13 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v14.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v118.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v50.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v39, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v15.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v39.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v14.h, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v116.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v39, v15 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v16.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v115.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v39, v16 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v17.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v114.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v113.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v39, v17 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v18.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v112.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v48.l ; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v51, v39 -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v50.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v116.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v16.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v52 -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v15.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v115.l -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v49.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v50, v39 -; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v51 -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v16.l, v16.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v114.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v50, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v51 -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v17.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v113.l -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v17.h, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l -; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v49, v39 -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v112.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v39, v18 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v19.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v39.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v50 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v19.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v18.l, v18.h -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v48.l -; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v33.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v50 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v49, v39 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v102.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v19.l, v19.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v20.l -; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v101.l -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v48, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v49 -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v20.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v102.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v39, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v20.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v101.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v38.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v39, v20 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v21.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v39.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v100.l -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v20.h, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v99.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v39, v21 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v22.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v98.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v37.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v39, v22 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v23.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v97.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v96.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v39, v23 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v24.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v87.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v36.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v39, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v25.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v86.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v85.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v39, v25 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v26.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v84.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v39, v26 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v27.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v83.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v82.l ; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v39, v27 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v28.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v81.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.l +; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: 
v_and_b16 v30.h, 0xff, v30.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v39, v28 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v29.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v34.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v80.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v71.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v48, v39 -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v38.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v99.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v22.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v49 -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v21.l, v21.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v98.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v37.l +; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v31.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v29 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v30.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v34.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v70.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v33.l ; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v38, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v48 -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v22.l, v22.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v23.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v97.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v38, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v48 -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v23.l, v33.h 
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v96.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v23.h, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v32.h, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v39, v30 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v31.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v33.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v69.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v68.l ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v37, v39 -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v25.l, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v87.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v86.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v38 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v25.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v24.l, v24.h -; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v36.l -; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v26.l, v33.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v38 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v37, v39 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v85.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v25.l, v25.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v84.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v36, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v37 -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v26.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v83.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v26.h, v27.l -; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v36, v39 -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v33.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v27.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v35.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v82.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v28.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v37 -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v27.l, v27.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v81.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v34.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v36 -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v28.l, v28.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.h -; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.l, 8, v80.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v36 -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v29.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v71.l -; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v29.h, v30.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v39 -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v31.l, v33.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v30.l -; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v69.l -; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v34, 0xffff, v35 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v31.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v30.l, v30.h -; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v31.h, 8, v33.l -; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v32.l, v33.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v35 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v39 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v31.l, v31.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v32.l -; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.h, 8, v68.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v34 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v32.l, v32.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v33, v39 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v39, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v32.l, v33.l +; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v32.h, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v39.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v39, v32 ; GFX11-TRUE16-NEXT: s_clause 0x5 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48 @@ -52210,63 +52041,63 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:380 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:372 -; GFX11-TRUE16-NEXT: 
scratch_load_d16_hi_b16 v49, off, s32 offset:368 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:364 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:360 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:356 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:352 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:348 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:344 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:372 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:368 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:364 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:360 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:356 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:352 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:348 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:344 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:340 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:336 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:332 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:328 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:324 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:320 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:316 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:316 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:308 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:304 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:300 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:296 -; 
GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:292 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:288 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:284 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:280 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:276 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:272 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:268 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:292 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:284 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:276 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:264 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:260 ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:252 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:244 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:240 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:236 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:232 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:228 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:224 -; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v52, off, s32 offset:220 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:216 -; GFX11-TRUE16-NEXT: scratch_load_b32 v103, off, s32 offset:388 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_b32 v114, off, s32 offset:388 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:16 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:40 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:48 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:64 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:72 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:80 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 
offset:88 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:96 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v102, off, s32 offset:112 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v160, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:136 @@ -52280,144 +52111,143 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:192 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 offset:208 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:212 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:204 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:196 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:188 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:180 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:172 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:164 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:156 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:148 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:140 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:180 +; 
GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:124 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:108 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:100 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:92 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:84 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:76 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:68 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:60 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:52 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v102, off, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v112, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 
offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v113, off, s32 offset:20 ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:12 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v116, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v115, off, s32 offset:4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v117.l, v30.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.h, v28.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, v26.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.h, v24.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.h, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.h, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.l, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.h, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.h, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v12.l ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v147.h, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.h, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v150.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v151.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.h, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.h, 8, 
v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.l, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.h, 8, v29.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v50.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.h, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, v0.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v29.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54) -; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v103 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v80.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v81.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v82.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v82.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v82.h +; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v83.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v83.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v83.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v84.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v85.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.h, 8, v85.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v85.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v86.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v87.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v87.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v96.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v97.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.h, 8, v97.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v97.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v98.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v98.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v99.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v99.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v99.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v101.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v101.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v101.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v102.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.h, 8, v160.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v160.l ; GFX11-TRUE16-NEXT: s_waitcnt 
vmcnt(37) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v160.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v161.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v161.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v161.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v161.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.l, 8, v162.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v162.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v162.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v162.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.h, 8, v163.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v163.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.h, 8, v164.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v164.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v164.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v164.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.h, 8, v165.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v165.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v165.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.h, 8, v80.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.l, 8, v70.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v68.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v68.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v66.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.h, 8, v66.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v65.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, 
v165.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v71.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v71.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v70.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.h, 8, v70.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v68.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v67.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.h, 8, v66.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v66.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v64.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v64.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v55.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.l, 8, v55.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v54.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v54.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v53.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v52.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v50.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v49.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v55.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.h, 8, v54.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v53.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v52.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v51.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v50.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v31.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.l ; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -52431,746 +52261,660 @@ define <32 x float> @bitcast_v128i8_to_v32f32(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB38_3: ; %cmp.false -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v151.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v148.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v150.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v146.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, 0 -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v151.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v150.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v147.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v0.h, v149.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v149.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v144.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v133.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v148.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v144.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v145.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v1.h, v147.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v149.h +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v149.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v146.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v146.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v149.l, v0.l, v151.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v151.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v0.l, v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v150.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v145.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v144.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v149, v0 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v1.h, v150.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v132.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v131.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v148.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v130.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v149, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v2.l, v148.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v2.l, v149.h ; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v134.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v131.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v2.l, v146.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v131.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v135.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v3.l, v145.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v128.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v118.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v134.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v145.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v130.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v149, v2 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v3.l, v147.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v147.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v135.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v119.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v119.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, 
v118.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v149, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v4.l, v144.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v4.l, v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v133.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v117.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v116.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v4.l, v135.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v129.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v134.l -; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v5.l, v133.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v119.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v5.h, v132.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v114.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v6.l, v132.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v117.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v130.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v112.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v7.l, v130.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v115.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v129.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v102.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v8.l, v128.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v112.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v8.h, v119.h -; 
GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v100.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v9.l, v118.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v102.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v9.h, v117.h -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v97.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v10.l, v116.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v100.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v10.h, v115.h -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v87.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v11.l, v114.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v98.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v11.h, v113.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v85.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v12.l, v113.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v96.l -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v12.h, v103.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v83.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v13.l, v103.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v13.h, v101.h -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v81.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v16, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v14.l, v101.l -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v84.l -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v14.h, v99.h -; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v71.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v15.l, v99.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v81.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v15.h, v98.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v69.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v18, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v16.l, v97.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v71.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v16.h, v96.h -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v67.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v19, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v17.l, v87.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v115.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v115.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v149, v4 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v5.l, v135.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v5.l, v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v132.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v129.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v113.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v112.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v149, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v6.l, v133.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v6.l, v149.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v103.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v128.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v103.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v101.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v149, v6 +; GFX11-TRUE16-NEXT: v_or_b16 
v149.l, v7.l, v131.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v7.l, v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v118.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v100.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v99.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v98.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v149, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v8.l, v129.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v8.l, v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v116.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v114.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v96.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v96.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v149, v8 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v9.l, v128.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v9.l, v149.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v113.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v86.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v84.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v149, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v10.l, v117.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v10.l, v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v102.h +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v84.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v82.h +; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v81.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v149, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v11.l, v116.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v11.l, v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v101.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v99.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v80.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v80.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v149, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v12.l, v114.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v12.l, v149.h ; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v69.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v17.h, v86.h -; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, 
v64.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v20, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v18.l, v85.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v19.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v67.h -; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v18.h, v84.h -; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v51.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v21, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v19.l, v83.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v97.h +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v69.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v68.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v149, v12 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v13.l, v112.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v13.l, v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v87.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v67.l ; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v65.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v19.h, v82.h -; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v48.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v22, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v20.l, v82.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v52.l -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v20.h, v80.h -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v39.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v23, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v21.l, v80.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v49.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v21.h, v70.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v39.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v24, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v22.l, v70.l -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v23.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v22.h, v68.h -; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v38.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v25, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v23.l, v68.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v38.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v23.h, v66.h -; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v36.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v26, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v24.l, v66.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v25.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v37.h -; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v24.h, v65.l -; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v36.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v27, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v25.l, v64.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v37.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v25.h, v55.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v28, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v26.l, v55.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v27.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v35.h -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v26.h, v54.h -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v34.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v29, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v30 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v27.l, v54.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 
0xff, v35.l -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16 +; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v65.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v149, v13 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v14.l, v102.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v14.l, v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v85.h +; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v83.h +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v55.h +; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v50.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v149, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v15.l, v100.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v15.l, v149.h +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v49.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v82.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v49.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v149, v15 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v16.l, v98.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v16.l, v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v71.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v48.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v39.h +; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v39.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v149, v16 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v17.l, v97.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v17.l, v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v70.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v68.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v38.h +; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v38.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v149, v17 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v18.l, v87.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v18.l, v149.h +; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v37.h +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v66.h +; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, 
v37.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v149, v18 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v19.l, v85.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v19.l, v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v64.h +; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v36.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.h +; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v35.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v149, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v20.l, v83.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v20.l, v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v55.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v54.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v34.h +; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v34.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v149, v20 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v21.l, v81.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v21.l, v149.h +; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v53.l +; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v33.l +; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v149, v21 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v22.l, v71.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v22.l, v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v52.l +; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v32.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16 
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 
+; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 +; GFX11-TRUE16-NEXT: 
; implicit-def: $vgpr150_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 -; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v29, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v28.h, v53.h -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v34.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v30 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v28.l, v53.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v33.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v28.h, v52.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v33.l, v29.h, v51.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v29.l, v50.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v30.l -; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v149, v22 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v23.l, v70.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v23.l, v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v51.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35 -; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v30.l, v50.l -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v32.l, v49.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v149, v23 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v24.l, v67.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v24.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v149, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v25.l, v66.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v25.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v149, v25 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v26.l, v64.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v26.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v149, v26 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v27.l, v54.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v27.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v149, v27 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v28.l, v53.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v28.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v149, v28 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v29.l, v52.h +; 
GFX11-TRUE16-NEXT: v_mov_b16_e64 v29.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v149, v29 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v30.l, v51.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v30.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v149, v30 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v31.l, v50.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v31.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v31 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v149, v31 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB38_2 ; GFX11-TRUE16-NEXT: .LBB38_4: ; %cmp.true -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v151.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v148.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v150.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v146.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v147.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v149.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v149.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v146.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v146.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v144.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v151.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v150.h, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v144.h, 3 
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v149.l, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v151.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v151.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v150.h, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v150.l, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v145.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v144.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v134.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v31, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v134.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v148.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v148.h, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v149.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v147.l, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v134.h, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v133.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v31 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 
0x300, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.h, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v148.l, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v145.l, v3.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v31, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v31.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v147.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v147.h, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v132.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v131.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v145.h, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v131.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v31, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v130.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v130.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v144.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v145.l, v3.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v128.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v135.h, v4.h -; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v3, v6, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v135.l, v4.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v129.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v32.l, v32.l, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v4.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v134.l, v5.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v118.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v133.l, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v119.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v31, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v135.l, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v135.h, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v119.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v119.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v31, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v118.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v117.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v133.l, v5.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v7.h, v133.h, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v132.h, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v31, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v131.h, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v132.l, v6.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v116.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v132.l, v6.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v117.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v6.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v6.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v130.h, v7.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v114.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v130.l, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v115.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v31, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h +; 
GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v113.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v112.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v129.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v129.h, v7.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v129.l, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v112.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v128.l, v8.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v112.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v8.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v8.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v119.h, v9.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v102.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v10.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v118.l, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v102.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v9.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v31, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v128.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v128.h, 
v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v103.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v103.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v31, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v101.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v100.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v117.h, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v118.l, v9.h ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v117.h, v10.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v100.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v11.l -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v116.l, v10.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v100.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v10.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v10.h -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v115.h, v11.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v97.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v114.h, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v98.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v31, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v11.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v116.l, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v116.h, v10.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v99.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v98.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v31, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v11.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v96.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v96.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v114.l, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v114.h, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v113.h, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v87.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v13.l -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v113.l, v12.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v96.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v12.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v12.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v103.h, v13.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v85.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v103.l, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v15 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v31, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v112.h, v12.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v113.l, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v86.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v31, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v84.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v84.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v102.l, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v102.h, v13.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v101.h, v14.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, v83.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v16, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v15.l -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v101.l, v14.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v84.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v14.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v14.h -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v99.h, v15.h +; 
GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v31, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v100.h, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v101.l, v14.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v82.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v81.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v16.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v99.l, v15.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v81.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v15.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v15.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v31, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v15.h +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v16.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v80.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v80.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v98.h, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v99.l, v15.h ; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v98.l, v16.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v71.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v18, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v17.l -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v97.l, v16.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v71.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v19, 0xffff, v18 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v16.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v16.h -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v96.h, v17.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v69.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v19, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v87.l, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v31, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, 0x300, v17.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v97.l, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v97.h, v16.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v69.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v17.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v17.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v69.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v31, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v68.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v67.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v87.l, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v87.h, v17.h ; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v86.h, v18.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, v67.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v20, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v19.l -; GFX11-TRUE16-NEXT: 
v_or_b16 v18.l, v85.l, v18.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v67.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v20 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v18.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v18.h -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v84.h, v19.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v64.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v21, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v20.l -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v83.h, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v31, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v19.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v85.l, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v85.h, v18.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v65.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v21 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v19.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v19.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v65.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v31, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, 0x300, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v55.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v50.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v83.l, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v83.h, v19.h ; GFX11-TRUE16-NEXT: 
v_and_b16 v20.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v82.h, v20.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, v51.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v22, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v21.l -; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v82.l, v20.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, v52.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v22 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v20.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v20.h -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l -; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v80.h, v21.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v48.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v23, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v80.l, v21.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v23 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v21.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v21.h +; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v31, v22 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v21.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v81.h, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v82.l, v20.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v31, v23 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, 0x300, v21.h +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v22.l +; GFX11-TRUE16-NEXT: 
v_and_b16 v21.h, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v48.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v48.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v71.l, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v71.h, v21.h ; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v70.h, v22.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v39.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v24, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v23.l -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v70.l, v22.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v24 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v22.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v22.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v68.h, v23.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v38.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v25, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v24.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v68.l, v23.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v25 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v23.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v23.h +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v31, v24 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, 0x300, v23.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v70.l, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v70.h, v22.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 
v24.l, v39.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v39.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v31, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, 0x300, v23.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v24.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v38.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v38.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v67.h, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v68.l, v23.h ; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v66.l, v24.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, v37.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v26, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v25.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v66.h, v24.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, v38.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v24.h -; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v64.l, v25.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v24.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v26, v31 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v65.l, v25.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v36.h, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v25.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v25.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v31, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 
v27.h, 0x300, v25.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v66.l, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v66.h, v24.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v37.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, 0x300, v25.h +; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v26.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v36.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v36.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v64.l, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v64.h, v25.h ; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l ; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v25.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v27, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v55.l, v26.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v55.h, v26.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v35.h, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v28 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v26.h -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v34.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v29, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v54.h, v27.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v54.l, v27.h -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 
0xffff, v29 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v53.h, v28.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v31, v28 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, 0x300, v27.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v54.h, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v55.l, v26.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v31, v29 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, 0x300, v27.h +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v34.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v34.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v53.h, v27.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v54.l, v27.h ; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v27.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v29, v31 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, v33.h, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v30 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v28.h +; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v31, v30 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v34.h, 0x300, v29.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v52.h, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v53.l, v28.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v33.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v33.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 
0xff, v29.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v53.l, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v30.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v31, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v29.h +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v30.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v32.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v52.h, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v50.h, v29.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v32.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v51.h, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v52.l, v29.h ; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l -; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v51.l, v30.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v33 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v29.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v29.l -; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v50.l, v30.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.l, 0x300, v30.h -; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v30.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 -; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v49.h, v30.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v31 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v32.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v31 +; GFX11-TRUE16-NEXT: v_and_b16 
v30.h, 0xff, v30.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v31, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v32.h +; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v50.h, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v51.l, v30.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v31, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v32.h, 0x300, v32.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v32 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -77938,64 +77682,64 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) @@ -78023,50 +77767,50 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[3:4] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 24, v32 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 8, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v31 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 24, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v29 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v28 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v27 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v26 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v25 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v24 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v23 -; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v22 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v22 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v21 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v20 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 24, v18 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v17 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v15 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v9 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6 
-; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] @@ -78135,50 +77879,50 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 24, v32 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 8, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v31 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 24, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v29 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v28 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v27 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v26 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v25 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v24 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v23 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v23 +; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v22 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v22 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v21 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v20 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 24, v18 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v17 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v15 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v9 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4 
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v1 ; GFX11-TRUE16-NEXT: .LBB56_4: ; %end @@ -78186,307 +77930,266 @@ define <128 x i8> @bitcast_v16i64_to_v128i8(<16 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v39.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v162.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, 0 -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v66.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v34.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v66.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v1.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v162.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v39.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v161.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v39, v1 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v2.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v160.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v65.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v161.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v1.h, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v55 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v160.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v39, v2 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v3.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v39.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v66 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v55, v39 -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v2.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v65.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v150.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v150.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v66, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v55 -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v3.l, v3.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v149.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v55, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v65 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v64.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v4.l, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v148.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v55, v39 +; 
GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v39, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v4.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v149.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v64.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v4 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v5.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v148.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v147.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v5.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v147.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.h, v6.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v65 +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v39, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v6.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v146.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v54.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v146.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v39, v6 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v7.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v39.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v145.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v55, v39 -; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v64 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v6.l, v6.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v54.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v33.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v64 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v55, v39 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v144.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v7.l, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v135.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v54, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v55 -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v8.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v144.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v39, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v8.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v135.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v53.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v39, v8 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v9.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v39.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v134.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v8.h, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v133.l ; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v39, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v10.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v34.h +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v132.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v39, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v11.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v131.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v130.l ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v54, v39 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v133.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v10.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v55 -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v132.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v39, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v12.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v129.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v51.l ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v53, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v54 -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v10.l, v10.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.l -; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v12.l, 8, v131.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v53, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v54 -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v11.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v130.l -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v11.h, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v52, v39 -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v129.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v39, v12 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v13.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v39.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v128.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v53 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v13.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v12.l, v12.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v33.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v53 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v52, v39 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v119.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v13.l, v13.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v118.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v51, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v52 -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v14.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v119.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: 
v_and_b16 v15.h, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v39, v13 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v14.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v118.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v50.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v39, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v15.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v39.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v14.h, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v116.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v39, v15 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v16.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v115.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v39, v16 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v17.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v114.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v113.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v39, v17 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v18.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v112.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 
v34.h, 8, v48.l ; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v51, v39 -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v50.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v116.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v16.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v52 -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v15.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v115.l -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v49.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v50, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v51 -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v16.l, v16.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v114.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v50, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v51 -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v17.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v113.l -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v17.h, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l -; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v49, v39 -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v112.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v39, v18 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v19.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, 
v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v39.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v50 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v19.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v18.l, v18.h -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v48.l -; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v33.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v50 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v49, v39 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v102.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v19.l, v19.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v20.l -; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v101.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v48, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v49 -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v20.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v102.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v39, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v20.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v101.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v38.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v39, v20 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v21.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v39.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v100.l -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v20.h, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v99.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l +; 
GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v39, v21 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v22.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v98.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v37.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v39, v22 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v23.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v97.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v96.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v39, v23 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v24.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v87.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v36.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v39, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v25.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v86.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v85.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v39, v25 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v26.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v84.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l +; GFX11-TRUE16-NEXT: 
v_and_b16 v28.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v39, v26 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v27.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v83.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v82.l ; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v39, v27 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v28.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v81.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.l +; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v39, v28 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v29.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v34.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v80.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v71.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v48, v39 -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v38.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v99.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v22.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v49 -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v21.l, v21.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v98.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v37.l +; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v31.h 
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v29 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v30.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v34.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v70.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v33.l ; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v38, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v48 -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v22.l, v22.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v23.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v97.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v38, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v48 -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v23.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v96.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v23.h, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v32.h, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v39, v30 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v31.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v33.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v69.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v68.l ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v37, v39 -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v25.l, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v87.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v86.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v38 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v25.l -; GFX11-TRUE16-NEXT: 
v_or_b16 v39.h, v24.l, v24.h -; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v36.l -; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v26.l, v33.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v38 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v37, v39 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v85.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v25.l, v25.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v84.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v36, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v37 -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v26.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v83.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v26.h, v27.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v36, v39 -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v33.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v27.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v35.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v82.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v28.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v37 -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v27.l, v27.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v81.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v34.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v36 -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v28.l, v28.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.h -; GFX11-TRUE16-NEXT: v_and_b16 
v29.h, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.l, 8, v80.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v36 -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v29.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v71.l -; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v29.h, v30.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v39 -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v31.l, v33.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v30.l -; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v69.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v31.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v30.l, v30.h -; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v31.h, 8, v33.l -; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v32.l, v33.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v35 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v39 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v31.l, v31.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v32.l -; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.h, 8, v68.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v34 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v32.l, v32.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v33, v39 +; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v31, v39, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v32.l, v33.l +; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v32.h, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v39.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v39, v32 ; GFX11-TRUE16-NEXT: s_clause 0x5 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48 @@ -87060,63 +86763,63 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:380 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:372 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:368 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:364 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:360 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:356 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:352 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:348 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:344 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:372 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:368 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:364 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:360 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:356 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:352 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:348 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:344 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:340 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 
offset:336 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:332 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:328 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:324 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:320 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:316 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:316 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:308 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:304 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:300 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:296 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:292 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:288 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:284 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:280 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:276 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:272 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:268 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:292 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:284 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:276 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:264 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, 
s32 offset:260 ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:252 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:244 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:240 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:236 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:232 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:228 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:224 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:220 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:216 -; GFX11-TRUE16-NEXT: scratch_load_b32 v103, off, s32 offset:388 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:228 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_b32 v114, off, s32 offset:388 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:16 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:40 -; GFX11-TRUE16-NEXT: 
scratch_load_d16_hi_b16 v86, off, s32 offset:48 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:64 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:72 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:80 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:96 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v102, off, s32 offset:112 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v160, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v160, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:136 @@ -87130,144 +86833,143 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:192 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 offset:208 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:212 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:204 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:196 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, 
s32 offset:188 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:180 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:172 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:164 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:156 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:148 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:140 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:124 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:108 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:100 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:92 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:84 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:76 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:68 -; GFX11-TRUE16-NEXT: 
scratch_load_d16_hi_b16 v100, off, s32 offset:60 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:52 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v102, off, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v112, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v113, off, s32 offset:20 ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:12 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v116, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v115, off, s32 offset:4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v117.l, v30.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.h, v28.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, v26.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.h, v24.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.h, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.h, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.l, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.h, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 
v132.h, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v12.l ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v147.h, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.h, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v150.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v151.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.h, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.l, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.h, 8, v29.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v50.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.h, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, v0.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 
8, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v29.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54) -; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v103 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v80.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v81.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v82.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v82.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v82.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v83.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v83.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v83.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v84.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v85.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(48) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.h, 8, v85.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v85.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v86.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v87.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v87.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v96.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v97.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.h, 8, v97.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v97.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43) -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v98.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v98.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v99.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v99.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v99.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v101.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v101.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v101.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v102.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.h, 8, v160.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v160.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v160.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v161.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v161.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v161.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v161.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.l, 8, v162.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v162.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v162.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v162.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.h, 8, v163.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v163.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.h, 8, v164.l +; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v85.l, 8, v164.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v164.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v164.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.h, 8, v165.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v165.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v165.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.h, 8, v80.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.l, 8, v70.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v68.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v68.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v66.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.h, 8, v66.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v65.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v165.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v71.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v71.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v70.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.h, 8, v70.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v68.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v67.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.h, 8, v66.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v66.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v64.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v64.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v55.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.l, 8, v55.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v54.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v54.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v53.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v52.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v50.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v49.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.h -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v55.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.h, 8, v54.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v53.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v52.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v51.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v50.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v31.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -87281,746 +86983,660 @@ define <16 x i64> @bitcast_v128i8_to_v16i64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB58_3: ; %cmp.false -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v151.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v148.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v150.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v146.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, 0 -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v151.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v150.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v147.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v0.h, v149.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v149.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v144.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v133.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l 
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v148.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v144.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v145.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v1.h, v147.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v149.h +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v149.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v146.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v146.l +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v0.l, v151.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v151.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v0.l, v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v150.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v145.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v144.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v149, v0 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v1.h, v150.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v132.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v131.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v148.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v130.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v149, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v2.l, v148.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v2.l, v149.h ; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v134.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v131.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v2.l, v146.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v131.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v135.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v3.l, v145.l 
-; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v128.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v118.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v134.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v145.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v130.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v149, v2 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v3.l, v147.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v147.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v135.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v119.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v119.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v118.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v149, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v4.l, v144.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v4.l, v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v133.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v117.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v116.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v4.l, v135.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v129.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v134.l -; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v5.l, v133.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v119.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v5.h, v132.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v114.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v6.l, v132.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v117.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v130.h -; 
GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v112.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v7.l, v130.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v115.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v129.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v102.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v8.l, v128.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v112.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v8.h, v119.h -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v100.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v9.l, v118.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v102.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v9.h, v117.h -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v97.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v10.l, v116.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v100.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v10.h, v115.h -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v87.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v11.l, v114.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v98.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v11.h, v113.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v85.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v12.l, v113.l -; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v16.l, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v96.l -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v12.h, v103.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v83.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v13.l, v103.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v13.h, v101.h -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v81.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v16, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v14.l, v101.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v84.l -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v14.h, v99.h -; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v71.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v15.l, v99.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v81.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v15.h, v98.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v69.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v18, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v16.l, v97.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v71.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v16.h, v96.h -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v67.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v19, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v17.l, v87.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v115.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v115.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v149, v4 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, 
v5.l, v135.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v5.l, v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v132.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v129.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v113.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v112.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v149, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v6.l, v133.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v6.l, v149.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v103.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v128.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v103.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v101.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v149, v6 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v7.l, v131.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v7.l, v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v118.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v100.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v99.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v98.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v149, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v8.l, v129.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v8.l, v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v116.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v114.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v96.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v96.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v149, v8 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v9.l, v128.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v9.l, v149.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v113.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v86.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v84.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v149, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v10.l, v117.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v10.l, v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v102.h +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v84.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v82.h +; 
GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v81.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v149, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v11.l, v116.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v11.l, v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v101.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v99.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v80.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v80.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v149, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v12.l, v114.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v12.l, v149.h ; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v69.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v17.h, v86.h -; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v64.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v20, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v18.l, v85.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v19.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v67.h -; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v18.h, v84.h -; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v51.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v21, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v19.l, v83.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v97.h +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v69.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v68.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v149, v12 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v13.l, v112.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v13.l, v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v87.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v67.l ; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v65.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v19.h, v82.h -; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v48.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v22, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v20.l, v82.l -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v52.l -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v20.h, v80.h -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v39.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v23, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v21.l, v80.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v49.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v21.h, v70.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v39.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v24, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v22.l, v70.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v23.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v22.h, v68.h -; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v38.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v25, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v23.l, v68.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v38.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v23.h, v66.h -; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v36.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v26, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v24.l, v66.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v25.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v37.h -; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v24.h, v65.l -; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v36.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v27, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v25.l, v64.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v37.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v25.h, v55.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | 
instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v28, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v26.l, v55.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v27.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v35.h -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v26.h, v54.h -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v34.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v29, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v30 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v27.l, v54.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.l -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16 +; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v65.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v149, v13 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v14.l, v102.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v14.l, v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v85.h +; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v83.h +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v55.h +; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v50.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v149, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v15.l, v100.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v15.l, v149.h +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v49.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v82.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v49.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v149, v15 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v16.l, v98.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v16.l, v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v71.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v48.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v39.h 
+; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v39.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v149, v16 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v17.l, v97.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v17.l, v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v70.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v68.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v38.h +; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v38.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v149, v17 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v18.l, v87.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v18.l, v149.h +; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v37.h +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v66.h +; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v37.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v149, v18 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v19.l, v85.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v19.l, v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v64.h +; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v36.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.h +; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v35.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v149, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v20.l, v83.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v20.l, v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v55.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v54.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v34.h +; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v34.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v149, v20 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v21.l, v81.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v21.l, v149.h +; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v53.l +; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v33.l +; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v149, v21 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v22.l, v71.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v22.l, v149.h +; 
GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v52.l +; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v32.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr98_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr118_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 ; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr82_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v29, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v28.h, v53.h -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v34.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v30 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v28.l, v53.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v33.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v28.h, v52.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v33.l, v29.h, v51.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v29.l, v50.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v30.l -; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr50_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v149, v22 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v23.l, v70.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v23.l, v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v51.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v30.l, v50.l -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v32.l, v49.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v149, v23 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v24.l, v67.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v24.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v149, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v25.l, v66.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v25.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v149, v25 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, 
v26.l, v64.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v26.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v149, v26 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v27.l, v54.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v27.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v149, v27 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v28.l, v53.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v28.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v149, v28 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v29.l, v52.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v29.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v149, v29 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v30.l, v51.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v30.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v149, v30 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v31.l, v50.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v31.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v31 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v149, v31 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB58_2 ; GFX11-TRUE16-NEXT: .LBB58_4: ; %cmp.true -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v151.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v148.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v150.l, 3 -; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v2.l, v146.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v147.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v149.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v149.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v146.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v146.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v144.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v151.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v150.h, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v144.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v149.l, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v151.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v151.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v150.h, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v150.l, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v145.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v144.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v134.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v31, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v134.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v148.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v3.h, v148.h, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v149.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v147.l, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v134.h, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v133.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v31 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.h, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v148.l, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v145.l, v3.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v31, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v31.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v147.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v147.h, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v132.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v131.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v145.h, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v131.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, 
v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v31, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v130.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v130.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v144.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v145.l, v3.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v128.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v135.h, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v135.l, v4.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v129.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v32.l, v32.l, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v4.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v134.l, v5.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v118.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v133.l, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v119.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v31, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 
0x300, v5.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v135.l, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v135.h, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v119.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v119.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v31, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v118.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v117.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v133.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v133.h, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v132.h, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v31, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v131.h, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v132.l, v6.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v116.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v132.l, v6.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v117.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v6.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v6.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v130.h, v7.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v114.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v31 -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v130.l, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v115.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v31, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v113.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v112.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v129.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v129.h, v7.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v129.l, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v112.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v128.l, v8.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v112.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v8.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v8.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v119.h, v9.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v102.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v10.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v118.l, v9.l 
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v102.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v9.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v31, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v128.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v128.h, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v103.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v103.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v31, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v101.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v100.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v117.h, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v118.l, v9.h ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v117.h, v10.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v100.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v11.l -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v116.l, v10.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v100.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v10.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v10.h -; 
GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v115.h, v11.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v97.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v114.h, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v98.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v31, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v11.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v116.l, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v116.h, v10.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v99.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v98.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v31, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v11.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v96.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v96.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v114.l, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v114.h, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v113.h, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v87.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v13.l -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v113.l, v12.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v96.l, 3 -; 
GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v12.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v12.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v103.h, v13.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v85.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v103.l, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v15 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v31, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v112.h, v12.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v113.l, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v86.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v31, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v84.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v84.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v102.l, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v102.h, v13.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; 
GFX11-TRUE16-NEXT: v_or_b16 v14.h, v101.h, v14.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, v83.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v16, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v15.l -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v101.l, v14.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v84.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v14.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v14.h -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v99.h, v15.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v31, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v100.h, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v101.l, v14.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v82.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v81.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v16.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v99.l, v15.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v81.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v15.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v15.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v31, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v15.h +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v16.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v80.h, 3 +; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v80.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v98.h, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v99.l, v15.h ; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v98.l, v16.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v71.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v18, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v17.l -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v97.l, v16.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v71.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v18 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v16.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v16.h -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v96.h, v17.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v69.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v19, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v87.l, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v31, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, 0x300, v17.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v97.l, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v97.h, v16.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v69.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v17.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v17.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v69.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v31, v19 +; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v31.l, 0x300, v17.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v68.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v67.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v87.l, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v87.h, v17.h ; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v86.h, v18.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, v67.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v20, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v19.l -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v85.l, v18.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v67.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v20 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v18.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v18.h -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v84.h, v19.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v64.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v21, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v20.l -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v83.h, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v31, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v19.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v85.l, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v85.h, v18.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v65.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 
0xffff, v21 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v19.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v19.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v65.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v31, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, 0x300, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v55.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v50.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v83.l, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v83.h, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v82.h, v20.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, v51.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v22, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v21.l -; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v82.l, v20.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, v52.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v22 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v20.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v20.h -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l -; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v80.h, v21.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v48.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v23, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v80.l, v21.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v23 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v21.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, 
v21.h +; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v31, v22 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v21.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v81.h, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v82.l, v20.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v31, v23 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, 0x300, v21.h +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v48.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v48.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v71.l, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v71.h, v21.h ; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v70.h, v22.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v39.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v24, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v23.l -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v70.l, v22.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v24 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v22.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v22.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v68.h, v23.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v38.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v25, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v24.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v68.l, v23.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.l, 3 -; 
GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v25 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v23.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v23.h +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v31, v24 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, 0x300, v23.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v70.l, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v70.h, v22.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v39.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v31, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, 0x300, v23.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v24.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v38.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v38.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v67.h, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v68.l, v23.h ; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v66.l, v24.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, v37.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v26, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v25.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v66.h, v24.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, v38.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v24.h -; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; 
GFX11-TRUE16-NEXT: v_or_b16 v25.h, v64.l, v25.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v24.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v26, v31 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v65.l, v25.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v36.h, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v25.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v25.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v31, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v25.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v66.l, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v66.h, v24.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v37.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, 0x300, v25.h +; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v26.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v36.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v36.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v64.l, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v64.h, v25.h ; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l ; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v25.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v27, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v55.l, v26.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v55.h, v26.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v35.h, 3 -; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v28 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v26.h -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v34.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v29, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v54.h, v27.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v54.l, v27.h -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v53.h, v28.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v31, v28 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, 0x300, v27.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v54.h, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v55.l, v26.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v31, v29 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, 0x300, v27.h +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v34.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v34.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v53.h, v27.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v54.l, v27.h ; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v27.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v29, v31 
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, v33.h, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v30 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v28.h +; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v31, v30 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v34.h, 0x300, v29.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v52.h, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v53.l, v28.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v33.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v33.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v53.l, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v30.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v31, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v29.h +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v30.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v32.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v52.h, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v50.h, v29.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v32.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v51.h, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v52.l, v29.h ; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l -; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v51.l, v30.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v33 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v29.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v29.l -; GFX11-TRUE16-NEXT: v_or_b16 v30.l, 
v50.l, v30.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.l, 0x300, v30.h -; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v30.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 -; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v49.h, v30.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v31 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v32.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v31 +; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v31, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v32.h +; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v50.h, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v51.l, v30.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v31, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v32.h, 0x300, v32.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v32 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -111800,64 +111416,64 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) @@ -111885,50 +111501,50 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[3:4] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 24, v32 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 8, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v31 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 24, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v30 -; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v82, 8, v29 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v28 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v27 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v26 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v25 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v24 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v23 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v22 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v22 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v21 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v20 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 24, v18 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v17 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v15 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v12 ; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v131, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v9 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[7:8] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] @@ -111972,50 +111588,50 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 24, v32 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 8, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 8, v31 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 24, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v29 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 24, v28 +; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v81, 8, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v28 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v27 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v26 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v25 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v24 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v23 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v22 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v22 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v21 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v20 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 24, v18 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v17 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v15 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v11 -; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v132, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v9 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v1 ; GFX11-TRUE16-NEXT: .LBB72_4: ; %end @@ -112023,307 +111639,266 @@ define <128 x i8> @bitcast_v16f64_to_v128i8(<16 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v39.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v162.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, 0 -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v66.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v34.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v66.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, 
v1.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v162.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v39.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v161.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v39, v1 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v2.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v160.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v65.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v161.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v1.h, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v55 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v160.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v39, v2 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v3.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v39.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v66, 0xffff, v66 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v55, v39 -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v2.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v65.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v150.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v150.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v66, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 
0xffff, v55 -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v3.l, v3.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v149.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v55, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v65 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v64.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v4.l, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v148.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v55, v39 +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v39, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v4.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v149.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v64.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v4 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v5.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v148.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v147.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v5.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v147.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.h, v6.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v65 +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v39, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v6.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, 
v146.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v54.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v146.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v39, v6 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v7.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v39.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v145.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v55, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v55, 0xffff, v64 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v6.l, v6.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v54.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v33.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v64 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v55, v39 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v144.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v7.l, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v135.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v54, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v55 -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v8.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v144.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v39, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v8.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v135.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v53.l 
+; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v39, v8 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v9.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v39.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v134.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v8.h, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v133.l ; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v39, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v10.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v132.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v39, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v11.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v131.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v130.l ; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v54, v39 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v133.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v10.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v55 -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v132.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v52.l +; GFX11-TRUE16-NEXT: 
v_and_b16 v13.h, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v39, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v12.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v129.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v51.l ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v53, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v53, 0xffff, v54 -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v10.l, v10.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v131.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v53, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v54 -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v11.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v130.l -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v11.h, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v52, v39 -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v129.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v39, v12 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v13.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v39.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v128.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v53 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v13.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v12.l, v12.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, 
v14.l, v33.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v53 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v52, v39 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v119.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v13.l, v13.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v118.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v51, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff, v52 -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v14.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v119.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v39, v13 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v14.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v118.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v50.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v39, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v15.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v39.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v14.h, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v116.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v39, v15 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v16.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v115.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, 
v18.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v39, v16 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v17.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v114.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v113.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v39, v17 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v18.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v112.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v48.l ; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v51, v39 -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v50.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v116.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v16.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v52 -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v15.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v115.l -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v49.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v50, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v50, 0xffff, v51 -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v16.l, v16.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v114.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v50, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v51 -; 
GFX11-TRUE16-NEXT: v_or_b16 v39.h, v17.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v113.l -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v17.h, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l -; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v49, v39 -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v112.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v39, v18 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v19.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v39.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v50 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v19.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v18.l, v18.h -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v48.l -; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v33.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v50 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v49, v39 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v102.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v19.l, v19.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v20.l -; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v101.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v48, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v49 -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v20.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v102.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v39, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v20.l, v33.h +; 
GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v101.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v38.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v39, v20 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v21.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v39.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v100.l -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v20.h, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v99.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v39, v21 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v22.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v98.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v37.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v39, v22 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v23.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v97.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v96.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v39, v23 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v24.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v87.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v36.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v39, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v25.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v86.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v85.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v39, v25 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v26.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v84.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v39, v26 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v27.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v83.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v82.l ; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v39, v27 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v28.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v81.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.l +; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v39, v28 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v29.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v34.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v80.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v71.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b16 
v31.l, 0xff, v31.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v48, v39 -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v38.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v99.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v22.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v49 -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v21.l, v21.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v98.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v37.l +; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v31.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v29 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v30.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v34.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v70.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v33.l ; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v38, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v48 -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v22.l, v22.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v23.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v97.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v38, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v48 -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v23.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v96.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v23.h, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v32.h, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v39, v30 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v31.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v33.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, 
v69.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v68.l ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v37, v39 -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v25.l, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v87.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v86.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v38 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v25.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v24.l, v24.h -; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v36.l -; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v26.l, v33.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v38 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v37, v39 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v85.l -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v25.l, v25.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v84.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v36, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v37 -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v26.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v83.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v26.h, v27.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v36, v39 -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v33.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v27.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v35.l -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v82.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v28.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v37 -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v27.l, v27.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v81.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v34.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v36 -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v28.l, v28.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.h -; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.l, 8, v80.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v36 -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v29.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v71.l -; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v29.h, v30.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v39 -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v31.l, v33.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v30.l -; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v69.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v31.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v30.l, v30.h -; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v31.h, 8, v33.l -; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v32.l, v33.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v35 -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v39 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v31.l, v31.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v32.l -; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.h, 8, v68.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v39 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v34 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v39.h, v32.l, v32.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v33, v39 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v39, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v32.l, v33.l +; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v32.h, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v39.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v39, v32 ; GFX11-TRUE16-NEXT: s_clause 0x5 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48 @@ -121839,63 +121414,63 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:384 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:380 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:376 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:372 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:368 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:364 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:360 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:356 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:352 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:348 -; 
GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:344 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:372 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v50, off, s32 offset:368 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:364 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v51, off, s32 offset:360 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:356 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:352 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:348 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:344 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:340 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v52, off, s32 offset:336 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:332 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v53, off, s32 offset:328 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:324 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:324 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v53, off, s32 offset:320 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:316 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:316 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v54, off, s32 offset:312 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:308 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v54, off, s32 offset:304 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:300 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v55, off, s32 offset:296 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:292 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:288 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:284 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:280 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:276 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:272 
-; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:268 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:264 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:292 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:288 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:284 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:280 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:276 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:272 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v39, off, s32 offset:268 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:264 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v39, off, s32 offset:260 ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:256 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:256 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v48, off, s32 offset:252 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:248 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v48, off, s32 offset:244 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:240 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:240 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v49, off, s32 offset:236 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v70, off, s32 offset:232 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v51, off, s32 offset:228 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:224 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v52, off, s32 offset:220 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:216 -; GFX11-TRUE16-NEXT: scratch_load_b32 v103, off, s32 offset:388 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v70, off, s32 offset:232 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v49, off, s32 offset:228 +; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v71, off, s32 offset:224 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v50, off, s32 offset:220 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:216 +; GFX11-TRUE16-NEXT: scratch_load_b32 v114, off, s32 offset:388 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v82, off, s32 offset:8 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v82, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:16 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v83, off, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:40 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:48 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:64 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:72 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v85, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v87, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v97, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:80 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v99, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:96 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v101, off, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v102, off, s32 offset:112 ; GFX11-TRUE16-NEXT: 
scratch_load_d16_b16 v160, off, s32 offset:120 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v160, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v161, off, s32 offset:136 @@ -121909,144 +121484,143 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v164, off, s32 offset:192 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v165, off, s32 offset:200 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v165, off, s32 offset:208 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:212 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:204 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:196 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:188 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:180 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:172 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v71, off, s32 offset:164 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v71, off, s32 offset:156 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:148 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v81, off, s32 offset:140 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v83, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v55, off, s32 offset:212 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:204 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:196 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 offset:188 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v68, off, s32 offset:180 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v69, off, s32 offset:172 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v69, off, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v80, off, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v81, off, s32 offset:140 +; GFX11-TRUE16-NEXT: 
scratch_load_d16_hi_b16 v82, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v84, off, s32 offset:124 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v85, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v84, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v86, off, s32 offset:108 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v87, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v86, off, s32 offset:100 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v96, off, s32 offset:92 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v97, off, s32 offset:84 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v98, off, s32 offset:76 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:68 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v100, off, s32 offset:60 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v102, off, s32 offset:52 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v102, off, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v112, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v114, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v96, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v98, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v99, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v100, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v101, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v103, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v103, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v112, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v113, off, s32 offset:20 ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v115, off, s32 offset:12 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v116, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 
v115, off, s32 offset:4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v117.l, v30.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.h, v28.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.l, v26.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.h, v24.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.h, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.h, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.l, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v130.h, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.l, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.h, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v134.h, v12.l ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v147.h, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.h, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v150.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v151.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v149.h, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v146.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v134.l, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.h, 8, v29.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(62) -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v50.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v145.h, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, v0.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v151.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v150.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.l, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v148.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v147.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v144.h, 8, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v145.l, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.l, 8, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v135.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v133.h, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v131.h, 8, v29.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(54) -; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v103 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v114 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(53) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.l, 8, v80.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v132.l, 8, v81.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(52) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v130.h, 8, v82.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v82.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(51) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v82.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.h, 8, v83.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(50) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v129.l, 8, v83.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.l, 8, v83.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(49) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v84.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v128.h, 8, v85.l ; GFX11-TRUE16-NEXT: 
s_waitcnt vmcnt(48) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v119.h, 8, v85.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v85.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(47) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v86.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v118.l, 8, v87.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(46) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v117.h, 8, v87.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.l, 8, v87.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(45) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v96.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v116.h, 8, v97.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(44) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v115.h, 8, v97.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.l, 8, v97.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(43) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v98.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v114.h, 8, v98.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(42) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.h, 8, v99.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v112.h, 8, v99.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(41) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.l, 8, v99.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v113.l, 8, v100.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(40) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v103.h, 8, v101.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.l, 8, v101.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(39) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v101.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v102.h, 8, v102.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(38) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.h, 8, v160.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v100.h, 8, v160.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(37) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v160.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v101.l, 8, v160.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(36) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.h, 8, v161.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.h, 8, v161.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(35) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v161.h +; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v99.l, 8, v161.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(34) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v98.l, 8, v162.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.l, 8, v162.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v162.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v97.h, 8, v162.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v96.h, 8, v163.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.l, 8, v163.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v163.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v87.h, 8, v163.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v86.h, 8, v164.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.l, 8, v164.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v164.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v85.h, 8, v164.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v84.h, 8, v165.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.l, 8, v165.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v165.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.h, 8, v80.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.l, 8, v70.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v80.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v68.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v68.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v66.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.h, 8, v66.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v65.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v83.h, 8, v165.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v81.h, 8, v71.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v82.l, 8, v71.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.l, 8, v70.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v71.h, 8, v70.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.l, 8, v68.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v70.h, 8, v67.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v67.h, 8, 
v66.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v68.l, 8, v66.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.l, 8, v64.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v66.h, 8, v64.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v55.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v65.l, 8, v55.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v54.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.h, 8, v54.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v53.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v52.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v51.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v50.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v49.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.l, 8, v55.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v64.h, 8, v54.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v54.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v53.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v53.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v52.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v51.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v51.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v50.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v31.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v31.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -122060,746 +121634,660 @@ define <16 x double> @bitcast_v128i8_to_v16f64(<128 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB74_3: ; %cmp.false -; 
GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v151.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v148.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v150.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v146.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, 0 -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v151.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v150.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v147.h -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v0.h, v149.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v149.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v144.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v133.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v148.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v144.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v145.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v1.h, v147.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v149.h +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v149.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v149.h, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v146.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v146.l +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v0.l, v151.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v151.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v0.l, v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v150.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v145.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v144.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v149, v0 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v1.h, v150.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v132.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v131.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v148.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v130.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v149, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v2.l, v148.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v2.l, v149.h ; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v134.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v131.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v2.l, v146.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v131.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v135.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v3.l, v145.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v128.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v118.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v134.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v145.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v130.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v149, v2 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v3.l, v147.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v147.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v3.l, v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v135.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v119.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v119.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v118.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v149, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v4.l, v144.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v4.l, v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v133.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v117.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v116.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v4.l, v135.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v129.h -; 
GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v134.l -; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v5.l, v133.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v119.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v5.h, v132.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v114.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v6.l, v132.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v117.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v130.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v112.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v7.l, v130.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v115.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v129.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v102.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v8.l, v128.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v112.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v8.h, v119.h -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v100.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v9.l, v118.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v102.h -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v9.h, v117.h -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v97.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, 
v10.l, v116.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v100.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v10.h, v115.h -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v87.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v11.l, v114.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v98.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v11.h, v113.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v85.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v12.l, v113.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v96.l -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v12.h, v103.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v83.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v13.l, v103.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v13.h, v101.h -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v81.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v16, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v17 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v14.l, v101.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v84.l -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v14.h, v99.h -; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v71.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v15.l, v99.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v81.h -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v15.h, v98.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v69.l 
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v18, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v16.l, v97.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v71.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v16.h, v96.h -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v67.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v19, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v20 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v17.l, v87.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v115.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v115.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v149, v4 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v5.l, v135.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v5.l, v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v132.l +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v129.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v113.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v112.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v149, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v6.l, v133.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v6.l, v149.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v103.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v128.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v103.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v101.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v149, v6 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v7.l, v131.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v7.l, v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v118.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v100.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v99.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v98.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v149, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v8.l, v129.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v8.l, v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v116.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v114.h +; GFX11-TRUE16-NEXT: 
v_and_b16 v13.l, 0xff, v96.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v96.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v149, v8 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v9.l, v128.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v9.l, v149.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v86.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v113.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v86.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v84.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v149, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v10.l, v117.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v10.l, v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v102.h +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v84.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v82.h +; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v81.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v149, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v11.l, v116.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v11.l, v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v101.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v99.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v80.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v80.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v149, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v12.l, v114.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v12.l, v149.h ; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v69.h -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v17.h, v86.h -; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v64.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v20, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v21 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v18.l, v85.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v19.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v67.h -; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v18.h, v84.h -; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v51.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v21, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v22 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v19.l, v83.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 
v23.l, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v97.h +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v69.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v68.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v149, v12 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v13.l, v112.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v13.l, v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v87.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v67.l ; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v65.h -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v19.h, v82.h -; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v48.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v22, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v20.l, v82.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v52.l -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v20.h, v80.h -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v39.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v23, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v21.l, v80.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v49.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v21.h, v70.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v39.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v24, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v22.l, v70.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v23.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v22.h, v68.h -; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v38.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v25, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v23.l, v68.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v38.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v23.h, v66.h -; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v36.h -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v26, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v24.l, v66.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v25.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v37.h -; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v24.h, v65.l -; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v36.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v27, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v28 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v25.l, v64.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v37.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v25.h, v55.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v28, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v26.l, v55.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v27.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v35.h -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v26.h, v54.h -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v34.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v29, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v30 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v27.l, v54.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.l -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16 +; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v65.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v149, v13 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v14.l, v102.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v14.l, v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v85.h +; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v83.h +; GFX11-TRUE16-NEXT: v_and_b16 
v21.l, 0xff, v55.h +; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v50.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v149, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v15.l, v100.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v15.l, v149.h +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v49.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v82.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v49.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v48.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v149, v15 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v16.l, v98.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v16.l, v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v71.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v48.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v39.h +; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v39.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v149, v16 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v17.l, v97.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v17.l, v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v70.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v68.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v38.h +; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v38.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v149, v17 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v18.l, v87.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v18.l, v149.h +; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v37.h +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v66.h +; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v37.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v36.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v149, v18 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v19.l, v85.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v19.l, v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v64.h +; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v36.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v35.h +; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v35.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v149, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v20.l, v83.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v20.l, 
v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v55.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v54.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v34.h +; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v34.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v149, v20 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v21.l, v81.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v21.l, v149.h +; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v53.l +; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v33.l +; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v149, v21 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v22.l, v71.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v22.l, v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v52.l +; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v32.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr114_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr51_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr145_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 -; GFX11-TRUE16-NEXT: 
; implicit-def: $vgpr99_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v29, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v28.h, v53.h -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v34.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v30 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v28.l, v53.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 
0xff, v33.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v28.h, v52.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v33.l, v29.h, v51.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v34 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v29.l, v50.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v30.l -; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v149, v22 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v23.l, v70.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v23.l, v149.h +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v51.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v30.l, v50.l -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v32.l, v49.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 -; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr49_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v149, v23 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v24.l, v67.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v24.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v149, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v25.l, v66.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v25.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v149, v25 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v26.l, v64.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v26.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v149, v26 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v27.l, v54.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v27.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v149, v27 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v28.l, v53.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v28.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v149, v28 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v29.l, v52.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v29.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v149, v29 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v30.l, v51.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v30.l, v149.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v149, v30 +; GFX11-TRUE16-NEXT: v_or_b16 v149.l, v31.l, v50.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v31.l, v149.h +; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v31 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v149, v31 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB74_2 ; GFX11-TRUE16-NEXT: .LBB74_4: ; %cmp.true -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v151.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v148.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v150.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v146.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v147.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v149.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v149.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v146.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v146.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v144.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v151.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v150.h, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v144.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v149.l, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v151.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v151.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v150.h, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v150.l, v1.h +; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v145.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v144.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v134.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v31, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v134.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v148.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v148.h, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v149.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v147.l, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v134.h, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v133.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v31 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v146.h, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v148.l, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v145.l, v3.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, 
v31, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v31.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v147.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v147.h, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v132.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v131.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v145.h, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v131.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v31, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v130.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v130.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v144.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v145.l, v3.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v128.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v135.h, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v135.l, v4.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v129.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v32.l, v32.l, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v4.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v134.l, v5.h -; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v6.h, v118.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v133.l, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v119.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v31, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v135.l, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v135.h, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v119.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v119.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v31, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v118.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v117.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v133.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v133.h, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v132.h, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v31, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v131.h, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v132.l, v6.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v7.h, v116.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v132.l, v6.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v117.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v6.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v6.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v130.h, v7.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v114.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v130.l, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v115.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v115.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v31, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v113.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v112.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v129.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v129.h, v7.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v129.l, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v112.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v31 -; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v10.l, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v128.l, v8.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v112.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v8.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v8.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v119.h, v9.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v102.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v10.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v118.l, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v102.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v9.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v31, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v128.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v128.h, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v103.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v103.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v31, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v101.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v100.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v117.h, 
v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v118.l, v9.h ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v117.h, v10.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v100.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v11.l -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v116.l, v10.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v100.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v10.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v10.h -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v115.h, v11.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v97.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v114.h, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v98.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v31, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v11.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v116.l, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v116.h, v10.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v99.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v98.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v31, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v11.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, 
v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v96.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v96.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v114.l, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v114.h, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v113.h, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v87.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v14, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v13.l -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v113.l, v12.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v96.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v12.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v12.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v103.h, v13.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v85.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v103.l, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v15 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v13.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, 0x300, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v31, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v112.h, v12.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v13.h, v113.l, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v86.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v86.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v31, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, 0x300, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v84.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v84.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v102.l, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v102.h, v13.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v101.h, v14.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, v83.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v16, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v15.l -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v101.l, v14.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v84.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v14.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, 0x300, v14.h -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v99.h, v15.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v31, v16 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v100.h, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v101.l, v14.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v82.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v81.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v16.l -; 
GFX11-TRUE16-NEXT: v_or_b16 v15.l, v99.l, v15.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v81.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v17 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v15.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v15.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v31, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v15.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v15.h +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v16.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.l, v80.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v16.h, v80.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v98.h, v15.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v99.l, v15.h ; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v98.l, v16.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, v71.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v18, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v17.l -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v97.l, v16.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, v71.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v18 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v16.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, 0x300, v16.h -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v96.h, v17.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v69.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v19, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v87.l, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v31, v18 +; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, 0x300, v17.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v97.l, v16.l +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v97.h, v16.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v69.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v17.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, 0x300, v17.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v69.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v31, v19 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v17.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, 0x300, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.l, v68.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, v67.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v87.l, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v87.h, v17.h ; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v86.h, v18.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.h, v67.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v20, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v19.l -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v85.l, v18.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v19.l, v67.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v20 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v18.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, 0x300, v18.h -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v84.h, v19.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v64.h, 3 -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v21, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v20.l -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v83.h, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v31, v20 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v19.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v85.l, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v85.h, v18.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v65.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v21 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v19.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v19.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v65.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v31, v21 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v19.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, 0x300, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.l, v55.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v20.h, v50.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v83.l, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v83.h, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v82.h, v20.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, v51.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v22, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v21.l -; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v82.l, v20.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, v52.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v22 -; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v31.h, 0x300, v20.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, 0x300, v20.h -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l -; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v80.h, v21.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v48.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v23, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v80.l, v21.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v24, 0xffff, v23 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v21.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, 0x300, v21.h +; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v31, v22 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, 0x300, v21.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v81.h, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v82.l, v20.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v49.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v49.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v31, v23 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v21.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, 0x300, v21.h +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.l, v48.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v22.h, v48.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v71.l, v21.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v71.h, v21.h ; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v70.h, v22.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.h, v39.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v24, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v23.l -; GFX11-TRUE16-NEXT: 
v_or_b16 v22.l, v70.l, v22.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v23.l, v48.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v24 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v22.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v22.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v68.h, v23.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v38.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v25, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v24.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v68.l, v23.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v25 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v23.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v23.h +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v31, v24 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, 0x300, v23.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v70.l, v22.l +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v70.h, v22.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v39.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v39.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v31, v25 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v23.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, 0x300, v23.h +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v24.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, v38.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.h, v38.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v67.h, v23.l +; GFX11-TRUE16-NEXT: 
v_or_b16 v25.h, v68.l, v23.h ; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v66.l, v24.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.h, v37.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v26, v31 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v25.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v66.h, v24.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, v38.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v24.h -; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v26 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v24.l, 0x300, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v64.l, v25.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v24.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v26, v31 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v65.l, v25.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v36.h, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v27, 0xffff, v27 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v25.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v25.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v31, v26 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v25.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v66.l, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v66.h, v24.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v37.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v37.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v31, v27 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v25.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, 0x300, v25.h +; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v26.h +; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v26.l, v36.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v26.h, v36.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v64.l, v25.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v64.h, v25.h ; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l ; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v25.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v27, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v55.l, v26.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, v36.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v55.h, v26.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, v35.h, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v28 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, 0x300, v26.h -; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v34.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v29, v31 -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v54.h, v27.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v54.l, v27.h -; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.l, 0x300, v27.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v29, 0xffff, v29 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v53.h, v28.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v31, v28 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, 0x300, v27.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v54.h, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v55.l, v26.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v35.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v35.l, 3 +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v31, v29 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v27.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, 0x300, v27.h +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.l, v34.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v28.h, v34.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v53.h, v27.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v54.l, v27.h ; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v27.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v29, v31 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, v34.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.h, v33.h, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v30 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, 0x300, v28.h +; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v31, v30 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v34.h, 0x300, v29.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v52.h, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v53.l, v28.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v33.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v33.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v53.l, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v30.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v31, v34 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v29.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v29.h +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v30.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.l, v32.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v52.h, v29.l -; 
GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v50.h, v29.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v30.h, v32.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v51.h, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v52.l, v29.h ; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v29.l, 0x300, v29.l -; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v51.l, v30.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v33 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v29.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v29.l -; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v50.l, v30.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.l, 0x300, v30.h -; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v31 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v30.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v33 -; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v49.h, v30.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v31 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.h, 0x300, v32.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v31 +; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v31, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v33.h, 0x300, v32.h +; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v50.h, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v51.l, v30.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v31, v33 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v31.l, 0x300, v32.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) 
| instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v32.h, 0x300, v32.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v31.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v31, v32 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -160089,159 +159577,162 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:156 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:152 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:148 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:144 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:140 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:136 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:132 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:128 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:124 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:120 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:116 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:112 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:108 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:100 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v72, s32 offset:92 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:84 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:80 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:76 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:72 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:68 
-; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:64 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:60 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:52 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:48 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:40 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:32 -; GFX11-TRUE16-NEXT: s_clause 0x4 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:20 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v40, s32 offset:168 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v41, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v42, s32 offset:160 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v43, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v44, s32 offset:152 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v45, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v46, s32 offset:144 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v47, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v56, s32 offset:136 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v57, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v58, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v59, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v60, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v61, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v62, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v63, s32 offset:108 +; GFX11-TRUE16-NEXT: 
scratch_store_b32 off, v72, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v73, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v74, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v75, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v76, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v77, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v78, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v79, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v88, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v89, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v90, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v91, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v92, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v93, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v94, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v95, s32 offset:44 +; GFX11-TRUE16-NEXT: s_clause 0x7 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v104, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v105, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v106, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v107, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v108, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v109, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v110, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_store_b32 off, v111, s32 offset:12 ; GFX11-TRUE16-NEXT: s_clause 0x2 ; GFX11-TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr108_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16 -; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr107_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr164_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr111_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr106_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr94_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr90_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr180_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr165_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr105_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr104_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr95_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr93_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr166_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr88_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr75_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr47_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr76_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr179_hi16 -; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr72_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr178_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr59_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr73_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr58_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr44_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr56_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr41_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr42_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr89_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr43_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr61_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr183_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr57_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr167_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr104_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr176_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr78_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr166_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr77_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr95_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr93_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr92_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr79_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr43_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr74_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr62_lo16 -; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr177_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr63_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr178_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr60_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr46_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr73_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr57_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr41_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr47_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr44_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr45_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr92_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr40_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr181_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr59_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr182_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr177_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr62_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr180_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr108_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr176_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr91_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr165_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr89_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr163_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr110_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr107_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr109_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr94_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr90_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr79_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr77_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr75_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr72_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr61_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr58_lo16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr56_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr46_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr42_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr183_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr181_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr179_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr167_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr164_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr145_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 @@ -160250,143 +159741,142 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB90_2 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[7:8] ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[5:6] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[7:8] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[11:12] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[69:70], 24, v[3:4] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[5:6] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[11:12] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[3:4] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 
24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v165, 8, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v176, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 24, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v183, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v43, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v42, 24, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v56, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v58, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v59, 24, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v72, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v76, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v75, 24, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v88, 8, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v91, 8, v5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 24, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v94, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v106, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v180, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v182, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v40, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v45, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v47, 8, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v60, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v63, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v74, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v76, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v78, 8, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v88, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v93, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v95, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v104, 8, v3 ; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v105, 24, v2 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v107, 8, v2 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v108, 8, v1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 24, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v31 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v29 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v27 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v25 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 24, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v177, 8, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v182, 8, v23 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 24, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v40, 8, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v45, 8, v21 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v46, 24, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v60, 8, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v63, 8, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v62, 24, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v74, 8, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v79, 8, v17 -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[1:2] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[19:20] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[17:18] -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.h, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v132.h, v2.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v164.h, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v148.h, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v144.h, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v180.h, 
v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v165.h, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v161.h, v6.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v47.h, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v179.h, v8.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v106, 8, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v111, 8, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v164, 8, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 24, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v179, 8, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v183, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v42, 8, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v46, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v56, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v58, 8, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v61, 8, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v72, 24, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v75, 8, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v77, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v79, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 8, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v94, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[15:16] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[1:2] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[23:24] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[21:22] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[17:18] +; 
GFX11-TRUE16-NEXT: v_mov_b16_e64 v131.h, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v128.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v129.h, v2.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v146.h, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v133.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v135.h, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v166.h, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v150.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v151.h, v6.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v43.h, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v177.h, v8.l ; GFX11-TRUE16-NEXT: v_mov_b16_e64 v178.h, v8.h ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v73.h, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v44.h, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v41.h, v10.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v89.h, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v61.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v57.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v104.h, v13.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v78.h, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v77.h, v14.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v95.h, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v93.h, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v92.h, v16.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.h, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.h, v19.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.h, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v20.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.h, v21.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.h, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.h, v22.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v97.h, v23.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v87.h, v24.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.h, v24.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v101.h, v25.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.h, v26.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v96.h, v26.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v112.h, v27.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v100.h, 
v28.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v99.h, v28.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v113.h, v29.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v103.h, v30.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v102.h, v30.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v116.h, v31.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v115.h, v32.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v114.h, v32.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v41.h, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v44.h, v10.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v92.h, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v59.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v62.h, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v108.h, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v91.h, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v89.h, v14.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v110.h, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v107.h, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v109.h, v16.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.h, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.h, v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.h, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v84.h, v20.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v96.h, v21.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.h, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v87.h, v22.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v99.h, v23.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v97.h, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v98.h, v24.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v102.h, v25.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v100.h, v26.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v101.h, v26.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v113.h, v27.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v103.h, v28.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v112.h, v28.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v116.h, v29.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v114.h, v30.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v115.h, v30.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v119.h, v31.l +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v117.h, v32.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v118.h, v32.h ; GFX11-TRUE16-NEXT: .LBB90_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB90_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true ; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff0000, v18 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v18, 16, v18 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v20 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v18 ; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v18, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v18 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add3_u32 v37, v37, v18, 0x7fff -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v70, v37, v39 :: v_dual_add_f32 v33, 0x40c00000, v33 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v80, v37, v39 :: v_dual_add_f32 v33, 0x40c00000, v33 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v33, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v33 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 ; GFX11-TRUE16-NEXT: v_add3_u32 v36, v36, v33, 0x7fff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff0000, v17 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v70.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v80.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v55, v36, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v81, v36, v38, vcc_lo ; GFX11-TRUE16-NEXT: v_dual_add_f32 v34, 0x40c00000, v34 :: v_dual_lshlrev_b32 v17, 16, v17 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 @@ -160399,498 +159889,500 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add3_u32 v18, v48, v34, 0x7fff ; GFX11-TRUE16-NEXT: v_add3_u32 v37, v50, v17, 0x7fff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v71, v37, v51 :: v_dual_lshlrev_b32 v20, 16, v20 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v20, 0x40c00000, v20 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v82, v37, v51 :: v_dual_and_b32 v35, 0xffff0000, v20 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v20, 16, v20 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v51, 0xffff0000, v11 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v71.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v82.h +; GFX11-TRUE16-NEXT: v_add_f32_e32 v20, 0x40c00000, v20 ; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v35, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v20 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v17, v18, v49, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v18, 0xffff, v33, v55 -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v20, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 +; GFX11-TRUE16-NEXT: v_bfi_b32 v18, 0xffff, v33, v81 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v20, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v20 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 ; GFX11-TRUE16-NEXT: v_bfi_b32 v17, 0xffff, v34, v17 ; GFX11-TRUE16-NEXT: v_add3_u32 v34, v36, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v19 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v19, 16, v19 ; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v20, 0x7fff -; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v62, 24, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v74, 8, v18 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v19, 0x40c00000, v19 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v81, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v79, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 8, v18 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v83, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_lshlrev_b32 v19, 16, v19 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v22 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v22, 16, v22 -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v19, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v19, 0x40c00000, v19 :: v_dual_lshlrev_b32 v22, 16, v22 ; GFX11-TRUE16-NEXT: v_bfe_u32 v20, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v80, v34, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v84, v34, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v83.h +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v19, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v19 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v22, 0x40c00000, v22 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v19, 0x7fff ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 ; GFX11-TRUE16-NEXT: v_add3_u32 v20, v20, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v81.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v79, 8, v17 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v84, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v94, 8, v17 +; 
GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v19, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v22, 0x40c00000, v22 :: v_dual_cndmask_b32 v85, v33, v37 ; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v22, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v84.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v85.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v20, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v35, 16, 1 ; GFX11-TRUE16-NEXT: v_add3_u32 v20, v33, v22, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v22 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v22, v22 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v86, v20, v33 :: v_dual_add_f32 v35, 0x40c00000, v35 +; GFX11-TRUE16-NEXT: v_bfi_b32 v20, 0xffff, v34, v84 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v86.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v35, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v35, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v83, v20, v33, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: v_bfi_b32 v20, 0xffff, v34, v80 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v82, v19, v39, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v19, 0xffff, v37, v36 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v24 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v24, 16, v24 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v83.h -; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v46, 24, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v60, 8, v20 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 -; GFX11-TRUE16-NEXT: v_bfi_b32 v22, 0xffff, v22, v82 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v63, 8, v19 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v72, 24, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v75, 8, v20 +; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v35, 0x7fff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v21 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v21 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 24, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v40, 8, v22 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v87, v19, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v19, 0xffff, v37, v36 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v24 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v33, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v33, 0x40c00000, v38 :: v_dual_lshlrev_b32 v24, 16, v24 +; GFX11-TRUE16-NEXT: v_bfi_b32 v22, 0xffff, v22, v87 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 ; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v21, 16, 1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v33, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v21 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v24, 0x40c00000, v24 ; GFX11-TRUE16-NEXT: 
v_cmp_u_f32_e32 vcc_lo, v21, v21 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v33 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v33, 0x7fff ; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v21, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v86, v34, v37 :: v_dual_and_b32 v37, 0xffff0000, v23 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v33, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v33 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v56, 24, v22 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v96, v34, v37 :: v_dual_and_b32 v37, 0xffff0000, v23 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v24, 16, 1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v23, 16, v23 +; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v24, 16, 1 ; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v58, 8, v22 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v21, v35, v38, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v24, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v24 +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v24, 0x7fff +; GFX11-TRUE16-NEXT: v_add_f32_e32 v23, 0x40c00000, v23 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24 ; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v86.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v87, v34, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v26 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v23, 0x40c00000, v23 :: v_dual_lshlrev_b32 v26, 16, v26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v96.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v77, 8, v19 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v97, v34, v38, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_bfi_b32 v21, 0xffff, v35, v21 -; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v26, 16, v26 ; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v23, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v85, v33, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v98, v33, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v37, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v23 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v45, 8, v21 ; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v23, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v97, v34, v36, vcc_lo -; GFX11-TRUE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v37 :: v_dual_add_f32 v34, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v97.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v37, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v37 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 ; GFX11-TRUE16-NEXT: v_add3_u32 v24, v24, v37, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v34, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v26, 0x40c00000, v26 +; GFX11-TRUE16-NEXT: v_bfi_b32 v21, 0xffff, v35, v21 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v99, v34, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v61, 8, v21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 
v36.l, v99.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v23, v24, v39, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v34, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v26 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 ; GFX11-TRUE16-NEXT: v_bfi_b32 v23, 0xffff, v36, v23 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v25 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v87.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v182, 8, v23 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v97.h +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v26 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-TRUE16-NEXT: v_bfi_b32 v24, 0xffff, v33, v85 +; GFX11-TRUE16-NEXT: v_bfi_b32 v24, 0xffff, v33, v98 ; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v26, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 24, v24 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v25, 0x40c00000, v25 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v46, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v183, 24, v24 ; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v26, 0x7fff ; GFX11-TRUE16-NEXT: v_bfe_u32 v26, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v177, 8, v24 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v98, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v42, 8, v24 +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v100, v33, v37, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v25, 16, v25 -; GFX11-TRUE16-NEXT: v_add3_u32 v26, v26, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v96, v35, v38 :: v_dual_add_f32 v25, 0x40c00000, v25 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v28 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v28, 16, v28 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v98.h ; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v25, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v25 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_add_f32 v28, 0x40c00000, v28 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add3_u32 v26, v26, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v101, v35, v38, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v25, 0x7fff -; GFX11-TRUE16-NEXT: v_bfe_u32 v25, v35, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v35 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v101, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v28, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v28 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v100.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v102, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v28, 16, v28 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; 
GFX11-TRUE16-NEXT: v_add3_u32 v25, v25, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v102.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v28, 0x40c00000, v28 +; GFX11-TRUE16-NEXT: v_bfe_u32 v25, v35, 16, 1 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v26, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v27 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v28, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 +; GFX11-TRUE16-NEXT: v_add3_u32 v25, v25, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 ; GFX11-TRUE16-NEXT: v_add3_u32 v26, v33, v28, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v28 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v27 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v27, 16, v27 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v101.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v100, v26, v33, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v103, v26, v33, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v27, 0x40c00000, v27 -; GFX11-TRUE16-NEXT: v_bfi_b32 v26, 0xffff, v34, v96 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v100.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v99, v25, v39, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v25, 0xffff, v37, v36 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v30 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX11-TRUE16-NEXT: v_bfi_b32 v26, 0xffff, v34, v101 ; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v27, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v27 -; 
GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v33, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v103.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v112, v25, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v25, 0xffff, v37, v36 ; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v27, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v27 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v30 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v33, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v33 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v30, 0x40c00000, v30 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v33, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v112, v34, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_bfi_b32 v28, 0xffff, v28, v99 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v26 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v27, v35, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_bfi_b32 v28, 0xffff, v28, v112 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v113, v34, v37, vcc_lo ; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v29 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v29, 16, v29 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v30, 16, v30 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v33, 0x7fff +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v37, 0x40c00000, v37 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v30, 0x40c00000, v30 :: v_dual_lshlrev_b32 v29, 16, v29 +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v27, v35, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v30, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v30 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v112.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v28 -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v30, 0x7fff -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v25 -; GFX11-TRUE16-NEXT: v_bfi_b32 v27, 0xffff, v35, v27 -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v103, v34, v38 :: v_dual_and_b32 v38, 0xffff0000, v32 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v36, 0x7fff ; GFX11-TRUE16-NEXT: v_add_f32_e32 v29, 0x40c00000, v29 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v27 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v29, 16, 1 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v102, v33, v39 :: v_dual_add_f32 v37, 0x40c00000, v37 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v29 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v29, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v103.h +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v30, 0x7fff ; GFX11-TRUE16-NEXT: v_bfe_u32 v30, v37, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v37 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v32, 0x40c00000, v32 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v113, v34, v36, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v35.l, v113.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v28 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v114, v34, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v32, 16, v32 +; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v29, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v29 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v115, v33, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29 ; GFX11-TRUE16-NEXT: v_add3_u32 v30, v30, v37, 0x7fff +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v29, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v37 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.l, v114.h +; GFX11-TRUE16-NEXT: v_bfi_b32 v27, 0xffff, v35, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 24, v26 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v116, v34, v36, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v113.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v164, 8, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v179, 8, v26 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v116.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v29, v30, v39, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v34, 16, 1 -; GFX11-TRUE16-NEXT: v_bfi_b32 v30, 0xffff, v33, v102 -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v32, 16, 1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-TRUE16-NEXT: v_bfi_b32 v30, 0xffff, v33, v115 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v181, 8, v25 ; GFX11-TRUE16-NEXT: v_bfi_b32 v29, 0xffff, v36, v29 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, 
v34, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v32 -; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v32, 0x7fff ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v31 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v30 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v115, v33, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_lshlrev_b32 v31, 16, v31 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v29 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v115.h +; GFX11-TRUE16-NEXT: v_dual_add_f32 v32, 0x40c00000, v32 :: v_dual_lshlrev_b32 v31, 16, v31 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v34, 0x7fff +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v30 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add_f32_e32 v31, 0x40c00000, v31 -; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v114, v35, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v32, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v32 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v32, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v29 +; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v32, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v117, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 ; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v31, 16, 1 ; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v37, 0x400000, v31 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 -; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v117.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v31, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v116, v33, v37 :: v_dual_and_b32 v35, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v2, 16, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v118, v35, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v2 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v31, v31 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v32, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v119, v33, v37, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v116.h -; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_bfe_u32 v31, v35, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v35 +; GFX11-TRUE16-NEXT: v_add3_u32 v32, v32, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v119.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add3_u32 v31, v31, v35, 0x7fff +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v32, v38, vcc_lo ; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 
16, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v2, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: v_add3_u32 v31, v31, v35, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v35 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v1 ; GFX11-TRUE16-NEXT: v_add3_u32 v32, v33, v2, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v133, v32, v33, vcc_lo -; GFX11-TRUE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v128, v32, v33, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: v_bfi_b32 v32, 0xffff, v34, v114 +; GFX11-TRUE16-NEXT: v_bfi_b32 v32, 0xffff, v34, v118 ; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v132, v31, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v2.l, v128.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v129, v31, v39, vcc_lo ; GFX11-TRUE16-NEXT: v_bfi_b32 v31, 0xffff, v37, v36 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v4 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v33, 16, 1 ; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v1, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v33, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v33 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, 
v33, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v2.l, v133.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 24, v32 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v146, v34, v37 :: v_dual_and_b32 v37, 0xffff0000, v3 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v131, v34, v37, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v132 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v4, 0x40c00000, v4 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v33, 0x7fff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v3 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_lshlrev_b32 v3, 16, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v4, 16, 1 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v1, v35, v38 :: v_dual_add_f32 v36, 0x40c00000, v36 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v35, v38, vcc_lo ; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v4 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v4, 0x7fff -; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 ; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v4, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v37, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v148, v34, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v35.l, v131.h +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v129 ; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v3 -; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v37, 0x7fff -; 
GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v133, v34, v38, vcc_lo ; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v6 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v144, v33, v39, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v37 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v33.l, v148.h -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v35.l, v146.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v105, 24, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v164, v34, v36, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v34, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 ; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v35, v1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v107, 8, v2 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v36.l, v164.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v39, vcc_lo -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v34, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v105, 24, v2 +; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v3, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v135, v33, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v36, 0x400000, v3 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v6, 0x40c00000, v6 +; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v3, 0x7fff +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v106, 8, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v111, 8, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v32 +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v146, v34, v36 :: v_dual_add_f32 v37, 0x40c00000, v37 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v31 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v36.l, v146.h +; GFX11-TRUE16-NEXT: 
v_bfe_u32 v4, v37, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v37 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v37, 0x7fff +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v4, v39 :: v_dual_add_f32 v34, 0x40c00000, v38 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v7 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v36, v3 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff0000, v5 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v33.l, v133.h +; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v34, 16, 1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v33, v144 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v34, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v36, 0x40c00000, v36 :: v_dual_add_f32 v5, 0x40c00000, v5 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_add_f32 v7, 0x40c00000, v7 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v90, 24, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v94, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v106, 8, v3 -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v6, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v6 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v108, 8, v1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v32 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v33, v135 +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v6, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v37, v34, 0x7fff 
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v6 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v5, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v34 ; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v6, 0x7fff ; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v36, 16, 1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v31 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v165, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v93, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v95, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v104, 8, v3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v150, v33, v37, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v34, v34 ; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v5 ; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v36, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v34.l, v165.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v161, v35, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v151, v35, v38, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add3_u32 v33, v33, v5, 0x7fff ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff0000, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v36 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v180, v33, v37 :: v_dual_add_f32 v35, 0x40c00000, v35 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v34.l, v150.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v166, v33, v37, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v35, 0x40c00000, v35 :: v_dual_lshlrev_b32 v8, 16, v8 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v37.l, v166.h ; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v37.l, v180.h +; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v35, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v8, 16, 1 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v6, v38, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v35 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v33, v8, 16, 1 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 ; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v35, 0x7fff ; GFX11-TRUE16-NEXT: v_add3_u32 v6, v33, v8, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v33, 0x400000, v8 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v179, v6, v33, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v177, v6, v33, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 -; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v7, 16, 1 -; GFX11-TRUE16-NEXT: v_bfi_b32 v6, 0xffff, v34, v161 -; GFX11-TRUE16-NEXT: v_mov_b16_e64 v8.l, v179.h -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v178, v5, v38 :: v_dual_add_f32 v33, 0x40c00000, v39 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v33, 0x40c00000, v39 +; GFX11-TRUE16-NEXT: v_bfi_b32 v6, 0xffff, v34, v151 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v178, v5, v38, vcc_lo ; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v37, v36 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v9 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v36, 16, v10 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v7, 0x7fff +; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 0x40c00000, v7 :: v_dual_lshlrev_b32 v36, 16, v10 ; GFX11-TRUE16-NEXT: v_bfe_u32 v34, v33, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 
0x400000, v33 +; GFX11-TRUE16-NEXT: v_mov_b16_e64 v8.l, v177.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v7, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v7 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v36, 0x40c00000, v36 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v33 ; GFX11-TRUE16-NEXT: v_add3_u32 v34, v34, v33, 0x7fff ; GFX11-TRUE16-NEXT: v_bfi_b32 v8, 0xffff, v8, v178 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v47, v35, v37, vcc_lo +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v7, 0x7fff +; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v36, 16, 1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v76, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v60, 24, v8 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v43, v35, v37, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v36 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v59, 24, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v72, 8, v8 ; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v36, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v36 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v63, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v78, 8, v6 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v34, v38, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v36, v36 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v47.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v75, 24, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v88, 8, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v91, 8, v5 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v44, v7, v37, vcc_lo -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v9 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v39 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 -; 
GFX11-TRUE16-NEXT: v_bfe_u32 v36, v9, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v9 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v7 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 -; GFX11-TRUE16-NEXT: v_add3_u32 v36, v36, v9, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_f32_e32 v10, 0x40c00000, v10 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v43.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v88, 8, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v41, v7, v37, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v10, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v10 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v10, 0x7fff -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v41, v35, v38 :: v_dual_lshlrev_b32 v10, 16, v12 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v12 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v44, v35, v38 :: v_dual_and_b32 v39, 0xffff0000, v9 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v10 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v44.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v41.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_bfe_u32 v38, v37, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v50, 0x400000, v37 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v35, v41 -; 
GFX11-TRUE16-NEXT: v_bfe_u32 v35, v7, 16, 1 +; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v35, v44 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add3_u32 v38, v38, v37, 0x7fff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v51 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v42, 24, v10 -; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v7, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v61, v38, v50 :: v_dual_add_f32 v12, 0x40c00000, v12 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v45, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v47, 8, v10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v59, v38, v50, vcc_lo +; GFX11-TRUE16-NEXT: v_dual_add_f32 v12, 0x40c00000, v12 :: v_dual_lshlrev_b32 v7, 16, v9 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v61.h +; GFX11-TRUE16-NEXT: v_dual_add_f32 v37, 0x40c00000, v51 :: v_dual_lshlrev_b32 v14, 16, v14 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 ; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v12, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v52, 0x400000, v12 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 +; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v7, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v7 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 +; GFX11-TRUE16-NEXT: v_add3_u32 v48, v48, v12, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | 
instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add3_u32 v35, v35, v7, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v59.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v73, v35, v49, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v9, 0x40c00000, v39 ; GFX11-TRUE16-NEXT: v_bfe_u32 v35, v37, 16, 1 -; GFX11-TRUE16-NEXT: v_add3_u32 v48, v48, v12, 0x7fff -; GFX11-TRUE16-NEXT: v_dual_add_f32 v14, 0x40c00000, v14 :: v_dual_lshlrev_b32 v11, 16, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v56, 8, v10 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v57, v48, v52, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 ; GFX11-TRUE16-NEXT: v_bfe_u32 v49, v14, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v7, v57 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v36, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v62, v48, v52, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v36, v9, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v39, 0x400000, v9 +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v7, v62 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add3_u32 v36, v36, v9, 0x7fff ; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v11 ; GFX11-TRUE16-NEXT: v_add3_u32 v11, v35, v37, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v37 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v180, 24, v12 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v36, v39, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v73.h ; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v7, 16, 1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v167, 24, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v183, 8, v12 +; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v182, 8, v12 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v11, v35, vcc_lo ; GFX11-TRUE16-NEXT: v_add_f32_e32 v35, 0x40c00000, v38 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add3_u32 v37, v39, v7, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v7 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff0000, v13 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 ; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v35, 16, 1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX11-TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v36, v9 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v39 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v89, v37, v38, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 0x40c00000, v39 :: v_dual_cndmask_b32 v92, v37, v38 ; GFX11-TRUE16-NEXT: v_add3_u32 v37, v48, v35, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v35 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v35, v35 @@ -160898,18 +160390,18 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v14 ; GFX11-TRUE16-NEXT: v_bfe_u32 v49, v7, 16, 1 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v77, v37, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v89, v37, v38, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v35, 0x400000, v7 ; GFX11-TRUE16-NEXT: v_add3_u32 v14, v49, v7, 0x7fff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff0000, v16 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v78, v39, v48, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v91, v39, v48, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 ; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v13, 16, 1 -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10] +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v78.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v73.h ; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v7, v14, v35 :: v_dual_add_f32 v14, 0x40c00000, v37 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v37, 16, v15 ; GFX11-TRUE16-NEXT: v_add3_u32 v35, v39, v13, 0x7fff @@ -160919,7 +160411,7 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f32_e32 v37, 0x40c00000, v37 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v16 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v104, v35, v39, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v108, v35, v39, vcc_lo ; GFX11-TRUE16-NEXT: v_add3_u32 v13, v13, v16, 0x7fff ; GFX11-TRUE16-NEXT: v_bfe_u32 v39, v37, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 @@ -160927,405 +160419,366 @@ define <128 x i8> @bitcast_v64bf16_to_v128i8(<64 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v51, 0x400000, v37 ; GFX11-TRUE16-NEXT: v_add3_u32 v39, v39, v37, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v93, v13, v49, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v107, v13, v49, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v37, v37 ; GFX11-TRUE16-NEXT: v_add3_u32 v35, v48, v14, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v14 ; GFX11-TRUE16-NEXT: v_bfe_u32 v50, v15, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v15 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v95, v39, v51, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v110, v39, v51, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v104.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v108.h ; GFX11-TRUE16-NEXT: v_add3_u32 v13, v50, v15, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v89.h -; GFX11-TRUE16-NEXT: v_bfi_b32 v14, 0xffff, 
v38, v77 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v92, v35, v48, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v91.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v92.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v109, v35, v48, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v95.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v93.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v110.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v107.h +; GFX11-TRUE16-NEXT: v_bfi_b32 v14, 0xffff, v38, v89 ; GFX11-TRUE16-NEXT: v_bfi_b32 v11, 0xffff, v39, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v14 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc_lo -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 8, v14 -; GFX11-TRUE16-NEXT: v_bfi_b32 v16, 0xffff, v35, v92 -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[11:12] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[17:18] +; GFX11-TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v36, v9 +; GFX11-TRUE16-NEXT: v_bfi_b32 v16, 0xffff, v35, v109 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[23:24] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[11:12] ; GFX11-TRUE16-NEXT: v_bfi_b32 v15, 0xffff, v15, v13 ; GFX11-TRUE16-NEXT: v_bfi_b32 v13, 0xffff, v37, v7 ; GFX11-TRUE16-NEXT: v_bfi_b32 v7, 0xffff, v34, v33 ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[7:8] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[5:6] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[69:70], 24, v[3:4] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[7:8] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[5:6] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, 
v[15:16] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[3:4] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[1:2] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[19:20] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[21:22] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[17:18] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v165, 8, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v176, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v43, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v58, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v76, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v40, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v57, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v74, 8, v7 ; GFX11-TRUE16-NEXT: .LBB90_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v146.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v108.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v131.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v111.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v133.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v107.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, 0 -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v68.l -; GFX11-TRUE16-NEXT: 
v_and_b16 v4.l, 0xff, v132.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v2.h, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v70.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v129.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v1.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v128.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v2.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v6.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v106.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v164.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v1.h, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v105.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v3.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v2.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v94.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v91.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v4.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v148.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v67.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v8, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v4.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v180.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v3.l, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v90.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v144.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v66.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v4.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h 
-; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v165.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v88.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v8, v14 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v5.l, v6.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v47.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.h, v6.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v76.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v58.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v75.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v161.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v6.h, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v179.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v72.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v6.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v105.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v69.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v1 +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v2.l, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v3.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v146.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v104.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v4.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v95.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v2 +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v135.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v3.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v133.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v93.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v68.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.l, v4.h +; GFX11-TRUE16-NEXT: 
v_or_b16 v4.h, v5.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v166.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v88.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v5.h, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v6, v4 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v78.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v150.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v151.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v76.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v11.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v66, v6, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v43.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v7.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v74.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v65.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v8.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v14 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v73.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v7.l, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v178.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v59.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v8.h, v9.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v56.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v8.l, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v44.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v64.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v43.l -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v10.h -; GFX11-TRUE16-NEXT: v_and_b16 
v10.h, 0xff, v89.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v14 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v41.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v42.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v10.h, v11.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v16, v14 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v18 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v52.l -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v10.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v61.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v183.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v16, v14 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v18 -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v11.l, v12.h -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v104.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v11.h, v12.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v176.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v166.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v16, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v167.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v57.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v12.h, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v78.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v67.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v62.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v67, v6, v8 +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v177.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v7.l, v7.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v63.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v178.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v60.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v180.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v68, v6, v8 +; 
GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v73.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v7.l, v7.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v57.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v64.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v53.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v69, v6, v8 +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v41.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.l, v7.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v47.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v44.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v45.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v6, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v92.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.l, v8.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v40.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.l, v10.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v6, v8 +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v59.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v182.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v50.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v6, v9 +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v89.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v108.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v176.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.l, v11.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v6, v10 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v163.l +; GFX11-TRUE16-NEXT: v_or_b16 
v6.l, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v91.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v165.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v6, v11 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v110.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v161.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v38.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v109.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v6, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v147.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.l, v13.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v107.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v149.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.l, v14.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v6, v13 +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v82.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v94.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v65.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v81.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v6, v14 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v79.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.l, v15.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v80.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v90.l +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.l, v16.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v6, v15 +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v19.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 
v5.l, 0xff, v85.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v77.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v54.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v84.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v6, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v72.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.l, v17.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v83.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v75.l +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.l, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v6, v17 +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v96.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v61.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v51.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v87.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v6, v18 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v56.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.l, v19.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v86.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v58.l +; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.l, v20.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v6, v19 +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v23.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v99.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v46.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v98.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v6, v20 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v183.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.l, v21.h +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v21.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v97.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v42.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.l, v22.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v6, v21 +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v25.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v102.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v181.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v36.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v101.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v6, v22 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v167.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.l, v23.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v100.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v179.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.l, v24.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v6, v23 +; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v113.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v164.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v35.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v112.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v6, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v160.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.l, v25.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v103.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v162.l +; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.l, v26.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v6, v25 +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v29.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h +; 
GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v116.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v148.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v34.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v115.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v6, v26 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v144.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.l, v27.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v114.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v145.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.l, v28.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v6, v27 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v31.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v119.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v134.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v33.l +; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v118.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v6, v28 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v130.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.l, v29.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v117.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v132.l ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v18 -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v12.l, v14.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v49.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v12.h, v15.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v16, v14 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v16.l, 8, v160.l -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v13.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v77.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v95.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v150.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v64, v18, v14 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v20 -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v93.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v13.h, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v149.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v65, v18, v14 -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v48.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v71.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v79.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v92.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v134.l -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v13.h, v15.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v66, v18, v14 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v20 -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v70.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v74.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v67, v18, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v46.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v13.h, v15.l -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v15.h -; GFX11-TRUE16-NEXT: v_and_b16 
v13.l, 0xff, v84.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v63.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v15.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v16, v14 -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v55.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v62.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v81.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v60.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v17 -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v19.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v50.l -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v13.h, v16.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v20, v14 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v22 -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v86.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v80.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v45.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v19, v14 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v20 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v13.h, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v83.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v40.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v18.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v19, v14 -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v21.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v38.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h -; 
GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v97.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v182.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v20 -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v19.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v82.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v181.l -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v13.h, v19.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v22, v14 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v24 -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v87.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v177.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v37.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v22, v14 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v24 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v147.l -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v13.h, v21.l -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v21.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v101.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v163.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v21.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v22, v14 -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v85.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v162.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v98.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v151.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v26, 0xffff, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v25.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v36.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v13.h, v22.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v26, v14 -; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v28 -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v112.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v23.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v96.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v145.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v25, v14 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v26 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v24.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v13.h, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v100.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v135.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v24.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v25, v14 -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v35.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v113.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v131.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v26 -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v25.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v99.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.l, 8, v130.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v13.h, v25.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v28, v14 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v30 -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v103.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v26.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v27.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v29.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v129.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v34.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v28, v14 -; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v30 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v13.h, v27.l -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v27.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v116.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v128.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v27.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v28, v14 -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v102.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.l, 8, v119.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v13.h -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v32, 0xffff, v29 -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v115.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v118.l -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v28.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v13.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 8, v33.l -; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v13.h, v28.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v32, v14 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v31, 0xffff, v34 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v117.l -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v114.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v30, 0xffff, v30 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v31, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.l, v13.h +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[66:69], off offset:16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v6, v29 +; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v30.l, v30.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v6.h ; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v30, v14 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v6, v5 ; GFX11-TRUE16-NEXT: s_clause 0x5 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 -; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[64:67], off offset:48 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[7:10], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[11:14], off offset:48 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[15:18], off offset:64 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[19:22], off offset:80 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[23:26], off offset:96 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[27:30], off offset:112 ; GFX11-TRUE16-NEXT: s_clause 0x1f -; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:12 -; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:16 -; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:20 -; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:24 -; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:32 -; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:40 -; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:48 -; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 offset:52 -; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:56 -; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:60 -; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:64 -; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:68 -; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:72 -; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:76 -; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:80 -; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:84 -; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:88 -; 
GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:92 -; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:100 -; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:108 -; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:112 -; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:116 -; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:120 -; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:124 -; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:128 -; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:132 -; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:136 -; GFX11-TRUE16-NEXT: s_clause 0x4 -; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:140 -; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:144 -; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:148 -; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:152 -; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_load_b32 v111, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_b32 v110, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_b32 v109, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_b32 v108, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_b32 v107, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_b32 v106, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_b32 v105, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_b32 v104, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_b32 v95, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_b32 v94, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_b32 v93, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_b32 v92, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_b32 v91, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_b32 v90, off, s32 
offset:64 +; GFX11-TRUE16-NEXT: scratch_load_b32 v89, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_b32 v88, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_b32 v79, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_b32 v78, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_b32 v77, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_b32 v76, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_b32 v75, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_b32 v74, off, s32 offset:96 +; GFX11-TRUE16-NEXT: scratch_load_b32 v73, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_b32 v72, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_b32 v63, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_b32 v62, off, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_b32 v61, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_b32 v60, off, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_load_b32 v59, off, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_load_b32 v58, off, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_load_b32 v57, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_b32 v56, off, s32 offset:136 +; GFX11-TRUE16-NEXT: s_clause 0x7 +; GFX11-TRUE16-NEXT: scratch_load_b32 v47, off, s32 offset:140 +; GFX11-TRUE16-NEXT: scratch_load_b32 v46, off, s32 offset:144 +; GFX11-TRUE16-NEXT: scratch_load_b32 v45, off, s32 offset:148 +; GFX11-TRUE16-NEXT: scratch_load_b32 v44, off, s32 offset:152 +; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s32 offset:156 +; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s32 offset:160 +; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s32 offset:164 +; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s32 offset:168 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -185302,69 +184755,69 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 
offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr166_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr165_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr164_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr163_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 +; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr113_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 +; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr160_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 @@ -185375,69 +184828,69 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, 
v[31:32] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[7:8] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[5:6] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[7:8] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[11:12] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[69:70], 24, v[3:4] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[5:6] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[11:12] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[3:4] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 24, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v15 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 24, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 24, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 
8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v31 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v29 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 8, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v27 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v25 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 24, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v23 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 24, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v21 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 
v145, 8, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[1:2] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[19:20] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[17:18] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v164, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v165, 8, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 8, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 24, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v19 +; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[15:16] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[1:2] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[23:24] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[21:22] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[17:18] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v17 ; GFX11-TRUE16-NEXT: .LBB94_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB94_4 @@ -185446,405 +184899,364 @@ define <128 x i8> @bitcast_v64f16_to_v128i8(<64 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_f16 v32, 0x200, v32 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_pk_add_f16 v31, 0x200, v31 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v30, 0x200, v30 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v8, 0x200, v8 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v7, 0x200, v7 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v30, 0x200, v30 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v29, 0x200, v29 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v6, 0x200, v6 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v5, 0x200, v5 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: 
v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] -; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v28, 0x200, v28 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v27, 0x200, v27 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v4, 0x200, v4 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v3, 0x200, v3 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v10, 0x200, v10 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v12, 0x200, v12 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v14, 0x200, v14 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v13, 0x200, v13 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v11, 0x200, v11 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v9, 0x200, v9 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v18, 0x200, v18 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v17, 0x200, v17 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v20, 0x200, v20 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v19, 0x200, v19 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v22, 0x200, v22 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v21, 0x200, v21 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v24, 0x200, v24 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v23, 0x200, v23 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v26, 0x200, v26 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v25, 0x200, v25 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, 0x200, v2 op_sel_hi:[0,1] ; GFX11-TRUE16-NEXT: v_pk_add_f16 v1, 0x200, v1 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v16, 0x200, v16 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_pk_add_f16 v15, 0x200, v15 op_sel_hi:[0,1] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[7:8] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[5:6] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] -; 
GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[7:8] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[11:12] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[69:70], 24, v[3:4] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[5:6] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[15:16] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[11:12] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[3:4] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[1:2] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[19:20] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[17:18] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 24, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[23:24] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[21:22] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[17:18] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v15 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 24, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 24, v10 -; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v130, 8, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v31 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v29 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 8, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v27 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v25 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 24, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v23 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 24, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v22 -; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v21 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v164, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v165, 8, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 8, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 24, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, 
v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v17 ; GFX11-TRUE16-NEXT: .LBB94_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v163.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v162.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v166.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, 0 -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v68.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v34.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v70.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v1.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v165.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v39.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v164.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v39, v1 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v2.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v163.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v69.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v161.l -; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v1.h, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v2.l -; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v54 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v160.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v68, 0xffff, v68 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v54, v51 -; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v2.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v67.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v39, v2 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v3.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v162.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v161.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v150.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v68, v51 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v54 -; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v3.l, v3.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v149.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v5.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v54, v51 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v67 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v66.l -; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v4.l, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v148.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v54, v51 +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v39, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v4.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v68.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v4 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v5.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v149.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v147.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v5.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v147.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.h, v6.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v67 +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v39, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v6.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v145.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v67.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v146.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v144.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v54, v51 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v66 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v6.l, v6.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v65.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v132.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v54, v51 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v66 
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v7.l, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v131.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v54, v51 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v65 -; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v8.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v130.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v8.h, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v54, v51 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v64.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v119.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v10.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v65 -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v118.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v52.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v54, v51 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v64 -; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v10.l, v10.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v116.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v54, v51 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v64 -; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v11.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v112.l -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v11.h, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: 
v_and_b16 v17.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v52, v51 -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v103.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v102.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v54 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v13.l -; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v12.l, v12.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v49.l -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v33.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v54 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v52, v51 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v99.l -; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v13.l, v13.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v97.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v49, v51 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v52 -; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v14.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v96.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v14.h, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v49, v51 -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v48.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v39.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v16.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v52 -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v15.l, v15.h -; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v16.h, 8, v84.l -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v33.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v39 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v48, v51 -; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v16.l, v16.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v145.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v39, v51 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v48 -; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v17.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v39, v6 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v7.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v39.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v135.l -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v17.h, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v39, v51 -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v134.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v133.l -; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v48 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v19.l -; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v18.l, v18.h -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v50.l -; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v133.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v39, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v8.l, 
v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v131.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v64.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v39, v8 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v9.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v39.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v129.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v39, v51 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v48 -; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v19.l, v19.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v20.l -; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v128.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v39, v51 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v48 -; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v20.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v119.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v39, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v10.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v39.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v20.h, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v39, v51 -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v38.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v53.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l +; 
GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v39, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v11.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v39.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v115.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v22.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v48 -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v21.l, v21.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v114.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v113.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v39, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v12.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v50.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v39, v12 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v13.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v101.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v99.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v39, v13 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v14.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v97.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v38.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v39, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v15.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v87.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v85.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v39, v15 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v16.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v39.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v37.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v65.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v39, v16 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v17.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v160.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v150.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v39, v17 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v18.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v18.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v148.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v54.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v39, v18 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v19.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v146.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v144.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l +; 
GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v39, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v20.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v134.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v51.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v39, v20 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v21.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v132.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v130.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v39, v21 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v22.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v128.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v39, v22 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v23.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v118.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v116.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v39, v23 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v24.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v114.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v36.l +; 
GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v39, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v25.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v112.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v102.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v39, v25 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v26.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v100.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v39, v26 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v27.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v98.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v96.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v39, v27 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v28.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v86.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.l +; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v39, v28 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v29.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v34.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v84.l +; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v34.l, 8, v83.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v38, v51 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v39 -; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v22.l, v22.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v23.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v113.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v38, v51 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v39 -; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v23.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v101.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v23.h, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v31.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v29 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v30.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v34.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v82.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v33.l ; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_and_b16 v32.h, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v39, v30 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v31.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v33.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v81.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v80.l ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v37, v51 -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v25.l, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v100.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v98.l -; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v38 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v25.l -; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v24.l, v24.h -; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v36.l -; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v26.l, v33.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v38 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v37, v51 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v87.l -; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v25.l, v25.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v86.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v36, v51 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v37 -; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v26.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v85.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v26.h, v27.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v36, v51 -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v33.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v27.l -; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v35.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v83.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v28.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v37 -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v27.l, v27.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v82.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v34.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v51 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v36 
-; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v28.l, v28.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.h -; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.l, 8, v81.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v51 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v36 -; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v29.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v80.l -; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v29.h, v30.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v51 -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v31.l, v33.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v30.l -; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v71.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v31.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v30.l, v30.h -; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v31.h, 8, v33.l -; GFX11-TRUE16-NEXT: v_or_b16 v32.l, v32.l, v33.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v35 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v51 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v31.l, v31.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v32.l -; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.h, 8, v55.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v51 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v34 -; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v32.l, v32.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v33, v51 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v39, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v32.l, v33.l +; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v32.h, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v39.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v39, v32 ; GFX11-TRUE16-NEXT: s_clause 0x5 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48 @@ -208055,69 +207467,69 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:4 ; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr166_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr165_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr164_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr163_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr162_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr161_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr151_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr149_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr147_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 -; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr131_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr119_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr117_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr145_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr135_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr133_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr129_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr117_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr115_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr160_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr150_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr148_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr146_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr144_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr134_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr132_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr130_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr128_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr118_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr116_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr114_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 +; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr86_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v33 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 @@ -208128,69 +207540,69 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[7:8] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[5:6] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[7:8] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[11:12] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[69:70], 24, v[3:4] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[5:6] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[11:12] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[3:4] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 24, v16 -; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v96, 8, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v15 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 24, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 24, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v31 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v29 -; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v82, 24, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 8, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v27 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v25 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 24, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v23 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 24, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v21 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[1:2] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[19:20] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[17:18] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v164, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v165, 8, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 8, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 24, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v30 +; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[15:16] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[1:2] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[23:24] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[51:52], 24, v[21:22] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[17:18] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v17 ; GFX11-TRUE16-NEXT: .LBB98_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB98_4 @@ -208199,405 +207611,364 @@ define <128 x i8> @bitcast_v64i16_to_v128i8(<64 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_pk_add_u16 v32, v32, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_pk_add_u16 v31, v31, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 
op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v8, v8, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v7, v7, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v30, v30, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v29, v29, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v6, v6, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v5, v5, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] -; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v28, v28, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v27, v27, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v4, v4, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v3, v3, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v10, v10, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v12, v12, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v14, v14, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v13, v13, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v11, v11, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v9, v9, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v18, v18, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v17, v17, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v20, v20, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v19, v19, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v22, v22, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v21, v21, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v24, v24, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v23, v23, 3 
op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v26, v26, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v25, v25, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v2, v2, 3 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_pk_add_u16 v1, v1, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v16, v16, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_pk_add_u16 v15, v15, 3 op_sel_hi:[1,0] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[7:8] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[33:34], 24, v[31:32] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[5:6] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[34:35], 24, v[29:30] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[7:8] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[13:14] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[11:12] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[64:65], 24, v[9:10] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[69:70], 24, v[3:4] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[35:36], 24, v[27:28] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[15:16] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[66:67], 24, v[5:6] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[15:16] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[70:71], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[36:37], 24, v[25:26] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[49:50], 24, v[13:14] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[52:53], 24, v[11:12] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[67:68], 24, v[3:4] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[37:38], 24, v[23:24] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[68:69], 24, v[1:2] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[38:39], 24, v[21:22] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[50:51], 24, v[19:20] -; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[53:54], 24, v[17:18] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 24, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[48:49], 24, v[23:24] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[51:52], 24, 
v[21:22] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[54:55], 24, v[19:20] +; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[65:66], 24, v[17:18] +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v15 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 24, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 24, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 24, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 8, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 8, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 24, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 24, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 8, v5 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 24, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 24, v2 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v2 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 8, v32 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v31 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v30 -; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v81, 8, v30 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 8, v29 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v85, 8, v28 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v27 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 24, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v26 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v25 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 24, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v24 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 8, v23 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 24, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 8, v22 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v21 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 24, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 8, v20 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 24, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v18 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v149, 8, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v151, 8, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v161, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v162, 8, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v163, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v164, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v165, 8, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v166, 8, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 24, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 8, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v86, 8, v29 +; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v96, 24, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 8, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 8, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 24, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 8, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 8, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 24, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 8, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 8, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 24, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 8, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v148, 8, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v150, 24, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v160, 8, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v17 ; GFX11-TRUE16-NEXT: .LBB98_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v163.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v162.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v166.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.l, 0 -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v68.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v34.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v70.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v1.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v165.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v39.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v164.l +; GFX11-TRUE16-NEXT: 
v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v39, v1 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v2.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v163.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v69.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v161.l -; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v1.h, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v54 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v160.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v68, 0xffff, v68 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v54, v51 -; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v2.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v67.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v39, v2 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v3.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v162.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v161.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v150.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v68, v51 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v54 -; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v3.l, v3.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v149.l -; GFX11-TRUE16-NEXT: 
v_or_b16 v5.l, v5.l, v33.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v54, v51 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v67 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v66.l -; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v4.l, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.l, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v148.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v54, v51 +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v39, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v4.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v151.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v68.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v39, v4 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v5.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v149.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v147.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v5.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v147.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v5.h, v6.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v67 +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v39, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v6.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v145.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v67.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v33.h -; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v66.l, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v146.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v144.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v54, v51 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v66 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v6.l, v6.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v65.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v132.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v54, v51 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v66 -; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v7.l, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v131.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v54, v51 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v65 -; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v8.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v130.l -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v8.h, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v54, v51 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.l, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v64.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v119.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v10.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v65 -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v118.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v11.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v52.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v54, v51 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v64 -; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v10.l, v10.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.l, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v116.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v54, v51 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v64 -; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v11.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v112.l -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v11.h, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v52, v51 -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v103.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v102.l -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v52, 0xffff, v54 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v13.l -; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v12.l, v12.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v49.l -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v14.l, v33.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v54 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v52, v51 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v99.l -; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v13.l, v13.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v97.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v49, v51 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff, v52 -; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v14.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v96.l -; 
GFX11-TRUE16-NEXT: v_or_b16 v15.l, v14.h, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v49, v51 -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v48.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v39.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v16.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v48, 0xffff, v52 -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v15.l, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v84.l -; GFX11-TRUE16-NEXT: v_or_b16 v17.l, v17.l, v33.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v39 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v48, v51 -; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v16.l, v16.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.h -; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v145.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v39, v51 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v48 -; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v17.l, v33.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v39, v6 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v7.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v39.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v135.l -; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v17.h, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v39, v51 -; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v19.l, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.h -; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v18.h, 8, v134.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v133.l -; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v48 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v19.l -; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v18.l, v18.h -; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v50.l -; GFX11-TRUE16-NEXT: v_or_b16 v20.l, v20.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v133.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v39, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v8.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v131.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v64.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v39, v8 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v9.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v39.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v129.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v39, v51 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v48 -; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v19.l, v19.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v20.l -; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v128.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v39, v51 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v39, 0xffff, v48 -; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v20.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v119.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v39, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v10.l, 
v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v39.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v117.l -; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v20.h, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v39, v51 -; GFX11-TRUE16-NEXT: v_or_b16 v22.l, v22.l, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v38.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v53.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v39, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v11.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v39.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v115.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v22.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v48 -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v21.l, v21.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v114.l -; GFX11-TRUE16-NEXT: v_or_b16 v23.l, v23.l, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v113.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v39, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v12.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v103.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v50.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v39, v12 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v13.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v34.h +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v101.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v99.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v39, v13 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v14.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v97.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v38.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v39, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v15.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v87.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v85.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, 0xff, v17.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v39, v15 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v16.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v39.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v37.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v65.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.l, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v18.h, 0xff, v18.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v39, v16 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v17.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, v17.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v160.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v150.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.l, 0xff, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v19.h, 0xff, v19.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, v39, v17 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v18.l, v33.h +; GFX11-TRUE16-NEXT: 
v_or_b16 v18.h, v18.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v148.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v54.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.h, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v18, v39, v18 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v19.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v19.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v146.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v144.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.l, 0xff, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.h, 0xff, v21.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, v39, v19 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v20.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v20.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v134.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v51.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.l, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_and_b16 v22.h, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v20, v39, v20 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v21.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.h, v21.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v132.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v130.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v23.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v39, v21 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v22.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v128.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.h, 0xff, v24.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v39, v22 +; GFX11-TRUE16-NEXT: v_or_b16 
v39.l, v23.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v23.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v118.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v116.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.l +; GFX11-TRUE16-NEXT: v_and_b16 v25.h, 0xff, v25.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v39, v23 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v24.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v114.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v36.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v26.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v39, v24 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v25.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v112.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v102.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.l, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_and_b16 v27.h, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v39, v25 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v26.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v26.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v100.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v35.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.h, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v39, v26 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v27.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v27.h, v27.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v98.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v96.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v29.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, 
v39, v27 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v28.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v28.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v86.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v34.l +; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_and_b16 v30.h, 0xff, v30.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v39, v28 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v29.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v29.h, v29.h, v34.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v84.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v83.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, v38, v51 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff, v39 -; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v22.l, v22.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v23.l -; GFX11-TRUE16-NEXT: v_and_b16 v23.l, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_and_b16 v23.h, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v113.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, v38, v51 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v39 -; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v23.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v101.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v23.h, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v31.h, 0xff, v31.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v39, v29 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v30.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v30.h, v30.h, v34.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v82.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v33.l ; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_and_b16 v32.h, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v39, v30 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v31.l, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v31.h, v31.h, v33.l +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.l, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v81.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v80.l ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, v37, v51 -; GFX11-TRUE16-NEXT: v_or_b16 v25.l, v25.l, v33.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v100.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v98.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v37, 0xffff, v38 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.l, v25.l -; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v24.l, v24.h -; GFX11-TRUE16-NEXT: v_and_b16 v25.l, 0xff, v25.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v36.l -; GFX11-TRUE16-NEXT: v_or_b16 v26.l, v26.l, v33.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v38 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, v37, v51 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v87.l -; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v25.l, v25.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v26.l -; GFX11-TRUE16-NEXT: v_and_b16 v26.l, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_and_b16 v26.h, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v86.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, v36, v51 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v36, 0xffff, v37 -; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v26.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v85.l -; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v26.h, v27.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v26, v36, v51 -; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v33.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.l, v27.l -; GFX11-TRUE16-NEXT: 
v_and_b16 v27.l, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v35.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v83.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v28.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v37 -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v27.l, v27.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v82.l -; GFX11-TRUE16-NEXT: v_or_b16 v29.l, v29.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v34.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v27, v35, v51 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v35, 0xffff, v36 -; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v28.l, v28.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.l, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v29.l, 0xff, v29.h -; GFX11-TRUE16-NEXT: v_and_b16 v29.h, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.l, 8, v81.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v28, v35, v51 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v36 -; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v29.l, v33.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v80.l -; GFX11-TRUE16-NEXT: v_or_b16 v30.l, v29.h, v30.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v29, v34, v51 -; GFX11-TRUE16-NEXT: v_or_b16 v31.l, v31.l, v33.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v30.l -; GFX11-TRUE16-NEXT: v_and_b16 v30.l, 0xff, v30.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v71.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v70.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v34, 0xffff, v35 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v31.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v30.l, v30.h -; GFX11-TRUE16-NEXT: v_and_b16 v31.l, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v31.h, 8, v33.l -; GFX11-TRUE16-NEXT: v_or_b16 v32.l, 
v32.l, v33.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v35 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v30, v34, v51 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v31.l, v31.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.l, v32.l -; GFX11-TRUE16-NEXT: v_and_b16 v32.l, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v32.h, 8, v55.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v33, v51 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v33, 0xffff, v34 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v51.h, v32.l, v32.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v33, v51 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v31, v39, v31 +; GFX11-TRUE16-NEXT: v_or_b16 v39.l, v32.l, v33.l +; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v32.h, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, v39.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v32, v39, v32 ; GFX11-TRUE16-NEXT: s_clause 0x5 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[9:12], off offset:32 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[13:16], off offset:48 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll index 3e96ab1d597d..21ec3ee1996a 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll @@ -4118,19 +4118,19 @@ define <4 x i32> @bitcast_v16i8_to_v4i32(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l ; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v12.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v7.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v15.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16 @@ -4144,103 +4144,95 @@ define <4 x i32> @bitcast_v16i8_to_v4i32(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB26_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, 0 -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v6.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l +; 
GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v0.l, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v7.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v0 +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v1.h, v6.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.h, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v2.h, v4.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v3.l, v4.l -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v9 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v11, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v2.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v2 +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v3.l, v4.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v3 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2 ; GFX11-TRUE16-NEXT: .LBB26_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v9.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v12.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.l, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v6.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, 0 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v9.l, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v6.h, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v5.h, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 
0x300, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v11 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v9.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v2.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v11 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -8592,19 +8584,19 @@ define 
<4 x float> @bitcast_v16i8_to_v4f32(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v12.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v7.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v15.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16 @@ -8618,103 +8610,95 @@ define <4 x float> @bitcast_v16i8_to_v4f32(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB50_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, 0 -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.l, v6.l -; 
GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v6.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v0.l, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v7.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v0 +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v1.h, v6.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.h, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v2.h, v4.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | 
instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v3.l, v4.l -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v9 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v11, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v2.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v2 +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v3.l, v4.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v3 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; 
GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2 ; GFX11-TRUE16-NEXT: .LBB50_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v9.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v12.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.l, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v6.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, 0 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v9.l, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v6.h, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v3 +; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v5.h, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v11 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v9.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v2.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v11 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 
v2, v9, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -12682,19 +12666,19 @@ define <2 x i64> @bitcast_v16i8_to_v2i64(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v12.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v7.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v15.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16 @@ -12708,103 +12692,95 @@ define <2 x i64> @bitcast_v16i8_to_v2i64(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; 
GFX11-TRUE16-NEXT: .LBB70_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, 0 -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v6.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v0.l, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v7.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v0 +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v1.h, v6.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | 
instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.h, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v2.h, v4.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v3.l, v4.l -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v9 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v11, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v2.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v2 +; 
GFX11-TRUE16-NEXT: v_or_b16 v11.l, v3.l, v4.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v3 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB70_2 ; GFX11-TRUE16-NEXT: .LBB70_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v9.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v12.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.l, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v6.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, 0 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v9.l, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v1.h, v6.h, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v5.h, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v11 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v9.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v2.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v11 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -16382,19 +16358,19 @@ define <2 x double> @bitcast_v16i8_to_v2f64(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v12.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v7.h +; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v5.l, 8, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v15.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16 @@ -16408,103 +16384,95 @@ define <2 x double> @bitcast_v16i8_to_v2f64(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB86_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, 0 -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v6.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v0.l, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v7.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v0 +; GFX11-TRUE16-NEXT: v_or_b16 
v11.l, v1.h, v6.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.h, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v2.h, v4.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v3.l, v4.l -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v9 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v11, v3 +; GFX11-TRUE16-NEXT: 
v_or_b16 v11.l, v2.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v2 +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v3.l, v4.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v3 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB86_2 ; GFX11-TRUE16-NEXT: .LBB86_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v9.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v12.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.l, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v1.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v6.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, 0 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v9.l, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v6.h, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v5.h, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, 
v11 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v9.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v2.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v11 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -19811,19 +19779,19 @@ define <8 x i16> @bitcast_v16i8_to_v8i16(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v3.l -; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v7.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v12.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v7.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v15.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16 @@ -19837,103 +19805,95 @@ define <8 x i16> @bitcast_v16i8_to_v8i16(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB98_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, 0 -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v6.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v0.l, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.h +; GFX11-TRUE16-NEXT: 
v_or_b16 v3.h, v1.l, v7.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v0 +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v1.h, v6.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.h, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v2.h, v4.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v3.l, v4.l -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v9 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v11, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v2.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v2 +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v3.l, v4.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v3 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB98_2 ; GFX11-TRUE16-NEXT: .LBB98_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v9.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v12.l, 3 ; 
GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.l, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v6.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, 0 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v9.l, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v6.h, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v5.h, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 
v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v11 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v9.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v2.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v11 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -22725,19 +22685,19 @@ define <8 x half> @bitcast_v16i8_to_v8f16(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, 
v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v12.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v7.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v15.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16 @@ -22751,103 +22711,95 @@ define <8 x half> @bitcast_v16i8_to_v8f16(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB106_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, 0 -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v6.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v12.l 
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v0.l, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v7.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v0 +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v1.h, v6.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.h, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v2.h, v4.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 -; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v3.l, v4.l -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v9 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v11, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v2.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v2 +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v3.l, v4.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v3 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB106_2 ; GFX11-TRUE16-NEXT: .LBB106_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v9.l, 3 ; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v12.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.l, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v6.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, 0 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v9.l, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v6.h, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 
v6.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v5.h, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v11 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v9.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v2.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v11 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v3.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -24944,19 +24896,19 @@ define <8 x bfloat> @bitcast_v16i8_to_v8bf16(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v0.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v12.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.h, 8, v7.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v15.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v16 @@ -24970,103 +24922,95 @@ define <8 x bfloat> @bitcast_v16i8_to_v8bf16(<16 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB110_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 
v1.l, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, 0 -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v8.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v6.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v0.l, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v7.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v0 +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v1.h, v6.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v1.h, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 
v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v2.h, v4.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v3.l, v4.l -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v9 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v5.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v11, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v2.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v11, v2 +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v3.l, v4.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, 
v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v11.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v11, v3 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB110_2 ; GFX11-TRUE16-NEXT: .LBB110_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v10.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v9.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v9.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v12.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.l, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v6.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, 0 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v8.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v9.l, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v7.h, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v6.h, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, 
v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v9, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v5.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v5.h, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.h, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v11 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v9, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v9.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v4.h, v2.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v11 -; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v4, 0xffff, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v4, v11 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v9, v3 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll index f8ffaa456c2b..38302a75fe26 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll @@ -6296,32 +6296,31 @@ define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v19.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v17.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v12.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v0.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v11.l -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v22.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v21.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v29.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v23.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v29.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v31.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_3 @@ -6333,194 +6332,175 @@ define <8 x i32> @bitcast_v32i8_to_v8i32(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB26_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v19.l -; GFX11-TRUE16-NEXT: v_or_b16 
v1.l, v1.l, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, 0 -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v0.h, v15.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v16.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v13.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v1.h, v14.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v2.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v11.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v3.l, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v12, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v4.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v11 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v16.h +; 
GFX11-TRUE16-NEXT: v_or_b16 v21.l, v0.l, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v19.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v21, v0 +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v1.h, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v21, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v2.l, v14.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v21.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v21, v2 +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v3.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v10.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v21, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v4.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v9.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 +; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr12_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v5.l, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v5.h, v9.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v19 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v6.l, v8.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16 +; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v19 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v7.l, v8.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v21, v4 +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v5.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v8.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v6.l, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v21.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v6 +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v7.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v9, v19 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v7 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2 ; 
GFX11-TRUE16-NEXT: .LBB26_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v21.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v19.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v20.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v18.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v17.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v17.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v16.h, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v15.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v17.h, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v19.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v18.h, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v17.h, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v15.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v15.h, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v16.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v21, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v21.h +; GFX11-TRUE16-NEXT: 
v_or_b16 v3.l, v14.h, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v15.l, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v14.h, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v16.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v13.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v21 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v12.l, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v21 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v12.h, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v18.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v11.h, v3.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v20.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v21 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v21, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v21.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v12.h, v2.l +; GFX11-TRUE16-NEXT: 
v_or_b16 v3.h, v13.l, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v21, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v22.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v11.l, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v11.h, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v10.h, v3.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v21 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v11.l, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v24.l, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v4.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v28.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v21 -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v10.l, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v26.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v9.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v21, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 
v7.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v10.l, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v10.h, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v21, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v9.h, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v9.h, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v30.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v8.h, v6.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v6.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v21 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v6.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v8.l, v6.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v21 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v8, v21 +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, 
v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v8.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v8.h, v6.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v7 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -13335,32 +13315,31 @@ define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v19.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v17.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v12.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v0.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v22.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v21.h -; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v9.l, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v29.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v23.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v29.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v31.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB50_3 @@ -13372,194 +13351,175 @@ define <8 x float> @bitcast_v32i8_to_v8f32(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB50_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v19.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, 0 -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v0.h, v15.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v16.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v15.l -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v13.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v1.h, v14.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v2.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v11.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v3.l, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v12, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v4.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v11 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v0.l, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v19.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h +; GFX11-TRUE16-NEXT: v_and_b16 
v2.l, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v21, v0 +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v1.h, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v21, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v2.l, v14.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v21.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v21, v2 +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v3.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v10.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v21, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v4.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v9.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v5.l, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v5.h, v9.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v19 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v6.l, v8.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v19 -; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v7.l, v8.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v21, v4 +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v5.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v8.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v6.l, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v21.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v6 +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v7.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v9, v19 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v7 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2 ; GFX11-TRUE16-NEXT: .LBB50_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v21.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v19.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v20.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v18.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v17.l, 
3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v17.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v16.h, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v15.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v17.h, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v19.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v18.h, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v17.h, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v15.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v15.h, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v16.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v21, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v14.h, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v15.l, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l -; GFX11-TRUE16-NEXT: 
v_or_b16 v1.l, v14.h, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v16.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v13.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v21 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v12.l, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v21 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v12.h, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v18.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v11.h, v3.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v20.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v21 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v21, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v21.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v12.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v13.l, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v21, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h +; 
GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v22.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v11.l, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v11.h, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v10.h, v3.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v21 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v11.l, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v24.l, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v4.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v28.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v21 -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v10.l, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v26.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v9.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v21, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v10.l, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v10.h, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 +; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v4, v21, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v9.h, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v9.h, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v30.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v8.h, v6.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v6.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v21 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v6.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v8.l, v6.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v21 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v8, v21 +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v8.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v8.h, v6.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 
0x300, v7.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v7 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -19892,32 +19852,31 @@ define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v19.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v17.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v12.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v0.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v22.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v21.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v29.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v9.l +; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v23.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v29.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v31.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB70_3 @@ -19929,194 +19888,175 @@ define <4 x i64> @bitcast_v32i8_to_v4i64(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB70_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v19.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, 0 -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v0.h, v15.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v16.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v13.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4 -; 
GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v1.h, v14.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v2.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v11.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v3.l, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v12, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v4.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v11 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v0.l, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v19.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v21, v0 +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v1.h, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 
v2.h, v2.h, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v21, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v2.l, v14.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v21.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v21, v2 +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v3.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v10.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v21, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v4.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v9.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr16_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v5.l, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v5.h, v9.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v19 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v6.l, v8.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v19 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v7.l, v8.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v21, v4 +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v5.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v8.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v6.l, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v21.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v6 +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v7.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v9, v19 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v7 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB70_2 ; GFX11-TRUE16-NEXT: .LBB70_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v21.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v19.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v20.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v18.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v17.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v17.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v16.h, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v15.l, 3 ; 
GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v17.h, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v19.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v18.h, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v17.h, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v15.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v15.h, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v16.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v21, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v14.h, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v15.l, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v14.h, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v16.l, 3 -; 
GFX11-TRUE16-NEXT: v_or_b16 v3.l, v13.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v21 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v12.l, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v21 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v12.h, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v18.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v11.h, v3.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v20.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v21 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v21, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v21.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v12.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v13.l, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v21, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v22.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v11.l, v3.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v5.h, v11.h, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v10.h, v3.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v21 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v11.l, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v24.l, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v4.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v28.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v21 -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v10.l, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v26.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v9.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v21, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v10.l, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v10.h, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v21, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3 +; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v9.h, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v9.h, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v30.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v8.h, v6.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v6.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v21 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v6.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v8.l, v6.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v21 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v8, v21 +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v8.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v8.h, v6.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v7 ; GFX11-TRUE16-NEXT: s_or_b32 
exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -25939,32 +25879,31 @@ define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v19.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v17.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v12.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v0.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v22.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v21.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v29.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v23.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.h +; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v10.l, 8, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v29.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v31.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v32 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB86_3 @@ -25976,194 +25915,175 @@ define <4 x double> @bitcast_v32i8_to_v4f64(<32 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB86_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v19.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v19.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, 0 -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v0.h, v15.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v16.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v13.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v12.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v1.h, v14.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 
0xffff, v6 -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v2.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v11.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v3.l, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v12, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v4.l, v10.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v11 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v0.l, v19.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v19.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v21, v0 +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v1.h, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v21, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v2.l, v14.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v21.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v12.l +; 
GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v21, v2 +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v3.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v10.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v21, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v4.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v9.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v5.l, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v5.h, v9.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v19 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v6.l, v8.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v30.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v19 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v19.h, v7.l, v8.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v21, v4 +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v5.l, v10.l +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v5.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v8.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v6.l, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v21.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v6 +; GFX11-TRUE16-NEXT: v_or_b16 v21.l, v7.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v9, v19 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v7 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB86_2 ; GFX11-TRUE16-NEXT: .LBB86_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v21.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v19.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v20.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v18.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v17.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v17.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v16.h, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v15.l, 3 ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v17.h, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; 
GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v19.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v19.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v18.h, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v17.h, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v15.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v13.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v15.h, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v16.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v12.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v21, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v14.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v14.h, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v15.l, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v14.h, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v16.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v13.l, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v14.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v21 -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v12.l, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v21 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v12.h, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v18.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v11.h, v3.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v20.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v21 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v21, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v21.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v12.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v13.l, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v21, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v22.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v11.l, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v11.h, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | 
instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v10.h, v3.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v21 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v11.l, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v24.l, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v4.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v28.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v21 -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v10.l, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v26.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v9.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v21, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v10.l, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v10.h, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v21, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v9.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v9.h, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; 
GFX11-TRUE16-NEXT: v_or_b16 v6.h, v9.h, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v30.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v8.h, v6.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v6.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v21 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v6.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v8.l, v6.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v21 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.h, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v8, v21 +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v21, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v8.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v8.h, v6.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v21, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v21.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v21.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v7 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll index 0cefbc1c2dee..436b1a038b27 100644 --- 
a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll @@ -2966,20 +2966,20 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 @@ -2995,17 +2995,17 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 24, v6 +; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v22, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 ; GFX11-TRUE16-NEXT: .LBB12_2: ; %Flow @@ -3029,17 +3029,17 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 ; GFX11-TRUE16-NEXT: .LBB12_4: ; %end @@ -3047,105 +3047,93 @@ define <40 x i8> @bitcast_v10i32_to_v40i8(<10 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: 
v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v30.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v1.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v15.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, 0 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v15, v1 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v2.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v28.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v14.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v29.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v1.h, v11.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v15, v2 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v3.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 
v15.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v29 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v16, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v2.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v28, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v26.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v3.l, v3.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v25.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v14, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v26.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v15, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v4.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v4.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v15, v4 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v5.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v15.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v5 +; 
GFX11-TRUE16-NEXT: v_or_b16 v15.l, v6.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v6 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v7.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v20.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v14, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v12.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v13, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v6.l, v6.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v21.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v13, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v7.l, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v20.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v11.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v18.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v9.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v8.l, v8.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v11.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v15 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v17.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v11, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v12 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v15 +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v15, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v8.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v11.l +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v15, v8 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v11.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v9.h, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v15.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v17.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v11 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.l, v10.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v9 ; GFX11-TRUE16-NEXT: s_clause 0x2 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[10:11], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[11:12], off offset:32 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: bitcast_v10i32_to_v40i8: @@ -5038,48 +5026,49 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:24 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:32 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:20 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v25.l ; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v28.h, v23.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v21.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v17.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v0.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v30.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v29.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v28.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v27.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v35.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v30.h +; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v29.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v28.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v27.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v29.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v33.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v33.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v33.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v34.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v34.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v34.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v34.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v35.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v35.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v36 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB14_3 @@ -5092,245 +5081,217 @@ define <10 x i32> @bitcast_v40i8_to_v10i32(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB14_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v25.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v25.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, 0 -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v24.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v23.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v0.h, v21.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v21.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l -; GFX11-TRUE16-NEXT: 
v_and_b16 v0.l, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v19.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v17.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v1.h, v19.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v2.l, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v15.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v3.l, v15.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v0.l, v24.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v25.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v27.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v23.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v27.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v19.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v27, v0 +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v1.h, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v21.l +; 
GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v27, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v2.l, v20.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v27.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v15.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v27, v2 +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v3.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v27.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v14.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v27, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v4.l, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v27.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v27, v4 +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v5.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v27.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v31.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v31.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v27, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v6.l, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v27.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr20_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v4.l, v14.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v14.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v25 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v5.l, v13.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v26.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr26_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v6.h, v13.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v6.l, v12.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v12.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v11.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v7.l, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v31.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v25 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v13 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v8.l, v10.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v31.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v25 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v9.l, v10.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v10.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v27, v6 +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v7.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v27.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v27, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v8.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v27.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v27, v8 +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v9.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v27.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v11, v25 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v27, v9 ; GFX11-TRUE16-NEXT: 
s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2 ; GFX11-TRUE16-NEXT: .LBB14_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v26.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v25.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v25.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v22.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v23.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v22.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v21.h, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v18.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v24.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v23.l, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v20.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v21.l, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v25.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v24.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v25.l, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v23.h, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v23.l, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v19.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v19.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v17.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v25, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 
v16.h, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v25.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v20.h, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v21.l, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v21.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v19.l, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v16.h, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v17.h, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v27 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v19.h, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v16.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v27 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v15.h, v3.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v27 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v17.l, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v18.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v20.l, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v25, v4 +; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v25.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v17.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v18.h, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v25, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v22.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v25.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v15.l, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v15.h, v3.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v27 -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v14.h, v4.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v24.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v15.l, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v28.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v27 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v14.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v26.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v13.h, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l -; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v25, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v25.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v14.l, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v14.h, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v25, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v25.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v13.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v13.h, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v13.l, v6.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v12.h, v6.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v27 +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v25, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v25.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v12.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v12.h, v6.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v32.h, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v6.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3 -; 
GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v32.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v32.l, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v27 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v12.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v25, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v31.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v11.l, v7.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v31.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v25.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v11.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v11.h, v7.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v11.h, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v31.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v10.h, v8.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v8.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v27 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v8.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.l, v8.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v27 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v10, v27 +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v25, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v25.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.h, v8.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v25, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v9.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v25.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v25, v9 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -9951,20 +9912,20 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 @@ -9980,17 +9941,17 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 ; GFX11-TRUE16-NEXT: .LBB32_2: ; %Flow @@ -10010,17 +9971,17 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[13:14], 24, v[5:6] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[14:15], 24, v[3:4] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8 ; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v21, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 ; GFX11-TRUE16-NEXT: .LBB32_4: ; %end @@ -10028,105 +9989,93 @@ define <40 x i8> @bitcast_v10f32_to_v40i8(<10 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v30.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v1.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v15.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, 0 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v15, v1 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, 
v2.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v28.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v14.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v29.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v1.h, v11.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v15, v2 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v3.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v15.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v29 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v16, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v2.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v28, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v26.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v3.l, v3.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v25.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v14, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v26.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v15, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v4.l, v11.h +; 
GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v4.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v15, v4 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v5.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v15.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v6.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v6 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v7.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v20.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v14, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v6.l -; 
GFX11-TRUE16-NEXT: v_or_b16 v15.h, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v12.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v13, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v6.l, v6.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v21.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v13, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v7.l, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v20.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v11.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v18.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v9.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v8.l, v8.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v11.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v15 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v9.l, v9.h -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v17.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v11, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v12 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v15 +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v15, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v8.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v11.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v15, v8 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v11.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v9.h, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v15.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v17.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v11 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.l, v10.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v9 ; GFX11-TRUE16-NEXT: s_clause 0x2 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-TRUE16-NEXT: 
scratch_store_b64 v0, v[10:11], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[11:12], off offset:32 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: bitcast_v10f32_to_v40i8: @@ -12037,48 +11986,49 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:24 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:32 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:20 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v25.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v23.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v21.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v17.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.h, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.h, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.h, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.h, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v0.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 
v21.h, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v30.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v29.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v28.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v27.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v35.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v30.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v29.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v28.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v27.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.h, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v29.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v33.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v33.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v33.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v33.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v34.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v34.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v34.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v34.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v35.l +; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v10.h, 8, v35.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v36 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB34_3 @@ -12091,245 +12041,217 @@ define <10 x float> @bitcast_v40i8_to_v10f32(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB34_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v25.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v25.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v22.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, 0 -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v24.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v23.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v23.h -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v0.h, v21.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v21.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v19.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v18.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v17.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v1.h, v19.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v2.l, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v15.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v25 -; GFX11-TRUE16-NEXT: 
v_or_b16 v25.h, v3.l, v15.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v22.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v21.h +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v0.l, v24.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v25.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v27.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v23.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v27.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v19.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v27, v0 +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v1.h, v23.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v27, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v2.l, v20.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v27.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v16.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v15.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v27, v2 +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v3.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v27.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v14.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v27, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v4.l, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v27.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; 
GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v27, v4 +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v5.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v27.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v31.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v31.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v27, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v6.l, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v27.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) 
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v4.l, v14.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v14.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v25 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v5.l, v13.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v26.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v6.h, v13.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v6.l, v12.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v12.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v25 -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v11.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v7.l, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v31.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v25 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v13 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v8.l, v10.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v31.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v25 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v9.l, v10.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v10.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v27, v6 +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v7.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 
v27.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v27, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v8.l, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v27.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v27, v8 +; GFX11-TRUE16-NEXT: v_or_b16 v27.l, v9.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v27.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v11, v25 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v27, v9 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB34_2 ; GFX11-TRUE16-NEXT: .LBB34_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v26.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v25.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v25.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v22.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v23.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v22.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v21.h, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v18.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v24.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v23.l, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v20.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v21.l, v0.h -; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v25.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v24.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v25.l, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v23.h, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v23.l, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v19.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v19.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v17.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v25, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v16.h, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v25.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v20.h, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v21.l, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v21.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v19.l, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v16.h, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v17.h, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v27 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v19.h, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v16.l, 3 -; GFX11-TRUE16-NEXT: 
v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v27 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v15.h, v3.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v27 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v17.l, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v18.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v20.l, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v25, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v25.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v17.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v18.h, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v16.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v18.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v25, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v22.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v25.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v15.l, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v15.h, v3.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v6.l, v3.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v27 -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v14.h, v4.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v24.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v15.l, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v28.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v27 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v14.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v26.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v13.h, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v25, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v25.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v14.l, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v14.h, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v25, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v25.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v13.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v13.h, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: 
v_or_b16 v6.h, v13.l, v6.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v12.h, v6.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v27 +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v25, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v25.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v12.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v12.h, v6.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v32.h, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v6.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v32.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v32.l, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v27 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v12.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v25, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v31.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v11.l, v7.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v31.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v25.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v11.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, 
v11.h, v7.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v11.h, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v31.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v10.h, v8.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v8.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v27 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v8.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.l, v8.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v27 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v27.h, 0x300, v9.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v10, v27 +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v25, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v25.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.h, v8.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v25, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v25.l, 0x300, v9.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v25.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v25, v9 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -16358,20 +16280,20 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) { ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 @@ -16387,17 +16309,17 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5 ; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v26, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 ; GFX11-TRUE16-NEXT: .LBB48_2: ; %Flow @@ -16421,17 +16343,17 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 ; GFX11-TRUE16-NEXT: .LBB48_4: ; %end @@ -16439,105 +16361,93 @@ define <40 x i8> @bitcast_v20i16_to_v40i8(<20 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v30.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v1.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v15.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, 0 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v15, v1 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v2.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v28.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v14.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v29.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v1.h, v11.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v15, v2 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v3.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v15.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v29 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v16, v15 -; GFX11-TRUE16-NEXT: 
v_or_b16 v15.h, v2.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v28, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v26.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v3.l, v3.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v25.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v14, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v26.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v15, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v4.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v4.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v15, v4 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v5.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v15.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v6.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 
8, v22.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v6 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v7.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v20.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v14, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v12.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v13, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v6.l, v6.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v21.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v13, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v7.l, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v20.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v11.h -; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v18.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v9.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v8.l, v8.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v11.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v15 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v17.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v11, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v12 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v15 +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v15, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v8.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v11.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v15, v8 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v11.h +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v9.h, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v15.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v17.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v11 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.l, v10.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v9 ; GFX11-TRUE16-NEXT: s_clause 0x2 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[10:11], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[11:12], off offset:32 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: bitcast_v20i16_to_v40i8: @@ -22479,20 +22389,20 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 @@ -22508,17 +22418,17 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 ; GFX11-TRUE16-NEXT: .LBB60_2: ; %Flow @@ -22542,17 +22452,17 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 24, v8 +; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v19, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 ; GFX11-TRUE16-NEXT: .LBB60_4: ; %end @@ -22560,105 +22470,93 @@ define <40 x i8> @bitcast_v20f16_to_v40i8(<20 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v30.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v1.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v15.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, 0 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 
v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v15, v1 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v2.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v28.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v14.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v29.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v1.h, v11.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v15, v2 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v3.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v15.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v29 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v16, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v2.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v28, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v26.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v3.l, v3.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v25.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v14, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v26.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h 
+; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v15, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v4.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v4.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v15, v4 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v5.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v15.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v6.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v6 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v7.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v20.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v14, v15 -; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v12.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v13, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v6.l, v6.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v21.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v13, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v7.l, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v20.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v11.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v18.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v9.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v8.l, v8.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v11.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v15 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | 
instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v17.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v11, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v12 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v15 +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v15, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v8.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v11.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v15, v8 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v11.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v9.h, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v15.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v17.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v11 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.l, v10.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v9 ; GFX11-TRUE16-NEXT: s_clause 0x2 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off 
; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[10:11], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[11:12], off offset:32 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: bitcast_v20f16_to_v40i8: @@ -28859,50 +28757,51 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:24 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:32 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:20 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v29.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v27.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v25.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v23.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v21.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v16.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v0.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v1.l -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v48.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v39.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v38.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v36.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v36.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v36.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v36.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v37.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 
8, v37.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v37.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v38.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v38.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v49 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB72_3 @@ -28915,245 +28814,216 @@ define <5 x double> @bitcast_v40i8_to_v5f64(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB72_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v35.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v34.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v35.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v30.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, 0 -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v34.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v0.h, v29.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v29.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v23.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v27.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v25.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v1.h, v27.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v2.l, v25.h -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v22.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v3.l, v23.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v30.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v29.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v0.l, v34.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v10.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v10, v0 +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v1.h, v33.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v10, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v2.l, v28.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v10.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v25.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v24.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v23.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v10, v2 +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v3.l, v25.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v26.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v22.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 
v3, v10, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v4.l, v23.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v10, v4 +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v5.l, v21.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v31.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v31.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v6.l, v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v10.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr29_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v4.l, v21.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v21.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v10 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v5.l, v19.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v26.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v6.h, v19.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v6.l, v18.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v18.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v17.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-TRUE16-NEXT: 
v_or_b16 v10.h, v7.l, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v31.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v11, v10 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v12 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v8.l, v16.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v31.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v16.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v10 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v9.l, v16.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v6 +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v7.l, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v10.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v8.l, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v10.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v8 +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v9.l, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v10.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v11, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v10, v9 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB72_2 ; GFX11-TRUE16-NEXT: .LBB72_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v35.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v34.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v35.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v30.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v33.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v30.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v29.h, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v26.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v34.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v33.l, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v28.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v29.l, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v34.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v34.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v33.h, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v33.l, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v27.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v27.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v25.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v10, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v24.h, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v28.h, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v29.l, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v29.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v27.l, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v24.h, 3 -; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v25.h, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v27.h, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v23.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v11 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v23.h, v3.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v11 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v25.l, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v20.l, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v10, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v10.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v25.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v26.h, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v21.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v20.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v10, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v22.l, 3 +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v23.l, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v23.h, v3.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v21.h, v4.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v24.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v22.h, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v28.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v21.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v26.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v19.h, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v21.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v22.h, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v10, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h +; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v19.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v19.h, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v19.l, v6.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v18.h, v6.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v11 +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v18.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v18.h, v6.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v32.h, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v6.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v32.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v32.l, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v11 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v18.l, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v31.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v17.l, v7.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: 
v_or_b16 v8.h, v17.h, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v31.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v16.h, v8.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v8.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v11 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v8.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v16.l, v8.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v10, v11 +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v31.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v31.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v17.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v17.h, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v16.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v16.h, v8.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v9.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v9.h, 0x300, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v10, v9 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -30908,20 +30778,20 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr13_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 @@ -30937,17 +30807,17 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8 -; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 ; GFX11-TRUE16-NEXT: .LBB74_2: ; %Flow @@ -30966,17 +30836,17 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 ; GFX11-TRUE16-NEXT: 
.LBB74_4: ; %end @@ -30984,105 +30854,93 @@ define <40 x i8> @bitcast_v5f64_to_v40i8(<5 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v30.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v1.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v15.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, 0 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v15, v1 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v2.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v28.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v14.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v29.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v1.h, v11.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v15, v2 +; 
GFX11-TRUE16-NEXT: v_or_b16 v15.l, v3.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v15.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v29 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v16, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v2.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v28, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v26.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v3.l, v3.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v25.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v14, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v26.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v15, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v4.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v4.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v15, v4 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v5.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v15.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.l ; 
GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v6.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v6 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v7.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v20.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v11.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v14, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v12.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v13, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v6.l, v6.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v21.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v13, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v7.l, v11.h -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v20.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v11.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v18.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v9.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v8.l, v8.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v11.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v15 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v17.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v11, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v12 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v15 +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v15, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v8.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 
v8.l, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v11.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v15, v8 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v11.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v9.h, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v15.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v17.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v11 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.l, v10.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v9 ; GFX11-TRUE16-NEXT: s_clause 0x2 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[10:11], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[11:12], off offset:32 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: bitcast_v5f64_to_v40i8: @@ -33010,50 +32868,51 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:24 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:32 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:20 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 
offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v29.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v27.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v25.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.l, v23.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.h, v21.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v16.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v4.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v0.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v34.h, 8, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v48.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v39.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v39.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v38.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v33.h, 8, v7.l +; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v28.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v21.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v39.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v39.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v38.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v36.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v36.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v36.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v36.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v37.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v37.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v38.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v38.l ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v49 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB76_3 @@ -33066,245 +32925,216 @@ define <5 x i64> @bitcast_v40i8_to_v5i64(<40 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB76_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v35.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v34.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v35.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v30.h -; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v10.l, 0 -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v34.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v33.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v33.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v0.h, v29.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v29.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v28.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v23.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v27.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v25.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v1.h, v27.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v2.l, v25.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v20.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v22.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v3.l, v23.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v30.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v29.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v0.l, v34.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v34.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v33.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v10.h +; 
GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v27.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v10, v0 +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v1.h, v33.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v20.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v10, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v2.l, v28.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v10.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v25.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v24.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v23.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v10, v2 +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v3.l, v25.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v26.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v22.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v4.l, v23.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v10, v4 +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v5.l, v21.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v18.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v31.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v31.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v6.l, v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, 
v10.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v4.l, v21.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v21.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v10 -; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v5.l, v19.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v26.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v6.h, v19.l -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v6.l, v18.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v18.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v17.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v7.l, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v31.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v11, v10 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v12 -; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v8.l, v16.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v31.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v16.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v10 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v9.l, v16.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v6 +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v7.l, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v10.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v8.l, v17.l +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v10.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v8 +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v9.l, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v10.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v11, v10 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v10, v9 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB76_2 ; GFX11-TRUE16-NEXT: .LBB76_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v35.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v34.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v35.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v30.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v33.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v30.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v29.h, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v26.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v34.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v33.l, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v28.h, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v29.l, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v34.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v34.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v33.h, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v33.l, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v27.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 
0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v27.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v25.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v10, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v24.h, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v28.h, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v29.l, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v29.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v27.l, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v24.h, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v25.h, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v27.h, v1.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v23.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v11 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v23.h, v3.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l 
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v11 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v25.l, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v20.l, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v10, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v10.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v25.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v26.h, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v21.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v20.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v10, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v20.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v22.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v23.l, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v23.h, v3.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v5, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v21.h, v4.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v24.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v22.h, v4.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v22.l, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h -; 
GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v28.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v21.l, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v26.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v19.h, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v10, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v21.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v22.h, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v10, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v19.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v19.h, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v19.l, v6.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v18.h, v6.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v7, v11 +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v18.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v18.h, v6.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v32.h, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v6.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v6.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v30.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v32.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v32.l, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v11 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v18.l, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v31.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v17.l, v7.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v17.h, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v31.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v7.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v7.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v16.h, v8.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v8.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v11 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v8.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v16.l, v8.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v11 -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v10, v11 +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v31.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v31.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v17.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v17.h, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v16.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v16.h, v8.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, 0x300, v9.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v10, v9 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -35074,20 +34904,20 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr15_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr14_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 ; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr13_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr12_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr11_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 @@ -35103,17 +34933,17 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 ; GFX11-TRUE16-NEXT: .LBB78_2: ; %Flow @@ -35140,17 +34970,17 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) { ; 
GFX11-TRUE16-NEXT: v_lshrrev_b64 v[15:16], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 8, v1 ; GFX11-TRUE16-NEXT: .LBB78_4: ; %end @@ -35158,105 +34988,93 @@ define <40 x i8> @bitcast_v5i64_to_v40i8(<5 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v30.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v1.l, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v15.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v11.h 
-; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, 0 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v15, v1 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v2.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v28.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v14.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v29.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v1.h, v11.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v15, v2 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v3.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v15.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v28, 0xffff, v29 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v16, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v2.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v11.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v28, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v26.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v3.l, v3.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h -; 
GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v25.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v14, v15 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v26.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v15, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v4.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v4.l, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v15, v4 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v5.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v15.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v24.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v23.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v6.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v22.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v15, v6 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v7.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v20.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: 
v_or_b16 v6.l, v6.l, v11.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v14, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v12.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v13, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v6.l, v6.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v21.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v13, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v7.l, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v20.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v12, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v11.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v18.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v9.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v8.l, v8.h -; GFX11-TRUE16-NEXT: v_and_b16 
v9.l, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v11.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v13 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v15 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v17.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v11, v15 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v12 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v11, v15 +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v15, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v8.l, v11.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v11.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v15, v8 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v11.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v9.h, v11.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v15.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v17.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v15, v11 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v9.l, v9.h +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v15.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v10.l, v10.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v9 ; GFX11-TRUE16-NEXT: s_clause 0x2 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 -; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[10:11], off offset:32 +; GFX11-TRUE16-NEXT: scratch_store_b64 v0, v[11:12], off offset:32 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: bitcast_v5i64_to_v40i8: diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll index 48c9b8775a47..8e30ee659a26 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll @@ -2257,8 +2257,8 @@ define i32 @bitcast_v4i8_to_i32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v3.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4 @@ -2273,19 +2273,17 @@ define i32 @bitcast_v4i8_to_i32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB22_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 -; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2 ; GFX11-TRUE16-NEXT: .LBB22_4: ; %cmp.true @@ -2295,16 +2293,14 @@ define i32 @bitcast_v4i8_to_i32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l -; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v0.l, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -4506,8 +4502,8 @@ define float @bitcast_v4i8_to_f32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v3.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4 @@ -4522,19 +4518,17 @@ define float @bitcast_v4i8_to_f32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB42_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h +; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr1_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB42_2 ; GFX11-TRUE16-NEXT: .LBB42_4: ; %cmp.true @@ -4544,16 +4538,14 @@ define float @bitcast_v4i8_to_f32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -6467,8 +6459,8 
@@ define <2 x i16> @bitcast_v4i8_to_v2i16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v3.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4 @@ -6483,19 +6475,17 @@ define <2 x i16> @bitcast_v4i8_to_v2i16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB58_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB58_2 ; GFX11-TRUE16-NEXT: .LBB58_4: ; %cmp.true @@ -6505,16 +6495,14 @@ define <2 x i16> @bitcast_v4i8_to_v2i16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -8116,8 +8104,8 @@ define <2 x half> @bitcast_v4i8_to_v2f16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v3.l ; 
GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4 @@ -8132,19 +8120,17 @@ define <2 x half> @bitcast_v4i8_to_v2f16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB70_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB70_2 ; GFX11-TRUE16-NEXT: .LBB70_4: ; %cmp.true @@ -8154,16 +8140,14 @@ define <2 x half> @bitcast_v4i8_to_v2f16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -9479,8 +9463,8 @@ define <2 x bfloat> @bitcast_v4i8_to_v2bf16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v3.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4 @@ -9495,19 +9479,17 @@ define <2 x bfloat> @bitcast_v4i8_to_v2bf16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB78_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB78_2 ; GFX11-TRUE16-NEXT: .LBB78_4: ; %cmp.true @@ -9517,16 +9499,14 @@ define <2 x bfloat> @bitcast_v4i8_to_v2bf16(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v1.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -10193,8 +10173,8 @@ define <1 x i32> @bitcast_v4i8_to_v1i32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v3.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v4 @@ -10209,19 +10189,17 @@ define <1 x i32> @bitcast_v4i8_to_v1i32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB82_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB82_2 ; GFX11-TRUE16-NEXT: .LBB82_4: ; %cmp.true @@ -10231,16 +10209,14 @@ define <1 x i32> @bitcast_v4i8_to_v1i32(<4 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll index 5aac06a7f3a2..35d135b12396 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll @@ -8768,32 +8768,32 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 @@ -8812,26 +8812,26 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 24, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 24, v6 ; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 ; GFX11-TRUE16-NEXT: .LBB24_2: ; %Flow @@ -8864,26 +8864,26 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 24, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v6 ; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 ; GFX11-TRUE16-NEXT: .LBB24_4: ; %end @@ -8891,156 +8891,135 @@ define <64 x i8> @bitcast_v16i32_to_v64i8(<16 x i32> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v25.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v64.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v1.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v24.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v24.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, 0 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v55.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v24, v1 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v2.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v54.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v23.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v25.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v55.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v1.h, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v54.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v2 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v3.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v24.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v55 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v25, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v2.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v54, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v25 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v52.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v3.l, v3.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v51.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v23, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v4.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v51.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v22.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, 
v4.l, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v24, v4 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v5.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v24.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v50.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v49.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v24, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v6.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v6 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v7.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v39.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v38.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v22.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v49.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v23, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v25 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v48.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v24 -; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v6.l, v6.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v39.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v22, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v7.l, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v38.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v24, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v8.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v20.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v37.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v24, v8 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v9.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v24.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v22 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v8.l, v8.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v20.l -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v17.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v22 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v21, v24 -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v35.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v34.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v20, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v21 -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v10.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v35.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v24, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v10.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v34.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v24, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v11.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v24.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v10.h, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v32.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v24, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v12.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v24, v12 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v13.l, v17.h +; GFX11-TRUE16-NEXT: 
v_or_b16 v13.h, v13.h, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v30.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v29.l ; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v24, v13 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v14.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v28.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v17.l ; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v20, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v32.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v12.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v21 -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v11.l, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v31.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v18.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v19, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v20 -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v12.l, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v30.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v19, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v20 -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v13.l, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v29.l -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v13.h, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 
v16.h, 0xff, v16.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v24, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v15.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v26.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v17.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v28.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v15.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v14.l, v14.h -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v17.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v24 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v15.l, v15.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v26.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v17, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v16.l, v16.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v17, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 
v15, v24, v15 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v16.l, v17.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v17.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v24.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v24, v16 ; GFX11-TRUE16-NEXT: s_clause 0x3 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 @@ -12470,15 +12449,15 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:124 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:120 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:112 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:108 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:100 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:92 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_b32 v82, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:16 @@ -12492,84 +12471,82 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { 
; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:80 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:84 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:76 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:68 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:20 ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v29.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v27.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v25.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v24.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v27.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v0.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.l, 8, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.l, 8, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 8, v25.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v81.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v80.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v23.l +; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v27.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v27.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v64.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v65.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v65.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v66.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v66.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v66.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v66.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v67.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v67.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v67.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v68.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v68.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v68.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v68.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v69.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, 
v69.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v69.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v70.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v70.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v70.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v71.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v71.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v80.l -; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v82 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v80.l +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v81 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB26_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow @@ -12581,384 +12558,338 @@ define <16 x i32> @bitcast_v64i8_to_v16i32(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB26_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v55.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v54.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v55.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v51.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, 0 -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v54.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v53.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v53.l -; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v0.h, v52.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v52.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v50.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v30.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3 -; GFX11-TRUE16-NEXT: 
v_or_b16 v3.l, v0.l, v51.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v48.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v49.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v54 -; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v1.h, v50.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v39.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v54 -; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v2.l, v49.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v39.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v54 -; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v3.l, v48.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v53.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v52.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v0.l, v54.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v55.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v64.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v54.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v49.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v64, v0 +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v1.h, v53.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v51.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v26.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v64, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v2.l, v51.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v64.h +; 
GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v48.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v39.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v48.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v24.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v64, v2 +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v3.l, v50.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v50.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v30.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v64, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v4.l, v39.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v64.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v27.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v38.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v54 -; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v4.l, v29.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v25.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v38.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v38.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v64, v4 +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v5.l, v29.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v64.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v25.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v23.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v37.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v37.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v64, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v6.l, v27.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v64.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v36.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v22.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v36.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v35.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v64, 
v6 +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v7.l, v25.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v64.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v21.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v35.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v34.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v34.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v64, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v8.l, v23.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v64.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v20.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v33.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v33.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v64, v8 +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v9.l, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v64.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v31.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v64, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v10.l, v21.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v64.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v31.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 -; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr52_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v54 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v5.l, v28.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v5.h, v26.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v38.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v54 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v6.l, v25.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v37.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v54 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v7.l, v23.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v37.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v23.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v35.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v54 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v8.l, 
v22.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v36.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v8.h, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v35.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v54 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v9.l, v21.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v36.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v9.h, v21.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v9, v12, v54 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v10.l, v20.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v34.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v10.h, v20.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v33.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v54 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v11.l, v19.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v34.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v13, v54 -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v12.h, v19.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v33.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v12.l, v18.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v12.h, v18.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v54 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v13.h, v17.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v13.l, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v31.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v54 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v19 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v14.l, v16.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v31.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v54 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | 
instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18 -; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v15.l, v16.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v64, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v11.l, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v64.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v16.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v64, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v12.l, v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v64.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v64, v12 +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v13.l, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v64.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v64, v13 +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v14.l, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v64.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v64, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v15.l, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v64.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v17, v54 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v64, v15 ; GFX11-TRUE16-NEXT: 
s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2 ; GFX11-TRUE16-NEXT: .LBB26_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v55.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v54.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v55.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v51.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v53.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v53.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v52.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v52.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v48.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v53.h, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v50.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v52.l, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v55.l, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v54.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v53.l, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v49.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v49.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v48.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v52, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 
0xff, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v39.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v51.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v51.h, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v52.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v50.h, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v39.h, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v30.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v55 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v49.h, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v51.l, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v55 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v48.l, v3.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v27.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v55 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v49.l, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v27.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v52, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, 
v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v52.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v50.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v50.h, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v29.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v28.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v52, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v26.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v24.h, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v39.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v48.l, v3.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v24.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v39.l, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v55 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v29.h, v4.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v25.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v4.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v29.l, v5.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v28.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v55 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v28.h, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v26.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v7 -; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v55.h, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v52, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v29.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v30.h, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v52, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v27.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v27.h, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v26.h, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v52, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v25.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v25.h, v6.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v38.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v55 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v25.h, v6.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v30.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 
0x300, v6.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v6.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v24.l, v7.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v37.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v55 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v23.h, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v38.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v52, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v37.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v37.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v23.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v23.h, v7.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v22.h, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v36.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v55 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v23.l, v8.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v37.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v8.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) 
-; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v21.h, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v55 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v22.l, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v35.h, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v52, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v22.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v22.h, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v36.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v52, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v35.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v35.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v21.l, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v21.h, v9.h ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v11, v55 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v20.h, v10.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v35.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v21.l, v10.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v34.h, 3 -; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v13, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v10.h -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v33.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v55 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v20.l, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v19.h, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v11.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v52, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v20.l, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v20.h, v10.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v34.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v52, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v33.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v33.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v19.l, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v19.h, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v19.l, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v18.h, v12.l -; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v11, v13, v55 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v33.l, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v12.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v32.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v52, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v18.l, v12.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v18.h, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v32.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v32.l, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v55 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v18.l, v13.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v52, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v31.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v17.l, v13.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v31.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v17.l, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v17.h, v13.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v17.h, v14.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v31.l, 3 -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v13.l -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v16.h, v14.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v14.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v15.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v55 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v14.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v16.l, v14.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v17 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v55 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v15.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v16, v55 +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v52, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v16.l, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.h, v14.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v52, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v15.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v52, v15 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -23588,32 +23519,32 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr64_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 @@ -23632,26 +23563,26 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v16 -; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v29, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 24, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 ; GFX11-TRUE16-NEXT: .LBB48_2: ; %Flow @@ -23676,26 +23607,26 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v15 -; GFX11-TRUE16-NEXT: 
v_lshrrev_b32_e32 v28, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 24, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 ; GFX11-TRUE16-NEXT: .LBB48_4: ; %end @@ -23703,156 +23634,135 @@ define <64 x i8> @bitcast_v16f32_to_v64i8(<16 x float> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v25.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v64.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 
0xff, v1.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v1.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v24.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v24.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, 0 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v55.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v24, v1 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v2.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v54.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v23.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v55.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v1.h, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v54.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v2 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v3.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v24.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v55 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v25, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v2.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v3.l -; 
GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v54, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v25 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v52.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v3.l, v3.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v51.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v23, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v4.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v51.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v22.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v4.l, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v24, v4 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v5.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v24.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v50.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v49.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v24, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v6.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l +; 
GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v6 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v7.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v39.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v38.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v22.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v49.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v23, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v25 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v48.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v6.l, v6.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v39.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v22, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v7.l, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v38.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v24, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v8.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l +; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v20.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v37.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v24, v8 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v9.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v24.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v22 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v8.l, v8.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v20.l -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v17.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v22 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v21, v24 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v35.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v34.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v20, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v21 -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v10.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v35.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v24, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v10.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, 
v34.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v24, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v11.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v24.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v10.h, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v32.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v24, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v12.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v24, v12 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v13.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v30.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v29.l ; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v24, v13 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v14.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v28.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v17.l ; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v20, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v32.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v12.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v21 -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v11.l, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v31.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v18.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v19, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v20 -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v12.l, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v30.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v19, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v20 -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v13.l, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v29.l -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v13.h, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v24, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v15.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v26.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v17.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v28.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v27.l -; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v18, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v15.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v14.l, v14.h -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v17.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v24 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v15.l, v15.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v26.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v17, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v16.l, v16.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v17, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v24, v15 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v16.l, v17.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v17.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v24.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v24, v16 ; GFX11-TRUE16-NEXT: s_clause 0x3 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 @@ -27413,15 +27323,15 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:124 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:120 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:116 +; 
GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:112 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:108 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:100 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:92 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_b32 v82, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:16 @@ -27435,84 +27345,82 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:80 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:84 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:76 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:68 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:20 +; 
GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:20 ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v29.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v27.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v25.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v24.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v27.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v0.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 
8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.l, 8, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.l, 8, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 8, v25.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v81.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v80.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v27.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v64.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v65.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 
v18.l, 8, v65.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v66.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v66.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v66.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v66.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v67.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v67.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v67.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v68.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v68.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v68.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v68.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v69.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v69.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v69.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v70.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v70.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v70.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v71.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v71.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v80.l -; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v82 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v80.l 
+; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v81 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB50_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow @@ -27524,384 +27432,338 @@ define <16 x float> @bitcast_v64i8_to_v16f32(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB50_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v55.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v54.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v55.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v51.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, 0 -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v54.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v53.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v53.l -; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v0.h, v52.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v52.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v50.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v30.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v51.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v48.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v49.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v54 -; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v1.h, v50.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v39.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v54 -; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v2.l, v49.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v39.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v54 -; 
GFX11-TRUE16-NEXT: v_or_b16 v54.h, v3.l, v48.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v53.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v52.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v0.l, v54.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v55.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v64.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v54.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v49.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v64, v0 +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v1.h, v53.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v51.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v26.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v64, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v2.l, v51.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v64.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v48.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v39.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v48.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v24.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v64, v2 +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v3.l, v50.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v50.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v30.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v64, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v4.l, v39.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v64.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v27.h +; GFX11-TRUE16-NEXT: v_and_b16 
v7.h, 0xff, v30.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v38.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v54 -; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v4.l, v29.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v25.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v38.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v38.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v64, v4 +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v5.l, v29.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v64.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v25.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v23.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v37.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v37.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v64, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v6.l, v27.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v64.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v36.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v22.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v36.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v35.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v64, v6 +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v7.l, v25.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v64.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v21.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v35.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v34.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v34.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v64, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v8.l, v23.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v64.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v20.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v33.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v33.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v64, v8 +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v9.l, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v64.h +; 
GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v31.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v64, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v10.l, v21.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v64.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v31.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v54 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 
0xffff, v8 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v5.l, v28.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v5.h, v26.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v38.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v54 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v6.l, v25.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v37.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v54 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v7.l, v23.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v37.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v23.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v35.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v54 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v8.l, v22.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v36.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v8.h, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v35.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v54 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v9.l, v21.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v36.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v9.h, v21.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v54 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v10.l, v20.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v34.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v10.h, v20.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v33.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v54 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v11.l, v19.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v34.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr33_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v13, v54 -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v12.h, v19.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v33.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v12.l, v18.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v12.h, v18.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v54 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v13.h, v17.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v13.l, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v31.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v54 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v19 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v14.l, v16.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v31.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v54 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18 -; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v15.l, v16.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v64, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v11.l, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v64.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v16.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v64, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v12.l, 
v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v64.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v64, v12 +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v13.l, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v64.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v64, v13 +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v14.l, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v64.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v64, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v15.l, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v64.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v17, v54 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v64, v15 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2 ; GFX11-TRUE16-NEXT: .LBB50_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v55.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v54.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v55.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v51.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v53.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v53.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v52.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v52.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v48.h, 3 -; 
GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v53.h, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v50.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v52.l, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v55.l, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v54.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v53.l, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v49.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v49.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v48.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v52, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v39.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v51.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v51.h, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v52.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v50.h, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v39.h, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v30.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v55 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v49.h, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v51.l, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v55 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v48.l, v3.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v27.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v55 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v49.l, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v27.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v52, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v52.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v50.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v50.h, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v29.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v28.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v52, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v26.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v24.h, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v39.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v48.l, v3.h ; 
GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v24.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v39.l, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v55 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v29.h, v4.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v25.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v4.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v29.l, v5.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v28.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v55 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v28.h, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v26.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v52, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v29.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v30.h, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v52, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h +; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v27.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v27.h, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v26.h, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v52, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v25.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v25.h, v6.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v38.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v55 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v25.h, v6.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v30.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v6.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v6.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v24.l, v7.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v37.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v55 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v23.h, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 
v8.h, v38.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v52, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v37.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v37.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v23.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v23.h, v7.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v22.h, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v36.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v55 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v23.l, v8.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v37.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v8.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v21.h, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v55 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v22.l, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v35.h, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v52, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v52.h +; GFX11-TRUE16-NEXT: 
v_or_b16 v9.l, v22.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v22.h, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v36.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v52, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v35.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v35.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v21.l, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v21.h, v9.h ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v11, v55 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v20.h, v10.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v35.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v21.l, v10.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v34.h, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v10.h -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v33.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v55 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v20.l, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v19.h, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v11.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v52, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 
0x300, v11.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v20.l, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v20.h, v10.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v34.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v52, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v33.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v33.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v19.l, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v19.h, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v19.l, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v18.h, v12.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v13, v55 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v33.l, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v12.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v32.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v52, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v18.l, v12.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v18.h, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 
v14.l, v32.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v32.l, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v55 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v18.l, v13.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v52, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v31.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v17.l, v13.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v31.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v17.l, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v17.h, v13.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v17.h, v14.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v31.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v13.l -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v16.h, v14.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v14.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v15.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v55 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v14.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v16.l, v14.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v17 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v55 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 
0x300, v15.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v16, v55 +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v52, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v16.l, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.h, v14.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v52, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v15.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v52, v15 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -37916,32 +37778,32 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 @@ -37960,26 +37822,26 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 24, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v9 
+; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 ; GFX11-TRUE16-NEXT: .LBB68_2: ; %Flow @@ -38017,26 +37879,26 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 24, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 24, v8 ; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 ; GFX11-TRUE16-NEXT: .LBB68_4: ; %end @@ -38044,156 +37906,135 @@ define <64 x i8> @bitcast_v8i64_to_v64i8(<8 x i64> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v25.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v64.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v1.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v24.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v24.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, 0 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v55.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v24, v1 +; GFX11-TRUE16-NEXT: 
v_or_b16 v24.l, v2.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v54.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v23.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v55.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v1.h, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v54.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v2 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v3.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v24.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v55 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v25, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v2.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v54, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v25 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v52.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v3.l, v3.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v51.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v23, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, 
v4.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v51.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v22.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v4.l, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v24, v4 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v5.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v24.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v50.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v49.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v24, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v6.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v6 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v7.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v39.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v38.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v22.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v49.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v23, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v25 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v6.l -; 
GFX11-TRUE16-NEXT: v_or_b16 v24.h, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v48.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v6.l, v6.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v39.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v22, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v7.l, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v38.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v24, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v8.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v20.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v37.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v24, v8 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v9.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v24.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v22 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v9.l -; 
GFX11-TRUE16-NEXT: v_or_b16 v24.h, v8.l, v8.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v20.l -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v17.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v22 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v21, v24 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v35.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v34.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v20, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v21 -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v10.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v35.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v24, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v10.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v34.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v24, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v11.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v24.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v10.h, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v32.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v24, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v12.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v24.h +; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v17.h, 8, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v24, v12 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v13.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v30.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v29.l ; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v24, v13 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v14.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v28.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v17.l ; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v20, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v32.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v12.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v21 -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v11.l, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v31.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v18.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v19, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v20 -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v12.l, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 
v14.l, 8, v30.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v19, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v20 -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v13.l, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v29.l -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v13.h, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v24, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v15.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v26.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v17.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v28.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v15.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v14.l, v14.h -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v17.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v24 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v15.l, v15.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v26.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: 
v_or_b32_e32 v15, v17, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v16.l, v16.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v17, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v24, v15 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v16.l, v17.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v17.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v24.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v24, v16 ; GFX11-TRUE16-NEXT: s_clause 0x3 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 @@ -41628,15 +41469,15 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:124 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:120 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:112 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:108 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:100 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:92 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_b32 v82, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_b32 
v81, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:16 @@ -41650,84 +41491,82 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:80 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:84 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:76 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:68 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:52 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:20 ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v29.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v27.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v25.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v24.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, 
v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v27.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v0.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.l, 8, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.l, 8, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 8, v25.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v81.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v80.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 
8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v27.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v64.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v65.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v65.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v66.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v66.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v66.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v66.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v67.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v67.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v67.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v68.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v68.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, 
v68.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v68.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v69.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v69.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v69.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v70.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v70.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v70.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v71.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v71.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v80.l -; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v82 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v80.l +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v81 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB70_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow @@ -41739,384 +41578,338 @@ define <8 x i64> @bitcast_v64i8_to_v8i64(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB70_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v55.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v54.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v55.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v51.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, 0 -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v54.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v53.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v53.l -; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v0.h, v52.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v52.h -; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v3.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v50.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v30.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v51.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v48.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v49.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v54 -; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v1.h, v50.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v39.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v54 -; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v2.l, v49.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v39.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v54 -; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v3.l, v48.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v53.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v52.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v0.l, v54.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v55.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v64.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v54.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v49.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v64, v0 +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v1.h, v53.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v29.l +; 
GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v51.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v26.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v64, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v2.l, v51.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v64.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v48.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v39.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v48.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v24.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v64, v2 +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v3.l, v50.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v50.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v30.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v64, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v4.l, v39.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v64.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v27.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v38.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v54 -; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v4.l, v29.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v25.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v38.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v38.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v64, v4 +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v5.l, v29.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v64.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v25.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v23.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v37.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v37.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v64, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v6.l, v27.l 
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v64.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v36.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v22.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v36.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v35.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v64, v6 +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v7.l, v25.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v64.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v21.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v35.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v34.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v34.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v64, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v8.l, v23.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v64.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v20.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v33.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v33.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v64, v8 +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v9.l, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v64.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v31.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v64, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v10.l, v21.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v64.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v31.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr27_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v54 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v5.l, v28.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v5.h, v26.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v38.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v54 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v6.l, v25.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v37.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v54 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v7.l, v23.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l 
-; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v37.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v23.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v35.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v54 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v8.l, v22.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v36.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v8.h, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v35.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v54 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v9.l, v21.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v36.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v9.h, v21.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 -; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr23_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v54 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v10.l, v20.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v34.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v10.h, v20.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v33.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v54 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v11.l, v19.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v34.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v13, v54 -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v12.h, v19.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v33.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v12.l, v18.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v12.h, v18.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v54 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v13.h, v17.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v13.l, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v14.l -; 
GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v31.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v54 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v19 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v14.l, v16.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v31.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 +; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr20_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v54 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18 -; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v15.l, v16.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v64, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v11.l, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v64.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v16.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v64, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v12.l, v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v64.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v64, v12 +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v13.l, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v64.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v64, v13 +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v14.l, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v64.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v64, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v15.l, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v64.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 
v15, v17, v54 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v64, v15 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB70_2 ; GFX11-TRUE16-NEXT: .LBB70_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v55.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v54.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v55.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v51.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v53.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v53.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v52.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v52.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v48.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v53.h, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v50.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v52.l, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v55.l, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v54.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v53.l, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v49.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v49.l, 3 +; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v48.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v52, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v39.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v51.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v51.h, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v52.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v50.h, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v39.h, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v30.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v55 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v49.h, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v51.l, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v55 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v48.l, v3.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v27.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v55 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v49.l, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v27.h, 3 -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v52, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v52.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v50.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v50.h, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v29.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v28.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v52, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v26.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v24.h, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v39.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v48.l, v3.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v24.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v39.l, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v55 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v29.h, v4.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v25.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v4.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v29.l, v5.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v28.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v55 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l -; 
GFX11-TRUE16-NEXT: v_or_b16 v5.l, v28.h, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v26.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v52, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v29.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v30.h, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v26.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v52, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v27.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v27.h, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v26.h, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v52, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v25.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v25.h, v6.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v38.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v55 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l -; GFX11-TRUE16-NEXT: 
v_or_b16 v6.l, v25.h, v6.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v30.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v6.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v6.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v24.l, v7.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v37.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v55 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v23.h, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v38.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v52, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v37.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v37.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v23.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v23.h, v7.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v22.h, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v36.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v55 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v23.l, v8.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v37.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 
v55.h, 0x300, v8.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v21.h, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v55 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v22.l, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v35.h, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v52, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v22.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v22.h, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v36.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v52, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v35.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v35.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v21.l, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v21.h, v9.h ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v11, v55 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v20.h, v10.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v35.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v21.l, v10.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v34.h, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v10.h -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v33.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v55 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v20.l, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v19.h, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v11.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v52, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v20.l, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v20.h, v10.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v34.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v52, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v33.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v33.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v19.l, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v19.h, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l 
-; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v19.l, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v18.h, v12.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v13, v55 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v33.l, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v12.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v32.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v52, v14 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v18.l, v12.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v18.h, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v32.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v32.l, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v55 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v18.l, v13.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v52, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v31.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v17.l, v13.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 
v31.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v17.l, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v17.h, v13.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v17.h, v14.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v31.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v13.l -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v16.h, v14.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v14.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v15.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v55 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v14.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v16.l, v14.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v17 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v55 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v15.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v16, v55 +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v52, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v16.l, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.h, v14.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v52, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v15.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v52, v15 ; 
GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -51295,32 +51088,32 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 @@ -51339,26 +51132,26 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 24, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 ; GFX11-TRUE16-NEXT: .LBB84_2: ; %Flow @@ -51383,26 
+51176,26 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 24, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 ; GFX11-TRUE16-NEXT: .LBB84_4: ; %end @@ -51410,156 +51203,135 @@ define <64 x i8> @bitcast_v8f64_to_v64i8(<8 x 
double> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v25.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v64.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v1.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v24.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v24.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, 0 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v55.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v24, v1 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v2.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v54.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v23.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v55.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v1.h, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v54.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v2 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v3.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v18.h 
+; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v24.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v55 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v25, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v2.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v54, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v25 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v52.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v3.l, v3.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v51.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v23, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v4.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v51.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v22.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v4.l, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v24, v4 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v5.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v24.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v50.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v49.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v24, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v6.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v6 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v7.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v39.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v38.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v22.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v49.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v23, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v25 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v48.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v6.l, v6.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v39.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v22, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v7.l, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v38.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l +; 
GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v24, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v8.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v20.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v37.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v24, v8 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v9.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v24.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v22 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v8.l, v8.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v20.l -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v17.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v22 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v21, v24 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v35.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v34.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v20, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v21 -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v10.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v35.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l 
+; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v24, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v10.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v34.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v24, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v11.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v24.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v10.h, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v32.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v24, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v12.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v24, v12 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v13.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v30.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v29.l ; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v24, v13 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v14.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v28.l +; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v17.l, 8, v17.l ; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v20, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v32.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v12.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v21 -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v11.l, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v31.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v18.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v19, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v20 -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v12.l, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v30.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v19, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v20 -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v13.l, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v29.l -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v13.h, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v24, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v15.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v26.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v17.h -; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v28.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v15.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v14.l, v14.h -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v17.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v24 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v15.l, v15.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v26.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v17, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v16.l, v16.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v17, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v24, v15 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v16.l, v17.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v17.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v24.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v24, v16 ; GFX11-TRUE16-NEXT: s_clause 0x3 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 @@ -54989,15 +54761,15 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 
%b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v64, off, s32 offset:128 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v31, off, s32 offset:124 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v64, off, s32 offset:120 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:116 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v65, off, s32 offset:112 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v31, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v32, off, s32 offset:108 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v65, off, s32 offset:104 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:100 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v66, off, s32 offset:96 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v32, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v33, off, s32 offset:92 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v66, off, s32 offset:88 -; GFX11-TRUE16-NEXT: scratch_load_b32 v82, off, s32 offset:132 +; GFX11-TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:132 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v67, off, s32 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v67, off, s32 offset:8 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v68, off, s32 offset:16 @@ -55011,84 +54783,82 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v80, off, s32 offset:80 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v33, off, s32 offset:84 ; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v34, off, s32 offset:76 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:68 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v34, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v35, off, s32 offset:60 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v35, off, s32 offset:52 ; 
GFX11-TRUE16-NEXT: scratch_load_d16_b16 v36, off, s32 offset:44 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:36 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:28 -; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v36, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v37, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:20 ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v37, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_d16_b16 v38, off, s32 offset:12 ; GFX11-TRUE16-NEXT: scratch_load_d16_hi_b16 v38, off, s32 offset:4 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v80.h, v29.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.l, v27.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.h, v25.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v24.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.l, v20.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v18.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v50.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v51.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.l, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v27.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.l, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.l, v4.l +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v0.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.h, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v52.h, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v15.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v49.l, 8, v17.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.l, 8, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 8, v25.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v81.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.h, 8, v80.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v55.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v53.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v54.l, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.l, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v51.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.l, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v50.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v39.h, 8, v17.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v48.l, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v27.h, 8, v27.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 8, v80.h ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(33) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v64.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 
v17.h, 8, v64.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v64.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v65.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v65.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v65.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v65.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v66.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v66.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v66.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v66.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v67.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 8, v67.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v24.l, 8, v67.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v67.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v68.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.h, 8, v68.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 8, v68.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v68.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v69.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 8, v69.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 8, v69.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v69.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v70.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.h, 8, v70.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 8, v70.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v70.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v71.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 8, v71.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) -; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.l, 8, v71.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.l, 8, v71.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v80.l -; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v82 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v19.h, 8, v80.l +; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v81 ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execnz .LBB86_3 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %Flow @@ -55100,384 +54870,338 @@ define <8 x double> @bitcast_v64i8_to_v8f64(<64 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB86_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v55.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v54.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v55.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v51.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.l, 0 -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v54.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v53.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v53.l -; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v0.h, v52.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v52.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v50.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v30.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v51.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v48.h -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.l, v49.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v54 -; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v1.h, v50.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v39.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v27.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v54 -; GFX11-TRUE16-NEXT: 
v_or_b16 v54.h, v2.l, v49.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v39.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v6, v54 -; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v3.l, v48.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v24.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v53.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v64.h, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v52.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v52.l +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v0.l, v54.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v55.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v64.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v54.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v49.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v49.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v64, v0 +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v1.h, v53.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v28.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v51.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v26.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v64, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v2.l, v51.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v64.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v48.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v39.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v48.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v24.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v64, v2 +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v3.l, v50.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v50.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v64.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v30.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v26.l +; 
GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v64, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v4.l, v39.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v64.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v27.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v30.l ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v38.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v54 -; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v4.l, v29.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v25.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v4.h, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v38.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v38.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v64, v4 +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v5.l, v29.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v64.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v25.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v23.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v37.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v37.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v64, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v6.l, v27.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v64.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v36.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v22.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v36.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v35.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v64, v6 +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v7.l, v25.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v64.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v21.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v35.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v34.h +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v34.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v64, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v8.l, v23.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v64.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v20.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, 
v12.h, v19.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v33.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v33.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v64, v8 +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v9.l, v22.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v64.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v32.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v31.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v64, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v10.l, v21.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v64.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v31.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v54 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v5.l, v28.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v5.h, v26.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v38.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v54 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v6.l, v25.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v6.h, v24.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v37.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v54 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v7.l, v23.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v37.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v23.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v35.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v54 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v8.l, v22.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v36.h -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v8.h, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v35.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v11, v54 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v9.l, v21.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v36.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v9.h, v21.l +; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr28_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v12, v54 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v10.l, v20.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v34.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v10.h, v20.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v33.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v54 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v14 -; 
GFX11-TRUE16-NEXT: v_or_b16 v54.h, v11.l, v19.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v34.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v13, v54 -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v12.h, v19.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v33.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v12.l, v18.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v32.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v12.h, v18.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v54 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v13.h, v17.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v13.l, v17.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v31.h -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v54 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v19 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v14.l, v16.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v31.l +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr31_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v17, v54 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18 -; GFX11-TRUE16-NEXT: v_or_b16 v54.h, v15.l, v16.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v64, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v11.l, v20.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v64.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v16.h +; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v64, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v12.l, v19.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v64.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v64, v12 +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v13.l, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v64.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v64, v13 +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v14.l, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v64.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v64, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v64.l, v15.l, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v64.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr16_lo16 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v17, v54 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_hi16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v64, v15 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB86_2 ; GFX11-TRUE16-NEXT: .LBB86_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v55.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v54.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v55.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v51.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v53.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v53.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v52.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v52.l, 3 +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v48.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v53.h, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v50.l, 3 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v52.l, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v54.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v55.l, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v54.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v53.l, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v49.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v49.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v48.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v52, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v39.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v51.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v51.h, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v52.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l -; GFX11-TRUE16-NEXT: 
v_or_b16 v1.l, v50.h, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, v39.h, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, v30.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v55 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v49.h, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v51.l, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v55 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v48.l, v3.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v27.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v55 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v49.l, v3.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v27.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v3.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v52, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v52.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v50.l, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v50.h, v2.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v29.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v28.h, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v52, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v3.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v3.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v4.l +; 
GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, v26.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.h, v24.h, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v39.h, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v48.l, v3.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, v24.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v39.l, v4.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v55 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v29.h, v4.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, v25.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v4.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, 0x300, v4.h -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v29.l, v5.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v28.l, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v55 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v28.h, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v26.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v5.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v52, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v29.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v30.h, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v24.l, 3 +; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v6.h, v26.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v52, v7 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v5.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, 0x300, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.l, v28.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, v30.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v27.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v27.h, v5.h ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v26.h, v6.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v52, v8 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v25.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v25.h, v6.h ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, v38.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v8, v55 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v25.h, v6.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, v30.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v6.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v6.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v24.l, v7.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v37.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v55 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v23.h, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) 
| instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v7.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v38.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v38.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v52, v9 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v7.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, 0x300, v7.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, v37.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.h, v37.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v23.l, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v23.h, v7.h ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v22.h, v8.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, v36.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v55 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v23.l, v8.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, v37.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v8.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v8.l, 0x300, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v21.h, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v10, v55 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v22.l, v9.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v35.h, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v9.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.l, 0x300, 
v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v52, v10 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, 0x300, v9.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v22.l, v8.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v22.h, v8.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v36.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v36.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v52, v11 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v9.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, 0x300, v9.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.l, v35.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v10.h, v35.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v21.l, v9.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v21.h, v9.h ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v9.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v11, v55 -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v20.h, v10.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, v35.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v21.l, v10.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.h, v34.h, 3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, 0x300, v10.h -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v33.h, 3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v13, v55 -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v20.l, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.l, 3 -; GFX11-TRUE16-NEXT: 
v_or_b16 v11.h, v19.h, v11.h -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v11.l, 0x300, v11.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v52, v12 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, 0x300, v11.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v20.l, v10.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v20.h, v10.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v34.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v34.l, 3 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v52, v13 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v11.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, 0x300, v11.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.l, v33.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v12.h, v33.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v19.l, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v19.h, v11.h ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v11.h -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v19.l, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v11.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v18.h, v12.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v13, v55 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, v33.l, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, 0x300, v12.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v12.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.h, v32.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v52, v14 +; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v13.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v18.l, v12.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v18.h, v12.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v32.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v32.l, 3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v15, v55 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v18.l, v13.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v52, v15 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v18.h, 0x300, v13.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.h ; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.l, v31.h, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v15 -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v17.l, v13.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v13.l, 0x300, v13.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v14.h, v31.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v17.l, v13.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v17.h, v13.h ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v17.h, v14.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.l, v31.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v13.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v13.l -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v16.h, v14.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.l, 0x300, v14.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v15.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v55 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v14.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | 
instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v16.l, v14.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v17 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v55 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v55.h, 0x300, v15.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v16, v55 +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v52, v18 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v17.h, 0x300, v15.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v52.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v16.l, v14.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.h, v14.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v52, v17 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v52.l, 0x300, v15.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v15.h, 0x300, v15.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v52.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v52, v15 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -64573,32 +64297,32 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 -; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr48_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 @@ -64617,26 +64341,26 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 24, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 
v33, 8, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 ; GFX11-TRUE16-NEXT: .LBB96_2: ; %Flow @@ -64669,26 +64393,26 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 24, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v11 
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 ; GFX11-TRUE16-NEXT: .LBB96_4: ; %end @@ -64696,156 +64420,135 @@ define <64 x i8> @bitcast_v32i16_to_v64i8(<32 x i16> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v25.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v64.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v1.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v24.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v24.l 
-; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, 0 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v55.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v24, v1 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v2.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v54.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v23.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v55.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v1.h, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v54.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v2 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v3.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v24.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v55 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v25, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v2.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v54, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v25 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v52.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v3.l, v3.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.l -; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v5.l, 8, v51.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v23, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v52.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v4.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v51.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v22.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v4.l, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v24, v4 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v5.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v24.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v50.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v49.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v24, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v6.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v6 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v7.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v39.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v38.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 
v25.l, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v22.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v49.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v23, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v25 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v48.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v6.l, v6.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v39.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v22, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v7.l, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v38.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v24, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v8.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v20.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v37.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v24, v8 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v9.l, 
v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v24.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v36.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v22 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v8.l, v8.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v20.l -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v17.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v22 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v21, v24 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v35.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v34.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v20, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v21 -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v10.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v35.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v24, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v10.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v34.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v24, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v11.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v24.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v10.h, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v32.l +; GFX11-TRUE16-NEXT: 
v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v24, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v12.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v24, v12 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v13.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v30.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v29.l ; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v24, v13 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v14.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v28.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v17.l ; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v20, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v32.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v12.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v21 -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v11.l, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v31.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v18.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 
v11, v19, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v20 -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v12.l, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v30.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v19, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v20 -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v13.l, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v29.l -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v13.h, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v24, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v15.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v26.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v17.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v28.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v15.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v14.l, v14.h -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v17.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v24 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; 
GFX11-TRUE16-NEXT: v_or_b16 v24.h, v15.l, v15.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v26.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v17, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v16.l, v16.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v17, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v24, v15 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v16.l, v17.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v17.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v24.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v24, v16 ; GFX11-TRUE16-NEXT: s_clause 0x3 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 @@ -76701,32 +76404,32 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr25_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 ; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr19_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_lo16 @@ -76745,26 +76448,26 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 24, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 8, v10 -; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 ; GFX11-TRUE16-NEXT: .LBB104_2: ; %Flow @@ -76797,26 +76500,26 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 24, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 24, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 8, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 24, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 8, v12 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 8, v11 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v35, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v36, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v9 -; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 24, v8 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v39, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v52, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 8, v1 ; GFX11-TRUE16-NEXT: .LBB104_4: ; %end @@ -76824,156 +76527,135 @@ define <64 x i8> @bitcast_v32f16_to_v64i8(<32 x half> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v25.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v64.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v24.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.h, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v1.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v64.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v24.h ; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v24.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v18.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, 0 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v55.l +; 
GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v24, v1 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v2.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v54.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v23.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v55.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v1.h, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v25, 0xffff, v25 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v54.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v2 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v3.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v24.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v53.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v54, 0xffff, v55 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v25, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v2.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v3.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v17.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v54, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v25 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v52.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v3.l, v3.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v51.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v23, v24 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v52.l +; 
GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v4.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v4.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v51.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v22.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v4.l, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v24, v4 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v5.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v24.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v50.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff, v25 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v49.l ; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v24, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v6.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v6.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v48.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v8.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v6 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v7.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v39.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v38.l ; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v25.l, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v22.l -; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v17.h, 8, v49.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v23, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v25 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v48.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v7.l, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v22, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v6.l, v6.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v39.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v22, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v23 -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v7.l, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v38.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v24, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v8.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v8.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v37.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v20.l ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v21, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v9.l, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v37.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v10.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v24, v8 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v9.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v24.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 
v17.h, 8, v36.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff, v22 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v9.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v8.l, v8.h -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v20.l -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v10.l, v17.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v22 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v21, v24 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v35.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.h -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v11.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v34.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v20, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff, v21 -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v10.l, v17.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v35.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v24, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v10.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v10.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v34.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v12.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v24, v10 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v11.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v24.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v33.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v10.h, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 8, v32.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v24, v11 +; 
GFX11-TRUE16-NEXT: v_or_b16 v24.l, v12.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v12.h, v18.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v18.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v14.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v24, v12 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v13.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v30.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 8, v29.l ; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v24, v13 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v14.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v14.h, v18.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v28.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v17.l ; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v20, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v17.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v32.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v12.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v21 -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v11.l, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v31.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v13.l, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v18.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v19, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff, v20 -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v12.l, v12.h -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v30.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v19, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v20 -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v13.l, v17.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v29.l -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v13.h, v14.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.h, 0xff, v16.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v24, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v15.l, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v17.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 8, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v26.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v18, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v15.l, v17.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v14.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v28.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v15.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v14.l, v14.h -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v17.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v24 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v15.l, v15.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 
0xff, v16.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v26.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v17, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v16.l, v16.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v17, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v24, v15 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v16.l, v17.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v17.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v24.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v24, v16 ; GFX11-TRUE16-NEXT: s_clause 0x3 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 @@ -85692,59 +85374,59 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr28_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr113_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr24_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr26_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr112_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr27_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr103_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr102_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr23_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr30_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr29_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr101_lo16 +; GFX11-TRUE16-NEXT: ; 
implicit-def: $vgpr30_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr100_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr99_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr22_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr32_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr98_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr31_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr33_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr97_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr96_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr21_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr35_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr34_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr84_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr36_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr20_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr80_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr39_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr71_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr70_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: 
$vgpr19_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr52_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr55_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr65_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr64_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr18_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr68_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr53_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr66_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr85_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr69_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr54_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr67_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr51_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr87_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr50_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr17_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr82_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr38_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr81_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr37_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr83_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr49_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr86_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr48_lo16 ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0 @@ -85757,307 +85439,302 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[21:22], 24, v[7:8] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[23:24], 
24, v[3:4] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v15 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 24, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v14 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 8, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 24, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 8, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 8, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v10 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v10 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v9 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v5 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 24, v4 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 24, v2 ; 
GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v2 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v1 ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v28.h, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v2.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v4.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v6.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v49.h, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v26.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v27.h, v2.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v31.h, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v29.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v30.h, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v33.h, v6.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v38.h, v7.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v35.h, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v34.h, v8.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v65.h, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v48.h, v10.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v71.h, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v54.h, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v12.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.h, v13.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v68.h, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v66.h, v14.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.h, v15.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v82.h, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v81.h, v16.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v36.h, v8.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v55.h, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v37.h, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v39.h, v10.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v70.h, v11.l +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v52.h, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v53.h, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v85.h, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v69.h, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v67.h, v14.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v87.h, v15.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v83.h, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v86.h, v16.h ; GFX11-TRUE16-NEXT: .LBB108_2: ; %Flow ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB108_4 ; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true +; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v1 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v4 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v17, 16, v2 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v4, 0x40c00000, v4 :: v_dual_add_f32 v17, 0x40c00000, v17 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v18, 0x40c00000, v18 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v49, 0xffff0000, v11 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v2, 0x40c00000, v2 :: v_dual_lshlrev_b32 v11, 16, v11 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v1, 0x40c00000, v1 :: v_dual_add_f32 v4, 0x40c00000, v4 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v17, 0x40c00000, v17 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_bfe_u32 v21, v2, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v2 ; GFX11-TRUE16-NEXT: v_bfe_u32 v20, v17, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v17 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, 
v17, v17 -; GFX11-TRUE16-NEXT: v_bfe_u32 v21, v2, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v2 -; GFX11-TRUE16-NEXT: v_add3_u32 v20, v20, v17, 0x7fff -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff0000, v1 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-TRUE16-NEXT: v_add3_u32 v21, v21, v2, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v27, v20, v22, vcc_lo -; GFX11-TRUE16-NEXT: v_dual_add_f32 v18, 0x40c00000, v18 :: v_dual_add_f32 v1, 0x40c00000, v1 -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v27.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add3_u32 v20, v20, v17, 0x7fff ; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v18, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v26, v20, v22, vcc_lo ; GFX11-TRUE16-NEXT: v_bfe_u32 v20, v1, 16, 1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v26, v21, v23, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v18 +; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v18, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v26.h +; GFX11-TRUE16-NEXT: v_add3_u32 v20, v20, v1, 0x7fff +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v27, v21, v23, vcc_lo ; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v18 -; GFX11-TRUE16-NEXT: v_add3_u32 v20, v20, v1, 0x7fff ; GFX11-TRUE16-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v18, 0x7fff -; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v26 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | 
instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfi_b32 v2, 0xffff, v2, v27 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v28, v20, v21, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 ; GFX11-TRUE16-NEXT: v_bfe_u32 v18, v4, 16, 1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v3 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v19 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 24, v2 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v28.h ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v17, v22, vcc_lo ; GFX11-TRUE16-NEXT: v_add3_u32 v18, v18, v4, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v4 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v3 ; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v19, 16, 1 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v2 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v30, v18, v22, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v20, v1 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v20, 0xffff0000, v5 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v29, v18, v22, vcc_lo ; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v6 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 0x40c00000, v21 :: v_dual_lshlrev_b32 v6, 16, v6 ; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v19, 0x7fff ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v21, 0x40c00000, v21 :: v_dual_lshlrev_b32 v6, 16, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v3, 0x40c00000, v3 :: v_dual_add_f32 v20, 0x40c00000, v20 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v30, v17, v23, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v21, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_cndmask_b32 v29, v17, v23 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v21 ; GFX11-TRUE16-NEXT: v_bfe_u32 v18, v3, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v3 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3 ; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v21, 0x7fff -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v29.h ; GFX11-TRUE16-NEXT: v_add3_u32 v18, v18, v3, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v17.l, v30.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v18, v19, vcc_lo +; GFX11-TRUE16-NEXT: v_dual_add_f32 v6, 0x40c00000, v6 :: v_dual_add_f32 v5, 0x40c00000, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 24, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 8, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v31, v18, v19, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 -; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v20, v1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v32.h -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v3, v4, v23 :: v_dual_add_f32 v18, 0x40c00000, v22 -; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v17, v29 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v18, 0x40c00000, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v31.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v4, v23, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_bfe_u32 v21, v18, 16, 1 +; GFX11-TRUE16-NEXT: v_bfi_b32 v4, 0xffff, v17, v30 ; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v6, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_bfi_b32 v3, 0xffff, v19, v3 
-; GFX11-TRUE16-NEXT: v_bfe_u32 v21, v18, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v18 -; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v6, 0x7fff -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 24, v4 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v4 ; GFX11-TRUE16-NEXT: v_add3_u32 v19, v21, v18, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v6 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 8, v3 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 8, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v17, v21, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v31, v19, v22 :: v_dual_and_b32 v20, 0xffff0000, v5 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v20, 0x40c00000, v20 :: v_dual_lshlrev_b32 v5, 16, v5 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v8 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_dual_add_f32 v5, 0x40c00000, v5 :: v_dual_lshlrev_b32 v8, 16, v8 +; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v6, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v18 ; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v20, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v20 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v33.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 24, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 8, v4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v32, v17, v21, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18 +; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v20, 0x7fff +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 8, v3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v33, v19, v22, vcc_lo +; GFX11-TRUE16-NEXT: v_and_b32_e32 v19, 0xffff0000, v8 +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v8, 16, v8 ; GFX11-TRUE16-NEXT: v_bfe_u32 
v17, v5, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v5 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v20, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v20 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v8, 0x40c00000, v8 ; GFX11-TRUE16-NEXT: v_add3_u32 v17, v17, v5, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v17, v21, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v32.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v34, v17, v21 :: v_dual_add_f32 v19, 0x40c00000, v19 ; GFX11-TRUE16-NEXT: v_bfe_u32 v17, v8, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v19, 0x40c00000, v19 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v36.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v20, v6, v22, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v34.h ; GFX11-TRUE16-NEXT: v_bfe_u32 v5, v19, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v20, v6, v22, vcc_lo ; GFX11-TRUE16-NEXT: v_add3_u32 v6, v17, v8, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v17, 0x400000, v8 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v19 ; GFX11-TRUE16-NEXT: v_add3_u32 v5, v5, v19, 0x7fff +; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v19 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v35, v6, v17, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v7 -; GFX11-TRUE16-NEXT: v_bfi_b32 v6, 0xffff, v18, v31 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v34, v5, v22, vcc_lo +; 
GFX11-TRUE16-NEXT: v_bfi_b32 v6, 0xffff, v18, v33 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v36, v5, v22, vcc_lo ; GFX11-TRUE16-NEXT: v_bfi_b32 v5, 0xffff, v21, v20 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v20, 16, v10 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v7 ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v35.h -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 24, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 24, v6 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 8, v6 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_dual_add_f32 v20, 0x40c00000, v20 :: v_dual_add_f32 v7, 0x40c00000, v7 -; GFX11-TRUE16-NEXT: v_bfi_b32 v8, 0xffff, v8, v34 +; GFX11-TRUE16-NEXT: v_bfi_b32 v8, 0xffff, v8, v36 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 8, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v7, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v7 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v83, 24, v8 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v87, 8, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v82, 24, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v8 ; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v7, 0x7fff -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff0000, v10 ; GFX11-TRUE16-NEXT: v_bfe_u32 v7, v20, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v49, v19, v21, vcc_lo -; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 0x40c00000, v23 :: v_dual_add_f32 v10, 0x40c00000, v10 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v38, v19, v21, vcc_lo ; GFX11-TRUE16-NEXT: v_add3_u32 v7, v7, v20, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v21, 0x400000, v20 +; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v10, 0xffff0000, v10 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_add_f32 v17, 0x40c00000, v23 :: v_dual_add_f32 v10, 0x40c00000, v10 ; GFX11-TRUE16-NEXT: v_bfe_u32 v18, v17, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v17 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v10, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_add3_u32 v18, v18, v17, 0x7fff +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v10, 0x7fff -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v17, v18, v22, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v20, v20 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v10 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v49.h -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v39, v7, v21, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v38.h +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v37, v7, v21, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v10, 16, v12 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff0000, v12 -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v48, v19, v22 :: v_dual_lshlrev_b32 v7, 16, v9 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v7, 16, v9 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v39, v19, v22, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v10 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v39.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v37.h ; GFX11-TRUE16-NEXT: v_add_f32_e32 
v12, 0x40c00000, v12 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_bfe_u32 v22, v21, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v37, 0x400000, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v48, 0x400000, v21 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 -; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v19, v48 +; GFX11-TRUE16-NEXT: v_bfi_b32 v10, 0xffff, v19, v39 ; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v12, 16, 1 ; GFX11-TRUE16-NEXT: v_add3_u32 v22, v22, v21, 0x7fff -; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v7 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v9 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v50, 0x400000, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v70, 24, v10 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v49 ; GFX11-TRUE16-NEXT: v_add3_u32 v24, v24, v12, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v54, v22, v37, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v52, v22, v48 :: v_dual_add_f32 v9, 0x40c00000, v23 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v14 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 0x40c00000, v7 :: v_dual_lshlrev_b32 v14, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v71, 24, v10 +; GFX11-TRUE16-NEXT: v_bfe_u32 v20, v9, 16, 1 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_add_f32_e32 v14, 0x40c00000, v14 ; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v7, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, 0x400000, v7 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v22, 0xffff0000, v14 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v14, 16, v14 -; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v7, 0x7fff -; GFX11-TRUE16-NEXT: v_and_b32_e32 v38, 0xffff0000, v11 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v54.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; 
GFX11-TRUE16-NEXT: v_dual_add_f32 v14, 0x40c00000, v14 :: v_dual_lshlrev_b32 v11, 16, v11 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v65, v19, v25, vcc_lo -; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v9 -; GFX11-TRUE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v38 -; GFX11-TRUE16-NEXT: v_bfe_u32 v25, v14, 16, 1 +; GFX11-TRUE16-NEXT: v_add3_u32 v20, v20, v9, 0x7fff ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v80, 8, v10 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v52, v24, v50 :: v_dual_add_f32 v9, 0x40c00000, v23 +; GFX11-TRUE16-NEXT: v_add3_u32 v19, v19, v7, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v52.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v55, v19, v25, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12 ; GFX11-TRUE16-NEXT: v_bfe_u32 v19, v21, 16, 1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v7, v52 -; GFX11-TRUE16-NEXT: v_bfe_u32 v20, v9, 16, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v23, 0x400000, v9 +; GFX11-TRUE16-NEXT: v_bfe_u32 v25, v14, 16, 1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v53, v24, v50, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_bfi_b32 v12, 0xffff, v7, v53 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v20, v23, vcc_lo ; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v11 ; GFX11-TRUE16-NEXT: v_add3_u32 v11, v19, v21, 0x7fff -; GFX11-TRUE16-NEXT: v_add3_u32 v20, v20, v9, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v21 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v55, 24, v12 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v67, 8, v12 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v9, v20, v23, vcc_lo ; 
GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v55.h ; GFX11-TRUE16-NEXT: v_bfe_u32 v23, v7, 16, 1 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v20.l, v65.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v65, 24, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v66, 8, v12 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v11, v11, v19, vcc_lo ; GFX11-TRUE16-NEXT: v_add_f32_e32 v19, 0x40c00000, v22 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add3_u32 v21, v23, v7, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v7 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v23, 0xffff0000, v13 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 ; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v19, 16, 1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v13, 16, v13 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_add_f32_e32 v7, 0x40c00000, v23 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v71, v21, v22, vcc_lo +; GFX11-TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v20, v9 +; GFX11-TRUE16-NEXT: v_dual_add_f32 v7, 0x40c00000, v23 :: v_dual_cndmask_b32 v70, v21, v22 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add3_u32 v21, v24, v19, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v22, 0x400000, v19 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19 ; GFX11-TRUE16-NEXT: v_add3_u32 v23, v25, v14, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v14 ; GFX11-TRUE16-NEXT: v_bfe_u32 v25, v7, 16, 1 -; GFX11-TRUE16-NEXT: v_dual_add_f32 v13, 0x40c00000, v13 :: v_dual_cndmask_b32 v66, v21, v22 +; GFX11-TRUE16-NEXT: v_add_f32_e32 v13, 0x40c00000, v13 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v67, v21, v22, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v19, 0x400000, v7 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_add3_u32 v14, v25, v7, 0x7fff ; GFX11-TRUE16-NEXT: v_and_b32_e32 v21, 0xffff0000, v16 ; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v16, 16, v16 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v68, v23, v24, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v69, v23, v24, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7 ; GFX11-TRUE16-NEXT: v_bfe_u32 v23, v13, 16, 1 -; GFX11-TRUE16-NEXT: v_bfi_b32 v9, 0xffff, v20, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v81, 8, v9 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v16, 0x40c00000, v16 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v68.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v22.l, v69.h ; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v7, v14, v19 :: v_dual_add_f32 v14, 0x40c00000, v21 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v21, 16, v15 ; GFX11-TRUE16-NEXT: v_add3_u32 v19, v23, v13, 0x7fff @@ -86067,42 +85744,42 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_add_f32_e32 v21, 0x40c00000, v21 ; GFX11-TRUE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v25, 0x400000, v16 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v86, v19, v23, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v85, v19, v23, vcc_lo ; GFX11-TRUE16-NEXT: v_add3_u32 v13, v13, v16, 0x7fff ; GFX11-TRUE16-NEXT: v_bfe_u32 v23, v21, 16, 1 ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16 ; GFX11-TRUE16-NEXT: v_bfe_u32 v24, v14, 16, 1 ; GFX11-TRUE16-NEXT: v_add_f32_e32 v15, 0x40c00000, v15 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v38, 0x400000, v21 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v49, 0x400000, v21 ; GFX11-TRUE16-NEXT: v_add3_u32 v23, v23, v21, 0x7fff -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v82, v13, v25, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v83, v13, v25, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v21, v21 ; GFX11-TRUE16-NEXT: v_add3_u32 v19, v24, v14, 0x7fff ; GFX11-TRUE16-NEXT: v_or_b32_e32 v24, 0x400000, v14 -; GFX11-TRUE16-NEXT: v_bfe_u32 v37, v15, 16, 1 +; GFX11-TRUE16-NEXT: v_bfe_u32 v48, v15, 16, 1 ; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, 0x400000, v15 -; GFX11-TRUE16-NEXT: 
v_cndmask_b32_e32 v85, v23, v38, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v87, v23, v49, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v86.h -; GFX11-TRUE16-NEXT: v_add3_u32 v13, v37, v15, 0x7fff -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v71.h -; GFX11-TRUE16-NEXT: v_bfi_b32 v14, 0xffff, v22, v66 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v81, v19, v24, vcc_lo +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v21.l, v85.h +; GFX11-TRUE16-NEXT: v_add3_u32 v13, v48, v15, 0x7fff +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v23.l, v70.h +; GFX11-TRUE16-NEXT: v_bfi_b32 v14, 0xffff, v22, v67 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v86, v19, v24, vcc_lo ; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v15, v15 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v82.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v85.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v83.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v87.h ; GFX11-TRUE16-NEXT: v_bfi_b32 v11, 0xffff, v23, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 24, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 24, v14 ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v13, v13, v16, vcc_lo -; GFX11-TRUE16-NEXT: v_bfi_b32 v16, 0xffff, v19, v81 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v53, 8, v14 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v69, 8, v11 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v84, 8, v9 +; GFX11-TRUE16-NEXT: v_bfi_b32 v16, 0xffff, v19, v86 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v54, 8, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v68, 8, v11 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_bfi_b32 v15, 0xffff, v15, v13 ; GFX11-TRUE16-NEXT: v_bfi_b32 v13, 0xffff, v21, v7 ; GFX11-TRUE16-NEXT: v_bfi_b32 v7, 0xffff, v18, v17 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v37, 24, v16 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v38, 8, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v48, 24, v16 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v49, 8, v16 ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[17:18], 
24, v[15:16] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[18:19], 24, v[13:14] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[19:20], 24, v[11:12] @@ -86111,159 +85788,142 @@ define <64 x i8> @bitcast_v32bf16_to_v64i8(<32 x bfloat> %a, i32 %b) { ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[22:23], 24, v[5:6] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[23:24], 24, v[3:4] ; GFX11-TRUE16-NEXT: v_lshrrev_b64 v[24:25], 24, v[1:2] -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v51, 8, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v50, 8, v15 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v64, 8, v13 -; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 8, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 8, v7 ; GFX11-TRUE16-NEXT: .LBB108_4: ; %end ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v28.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v113.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v27.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v112.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v26.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v24.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v2.h, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v103.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v24.l, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v32.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v1.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v102.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.l, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v30.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v101.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v24 -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v4.l, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v24.l +; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v24.h, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v27.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v1.l, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v26.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v2.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v112.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v103.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v3.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v23.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v2.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v8, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v6 -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v36.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v3.h, v4.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v23.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v24, v1 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v2.l, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v3.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v24.h +; GFX11-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v31.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v102.l +; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v4.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v101.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v24, v2 +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v30.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v3.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v24.h ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v29.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v99.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v100.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v3.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v8, v24 +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v22.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v24, v3 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v4.l, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v5.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v24.h +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v34.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v99.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v5.h, v5.h, v6.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v98.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v4.h, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v4.l, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v33.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v97.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v5.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v6.l, v6.h -; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v49.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v8, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v5.l, v5.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v31.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v96.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v6.h, v7.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v10, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v12 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v24, v4 +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v33.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v5.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v24.h +; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v32.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v97.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v7.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v6.l, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v35.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v87.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v10, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v10, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v7.l, v8.h -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v65.h -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v7.h, v8.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v84.l -; GFX11-TRUE16-NEXT: 
v_lshlrev_b16 v11.l, 8, v69.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v10, v24 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v83.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v8.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v34.h -; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v8.h, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v39.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v24, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v6.l, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v7.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v24.h +; GFX11-TRUE16-NEXT: v_and_b16 v7.l, 0xff, v38.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v96.l +; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v7.h, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v84.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v24, v6 +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v36.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v7.l, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v24.h +; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v35.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v82.l +; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v20.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v24, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v8.l, v8.h +; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v9.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v24.h +; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v55.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v81.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.h, v9.h, v10.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 8, v80.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v8.l, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v20.l -; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v8.h, v10.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v71.h -; GFX11-TRUE16-NEXT: v_or_b16 
v24.h, v9.l, v9.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v10.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v48.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v70.l -; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v10.h, v11.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v14, v24 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v67.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v14, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v10.l, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v54.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v11.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.h, 8, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v64.l -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h -; GFX11-TRUE16-NEXT: v_and_b16 v12.h, 0xff, v86.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v14, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v16 -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v11.l, v11.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v12.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v24, v8 +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v39.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v9.l, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v24.h +; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v37.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v71.l +; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v11.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v19.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v24, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v10.l, v10.h +; GFX11-TRUE16-NEXT: v_or_b16 v10.h, v11.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v24.h +; GFX11-TRUE16-NEXT: v_and_b16 v11.l, 0xff, v70.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.l, 8, v68.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.h, v11.h, v12.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v66.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v10, v24, v10 +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v53.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v11.l, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v24.h ; 
GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v52.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v55.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v12.h, v13.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v16, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v19 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v65.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v13.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v12.l, v14.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v13.h -; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v68.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v53.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v16, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v16, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v13.l, v14.h -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v85.h -; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v13.h, v14.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.l, 8, v51.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v16, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v11, v24, v11 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v12.l, v12.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v13.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, v24.h +; GFX11-TRUE16-NEXT: v_and_b16 v13.l, 0xff, v85.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.l, 8, v64.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v13.h, v14.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v14.h, 8, v54.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v12, v24, v12 +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v67.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v13.l, v14.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v24.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v69.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v51.l +; GFX11-TRUE16-NEXT: v_and_b16 v15.h, 0xff, v15.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v17.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v13, v24, v13 +; GFX11-TRUE16-NEXT: 
v_or_b16 v24.l, v14.l, v14.h +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v15.l, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v24.h +; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v87.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.l, 8, v50.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v14.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.l, 0xff, v66.h -; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v14.h, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, 0xff, v82.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v38.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v18, 0xffff, v18 -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v14.l, v16.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v19.l, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v15.l, 0xff, v15.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v15.h, 8, v17.l -; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v14.h, v16.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v18, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v19 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v37.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v15.l, v15.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v18.l, v16.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v81.h +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v15.h, v16.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 8, v49.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v14, v24, v14 +; GFX11-TRUE16-NEXT: v_and_b16 v17.l, 0xff, v86.h +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v15.l, v16.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.l, v24.h +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, 0xff, v83.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 8, v48.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v17, v24 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v17, 0xffff, v18 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v16.l, v16.h -; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v17, v24 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v15, v24, v15 +; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v16.l, v16.h +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v17.l, v17.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v16.l, v24.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v16, v24, v16 ; GFX11-TRUE16-NEXT: s_clause 0x3 ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[1:4], off ; GFX11-TRUE16-NEXT: scratch_store_b128 v0, v[5:8], off offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll index 6fe66655de3d..4c485768bcbb 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.64bit.ll @@ -3065,13 +3065,12 @@ define i64 @bitcast_v8i8_to_i64(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v5.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8 @@ -3085,61 +3084,53 @@ define i64 @bitcast_v8i8_to_i64(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB26_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 -; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v4.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.l -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v4 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v0.l, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.l, v3.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v2 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB26_2 ; GFX11-TRUE16-NEXT: .LBB26_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v3.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v4.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v3.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v4.h, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.h, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v5.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v5 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v3.l, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.h, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -6214,13 +6205,12 @@ define double @bitcast_v8i8_to_f64(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v5.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8 @@ -6234,61 +6224,53 @@ define double @bitcast_v8i8_to_f64(<8 
x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB50_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v4.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.l -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v4 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v0.l, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.l, v3.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v2 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB50_2 ; GFX11-TRUE16-NEXT: .LBB50_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v3.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v4.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v3.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v4.h, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.h, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v5 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v3.l, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.h, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -9063,13 +9045,12 @@ define <2 x i32> @bitcast_v8i8_to_v2i32(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v5.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.l +; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8 @@ -9083,61 +9064,53 @@ define <2 x i32> @bitcast_v8i8_to_v2i32(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB70_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v4.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.l -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v4 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v0.l, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h +; 
GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.l, v3.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v2 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB70_2 ; GFX11-TRUE16-NEXT: .LBB70_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v3.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v4.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v3.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v4.h, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.h, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v3.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v5 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v3.l, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.h, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -11603,13 +11576,12 @@ define <2 x float> @bitcast_v8i8_to_v2f32(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v5.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8 @@ -11623,61 +11595,53 @@ define <2 x float> @bitcast_v8i8_to_v2f32(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB86_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v4.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.l 
-; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v4 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v0.l, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.l, v3.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v2 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB86_2 ; GFX11-TRUE16-NEXT: .LBB86_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v3.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v4.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v3.h, v0.l -; GFX11-TRUE16-NEXT: 
v_or_b16 v0.h, v4.h, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.h, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v5 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v3.l, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.h, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: 
s_setpc_b64 s[30:31] ; @@ -13829,13 +13793,12 @@ define <4 x i16> @bitcast_v8i8_to_v4i16(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v5.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8 @@ -13849,61 +13812,53 @@ define <4 x i16> @bitcast_v8i8_to_v4i16(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB98_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v4.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, 
v3 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.l -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v4 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v0.l, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.l, v3.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v2 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB98_2 ; GFX11-TRUE16-NEXT: .LBB98_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v3.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v4.l, 3 +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v3.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v4.h, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.h, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v5 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v3.l, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.h, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h +; GFX11-TRUE16-NEXT: 
v_add_nc_u16 v2.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -15655,13 +15610,12 @@ define <4 x half> @bitcast_v8i8_to_v4f16(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v5.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8 @@ -15675,61 +15629,53 @@ define <4 x half> @bitcast_v8i8_to_v4f16(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB106_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v4.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 
v4.h, v1.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.l -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v4 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v0.l, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.l, v3.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v2 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB106_2 ; GFX11-TRUE16-NEXT: .LBB106_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3 -; 
GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v3.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v4.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v3.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v4.h, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.h, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v5 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 
v0.h, v3.l, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.h, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -16966,13 +16912,12 @@ define <4 x bfloat> @bitcast_v8i8_to_v4bf16(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v5.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v7.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v5.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v7.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v8 @@ -16986,61 +16931,53 @@ define <4 x bfloat> @bitcast_v8i8_to_v4bf16(<8 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; GFX11-TRUE16-NEXT: .LBB110_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v3.l -; GFX11-TRUE16-NEXT: 
v_mov_b16_e32 v4.l, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v3.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.h, v4.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v2.l -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v4 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v0.l, v2.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.l, v3.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v0 +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v1.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v5, v2 +; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB110_2 ; GFX11-TRUE16-NEXT: .LBB110_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v5.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v3.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v4.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v3.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v4.h, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.h, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) 
| instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v2, v5 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v2.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v3.l, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v3.h, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.l, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.h, 0x300, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v4.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll index e5245f7bd71d..879e8520d8e1 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.96bit.ll @@ -1102,15 +1102,16 @@ define <3 x i32> @bitcast_v12i8_to_v3i32(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 
8, v8.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v11.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v12 @@ -1125,80 +1126,74 @@ define <3 x i32> @bitcast_v12i8_to_v3i32(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB6_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v7.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v1.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v4.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v1.l, v3.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v5 -; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v2.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v0.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v5.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v4.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v7 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v0 +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v1.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v3.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v4 +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v2.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 ; 
GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB6_2 ; GFX11-TRUE16-NEXT: .LBB6_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v7.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v7.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v8.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v6.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v5.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v5.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v5.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v5.h, v0.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v7.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v4.l, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.h, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v3.h, v0.l +; 
GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v7.h ; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v3.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v3.h, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -4241,15 +4236,16 @@ define <3 x float> @bitcast_v12i8_to_v3f32(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 
8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v8.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v8.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v9.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v3.h, 8, v11.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v12 @@ -4264,80 +4260,74 @@ define <3 x float> @bitcast_v12i8_to_v3f32(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB22_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v7.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v5.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v5.l -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v1.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v4.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v4, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v1.l, v3.h -; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v5 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v2.l, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.h +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v0.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v5.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v4.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v7 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v0 +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v1.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v3.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v4 +; GFX11-TRUE16-NEXT: v_or_b16 v7.l, v2.l, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr3_lo16 +; 
GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v2 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB22_2 ; GFX11-TRUE16-NEXT: .LBB22_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v7.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v7.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v6.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v8.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v6.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v6.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v5.h, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v5.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v5.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v5.h, v0.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v7.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v4.l, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.h, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.h, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v4.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l -; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v3.h, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v7, v5 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v7.h ; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v3.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v4, v7 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v7 +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v3.h, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v7, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v7, v2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -6885,16 +6875,16 @@ define <6 x bfloat> @bitcast_v12i8_to_v6bf16(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v9.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v10.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v12 @@ -6909,80 +6899,74 @@ define <6 x bfloat> @bitcast_v12i8_to_v6bf16(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB36_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v9.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v6.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v1.l, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v5.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v6.l +; 
GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v6.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v1.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v1.l, v4.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v5 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v2.l, v4.l -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v7 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.h, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v4.h +; 
GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v2.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB36_2 ; GFX11-TRUE16-NEXT: .LBB36_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v9.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v8.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v8.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v6.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v6.l, v0.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.h, v0.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, 0 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.h -; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.h, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.h ; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v4.h, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -8651,16 +8635,16 @@ define <6 x half> @bitcast_v12i8_to_v6f16(<12 x i8> %a, i32 %b) { ; 
GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v9.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v10.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v12 @@ -8675,80 +8659,74 @@ define <6 x half> @bitcast_v12i8_to_v6f16(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB40_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v9.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v7.l -; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v6.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v1.l, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v5.h -; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v6.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v1.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v1.l, v4.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v5 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v2.l, v4.l -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v7 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 +; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.h, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v4.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v2.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB40_2 ; GFX11-TRUE16-NEXT: .LBB40_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v9.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v8.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v8.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v6.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v6.l, v0.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.h, v0.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v0.h ; GFX11-TRUE16-NEXT: 
v_or_b16 v1.h, v5.h, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, 0 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.h, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.h ; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v4.h, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h +; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -10065,16 +10043,16 @@ define <6 x i16> @bitcast_v12i8_to_v6i16(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v9.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v7.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v5.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v5.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v10.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v9.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v11.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v10.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v9.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v12 @@ -10089,80 +10067,74 @@ define <6 x i16> @bitcast_v12i8_to_v6i16(<12 x i8> %a, i32 %b) { ; GFX11-TRUE16-NEXT: .LBB44_3: ; %cmp.false ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v9.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v8.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v8.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v7.l -; 
GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.h, v6.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0 -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v1.l, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v5.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v7.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v6.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v6.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v1.l, v5.h +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v10.l ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr9_lo16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v1.l, v4.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v5 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v6, v7 -; GFX11-TRUE16-NEXT: v_or_b16 v7.h, v2.l, v4.l -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 -; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v7 ; GFX11-TRUE16-NEXT: 
; implicit-def: $vgpr7_hi16 ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr7_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr8_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr10_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr6_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v0 +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v1.h, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.h +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v4.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_hi16 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v5 +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v2.l, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.h +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_lo16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr5_hi16 +; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr4_lo16 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB44_2 ; GFX11-TRUE16-NEXT: .LBB44_4: ; %cmp.true ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v9.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v8.h, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, v7.h, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v8.l, 3 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v6.l, 3 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, v7.l, 3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.l, v8.l, 3 ; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, v10.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v7.l, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v6.l, v0.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v6.h, 
v0.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v5.l, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x300, v0.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v5.h, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.l, 0 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x300, v0.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v0.h -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v1.l -; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.l, 0x300, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v4.h, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v2.h, 0xff, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v6 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v1.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v1.h, 0x300, v1.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.h ; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v4.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v5, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v1.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v9 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v9.h, 0x300, v2.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v9 +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v4.h, v2.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | 
instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v3.l, 0x300, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_nc_u16 v2.h, 0x300, v2.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-llvm-debuginfo-analyzer.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-llvm-debuginfo-analyzer.ll index 89fc6c062c29..d6922bc09ff0 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-llvm-debuginfo-analyzer.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-llvm-debuginfo-analyzer.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc %s -o %t.o -mcpu=gfx1030 -filetype=obj -O0 ; RUN: llvm-debuginfo-analyzer %t.o --print=all --attribute=all | FileCheck %s diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index 4cc39d93854a..1d3368b036d0 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -9022,13 +9022,12 @@ define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac ; GFX1164-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s7, v0 ; GFX1164-TRUE16-NEXT: .LBB15_2: ; GFX1164-TRUE16-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1164-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1164-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX1164-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1164-TRUE16-NEXT: v_readfirstlane_b32 
s2, v0 ; GFX1164-TRUE16-NEXT: v_cndmask_b16 v0.l, s6, 0, vcc -; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-TRUE16-NEXT: v_or_b16 v0.l, s2, v0.l ; GFX1164-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX1164-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 @@ -9101,13 +9100,12 @@ define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac ; GFX1132-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s6, v0 ; GFX1132-TRUE16-NEXT: .LBB15_2: ; GFX1132-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1132-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1132-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX1132-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1132-TRUE16-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-TRUE16-NEXT: v_cndmask_b16 v0.l, s4, 0, vcc_lo -; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-TRUE16-NEXT: v_or_b16 v0.l, s2, v0.l ; GFX1132-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX1132-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 @@ -9180,13 +9178,12 @@ define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac ; GFX1264-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s7, v0 ; GFX1264-TRUE16-NEXT: .LBB15_2: ; GFX1264-TRUE16-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1264-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1264-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX1264-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1264-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1264-TRUE16-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1264-TRUE16-NEXT: v_cndmask_b16 v0.l, s6, 0, vcc -; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; 
GFX1264-TRUE16-NEXT: v_or_b16 v0.l, s2, v0.l ; GFX1264-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX1264-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], null @@ -9259,13 +9256,12 @@ define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac ; GFX1232-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s6, v0 ; GFX1232-TRUE16-NEXT: .LBB15_2: ; GFX1232-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s5 -; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1232-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1232-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX1232-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1232-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1232-TRUE16-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1232-TRUE16-NEXT: v_cndmask_b16 v0.l, s4, 0, vcc_lo -; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232-TRUE16-NEXT: v_or_b16 v0.l, s2, v0.l ; GFX1232-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX1232-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], null @@ -9662,12 +9658,11 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1164-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s11, v2 ; GFX1164-TRUE16-NEXT: .LBB16_4: ; %Flow ; GFX1164-TRUE16-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1164-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1164-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX1164-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-TRUE16-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164-TRUE16-NEXT: v_mad_u16 v0.l, s10, v4.l, s2 ; GFX1164-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX1164-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 @@ -9789,12 +9784,11 @@ 
define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1132-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2 ; GFX1132-TRUE16-NEXT: .LBB16_4: ; %Flow ; GFX1132-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s9 -; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1132-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1132-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX1132-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-TRUE16-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-TRUE16-NEXT: v_mad_u16 v0.l, s8, v4.l, s2 ; GFX1132-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX1132-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 @@ -9916,13 +9910,12 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1264-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s11, v2 ; GFX1264-TRUE16-NEXT: .LBB16_4: ; %Flow ; GFX1264-TRUE16-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1264-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1264-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX1264-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1264-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1264-TRUE16-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1264-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1264-TRUE16-NEXT: v_mad_u16 v0.l, s10, v4.l, s2 ; GFX1264-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX1264-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], null @@ -10048,13 +10041,12 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa ; GFX1232-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v2 ; GFX1232-TRUE16-NEXT: .LBB16_4: ; %Flow 
; GFX1232-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s9 -; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX1232-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1232-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX1232-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX1232-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX1232-TRUE16-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1232-TRUE16-NEXT: s_wait_alu 0xf1ff -; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232-TRUE16-NEXT: v_mad_u16 v0.l, s8, v4.l, s2 ; GFX1232-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX1232-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], null @@ -10734,15 +10726,15 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1164-TRUE16-NEXT: s_mov_b64 s[2:3], 0 ; GFX1164-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX1164-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s9, v1 +; GFX1164-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX1164-TRUE16-NEXT: v_add_f16_e32 v0.l, s8, v0.l ; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX1164-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s9, v0 -; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1164-TRUE16-NEXT: v_and_or_b32 v0, v1, s10, v0 ; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX1164-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1164-TRUE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1164-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc ; GFX1164-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -10828,14 +10820,14 @@ define amdgpu_kernel void 
@uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1132-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX1132-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX1132-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1132-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v1 +; GFX1132-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX1132-TRUE16-NEXT: v_add_f16_e32 v0.l, s8, v0.l ; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1132-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX1132-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s2, v0 -; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-TRUE16-NEXT: v_and_or_b32 v0, v1, s3, v0 +; GFX1132-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX1132-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], 0 glc ; GFX1132-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -10920,15 +10912,15 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1264-TRUE16-NEXT: s_mov_b64 s[2:3], 0 ; GFX1264-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX1264-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1264-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s9, v1 +; GFX1264-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX1264-TRUE16-NEXT: v_add_f16_e32 v0.l, s8, v0.l ; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1264-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX1264-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s9, v0 -; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1264-TRUE16-NEXT: v_and_or_b32 v0, v1, s10, v0 ; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX1264-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX1264-TRUE16-NEXT: v_mov_b32_e32 v2, v0 ; GFX1264-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1264-TRUE16-NEXT: s_wait_loadcnt 0x0 @@ -11014,14 +11006,14 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp ; GFX1232-TRUE16-NEXT: s_mov_b32 s6, -1 ; GFX1232-TRUE16-NEXT: .LBB18_1: ; %atomicrmw.start ; GFX1232-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX1232-TRUE16-NEXT: v_lshrrev_b32_e32 v0, s2, v1 +; GFX1232-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX1232-TRUE16-NEXT: v_add_f16_e32 v0.l, s8, v0.l ; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1232-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX1232-TRUE16-NEXT: v_lshlrev_b32_e32 v0, s2, v0 -; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1232-TRUE16-NEXT: v_and_or_b32 v0, v1, s3, v0 +; GFX1232-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1232-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0 ; GFX1232-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[2:3], off, s[4:7], null th:TH_ATOMIC_RETURN scope:SCOPE_SYS ; GFX1232-TRUE16-NEXT: s_wait_loadcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll b/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll new file mode 100644 index 000000000000..89fe0ab526a8 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/av-split-dead-valno-crash.ll @@ -0,0 +1,211 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc 
-mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -stress-regalloc=18 < %s | FileCheck %s + +define amdgpu_kernel void @vgpr_mfma_pass_av_split_crash(double %arg1, i1 %arg2, i1 %cond.i.i.i2295, ptr addrspace(1) %ptr, ptr %ptr1) #0 { +; CHECK-LABEL: vgpr_mfma_pass_av_split_crash: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_load_dword s0, s[4:5], 0x8 +; CHECK-NEXT: s_load_dwordx2 s[10:11], s[4:5], 0x0 +; CHECK-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x10 +; CHECK-NEXT: v_mov_b32_e32 v1, 0x3e21eeb6 +; CHECK-NEXT: v_mov_b32_e32 v2, 0xa17f65f6 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_bitcmp1_b32 s0, 0 +; CHECK-NEXT: s_cselect_b64 s[16:17], -1, 0 +; CHECK-NEXT: s_xor_b64 s[18:19], s[16:17], -1 +; CHECK-NEXT: s_bitcmp1_b32 s0, 8 +; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] +; CHECK-NEXT: s_xor_b64 s[20:21], s[2:3], -1 +; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 +; CHECK-NEXT: s_and_b64 s[2:3], exec, s[2:3] +; CHECK-NEXT: v_mov_b32_e32 v0, 0x9037ab78 +; CHECK-NEXT: v_mov_b32_e32 v3, 0xbe927e4f +; CHECK-NEXT: v_mov_b32_e32 v4, 0x19f4ec90 +; CHECK-NEXT: v_mov_b32_e32 v5, 0x3efa01a0 +; CHECK-NEXT: v_mov_b32_e32 v6, 0x16c16967 +; CHECK-NEXT: v_mov_b32_e32 v7, 0xbf56c16c +; CHECK-NEXT: v_mov_b32_e32 v8, 0x69efb384 +; CHECK-NEXT: v_mov_b32_e32 v9, 0x3f4b2bb0 +; CHECK-NEXT: v_mov_b32_e32 v10, 0xa57d9582 +; CHECK-NEXT: v_mov_b32_e32 v11, 0xbf8c6ea4 +; CHECK-NEXT: v_mov_b32_e32 v12, 0xe82d3ff0 +; CHECK-NEXT: v_mov_b32_e32 v13, 0xbfa59976 +; CHECK-NEXT: v_mov_b32_e32 v14, 0x8427b883 +; CHECK-NEXT: v_mov_b32_e32 v15, 0x3fae1bb4 +; CHECK-NEXT: s_mov_b64 s[22:23], 0 +; CHECK-NEXT: v_mov_b32_e32 v16, 0x57b87036 +; CHECK-NEXT: v_mov_b32_e32 v17, 0x3fb3b136 +; CHECK-NEXT: s_and_b64 s[4:5], exec, s[16:17] +; CHECK-NEXT: v_mov_b32_e32 v18, 0x55555523 +; CHECK-NEXT: v_mov_b32_e32 v19, 0xbfd55555 +; CHECK-NEXT: s_and_b64 s[6:7], exec, s[18:19] +; CHECK-NEXT: v_mov_b32_e32 v20, 0 +; CHECK-NEXT: ; implicit-def: $agpr0_agpr1 +; CHECK-NEXT: ; 
implicit-def: $vgpr22_vgpr23 +; CHECK-NEXT: s_branch .LBB0_2 +; CHECK-NEXT: .LBB0_1: ; %Flow9 +; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1 +; CHECK-NEXT: s_andn2_b64 vcc, exec, s[24:25] +; CHECK-NEXT: s_cbranch_vccz .LBB0_17 +; CHECK-NEXT: .LBB0_2: ; %._crit_edge1942.i.i.i3548 +; CHECK-NEXT: ; =>This Loop Header: Depth=1 +; CHECK-NEXT: ; Child Loop BB0_6 Depth 2 +; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1] +; CHECK-NEXT: s_cbranch_vccnz .LBB0_9 +; CHECK-NEXT: ; %bb.3: ; %.preheader1868.i.i.i3244 +; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1 +; CHECK-NEXT: s_mov_b64 vcc, s[4:5] +; CHECK-NEXT: s_cbranch_vccz .LBB0_10 +; CHECK-NEXT: ; %bb.4: ; %.preheader1855.i.i.i3329.preheader +; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1 +; CHECK-NEXT: v_mov_b64_e32 v[24:25], s[14:15] +; CHECK-NEXT: flat_load_dwordx2 v[24:25], v[24:25] +; CHECK-NEXT: v_mov_b64_e32 v[26:27], v[0:1] +; CHECK-NEXT: v_mov_b64_e32 v[28:29], v[2:3] +; CHECK-NEXT: v_mov_b64_e32 v[30:31], v[16:17] +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_fmac_f64_e32 v[26:27], 0, v[24:25] +; CHECK-NEXT: v_fmac_f64_e32 v[28:29], 0, v[26:27] +; CHECK-NEXT: v_mov_b64_e32 v[26:27], v[4:5] +; CHECK-NEXT: v_fmac_f64_e32 v[26:27], 0, v[28:29] +; CHECK-NEXT: v_mov_b64_e32 v[28:29], v[6:7] +; CHECK-NEXT: v_fmac_f64_e32 v[28:29], 0, v[26:27] +; CHECK-NEXT: v_mov_b64_e32 v[26:27], v[8:9] +; CHECK-NEXT: v_fmac_f64_e32 v[26:27], 0, v[28:29] +; CHECK-NEXT: v_mov_b64_e32 v[28:29], v[10:11] +; CHECK-NEXT: v_fmac_f64_e32 v[28:29], 0, v[26:27] +; CHECK-NEXT: v_mov_b64_e32 v[26:27], v[12:13] +; CHECK-NEXT: v_fmac_f64_e32 v[26:27], 0, v[28:29] +; CHECK-NEXT: v_mov_b64_e32 v[28:29], v[14:15] +; CHECK-NEXT: v_fmac_f64_e32 v[28:29], 0, v[26:27] +; CHECK-NEXT: v_fmac_f64_e32 v[30:31], 0, v[28:29] +; CHECK-NEXT: v_mov_b64_e32 v[26:27], v[18:19] +; CHECK-NEXT: v_fmac_f64_e32 v[26:27], 0, v[30:31] +; CHECK-NEXT: v_mov_b64_e32 v[30:31], 0 +; CHECK-NEXT: s_branch .LBB0_6 +; CHECK-NEXT: .LBB0_5: ; %Flow +; CHECK-NEXT: ; in Loop: 
Header=BB0_6 Depth=2 +; CHECK-NEXT: s_and_b64 vcc, exec, s[8:9] +; CHECK-NEXT: s_cbranch_vccnz .LBB0_11 +; CHECK-NEXT: .LBB0_6: ; %.preheader1855.i.i.i3329 +; CHECK-NEXT: ; Parent Loop BB0_2 Depth=1 +; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 +; CHECK-NEXT: v_mov_b64_e32 v[28:29], v[30:31] +; CHECK-NEXT: s_mov_b64 s[24:25], -1 +; CHECK-NEXT: s_mov_b64 s[8:9], -1 +; CHECK-NEXT: s_mov_b64 vcc, s[2:3] +; CHECK-NEXT: ; implicit-def: $vgpr30_vgpr31 +; CHECK-NEXT: s_cbranch_vccz .LBB0_5 +; CHECK-NEXT: ; %bb.7: ; %.lr.ph2070.i.i.i3291 +; CHECK-NEXT: ; in Loop: Header=BB0_6 Depth=2 +; CHECK-NEXT: v_accvgpr_read_b32 v31, a1 +; CHECK-NEXT: v_accvgpr_read_b32 v30, a0 +; CHECK-NEXT: s_mov_b64 s[8:9], s[18:19] +; CHECK-NEXT: s_mov_b64 vcc, s[6:7] +; CHECK-NEXT: s_cbranch_vccz .LBB0_5 +; CHECK-NEXT: ; %bb.8: ; %.preheader1856.preheader.i.i.i3325 +; CHECK-NEXT: ; in Loop: Header=BB0_6 Depth=2 +; CHECK-NEXT: s_mov_b64 s[24:25], 0 +; CHECK-NEXT: v_mov_b64_e32 v[30:31], v[26:27] +; CHECK-NEXT: s_mov_b64 s[8:9], 0 +; CHECK-NEXT: s_branch .LBB0_5 +; CHECK-NEXT: .LBB0_9: ; in Loop: Header=BB0_2 Depth=1 +; CHECK-NEXT: v_mov_b64_e32 v[24:25], s[10:11] +; CHECK-NEXT: v_accvgpr_write_b32 a0, v24 +; CHECK-NEXT: s_mov_b64 s[22:23], 0 +; CHECK-NEXT: v_accvgpr_write_b32 a1, v25 +; CHECK-NEXT: s_mov_b64 s[8:9], s[20:21] +; CHECK-NEXT: s_branch .LBB0_15 +; CHECK-NEXT: .LBB0_10: ; in Loop: Header=BB0_2 Depth=1 +; CHECK-NEXT: s_mov_b64 s[8:9], -1 +; CHECK-NEXT: v_mov_b64_e32 v[22:23], 0 +; CHECK-NEXT: s_branch .LBB0_15 +; CHECK-NEXT: .LBB0_11: ; %loop.exit.guard +; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1 +; CHECK-NEXT: s_and_b64 vcc, exec, s[24:25] +; CHECK-NEXT: s_cbranch_vccz .LBB0_13 +; CHECK-NEXT: ; %bb.12: ; %._crit_edge2105.i.i.i2330.loopexit +; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1 +; CHECK-NEXT: v_cmp_nlg_f64_e64 s[8:9], 0, v[28:29] +; CHECK-NEXT: v_accvgpr_write_b32 a0, v24 +; CHECK-NEXT: v_cndmask_b32_e64 v23, v23, 0, s[16:17] +; CHECK-NEXT: v_cndmask_b32_e64 v26, 0, 1, 
s[8:9] +; CHECK-NEXT: v_mov_b32_e32 v27, v26 +; CHECK-NEXT: s_and_b64 s[8:9], exec, s[16:17] +; CHECK-NEXT: v_cndmask_b32_e64 v22, v22, 0, s[16:17] +; CHECK-NEXT: global_store_dwordx2 v20, v[26:27], s[12:13] +; CHECK-NEXT: s_cselect_b32 s23, s23, 0 +; CHECK-NEXT: s_cselect_b32 s22, s22, 0 +; CHECK-NEXT: s_mov_b64 s[8:9], -1 +; CHECK-NEXT: s_branch .LBB0_14 +; CHECK-NEXT: .LBB0_13: ; in Loop: Header=BB0_2 Depth=1 +; CHECK-NEXT: v_accvgpr_write_b32 a0, v24 +; CHECK-NEXT: s_mov_b64 s[8:9], 0 +; CHECK-NEXT: v_mov_b64_e32 v[22:23], 0 +; CHECK-NEXT: .LBB0_14: ; %Flow6 +; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1 +; CHECK-NEXT: v_accvgpr_write_b32 a1, v25 +; CHECK-NEXT: .LBB0_15: ; %Flow6 +; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1 +; CHECK-NEXT: s_mov_b64 s[24:25], -1 +; CHECK-NEXT: s_and_b64 vcc, exec, s[8:9] +; CHECK-NEXT: s_cbranch_vccz .LBB0_1 +; CHECK-NEXT: ; %bb.16: ; %._crit_edge2105.i.i.i2330 +; CHECK-NEXT: ; in Loop: Header=BB0_2 Depth=1 +; CHECK-NEXT: v_mov_b32_e32 v21, v20 +; CHECK-NEXT: s_mov_b64 s[24:25], 0 +; CHECK-NEXT: global_store_dwordx2 v20, v[20:21], s[12:13] +; CHECK-NEXT: s_branch .LBB0_1 +; CHECK-NEXT: .LBB0_17: ; %DummyReturnBlock +; CHECK-NEXT: s_endpgm +entry: + br label %._crit_edge1942.i.i.i3548 + +._crit_edge1942.i.i.i3548: ; preds = %._crit_edge2105.i.i.i2330, %entry + %.sroa.02591.0.i.i.i226323 = phi double [ poison, %entry ], [ %.sroa.02591.3.i.i.i2301, %._crit_edge2105.i.i.i2330 ] + %.sroa.3.0.i.i.i2270 = phi double [ poison, %entry ], [ %.sroa.3.3.i.i.i2308, %._crit_edge2105.i.i.i2330 ] + %.014942244.i.i.i2280 = phi double [ 0.000000e+00, %entry ], [ %.31497.i.i.i2317, %._crit_edge2105.i.i.i2330 ] + br i1 %cond.i.i.i2295, label %.preheader1868.i.i.i3244, label %._crit_edge2105.i.i.i2330 + +.preheader1868.i.i.i3244: ; preds = %._crit_edge1942.i.i.i3548 + %i = load double, ptr %ptr1, align 8 + %i3 = call double @llvm.fma.f64(double %i, double 0.000000e+00, double 0x3E21EEB69037AB78) + %i4 = call double @llvm.fma.f64(double 
0.000000e+00, double %i3, double 0xBE927E4FA17F65F6) + %i5 = call double @llvm.fma.f64(double 0.000000e+00, double %i4, double 0x3EFA01A019F4EC90) + %i6 = call double @llvm.fma.f64(double 0.000000e+00, double %i5, double 0xBF56C16C16C16967) + %spec.select.i.i.i3288 = select i1 %arg2, double 0.000000e+00, double %.sroa.3.0.i.i.i2270 + br i1 %arg2, label %.preheader1855.i.i.i3329, label %._crit_edge2105.i.i.i2330 + +.lr.ph2070.i.i.i3291: ; preds = %.preheader1855.i.i.i3329 + br i1 %arg2, label %.preheader1855.i.i.i3329, label %.preheader1856.preheader.i.i.i3325 + +.preheader1856.preheader.i.i.i3325: ; preds = %.lr.ph2070.i.i.i3291 + %i11 = call double @llvm.fma.f64(double 0.000000e+00, double %i6, double 0x3F4B2BB069EFB384) + %i14 = call double @llvm.fma.f64(double 0.000000e+00, double %i11, double 0xBF8C6EA4A57D9582) + %i18 = call double @llvm.fma.f64(double 0.000000e+00, double %i14, double 0xBFA59976E82D3FF0) + %i21 = call double @llvm.fma.f64(double 0.000000e+00, double %i18, double 0x3FAE1BB48427B883) + %i23 = call double @llvm.fma.f64(double 0.000000e+00, double %i21, double 0x3FB3B13657B87036) + %i28 = call double @llvm.fma.f64(double 0.000000e+00, double %i23, double 0xBFD5555555555523) + br label %.preheader1855.i.i.i3329 + +.preheader1855.i.i.i3329: ; preds = %.preheader1856.preheader.i.i.i3325, %.lr.ph2070.i.i.i3291, %.preheader1868.i.i.i3244 + %.sroa.02591.4.i.i.i3335 = phi double [ %i28, %.preheader1856.preheader.i.i.i3325 ], [ %.sroa.02591.0.i.i.i226323, %.lr.ph2070.i.i.i3291 ], [ 0.000000e+00, %.preheader1868.i.i.i3244 ] + %.21496.ph.i.i.i3348 = select i1 %arg2, double %.014942244.i.i.i2280, double 0.000000e+00 + %i31 = fcmp one double %.sroa.02591.4.i.i.i3335, 0.000000e+00 + %i32 = select i1 %i31, <2 x i32> zeroinitializer, <2 x i32> splat (i32 1) + store <2 x i32> %i32, ptr addrspace(1) %ptr, align 8 + br i1 %cond.i.i.i2295, label %.lr.ph2070.i.i.i3291, label %._crit_edge2105.i.i.i2330 + +._crit_edge2105.i.i.i2330: ; preds = 
%.preheader1855.i.i.i3329, %.preheader1868.i.i.i3244, %._crit_edge1942.i.i.i3548 + %.sroa.02591.3.i.i.i2301 = phi double [ %.sroa.02591.0.i.i.i226323, %.preheader1868.i.i.i3244 ], [ %arg1, %._crit_edge1942.i.i.i3548 ], [ %i, %.preheader1855.i.i.i3329 ] + %.sroa.3.3.i.i.i2308 = phi double [ 0.000000e+00, %.preheader1868.i.i.i3244 ], [ %.sroa.3.0.i.i.i2270, %._crit_edge1942.i.i.i3548 ], [ %spec.select.i.i.i3288, %.preheader1855.i.i.i3329 ] + %.31497.i.i.i2317 = phi double [ %.014942244.i.i.i2280, %.preheader1868.i.i.i3244 ], [ 0.000000e+00, %._crit_edge1942.i.i.i3548 ], [ %.21496.ph.i.i.i3348, %.preheader1855.i.i.i3329 ] + store double 0.000000e+00, ptr addrspace(1) %ptr, align 8 + br label %._crit_edge1942.i.i.i3548 +} + +declare double @llvm.fma.f64(double, double, double) #1 + +attributes #0 = { "amdgpu-waves-per-eu"="8,8" "target-cpu"="gfx942" } +attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index 505ddc8c3b57..10e523d1a0cf 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -37774,9 +37774,10 @@ define bfloat @v_uitofp_i16_to_bf16(i16 %x) { ; GFX11TRUE16-LABEL: v_uitofp_i16_to_bf16: ; GFX11TRUE16: ; %bb.0: ; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 +; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX11TRUE16-NEXT: v_cvt_f32_u32_e32 v0, v1 ; GFX11TRUE16-NEXT: v_bfe_u32 v1, v0, 16, 1 ; GFX11TRUE16-NEXT: v_or_b32_e32 v2, 0x400000, v0 ; GFX11TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 @@ -40750,12 +40751,11 @@ define amdgpu_ps i32 @s_select_bf16(bfloat inreg %a, bfloat inreg %b, i32 %c) { ; ; GFX11TRUE16-LABEL: s_select_bf16: ; GFX11TRUE16: ; %bb.0: +; 
GFX11TRUE16-NEXT: v_mov_b16_e32 v1.l, s0 ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, s1, v0.l, vcc_lo -; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, s1, v1.l, vcc_lo ; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11TRUE16-NEXT: ; return to shader part epilog ; diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll index f4b432dce8c8..0ceb9019eb99 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll @@ -3443,15 +3443,14 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_add_f16_e32 v1.l, v1.l, v0.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX12-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 ; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 @@ -3569,14 +3568,13 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, v1.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 ; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -3884,15 +3882,14 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_add_f16_e32 v1.l, v1.l, v0.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 
0xffff, v1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 ; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 @@ -4007,14 +4004,13 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_ ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, v1.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 ; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -4328,15 +4324,14 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v7 
+; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.h, 0 ; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_add_f16_e32 v6.l, v6.l, v5.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6 ; GFX12-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 @@ -4556,15 +4551,14 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu ; GFX11-TRUE16-NEXT: ; Child Loop BB15_4 Depth 2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, v4, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.h, 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_f16_e32 v6.l, v6.l, v5.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v6, v4, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v6, v7, v11, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v9, v7 :: 
v_dual_mov_b32 v8, v6 ; GFX11-TRUE16-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1 ; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll index 6f1675edbe58..cad4c39eaf39 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll @@ -2512,16 +2512,16 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 ; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 @@ -2640,20 +2640,19 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen ; 
GFX11-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v1.l, v1.l ; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 ; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -2973,16 +2972,16 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; 
GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 ; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 @@ -3098,20 +3097,19 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_ ; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen ; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v1.l, v1.l ; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 ; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], 
v3, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -3437,16 +3435,16 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v9, v6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v4.h, v4.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v11, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_dual_mov_b32 v8, v6 :: v_dual_mov_b32 v7, v5 ; GFX12-TRUE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 @@ -3672,16 +3670,16 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu ; GFX11-TRUE16-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v9, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v4.h, v4.l ; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v11, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, v6 :: v_dual_mov_b32 v7, v5 ; GFX11-TRUE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll index acb27be1846b..6275afd2c699 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll @@ -2512,16 +2512,16 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l ; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v1.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX12-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 ; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 @@ -2640,20 +2640,19 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen ; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v1.l, v1.l ; GFX11-TRUE16-NEXT: v_min_f16_e32 v1.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1 ; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -2973,16 +2972,16 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX12-TRUE16-NEXT: s_wait_storecnt 
0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v1.l, v1.l ; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v1.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 ; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 ; GFX12-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 @@ -3098,20 +3097,19 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_ ; GFX11-TRUE16-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen ; GFX11-TRUE16-NEXT: s_not_b32 s6, s5 ; GFX11-TRUE16-NEXT: s_mov_b32 s5, 0 -; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB11_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, s4, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v1.l, v1.l ; GFX11-TRUE16-NEXT: v_min_f16_e32 v1.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: 
v_and_b32_e32 v1, 0xffff, v1 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, s4, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v1, v2, s6, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 ; GFX11-TRUE16-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) @@ -3437,16 +3435,16 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX12-TRUE16-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v9, v6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v4.h, v4.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v11, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_dual_mov_b32 v8, v6 :: v_dual_mov_b32 v7, v5 ; GFX12-TRUE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX12-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 @@ -3672,16 +3670,16 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu ; GFX11-TRUE16-NEXT: ; Child Loop BB12_4 Depth 2 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v9, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; 
GFX11-TRUE16-NEXT: s_mov_b32 s2, exec_lo ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v4.h, v4.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v9, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v11, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v8, v6 :: v_dual_mov_b32 v7, v5 ; GFX11-TRUE16-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1 ; GFX11-TRUE16-NEXT: ; => This Inner Loop Header: Depth=2 diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll index ff80250bfc88..2db7b28c7de9 100644 --- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll +++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll @@ -2745,6 +2745,15 @@ define amdgpu_cs void @amdgpu_cs_v32i1(<32 x i1> %arg0) { ; ; GFX11-TRUE16-LABEL: amdgpu_cs_v32i1: ; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: v_and_b16 v26.l, v26.l, 1 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 1, v25.l +; GFX11-TRUE16-NEXT: v_and_b16 v24.l, v24.l, 1 +; GFX11-TRUE16-NEXT: v_and_b16 v20.h, v22.l, 1 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 1, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v20.l, v20.l, 1 +; GFX11-TRUE16-NEXT: v_and_b16 v17.h, v18.l, 1 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 1, v17.l +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, v16.l, 1 ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, v10.l, 1 ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 1, v9.l ; GFX11-TRUE16-NEXT: v_and_b16 v8.l, v8.l, 1 @@ -2754,6 +2763,18 @@ define amdgpu_cs void 
@amdgpu_cs_v32i1(<32 x i1> %arg0) { ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, v2.l, 1 ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 1, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, v0.l, 1 +; GFX11-TRUE16-NEXT: v_and_b16 v30.l, v30.l, 1 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 1, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v28.l, v28.l, 1 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 3, v27.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v26.l, 2, v26.l +; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v24.l, v25.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 3, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v20.h, 2, v20.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v20.l, v21.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.h, 3, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 2, v17.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.l, v16.l, v17.l ; GFX11-TRUE16-NEXT: v_and_b16 v14.l, v14.l, 1 ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 1, v13.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, v12.l, 1 @@ -2766,15 +2787,15 @@ define amdgpu_cs void @amdgpu_cs_v32i1(<32 x i1> %arg0) { ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 3, v3.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 2, v1.h ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v25.h, v26.l, 1 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.l, 1, v25.l -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, v24.l, 1 -; GFX11-TRUE16-NEXT: v_and_b16 v22.l, v22.l, 1 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v21.l, 1, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v20.l, v20.l, 1 -; GFX11-TRUE16-NEXT: v_and_b16 v18.l, v18.l, 1 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 1, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v16.l, v16.l, 1 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 3, v31.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.h, 2, v30.l +; GFX11-TRUE16-NEXT: v_or_b16 v28.l, v28.l, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v25.h, v25.h, v26.l +; GFX11-TRUE16-NEXT: v_and_b16 v21.h, v22.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v18.l, v22.l, v20.h +; GFX11-TRUE16-NEXT: v_and_b16 v16.h, v16.h, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v17.h, 
v18.h, v17.h +; GFX11-TRUE16-NEXT: v_and_b16 v16.l, v16.l, 3 ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 3, v15.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.h, 2, v14.l ; GFX11-TRUE16-NEXT: v_or_b16 v8.h, v12.l, v13.l @@ -2784,65 +2805,42 @@ define amdgpu_cs void @amdgpu_cs_v32i1(<32 x i1> %arg0) { ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, v0.h, 3 ; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v2.h, v1.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, v0.l, 3 -; GFX11-TRUE16-NEXT: v_and_b16 v30.l, v30.l, 1 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v29.l, 1, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v28.l, v28.l, 1 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.h, 3, v27.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v25.h, 2, v25.h -; GFX11-TRUE16-NEXT: v_or_b16 v24.l, v24.l, v25.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v23.l, 3, v23.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v22.l, 2, v22.l -; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v20.l, v21.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.h, 3, v19.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v18.l, 2, v18.l -; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.l, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v26.h, v28.h, v29.h +; GFX11-TRUE16-NEXT: v_and_b16 v24.h, v28.l, 3 +; GFX11-TRUE16-NEXT: v_or_b16 v19.l, v21.h, v25.h +; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v16.h, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v15.h, v16.l, v17.h ; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v12.h, v10.h ; GFX11-TRUE16-NEXT: v_and_b16 v3.l, v8.h, 3 ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v6.h ; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v2.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v30.h, 3, v31.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v28.h, 2, v30.l -; GFX11-TRUE16-NEXT: v_or_b16 v24.h, v28.l, v29.l -; GFX11-TRUE16-NEXT: v_or_b16 v22.h, v22.h, v25.h -; GFX11-TRUE16-NEXT: v_and_b16 v24.l, v24.l, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v18.h, v23.l, v22.l -; GFX11-TRUE16-NEXT: v_and_b16 v14.h, v16.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v16.h, v17.h, v18.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, v15.h, 3 +; 
GFX11-TRUE16-NEXT: v_or_b16 v24.h, v24.h, v26.h +; GFX11-TRUE16-NEXT: v_and_b16 v14.h, v19.l, 15 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v16.h, 4, v16.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, v15.h, 15 ; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v3.l, v2.h ; GFX11-TRUE16-NEXT: v_and_b16 v1.l, v1.l, 15 ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 4, v0.h ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, v0.l, 15 -; GFX11-TRUE16-NEXT: v_or_b16 v28.h, v30.h, v28.h -; GFX11-TRUE16-NEXT: v_and_b16 v24.h, v24.h, 3 -; GFX11-TRUE16-NEXT: v_or_b16 v20.h, v24.l, v22.h -; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v14.h, v18.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v17.l, 12, v24.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v14.h ; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v16.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 12, v2.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v23.h, v24.h, v28.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, v20.h, 15 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 4, v2.h -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, v1.h, 15 +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v17.l, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v1.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v2.l, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 12, v23.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v2.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.h, v0.h ; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v2.l, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l -; 
GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX11-TRUE16-NEXT: global_store_b32 v[0:1], v0, off ; GFX11-TRUE16-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll index b9caf8e80bcd..ccdc0b1bf43c 100644 --- a/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll +++ b/llvm/test/CodeGen/AMDGPU/clamp-modifier.ll @@ -1561,10 +1561,10 @@ define amdgpu_kernel void @v_no_clamp_add_src_v2f16_f16_src(ptr addrspace(1) %ou ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3] +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_pk_max_f16 v0, v0, v0 clamp ; GFX11-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-TRUE16-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll index b5bc09a1684e..26f204f29f5a 100644 --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -946,9 +946,9 @@ define double @v_uitofp_i8_to_f64(i8 %arg0) nounwind { ; GFX11-TRUE16-LABEL: v_uitofp_i8_to_f64: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -1770,40 +1770,38 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o ; GFX11-TRUE16-LABEL: load_v4i8_to_v4f32_2_uses: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.h ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: global_load_b32 v4, v0, s[0:1] ; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v4.l, 9 -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff00, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff00, v4.h -; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte3_e32 v3, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.h, v4.h, 9 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, 0x900, v0.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff00, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff00, v4.h 
+; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte3_e32 v3, v4 +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.h +; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte2_e32 v2, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.h, v0.h ; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte1_e32 v1, v4 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_add_nc_u16 v6.h, 0x900, v0.l -; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte2_e32 v2, v4 +; GFX11-TRUE16-NEXT: v_add_nc_u16 v5.l, 0x900, v0.l +; GFX11-TRUE16-NEXT: v_add_nc_u16 v7.h, 0x900, v0.h ; GFX11-TRUE16-NEXT: v_cvt_f32_ubyte0_e32 v0, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v7, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v5, v7 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: global_store_b128 v5, v[0:3], s[0:1] -; GFX11-TRUE16-NEXT: global_store_b32 v5, v4, s[2:3] +; GFX11-TRUE16-NEXT: global_store_b128 v6, v[0:3], s[0:1] +; GFX11-TRUE16-NEXT: global_store_b32 v6, v4, s[2:3] ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: load_v4i8_to_v4f32_2_uses: diff --git a/llvm/test/CodeGen/AMDGPU/disable-preload-kernargs.ll b/llvm/test/CodeGen/AMDGPU/disable-preload-kernargs.ll new file mode 100644 index 000000000000..75aaec6f1fa7 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/disable-preload-kernargs.ll @@ -0,0 +1,29 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S 
-mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -passes=amdgpu-preload-kernel-arguments -amdgpu-kernarg-preload=0 %s -o - | FileCheck -check-prefix=NO-PRELOAD %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -passes=amdgpu-preload-kernel-arguments %s -o - | FileCheck -check-prefix=DEFAULT-PRELOAD %s + +@g1 = protected addrspace(1) externally_initialized global i16 0, align 2 + +define amdgpu_kernel void @test_kernel_with_zero_kernel_arg() { +; NO-PRELOAD-LABEL: define amdgpu_kernel void @test_kernel_with_zero_kernel_arg( +; NO-PRELOAD-SAME: ) #[[ATTR0:[0-9]+]] { +; NO-PRELOAD-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; NO-PRELOAD-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 12 +; NO-PRELOAD-NEXT: [[GROUP_SIZE_X:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2 +; NO-PRELOAD-NEXT: store i16 [[GROUP_SIZE_X]], ptr addrspace(1) @g1, align 2 +; NO-PRELOAD-NEXT: ret void +; +; DEFAULT-PRELOAD-LABEL: define amdgpu_kernel void @test_kernel_with_zero_kernel_arg( +; DEFAULT-PRELOAD-SAME: i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_X:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Y:%.*]], i32 inreg "amdgpu-hidden-argument" [[_HIDDEN_BLOCK_COUNT_Z:%.*]], i16 inreg "amdgpu-hidden-argument" [[_HIDDEN_GROUP_SIZE_X:%.*]]) #[[ATTR0:[0-9]+]] { +; DEFAULT-PRELOAD-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; DEFAULT-PRELOAD-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 12 +; DEFAULT-PRELOAD-NEXT: [[GROUP_SIZE_X:%.*]] = load i16, ptr addrspace(4) [[GEP]], align 2 +; DEFAULT-PRELOAD-NEXT: store i16 [[_HIDDEN_GROUP_SIZE_X]], ptr addrspace(1) @g1, align 2 +; DEFAULT-PRELOAD-NEXT: ret void +; + %implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() + %gep = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 12 + %group_size_x = load i16, ptr addrspace(4) 
%gep + store i16 %group_size_x, ptr addrspace(1) @g1 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll index b0439b1f7968..c5db7a33f70e 100644 --- a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll @@ -2536,12 +2536,13 @@ define void @test_dynamic_stackalloc_device_divergent_non_standard_size_i16(i16 ; GFX11-SDAG-LABEL: test_dynamic_stackalloc_device_divergent_non_standard_size_i16: ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-SDAG-NEXT: v_mov_b16_e32 v1.h, 0 +; GFX11-SDAG-NEXT: v_mov_b16_e32 v1.l, v0.l ; GFX11-SDAG-NEXT: s_mov_b32 s4, s33 ; GFX11-SDAG-NEXT: s_mov_b32 s1, exec_lo ; GFX11-SDAG-NEXT: s_mov_b32 s0, 0 ; GFX11-SDAG-NEXT: s_mov_b32 s33, s32 -; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, 15 +; GFX11-SDAG-NEXT: v_lshl_add_u32 v0, v1, 2, 15 ; GFX11-SDAG-NEXT: s_add_i32 s32, s32, 16 ; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0x7fff0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll index 8c7d5cffe39d..22dd66118837 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll @@ -8410,13 +8410,12 @@ define half @flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l ; GFX12-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -8529,13 +8528,12 @@ define half @flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc @@ -8785,13 +8783,12 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l ; 
GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -8908,13 +8905,12 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc @@ -9171,13 +9167,12 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: 
v_add_f16_e32 v5.l, v5.l, v2.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -9295,13 +9290,12 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc @@ -9557,11 +9551,11 @@ define void @flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, 
v3.l, v2.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -9671,11 +9665,11 @@ define void @flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc @@ -9917,11 +9911,11 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; 
GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -10035,11 +10029,11 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc @@ -10288,11 +10282,11 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -10407,11 +10401,11 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc @@ -10651,8 +10645,8 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v4.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN 
scope:SCOPE_DEV @@ -10735,8 +10729,8 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v4.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc @@ -10925,10 +10919,9 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v4.l, v2.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -11014,10 +11007,9 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v4.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc @@ -11220,13 +11212,12 @@ define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 @@ -11345,13 +11336,12 @@ define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc @@ -11610,11 +11600,11 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 @@ -11730,11 +11720,11 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll index 56ad91dd59ff..1dc45179c74c 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll @@ -6043,14 +6043,14 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v2.h, v2.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -6168,14 +6168,14 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v2.h, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc @@ -6438,14 +6438,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] 
th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -6570,14 +6570,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc @@ -6847,14 +6847,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -6980,14 +6980,14 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc @@ -7254,13 +7254,12 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; 
GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v3.l, v3.l ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v2.h, v2.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -7376,13 +7375,12 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v3.l, v3.l ; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v2.h, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc @@ -7638,13 +7636,12 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -7767,13 +7764,12 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc @@ -8036,13 +8032,12 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: 
v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -8166,13 +8161,12 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc @@ -8424,11 +8418,11 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v2.h, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -8519,11 +8513,11 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l ; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v2.h, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc @@ -8728,10 +8722,9 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v2.h, v2.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -8820,10 +8813,9 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v2.h, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc @@ -9035,14 +9027,14 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 @@ -9169,14 +9161,14 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc @@ -9448,13 +9440,12 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 @@ -9579,13 +9570,12 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll index f0083bd23660..5d26293e7009 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll @@ -6043,14 +6043,14 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr 
% ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v2.h, v2.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -6168,14 +6168,14 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr % ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v2.h, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc @@ -6438,14 +6438,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -6570,14 +6570,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l ; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc @@ -6847,14 +6847,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -6980,14 +6980,14 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc @@ -7254,13 +7254,12 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v3.l, v3.l ; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v2.h, v2.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -7376,13 +7375,12 @@ define void 
@flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v3.l, v3.l ; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v2.h, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc @@ -7638,13 +7636,12 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN 
scope:SCOPE_DEV @@ -7767,13 +7764,12 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc @@ -8036,13 +8032,12 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: 
flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -8166,13 +8161,12 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc @@ -8424,11 +8418,11 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l ; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v2.h, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: 
flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -8519,11 +8513,11 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l ; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v2.h, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc @@ -8728,10 +8722,9 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v2.h, v2.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -8820,10 +8813,9 @@ define void 
@flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_ ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v2.h, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc @@ -9035,14 +9027,14 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 @@ -9169,14 +9161,14 @@ define half 
@flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc @@ -9448,13 +9440,12 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, 
v2, v5 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 @@ -9579,13 +9570,12 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[3:4], v[5:6] glc diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll index 3ee0bb2122ab..d12a7f973158 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll @@ -5855,13 +5855,12 @@ define half @flat_agent_atomic_fsub_ret_f16(ptr %ptr, half %val) #0 { ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -5974,13 +5973,12 @@ define half @flat_agent_atomic_fsub_ret_f16(ptr %ptr, half %val) #0 { ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc @@ -6230,13 +6228,12 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l ; GFX12-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -6353,13 +6350,12 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc @@ -6616,13 +6612,12 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val) ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l ; 
GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -6740,13 +6735,12 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val) ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc @@ -7002,11 +6996,11 @@ define void @flat_agent_atomic_fsub_noret_f16(ptr %ptr, half %val) #0 { ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l -; GFX12-TRUE16-NEXT: 
v_and_b32_e32 v3, 0xffff, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -7116,11 +7110,11 @@ define void @flat_agent_atomic_fsub_noret_f16(ptr %ptr, half %val) #0 { ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc @@ -7362,11 +7356,11 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -7480,11 +7474,11 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc @@ -7733,11 +7727,11 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half %val ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; 
GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -7852,11 +7846,11 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half %val ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc @@ -8096,10 +8090,9 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr %ptr, hal ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v4.l, v2.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; 
GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -8185,10 +8178,9 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr %ptr, hal ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v4.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc @@ -8382,8 +8374,8 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr %ptr, h ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v4.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -8466,8 +8458,8 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr %ptr, h ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v4.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc @@ -8665,13 +8657,12 @@ define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 @@ -8790,13 +8781,12 @@ define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val) ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc @@ -9055,11 +9045,11 @@ define void @flat_system_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %va ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 @@ -9175,11 +9165,11 @@ define void @flat_system_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %va ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, 
v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc diff --git a/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll b/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll index 9c4901eb19f3..899cc8940544 100644 --- a/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll +++ b/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll @@ -4238,7 +4238,7 @@ define amdgpu_ps i32 @s_mul_32_f16(half inreg %x, half inreg %y) { ; GFX11-GISEL-TRUE16-LABEL: s_mul_32_f16: ; GFX11-GISEL-TRUE16: ; %bb.0: ; GFX11-GISEL-TRUE16-NEXT: v_mul_f16_e64 v0.l, 0x5000, s0 -; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX11-GISEL-TRUE16-NEXT: v_readfirstlane_b32 s0, v0 ; GFX11-GISEL-TRUE16-NEXT: ; return to shader part epilog ; diff --git a/llvm/test/CodeGen/AMDGPU/fncall-implicitdef.ll b/llvm/test/CodeGen/AMDGPU/fncall-implicitdef.ll new file mode 100644 index 000000000000..66a8b424b576 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fncall-implicitdef.ll @@ -0,0 +1,25 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -O1 %s -o - | FileCheck %s + +define amdgpu_ps <4 x float> @caller(ptr %ptr) { +; CHECK-LABEL: caller: +; CHECK: ; %bb.0: +; CHECK-NEXT: flat_load_dword v1, v[0:1] +; CHECK-NEXT: s_mov_b32 s0, 0 +; CHECK-NEXT: s_mov_b32 s1, 0 +; CHECK-NEXT: s_mov_b32 s2, 0 +; CHECK-NEXT: s_mov_b32 s5, fn@abs32@hi +; CHECK-NEXT: s_mov_b32 s4, fn@abs32@lo +; CHECK-NEXT: s_mov_b64 s[8:9], 0 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_mov_b32 s3, 0 +; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] +; CHECK-NEXT: ; return to shader part epilog + %L = load i32, ptr %ptr, align 4 + %R = call <4 x float> @fn(<4 x i32> zeroinitializer, i32 0, i32 %L, 
i32 0) + ret <4 x float> %R +} + +declare hidden <4 x float> @fn(<4 x i32> inreg, i32, i32, i32) diff --git a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll index f09c25767648..a859cc91b7fd 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll @@ -644,11 +644,10 @@ define double @fmul_pow_mul_max_pow2(i16 %cnt) nounwind { ; GFX11-TRUE16-LABEL: fmul_pow_mul_max_pow2: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, v0.l, 2 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mul_f64 v[0:1], 0x40080000, v[0:1] ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -1194,13 +1193,12 @@ define double @fmul_pow_shl_cnt_safe(i16 %cnt) nounwind { ; GFX11-TRUE16-LABEL: fmul_pow_shl_cnt_safe: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, v0.l, 1 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0xff5f3992 ; GFX11-TRUE16-NEXT: s_mov_b32 s1, 0x7befffff ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: v_cvt_f64_u32_e32 v[0:1], v0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_mul_f64 v[0:1], v[0:1], s[0:1] ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll index c52fb6197e3e..40d276539554 100644 --- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll +++ 
b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll @@ -4372,14 +4372,13 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32( ; GFX11-GISEL-TRUE16-LABEL: fptrunc_f32_to_f16_zext_i32: ; GFX11-GISEL-TRUE16: ; %bb.0: ; %entry ; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, s2 ; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s2, -1 -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-GISEL-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX11-GISEL-TRUE16-NEXT: s_endpgm ; @@ -4607,14 +4606,13 @@ define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32( ; GFX11-GISEL-TRUE16-LABEL: fptrunc_fabs_f32_to_f16_zext_i32: ; GFX11-GISEL-TRUE16: ; %bb.0: ; %entry ; GFX11-GISEL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-TRUE16-NEXT: v_cvt_f16_f32_e64 v0.l, |s2| ; GFX11-GISEL-TRUE16-NEXT: s_mov_b32 s2, -1 -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-GISEL-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX11-GISEL-TRUE16-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll index 95e28a37f5ee..3c41cc43a089 100644 --- a/llvm/test/CodeGen/AMDGPU/function-args.ll +++ b/llvm/test/CodeGen/AMDGPU/function-args.ll @@ -1107,21 +1107,19 @@ define void @void_func_v4i8(<4 x i8> %arg0) #0 { ; GFX11-TRUE16-LABEL: 
void_func_v4i8: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 0 -; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 0 +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.h, v0.h +; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l +; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -1190,22 +1188,20 @@ define void @void_func_v5i8(<5 x i8> %arg0) #0 { ; GFX11-TRUE16-LABEL: void_func_v5i8: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 4 -; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; 
GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l +; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 4 +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.h, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l +; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX11-TRUE16-NEXT: buffer_store_b8 v4, off, s[0:3], 0 ; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v3, v2 ; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -1285,29 +1281,27 @@ define void @void_func_v8i8(<8 x i8> %arg0) #0 { ; GFX11-TRUE16-LABEL: void_func_v8i8: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v5.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v5.h, v4.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v6.l ; 
GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v4.h ; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 0 -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v0.h, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v4, v6 +; GFX11-TRUE16-NEXT: v_or_b16 v6.h, v1.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v6.l ; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v4 -; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.l, v0.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v3 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v0, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v0, v6 ; GFX11-TRUE16-NEXT: buffer_store_b64 v[1:2], off, s[0:3], 0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -1422,47 +1416,44 @@ define void @void_func_v16i8(<16 x i8> %arg0) #0 { ; GFX11-TRUE16-LABEL: void_func_v16i8: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v12.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v13.h, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v13.l ; GFX11-TRUE16-NEXT: v_and_b16 v12.l, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v13.l, 8, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.l, 0xff, v8.l -; 
GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v12.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, 0 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v13.h, v12.h ; GFX11-TRUE16-NEXT: v_and_b16 v10.l, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v8.h, v13.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v8.l, v9.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v12.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, 0 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v5.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v12.l, v13.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v9.l, 8, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v6.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v12, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v10.l, v9.h +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v10.h, v9.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l ; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.l, v8.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v9, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v5.h, v4.h +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v5.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v14.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l ; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.l, v4.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v9, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v10.l, v8.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v11 -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 
v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v8, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v1.h, v0.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l ; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v4, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v1.l, v0.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v4, v14 +; GFX11-TRUE16-NEXT: v_or_b16 v14.h, v1.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v14.l ; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v12 -; GFX11-TRUE16-NEXT: buffer_store_b128 v[6:9], off, s[0:3], 0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v0, v14 +; GFX11-TRUE16-NEXT: buffer_store_b128 v[5:8], off, s[0:3], 0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: void_func_v16i8: @@ -1658,83 +1649,77 @@ define void @void_func_v32i8(<32 x i8> %arg0) #0 { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: scratch_load_d16_u8 v31, off, s32 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, 0 ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v15.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v3.h, v2.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v9.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v32.l, 0 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, v3.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v13.l +; GFX11-TRUE16-NEXT: 
v_and_b16 v3.h, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.h, v32.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.h, 8, v11.l ; GFX11-TRUE16-NEXT: v_and_b16 v5.h, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v7.h, v6.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v8.l ; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v1.h, v0.h -; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v4.l, v5.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v13 +; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v3.h, v2.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v32.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v7.l ; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v29.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.h, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.l, v3.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v12, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v13, v32 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v3.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v7.h, v6.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v12, v32 ; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v5.h, v4.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v11.l, 8, v25.l -; GFX11-TRUE16-NEXT: v_and_b16 v11.h, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v9.h, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v13, 0xffff, v14 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v12, v32 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.h, v32.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v10.l, v4.l, v5.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v9, v32 ; 
GFX11-TRUE16-NEXT: v_or_b16 v32.h, v6.l, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v9.l, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.h, 8, v21.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v20.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v11.h, v11.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v5 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v13, v32 -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v8.h, v8.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v10.l, 8, v27.l -; GFX11-TRUE16-NEXT: v_and_b16 v10.h, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v17.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_or_b16 v6.l, v0.h, v6.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v8.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v11.h, v32.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v29.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v28.l +; GFX11-TRUE16-NEXT: v_or_b16 v11.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v10, v32 +; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v0.h, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v32.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v27.l +; GFX11-TRUE16-NEXT: v_and_b16 v5.l, 0xff, v26.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v6.l, 8, v25.l +; GFX11-TRUE16-NEXT: v_and_b16 v7.h, 0xff, v24.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v6.h, v5.h ; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v11, v32 -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v4.h, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v11, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v14.h, v32.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v23.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.h, 8, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v6.h, 0xff, v20.l +; GFX11-TRUE16-NEXT: v_or_b16 v14.l, v7.h, v6.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v15.h, v32.l +; GFX11-TRUE16-NEXT: v_and_b16 v8.h, 0xff, v16.l ; 
GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 16 +; GFX11-TRUE16-NEXT: v_or_b16 v15.l, v6.h, v5.h ; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 ; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v31.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v9.l, v5.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v23.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v8.l, 8, v19.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v7, v32 -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v10.h, v10.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v4.l, 0xff, v18.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v9, v32 -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v4.h, v5.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v10 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v11, v32 -; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v4.l, v8.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v31.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v4.h, v7.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v13, v32 +; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v5.l, v4.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v4.l, 8, v19.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v5.l, 8, v17.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v14, v32 +; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v4.h, v8.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v8.h, v5.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | 
instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v15, v32 +; GFX11-TRUE16-NEXT: v_or_b16 v32.h, v4.h, v4.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v9, v32 ; GFX11-TRUE16-NEXT: buffer_store_b128 v[4:7], off, s[0:3], 0 ; GFX11-TRUE16-NEXT: s_mov_b64 s[0:1], 0 diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll index 2fdc1a885486..919464a93674 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll @@ -4896,23 +4896,22 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l ; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1 ; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) 
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.h, v1.l, v0.h -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33 ; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v42, 2 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-TRUE16-NEXT: global_store_b32 v[40:41], v0, off ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s33 @@ -5156,30 +5155,29 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 { ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v5 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v6 ; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v3.h, v1.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_or_b16 v2.h, v1.h, v0.h +; GFX11-TRUE16-NEXT: v_or_b16 v3.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 4 -; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1 -; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v2, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0 -; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33 -; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v42, 2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: global_store_b8 v[0:1], v4, off ; GFX11-TRUE16-NEXT: global_store_b32 v[40:41], v2, off ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s33 ; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s33 offset:4 +; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1 +; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v42, 2 ; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s1 @@ -5441,36 +5439,34 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 { ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 24, v1 ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v1, v8 ; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v5.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.l, 0 -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v1.h, v0.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v1.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v7.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v7.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: 
v_or_b16 v0.l, v0.l, v0.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v5.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, 0 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.h, v0.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v1.h, v1.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v4 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v5.l, v3.h, v2.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v4.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v3, v5 -; GFX11-TRUE16-NEXT: v_or_b16 v5.h, v1.l, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v4 +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v5, v4 +; GFX11-TRUE16-NEXT: v_or_b16 v4.h, v1.h, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v4.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v0, v4 ; GFX11-TRUE16-NEXT: v_readlane_b32 s31, v42, 1 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v0, v5 ; GFX11-TRUE16-NEXT: v_readlane_b32 s30, v42, 0 -; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33 -; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v42, 2 ; GFX11-TRUE16-NEXT: global_store_b64 v[40:41], v[1:2], off ; GFX11-TRUE16-NEXT: s_clause 0x1 ; GFX11-TRUE16-NEXT: scratch_load_b32 v41, off, s33 ; GFX11-TRUE16-NEXT: scratch_load_b32 v40, off, s33 offset:4 +; GFX11-TRUE16-NEXT: s_mov_b32 s32, s33 +; GFX11-TRUE16-NEXT: v_readlane_b32 s0, v42, 2 ; GFX11-TRUE16-NEXT: s_or_saveexec_b32 s1, -1 ; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s33 offset:8 
; 4-byte Folded Reload ; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s1 @@ -5910,85 +5906,77 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 { ; GFX11-TRUE16-NEXT: v_dual_mov_b32 v17, v32 :: v_dual_mov_b32 v18, v33 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v19, v34 ; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v13.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v12.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v15.l -; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v8.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.l, 0 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v15.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v14.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v13.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v12.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v12.l, 0 ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v12.l, v1.h, v0.h -; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v14.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.h, 8, v9.l -; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v9, 0xffff, v12 -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v0.h, v2.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b16 v8.l, v3.h, v1.h +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v1.h, v0.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v11.l +; GFX11-TRUE16-NEXT: v_or_b16 v13.l, v3.h, v2.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v13.h, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v10.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v9.l +; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v9.h, v12.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v13, v12 +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v1.h, v0.h +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v7.l +; GFX11-TRUE16-NEXT: v_or_b16 v9.l, v3.h, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, 
v6.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v2.h, 8, v5.l ; GFX11-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v4.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v11.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v10.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v9, v13 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v8, 0xffff, v8 -; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v3.h, v2.h -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v1.h, v0.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v7.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v4, v8, v13 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v1.h, v0.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v12.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v9, v12 +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v1.h, v0.h ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v3.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v29.l +; GFX11-TRUE16-NEXT: v_or_b16 v4.l, v3.h, v2.h +; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v12.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v4, v12 +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v1.h, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v31.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v30.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v29.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v28.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v6, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v1.l, v0.h +; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, v2, v12 +; GFX11-TRUE16-NEXT: v_lshlrev_b16 
v4.l, 8, v17.l +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v27.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v1.h, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v26.l ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v25.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v0.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v24.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v31.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v30.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v13 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v1.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v26.l -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v6.l, v0.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v27.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v7.l, v0.l -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v21.l +; GFX11-TRUE16-NEXT: v_and_b16 v4.h, 0xff, v16.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, v2, v12 +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v23.l +; GFX11-TRUE16-NEXT: v_or_b16 v2.l, v1.h, v1.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v22.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v21.l ; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v20.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v9, v6, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v1.l, v0.h -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v17.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v16.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffff, v7 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v23.l -; GFX11-TRUE16-NEXT: v_and_b16 v6.l, 0xff, v22.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v0.l -; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v1.h, v1.l -; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, v7, v13 -; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v18.l -; 
GFX11-TRUE16-NEXT: v_or_b16 v13.h, v6.l, v0.h -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v19.l -; GFX11-TRUE16-NEXT: v_mov_b16_e32 v10.l, v0.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, v6, v13 -; GFX11-TRUE16-NEXT: v_or_b16 v13.h, v1.l, v0.h -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v10 -; GFX11-TRUE16-NEXT: v_or_b32_e32 v6, v0, v13 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_or_b32_e32 v2, v2, v12 +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v19.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_or_b16 v1.l, v1.h, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, v12.l +; GFX11-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v18.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, v1, v12 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_or_b16 v12.h, v0.h, v0.l +; GFX11-TRUE16-NEXT: v_or_b16 v0.l, v4.h, v4.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v12.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v12 ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: global_store_b128 v[42:43], v[6:9], off -; GFX11-TRUE16-NEXT: global_store_b128 v[40:41], v[2:5], off +; GFX11-TRUE16-NEXT: global_store_b128 v[42:43], v[0:3], off +; GFX11-TRUE16-NEXT: global_store_b128 v[40:41], v[5:8], off ; GFX11-TRUE16-NEXT: s_clause 0x3 ; GFX11-TRUE16-NEXT: scratch_load_b32 v43, off, s33 ; GFX11-TRUE16-NEXT: scratch_load_b32 v42, off, s33 offset:4 diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll index 1f74fbdc46e9..9c1f9d21b9da 100644 --- 
a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll @@ -8275,13 +8275,12 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -8394,13 +8393,12 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, 
v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc @@ -8700,13 +8698,12 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -8823,13 +8820,12 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: 
v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc @@ -9138,13 +9134,12 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -9262,13 +9257,12 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; 
GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc @@ -9576,11 +9570,11 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -9690,11 +9684,11 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: 
s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc @@ -9985,11 +9979,11 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -10103,11 +10097,11 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: 
global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc @@ -10406,11 +10400,11 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -10525,11 +10519,11 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc @@ 
-10819,10 +10813,9 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v4.l, v2.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -10908,10 +10901,9 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v4.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc @@ -11144,8 +11136,8 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v4.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -11228,8 +11220,8 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v4.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc @@ -11464,13 +11456,12 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; 
GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 @@ -11589,13 +11580,12 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc @@ -11906,11 +11896,11 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 @@ -12026,11 +12016,11 @@ define void 
@global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, v3.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll index faa74fef2be2..f7cc0709109f 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll @@ -4467,14 +4467,14 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v2.h, v2.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -4592,14 +4592,14 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v2.h, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc @@ -4912,14 +4912,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; 
GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -5044,14 +5044,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc @@ -5373,14 +5373,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -5506,14 +5506,14 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; 
GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc @@ -5832,13 +5832,12 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v3.l, v3.l ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v2.h, v2.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -5954,13 +5953,12 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v3.l, v3.l ; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v2.h, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc @@ -6265,13 +6263,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -6394,13 +6391,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; 
GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc @@ -6713,13 +6709,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -6843,13 +6838,12 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc @@ -7151,11 +7145,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v2.h, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -7246,11 +7240,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l ; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v2.h, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc @@ -7494,10 +7488,9 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v2.h, v2.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -7586,10 +7579,9 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v2.h, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc @@ 
-7838,14 +7830,14 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 @@ -7972,14 +7964,14 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: 
v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc @@ -8303,13 +8295,12 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v5.l, v0.h, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 @@ -8434,13 +8425,12 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: v_max_f16_e32 v5.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 
v5, v1, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll index a46b0129b79e..b81af1fc9233 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll @@ -4467,14 +4467,14 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v2.h, v2.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -4592,14 +4592,14 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v2.h, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc @@ -4912,14 +4912,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV 
@@ -5044,14 +5044,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc @@ -5373,14 +5373,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -5506,14 +5506,14 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc @@ -5832,13 +5832,12 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v3.l, v3.l 
; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v2.h, v2.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -5954,13 +5953,12 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v3.l, v3.l ; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v2.h, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc @@ -6265,13 +6263,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; 
GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -6394,13 +6391,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc @@ -6713,13 +6709,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -6843,13 +6838,12 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc @@ -7151,11 +7145,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l ; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v2.h, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -7246,11 +7240,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_ ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l ; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v2.h, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc @@ -7494,10 +7488,9 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.h, v4.l, v4.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; 
GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, v2.h, v2.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -7586,10 +7579,9 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.h, v4.l, v4.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, v2.h, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc @@ -7838,14 +7830,14 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: 
v_min_num_f16_e32 v5.l, v0.h, v0.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 @@ -7972,14 +7964,14 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc @@ -8303,13 +8295,12 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v0.h, v5.l, v5.l ; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v5.l, v0.h, v0.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 @@ -8434,13 +8425,12 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_ ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v1, v6 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v5.l, v5.l ; GFX11-TRUE16-NEXT: v_min_f16_e32 v5.l, v0.h, v0.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v1, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v2, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[3:4], v[5:6], off glc diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll index 053efdcb7626..b8762d13e132 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll @@ -5221,13 +5221,12 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -5340,13 +5339,12 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val) ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc @@ -5646,13 +5644,12 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX12-TRUE16-NEXT: ; =>This Inner Loop 
Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -5769,13 +5766,12 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc @@ -6084,13 +6080,12 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX12-TRUE16-NEXT: ; 
=>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -6208,13 +6203,12 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc @@ -6522,11 +6516,11 @@ define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val ; 
GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -6636,11 +6630,11 @@ define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc @@ -6931,11 +6925,11 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; 
GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -7049,11 +7043,11 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc @@ -7352,11 +7346,11 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: 
v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -7471,11 +7465,11 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1) ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc @@ -7765,10 +7759,9 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v4.l, v2.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -7854,10 +7847,9 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v4.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc @@ -8090,8 +8082,8 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v4.l, v2.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; 
GFX12-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV @@ -8174,8 +8166,8 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v4.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc @@ -8410,13 +8402,12 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v6, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX12-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 @@ -8535,13 +8526,12 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) % ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: 
v_mov_b32_e32 v6, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v6 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0 ; GFX11-TRUE16-NEXT: v_sub_f16_e32 v5.l, v5.l, v2.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v5, 0xffff, v5 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v5, v6, v4, v5 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc @@ -8852,11 +8842,11 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX12-TRUE16-NEXT: global_wb scope:SCOPE_SYS ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 @@ -8972,11 +8962,11 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1) ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v5, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_sub_f16_e32 v3.l, v3.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v5, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v6, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc diff --git a/llvm/test/CodeGen/AMDGPU/hazards-gfx1250.mir b/llvm/test/CodeGen/AMDGPU/hazards-gfx1250.mir new file mode 100644 index 000000000000..170478539d8a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/hazards-gfx1250.mir @@ -0,0 +1,549 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -run-pass=post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GCN %s + +--- +name: ds_atomic_async_barrier_arrive_b64 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + ; GCN-LABEL: name: ds_atomic_async_barrier_arrive_b64 + ; GCN: liveins: $vgpr0, $vgpr1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAITCNT_DEPCTR 65507 + ; GCN-NEXT: DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 $vgpr1, 0, 0, implicit-def $asynccnt, implicit $asynccnt, implicit $exec + ; GCN-NEXT: S_WAITCNT_DEPCTR 65507 + DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 $vgpr1, 0, 0, implicit-def $asynccnt, implicit $asynccnt, implicit $exec +... 
+ +--- +name: write_s102_read_flat_scr_base_lo +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; GCN-LABEL: name: write_s102_read_flat_scr_base_lo + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr102 = S_MOV_B32 0 + ; GCN-NEXT: S_WAITCNT_DEPCTR 61950 + ; GCN-NEXT: $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_lo, $vgpr0, implicit $exec + $sgpr102 = S_MOV_B32 0 + $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_lo, $vgpr0, implicit $exec +... + +--- +name: write_s103_read_flat_scr_base_hi +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; GCN-LABEL: name: write_s103_read_flat_scr_base_hi + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr103 = S_MOV_B32 0 + ; GCN-NEXT: S_WAITCNT_DEPCTR 61950 + ; GCN-NEXT: $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_hi, $vgpr0, implicit $exec + $sgpr103 = S_MOV_B32 0 + $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_hi, $vgpr0, implicit $exec +... + +--- +name: write_s102_read_flat_scr_base +tracksRegLiveness: true +body: | + bb.0: + ; GCN-LABEL: name: write_s102_read_flat_scr_base + ; GCN: $sgpr102 = S_MOV_B32 0 + ; GCN-NEXT: S_WAITCNT_DEPCTR 61950 + ; GCN-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $src_flat_scratch_base + $sgpr102 = S_MOV_B32 0 + $sgpr0_sgpr1 = S_MOV_B64 $src_flat_scratch_base +... + +--- +name: write_s103_read_flat_scr_base +tracksRegLiveness: true +body: | + bb.0: + ; GCN-LABEL: name: write_s103_read_flat_scr_base + ; GCN: $sgpr103 = S_MOV_B32 0 + ; GCN-NEXT: S_WAITCNT_DEPCTR 61950 + ; GCN-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $src_flat_scratch_base + $sgpr103 = S_MOV_B32 0 + $sgpr0_sgpr1 = S_MOV_B64 $src_flat_scratch_base +... 
+ +--- +name: write_s102_s103_read_flat_scr_base +tracksRegLiveness: true +body: | + bb.0: + ; GCN-LABEL: name: write_s102_s103_read_flat_scr_base + ; GCN: $sgpr102_sgpr103 = S_MOV_B64 0 + ; GCN-NEXT: S_WAITCNT_DEPCTR 61950 + ; GCN-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $src_flat_scratch_base + $sgpr102_sgpr103 = S_MOV_B64 0 + $sgpr0_sgpr1 = S_MOV_B64 $src_flat_scratch_base +... + +--- +name: write_s102_getreg_flat_scr_base_lo +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; GCN-LABEL: name: write_s102_getreg_flat_scr_base_lo + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr102 = S_MOV_B32 0 + ; GCN-NEXT: S_WAITCNT_DEPCTR 61950 + ; GCN-NEXT: $sgpr1 = S_GETREG_B32 20, implicit $mode + $sgpr102 = S_MOV_B32 0 + $sgpr1 = S_GETREG_B32 20, implicit $mode +... + +--- +name: write_s103_getreg_flat_scr_base_hi +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; GCN-LABEL: name: write_s103_getreg_flat_scr_base_hi + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr103 = S_MOV_B32 0 + ; GCN-NEXT: S_WAITCNT_DEPCTR 61950 + ; GCN-NEXT: $sgpr1 = S_GETREG_B32 21, implicit $mode + $sgpr103 = S_MOV_B32 0 + $sgpr1 = S_GETREG_B32 21, implicit $mode +... + +--- +name: write_s102_s103_getreg_flat_scr_base_hi +tracksRegLiveness: true +body: | + bb.0: + ; GCN-LABEL: name: write_s102_s103_getreg_flat_scr_base_hi + ; GCN: $sgpr102_sgpr103 = S_MOV_B64 0 + ; GCN-NEXT: S_WAITCNT_DEPCTR 61950 + ; GCN-NEXT: $sgpr1 = S_GETREG_B32 21, implicit $mode + $sgpr102_sgpr103 = S_MOV_B64 0 + $sgpr1 = S_GETREG_B32 21, implicit $mode +... 
+ +--- +name: write_s102_read_flat_scr_base_lo_9_salu_valu +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; GCN-LABEL: name: write_s102_read_flat_scr_base_lo_9_salu_valu + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr102 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr0 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr1 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr2_sgpr3 = S_MOV_B64 0 + ; GCN-NEXT: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr0, $sgpr0, 0, implicit $exec + ; GCN-NEXT: $vgpr2 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc_lo, implicit $vcc_lo, implicit $exec + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: $vgpr3 = DS_READ_B32 $vgpr0, 0, 0, implicit $m0, implicit $exec + ; GCN-NEXT: $sgpr4 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr5 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr6 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr7 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr8_sgpr9 = S_LOAD_DWORDX2_IMM renamable $sgpr4_sgpr5, 0, 0 + ; GCN-NEXT: S_WAITCNT_DEPCTR 61950 + ; GCN-NEXT: $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_lo, $vgpr0, implicit $exec + $sgpr102 = S_MOV_B32 0 + $sgpr0 = S_MOV_B32 0 + $sgpr1 = S_MOV_B32 0 + $sgpr2_sgpr3 = S_MOV_B64 0 + $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr0, $sgpr0, 0, implicit $exec + $vgpr2 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec + ; NOP does not count because it does not write SGPRs + S_NOP 0 + ; DS_READ_B32 does not count because it is not SALU or VALU + $vgpr3 = DS_READ_B32 $vgpr0, 0, 0, implicit $m0, implicit $exec + $sgpr4 = S_MOV_B32 0 + $sgpr5 = S_MOV_B32 0 + $sgpr6 = S_MOV_B32 0 + $sgpr7 = S_MOV_B32 0 + ; S_LOAD_DWORDX2_IMM does not count because it is not SALU + $sgpr8_sgpr9 = S_LOAD_DWORDX2_IMM renamable $sgpr4_sgpr5, 0, 0 + $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_lo, $vgpr0, implicit $exec +... 
+ +--- +name: write_s102_read_flat_scr_base_lo_10_salu_valu_expired +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; GCN-LABEL: name: write_s102_read_flat_scr_base_lo_10_salu_valu_expired + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr102 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr0 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr1 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr2_sgpr3 = S_MOV_B64 0 + ; GCN-NEXT: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr0, $sgpr0, 0, implicit $exec + ; GCN-NEXT: $vgpr2 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc_lo, implicit $vcc_lo, implicit $exec + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: $vgpr3 = DS_READ_B32 $vgpr0, 0, 0, implicit $m0, implicit $exec + ; GCN-NEXT: $sgpr4 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr5 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr6 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr7 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr8_sgpr9 = S_LOAD_DWORDX2_IMM renamable $sgpr4_sgpr5, 0, 0 + ; GCN-NEXT: $sgpr10 = S_MOV_B32 0 + ; GCN-NEXT: $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_lo, $vgpr0, implicit $exec + $sgpr102 = S_MOV_B32 0 + $sgpr0 = S_MOV_B32 0 + $sgpr1 = S_MOV_B32 0 + $sgpr2_sgpr3 = S_MOV_B64 0 + $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr0, $sgpr0, 0, implicit $exec + $vgpr2 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec + ; NOP does not count because it does not write SGPRs + S_NOP 0 + ; DS_READ_B32 does not count because it is not SALU or VALU + $vgpr3 = DS_READ_B32 $vgpr0, 0, 0, implicit $m0, implicit $exec + $sgpr4 = S_MOV_B32 0 + $sgpr5 = S_MOV_B32 0 + $sgpr6 = S_MOV_B32 0 + $sgpr7 = S_MOV_B32 0 + ; S_LOAD_DWORDX2_IMM does not count because it is not SALU + $sgpr8_sgpr9 = S_LOAD_DWORDX2_IMM renamable $sgpr4_sgpr5, 0, 0 + $sgpr10 = S_MOV_B32 0 + $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_lo, $vgpr0, implicit $exec +... 
+ +--- +name: write_s103_read_flat_scr_base_hi_9_salu_valu +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; GCN-LABEL: name: write_s103_read_flat_scr_base_hi_9_salu_valu + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr103 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr0 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr1 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr2_sgpr3 = S_MOV_B64 0 + ; GCN-NEXT: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr0, $sgpr0, 0, implicit $exec + ; GCN-NEXT: $vgpr2 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc_lo, implicit $vcc_lo, implicit $exec + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: $vgpr3 = DS_READ_B32 $vgpr0, 0, 0, implicit $m0, implicit $exec + ; GCN-NEXT: $sgpr4 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr5 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr6 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr7 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr8_sgpr9 = S_LOAD_DWORDX2_IMM renamable $sgpr4_sgpr5, 0, 0 + ; GCN-NEXT: S_WAITCNT_DEPCTR 61950 + ; GCN-NEXT: $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_hi, $vgpr0, implicit $exec + $sgpr103 = S_MOV_B32 0 + $sgpr0 = S_MOV_B32 0 + $sgpr1 = S_MOV_B32 0 + $sgpr2_sgpr3 = S_MOV_B64 0 + $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr0, $sgpr0, 0, implicit $exec + $vgpr2 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec + ; NOP does not count because it does not write SGPRs + S_NOP 0 + ; DS_READ_B32 does not count because it is not SALU or VALU + $vgpr3 = DS_READ_B32 $vgpr0, 0, 0, implicit $m0, implicit $exec + $sgpr4 = S_MOV_B32 0 + $sgpr5 = S_MOV_B32 0 + $sgpr6 = S_MOV_B32 0 + $sgpr7 = S_MOV_B32 0 + ; S_LOAD_DWORDX2_IMM does not count because it is not SALU + $sgpr8_sgpr9 = S_LOAD_DWORDX2_IMM renamable $sgpr4_sgpr5, 0, 0 + $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_hi, $vgpr0, implicit $exec +... 
+ +--- +name: write_s103_read_flat_scr_base_hi_10_salu_valu_expired +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; GCN-LABEL: name: write_s103_read_flat_scr_base_hi_10_salu_valu_expired + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr103 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr0 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr1 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr2_sgpr3 = S_MOV_B64 0 + ; GCN-NEXT: $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr0, $sgpr0, 0, implicit $exec + ; GCN-NEXT: $vgpr2 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc_lo, implicit $vcc_lo, implicit $exec + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: $vgpr3 = DS_READ_B32 $vgpr0, 0, 0, implicit $m0, implicit $exec + ; GCN-NEXT: $sgpr4 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr5 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr6 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr7 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr8_sgpr9 = S_LOAD_DWORDX2_IMM renamable $sgpr4_sgpr5, 0, 0 + ; GCN-NEXT: $sgpr10 = S_MOV_B32 0 + ; GCN-NEXT: $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_hi, $vgpr0, implicit $exec + $sgpr103 = S_MOV_B32 0 + $sgpr0 = S_MOV_B32 0 + $sgpr1 = S_MOV_B32 0 + $sgpr2_sgpr3 = S_MOV_B64 0 + $vgpr1, $vcc_lo = V_ADDC_U32_e64 0, $vgpr0, $sgpr0, 0, implicit $exec + $vgpr2 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec + ; NOP does not count because it does not write SGPRs + S_NOP 0 + ; DS_READ_B32 does not count because it is not SALU or VALU + $vgpr3 = DS_READ_B32 $vgpr0, 0, 0, implicit $m0, implicit $exec + $sgpr4 = S_MOV_B32 0 + $sgpr5 = S_MOV_B32 0 + $sgpr6 = S_MOV_B32 0 + $sgpr7 = S_MOV_B32 0 + ; S_LOAD_DWORDX2_IMM does not count because it is not SALU + $sgpr8_sgpr9 = S_LOAD_DWORDX2_IMM renamable $sgpr4_sgpr5, 0, 0 + $sgpr10 = S_MOV_B32 0 + $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_hi, $vgpr0, implicit $exec +... 
+ +--- +name: write_s102_read_flat_scr_base_hi_no_hazard +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; GCN-LABEL: name: write_s102_read_flat_scr_base_hi_no_hazard + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr102 = S_MOV_B32 0 + ; GCN-NEXT: $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_hi, $vgpr0, implicit $exec + $sgpr102 = S_MOV_B32 0 + $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_hi, $vgpr0, implicit $exec +... + +--- +name: write_s102_read_flat_scr_base_lo_expired_by_wait0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; GCN-LABEL: name: write_s102_read_flat_scr_base_lo_expired_by_wait0 + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr102 = S_MOV_B32 0 + ; GCN-NEXT: S_WAITCNT_DEPCTR 0 + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_lo, $vgpr0, implicit $exec + $sgpr102 = S_MOV_B32 0 + S_WAITCNT_DEPCTR 0 + S_NOP 0 + $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_lo, $vgpr0, implicit $exec +... + +--- +name: write_s102_read_flat_scr_base_lo_expired_by_wait_vs_sdst_sa_sdst +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; GCN-LABEL: name: write_s102_read_flat_scr_base_lo_expired_by_wait_vs_sdst_sa_sdst + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr102 = S_MOV_B32 0 + ; GCN-NEXT: S_WAITCNT_DEPCTR 61950 + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_lo, $vgpr0, implicit $exec + $sgpr102 = S_MOV_B32 0 + S_WAITCNT_DEPCTR 61950 + S_NOP 0 + $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_lo, $vgpr0, implicit $exec +... 
+ +--- +name: write_s102_read_flat_scr_base_lo_not_expired_by_wait_va_sdst_only +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; GCN-LABEL: name: write_s102_read_flat_scr_base_lo_not_expired_by_wait_va_sdst_only + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr102 = S_MOV_B32 0 + ; GCN-NEXT: S_WAITCNT_DEPCTR 61951 + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: S_WAITCNT_DEPCTR 61950 + ; GCN-NEXT: $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_lo, $vgpr0, implicit $exec + $sgpr102 = S_MOV_B32 0 + S_WAITCNT_DEPCTR 61951 + S_NOP 0 + $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_lo, $vgpr0, implicit $exec +... + +--- +name: write_s102_read_flat_scr_base_lo_not_expired_by_wait_sa_sdst_only +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; GCN-LABEL: name: write_s102_read_flat_scr_base_lo_not_expired_by_wait_sa_sdst_only + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr102 = S_MOV_B32 0 + ; GCN-NEXT: S_WAITCNT_DEPCTR 65534 + ; GCN-NEXT: S_NOP 0 + ; GCN-NEXT: S_WAITCNT_DEPCTR 61950 + ; GCN-NEXT: $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_lo, $vgpr0, implicit $exec + $sgpr102 = S_MOV_B32 0 + S_WAITCNT_DEPCTR 65534 + S_NOP 0 + $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_lo, $vgpr0, implicit $exec +... 
+ +--- +name: write_s102_write_s103_read_flat_scr_base_lo_read_flat_scr_base_hi +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; GCN-LABEL: name: write_s102_write_s103_read_flat_scr_base_lo_read_flat_scr_base_hi + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr102 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr103 = S_MOV_B32 0 + ; GCN-NEXT: S_WAITCNT_DEPCTR 61950 + ; GCN-NEXT: $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_lo, $vgpr0, implicit $exec + ; GCN-NEXT: $vgpr1 = V_ADD_U32_e32 $src_flat_scratch_base_hi, $vgpr0, implicit $exec + $sgpr102 = S_MOV_B32 0 + $sgpr103 = S_MOV_B32 0 + $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_lo, $vgpr0, implicit $exec + $vgpr1 = V_ADD_U32_e32 $src_flat_scratch_base_hi, $vgpr0, implicit $exec +... + +--- +name: write_s102_read_flat_scr_base_lo_cross_blocks +tracksRegLiveness: true +body: | + ; GCN-LABEL: name: write_s102_read_flat_scr_base_lo_cross_blocks + ; GCN: bb.0: + ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; GCN-NEXT: liveins: $vgpr0, $sgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr102 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + ; GCN-NEXT: $sgpr1 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr2 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr3 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr4 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr5 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr6 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr7 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr8 = S_MOV_B32 0 + ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc + ; GCN-NEXT: S_BRANCH %bb.1 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.1: + ; GCN-NEXT: successors: %bb.2(0x80000000) + ; GCN-NEXT: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: $sgpr102 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr1 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr2 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr3 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr4 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr5 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr6 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr7 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr8 = S_MOV_B32 0 + ; GCN-NEXT: $sgpr9 = S_MOV_B32 0 + ; GCN-NEXT: S_BRANCH 
%bb.2 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: bb.2: + ; GCN-NEXT: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: S_WAITCNT_DEPCTR 61950 + ; GCN-NEXT: $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_lo, $vgpr0, implicit $exec + bb.0: + liveins: $vgpr0, $sgpr0 + $sgpr102 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + $sgpr1 = S_MOV_B32 0 + $sgpr2 = S_MOV_B32 0 + $sgpr3 = S_MOV_B32 0 + $sgpr4 = S_MOV_B32 0 + $sgpr5 = S_MOV_B32 0 + $sgpr6 = S_MOV_B32 0 + $sgpr7 = S_MOV_B32 0 + $sgpr8 = S_MOV_B32 0 + S_CBRANCH_SCC0 %bb.2, implicit $scc + S_BRANCH %bb.1 + + bb.1: + liveins: $vgpr0 + $sgpr102 = S_MOV_B32 0 + $sgpr1 = S_MOV_B32 0 + $sgpr2 = S_MOV_B32 0 + $sgpr3 = S_MOV_B32 0 + $sgpr4 = S_MOV_B32 0 + $sgpr5 = S_MOV_B32 0 + $sgpr6 = S_MOV_B32 0 + $sgpr7 = S_MOV_B32 0 + $sgpr8 = S_MOV_B32 0 + $sgpr9 = S_MOV_B32 0 + S_BRANCH %bb.2 + + bb.2: + liveins: $vgpr0 + $vgpr0 = V_ADD_U32_e32 $src_flat_scratch_base_lo, $vgpr0, implicit $exec +... + +--- +name: s_setreg_b32_hwreg_mode +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0 + ; GCN-LABEL: name: s_setreg_b32_hwreg_mode + ; GCN: liveins: $sgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: V_NOP_e32 implicit $exec + ; GCN-NEXT: V_NOP_e32 implicit $exec + ; GCN-NEXT: S_SETREG_B32 $sgpr0, 1, implicit-def $mode, implicit $mode + S_SETREG_B32 $sgpr0, 1, implicit-def $mode, implicit $mode +... + +--- +name: s_setreg_b32_mode +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0 + ; GCN-LABEL: name: s_setreg_b32_mode + ; GCN: liveins: $sgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: V_NOP_e32 implicit $exec + ; GCN-NEXT: V_NOP_e32 implicit $exec + ; GCN-NEXT: S_SETREG_B32_mode $sgpr0, 1, implicit-def $mode, implicit $mode + S_SETREG_B32_mode $sgpr0, 1, implicit-def $mode, implicit $mode +... 
+ +--- +name: s_setreg_imm32_b32_hwreg_mode +tracksRegLiveness: true +body: | + bb.0: + ; GCN-LABEL: name: s_setreg_imm32_b32_hwreg_mode + ; GCN: V_NOP_e32 implicit $exec + ; GCN-NEXT: V_NOP_e32 implicit $exec + ; GCN-NEXT: S_SETREG_IMM32_B32 1, 1, implicit-def $mode, implicit $mode + S_SETREG_IMM32_B32 1, 1, implicit-def $mode, implicit $mode +... + +--- +name: s_setreg_imm32_b32_mode +tracksRegLiveness: true +body: | + bb.0: + ; GCN-LABEL: name: s_setreg_imm32_b32_mode + ; GCN: V_NOP_e32 implicit $exec + ; GCN-NEXT: V_NOP_e32 implicit $exec + ; GCN-NEXT: S_SETREG_IMM32_B32_mode 1, 1, implicit-def $mode, implicit $mode + S_SETREG_IMM32_B32_mode 1, 1, implicit-def $mode, implicit $mode +... diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll index 7ebd69204d87..305461ed6b20 100644 --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -1693,12 +1693,11 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1, ; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v3.l, v7.l ; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v1.l, v0.l +; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX11-DL-TRUE16-NEXT: v_perm_b32 v1, v5, v5, 0xc0c0302 -; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v2.l, v3.l, v0.l ; GFX11-DL-TRUE16-NEXT: v_perm_b32 v2, v4, v4, 0xc0c0302 -; GFX11-DL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-TRUE16-NEXT: v_dot4_u32_u8 v0, v2, v1, v0 ; GFX11-DL-TRUE16-NEXT: global_store_b16 v6, v0, s[4:5] ; GFX11-DL-TRUE16-NEXT: s_endpgm @@ -2724,32 +2723,32 @@ define amdgpu_kernel void @udot4_acc8_vecMul(ptr addrspace(1) %src1, ; GFX11-DL-TRUE16-NEXT: global_load_b32 v4, v0, s[2:3] ; GFX11-DL-TRUE16-NEXT: 
global_load_d16_u8 v0, v5, s[4:5] ; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(2) -; GFX11-DL-TRUE16-NEXT: v_lshrrev_b16 v0.h, 8, v3.l -; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-DL-TRUE16-NEXT: v_lshrrev_b16 v1.l, 8, v4.l ; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 24, v3 +; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 24, v4 +; GFX11-DL-TRUE16-NEXT: v_lshrrev_b16 v0.h, 8, v3.l +; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v3.h, v4.h +; GFX11-DL-TRUE16-NEXT: v_lshrrev_b16 v1.h, 8, v4.l ; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v3.l, v4.l, v0.l -; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v0.h, v0.h, v1.l -; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v1.l, v3.h, v4.h -; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v1.h, v2.l, v6.l +; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v2.l, v2.l, v6.l ; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0 +; GFX11-DL-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l +; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v0.h, v0.h, v1.h ; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-DL-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v0.h -; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v1.l -; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-DL-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.h -; GFX11-DL-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-DL-TRUE16-NEXT: v_or_b16 v6.h, v0.h, v1.l +; GFX11-DL-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v2.l +; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v7.h, v6.l +; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-DL-TRUE16-NEXT: v_lshlrev_b16 v7.l, 8, v0.h +; GFX11-DL-TRUE16-NEXT: v_or_b16 v6.h, v1.l, v2.l +; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-DL-TRUE16-NEXT: v_or_b32_e32 
v1, v7, v6 ; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1 -; GFX11-DL-TRUE16-NEXT: v_or_b32_e32 v2, v2, v6 -; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 8, v2 -; GFX11-DL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v2.l -; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-DL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l +; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v2 ; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v3.h, v4.h, v0.l +; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-DL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l ; GFX11-DL-TRUE16-NEXT: global_store_b8 v5, v0, s[4:5] ; GFX11-DL-TRUE16-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll index 742d87f099ce..31b6b533866d 100644 --- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll +++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll @@ -1715,9 +1715,9 @@ define zeroext i16 @clpeak_umad_pat_i16(i16 zeroext %x, i16 zeroext %y) { ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v1.l, v0.l ; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v0.h, v0.l -; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-FAKE16-LABEL: clpeak_umad_pat_i16: @@ -1745,8 +1745,7 @@ define zeroext i16 @clpeak_umad_pat_i16(i16 zeroext %x, i16 zeroext %y) { ; GFX11-GISEL-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l ; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v0.h -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-FAKE16-LABEL: clpeak_umad_pat_i16: @@ -1777,9 +1776,9 @@ define zeroext i16 @clpeak_umad_pat_i16(i16 zeroext %x, i16 zeroext %y) { ; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v1.l, v0.l ; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h -; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v0.h, v0.l -; GFX1200-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX1200-SDAG-FAKE16-LABEL: clpeak_umad_pat_i16: @@ -1815,8 +1814,7 @@ define zeroext i16 @clpeak_umad_pat_i16(i16 zeroext %x, i16 zeroext %y) { ; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l ; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v0.h -; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1200-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1200-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX1200-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX1200-GISEL-FAKE16-LABEL: clpeak_umad_pat_i16: @@ -9363,9 +9361,9 @@ define zeroext i16 @clpeak_umad_pat_i16_x2(i16 zeroext %x, i16 zeroext %y) { ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; 
GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v0.h, v0.l ; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v0.h, v0.l -; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-FAKE16-LABEL: clpeak_umad_pat_i16_x2: @@ -9409,8 +9407,7 @@ define zeroext i16 @clpeak_umad_pat_i16_x2(i16 zeroext %x, i16 zeroext %y) { ; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l ; GFX11-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v0.h -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-FAKE16-LABEL: clpeak_umad_pat_i16_x2: @@ -9457,9 +9454,9 @@ define zeroext i16 @clpeak_umad_pat_i16_x2(i16 zeroext %x, i16 zeroext %y) { ; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.h, v0.l, v0.h, v0.l ; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v0.l, v0.h -; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1200-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1200-SDAG-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v0.h, v0.l -; GFX1200-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1200-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX1200-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX1200-SDAG-FAKE16-LABEL: clpeak_umad_pat_i16_x2: @@ -9511,8 +9508,7 @@ define zeroext i16 @clpeak_umad_pat_i16_x2(i16 zeroext %x, i16 zeroext %y) { ; GFX1200-GISEL-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l ; GFX1200-GISEL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v0.h -; GFX1200-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1200-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX1200-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX1200-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX1200-GISEL-FAKE16-LABEL: clpeak_umad_pat_i16_x2: diff --git a/llvm/test/CodeGen/AMDGPU/livevars-implicitdef.mir b/llvm/test/CodeGen/AMDGPU/livevars-implicitdef.mir new file mode 100644 index 000000000000..18aeb2527b1a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/livevars-implicitdef.mir @@ -0,0 +1,91 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn --run-pass=livevars -o - %s | FileCheck %s +--- +# Check that super register is defined for an sgpr copy. +name: sgpr_copy +tracksRegLiveness: true +body: | + bb.0: + + ; CHECK-LABEL: name: sgpr_copy + ; CHECK: %sval:sreg_32 = S_MOV_B32 0 + ; CHECK-NEXT: $sgpr0 = COPY %sval + ; CHECK-NEXT: $sgpr1 = COPY %sval + ; CHECK-NEXT: $sgpr2 = COPY %sval + ; CHECK-NEXT: $sgpr3 = COPY killed %sval + ; CHECK-NEXT: SI_RETURN implicit killed $sgpr0_sgpr1_sgpr2_sgpr3 + %sval:sreg_32 = S_MOV_B32 0 + + $sgpr0 = COPY %sval + $sgpr1 = COPY %sval + $sgpr2 = COPY %sval + $sgpr3 = COPY %sval + SI_RETURN implicit $sgpr0_sgpr1_sgpr2_sgpr3 + +... +--- +# Check that super register is defined for a vgpr vector copy. 
+name: vgpr_copy +tracksRegLiveness: true +body: | + bb.0: + + ; CHECK-LABEL: name: vgpr_copy + ; CHECK: %vval:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY %vval + ; CHECK-NEXT: $vgpr1 = COPY %vval + ; CHECK-NEXT: $vgpr2 = COPY %vval + ; CHECK-NEXT: $vgpr3 = COPY killed %vval + ; CHECK-NEXT: dead [[COPY:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0_vgpr1_vgpr2_vgpr3 + %vval:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + + $vgpr0 = COPY %vval + $vgpr1 = COPY %vval + $vgpr2 = COPY %vval + $vgpr3 = COPY %vval + %0:vgpr_32 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + +... +--- +# Check that super register is defined when there is a hole. +name: sgpr_copy_hole +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: sgpr_copy_hole + ; CHECK: %sval:sreg_32 = S_MOV_B32 0 + ; CHECK-NEXT: $sgpr0 = COPY %sval + ; CHECK-NEXT: $sgpr2 = COPY %sval + ; CHECK-NEXT: $sgpr3 = COPY killed %sval + ; CHECK-NEXT: SI_RETURN implicit killed $sgpr0_sgpr1_sgpr2_sgpr3 + %sval:sreg_32 = S_MOV_B32 0 + + $sgpr0 = COPY %sval + $sgpr2 = COPY %sval + $sgpr3 = COPY %sval + SI_RETURN implicit $sgpr0_sgpr1_sgpr2_sgpr3 + +... +--- +# Check that super register is defined when a pair interrupts the sequence. +name: vgpr_copy_pair +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: vgpr_copy_pair + ; CHECK: %vval:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: $vgpr0 = COPY %vval + ; CHECK-NEXT: $vgpr1 = COPY %vval + ; CHECK-NEXT: $vgpr2 = COPY %vval + ; CHECK-NEXT: $vgpr3 = COPY killed %vval + ; CHECK-NEXT: dead [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1_vgpr2 + ; CHECK-NEXT: dead [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0_vgpr1_vgpr2_vgpr3 + %vval:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + + $vgpr0 = COPY %vval + $vgpr1 = COPY %vval + $vgpr2 = COPY %vval + $vgpr3 = COPY %vval + %0:vgpr_32 = COPY $vgpr1_vgpr2 + %1:vgpr_32 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll index a42c71c4849b..c1a32aafbc71 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll @@ -1259,13 +1259,12 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, 4.0, v3.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 @@ -1371,13 +1370,12 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, 4.0, v3.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; 
GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 @@ -1646,13 +1644,12 @@ define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, 4.0, v3.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 @@ -1763,13 +1760,12 @@ define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, 4.0, v3.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; 
GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 @@ -2044,13 +2040,12 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 ; GFX12-TRUE16-NEXT: v_add_f16_e32 v4.l, 4.0, v4.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 @@ -2153,13 +2148,12 @@ define void @local_atomic_fadd_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 ; GFX11-TRUE16-NEXT: v_add_f16_e32 v4.l, 4.0, v4.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 ; 
GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 @@ -2419,11 +2413,11 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_add_f16_e32 v4.l, 4.0, v4.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 @@ -2531,11 +2525,11 @@ define void @local_atomic_fadd_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_f16_e32 v4.l, 4.0, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 @@ -2795,10 +2789,9 @@ define half 
@local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_add_f16_e32 v1.l, 4.0, v2.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 @@ -2882,10 +2875,9 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, 4.0, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 @@ -3095,8 +3087,8 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_add_f16_e32 v2.l, 4.0, v1.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 
+; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 @@ -3177,8 +3169,8 @@ define void @local_atomic_fadd_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_add_f16_e32 v2.l, 4.0, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll index 8351d2805756..739e86d1928b 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll @@ -803,14 +803,14 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v3.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, 4.0, v3.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 @@ -918,14 +918,14 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v3.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, 4.0, v3.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 @@ -1199,14 +1199,14 @@ define half @local_atomic_fmax_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v3.l ; GFX12-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, 4.0, v3.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 @@ -1319,14 +1319,14 @@ define half @local_atomic_fmax_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v3.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, 4.0, v3.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 @@ -1606,14 +1606,14 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v4.l, v4.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, 4.0, v4.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 @@ -1718,14 +1718,14 @@ define void @local_atomic_fmax_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v4.l, v4.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, 4.0, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 @@ -1990,13 +1990,12 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; 
GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v4.l, v4.l ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, 4.0, v4.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 @@ -2107,13 +2106,12 @@ define void @local_atomic_fmax_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v4.l, v4.l ; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, 4.0, v4.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 @@ -2379,11 +2377,11 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX12-TRUE16-NEXT: ; =>This 
Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, 4.0, v1.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 @@ -2469,11 +2467,11 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, 4.0, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 @@ -2688,10 +2686,9 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 
v2.l, v1.l, v1.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, 4.0, v2.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 @@ -2775,10 +2772,9 @@ define void @local_atomic_fmax_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v1.l, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, 4.0, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll index 0c4aca88b378..6da80262951e 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll @@ -803,14 +803,14 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v3.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, 4.0, v3.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 @@ -918,14 +918,14 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v3.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, 4.0, v3.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 @@ -1199,14 +1199,14 @@ define half @local_atomic_fmin_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; 
GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v3.l, v3.l, v3.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v3.l, 4.0, v3.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 @@ -1319,14 +1319,14 @@ define half @local_atomic_fmin_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v3.l, v3.l, v3.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_min_f16_e32 v3.l, 4.0, v3.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; 
GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 @@ -1606,14 +1606,14 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v4.l, v4.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v4.l, 4.0, v4.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 @@ -1718,14 +1718,14 @@ define void @local_atomic_fmin_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 ; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v4.l, v4.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_min_f16_e32 v4.l, 4.0, v4.l 
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 @@ -1990,13 +1990,12 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v4.l, v4.l, v4.l ; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v4.l, 4.0, v4.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 @@ -2107,13 +2106,12 @@ define void @local_atomic_fmin_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v4.l, v4.l, v4.l ; GFX11-TRUE16-NEXT: v_min_f16_e32 v4.l, 4.0, v4.l ; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 @@ -2379,11 +2377,11 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v1.l, v2.l, v2.l ; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v1.l, 4.0, v1.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 @@ -2469,11 +2467,11 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v1.l, v2.l, v2.l ; GFX11-TRUE16-NEXT: v_min_f16_e32 v1.l, 4.0, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) 
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 @@ -2688,10 +2686,9 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_max_num_f16_e32 v2.l, v1.l, v1.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_min_num_f16_e32 v2.l, 4.0, v2.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 @@ -2775,10 +2772,9 @@ define void @local_atomic_fmin_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_max_f16_e32 v2.l, v1.l, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_min_f16_e32 v2.l, 4.0, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll 
b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll index 37310b614c0d..786989cc9fb5 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll @@ -1721,13 +1721,12 @@ define half @local_atomic_fsub_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, -4.0, v3.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 @@ -1833,13 +1832,12 @@ define half @local_atomic_fsub_ret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v0, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, -4.0, v3.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v0, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; 
GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v1, v3, v4 @@ -2108,13 +2106,12 @@ define half @local_atomic_fsub_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX12-TRUE16-NEXT: v_add_f16_e32 v3.l, -4.0, v3.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 @@ -2225,13 +2222,12 @@ define half @local_atomic_fsub_ret_f16__offset(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, v1, v4 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 ; GFX11-TRUE16-NEXT: v_add_f16_e32 v3.l, -4.0, v3.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, v1, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v3, v4, v2, v3 ; 
GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v3, v0, v3, v4 @@ -2506,13 +2502,12 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX12-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 ; GFX12-TRUE16-NEXT: v_add_f16_e32 v4.l, -4.0, v4.l ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 @@ -2615,13 +2610,12 @@ define void @local_atomic_fsub_noret_f16(ptr addrspace(3) %ptr) nounwind { ; GFX11-TRUE16-NEXT: .LBB10_1: ; %atomicrmw.start ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v0, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 ; GFX11-TRUE16-NEXT: v_add_f16_e32 v4.l, -4.0, v4.l ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v0, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v2, v3, v4 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; 
GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v1, v4, v2 @@ -2881,11 +2875,11 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_add_f16_e32 v4.l, -4.0, v4.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 @@ -2993,11 +2987,11 @@ define void @local_atomic_fsub_noret_f16__offset(ptr addrspace(3) %ptr) nounwind ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v1, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_f16_e32 v4.l, -4.0, v4.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v1, v4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v4, v3, v2, v4 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v4, v0, v4, v3 @@ -3257,10 +3251,9 @@ define half @local_atomic_fsub_ret_f16__offset__align4(ptr 
addrspace(3) %ptr) no ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, v1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_add_f16_e32 v1.l, -4.0, v2.l -; GFX12-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 @@ -3344,10 +3337,9 @@ define half @local_atomic_fsub_ret_f16__offset__align4(ptr addrspace(3) %ptr) no ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, -4.0, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:65534 @@ -3557,8 +3549,8 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX12-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX12-TRUE16-NEXT: s_wait_dscnt 0x0 ; GFX12-TRUE16-NEXT: v_add_f16_e32 v2.l, -4.0, v1.l -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 +; GFX12-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) ; GFX12-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 ; GFX12-TRUE16-NEXT: s_wait_storecnt 0x0 ; GFX12-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 @@ -3639,8 +3631,8 @@ define void @local_atomic_fsub_noret_f16__offset__align4(ptr addrspace(3) %ptr) ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_add_f16_e32 v2.l, -4.0, v1.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_and_or_b32 v2, 0xffff0000, v1, v2 ; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-TRUE16-NEXT: ds_cmpstore_rtn_b32 v2, v0, v2, v1 offset:65534 diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll index 811e25587d3d..eab92668c536 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll @@ -2382,13 +2382,22 @@ define <4 x half> @v_mad_mix_v4f32_clamp_precvt(<4 x half> %src0, <4 x half> %sr } define i32 @mixlo_zext(float %src0, float %src1, float %src2) #0 { -; GFX1100-LABEL: mixlo_zext: -; GFX1100: ; %bb.0: -; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX1100-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1100-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX1100-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX1100-TRUE16-LABEL: mixlo_zext: +; SDAG-GFX1100-TRUE16: ; %bb.0: +; SDAG-GFX1100-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-TRUE16-NEXT: v_fma_mixlo_f16 v1, v0, v1, v2 +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 +; SDAG-GFX1100-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; SDAG-GFX1100-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l +; SDAG-GFX1100-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX1100-FAKE16-LABEL: 
mixlo_zext: +; SDAG-GFX1100-FAKE16: ; %bb.0: +; SDAG-GFX1100-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-GFX1100-FAKE16-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 +; SDAG-GFX1100-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX1100-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SDAG-GFX1100-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX900-LABEL: mixlo_zext: ; GFX900: ; %bb.0: @@ -2418,6 +2427,14 @@ define i32 @mixlo_zext(float %src0, float %src1, float %src2) #0 { ; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2 ; SDAG-CI-NEXT: s_setpc_b64 s[30:31] ; +; GISEL-GFX1100-LABEL: mixlo_zext: +; GISEL-GFX1100: ; %bb.0: +; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 +; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX1100-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31] +; ; GISEL-CI-LABEL: mixlo_zext: ; GISEL-CI: ; %bb.0: ; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/mad.u16.ll b/llvm/test/CodeGen/AMDGPU/mad.u16.ll index ef80323a98ec..fbf8011fd40c 100644 --- a/llvm/test/CodeGen/AMDGPU/mad.u16.ll +++ b/llvm/test/CodeGen/AMDGPU/mad.u16.ll @@ -179,8 +179,7 @@ define i32 @v_mad_u16_zext(i16 %arg0, i16 %arg1, i16 %arg2) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, v2.l -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: v_mad_u16_zext: @@ -222,9 +221,9 @@ define i64 @v_mad_u16_zext64(i16 %arg0, i16 %arg1, i16 %arg2) { ; GFX11-TRUE16-LABEL: v_mad_u16_zext64: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX11-TRUE16-NEXT: v_mad_u16 v0.l, v0.l, v1.l, v2.l -; GFX11-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: v_mad_u16_zext64: diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll index cf9a700cd64f..b8f9571ccc2e 100644 --- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll +++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll @@ -5,6 +5,7 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX1100 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 < %s | FileCheck -check-prefixes=GFX11,GFX1150 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx11-generic --amdhsa-code-object-version=6 < %s | FileCheck -check-prefixes=GFX11,GFX1100 %s ; On GFX11, ensure vdst and src2 do not partially overlap. Full overlap is ok. 
@@ -54,6 +55,13 @@ define i64 @mad_i64_i32_sextops(i32 %arg0, i32 %arg1, i64 %arg2) #0 { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v0, v1, v[2:3] ; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: mad_i64_i32_sextops: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mad_nc_i64_i32 v[0:1], v0, v1, v[2:3] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %sext0 = sext i32 %arg0 to i64 %sext1 = sext i32 %arg1 to i64 %mul = mul i64 %sext0, %sext1 @@ -106,6 +114,13 @@ define i64 @mad_i64_i32_sextops_commute(i32 %arg0, i32 %arg1, i64 %arg2) #0 { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v0, v1, v[2:3] ; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: mad_i64_i32_sextops_commute: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mad_nc_i64_i32 v[0:1], v0, v1, v[2:3] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %sext0 = sext i32 %arg0 to i64 %sext1 = sext i32 %arg1 to i64 %mul = mul i64 %sext0, %sext1 @@ -158,6 +173,13 @@ define i64 @mad_u64_u32_zextops(i32 %arg0, i32 %arg1, i64 %arg2) #0 { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v0, v1, v[2:3] ; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: mad_u64_u32_zextops: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v0, v1, v[2:3] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %sext0 = zext i32 %arg0 to i64 %sext1 = zext i32 %arg1 to i64 %mul = mul i64 %sext0, %sext1 @@ -210,6 +232,13 @@ define i64 @mad_u64_u32_zextops_commute(i32 %arg0, i32 %arg1, i64 %arg2) #0 { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v0, v1, v[2:3] ; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: mad_u64_u32_zextops_commute: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; 
GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v0, v1, v[2:3] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %sext0 = zext i32 %arg0 to i64 %sext1 = zext i32 %arg1 to i64 %mul = mul i64 %sext0, %sext1 @@ -393,6 +422,38 @@ define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 { ; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v9, v5, vcc_lo ; GFX12-NEXT: s_wait_alu 0xfffd ; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: mad_i64_i32_sextops_i32_i128: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v9, 0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v1, v9 +; GFX1250-NEXT: v_mov_b32_e32 v21, v9 +; GFX1250-NEXT: v_mul_u64_e32 v[10:11], v[0:1], v[8:9] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_ashrrev_i32 v12, 31, v0 :: v_dual_mov_b32 v8, v11 +; GFX1250-NEXT: v_dual_ashrrev_i32 v7, 31, v6 :: v_dual_mov_b32 v13, v12 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_mad_nc_u64_u32 v[14:15], v12, v6, v[8:9] +; GFX1250-NEXT: v_mul_u64_e32 v[16:17], v[6:7], v[12:13] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_mov_b32_e32 v8, v14 +; GFX1250-NEXT: v_mad_nc_u64_u32 v[18:19], v0, v7, v[8:9] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v8, v15 :: v_dual_mov_b32 v20, v19 +; GFX1250-NEXT: v_add_nc_u64_e32 v[8:9], v[8:9], v[20:21] +; GFX1250-NEXT: v_mad_nc_i64_i32 v[0:1], v7, v0, v[16:17] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_mad_nc_u64_u32 v[8:9], v12, v7, v[8:9] +; GFX1250-NEXT: v_add_nc_u64_e32 v[6:7], 
v[8:9], v[0:1] +; GFX1250-NEXT: v_add_co_u32 v0, vcc_lo, v10, v2 +; GFX1250-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v18, v3, vcc_lo +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX1250-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v6, v4, vcc_lo +; GFX1250-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v7, v5, vcc_lo +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %sext0 = sext i32 %arg0 to i128 %sext1 = sext i32 %arg1 to i128 %mul = mul i128 %sext0, %sext1 @@ -445,6 +506,13 @@ define i63 @mad_i64_i32_sextops_i32_i63(i32 %arg0, i32 %arg1, i63 %arg2) #0 { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v0, v1, v[2:3] ; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: mad_i64_i32_sextops_i32_i63: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mad_nc_i64_i32 v[0:1], v0, v1, v[2:3] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %sext0 = sext i32 %arg0 to i63 %sext1 = sext i32 %arg1 to i63 %mul = mul i63 %sext0, %sext1 @@ -510,6 +578,16 @@ define i63 @mad_i64_i32_sextops_i31_i63(i31 %arg0, i31 %arg1, i63 %arg2) #0 { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v0, v1, v[2:3] ; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: mad_i64_i32_sextops_i31_i63: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_bfe_i32 v1, v1, 0, 31 +; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 31 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_mad_nc_i64_i32 v[0:1], v0, v1, v[2:3] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %sext0 = sext i31 %arg0 to i63 %sext1 = sext i31 %arg1 to i63 %mul = mul i63 %sext0, %sext1 @@ -585,6 +663,17 @@ define i64 @mad_i64_i32_extops_i32_i64(i32 %arg0, i32 %arg1, i64 %arg2) #0 { ; GFX12-NEXT: v_ashrrev_i32_e32 v2, 31, v5 ; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v2, v4, v[1:2] ; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX1250-LABEL: mad_i64_i32_extops_i32_i64: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v5, v4, v[2:3] +; GFX1250-NEXT: v_ashrrev_i32_e32 v2, 31, v5 +; GFX1250-NEXT: v_mad_u32 v1, v2, v4, v1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %ext0 = sext i32 %arg0 to i64 %ext1 = zext i32 %arg1 to i64 %mul = mul i64 %ext0, %ext1 @@ -637,6 +726,13 @@ define i64 @mad_u64_u32_bitops(i64 %arg0, i64 %arg1, i64 %arg2) #0 { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v0, v2, v[4:5] ; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: mad_u64_u32_bitops: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v0, v2, v[4:5] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %trunc.lhs = and i64 %arg0, 4294967295 %trunc.rhs = and i64 %arg1, 4294967295 %mul = mul i64 %trunc.lhs, %trunc.rhs @@ -711,6 +807,17 @@ define i64 @mad_u64_u32_bitops_lhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) # ; GFX12-NEXT: v_and_b32_e32 v3, 1, v3 ; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v3, v2, v[1:2] ; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: mad_u64_u32_bitops_lhs_mask_small: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v3, v1 +; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v0, v2, v[4:5] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX1250-NEXT: v_mad_u32 v1, v3, v2, v1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %trunc.lhs = and i64 %arg0, 8589934591 %trunc.rhs = and i64 %arg1, 4294967295 %mul = mul i64 %trunc.lhs, %trunc.rhs @@ -786,6 +893,17 @@ define i64 
@mad_u64_u32_bitops_rhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) # ; GFX12-NEXT: v_and_b32_e32 v2, 1, v3 ; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v6, v2, v[1:2] ; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: mad_u64_u32_bitops_rhs_mask_small: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mov_b32_e32 v6, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v6, v2, v[4:5] +; GFX1250-NEXT: v_and_b32_e32 v2, 1, v3 +; GFX1250-NEXT: v_mad_u32 v1, v6, v2, v1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %trunc.lhs = and i64 %arg0, 4294967295 %trunc.rhs = and i64 %arg1, 8589934591 %mul = mul i64 %trunc.lhs, %trunc.rhs @@ -838,6 +956,13 @@ define i64 @mad_i64_i32_bitops(i64 %arg0, i64 %arg1, i64 %arg2) #0 { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v0, v2, v[4:5] ; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: mad_i64_i32_bitops: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mad_nc_i64_i32 v[0:1], v0, v2, v[4:5] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %shl.lhs = shl i64 %arg0, 32 %trunc.lhs = ashr i64 %shl.lhs, 32 %shl.rhs = shl i64 %arg1, 32 @@ -893,6 +1018,13 @@ define i64 @mad_i64_i32_unpack_i64ops(i64 %arg0) #0 { ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v1, v0, v[0:1] ; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: mad_i64_i32_unpack_i64ops: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v1, v0, v[0:1] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %tmp4 = lshr i64 %arg0, 32 %tmp5 = and i64 %arg0, 4294967295 %mul = mul nuw i64 %tmp4, %tmp5 @@ -982,6 +1114,25 @@ define amdgpu_kernel void @mad_i64_i32_uniform(ptr addrspace(1) %out, i32 %arg0, ; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: 
v_dual_mov_b32 v1, s3 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX1250-LABEL: mad_i64_i32_uniform: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX1250-NEXT: s_wait_xcnt 0x0 +; GFX1250-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX1250-NEXT: s_mov_b32 s7, 0 +; GFX1250-NEXT: v_mov_b32_e32 v2, 0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_mov_b32 s6, s2 +; GFX1250-NEXT: s_mov_b32 s2, s3 +; GFX1250-NEXT: s_mov_b32 s3, s7 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: s_mul_u64 s[2:3], s[6:7], s[2:3] +; GFX1250-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX1250-NEXT: s_endpgm %ext0 = zext i32 %arg0 to i64 %ext1 = zext i32 %arg1 to i64 %mul = mul i64 %ext0, %ext1 @@ -1055,6 +1206,17 @@ define i64 @mad_i64_i32_twice(i32 %arg0, i32 %arg1, i64 %arg2, i64 %arg3) #0 { ; GFX12-NEXT: v_xor_b32_e32 v0, v2, v0 ; GFX12-NEXT: v_xor_b32_e32 v1, v3, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: mad_i64_i32_twice: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mad_nc_i64_i32 v[2:3], v0, v1, v[2:3] +; GFX1250-NEXT: v_mad_nc_i64_i32 v[0:1], v0, v1, v[4:5] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_xor_b32_e32 v0, v2, v0 +; GFX1250-NEXT: v_xor_b32_e32 v1, v3, v1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %sext0 = sext i32 %arg0 to i64 %sext1 = sext i32 %arg1 to i64 %mul = mul i64 %sext0, %sext1 @@ -1174,6 +1336,26 @@ define i64 @mad_i64_i32_thrice(i32 %arg0, i32 %arg1, i64 %arg2, i64 %arg3, i64 % ; GFX12-NEXT: v_xor_b32_e32 v0, v2, v0 ; GFX12-NEXT: v_xor_b32_e32 v1, v3, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: mad_i64_i32_thrice: +; GFX1250: ; 
%bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_ashrrev_i32 v1, 31, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GFX1250-NEXT: v_mul_u64_e32 v[0:1], v[0:1], v[8:9] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], v[0:1], v[2:3] +; GFX1250-NEXT: v_add_nc_u64_e32 v[4:5], v[0:1], v[4:5] +; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], v[0:1], v[6:7] +; GFX1250-NEXT: v_xor_b32_e32 v2, v2, v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_xor_b32_e32 v3, v3, v5 +; GFX1250-NEXT: v_xor_b32_e32 v0, v2, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-NEXT: v_xor_b32_e32 v1, v3, v1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %sext0 = sext i32 %arg0 to i64 %sext1 = sext i32 %arg1 to i64 %mul = mul i64 %sext0, %sext1 @@ -1256,6 +1438,21 @@ define i64 @mad_i64_i32_secondary_use(i32 %arg0, i32 %arg1, i64 %arg2) #0 { ; GFX12-NEXT: v_xor_b32_e32 v0, v2, v0 ; GFX12-NEXT: v_xor_b32_e32 v1, v3, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: mad_i64_i32_secondary_use: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_ashrrev_i32 v1, 31, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GFX1250-NEXT: v_mul_u64_e32 v[0:1], v[0:1], v[4:5] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_add_nc_u64_e32 v[2:3], v[0:1], v[2:3] +; GFX1250-NEXT: v_xor_b32_e32 v0, v2, v0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1250-NEXT: v_xor_b32_e32 v1, v3, v1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %sext0 = sext i32 %arg0 
to i64 %sext1 = sext i32 %arg1 to i64 %mul = mul i64 %sext0, %sext1 @@ -1328,6 +1525,18 @@ define i48 @mad_i48_i48(i48 %arg0, i48 %arg1, i48 %arg2) #0 { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_add3_u32 v1, v2, v1, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: mad_i48_i48: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], v6, v2, v[4:5] +; GFX1250-NEXT: v_mad_u32 v1, v7, v2, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_mad_u32 v1, v6, v3, v1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %m = mul i48 %arg0, %arg1 %a = add i48 %m, %arg2 ret i48 %a @@ -1391,6 +1600,15 @@ define i64 @lshr_mad_i64_1(i64 %arg0, i64 %arg1) #0 { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, 0xfffffc19, v2, v[0:1] ; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: lshr_mad_i64_1: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v2, v1 :: v_dual_mov_b32 v1, 0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], 0xfffffc19, v2, v[0:1] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %lsh = lshr i64 %arg0, 32 %mul = mul i64 %lsh, s0xfffffffffffffc19 %mad = add i64 %mul, %arg0 @@ -1456,6 +1674,15 @@ define i64 @lshr_mad_i64_2(i64 %arg0) #0 { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, 0xd1, v2, v[0:1] ; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: lshr_mad_i64_2: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v2, v1 :: v_dual_mov_b32 v1, 0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: 
v_mad_nc_u64_u32 v[0:1], 0xd1, v2, v[0:1] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %lsh = lshr i64 %arg0, 32 %mul = mul i64 %lsh, s0xffffffff000000d1 %mad = add i64 %mul, %arg0 @@ -1521,6 +1748,15 @@ define i64 @lshr_mad_i64_3(i64 %arg0) #0 { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, 0xfffffc88, v2, v[0:1] ; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: lshr_mad_i64_3: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v2, v1 :: v_dual_mov_b32 v1, 0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], 0xfffffc88, v2, v[0:1] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %lsh = lshr i64 %arg0, 32 %mul = mul i64 s0xfffffffffffffc88, %lsh %mad = add i64 %mul, %arg0 @@ -1602,6 +1838,19 @@ define i64 @lshr_mad_i64_4(i32 %arg0, i64 %arg1) #0 { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, 0xfffffc88, v0, v[3:4] ; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: lshr_mad_i64_4: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 +; GFX1250-NEXT: v_mov_b32_e32 v1, 0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_mul_u64_e32 v[2:3], v[2:3], v[0:1] +; GFX1250-NEXT: v_mov_b32_e32 v0, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_mad_nc_u64_u32 v[0:1], 0xfffffc88, v3, v[0:1] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %ext = zext i32 %arg0 to i64 %mul1 = mul i64 %arg1, %ext %lsh = lshr i64 %mul1, 32 @@ -1666,6 +1915,15 @@ define i64 @lshr_mad_i64_negative_1(i64 %arg0) #0 { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, 0xfffffc19, v2, v[0:1] ; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: lshr_mad_i64_negative_1: +; GFX1250: ; 
%bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshrrev_b32_e32 v2, 4, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_mad_nc_i64_i32 v[0:1], 0xfffffc19, v2, v[0:1] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %lsh = lshr i64 %arg0, 36 %mul = mul i64 %lsh, s0xfffffffffffffc19 %mad = add i64 %mul, %arg0 @@ -1729,6 +1987,16 @@ define i64 @lshr_mad_i64_negative_2(i64 %arg0) #0 { ; GFX12-NEXT: v_sub_nc_u32_e32 v1, v3, v0 ; GFX12-NEXT: v_mov_b32_e32 v0, v2 ; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: lshr_mad_i64_negative_2: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mad_nc_u64_u32 v[2:3], 0xd1, v1, v[0:1] +; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 8, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_sub_nc_u32 v1, v3, v0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %lsh = lshr i64 %arg0, 32 %mul = mul i64 %lsh, s0xffffff00000000d1 %mad = add i64 %mul, %arg0 @@ -1803,6 +2071,18 @@ define i64 @lshr_mad_i64_negative_3(i64 %arg0) #0 { ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: lshr_mad_i64_negative_3: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_lshrrev_b64 v[2:3], 22, v[0:1] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_and_b32_e32 v2, 0xfffffc00, v2 +; GFX1250-NEXT: v_sub_nc_u64_e32 v[0:1], v[0:1], v[2:3] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_add_nc_u64_e32 v[0:1], 1, v[0:1] +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %op = add i64 %arg0, 1 %lsh = lshr i64 %arg0, 32 %mul = mul i64 %lsh, s0xfffffffffffffc00 @@ -1878,6 +2158,16 @@ define i64 @lshr_mad_i64_negative_4(i64 %arg0) #0 { ; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_3) ; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: lshr_mad_i64_negative_4: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mad_nc_u64_u32 v[2:3], v1, v0, v[0:1] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_mad_u32 v1, v1, v1, v3 +; GFX1250-NEXT: v_mov_b32_e32 v0, v2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %lsh = lshr i64 %arg0, 32 %mul = mul i64 %lsh, %arg0 %mad = add i64 %mul, %arg0 @@ -1938,6 +2228,16 @@ define amdgpu_ps i64 @lshr_mad_i64_sgpr(i64 inreg %arg0) #0 { ; GFX12-NEXT: s_mul_u64 s[2:3], s[2:3], s[4:5] ; GFX12-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1] ; GFX12-NEXT: ; return to shader part epilog +; +; GFX1250-LABEL: lshr_mad_i64_sgpr: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_mov_b32 s3, 0 +; GFX1250-NEXT: s_mov_b32 s2, s1 +; GFX1250-NEXT: s_mov_b64 s[4:5], lit64(0xffffffffffff1c18) +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1250-NEXT: s_mul_u64 s[2:3], s[2:3], s[4:5] +; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1] +; GFX1250-NEXT: ; return to shader part epilog %lsh = lshr i64 %arg0, 32 %mul = mul i64 %lsh, s0xffffffffffff1c18 %mad = add i64 %mul, %arg0 @@ -2018,6 +2318,17 @@ define <2 x i64> @lshr_mad_i64_vec(<2 x i64> %arg0) #0 { ; GFX12-NEXT: v_sub_nc_u32_e32 v3, v7, v3 ; GFX12-NEXT: v_mov_b32_e32 v2, v6 ; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX1250-LABEL: lshr_mad_i64_vec: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_mad_nc_u64_u32 v[4:5], 0xffff1c18, v1, v[0:1] +; GFX1250-NEXT: v_mad_nc_u64_u32 v[6:7], 0xffff1118, v3, v[2:3] +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_dual_sub_nc_u32 v1, v5, v1 :: v_dual_sub_nc_u32 v3, v7, v3 +; GFX1250-NEXT: v_dual_mov_b32 v0, v4 :: 
v_dual_mov_b32 v2, v6 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] %lsh = lshr <2 x i64> %arg0, %mul = mul <2 x i64> %lsh, %mad = add <2 x i64> %mul, %arg0 diff --git a/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll b/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll index 3ce09475c094..79910af5c043 100644 --- a/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll +++ b/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll @@ -374,7 +374,7 @@ define i32 @shl_i16_zext_i32(i16 %x, i16 %y) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, v1.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: shl_i16_zext_i32: @@ -412,7 +412,7 @@ define i32 @lshr_i16_zext_i32(i16 %x, i16 %y) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_lshrrev_b16 v0.l, v1.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: lshr_i16_zext_i32: @@ -450,7 +450,7 @@ define i32 @ashr_i16_zext_i32(i16 %x, i16 %y) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_ashrrev_i16 v0.l, v1.l, v0.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: ashr_i16_zext_i32: @@ -488,7 +488,7 @@ define i32 @add_u16_zext_i32(i16 %x, i16 %y) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: add_u16_zext_i32: @@ -526,7 +526,7 @@ define i32 @sub_u16_zext_i32(i16 %x, i16 %y) { ; 
GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_sub_nc_u16 v0.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: sub_u16_zext_i32: @@ -564,7 +564,7 @@ define i32 @mul_lo_u16_zext_i32(i16 %x, i16 %y) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: mul_lo_u16_zext_i32: @@ -602,7 +602,7 @@ define i32 @min_u16_zext_i32(i16 %x, i16 %y) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_min_u16 v0.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: min_u16_zext_i32: @@ -641,7 +641,7 @@ define i32 @min_i16_zext_i32(i16 %x, i16 %y) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_min_i16 v0.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: min_i16_zext_i32: @@ -680,7 +680,7 @@ define i32 @max_u16_zext_i32(i16 %x, i16 %y) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_max_u16 v0.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: max_u16_zext_i32: @@ -719,7 +719,7 @@ define i32 @max_i16_zext_i32(i16 %x, i16 %y) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX11-TRUE16-NEXT: v_max_i16 v0.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: max_i16_zext_i32: @@ -758,7 +758,7 @@ define i32 @zext_fadd_f16(half %x, half %y) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: zext_fadd_f16: @@ -797,8 +797,10 @@ define i32 @zext_fma_f16(half %x, half %y, half %z) { ; GFX11-TRUE16-LABEL: zext_fma_f16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_fmac_f16_e32 v2.l, v0.l, v1.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX11-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v0.h, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: zext_fma_f16: @@ -838,7 +840,7 @@ define i32 @zext_div_fixup_f16(half %x, half %y, half %z) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_div_fixup_f16 v0.l, v0.l, v1.l, v2.l -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: zext_div_fixup_f16: @@ -880,7 +882,7 @@ define i32 @zext_fptrunc_f16(float %x) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: zext_fptrunc_f16: @@ -924,12 +926,20 @@ define i32 
@zext_fptrunc_fma_f16(float %x, float %y, float %z) { ; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: zext_fptrunc_fma_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: zext_fptrunc_fma_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_fma_mixlo_f16 v1, v0, v1, v2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: zext_fptrunc_fma_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %fma = call float @llvm.fma.f32(float %x, float %y, float %z) %fptrunc = fptrunc float %fma to half %cast = bitcast half %fptrunc to i16 @@ -940,3 +950,5 @@ define i32 @zext_fptrunc_fma_f16(float %x, float %y, float %z) { declare half @llvm.amdgcn.div.fixup.f16(half, half, half) declare half @llvm.fma.f16(half, half, half) declare float @llvm.fma.f32(float, float, float) +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX11: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll index 21aa40d69998..91c88ec5e718 100644 --- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll @@ -1528,10 +1528,9 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out ; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3] +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-SDAG-TRUE16-NEXT: v_sub_nc_u16 v0.l, v0.l, 64 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-SDAG-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-SDAG-TRUE16-NEXT: s_endpgm ; @@ -1560,10 +1559,9 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_zext_to_i32(ptr addrspace(1) %out ; GFX11-GISEL-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-TRUE16-NEXT: global_load_d16_b16 v0, v0, s[2:3] +; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0 ; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) ; GFX11-GISEL-TRUE16-NEXT: v_add_nc_u16 v0.l, 0xffc0, v0.l -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-GISEL-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-GISEL-TRUE16-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-add.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-add.ll index 30ed6ae5484c..334215125f58 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-add.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-add.ll @@ -300,17 +300,15 @@ define i8 @test_vector_reduce_add_v4i8(<4 x i8> %v) { ; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_add_v4i8: ; GFX11-SDAG-TRUE16: ; 
%bb.0: ; %entry ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.l, v1.l, v3.l +; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, v1.l, v3.l ; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v2.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v0.h ; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v2 +; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1 ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -348,17 +346,15 @@ define i8 @test_vector_reduce_add_v4i8(<4 x i8> %v) { ; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.l, v1.l, v3.l +; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, v1.l, v3.l ; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v2.l -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 +; GFX12-SDAG-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v0.h ; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX12-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v2 +; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1 ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l ; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -518,21 +514,19 @@ define i8 @test_vector_reduce_add_v8i8(<8 x i8> %v) { ; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_add_v8i8: ; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, v2.l, v6.l ; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.h, v3.l, v7.l ; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.l, v1.l, v5.l ; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v4.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, v1.l, v1.h -; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.l, v2.l, v6.l -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v0.h -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.l, v1.l, v1.h +; 
GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l ; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v2.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l ; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1 ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l @@ -581,21 +575,19 @@ define i8 @test_vector_reduce_add_v8i8(<8 x i8> %v) { ; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, v2.l, v6.l ; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.h, v3.l, v7.l ; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.l, v1.l, v5.l ; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v4.l -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, v1.l, v1.h -; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.l, v2.l, v6.l -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v0.h -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.l, v1.l, v1.h +; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 
v1.h, 0 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l ; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v2.l +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l ; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1 ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l @@ -832,28 +824,25 @@ define i8 @test_vector_reduce_add_v16i8(<16 x i8> %v) { ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.h, v5.l, v13.l ; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.l, v1.l, v9.l -; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v3.h, v7.l, v15.l -; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v3.l, v3.l, v11.l -; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v2.h, v6.l, v14.l +; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v5.l, v7.l, v15.l +; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, v6.l, v14.l ; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v2.l, v2.l, v10.l -; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.l, v1.l, v1.h -; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.h, v4.l, v12.l -; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, v3.l, v3.h +; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v2.h, v3.l, v11.l +; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v3.l, v4.l, v12.l ; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v8.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, v1.l, v0.h -; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.l, v2.l, v2.h -; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.h +; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 
v1.l, v1.l, v1.h +; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, v2.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v2.l, v2.h, v5.l +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 +; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v3.l ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v0.h -; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l +; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.l, v1.l, v2.l +; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l ; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v2.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l ; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1 ; GFX11-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] @@ -922,28 +911,25 @@ define i8 @test_vector_reduce_add_v16i8(<16 x i8> %v) { ; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.h, v5.l, v13.l ; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.l, v1.l, v9.l -; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v3.h, v7.l, v15.l -; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v3.l, v3.l, v11.l -; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v2.h, v6.l, v14.l +; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v5.l, v7.l, v15.l +; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, v6.l, v14.l ; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 
v2.l, v2.l, v10.l -; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.l, v1.l, v1.h -; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.h, v4.l, v12.l -; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, v3.l, v3.h +; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v2.h, v3.l, v11.l +; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v3.l, v4.l, v12.l ; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v8.l -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, v1.l, v0.h -; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.l, v2.l, v2.h -; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.h +; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.l, v1.l, v1.h +; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, v2.l, v0.h +; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v2.l, v2.h, v5.l +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 +; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v3.l ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b16 v2.l, 8, v0.h -; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l +; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v1.l, v1.l, v2.l +; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v0.h ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l ; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX12-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v2.l -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l ; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, 
v1 ; GFX12-SDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l ; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll index aab0e76410cc..1d3b42ee43b0 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-umin.ll @@ -374,13 +374,12 @@ define i8 @test_vector_reduce_umin_v4i8(<4 x i8> %v) { ; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l ; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-SDAG-TRUE16-NEXT: v_min_u16 v1.l, v1.l, v1.h +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_min3_u16 v0.l, v0.l, v0.h, v1.l ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -427,13 +426,12 @@ define i8 @test_vector_reduce_umin_v4i8(<4 x i8> %v) { ; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l ; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX12-SDAG-TRUE16-NEXT: v_min_u16 v1.l, v1.l, v1.h +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) -; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-TRUE16-NEXT: v_min3_u16 v0.l, v0.l, v0.h, v1.l ; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -624,22 +622,20 @@ define i8 @test_vector_reduce_umin_v8i8(<8 x i8> %v) { ; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v7.l ; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.l ; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX11-SDAG-TRUE16-NEXT: v_min_u16 v1.l, v1.l, v1.h -; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l +; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX11-SDAG-TRUE16-NEXT: v_min_u16 v0.h, v1.l, v1.h +; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l +; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-SDAG-TRUE16-NEXT: v_min_u16 v0.l, v0.l, v0.h -; GFX11-SDAG-TRUE16-NEXT: v_min3_u16 v1.l, v1.l, v3.l, v3.h -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l -; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; GFX11-SDAG-TRUE16-NEXT: v_min3_u16 v0.l, v0.l, v1.h, v1.l +; GFX11-SDAG-TRUE16-NEXT: v_min3_u16 v0.h, v0.h, v3.l, v3.h +; GFX11-SDAG-TRUE16-NEXT: v_min_u16 v0.l, v0.l, v1.l +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v3.l, 8, v0.h +; GFX11-SDAG-TRUE16-NEXT: 
v_min3_u16 v0.l, v0.l, v2.l, v1.h ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v2 +; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v3 ; GFX11-SDAG-TRUE16-NEXT: v_min_u16 v0.l, v0.l, v1.l ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -703,22 +699,20 @@ define i8 @test_vector_reduce_umin_v8i8(<8 x i8> %v) { ; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v1.l ; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v3.h, 0xff, v7.l ; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v3.l, 0xff, v3.l -; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.l ; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l -; GFX12-SDAG-TRUE16-NEXT: v_min_u16 v1.l, v1.l, v1.h -; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l +; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v2.l, 0xff, v2.l +; GFX12-SDAG-TRUE16-NEXT: v_min_u16 v0.h, v1.l, v1.h +; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.l +; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v6.l ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-SDAG-TRUE16-NEXT: v_min_u16 v0.l, v0.l, v0.h -; GFX12-SDAG-TRUE16-NEXT: v_min3_u16 v1.l, v1.l, v3.l, v3.h -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b16 v1.l, 8, v1.l -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.l -; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v6.l -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; GFX12-SDAG-TRUE16-NEXT: v_min3_u16 v0.l, v0.l, v1.h, v1.l +; GFX12-SDAG-TRUE16-NEXT: v_min3_u16 v0.h, v0.h, v3.l, v3.h +; GFX12-SDAG-TRUE16-NEXT: v_min_u16 v0.l, v0.l, v1.l +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b16 
v3.l, 8, v0.h +; GFX12-SDAG-TRUE16-NEXT: v_min3_u16 v0.l, v0.l, v2.l, v1.h ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v2 +; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v3 ; GFX12-SDAG-TRUE16-NEXT: v_min_u16 v0.l, v0.l, v1.l ; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -1047,14 +1041,12 @@ define i8 @test_vector_reduce_umin_v16i8(<16 x i8> %v) { ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-SDAG-TRUE16-NEXT: v_min3_u16 v0.l, v0.l, v1.h, v1.l ; GFX11-SDAG-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v0.h -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l -; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v1.l, v0.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v1.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-TRUE16-NEXT: v_min_u16 v0.l, v0.l, v1.l ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -1176,14 +1168,12 @@ define i8 @test_vector_reduce_umin_v16i8(<16 x i8> %v) { ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX12-SDAG-TRUE16-NEXT: v_min3_u16 v0.l, v0.l, v1.h, v1.l ; GFX12-SDAG-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v0.h -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h 
-; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l -; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l +; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX12-SDAG-TRUE16-NEXT: v_or_b16 v1.l, v0.l, v0.h +; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v1.l +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-TRUE16-NEXT: v_min_u16 v0.l, v0.l, v1.l ; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/ARM/2013-05-05-IfConvertBug.ll b/llvm/test/CodeGen/ARM/2013-05-05-IfConvertBug.ll index 344bb15d2a8b..8f798fac06f5 100644 --- a/llvm/test/CodeGen/ARM/2013-05-05-IfConvertBug.ll +++ b/llvm/test/CodeGen/ARM/2013-05-05-IfConvertBug.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 | FileCheck %s -; RUN: llc < %s -mtriple=thumbv8 | FileCheck -check-prefix=CHECK-V8 %s -; RUN: llc < %s -mtriple=thumbv7 -arm-restrict-it | FileCheck -check-prefix=CHECK-RESTRICT-IT %s +; RUN: llc -keep-loops="false" < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 | FileCheck %s +; RUN: llc -keep-loops="false" < %s -mtriple=thumbv8 | FileCheck -check-prefix=CHECK-V8 %s +; RUN: llc -keep-loops="false" < %s -mtriple=thumbv7 -arm-restrict-it | FileCheck -check-prefix=CHECK-RESTRICT-IT %s define i32 @t1(i32 %a, i32 %b, ptr %retaddr) { ; CHECK-LABEL: t1: diff --git a/llvm/test/CodeGen/LoongArch/linker-relaxation.ll b/llvm/test/CodeGen/LoongArch/linker-relaxation.ll index 2827a9554790..6b197bc57891 100644 --- a/llvm/test/CodeGen/LoongArch/linker-relaxation.ll +++ b/llvm/test/CodeGen/LoongArch/linker-relaxation.ll @@ -1,6 +1,6 @@ ; RUN: llc 
--mtriple=loongarch64 --filetype=obj -mattr=-relax \ ; RUN: --relocation-model=pic --code-model=medium < %s \ -; RUN: | llvm-readobj -r - | FileCheck --check-prefixes=CHECK-RELOC,PCALA-RELOC %s +; RUN: | llvm-readobj -r - | FileCheck --check-prefix=CHECK-RELOC %s ; RUN: llc --mtriple=loongarch64 --filetype=obj -mattr=+relax \ ; RUN: --relocation-model=pic --code-model=medium < %s \ ; RUN: | llvm-readobj -r - | FileCheck --check-prefixes=CHECK-RELOC,RELAX %s @@ -29,16 +29,12 @@ declare void @callee1() nounwind declare dso_local void @callee2() nounwind declare dso_local void @callee3() nounwind -define ptr @caller() nounwind { -; RELAX: R_LARCH_ALIGN - 0x1C ; CHECK-RELOC: R_LARCH_GOT_PC_HI20 g_e 0x0 -; RELAX-NEXT: R_LARCH_RELAX - 0x0 +; RELAX: R_LARCH_RELAX - 0x0 ; CHECK-RELOC-NEXT: R_LARCH_GOT_PC_LO12 g_e 0x0 ; RELAX-NEXT: R_LARCH_RELAX - 0x0 -; PCALA-RELOC: R_LARCH_PCALA_HI20 .bss 0x0 -; RELAX-NEXT: R_LARCH_PCALA_HI20 g_i 0x0 -; PCALA-RELOC: R_LARCH_PCALA_LO12 .bss 0x0 -; RELAX-NEXT: R_LARCH_PCALA_LO12 g_i 0x0 +; CHECK-RELOC-NEXT: R_LARCH_PCALA_HI20 g_i 0x0 +; CHECK-RELOC-NEXT: R_LARCH_PCALA_LO12 g_i 0x0 ; CHECK-RELOC: R_LARCH_TLS_GD_PC_HI20 t_un 0x0 ; RELAX-NEXT: R_LARCH_RELAX - 0x0 ; CHECK-RELOC-NEXT: R_LARCH_GOT_PC_LO12 t_un 0x0 @@ -77,26 +73,33 @@ define ptr @caller() nounwind { ; RELAX-NEXT: R_LARCH_RELAX - 0x0 ; CHECK-RELOC-NEXT: R_LARCH_TLS_LE_LO12_R t_le 0x0 ; RELAX-NEXT: R_LARCH_RELAX - 0x0 +; CHECK-RELOC-NEXT: R_LARCH_PCALA_HI20 g_i1 0x0 +; RELAX-NEXT: R_LARCH_RELAX - 0x0 +; CHECK-RELOC-NEXT: R_LARCH_PCALA_LO12 g_i1 0x0 +; RELAX-NEXT: R_LARCH_RELAX - 0x0 +; RELAX-NEXT: R_LARCH_ALIGN - 0x1C ; CHECK-RELOC-NEXT: R_LARCH_CALL36 callee1 0x0 ; RELAX-NEXT: R_LARCH_RELAX - 0x0 ; CHECK-RELOC-NEXT: R_LARCH_CALL36 callee2 0x0 ; RELAX-NEXT: R_LARCH_RELAX - 0x0 ; CHECK-RELOC-NEXT: R_LARCH_CALL36 callee3 0x0 ; RELAX-NEXT: R_LARCH_RELAX - 0x0 -; PCALA-RELOC: R_LARCH_PCALA_HI20 .data 0x0 -; RELAX-NEXT: R_LARCH_PCALA_HI20 g_i1 0x0 -; RELAX-NEXT: R_LARCH_RELAX - 0x0 -; 
PCALA-RELOC: R_LARCH_PCALA_LO12 .data 0x0 -; RELAX-NEXT: R_LARCH_PCALA_LO12 g_i1 0x0 -; RELAX-NEXT: R_LARCH_RELAX - 0x0 + +;; No ALIGN reloc will emit before the first linker-relaxable instruction. +define ptr @loader() nounwind { %a = load volatile i32, ptr @g_e %b = load volatile i32, ptr @g_i %c = load volatile i32, ptr @t_un %d = load volatile i32, ptr @t_ld %e = load volatile i32, ptr @t_ie %f = load volatile i32, ptr @t_le + ret ptr @g_i1 +} + +;; ALIGN reloc will be emitted here. +define void @caller() nounwind { call i32 @callee1() call i32 @callee2() tail call i32 @callee3() - ret ptr @g_i1 + ret void } diff --git a/llvm/test/CodeGen/LoongArch/xray-attribute-instrumentation.ll b/llvm/test/CodeGen/LoongArch/xray-attribute-instrumentation.ll index 8999c2038700..7838bcea1025 100644 --- a/llvm/test/CodeGen/LoongArch/xray-attribute-instrumentation.ll +++ b/llvm/test/CodeGen/LoongArch/xray-attribute-instrumentation.ll @@ -43,14 +43,14 @@ define i32 @foo() nounwind noinline uwtable "function-instrument"="xray-always" ; CHECK-NEXT: .dword 2 ; RELOC: Section ([[#]]) .relaxray_instr_map { -; RELOC-NEXT: 0x0 R_LARCH_64_PCREL .text 0x0 -; RELOC-NEXT: 0x8 R_LARCH_64_PCREL .text 0x0 -; RELOC-NEXT: 0x20 R_LARCH_64_PCREL .text 0x34 -; RELOC-NEXT: 0x28 R_LARCH_64_PCREL .text 0x0 +; RELOC-NEXT: 0x0 R_LARCH_64_PCREL .L{{.*}} 0x0 +; RELOC-NEXT: 0x8 R_LARCH_64_PCREL .L{{.*}} 0x0 +; RELOC-NEXT: 0x20 R_LARCH_64_PCREL .L{{.*}} 0x0 +; RELOC-NEXT: 0x28 R_LARCH_64_PCREL .L{{.*}} 0x0 ; RELOC-NEXT: } ; RELOC-NEXT: Section ([[#]]) .relaxray_fn_idx { -; RELOC-NEXT: 0x0 R_LARCH_64_PCREL xray_instr_map 0x0 +; RELOC-NEXT: 0x0 R_LARCH_64_PCREL .Lxray_sleds_start0 0x0 ; RELOC-NEXT: } ; RELOC-NEXT: Section ([[#]]) .rela.eh_frame { -; RELOC-NEXT: 0x1C R_LARCH_32_PCREL .text 0x0 +; RELOC-NEXT: 0x1C R_LARCH_32_PCREL .L{{.*}} 0x0 ; RELOC-NEXT: } diff --git a/llvm/test/CodeGen/M68k/Data/load-extend.ll b/llvm/test/CodeGen/M68k/Data/load-extend.ll index 51159730ecc0..687d3f24523d 100644 --- 
a/llvm/test/CodeGen/M68k/Data/load-extend.ll +++ b/llvm/test/CodeGen/M68k/Data/load-extend.ll @@ -41,3 +41,45 @@ define i32 @"test_zext_pcd_i16_to_i32"() { %val2 = zext i16 %val to i32 ret i32 %val2 } + +define i16 @test_anyext_pcd_i8_to_i16() nounwind { +; CHECK-LABEL: test_anyext_pcd_i8_to_i16: +; CHECK: ; %bb.0: +; CHECK-NEXT: move.b (__unnamed_1+4,%pc), %d0 +; CHECK-NEXT: and.l #255, %d0 +; CHECK-NEXT: lsl.w #8, %d0 +; CHECK-NEXT: ; kill: def $wd0 killed $wd0 killed $d0 +; CHECK-NEXT: rts + %copyload = load i8, ptr getelementptr inbounds nuw (i8, ptr @0, i32 4) + %insert_ext = zext i8 %copyload to i16 + %insert_shift = shl i16 %insert_ext, 8 + ret i16 %insert_shift +} + +define i32 @test_anyext_pcd_i8_to_i32() nounwind { +; CHECK-LABEL: test_anyext_pcd_i8_to_i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: moveq #24, %d1 +; CHECK-NEXT: move.b (__unnamed_1+4,%pc), %d0 +; CHECK-NEXT: and.l #255, %d0 +; CHECK-NEXT: lsl.l %d1, %d0 +; CHECK-NEXT: rts + %copyload = load i8, ptr getelementptr inbounds nuw (i8, ptr @0, i32 4) + %insert_ext = zext i8 %copyload to i32 + %insert_shift = shl i32 %insert_ext, 24 + ret i32 %insert_shift +} + +define i32 @test_anyext_pcd_i16_to_i32() nounwind { +; CHECK-LABEL: test_anyext_pcd_i16_to_i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: moveq #16, %d1 +; CHECK-NEXT: move.w (__unnamed_1+4,%pc), %d0 +; CHECK-NEXT: and.l #65535, %d0 +; CHECK-NEXT: lsl.l %d1, %d0 +; CHECK-NEXT: rts + %copyload = load i16, ptr getelementptr inbounds nuw (i8, ptr @0, i32 4) + %insert_ext = zext i16 %copyload to i32 + %insert_shift = shl i32 %insert_ext, 16 + ret i32 %insert_shift +} diff --git a/llvm/test/CodeGen/Mips/fmuladd-soft-float.ll b/llvm/test/CodeGen/Mips/fmuladd-soft-float.ll index bbfb7cf9ca90..409b1a1f818a 100644 --- a/llvm/test/CodeGen/Mips/fmuladd-soft-float.ll +++ b/llvm/test/CodeGen/Mips/fmuladd-soft-float.ll @@ -49,13 +49,11 @@ define float @fmuladd_intrinsic_f32(float %a, float %b, float %c) #0 { ; SOFT-FLOAT-64-NEXT: sd $16, 0($sp) # 8-byte Folded Spill ; 
SOFT-FLOAT-64-NEXT: .cfi_offset 31, -8 ; SOFT-FLOAT-64-NEXT: .cfi_offset 16, -16 -; SOFT-FLOAT-64-NEXT: move $16, $6 -; SOFT-FLOAT-64-NEXT: sll $4, $4, 0 ; SOFT-FLOAT-64-NEXT: jal __mulsf3 -; SOFT-FLOAT-64-NEXT: sll $5, $5, 0 -; SOFT-FLOAT-64-NEXT: sll $4, $2, 0 +; SOFT-FLOAT-64-NEXT: move $16, $6 +; SOFT-FLOAT-64-NEXT: move $4, $2 ; SOFT-FLOAT-64-NEXT: jal __addsf3 -; SOFT-FLOAT-64-NEXT: sll $5, $16, 0 +; SOFT-FLOAT-64-NEXT: move $5, $16 ; SOFT-FLOAT-64-NEXT: ld $16, 0($sp) # 8-byte Folded Reload ; SOFT-FLOAT-64-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload ; SOFT-FLOAT-64-NEXT: jr $ra @@ -69,13 +67,11 @@ define float @fmuladd_intrinsic_f32(float %a, float %b, float %c) #0 { ; SOFT-FLOAT-64R2-NEXT: sd $16, 0($sp) # 8-byte Folded Spill ; SOFT-FLOAT-64R2-NEXT: .cfi_offset 31, -8 ; SOFT-FLOAT-64R2-NEXT: .cfi_offset 16, -16 -; SOFT-FLOAT-64R2-NEXT: move $16, $6 -; SOFT-FLOAT-64R2-NEXT: sll $4, $4, 0 ; SOFT-FLOAT-64R2-NEXT: jal __mulsf3 -; SOFT-FLOAT-64R2-NEXT: sll $5, $5, 0 -; SOFT-FLOAT-64R2-NEXT: sll $4, $2, 0 +; SOFT-FLOAT-64R2-NEXT: move $16, $6 +; SOFT-FLOAT-64R2-NEXT: move $4, $2 ; SOFT-FLOAT-64R2-NEXT: jal __addsf3 -; SOFT-FLOAT-64R2-NEXT: sll $5, $16, 0 +; SOFT-FLOAT-64R2-NEXT: move $5, $16 ; SOFT-FLOAT-64R2-NEXT: ld $16, 0($sp) # 8-byte Folded Reload ; SOFT-FLOAT-64R2-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload ; SOFT-FLOAT-64R2-NEXT: jr $ra @@ -203,13 +199,11 @@ define float @fmuladd_contract_f32(float %a, float %b, float %c) #0 { ; SOFT-FLOAT-64-NEXT: sd $16, 0($sp) # 8-byte Folded Spill ; SOFT-FLOAT-64-NEXT: .cfi_offset 31, -8 ; SOFT-FLOAT-64-NEXT: .cfi_offset 16, -16 -; SOFT-FLOAT-64-NEXT: move $16, $6 -; SOFT-FLOAT-64-NEXT: sll $4, $4, 0 ; SOFT-FLOAT-64-NEXT: jal __mulsf3 -; SOFT-FLOAT-64-NEXT: sll $5, $5, 0 -; SOFT-FLOAT-64-NEXT: sll $4, $2, 0 +; SOFT-FLOAT-64-NEXT: move $16, $6 +; SOFT-FLOAT-64-NEXT: move $4, $2 ; SOFT-FLOAT-64-NEXT: jal __addsf3 -; SOFT-FLOAT-64-NEXT: sll $5, $16, 0 +; SOFT-FLOAT-64-NEXT: move $5, $16 ; SOFT-FLOAT-64-NEXT: ld $16, 
0($sp) # 8-byte Folded Reload ; SOFT-FLOAT-64-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload ; SOFT-FLOAT-64-NEXT: jr $ra @@ -223,13 +217,11 @@ define float @fmuladd_contract_f32(float %a, float %b, float %c) #0 { ; SOFT-FLOAT-64R2-NEXT: sd $16, 0($sp) # 8-byte Folded Spill ; SOFT-FLOAT-64R2-NEXT: .cfi_offset 31, -8 ; SOFT-FLOAT-64R2-NEXT: .cfi_offset 16, -16 -; SOFT-FLOAT-64R2-NEXT: move $16, $6 -; SOFT-FLOAT-64R2-NEXT: sll $4, $4, 0 ; SOFT-FLOAT-64R2-NEXT: jal __mulsf3 -; SOFT-FLOAT-64R2-NEXT: sll $5, $5, 0 -; SOFT-FLOAT-64R2-NEXT: sll $4, $2, 0 +; SOFT-FLOAT-64R2-NEXT: move $16, $6 +; SOFT-FLOAT-64R2-NEXT: move $4, $2 ; SOFT-FLOAT-64R2-NEXT: jal __addsf3 -; SOFT-FLOAT-64R2-NEXT: sll $5, $16, 0 +; SOFT-FLOAT-64R2-NEXT: move $5, $16 ; SOFT-FLOAT-64R2-NEXT: ld $16, 0($sp) # 8-byte Folded Reload ; SOFT-FLOAT-64R2-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload ; SOFT-FLOAT-64R2-NEXT: jr $ra @@ -443,149 +435,169 @@ define <4 x float> @fmuladd_contract_v4f32(<4 x float> %a, <4 x float> %b, <4 x ; ; SOFT-FLOAT-64-LABEL: fmuladd_contract_v4f32: ; SOFT-FLOAT-64: # %bb.0: -; SOFT-FLOAT-64-NEXT: daddiu $sp, $sp, -64 -; SOFT-FLOAT-64-NEXT: .cfi_def_cfa_offset 64 -; SOFT-FLOAT-64-NEXT: sd $ra, 56($sp) # 8-byte Folded Spill -; SOFT-FLOAT-64-NEXT: sd $22, 48($sp) # 8-byte Folded Spill -; SOFT-FLOAT-64-NEXT: sd $21, 40($sp) # 8-byte Folded Spill -; SOFT-FLOAT-64-NEXT: sd $20, 32($sp) # 8-byte Folded Spill -; SOFT-FLOAT-64-NEXT: sd $19, 24($sp) # 8-byte Folded Spill -; SOFT-FLOAT-64-NEXT: sd $18, 16($sp) # 8-byte Folded Spill -; SOFT-FLOAT-64-NEXT: sd $17, 8($sp) # 8-byte Folded Spill -; SOFT-FLOAT-64-NEXT: sd $16, 0($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64-NEXT: daddiu $sp, $sp, -80 +; SOFT-FLOAT-64-NEXT: .cfi_def_cfa_offset 80 +; SOFT-FLOAT-64-NEXT: sd $ra, 72($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64-NEXT: sd $23, 64($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64-NEXT: sd $22, 56($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64-NEXT: sd $21, 48($sp) # 8-byte Folded Spill +; 
SOFT-FLOAT-64-NEXT: sd $20, 40($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64-NEXT: sd $19, 32($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64-NEXT: sd $18, 24($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64-NEXT: sd $17, 16($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64-NEXT: sd $16, 8($sp) # 8-byte Folded Spill ; SOFT-FLOAT-64-NEXT: .cfi_offset 31, -8 -; SOFT-FLOAT-64-NEXT: .cfi_offset 22, -16 -; SOFT-FLOAT-64-NEXT: .cfi_offset 21, -24 -; SOFT-FLOAT-64-NEXT: .cfi_offset 20, -32 -; SOFT-FLOAT-64-NEXT: .cfi_offset 19, -40 -; SOFT-FLOAT-64-NEXT: .cfi_offset 18, -48 -; SOFT-FLOAT-64-NEXT: .cfi_offset 17, -56 -; SOFT-FLOAT-64-NEXT: .cfi_offset 16, -64 +; SOFT-FLOAT-64-NEXT: .cfi_offset 23, -16 +; SOFT-FLOAT-64-NEXT: .cfi_offset 22, -24 +; SOFT-FLOAT-64-NEXT: .cfi_offset 21, -32 +; SOFT-FLOAT-64-NEXT: .cfi_offset 20, -40 +; SOFT-FLOAT-64-NEXT: .cfi_offset 19, -48 +; SOFT-FLOAT-64-NEXT: .cfi_offset 18, -56 +; SOFT-FLOAT-64-NEXT: .cfi_offset 17, -64 +; SOFT-FLOAT-64-NEXT: .cfi_offset 16, -72 ; SOFT-FLOAT-64-NEXT: move $16, $9 -; SOFT-FLOAT-64-NEXT: move $17, $8 -; SOFT-FLOAT-64-NEXT: move $18, $7 -; SOFT-FLOAT-64-NEXT: move $19, $6 -; SOFT-FLOAT-64-NEXT: move $20, $5 +; SOFT-FLOAT-64-NEXT: move $19, $8 +; SOFT-FLOAT-64-NEXT: move $17, $7 +; SOFT-FLOAT-64-NEXT: move $20, $6 +; SOFT-FLOAT-64-NEXT: move $18, $5 ; SOFT-FLOAT-64-NEXT: move $21, $4 -; SOFT-FLOAT-64-NEXT: sll $4, $4, 0 +; SOFT-FLOAT-64-NEXT: sll $4, $21, 0 ; SOFT-FLOAT-64-NEXT: jal __mulsf3 -; SOFT-FLOAT-64-NEXT: sll $5, $6, 0 +; SOFT-FLOAT-64-NEXT: sll $5, $20, 0 +; SOFT-FLOAT-64-NEXT: move $4, $2 +; SOFT-FLOAT-64-NEXT: jal __addsf3 +; SOFT-FLOAT-64-NEXT: sll $5, $19, 0 ; SOFT-FLOAT-64-NEXT: move $22, $2 -; SOFT-FLOAT-64-NEXT: dsra $4, $21, 32 +; SOFT-FLOAT-64-NEXT: sll $4, $18, 0 ; SOFT-FLOAT-64-NEXT: jal __mulsf3 -; SOFT-FLOAT-64-NEXT: dsra $5, $19, 32 -; SOFT-FLOAT-64-NEXT: sll $4, $2, 0 -; SOFT-FLOAT-64-NEXT: jal __addsf3 -; SOFT-FLOAT-64-NEXT: dsra $5, $17, 32 -; SOFT-FLOAT-64-NEXT: # kill: def $v0 killed $v0 def 
$v0_64 -; SOFT-FLOAT-64-NEXT: sll $4, $22, 0 ; SOFT-FLOAT-64-NEXT: sll $5, $17, 0 -; SOFT-FLOAT-64-NEXT: jal __addsf3 -; SOFT-FLOAT-64-NEXT: dsll $17, $2, 32 -; SOFT-FLOAT-64-NEXT: dsll $1, $2, 32 -; SOFT-FLOAT-64-NEXT: dsrl $1, $1, 32 -; SOFT-FLOAT-64-NEXT: sll $4, $20, 0 -; SOFT-FLOAT-64-NEXT: sll $5, $18, 0 +; SOFT-FLOAT-64-NEXT: move $23, $2 +; SOFT-FLOAT-64-NEXT: dsrl $1, $21, 32 +; SOFT-FLOAT-64-NEXT: sll $4, $1, 0 +; SOFT-FLOAT-64-NEXT: dsrl $1, $20, 32 ; SOFT-FLOAT-64-NEXT: jal __mulsf3 -; SOFT-FLOAT-64-NEXT: or $17, $1, $17 -; SOFT-FLOAT-64-NEXT: move $19, $2 -; SOFT-FLOAT-64-NEXT: dsra $4, $20, 32 -; SOFT-FLOAT-64-NEXT: jal __mulsf3 -; SOFT-FLOAT-64-NEXT: dsra $5, $18, 32 -; SOFT-FLOAT-64-NEXT: sll $4, $2, 0 +; SOFT-FLOAT-64-NEXT: sll $5, $1, 0 +; SOFT-FLOAT-64-NEXT: move $4, $2 +; SOFT-FLOAT-64-NEXT: dsll $1, $22, 32 +; SOFT-FLOAT-64-NEXT: dsrl $2, $19, 32 +; SOFT-FLOAT-64-NEXT: sll $5, $2, 0 ; SOFT-FLOAT-64-NEXT: jal __addsf3 -; SOFT-FLOAT-64-NEXT: dsra $5, $16, 32 +; SOFT-FLOAT-64-NEXT: dsrl $19, $1, 32 ; SOFT-FLOAT-64-NEXT: # kill: def $v0 killed $v0 def $v0_64 -; SOFT-FLOAT-64-NEXT: dsll $18, $2, 32 -; SOFT-FLOAT-64-NEXT: sll $4, $19, 0 -; SOFT-FLOAT-64-NEXT: jal __addsf3 -; SOFT-FLOAT-64-NEXT: sll $5, $16, 0 ; SOFT-FLOAT-64-NEXT: dsll $1, $2, 32 -; SOFT-FLOAT-64-NEXT: dsrl $1, $1, 32 -; SOFT-FLOAT-64-NEXT: or $3, $1, $18 -; SOFT-FLOAT-64-NEXT: move $2, $17 -; SOFT-FLOAT-64-NEXT: ld $16, 0($sp) # 8-byte Folded Reload -; SOFT-FLOAT-64-NEXT: ld $17, 8($sp) # 8-byte Folded Reload -; SOFT-FLOAT-64-NEXT: ld $18, 16($sp) # 8-byte Folded Reload -; SOFT-FLOAT-64-NEXT: ld $19, 24($sp) # 8-byte Folded Reload -; SOFT-FLOAT-64-NEXT: ld $20, 32($sp) # 8-byte Folded Reload -; SOFT-FLOAT-64-NEXT: ld $21, 40($sp) # 8-byte Folded Reload -; SOFT-FLOAT-64-NEXT: ld $22, 48($sp) # 8-byte Folded Reload -; SOFT-FLOAT-64-NEXT: ld $ra, 56($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64-NEXT: sll $5, $16, 0 +; SOFT-FLOAT-64-NEXT: or $19, $19, $1 +; SOFT-FLOAT-64-NEXT: jal 
__addsf3 +; SOFT-FLOAT-64-NEXT: move $4, $23 +; SOFT-FLOAT-64-NEXT: move $20, $2 +; SOFT-FLOAT-64-NEXT: dsrl $1, $18, 32 +; SOFT-FLOAT-64-NEXT: sll $4, $1, 0 +; SOFT-FLOAT-64-NEXT: dsrl $1, $17, 32 +; SOFT-FLOAT-64-NEXT: jal __mulsf3 +; SOFT-FLOAT-64-NEXT: sll $5, $1, 0 +; SOFT-FLOAT-64-NEXT: move $4, $2 +; SOFT-FLOAT-64-NEXT: dsll $1, $20, 32 +; SOFT-FLOAT-64-NEXT: dsrl $17, $1, 32 +; SOFT-FLOAT-64-NEXT: dsrl $1, $16, 32 +; SOFT-FLOAT-64-NEXT: jal __addsf3 +; SOFT-FLOAT-64-NEXT: sll $5, $1, 0 +; SOFT-FLOAT-64-NEXT: # kill: def $v0 killed $v0 def $v0_64 +; SOFT-FLOAT-64-NEXT: dsll $1, $2, 32 +; SOFT-FLOAT-64-NEXT: or $3, $17, $1 +; SOFT-FLOAT-64-NEXT: move $2, $19 +; SOFT-FLOAT-64-NEXT: ld $16, 8($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64-NEXT: ld $17, 16($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64-NEXT: ld $18, 24($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64-NEXT: ld $19, 32($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64-NEXT: ld $20, 40($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64-NEXT: ld $21, 48($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64-NEXT: ld $22, 56($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64-NEXT: ld $23, 64($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64-NEXT: ld $ra, 72($sp) # 8-byte Folded Reload ; SOFT-FLOAT-64-NEXT: jr $ra -; SOFT-FLOAT-64-NEXT: daddiu $sp, $sp, 64 +; SOFT-FLOAT-64-NEXT: daddiu $sp, $sp, 80 ; ; SOFT-FLOAT-64R2-LABEL: fmuladd_contract_v4f32: ; SOFT-FLOAT-64R2: # %bb.0: -; SOFT-FLOAT-64R2-NEXT: daddiu $sp, $sp, -64 -; SOFT-FLOAT-64R2-NEXT: .cfi_def_cfa_offset 64 -; SOFT-FLOAT-64R2-NEXT: sd $ra, 56($sp) # 8-byte Folded Spill -; SOFT-FLOAT-64R2-NEXT: sd $22, 48($sp) # 8-byte Folded Spill -; SOFT-FLOAT-64R2-NEXT: sd $21, 40($sp) # 8-byte Folded Spill -; SOFT-FLOAT-64R2-NEXT: sd $20, 32($sp) # 8-byte Folded Spill -; SOFT-FLOAT-64R2-NEXT: sd $19, 24($sp) # 8-byte Folded Spill -; SOFT-FLOAT-64R2-NEXT: sd $18, 16($sp) # 8-byte Folded Spill -; SOFT-FLOAT-64R2-NEXT: sd $17, 8($sp) # 8-byte Folded Spill -; SOFT-FLOAT-64R2-NEXT: sd $16, 0($sp) # 
8-byte Folded Spill +; SOFT-FLOAT-64R2-NEXT: daddiu $sp, $sp, -80 +; SOFT-FLOAT-64R2-NEXT: .cfi_def_cfa_offset 80 +; SOFT-FLOAT-64R2-NEXT: sd $ra, 72($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64R2-NEXT: sd $23, 64($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64R2-NEXT: sd $22, 56($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64R2-NEXT: sd $21, 48($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64R2-NEXT: sd $20, 40($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64R2-NEXT: sd $19, 32($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64R2-NEXT: sd $18, 24($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64R2-NEXT: sd $17, 16($sp) # 8-byte Folded Spill +; SOFT-FLOAT-64R2-NEXT: sd $16, 8($sp) # 8-byte Folded Spill ; SOFT-FLOAT-64R2-NEXT: .cfi_offset 31, -8 -; SOFT-FLOAT-64R2-NEXT: .cfi_offset 22, -16 -; SOFT-FLOAT-64R2-NEXT: .cfi_offset 21, -24 -; SOFT-FLOAT-64R2-NEXT: .cfi_offset 20, -32 -; SOFT-FLOAT-64R2-NEXT: .cfi_offset 19, -40 -; SOFT-FLOAT-64R2-NEXT: .cfi_offset 18, -48 -; SOFT-FLOAT-64R2-NEXT: .cfi_offset 17, -56 -; SOFT-FLOAT-64R2-NEXT: .cfi_offset 16, -64 +; SOFT-FLOAT-64R2-NEXT: .cfi_offset 23, -16 +; SOFT-FLOAT-64R2-NEXT: .cfi_offset 22, -24 +; SOFT-FLOAT-64R2-NEXT: .cfi_offset 21, -32 +; SOFT-FLOAT-64R2-NEXT: .cfi_offset 20, -40 +; SOFT-FLOAT-64R2-NEXT: .cfi_offset 19, -48 +; SOFT-FLOAT-64R2-NEXT: .cfi_offset 18, -56 +; SOFT-FLOAT-64R2-NEXT: .cfi_offset 17, -64 +; SOFT-FLOAT-64R2-NEXT: .cfi_offset 16, -72 ; SOFT-FLOAT-64R2-NEXT: move $16, $9 -; SOFT-FLOAT-64R2-NEXT: move $17, $8 -; SOFT-FLOAT-64R2-NEXT: move $18, $7 -; SOFT-FLOAT-64R2-NEXT: move $19, $6 -; SOFT-FLOAT-64R2-NEXT: move $20, $5 +; SOFT-FLOAT-64R2-NEXT: move $19, $8 +; SOFT-FLOAT-64R2-NEXT: move $17, $7 +; SOFT-FLOAT-64R2-NEXT: move $20, $6 +; SOFT-FLOAT-64R2-NEXT: move $18, $5 ; SOFT-FLOAT-64R2-NEXT: move $21, $4 -; SOFT-FLOAT-64R2-NEXT: dsra $4, $4, 32 -; SOFT-FLOAT-64R2-NEXT: jal __mulsf3 -; SOFT-FLOAT-64R2-NEXT: dsra $5, $6, 32 -; SOFT-FLOAT-64R2-NEXT: move $22, $2 ; SOFT-FLOAT-64R2-NEXT: sll $4, $21, 0 ; SOFT-FLOAT-64R2-NEXT: 
jal __mulsf3 +; SOFT-FLOAT-64R2-NEXT: sll $5, $20, 0 +; SOFT-FLOAT-64R2-NEXT: move $4, $2 +; SOFT-FLOAT-64R2-NEXT: jal __addsf3 ; SOFT-FLOAT-64R2-NEXT: sll $5, $19, 0 -; SOFT-FLOAT-64R2-NEXT: sll $4, $2, 0 -; SOFT-FLOAT-64R2-NEXT: jal __addsf3 +; SOFT-FLOAT-64R2-NEXT: move $22, $2 +; SOFT-FLOAT-64R2-NEXT: sll $4, $18, 0 +; SOFT-FLOAT-64R2-NEXT: jal __mulsf3 ; SOFT-FLOAT-64R2-NEXT: sll $5, $17, 0 -; SOFT-FLOAT-64R2-NEXT: sll $4, $22, 0 -; SOFT-FLOAT-64R2-NEXT: dsra $5, $17, 32 +; SOFT-FLOAT-64R2-NEXT: move $23, $2 +; SOFT-FLOAT-64R2-NEXT: dsrl $1, $21, 32 +; SOFT-FLOAT-64R2-NEXT: sll $4, $1, 0 +; SOFT-FLOAT-64R2-NEXT: dsrl $1, $20, 32 +; SOFT-FLOAT-64R2-NEXT: jal __mulsf3 +; SOFT-FLOAT-64R2-NEXT: sll $5, $1, 0 +; SOFT-FLOAT-64R2-NEXT: move $4, $2 +; SOFT-FLOAT-64R2-NEXT: dsrl $1, $19, 32 +; SOFT-FLOAT-64R2-NEXT: sll $5, $1, 0 ; SOFT-FLOAT-64R2-NEXT: jal __addsf3 -; SOFT-FLOAT-64R2-NEXT: dext $17, $2, 0, 32 +; SOFT-FLOAT-64R2-NEXT: dext $19, $22, 0, 32 ; SOFT-FLOAT-64R2-NEXT: # kill: def $v0 killed $v0 def $v0_64 ; SOFT-FLOAT-64R2-NEXT: dsll $1, $2, 32 -; SOFT-FLOAT-64R2-NEXT: dsra $4, $20, 32 -; SOFT-FLOAT-64R2-NEXT: dsra $5, $18, 32 -; SOFT-FLOAT-64R2-NEXT: jal __mulsf3 -; SOFT-FLOAT-64R2-NEXT: or $17, $17, $1 -; SOFT-FLOAT-64R2-NEXT: move $19, $2 -; SOFT-FLOAT-64R2-NEXT: sll $4, $20, 0 -; SOFT-FLOAT-64R2-NEXT: jal __mulsf3 -; SOFT-FLOAT-64R2-NEXT: sll $5, $18, 0 -; SOFT-FLOAT-64R2-NEXT: sll $4, $2, 0 -; SOFT-FLOAT-64R2-NEXT: jal __addsf3 ; SOFT-FLOAT-64R2-NEXT: sll $5, $16, 0 -; SOFT-FLOAT-64R2-NEXT: dext $18, $2, 0, 32 -; SOFT-FLOAT-64R2-NEXT: sll $4, $19, 0 +; SOFT-FLOAT-64R2-NEXT: or $19, $19, $1 ; SOFT-FLOAT-64R2-NEXT: jal __addsf3 -; SOFT-FLOAT-64R2-NEXT: dsra $5, $16, 32 +; SOFT-FLOAT-64R2-NEXT: move $4, $23 +; SOFT-FLOAT-64R2-NEXT: move $20, $2 +; SOFT-FLOAT-64R2-NEXT: dsrl $1, $18, 32 +; SOFT-FLOAT-64R2-NEXT: sll $4, $1, 0 +; SOFT-FLOAT-64R2-NEXT: dsrl $1, $17, 32 +; SOFT-FLOAT-64R2-NEXT: jal __mulsf3 +; SOFT-FLOAT-64R2-NEXT: sll $5, $1, 0 +; 
SOFT-FLOAT-64R2-NEXT: move $4, $2 +; SOFT-FLOAT-64R2-NEXT: dext $17, $20, 0, 32 +; SOFT-FLOAT-64R2-NEXT: dsrl $1, $16, 32 +; SOFT-FLOAT-64R2-NEXT: jal __addsf3 +; SOFT-FLOAT-64R2-NEXT: sll $5, $1, 0 ; SOFT-FLOAT-64R2-NEXT: # kill: def $v0 killed $v0 def $v0_64 ; SOFT-FLOAT-64R2-NEXT: dsll $1, $2, 32 -; SOFT-FLOAT-64R2-NEXT: or $3, $18, $1 -; SOFT-FLOAT-64R2-NEXT: move $2, $17 -; SOFT-FLOAT-64R2-NEXT: ld $16, 0($sp) # 8-byte Folded Reload -; SOFT-FLOAT-64R2-NEXT: ld $17, 8($sp) # 8-byte Folded Reload -; SOFT-FLOAT-64R2-NEXT: ld $18, 16($sp) # 8-byte Folded Reload -; SOFT-FLOAT-64R2-NEXT: ld $19, 24($sp) # 8-byte Folded Reload -; SOFT-FLOAT-64R2-NEXT: ld $20, 32($sp) # 8-byte Folded Reload -; SOFT-FLOAT-64R2-NEXT: ld $21, 40($sp) # 8-byte Folded Reload -; SOFT-FLOAT-64R2-NEXT: ld $22, 48($sp) # 8-byte Folded Reload -; SOFT-FLOAT-64R2-NEXT: ld $ra, 56($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64R2-NEXT: or $3, $17, $1 +; SOFT-FLOAT-64R2-NEXT: move $2, $19 +; SOFT-FLOAT-64R2-NEXT: ld $16, 8($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64R2-NEXT: ld $17, 16($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64R2-NEXT: ld $18, 24($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64R2-NEXT: ld $19, 32($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64R2-NEXT: ld $20, 40($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64R2-NEXT: ld $21, 48($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64R2-NEXT: ld $22, 56($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64R2-NEXT: ld $23, 64($sp) # 8-byte Folded Reload +; SOFT-FLOAT-64R2-NEXT: ld $ra, 72($sp) # 8-byte Folded Reload ; SOFT-FLOAT-64R2-NEXT: jr $ra -; SOFT-FLOAT-64R2-NEXT: daddiu $sp, $sp, 64 +; SOFT-FLOAT-64R2-NEXT: daddiu $sp, $sp, 80 %product = fmul contract <4 x float> %a, %b %result = fadd contract <4 x float> %product, %c ret <4 x float> %result diff --git a/llvm/test/CodeGen/RISCV/attributes.ll b/llvm/test/CodeGen/RISCV/attributes.ll index a28b818b3db1..940e018ebdc9 100644 --- a/llvm/test/CodeGen/RISCV/attributes.ll +++ b/llvm/test/CodeGen/RISCV/attributes.ll 
@@ -173,8 +173,8 @@ ; RUN: llc -mtriple=riscv32 -mattr=+smmpm %s -o - | FileCheck --check-prefix=RV32SMMPM %s ; RUN: llc -mtriple=riscv32 -mattr=+sspm %s -o - | FileCheck --check-prefix=RV32SSPM %s ; RUN: llc -mtriple=riscv32 -mattr=+supm %s -o - | FileCheck --check-prefix=RV32SUPM %s -; RUN: llc -mtriple=riscv32 -mattr=+experimental-smctr %s -o - | FileCheck --check-prefix=RV32SMCTR %s -; RUN: llc -mtriple=riscv32 -mattr=+experimental-ssctr %s -o - | FileCheck --check-prefix=RV32SSCTR %s +; RUN: llc -mtriple=riscv32 -mattr=+smctr %s -o - | FileCheck --check-prefix=RV32SMCTR %s +; RUN: llc -mtriple=riscv32 -mattr=+ssctr %s -o - | FileCheck --check-prefix=RV32SSCTR %s ; RUN: llc -mtriple=riscv64 %s -o - | FileCheck %s ; RUN: llc -mtriple=riscv64 -mattr=+m %s -o - | FileCheck --check-prefixes=CHECK,RV64M %s @@ -336,8 +336,8 @@ ; RUN: llc -mtriple=riscv64 -mattr=+smmpm %s -o - | FileCheck --check-prefix=RV64SMMPM %s ; RUN: llc -mtriple=riscv64 -mattr=+sspm %s -o - | FileCheck --check-prefix=RV64SSPM %s ; RUN: llc -mtriple=riscv64 -mattr=+supm %s -o - | FileCheck --check-prefix=RV64SUPM %s -; RUN: llc -mtriple=riscv64 -mattr=+experimental-smctr %s -o - | FileCheck --check-prefix=RV64SMCTR %s -; RUN: llc -mtriple=riscv64 -mattr=+experimental-ssctr %s -o - | FileCheck --check-prefix=RV64SSCTR %s +; RUN: llc -mtriple=riscv64 -mattr=+smctr %s -o - | FileCheck --check-prefix=RV64SMCTR %s +; RUN: llc -mtriple=riscv64 -mattr=+ssctr %s -o - | FileCheck --check-prefix=RV64SSCTR %s ; RUN: llc -mtriple=riscv64 -mattr=+sdext %s -o - | FileCheck --check-prefix=RV64SDEXT %s ; RUN: llc -mtriple=riscv64 -mattr=+sdtrig %s -o - | FileCheck --check-prefix=RV64SDTRIG %s ; RUN: llc -mtriple=riscv64 -mattr=+experimental-xqccmp %s -o - | FileCheck --check-prefix=RV64XQCCMP %s diff --git a/llvm/test/CodeGen/RISCV/features-info.ll b/llvm/test/CodeGen/RISCV/features-info.ll index fb539211fcc3..f966f800589b 100644 --- a/llvm/test/CodeGen/RISCV/features-info.ll +++ 
b/llvm/test/CodeGen/RISCV/features-info.ll @@ -27,8 +27,6 @@ ; CHECK-NEXT: experimental - Experimental intrinsics. ; CHECK-NEXT: experimental-p - 'P' ('Base P' (Packed SIMD)). ; CHECK-NEXT: experimental-rvm23u32 - RISC-V experimental-rvm23u32 profile. -; CHECK-NEXT: experimental-smctr - 'Smctr' (Control Transfer Records Machine Level). -; CHECK-NEXT: experimental-ssctr - 'Ssctr' (Control Transfer Records Supervisor Level). ; CHECK-NEXT: experimental-svukte - 'Svukte' (Address-Independent Latency of User-Mode Faults to Supervisor Addresses). ; CHECK-NEXT: experimental-xqccmp - 'Xqccmp' (Qualcomm 16-bit Push/Pop and Double Moves). ; CHECK-NEXT: experimental-xqcia - 'Xqcia' (Qualcomm uC Arithmetic Extension). @@ -145,6 +143,7 @@ ; CHECK-NEXT: smcdeleg - 'Smcdeleg' (Counter Delegation Machine Level). ; CHECK-NEXT: smcntrpmf - 'Smcntrpmf' (Cycle and Instret Privilege Mode Filtering). ; CHECK-NEXT: smcsrind - 'Smcsrind' (Indirect CSR Access Machine Level). +; CHECK-NEXT: smctr - 'Smctr' (Control Transfer Records Machine Level). ; CHECK-NEXT: smdbltrp - 'Smdbltrp' (Double Trap Machine Level). ; CHECK-NEXT: smepmp - 'Smepmp' (Enhanced Physical Memory Protection). ; CHECK-NEXT: smmpm - 'Smmpm' (Machine-level Pointer Masking for M-mode). @@ -157,6 +156,7 @@ ; CHECK-NEXT: sscofpmf - 'Sscofpmf' (Count Overflow and Mode-Based Filtering). ; CHECK-NEXT: sscounterenw - 'Sscounterenw' (Support writeable scounteren enable bit for any hpmcounter that is not read-only zero). ; CHECK-NEXT: sscsrind - 'Sscsrind' (Indirect CSR Access Supervisor Level). +; CHECK-NEXT: ssctr - 'Ssctr' (Control Transfer Records Supervisor Level). ; CHECK-NEXT: ssdbltrp - 'Ssdbltrp' (Double Trap Supervisor Level). ; CHECK-NEXT: ssnpm - 'Ssnpm' (Supervisor-level Pointer Masking for next lower privilege mode). ; CHECK-NEXT: sspm - 'Sspm' (Indicates Supervisor-mode Pointer Masking). 
@@ -217,6 +217,7 @@ ; CHECK-NEXT: xsfvqmaccqoq - 'XSfvqmaccqoq' (SiFive Int8 Matrix Multiplication Instructions (4-by-8 and 8-by-4)). ; CHECK-NEXT: xsifivecdiscarddlone - 'XSiFivecdiscarddlone' (SiFive sf.cdiscard.d.l1 Instruction). ; CHECK-NEXT: xsifivecflushdlone - 'XSiFivecflushdlone' (SiFive sf.cflush.d.l1 Instruction). +; CHECK-NEXT: xsmtvdot - 'XSMTVDot' (SpacemiT Vector Dot Product Extension). ; CHECK-NEXT: xtheadba - 'XTHeadBa' (T-Head address calculation instructions). ; CHECK-NEXT: xtheadbb - 'XTHeadBb' (T-Head basic bit-manipulation instructions). ; CHECK-NEXT: xtheadbs - 'XTHeadBs' (T-Head single-bit instructions). diff --git a/llvm/test/CodeGen/RISCV/inline-asm-fixed-v-constraint.ll b/llvm/test/CodeGen/RISCV/inline-asm-fixed-v-constraint.ll new file mode 100644 index 000000000000..2c698adc201f --- /dev/null +++ b/llvm/test/CodeGen/RISCV/inline-asm-fixed-v-constraint.ll @@ -0,0 +1,68 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefix=RV32I %s +; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefix=RV64I %s + +define <1 x i8> @constraint_vr_fixed(<1 x i8> %0, <1 x i8> %1) nounwind { +; RV32I-LABEL: constraint_vr_fixed: +; RV32I: # %bb.0: +; RV32I-NEXT: #APP +; RV32I-NEXT: vadd.vv v8, v8, v9 +; RV32I-NEXT: #NO_APP +; RV32I-NEXT: ret +; +; RV64I-LABEL: constraint_vr_fixed: +; RV64I: # %bb.0: +; RV64I-NEXT: #APP +; RV64I-NEXT: vadd.vv v8, v8, v9 +; RV64I-NEXT: #NO_APP +; RV64I-NEXT: ret + %a = tail call <1 x i8> asm "vadd.vv $0, $1, $2", "=^vr,^vr,^vr"( + <1 x i8> %0, <1 x i8> %1) + ret <1 x i8> %a +} + +define <4 x i32> @constraint_vd_fixed(<4 x i32> %0, <4 x i32> %1) nounwind { +; RV32I-LABEL: constraint_vd_fixed: +; RV32I: # %bb.0: +; RV32I-NEXT: #APP +; RV32I-NEXT: vadd.vv v8, v8, v9 +; RV32I-NEXT: #NO_APP +; RV32I-NEXT: ret +; +; RV64I-LABEL: constraint_vd_fixed: +; RV64I: # 
%bb.0: +; RV64I-NEXT: #APP +; RV64I-NEXT: vadd.vv v8, v8, v9 +; RV64I-NEXT: #NO_APP +; RV64I-NEXT: ret + %a = tail call <4 x i32> asm "vadd.vv $0, $1, $2", "=^vd,^vr,^vr"( + <4 x i32> %0, <4 x i32> %1) + ret <4 x i32> %a +} + +define <16 x i1> @constraint_vm_fixed(<16 x i1> %0, <16 x i1> %1) nounwind { +; RV32I-LABEL: constraint_vm_fixed: +; RV32I: # %bb.0: +; RV32I-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV32I-NEXT: vmv1r.v v9, v0 +; RV32I-NEXT: vmv1r.v v0, v8 +; RV32I-NEXT: #APP +; RV32I-NEXT: vadd.vv v0, v9, v0 +; RV32I-NEXT: #NO_APP +; RV32I-NEXT: ret +; +; RV64I-LABEL: constraint_vm_fixed: +; RV64I: # %bb.0: +; RV64I-NEXT: vsetivli zero, 1, e8, m1, ta, ma +; RV64I-NEXT: vmv1r.v v9, v0 +; RV64I-NEXT: vmv1r.v v0, v8 +; RV64I-NEXT: #APP +; RV64I-NEXT: vadd.vv v0, v9, v0 +; RV64I-NEXT: #NO_APP +; RV64I-NEXT: ret + %a = tail call <16 x i1> asm "vadd.vv $0, $1, $2", "=^vr,^vr,^vm"( + <16 x i1> %0, <16 x i1> %1) + ret <16 x i1> %a +} diff --git a/llvm/test/CodeGen/RISCV/rv32xandesperf.ll b/llvm/test/CodeGen/RISCV/rv32xandesperf.ll index 5cabb8c53e26..6f1d168358e2 100644 --- a/llvm/test/CodeGen/RISCV/rv32xandesperf.ll +++ b/llvm/test/CodeGen/RISCV/rv32xandesperf.ll @@ -364,6 +364,19 @@ define i32 @sexti1_i32_2(i1 %a) { ret i32 %1 } +; Make sure we don't use not+nds.bfos +define zeroext i8 @sexti1_i32_setcc(i32 signext %a) { +; CHECK-LABEL: sexti1_i32_setcc: +; CHECK: # %bb.0: +; CHECK-NEXT: srli a0, a0, 31 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: zext.b a0, a0 +; CHECK-NEXT: ret + %icmp = icmp sgt i32 %a, -1 + %sext = sext i1 %icmp to i8 + ret i8 %sext +} + define i32 @sexti8_i32(i32 %a) { ; CHECK-LABEL: sexti8_i32: ; CHECK: # %bb.0: diff --git a/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll b/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll index 723437a610ff..784f08ca616c 100644 --- a/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll +++ b/llvm/test/CodeGen/RISCV/rv32xtheadbb.ll @@ -314,6 +314,26 @@ define i32 @sexti1_i32_2(i1 %a) nounwind { ret i32 %sext } +; Make sure we don't use 
not+th.ext +define zeroext i8 @sexti1_i32_setcc(i32 signext %a) { +; RV32I-LABEL: sexti1_i32_setcc: +; RV32I: # %bb.0: +; RV32I-NEXT: srli a0, a0, 31 +; RV32I-NEXT: addi a0, a0, -1 +; RV32I-NEXT: zext.b a0, a0 +; RV32I-NEXT: ret +; +; RV32XTHEADBB-LABEL: sexti1_i32_setcc: +; RV32XTHEADBB: # %bb.0: +; RV32XTHEADBB-NEXT: srli a0, a0, 31 +; RV32XTHEADBB-NEXT: addi a0, a0, -1 +; RV32XTHEADBB-NEXT: zext.b a0, a0 +; RV32XTHEADBB-NEXT: ret + %icmp = icmp sgt i32 %a, -1 + %sext = sext i1 %icmp to i8 + ret i8 %sext +} + define i32 @sextb_i32(i32 %a) nounwind { ; RV32I-LABEL: sextb_i32: ; RV32I: # %bb.0: diff --git a/llvm/test/CodeGen/RISCV/rv64xandesperf.ll b/llvm/test/CodeGen/RISCV/rv64xandesperf.ll index 98cda4266516..406e5247ae0d 100644 --- a/llvm/test/CodeGen/RISCV/rv64xandesperf.ll +++ b/llvm/test/CodeGen/RISCV/rv64xandesperf.ll @@ -277,6 +277,19 @@ define signext i32 @sexti1_i32_2(i1 %a) { ret i32 %1 } +; Make sure we don't use not+nds.bfos +define zeroext i8 @sexti1_i32_setcc(i32 signext %a) { +; CHECK-LABEL: sexti1_i32_setcc: +; CHECK: # %bb.0: +; CHECK-NEXT: srli a0, a0, 63 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: zext.b a0, a0 +; CHECK-NEXT: ret + %icmp = icmp sgt i32 %a, -1 + %sext = sext i1 %icmp to i8 + ret i8 %sext +} + define signext i32 @sexti8_i32(i32 signext %a) { ; CHECK-LABEL: sexti8_i32: ; CHECK: # %bb.0: @@ -334,6 +347,19 @@ define i64 @sexti1_i64_2(i1 %a) { ret i64 %1 } +; Make sure we don't use not+nds.bfos +define zeroext i8 @sexti1_i64_setcc(i64 %a) { +; CHECK-LABEL: sexti1_i64_setcc: +; CHECK: # %bb.0: +; CHECK-NEXT: srli a0, a0, 63 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: zext.b a0, a0 +; CHECK-NEXT: ret + %icmp = icmp sgt i64 %a, -1 + %sext = sext i1 %icmp to i8 + ret i8 %sext +} + define i64 @sexti8_i64(i64 %a) { ; CHECK-LABEL: sexti8_i64: ; CHECK: # %bb.0: diff --git a/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll b/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll index 81acb4f72413..c7902342f7f0 100644 --- a/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll 
+++ b/llvm/test/CodeGen/RISCV/rv64xtheadbb.ll @@ -635,6 +635,26 @@ define signext i32 @sexti1_i32_2(i1 %a) nounwind { ret i32 %sext } +; Make sure we don't use not+th.ext +define zeroext i8 @sexti1_i32_setcc(i32 signext %a) { +; RV64I-LABEL: sexti1_i32_setcc: +; RV64I: # %bb.0: +; RV64I-NEXT: srli a0, a0, 63 +; RV64I-NEXT: addi a0, a0, -1 +; RV64I-NEXT: zext.b a0, a0 +; RV64I-NEXT: ret +; +; RV64XTHEADBB-LABEL: sexti1_i32_setcc: +; RV64XTHEADBB: # %bb.0: +; RV64XTHEADBB-NEXT: srli a0, a0, 63 +; RV64XTHEADBB-NEXT: addi a0, a0, -1 +; RV64XTHEADBB-NEXT: zext.b a0, a0 +; RV64XTHEADBB-NEXT: ret + %icmp = icmp sgt i32 %a, -1 + %sext = sext i1 %icmp to i8 + ret i8 %sext +} + define i64 @sexti1_i64(i64 %a) nounwind { ; RV64I-LABEL: sexti1_i64: ; RV64I: # %bb.0: @@ -666,6 +686,26 @@ define i64 @sexti1_i64_2(i1 %a) nounwind { ret i64 %sext } +; Make sure we don't use not+th.ext +define zeroext i8 @sexti1_i64_setcc(i64 %a) { +; RV64I-LABEL: sexti1_i64_setcc: +; RV64I: # %bb.0: +; RV64I-NEXT: srli a0, a0, 63 +; RV64I-NEXT: addi a0, a0, -1 +; RV64I-NEXT: zext.b a0, a0 +; RV64I-NEXT: ret +; +; RV64XTHEADBB-LABEL: sexti1_i64_setcc: +; RV64XTHEADBB: # %bb.0: +; RV64XTHEADBB-NEXT: srli a0, a0, 63 +; RV64XTHEADBB-NEXT: addi a0, a0, -1 +; RV64XTHEADBB-NEXT: zext.b a0, a0 +; RV64XTHEADBB-NEXT: ret + %icmp = icmp sgt i64 %a, -1 + %sext = sext i1 %icmp to i8 + ret i8 %sext +} + define signext i32 @sextb_i32(i32 signext %a) nounwind { ; RV64I-LABEL: sextb_i32: ; RV64I: # %bb.0: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll index a04e31a19a4f..902001a376d6 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll @@ -434,19 +434,12 @@ define <256 x i8> @vsadd_vi_v258i8_evl129(<256 x i8> %va, <256 x i1> %m) { ret <256 x i8> %v } -; FIXME: The upper half is doing nothing. 
- define <256 x i8> @vsadd_vi_v258i8_evl128(<256 x i8> %va, <256 x i1> %m) { ; CHECK-LABEL: vsadd_vi_v258i8_evl128: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 0, e8, m8, ta, ma -; CHECK-NEXT: vlm.v v24, (a0) ; CHECK-NEXT: li a0, 128 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma ; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t -; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: vsetivli zero, 0, e8, m8, ta, ma -; CHECK-NEXT: vsadd.vi v16, v16, -1, v0.t ; CHECK-NEXT: ret %v = call <256 x i8> @llvm.vp.sadd.sat.v258i8(<256 x i8> %va, <256 x i8> splat (i8 -1), <256 x i1> %m, i32 128) ret <256 x i8> %v @@ -1418,13 +1411,8 @@ define <32 x i64> @vsadd_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { define <32 x i64> @vsadd_vx_v32i64_evl12(<32 x i64> %va, <32 x i1> %m) { ; CHECK-LABEL: vsadd_vx_v32i64_evl12: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vi v24, v0, 2 ; CHECK-NEXT: vsetivli zero, 12, e64, m8, ta, ma ; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t -; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: vsetivli zero, 0, e64, m8, ta, ma -; CHECK-NEXT: vsadd.vi v16, v16, -1, v0.t ; CHECK-NEXT: ret %v = call <32 x i64> @llvm.vp.sadd.sat.v32i64(<32 x i64> %va, <32 x i64> splat (i64 -1), <32 x i1> %m, i32 12) ret <32 x i64> %v diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll index 5556b11e9a90..57292147a014 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll @@ -430,19 +430,12 @@ define <256 x i8> @vsaddu_vi_v258i8_evl129(<256 x i8> %va, <256 x i1> %m) { ret <256 x i8> %v } -; FIXME: The upper half is doing nothing. 
- define <256 x i8> @vsaddu_vi_v258i8_evl128(<256 x i8> %va, <256 x i1> %m) { ; CHECK-LABEL: vsaddu_vi_v258i8_evl128: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 0, e8, m8, ta, ma -; CHECK-NEXT: vlm.v v24, (a0) ; CHECK-NEXT: li a0, 128 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma ; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t -; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: vsetivli zero, 0, e8, m8, ta, ma -; CHECK-NEXT: vsaddu.vi v16, v16, -1, v0.t ; CHECK-NEXT: ret %v = call <256 x i8> @llvm.vp.uadd.sat.v258i8(<256 x i8> %va, <256 x i8> splat (i8 -1), <256 x i1> %m, i32 128) ret <256 x i8> %v @@ -1414,13 +1407,8 @@ define <32 x i64> @vsaddu_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { define <32 x i64> @vsaddu_vx_v32i64_evl12(<32 x i64> %va, <32 x i1> %m) { ; CHECK-LABEL: vsaddu_vx_v32i64_evl12: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vi v24, v0, 2 ; CHECK-NEXT: vsetivli zero, 12, e64, m8, ta, ma ; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t -; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: vsetivli zero, 0, e64, m8, ta, ma -; CHECK-NEXT: vsaddu.vi v16, v16, -1, v0.t ; CHECK-NEXT: ret %v = call <32 x i64> @llvm.vp.uadd.sat.v32i64(<32 x i64> %va, <32 x i64> splat (i64 -1), <32 x i1> %m, i32 12) ret <32 x i64> %v diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll index c28317bf1426..353042fc889e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll @@ -449,20 +449,13 @@ define <256 x i8> @vssub_vi_v258i8_evl129(<256 x i8> %va, <256 x i1> %m) { ret <256 x i8> %v } -; FIXME: The upper half is doing nothing. 
- define <256 x i8> @vssub_vi_v258i8_evl128(<256 x i8> %va, <256 x i1> %m) { ; CHECK-LABEL: vssub_vi_v258i8_evl128: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 0, e8, m8, ta, ma -; CHECK-NEXT: vlm.v v24, (a0) ; CHECK-NEXT: li a0, 128 ; CHECK-NEXT: li a1, -1 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma ; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t -; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: vsetivli zero, 0, e8, m8, ta, ma -; CHECK-NEXT: vssub.vx v16, v16, a1, v0.t ; CHECK-NEXT: ret %v = call <256 x i8> @llvm.vp.ssub.sat.v258i8(<256 x i8> %va, <256 x i8> splat (i8 -1), <256 x i1> %m, i32 128) ret <256 x i8> %v @@ -1460,14 +1453,9 @@ define <32 x i64> @vssub_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { define <32 x i64> @vssub_vx_v32i64_evl12(<32 x i64> %va, <32 x i1> %m) { ; CHECK-LABEL: vssub_vx_v32i64_evl12: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vi v24, v0, 2 ; CHECK-NEXT: li a0, -1 ; CHECK-NEXT: vsetivli zero, 12, e64, m8, ta, ma ; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t -; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: vsetivli zero, 0, e64, m8, ta, ma -; CHECK-NEXT: vssub.vx v16, v16, a0, v0.t ; CHECK-NEXT: ret %v = call <32 x i64> @llvm.vp.ssub.sat.v32i64(<32 x i64> %va, <32 x i64> splat (i64 -1), <32 x i1> %m, i32 12) ret <32 x i64> %v diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll index cbfe1292877e..c00fb329b2f0 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll @@ -444,20 +444,13 @@ define <256 x i8> @vssubu_vi_v258i8_evl129(<256 x i8> %va, <256 x i1> %m) { ret <256 x i8> %v } -; FIXME: The upper half is doing nothing. 
- define <256 x i8> @vssubu_vi_v258i8_evl128(<256 x i8> %va, <256 x i1> %m) { ; CHECK-LABEL: vssubu_vi_v258i8_evl128: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 0, e8, m8, ta, ma -; CHECK-NEXT: vlm.v v24, (a0) ; CHECK-NEXT: li a0, 128 ; CHECK-NEXT: li a1, -1 ; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma ; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t -; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: vsetivli zero, 0, e8, m8, ta, ma -; CHECK-NEXT: vssubu.vx v16, v16, a1, v0.t ; CHECK-NEXT: ret %v = call <256 x i8> @llvm.vp.usub.sat.v258i8(<256 x i8> %va, <256 x i8> splat (i8 -1), <256 x i1> %m, i32 128) ret <256 x i8> %v @@ -1455,14 +1448,9 @@ define <32 x i64> @vssubu_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) { define <32 x i64> @vssubu_vx_v32i64_evl12(<32 x i64> %va, <32 x i1> %m) { ; CHECK-LABEL: vssubu_vx_v32i64_evl12: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma -; CHECK-NEXT: vslidedown.vi v24, v0, 2 ; CHECK-NEXT: li a0, -1 ; CHECK-NEXT: vsetivli zero, 12, e64, m8, ta, ma ; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t -; CHECK-NEXT: vmv1r.v v0, v24 -; CHECK-NEXT: vsetivli zero, 0, e64, m8, ta, ma -; CHECK-NEXT: vssubu.vx v16, v16, a0, v0.t ; CHECK-NEXT: ret %v = call <32 x i64> @llvm.vp.usub.sat.v32i64(<32 x i64> %va, <32 x i64> splat (i64 -1), <32 x i1> %m, i32 12) ret <32 x i64> %v diff --git a/llvm/test/CodeGen/RISCV/xqcibm-extract.ll b/llvm/test/CodeGen/RISCV/xqcibm-extract.ll index 481bfdd66643..fc3d8fe54602 100644 --- a/llvm/test/CodeGen/RISCV/xqcibm-extract.ll +++ b/llvm/test/CodeGen/RISCV/xqcibm-extract.ll @@ -47,6 +47,33 @@ define i32 @sexti1_i32_2(i32 %a) { ret i32 %shr } +; Make sure we don't use not+qc.ext +define zeroext i8 @sexti1_i32_setcc(i32 signext %a) { +; RV32I-LABEL: sexti1_i32_setcc: +; RV32I: # %bb.0: +; RV32I-NEXT: srli a0, a0, 31 +; RV32I-NEXT: addi a0, a0, -1 +; RV32I-NEXT: zext.b a0, a0 +; RV32I-NEXT: ret +; +; RV32XQCIBM-LABEL: sexti1_i32_setcc: +; RV32XQCIBM: # %bb.0: +; RV32XQCIBM-NEXT: srli a0, a0, 31 +; 
RV32XQCIBM-NEXT: addi a0, a0, -1 +; RV32XQCIBM-NEXT: qc.extu a0, a0, 8, 0 +; RV32XQCIBM-NEXT: ret +; +; RV32XQCIBMZBB-LABEL: sexti1_i32_setcc: +; RV32XQCIBMZBB: # %bb.0: +; RV32XQCIBMZBB-NEXT: srli a0, a0, 31 +; RV32XQCIBMZBB-NEXT: addi a0, a0, -1 +; RV32XQCIBMZBB-NEXT: qc.extu a0, a0, 8, 0 +; RV32XQCIBMZBB-NEXT: ret + %icmp = icmp sgt i32 %a, -1 + %sext = sext i1 %icmp to i8 + ret i8 %sext +} + define i32 @sexti8_i32(i8 %a) nounwind { ; RV32I-LABEL: sexti8_i32: diff --git a/llvm/test/CodeGen/WebAssembly/memcmp-expand.ll b/llvm/test/CodeGen/WebAssembly/memcmp-expand.ll index 8030438645f8..4357dc5631eb 100644 --- a/llvm/test/CodeGen/WebAssembly/memcmp-expand.ll +++ b/llvm/test/CodeGen/WebAssembly/memcmp-expand.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers | FileCheck %s +; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128 | FileCheck %s target triple = "wasm32-unknown-unknown" @@ -127,24 +127,15 @@ define i1 @memcmp_expand_8(ptr %a, ptr %b) { ret i1 %res } -; TODO: Should be using a single load i64x2 or equivalent in bitsizes define i1 @memcmp_expand_16(ptr %a, ptr %b) { ; CHECK-LABEL: memcmp_expand_16: ; CHECK: .functype memcmp_expand_16 (i32, i32) -> (i32) ; CHECK-NEXT: # %bb.0: -; CHECK-NEXT: i64.load $push7=, 0($0):p2align=0 -; CHECK-NEXT: i64.load $push6=, 0($1):p2align=0 -; CHECK-NEXT: i64.xor $push8=, $pop7, $pop6 -; CHECK-NEXT: i32.const $push0=, 8 -; CHECK-NEXT: i32.add $push3=, $0, $pop0 -; CHECK-NEXT: i64.load $push4=, 0($pop3):p2align=0 -; CHECK-NEXT: i32.const $push11=, 8 -; CHECK-NEXT: i32.add $push1=, $1, $pop11 -; CHECK-NEXT: i64.load $push2=, 0($pop1):p2align=0 -; CHECK-NEXT: i64.xor $push5=, $pop4, $pop2 -; CHECK-NEXT: i64.or $push9=, $pop8, $pop5 -; CHECK-NEXT: i64.eqz $push10=, $pop9 -; CHECK-NEXT: return 
$pop10 +; CHECK-NEXT: v128.load $push1=, 0($0):p2align=0 +; CHECK-NEXT: v128.load $push0=, 0($1):p2align=0 +; CHECK-NEXT: i8x16.eq $push2=, $pop1, $pop0 +; CHECK-NEXT: i8x16.all_true $push3=, $pop2 +; CHECK-NEXT: return $pop3 %cmp_16 = call i32 @memcmp(ptr %a, ptr %b, i32 16) %res = icmp eq i32 %cmp_16, 0 ret i1 %res diff --git a/llvm/test/CodeGen/WebAssembly/simd-setcc.ll b/llvm/test/CodeGen/WebAssembly/simd-setcc.ll new file mode 100644 index 000000000000..127fd4e96303 --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/simd-setcc.ll @@ -0,0 +1,87 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128 | FileCheck %s + +target triple = "wasm32-unknown-unknown" + +declare i32 @memcmp(ptr, ptr, i32) + +define i1 @setcc_load(ptr %a, ptr %b) { +; CHECK-LABEL: setcc_load: +; CHECK: .functype setcc_load (i32, i32) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: v128.load $push1=, 0($0):p2align=0 +; CHECK-NEXT: v128.load $push0=, 0($1):p2align=0 +; CHECK-NEXT: i8x16.eq $push2=, $pop1, $pop0 +; CHECK-NEXT: i8x16.all_true $push3=, $pop2 +; CHECK-NEXT: return $pop3 + %cmp_16 = call i32 @memcmp(ptr %a, ptr %b, i32 16) + %res = icmp eq i32 %cmp_16, 0 + ret i1 %res +} + +; INFO: Negative test: noimplicitfloat disables simd +define i1 @setcc_load_should_not_vectorize(ptr %a, ptr %b) noimplicitfloat { +; CHECK-LABEL: setcc_load_should_not_vectorize: +; CHECK: .functype setcc_load_should_not_vectorize (i32, i32) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: i64.load $push4=, 0($0):p2align=0 +; CHECK-NEXT: i64.load $push3=, 0($1):p2align=0 +; CHECK-NEXT: i64.xor $push5=, $pop4, $pop3 +; CHECK-NEXT: i64.load $push1=, 8($0):p2align=0 +; CHECK-NEXT: i64.load $push0=, 8($1):p2align=0 +; CHECK-NEXT: i64.xor $push2=, $pop1, $pop0 +; CHECK-NEXT: i64.or $push6=, $pop5, $pop2 +; CHECK-NEXT: i64.eqz $push7=, $pop6 +; 
CHECK-NEXT: return $pop7 + %cmp_16 = call i32 @memcmp(ptr %a, ptr %b, i32 16) + %res = icmp eq i32 %cmp_16, 0 + ret i1 %res +} + +define i1 @setcc_eq_const_i128(ptr %ptr) { +; CHECK-LABEL: setcc_eq_const_i128: +; CHECK: .functype setcc_eq_const_i128 (i32) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: v128.load $push0=, 0($0) +; CHECK-NEXT: v128.const $push1=, 6, 0 +; CHECK-NEXT: i8x16.eq $push2=, $pop0, $pop1 +; CHECK-NEXT: i8x16.all_true $push3=, $pop2 +; CHECK-NEXT: return $pop3 + %l = load i128, ptr %ptr + %res = icmp eq i128 %l, 6 + ret i1 %res +} + +define i1 @setcc_ne_const_i128(ptr %ptr) { +; CHECK-LABEL: setcc_ne_const_i128: +; CHECK: .functype setcc_ne_const_i128 (i32) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: v128.load $push0=, 0($0) +; CHECK-NEXT: v128.const $push1=, 16, 0 +; CHECK-NEXT: i8x16.ne $push2=, $pop0, $pop1 +; CHECK-NEXT: v128.any_true $push3=, $pop2 +; CHECK-NEXT: return $pop3 + %l = load i128, ptr %ptr + %res = icmp ne i128 %l, 16 + ret i1 %res +} + +; INFO: Negative test: only eq and ne works +define i1 @setcc_slt_const_i128(ptr %ptr) { +; CHECK-LABEL: setcc_slt_const_i128: +; CHECK: .functype setcc_slt_const_i128 (i32) -> (i32) +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: i64.load $push2=, 0($0) +; CHECK-NEXT: i64.const $push3=, 25 +; CHECK-NEXT: i64.lt_u $push4=, $pop2, $pop3 +; CHECK-NEXT: i64.load $push8=, 8($0) +; CHECK-NEXT: local.tee $push7=, $1=, $pop8 +; CHECK-NEXT: i64.const $push0=, 0 +; CHECK-NEXT: i64.lt_s $push1=, $pop7, $pop0 +; CHECK-NEXT: i64.eqz $push5=, $1 +; CHECK-NEXT: i32.select $push6=, $pop4, $pop1, $pop5 +; CHECK-NEXT: return $pop6 + %l = load i128, ptr %ptr + %res = icmp slt i128 %l, 25 + ret i1 %res +} diff --git a/llvm/test/CodeGen/X86/bitcnt-false-dep.ll b/llvm/test/CodeGen/X86/bitcnt-false-dep.ll index 5f576c858628..793cbb8f75bd 100644 --- a/llvm/test/CodeGen/X86/bitcnt-false-dep.ll +++ b/llvm/test/CodeGen/X86/bitcnt-false-dep.ll @@ -1,6 +1,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=haswell 
| FileCheck %s --check-prefix=HSW ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake | FileCheck %s --check-prefix=SKL ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skx | FileCheck %s --check-prefix=SKL +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=alderlake | FileCheck %s --check-prefix=ADL ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=silvermont -mattr=+lzcnt,+bmi | FileCheck %s --check-prefix=SKL ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=goldmont -mattr=+lzcnt,+bmi | FileCheck %s --check-prefix=SKL @@ -37,6 +38,10 @@ ret: ;SKL-LABEL:@loopdep_popcnt32 ;SKL: xorl [[GPR0:%e[a-d]x]], [[GPR0]] ;SKL-NEXT: popcntl {{.*}}, [[GPR0]] + +;ADL-LABEL:@loopdep_popcnt32 +;ADL-NOT: xor +;ADL: popcntl } define i64 @loopdep_popcnt64(ptr nocapture %x, ptr nocapture %y) nounwind { @@ -63,6 +68,10 @@ ret: ;SKL-LABEL:@loopdep_popcnt64 ;SKL: xorl %e[[GPR0:[a-d]x]], %e[[GPR0]] ;SKL-NEXT: popcntq {{.*}}, %r[[GPR0]] + +;ADL-LABEL:@loopdep_popcnt64 +;ADL-NOT: xor +;ADL: popcntq } define i32 @loopdep_tzct32(ptr nocapture %x, ptr nocapture %y) nounwind { diff --git a/llvm/test/CodeGen/X86/pr152150.ll b/llvm/test/CodeGen/X86/pr152150.ll new file mode 100644 index 000000000000..6db3e555028c --- /dev/null +++ b/llvm/test/CodeGen/X86/pr152150.ll @@ -0,0 +1,14 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown-eabi-elf | FileCheck %s + +; CHECK-LABEL: conv2d +define dso_local void @conv2d() { +.preheader: + br label %0 + +0: ; preds = %0, %.preheader + %1 = phi [4 x <7 x half>] [ zeroinitializer, %.preheader ], [ %4, %0 ] + %2 = extractvalue [4 x <7 x half>] %1, 0 + %3 = extractvalue [4 x <7 x half>] %1, 1 + %4 = insertvalue [4 x <7 x half>] poison, <7 x half> poison, 3 + br label %0 +} diff --git a/llvm/test/DebugInfo/LoongArch/dwarf-loongarch-relocs.ll b/llvm/test/DebugInfo/LoongArch/dwarf-loongarch-relocs.ll index d28836d56037..2f5cc373a68f 100644 --- a/llvm/test/DebugInfo/LoongArch/dwarf-loongarch-relocs.ll +++ 
b/llvm/test/DebugInfo/LoongArch/dwarf-loongarch-relocs.ll @@ -1,5 +1,5 @@ ; RUN: llc --filetype=obj --mtriple=loongarch64 --mattr=-relax %s -o %t.o -; RUN: llvm-readobj -r %t.o | FileCheck --check-prefixes=RELOCS-BOTH,RELOCS-NORL %s +; RUN: llvm-readobj -r %t.o | FileCheck --check-prefix=RELOCS-BOTH %s ; RUN: llvm-objdump --source %t.o | FileCheck --check-prefix=SOURCE %s ; RUN: llvm-dwarfdump --debug-info --debug-line %t.o | FileCheck --check-prefix=DWARF %s @@ -16,10 +16,8 @@ ; RELOCS-ENRL-NEXT: 0x18 R_LARCH_RELAX - 0x0 ; RELOCS-BOTH-NEXT: } ; RELOCS-BOTH: Section ({{.*}}) .rela.debug_frame { -; RELOCS-NORL-NEXT: 0x1C R_LARCH_32 .debug_frame 0x0 -; RELOCS-NORL-NEXT: 0x20 R_LARCH_64 .text 0x0 -; RELOCS-ENRL-NEXT: 0x1C R_LARCH_32 .L0 0x0 -; RELOCS-ENRL-NEXT: 0x20 R_LARCH_64 .L0 0x0 +; RELOCS-BOTH-NEXT: 0x1C R_LARCH_32 .L0 0x0 +; RELOCS-BOTH-NEXT: 0x20 R_LARCH_64 .L0 0x0 ; RELOCS-ENRL-NEXT: 0x28 R_LARCH_ADD64 .L0 0x0 ; RELOCS-ENRL-NEXT: 0x28 R_LARCH_SUB64 .L0 0x0 ; RELOCS-ENRL-NEXT: 0x3F R_LARCH_ADD6 .L0 0x0 @@ -29,8 +27,7 @@ ; RELOCS-BOTH-NEXT: 0x22 R_LARCH_32 .debug_line_str 0x0 ; RELOCS-BOTH-NEXT: 0x31 R_LARCH_32 .debug_line_str 0x2 ; RELOCS-BOTH-NEXT: 0x46 R_LARCH_32 .debug_line_str 0x1B -; RELOCS-NORL-NEXT: 0x4F R_LARCH_64 .text 0x0 -; RELOCS-ENRL-NEXT: 0x4F R_LARCH_64 .L0 0x0 +; RELOCS-BOTH-NEXT: 0x4F R_LARCH_64 .L0 0x0 ; RELOCS-ENRL-NEXT: 0x5F R_LARCH_ADD16 .L0 0x0 ; RELOCS-ENRL-NEXT: 0x5F R_LARCH_SUB16 .L0 0x0 ; RELOCS-BOTH-NEXT: } diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll index f916130fe53e..29269ff33377 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx2-intrinsics-x86.ll @@ -140,11 +140,20 @@ define <8 x i32> @test_x86_avx2_pmadd_wd(<16 x i16> %a0, <16 x i16> %a1) #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: 
[[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = or <16 x i16> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i16> [[TMP3]] to <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i32> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i32> -; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> [[A0:%.*]], <16 x i16> [[A1:%.*]]) +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i16> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <16 x i16> [[A0:%.*]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = icmp ne <16 x i16> [[A1:%.*]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = and <16 x i1> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i1> [[TMP12]], [[TMP5]] +; CHECK-NEXT: [[TMP15:%.*]] = and <16 x i1> [[TMP4]], [[TMP13]] +; CHECK-NEXT: [[TMP16:%.*]] = or <16 x i1> [[TMP11]], [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i1> [[TMP16]], [[TMP15]] +; CHECK-NEXT: [[TMP7:%.*]] = sext <16 x i1> [[TMP17]] to <16 x i16> +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <16 x i16> [[TMP7]] to <8 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <8 x i32> [[TMP18]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP19]] to <8 x i32> +; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> [[A0]], <16 x i16> [[A1]]) ; CHECK-NEXT: store <8 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RES]] ; @@ -677,11 +686,20 @@ define <16 x i16> @test_x86_avx2_pmadd_ub_sw(<32 x i8> %a0, <32 x i8> %a1) #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 
8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = or <32 x i8> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <32 x i8> [[TMP3]] to <16 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i16> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i16> -; CHECK-NEXT: [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> [[A0:%.*]], <32 x i8> [[A1:%.*]]) +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <32 x i8> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <32 x i8> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <32 x i8> [[A0:%.*]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = icmp ne <32 x i8> [[A1:%.*]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = and <32 x i1> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP14:%.*]] = and <32 x i1> [[TMP12]], [[TMP5]] +; CHECK-NEXT: [[TMP15:%.*]] = and <32 x i1> [[TMP4]], [[TMP13]] +; CHECK-NEXT: [[TMP16:%.*]] = or <32 x i1> [[TMP11]], [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = or <32 x i1> [[TMP16]], [[TMP15]] +; CHECK-NEXT: [[TMP7:%.*]] = sext <32 x i1> [[TMP17]] to <32 x i8> +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <32 x i8> [[TMP7]] to <16 x i16> +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <16 x i16> [[TMP18]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP19]] to <16 x i16> +; CHECK-NEXT: [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> [[A0]], <32 x i8> [[A1]]) ; CHECK-NEXT: store <16 x i16> [[TMP6]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i16> [[RES]] ; @@ -706,11 +724,20 @@ define <16 x i16> @test_x86_avx2_pmadd_ub_sw_load_op0(ptr %ptr, <32 x i8> %a1) # ; CHECK-NEXT: [[TMP6:%.*]] = xor i64 [[TMP5]], 87960930222080 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <32 x i8>, ptr [[TMP7]], align 32 -; CHECK-NEXT: [[TMP8:%.*]] = or <32 x i8> [[_MSLD]], [[TMP2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <32 x i8> [[TMP8]] to <16 x i16> -; 
CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i16> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = sext <16 x i1> [[TMP10]] to <16 x i16> -; CHECK-NEXT: [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> [[A0]], <32 x i8> [[A1:%.*]]) +; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <32 x i8> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i8> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = icmp ne <32 x i8> [[A0]], zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <32 x i8> [[A1:%.*]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = and <32 x i1> [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP19:%.*]] = and <32 x i1> [[TMP17]], [[TMP10]] +; CHECK-NEXT: [[TMP20:%.*]] = and <32 x i1> [[TMP9]], [[TMP18]] +; CHECK-NEXT: [[TMP21:%.*]] = or <32 x i1> [[TMP16]], [[TMP19]] +; CHECK-NEXT: [[TMP22:%.*]] = or <32 x i1> [[TMP21]], [[TMP20]] +; CHECK-NEXT: [[TMP12:%.*]] = sext <32 x i1> [[TMP22]] to <32 x i8> +; CHECK-NEXT: [[TMP23:%.*]] = bitcast <32 x i8> [[TMP12]] to <16 x i16> +; CHECK-NEXT: [[TMP24:%.*]] = icmp ne <16 x i16> [[TMP23]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = sext <16 x i1> [[TMP24]] to <16 x i16> +; CHECK-NEXT: [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> [[A0]], <32 x i8> [[A1]]) ; CHECK-NEXT: store <16 x i16> [[TMP11]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i16> [[RES]] ; @@ -740,8 +767,9 @@ define <32 x i8> @test_x86_avx2_pshuf_b(<32 x i8> %a0, <32 x i8> %a1) #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i8> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[RES:%.*]] = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> [[A0:%.*]], <32 x i8> [[A1:%.*]]) +; CHECK-NEXT: [[TMP3:%.*]] = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> 
[[TMP1]], <32 x i8> [[A1:%.*]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i8> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[RES:%.*]] = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> [[A0:%.*]], <32 x i8> [[A1]]) ; CHECK-NEXT: store <32 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <32 x i8> [[RES]] ; diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics-upgrade.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics-upgrade.ll index 02df9c49a010..abbbb040edf1 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics-upgrade.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics-upgrade.ll @@ -22,7 +22,6 @@ ; - llvm.x86.avx512.pavg.b.512, llvm.x86.avx512.pavg.w.512 ; - llvm.x86.avx512.permvar.hi.512 ; - llvm.x86.avx512.pmul.hr.sw.512, llvm.x86.avx512.pmulhu.w.512, llvm.x86.avx512.pmulh.w.512 -; - llvm.x86.avx512.pshuf.b.512 ; - llvm.x86.avx512.psllv.w.512, llvm.x86.avx512.psrav.w.512, llvm.x86.avx512.psrlv.w.512 target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" @@ -1968,8 +1967,9 @@ define <64 x i8> @test_int_x86_avx512_pshuf_b_512(<64 x i8> %x0, <64 x i8> %x1, ; CHECK-NEXT: [[TMP1:%.*]] = load <64 x i8>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <64 x i8> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP3:%.*]] = call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> [[X0:%.*]], <64 x i8> [[X1:%.*]]) +; CHECK-NEXT: [[TMP4:%.*]] = call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> [[TMP1]], <64 x i8> [[X1:%.*]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <64 x i8> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP3:%.*]] = call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> [[X0:%.*]], <64 x i8> [[X1]]) ; CHECK-NEXT: store <64 x i8> [[_MSPROP]], ptr 
@__msan_retval_tls, align 8 ; CHECK-NEXT: ret <64 x i8> [[TMP3]] ; @@ -1984,8 +1984,9 @@ define <64 x i8> @test_int_x86_avx512_mask_pshuf_b_512(<64 x i8> %x0, <64 x i8> ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <64 x i8> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> [[X0:%.*]], <64 x i8> [[X1:%.*]]) +; CHECK-NEXT: [[TMP13:%.*]] = call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> [[TMP1]], <64 x i8> [[X1:%.*]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <64 x i8> [[TMP2]], [[TMP13]] +; CHECK-NEXT: [[TMP5:%.*]] = call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> [[X0:%.*]], <64 x i8> [[X1]]) ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64 [[TMP3]] to <64 x i1> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast i64 [[X3:%.*]] to <64 x i1> ; CHECK-NEXT: [[TMP8:%.*]] = select <64 x i1> [[TMP7]], <64 x i8> [[_MSPROP]], <64 x i8> [[TMP4]] diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics.ll index 78c272c7b2c5..00337da67af1 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512bw-intrinsics.ll @@ -17,7 +17,6 @@ ; - llvm.x86.avx512.pavg.b.512, llvm.x86.avx512.pavg.w.512 ; - llvm.x86.avx512.permvar.hi.512 ; - llvm.x86.avx512.pmul.hr.sw.512, llvm.x86.avx512.pmulhu.w.512, llvm.x86.avx512.pmulh.w.512 -; - llvm.x86.avx512.pshuf.b.512 ; - llvm.x86.avx512.psllv.w.512 ; - llvm.x86.avx512.psrav.w.512, llvm.x86.avx512.psrlv.w.512 @@ -1714,8 +1713,9 @@ define <64 x i8>@test_int_x86_avx512_pshuf_b_512(<64 x i8> %x0, <64 x i8> %x1) # ; CHECK-NEXT: [[TMP1:%.*]] = 
load <64 x i8>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <64 x i8> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[RES:%.*]] = call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> [[X0:%.*]], <64 x i8> [[X1:%.*]]) +; CHECK-NEXT: [[TMP3:%.*]] = call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> [[TMP1]], <64 x i8> [[X1:%.*]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <64 x i8> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[RES:%.*]] = call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> [[X0:%.*]], <64 x i8> [[X1]]) ; CHECK-NEXT: store <64 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <64 x i8> [[RES]] ; @@ -1730,8 +1730,9 @@ define <64 x i8>@test_int_x86_avx512_pshuf_b_512_mask(<64 x i8> %x0, <64 x i8> % ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 192) to ptr), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <64 x i8> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[RES:%.*]] = call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> [[X0:%.*]], <64 x i8> [[X1:%.*]]) +; CHECK-NEXT: [[TMP10:%.*]] = call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> [[TMP1]], <64 x i8> [[X1:%.*]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <64 x i8> [[TMP2]], [[TMP10]] +; CHECK-NEXT: [[RES:%.*]] = call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> [[X0:%.*]], <64 x i8> [[X1]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64 [[TMP3]] to <64 x i1> ; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <64 x i1> [[MASK_CAST]], <64 x i8> [[_MSPROP]], <64 x i8> [[TMP4]] @@ -1755,8 +1756,9 @@ define <64 x 
i8>@test_int_x86_avx512_pshuf_b_512_maskz(<64 x i8> %x0, <64 x i8> ; CHECK-NEXT: [[TMP2:%.*]] = load <64 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 64) to ptr), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 128) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <64 x i8> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[RES:%.*]] = call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> [[X0:%.*]], <64 x i8> [[X1:%.*]]) +; CHECK-NEXT: [[TMP9:%.*]] = call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> [[TMP1]], <64 x i8> [[X1:%.*]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <64 x i8> [[TMP2]], [[TMP9]] +; CHECK-NEXT: [[RES:%.*]] = call <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8> [[X0:%.*]], <64 x i8> [[X1]]) ; CHECK-NEXT: [[TMP4:%.*]] = bitcast i64 [[TMP3]] to <64 x i1> ; CHECK-NEXT: [[MASK_CAST:%.*]] = bitcast i64 [[MASK:%.*]] to <64 x i1> ; CHECK-NEXT: [[TMP5:%.*]] = select <64 x i1> [[MASK_CAST]], <64 x i8> [[_MSPROP]], <64 x i8> zeroinitializer diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll index ac3bb5671903..3d98f60a8242 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/mmx-intrinsics.ll @@ -4,8 +4,6 @@ ; Handled strictly: ; - i32 @llvm.x86.mmx.pmovmskb(<1 x i64> %mmx_var.i) #2 ; - void @llvm.x86.mmx.maskmovq(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i, ptr %p) #2 -; - <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %4, i8 3) #5 -; - <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %4, i8 3) #5 ; - <2 x double> @llvm.x86.sse.cvtpi2pd(<1 x i64> %4) #5 ; - <1 x i64> @llvm.x86.sse.cvttpd2pi(<2 x double> %a) #5 ; - <1 x i64> @llvm.x86.sse.cvtpd2pi(<2 x double> %a) #5 @@ -1687,16 +1685,30 @@ define i64 @test49(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: 
[[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> -; CHECK-NEXT: [[TMP8:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <2 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <2 x i32> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = sext <2 x i1> [[TMP10]] to <2 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64> -; CHECK-NEXT: [[TMP14:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP12]] to <2 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = bitcast <1 x i64> [[MMX_VAR_I]] to <4 x i16> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[MMX_VAR1_I]] to <4 x i16> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP6]] to <4 x i16> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16> +; CHECK-NEXT: [[TMP29:%.*]] = icmp ne <4 x i16> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP30:%.*]] = icmp ne <4 x i16> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = icmp ne <4 x i16> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP32:%.*]] = icmp ne <4 x i16> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP31:%.*]] = and <4 x i1> [[TMP29]], [[TMP30]] +; CHECK-NEXT: [[TMP35:%.*]] = and <4 x i1> [[TMP22]], [[TMP30]] +; CHECK-NEXT: [[TMP36:%.*]] = and <4 x i1> [[TMP29]], [[TMP32]] +; CHECK-NEXT: [[TMP37:%.*]] = or <4 x i1> [[TMP31]], [[TMP35]] +; CHECK-NEXT: [[TMP38:%.*]] = or <4 x i1> [[TMP37]], [[TMP36]] +; CHECK-NEXT: [[TMP23:%.*]] = sext <4 x i1> [[TMP38]] to <4 x i16> +; CHECK-NEXT: [[TMP24:%.*]] = bitcast <4 x i16> [[TMP23]] to <2 x i32> +; CHECK-NEXT: [[TMP25:%.*]] = icmp ne <2 x i32> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = sext <2 x i1> [[TMP25]] to <2 x i32> +; CHECK-NEXT: [[TMP28:%.*]] = bitcast 
<2 x i32> [[TMP27]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[TMP28]] to <1 x i64> +; CHECK-NEXT: [[TMP33:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] ; CHECK-NEXT: [[TMP20:%.*]] = bitcast <1 x i64> [[TMP14]] to <2 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64> -; CHECK-NEXT: [[TMP21:%.*]] = bitcast <2 x i32> [[TMP20]] to <1 x i64> +; CHECK-NEXT: [[TMP34:%.*]] = bitcast <1 x i64> [[TMP33]] to <2 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <2 x i32> [[TMP20]] to <1 x i64> +; CHECK-NEXT: [[TMP21:%.*]] = bitcast <2 x i32> [[TMP34]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP17]], i32 0 ; CHECK-NEXT: [[TMP18:%.*]] = extractelement <1 x i64> [[TMP21]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -2778,19 +2790,17 @@ define i64 @test21(<1 x i64> %a) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> ; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP8]] to <1 x i64> ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP10]] to i64 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP12:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[TMP13:%.*]] = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> [[TMP11]], i8 3) #[[ATTR5]] +; CHECK-NEXT: [[TMP9:%.*]] = call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> [[TMP10]], i8 3) +; CHECK-NEXT: [[TMP13:%.*]] = or <1 x i64> zeroinitializer, [[TMP9]] +; CHECK-NEXT: [[TMP6:%.*]] = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> [[TMP11]], i8 3) #[[ATTR5]] ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP13]] to <4 x i16> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <1 x i64> [[TMP6]] to <4 x i16> ; 
CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP12]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 -; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret i64 [[TMP5]] +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <1 x i64> [[TMP14]], i32 0 +; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i64 [[TMP15]] ; entry: %0 = bitcast <1 x i64> %a to <4 x i16> @@ -2812,19 +2822,17 @@ define i32 @test21_2(<1 x i64> %a) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> ; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP8]] to <1 x i64> ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP10]] to i64 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP6:%.*]], label [[TMP12:%.*]], !prof [[PROF1]] -; CHECK: 6: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] -; CHECK-NEXT: unreachable -; CHECK: 7: -; CHECK-NEXT: [[TMP13:%.*]] = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> [[TMP11]], i8 3) #[[ATTR5]] +; CHECK-NEXT: [[TMP9:%.*]] = call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> [[TMP10]], i8 3) +; CHECK-NEXT: [[TMP13:%.*]] = or <1 x i64> zeroinitializer, [[TMP9]] +; CHECK-NEXT: [[TMP6:%.*]] = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> [[TMP11]], i8 3) #[[ATTR5]] ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP13]] to <4 x i16> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <1 x i64> [[TMP6]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <2 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP12]] to <2 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0 -; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret i32 [[TMP5]] +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x 
i32> [[TMP14]], i32 0 +; CHECK-NEXT: store i32 [[TMP5]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[TMP15]] ; entry: %0 = bitcast <1 x i64> %a to <4 x i16> @@ -3235,7 +3243,8 @@ define i64 @test9(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP12]] to <1 x i64> ; CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> -; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]] +; CHECK-NEXT: [[TMP20:%.*]] = call <1 x i64> @llvm.x86.ssse3.pshuf.b(<1 x i64> [[TMP16]], <1 x i64> [[TMP17]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP8]], [[TMP20]] ; CHECK-NEXT: [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pshuf.b(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5]] ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8> ; CHECK-NEXT: [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <8 x i8> @@ -3315,16 +3324,30 @@ define i64 @test7(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP17]] to <1 x i64> ; CHECK-NEXT: [[TMP23:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> -; CHECK-NEXT: [[TMP10:%.*]] = or <1 x i64> [[TMP21]], [[TMP8]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> -; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <4 x i16> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = sext <4 x i1> [[TMP12]] to <4 x i16> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64> -; CHECK-NEXT: [[TMP24:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> [[TMP22]], <1 x i64> [[TMP23]]) #[[ATTR5]] -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP14]] to <8 x i8> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP24]] to <8 x i8> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64> -; CHECK-NEXT: [[TMP19:%.*]] = bitcast <8 x i8> [[TMP5]] to 
<1 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <1 x i64> [[TMP22]] to <8 x i8> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP23]] to <8 x i8> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <1 x i64> [[TMP21]] to <8 x i8> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8> +; CHECK-NEXT: [[TMP32:%.*]] = icmp ne <8 x i8> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP33:%.*]] = icmp ne <8 x i8> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <8 x i8> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP37:%.*]] = icmp ne <8 x i8> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP34:%.*]] = and <8 x i1> [[TMP32]], [[TMP33]] +; CHECK-NEXT: [[TMP38:%.*]] = and <8 x i1> [[TMP35]], [[TMP33]] +; CHECK-NEXT: [[TMP39:%.*]] = and <8 x i1> [[TMP32]], [[TMP37]] +; CHECK-NEXT: [[TMP40:%.*]] = or <8 x i1> [[TMP34]], [[TMP38]] +; CHECK-NEXT: [[TMP41:%.*]] = or <8 x i1> [[TMP40]], [[TMP39]] +; CHECK-NEXT: [[TMP16:%.*]] = sext <8 x i1> [[TMP41]] to <8 x i8> +; CHECK-NEXT: [[TMP26:%.*]] = bitcast <8 x i8> [[TMP16]] to <4 x i16> +; CHECK-NEXT: [[TMP25:%.*]] = icmp ne <4 x i16> [[TMP26]], zeroinitializer +; CHECK-NEXT: [[TMP29:%.*]] = sext <4 x i1> [[TMP25]] to <4 x i16> +; CHECK-NEXT: [[TMP24:%.*]] = bitcast <4 x i16> [[TMP29]] to i64 +; CHECK-NEXT: [[TMP30:%.*]] = bitcast i64 [[TMP24]] to <1 x i64> +; CHECK-NEXT: [[TMP36:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> [[TMP22]], <1 x i64> [[TMP23]]) #[[ATTR5]] +; CHECK-NEXT: [[TMP31:%.*]] = bitcast <1 x i64> [[TMP30]] to <8 x i8> +; CHECK-NEXT: [[TMP28:%.*]] = bitcast <1 x i64> [[TMP36]] to <8 x i8> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP31]] to <1 x i64> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <8 x i8> [[TMP28]] to <1 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0 ; CHECK-NEXT: [[TMP20:%.*]] = extractelement <1 x i64> [[TMP19]], i32 0 ; CHECK-NEXT: store i64 [[TMP7]], ptr @__msan_retval_tls, align 8 diff --git 
a/llvm/test/Instrumentation/MemorySanitizer/X86/sse2-intrinsics-x86.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/sse2-intrinsics-x86.ll index 8f915a59db8e..704805018079 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/sse2-intrinsics-x86.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/sse2-intrinsics-x86.ll @@ -762,11 +762,20 @@ define <4 x i32> @test_x86_sse2_pmadd_wd(<8 x i16> %a0, <8 x i16> %a1) #0 { ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i16> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <4 x i32> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i32> -; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]]) +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i16> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <8 x i16> [[A0:%.*]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = icmp ne <8 x i16> [[A1:%.*]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = and <8 x i1> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i1> [[TMP12]], [[TMP5]] +; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i1> [[TMP4]], [[TMP13]] +; CHECK-NEXT: [[TMP16:%.*]] = or <8 x i1> [[TMP11]], [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i1> [[TMP16]], [[TMP15]] +; CHECK-NEXT: [[TMP7:%.*]] = sext <8 x i1> [[TMP17]] to <8 x i16> +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <8 x i16> [[TMP7]] to <4 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <4 x i32> [[TMP18]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sext <4 x i1> [[TMP19]] to <4 x i32> +; CHECK-NEXT: [[RES:%.*]] 
= call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> [[A0]], <8 x i16> [[A1]]) ; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RES]] ; diff --git a/llvm/test/Instrumentation/MemorySanitizer/i386/avx2-intrinsics-i386.ll b/llvm/test/Instrumentation/MemorySanitizer/i386/avx2-intrinsics-i386.ll index 5cc56baf0e0d..cd79bcb2233f 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/i386/avx2-intrinsics-i386.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/i386/avx2-intrinsics-i386.ll @@ -149,11 +149,20 @@ define <8 x i32> @test_x86_avx2_pmadd_wd(<16 x i16> %a0, <16 x i16> %a1) #0 { ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = or <16 x i16> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i16> [[TMP3]] to <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i32> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP5]] to <8 x i32> -; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> [[A0:%.*]], <16 x i16> [[A1:%.*]]) +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i16> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <16 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne <16 x i16> [[A0:%.*]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <16 x i16> [[A1:%.*]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = and <16 x i1> [[TMP5]], [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = and <16 x i1> [[TMP14]], [[TMP12]] +; CHECK-NEXT: [[TMP17:%.*]] = and <16 x i1> [[TMP5]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i1> [[TMP13]], [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = or <16 x i1> [[TMP18]], [[TMP17]] +; CHECK-NEXT: [[TMP8:%.*]] = sext <16 x i1> [[TMP19]] to <16 x i16> +; 
CHECK-NEXT: [[TMP20:%.*]] = bitcast <16 x i16> [[TMP8]] to <8 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <8 x i32> [[TMP20]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i1> [[TMP21]] to <8 x i32> +; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> [[A0]], <16 x i16> [[A1]]) ; CHECK-NEXT: store <8 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RES]] ; @@ -714,11 +723,20 @@ define <16 x i16> @test_x86_avx2_pmadd_ub_sw(<32 x i8> %a0, <32 x i8> %a1) #0 { ; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = or <32 x i8> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <32 x i8> [[TMP3]] to <16 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i16> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP5]] to <16 x i16> -; CHECK-NEXT: [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> [[A0:%.*]], <32 x i8> [[A1:%.*]]) +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <32 x i8> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <32 x i8> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne <32 x i8> [[A0:%.*]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <32 x i8> [[A1:%.*]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = and <32 x i1> [[TMP5]], [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = and <32 x i1> [[TMP14]], [[TMP12]] +; CHECK-NEXT: [[TMP17:%.*]] = and <32 x i1> [[TMP5]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = or <32 x i1> [[TMP13]], [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = or <32 x i1> [[TMP18]], [[TMP17]] +; CHECK-NEXT: [[TMP8:%.*]] = sext <32 x i1> [[TMP19]] to <32 x i8> +; CHECK-NEXT: [[TMP20:%.*]] = bitcast <32 x i8> [[TMP8]] to <16 x i16> +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <16 
x i16> [[TMP20]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sext <16 x i1> [[TMP21]] to <16 x i16> +; CHECK-NEXT: [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> [[A0]], <32 x i8> [[A1]]) ; CHECK-NEXT: store <16 x i16> [[TMP6]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i16> [[RES]] ; @@ -734,7 +752,7 @@ define <16 x i16> @test_x86_avx2_pmadd_ub_sw_load_op0(ptr %ptr, <32 x i8> %a1) # ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP12:%.*]], !prof [[PROF1]] +; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP4:%.*]], label [[TMP8:%.*]], !prof [[PROF1]] ; CHECK: 4: ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] ; CHECK-NEXT: unreachable @@ -744,11 +762,20 @@ define <16 x i16> @test_x86_avx2_pmadd_ub_sw_load_op0(ptr %ptr, <32 x i8> %a1) # ; CHECK-NEXT: [[TMP6:%.*]] = and i64 [[TMP5]], -2147483649 ; CHECK-NEXT: [[TMP7:%.*]] = inttoptr i64 [[TMP6]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <32 x i8>, ptr [[TMP7]], align 32 -; CHECK-NEXT: [[TMP8:%.*]] = or <32 x i8> [[_MSLD]], [[TMP2]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <32 x i8> [[TMP8]] to <16 x i16> -; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i16> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = sext <16 x i1> [[TMP10]] to <16 x i16> -; CHECK-NEXT: [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> [[A0]], <32 x i8> [[A1:%.*]]) +; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i8> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = icmp ne <32 x i8> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <32 x i8> [[A0]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <32 x i8> [[A1:%.*]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = and <32 x i1> [[TMP10]], [[TMP17]] +; CHECK-NEXT: [[TMP20:%.*]] = and <32 x i1> [[TMP18]], [[TMP17]] 
+; CHECK-NEXT: [[TMP21:%.*]] = and <32 x i1> [[TMP10]], [[TMP19]] +; CHECK-NEXT: [[TMP22:%.*]] = or <32 x i1> [[TMP12]], [[TMP20]] +; CHECK-NEXT: [[TMP23:%.*]] = or <32 x i1> [[TMP22]], [[TMP21]] +; CHECK-NEXT: [[TMP13:%.*]] = sext <32 x i1> [[TMP23]] to <32 x i8> +; CHECK-NEXT: [[TMP24:%.*]] = bitcast <32 x i8> [[TMP13]] to <16 x i16> +; CHECK-NEXT: [[TMP25:%.*]] = icmp ne <16 x i16> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = sext <16 x i1> [[TMP25]] to <16 x i16> +; CHECK-NEXT: [[RES:%.*]] = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> [[A0]], <32 x i8> [[A1]]) ; CHECK-NEXT: store <16 x i16> [[TMP11]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i16> [[RES]] ; @@ -780,8 +807,9 @@ define <32 x i8> @test_x86_avx2_pshuf_b(<32 x i8> %a0, <32 x i8> %a1) #0 { ; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 32) to ptr), align 8 ; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i8> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[RES:%.*]] = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> [[A0:%.*]], <32 x i8> [[A1:%.*]]) +; CHECK-NEXT: [[TMP4:%.*]] = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> [[TMP1]], <32 x i8> [[A1:%.*]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <32 x i8> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[RES:%.*]] = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> [[A0:%.*]], <32 x i8> [[A1]]) ; CHECK-NEXT: store <32 x i8> [[_MSPROP]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <32 x i8> [[RES]] ; diff --git a/llvm/test/Instrumentation/MemorySanitizer/i386/mmx-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/i386/mmx-intrinsics.ll index 0a3efaaea149..8052b5e34526 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/i386/mmx-intrinsics.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/i386/mmx-intrinsics.ll @@ -4,8 +4,6 @@ ; Handled strictly: 
; - i32 @llvm.x86.mmx.pmovmskb(<1 x i64> %mmx_var.i) #2 ; - void @llvm.x86.mmx.maskmovq(<1 x i64> %mmx_var.i, <1 x i64> %mmx_var1.i, ptr %p) #2 -; - <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %4, i8 3) #5 -; - <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> %4, i8 3) #5 ; - <2 x double> @llvm.x86.sse.cvtpi2pd(<1 x i64> %4) #5 ; - <1 x i64> @llvm.x86.sse.cvttpd2pi(<2 x double> %a) #5 ; - <1 x i64> @llvm.x86.sse.cvtpd2pi(<2 x double> %a) #5 @@ -1730,16 +1728,30 @@ define i64 @test49(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[MMX_VAR_I:%.*]] = bitcast <4 x i16> [[TMP1]] to <1 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i16> [[TMP16]] to <1 x i64> ; CHECK-NEXT: [[MMX_VAR1_I:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> -; CHECK-NEXT: [[TMP8:%.*]] = or <1 x i64> [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP8]] to <2 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <2 x i32> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = sext <2 x i1> [[TMP10]] to <2 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = bitcast <2 x i32> [[TMP11]] to <1 x i64> -; CHECK-NEXT: [[TMP14:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP12]] to <2 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[MMX_VAR_I]] to <4 x i16> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <1 x i64> [[MMX_VAR1_I]] to <4 x i16> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP6]] to <4 x i16> +; CHECK-NEXT: [[TMP22:%.*]] = bitcast <1 x i64> [[TMP7]] to <4 x i16> +; CHECK-NEXT: [[TMP30:%.*]] = icmp ne <4 x i16> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP31:%.*]] = icmp ne <4 x i16> [[TMP22]], zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = icmp ne <4 x i16> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP33:%.*]] = icmp ne <4 x i16> [[TMP10]], zeroinitializer +; CHECK-NEXT: [[TMP32:%.*]] = and <4 x i1> [[TMP30]], [[TMP31]] +; CHECK-NEXT: [[TMP36:%.*]] = and <4 x i1> 
[[TMP23]], [[TMP31]] +; CHECK-NEXT: [[TMP37:%.*]] = and <4 x i1> [[TMP30]], [[TMP33]] +; CHECK-NEXT: [[TMP38:%.*]] = or <4 x i1> [[TMP32]], [[TMP36]] +; CHECK-NEXT: [[TMP39:%.*]] = or <4 x i1> [[TMP38]], [[TMP37]] +; CHECK-NEXT: [[TMP24:%.*]] = sext <4 x i1> [[TMP39]] to <4 x i16> +; CHECK-NEXT: [[TMP25:%.*]] = bitcast <4 x i16> [[TMP24]] to <2 x i32> +; CHECK-NEXT: [[TMP26:%.*]] = icmp ne <2 x i32> [[TMP25]], zeroinitializer +; CHECK-NEXT: [[TMP28:%.*]] = sext <2 x i1> [[TMP26]] to <2 x i32> +; CHECK-NEXT: [[TMP29:%.*]] = bitcast <2 x i32> [[TMP28]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64 [[TMP29]] to <1 x i64> +; CHECK-NEXT: [[TMP34:%.*]] = tail call <1 x i64> @llvm.x86.mmx.pmadd.wd(<1 x i64> [[MMX_VAR_I]], <1 x i64> [[MMX_VAR1_I]]) #[[ATTR2]] ; CHECK-NEXT: [[TMP20:%.*]] = bitcast <1 x i64> [[TMP14]] to <2 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = bitcast <2 x i32> [[TMP3]] to <1 x i64> -; CHECK-NEXT: [[TMP21:%.*]] = bitcast <2 x i32> [[TMP20]] to <1 x i64> +; CHECK-NEXT: [[TMP35:%.*]] = bitcast <1 x i64> [[TMP34]] to <2 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <2 x i32> [[TMP20]] to <1 x i64> +; CHECK-NEXT: [[TMP21:%.*]] = bitcast <2 x i32> [[TMP35]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP17]], i32 0 ; CHECK-NEXT: [[TMP18:%.*]] = extractelement <1 x i64> [[TMP21]], i32 0 ; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 @@ -2849,19 +2861,17 @@ define i64 @test21(<1 x i64> %a) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> ; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP8]] to <1 x i64> ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP10]] to i64 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP12:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] -; CHECK-NEXT: unreachable -; CHECK: 
8: -; CHECK-NEXT: [[TMP13:%.*]] = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> [[TMP11]], i8 3) #[[ATTR5]] +; CHECK-NEXT: [[TMP6:%.*]] = call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> [[TMP10]], i8 3) +; CHECK-NEXT: [[TMP13:%.*]] = or <1 x i64> zeroinitializer, [[TMP6]] +; CHECK-NEXT: [[TMP14:%.*]] = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> [[TMP11]], i8 3) #[[ATTR5]] ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP13]] to <4 x i16> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP14]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <1 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <4 x i16> [[TMP9]] to <1 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <1 x i64> [[TMP4]], i32 0 -; CHECK-NEXT: store i64 0, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret i64 [[TMP5]] +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <1 x i64> [[TMP15]], i32 0 +; CHECK-NEXT: store i64 [[TMP5]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i64 [[TMP12]] ; entry: %0 = bitcast <1 x i64> %a to <4 x i16> @@ -2884,19 +2894,17 @@ define i32 @test21_2(<1 x i64> %a) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i64> [[A]] to <4 x i16> ; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i16> [[TMP8]] to <1 x i64> ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i16> [[TMP0]] to <1 x i64> -; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP10]] to i64 -; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP9]], 0 -; CHECK-NEXT: br i1 [[_MSCMP]], label [[TMP12:%.*]], label [[TMP14:%.*]], !prof [[PROF1]] -; CHECK: 7: -; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR6]] -; CHECK-NEXT: unreachable -; CHECK: 8: -; CHECK-NEXT: [[TMP13:%.*]] = tail call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> [[TMP11]], i8 3) #[[ATTR5]] +; CHECK-NEXT: [[TMP6:%.*]] = call <1 x i64> @llvm.x86.sse.pshuf.w(<1 x i64> [[TMP10]], i8 3) +; CHECK-NEXT: [[TMP13:%.*]] = or <1 x i64> zeroinitializer, [[TMP6]] +; CHECK-NEXT: [[TMP14:%.*]] = tail call <1 x i64> 
@llvm.x86.sse.pshuf.w(<1 x i64> [[TMP11]], i8 3) #[[ATTR5]] ; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP13]] to <4 x i16> +; CHECK-NEXT: [[TMP9:%.*]] = bitcast <1 x i64> [[TMP14]] to <4 x i16> ; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[TMP3]] to <2 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <4 x i16> [[TMP9]] to <2 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0 -; CHECK-NEXT: store i32 0, ptr @__msan_retval_tls, align 8 -; CHECK-NEXT: ret i32 [[TMP5]] +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i32> [[TMP15]], i32 0 +; CHECK-NEXT: store i32 [[TMP5]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret i32 [[TMP12]] ; entry: %0 = bitcast <1 x i64> %a to <4 x i16> @@ -3319,7 +3327,8 @@ define i64 @test9(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP12]] to <1 x i64> ; CHECK-NEXT: [[TMP17:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> -; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP16]], [[TMP8]] +; CHECK-NEXT: [[TMP20:%.*]] = call <1 x i64> @llvm.x86.ssse3.pshuf.b(<1 x i64> [[TMP16]], <1 x i64> [[TMP17]]) +; CHECK-NEXT: [[_MSPROP:%.*]] = or <1 x i64> [[TMP8]], [[TMP20]] ; CHECK-NEXT: [[TMP18:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pshuf.b(<1 x i64> [[TMP2]], <1 x i64> [[TMP17]]) #[[ATTR5]] ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[_MSPROP]] to <8 x i8> ; CHECK-NEXT: [[TMP19:%.*]] = bitcast <1 x i64> [[TMP18]] to <8 x i8> @@ -3401,16 +3410,30 @@ define i64 @test7(<1 x i64> %a, <1 x i64> %b) #0 { ; CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i8> [[TMP1]] to <1 x i64> ; CHECK-NEXT: [[TMP8:%.*]] = bitcast <8 x i8> [[TMP17]] to <1 x i64> ; CHECK-NEXT: [[TMP23:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> -; CHECK-NEXT: [[TMP10:%.*]] = or <1 x i64> [[TMP21]], [[TMP8]] -; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP10]] to <4 x i16> -; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <4 x i16> [[TMP11]], 
zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = sext <4 x i1> [[TMP12]] to <4 x i16> -; CHECK-NEXT: [[TMP14:%.*]] = bitcast <4 x i16> [[TMP13]] to <1 x i64> -; CHECK-NEXT: [[TMP24:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> [[TMP22]], <1 x i64> [[TMP23]]) #[[ATTR5]] -; CHECK-NEXT: [[TMP16:%.*]] = bitcast <1 x i64> [[TMP14]] to <8 x i8> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP24]] to <8 x i8> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP16]] to <1 x i64> -; CHECK-NEXT: [[TMP19:%.*]] = bitcast <8 x i8> [[TMP5]] to <1 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = bitcast <1 x i64> [[TMP22]] to <8 x i8> +; CHECK-NEXT: [[TMP12:%.*]] = bitcast <1 x i64> [[TMP23]] to <8 x i8> +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <1 x i64> [[TMP21]] to <8 x i8> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <1 x i64> [[TMP8]] to <8 x i8> +; CHECK-NEXT: [[TMP33:%.*]] = icmp ne <8 x i8> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <8 x i8> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP26:%.*]] = icmp ne <8 x i8> [[TMP11]], zeroinitializer +; CHECK-NEXT: [[TMP36:%.*]] = icmp ne <8 x i8> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP35:%.*]] = and <8 x i1> [[TMP33]], [[TMP34]] +; CHECK-NEXT: [[TMP38:%.*]] = and <8 x i1> [[TMP26]], [[TMP34]] +; CHECK-NEXT: [[TMP39:%.*]] = and <8 x i1> [[TMP33]], [[TMP36]] +; CHECK-NEXT: [[TMP40:%.*]] = or <8 x i1> [[TMP35]], [[TMP38]] +; CHECK-NEXT: [[TMP41:%.*]] = or <8 x i1> [[TMP40]], [[TMP39]] +; CHECK-NEXT: [[TMP16:%.*]] = sext <8 x i1> [[TMP41]] to <8 x i8> +; CHECK-NEXT: [[TMP27:%.*]] = bitcast <8 x i8> [[TMP16]] to <4 x i16> +; CHECK-NEXT: [[TMP28:%.*]] = icmp ne <4 x i16> [[TMP27]], zeroinitializer +; CHECK-NEXT: [[TMP42:%.*]] = sext <4 x i1> [[TMP28]] to <4 x i16> +; CHECK-NEXT: [[TMP30:%.*]] = bitcast <4 x i16> [[TMP42]] to i64 +; CHECK-NEXT: [[TMP32:%.*]] = bitcast i64 [[TMP30]] to <1 x i64> +; CHECK-NEXT: [[TMP31:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> [[TMP22]], <1 x i64> 
[[TMP23]]) #[[ATTR5]] +; CHECK-NEXT: [[TMP25:%.*]] = bitcast <1 x i64> [[TMP32]] to <8 x i8> +; CHECK-NEXT: [[TMP37:%.*]] = bitcast <1 x i64> [[TMP31]] to <8 x i8> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i8> [[TMP25]] to <1 x i64> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <8 x i8> [[TMP37]] to <1 x i64> ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <1 x i64> [[TMP6]], i32 0 ; CHECK-NEXT: [[TMP20:%.*]] = extractelement <1 x i64> [[TMP19]], i32 0 ; CHECK-NEXT: store i64 [[TMP7]], ptr @__msan_retval_tls, align 8 diff --git a/llvm/test/Instrumentation/MemorySanitizer/i386/sse2-intrinsics-i386.ll b/llvm/test/Instrumentation/MemorySanitizer/i386/sse2-intrinsics-i386.ll index e771e60e2f29..3a37eafd78ec 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/i386/sse2-intrinsics-i386.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/i386/sse2-intrinsics-i386.ll @@ -800,11 +800,20 @@ define <4 x i32> @test_x86_sse2_pmadd_wd(<8 x i16> %a0, <8 x i16> %a1) #0 { ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 ; CHECK-NEXT: [[TMP7:%.*]] = load i64, ptr @__msan_va_arg_overflow_size_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP3:%.*]] = or <8 x i16> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i16> [[TMP3]] to <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <4 x i32> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = sext <4 x i1> [[TMP5]] to <4 x i32> -; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> [[A0:%.*]], <8 x i16> [[A1:%.*]]) +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne <8 x i16> [[A0:%.*]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <8 x i16> [[A1:%.*]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = and <8 x i1> [[TMP5]], [[TMP12]] +; CHECK-NEXT: 
[[TMP16:%.*]] = and <8 x i1> [[TMP14]], [[TMP12]] +; CHECK-NEXT: [[TMP17:%.*]] = and <8 x i1> [[TMP5]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = or <8 x i1> [[TMP13]], [[TMP16]] +; CHECK-NEXT: [[TMP19:%.*]] = or <8 x i1> [[TMP18]], [[TMP17]] +; CHECK-NEXT: [[TMP8:%.*]] = sext <8 x i1> [[TMP19]] to <8 x i16> +; CHECK-NEXT: [[TMP20:%.*]] = bitcast <8 x i16> [[TMP8]] to <4 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <4 x i32> [[TMP20]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sext <4 x i1> [[TMP21]] to <4 x i32> +; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> [[A0]], <8 x i16> [[A1]]) ; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RES]] ; diff --git a/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll b/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll index d614bb85d858..d1060fb33e1b 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll @@ -17,10 +17,19 @@ define <4 x i32> @Test_sse2_pmadd_wd(<8 x i16> %a, <8 x i16> %b) sanitize_memory ; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 16) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = or <8 x i16> [[TMP0]], [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <4 x i32> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = sext <4 x i1> [[TMP4]] to <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = icmp ne <8 x i16> [[TMP0]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <8 x i16> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <8 x i16> [[A]], zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i16> [[B]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = and <8 x i1> 
[[TMP2]], [[TMP12]] +; CHECK-NEXT: [[TMP13:%.*]] = and <8 x i1> [[TMP3]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = and <8 x i1> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP15:%.*]] = or <8 x i1> [[TMP6]], [[TMP13]] +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i1> [[TMP15]], [[TMP14]] +; CHECK-NEXT: [[TMP11:%.*]] = sext <8 x i1> [[TMP10]] to <8 x i16> +; CHECK-NEXT: [[TMP16:%.*]] = bitcast <8 x i16> [[TMP11]] to <4 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = icmp ne <4 x i32> [[TMP16]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = sext <4 x i1> [[TMP17]] to <4 x i32> ; CHECK-NEXT: [[C:%.*]] = tail call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> [[A]], <8 x i16> [[B]]) #[[ATTR2:[0-9]+]] ; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[C]] @@ -39,13 +48,27 @@ define <1 x i64> @Test_ssse3_pmadd_ub_sw(<1 x i64> %a, <1 x i64> %b) sanitize_me ; CHECK-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <1 x i64>, ptr inttoptr (i64 add (i64 ptrtoint (ptr @__msan_param_tls to i64), i64 8) to ptr), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP2:%.*]] = or <1 x i64> [[TMP0]], [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[TMP2]] to <4 x i16> -; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <4 x i16> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = sext <4 x i1> [[TMP4]] to <4 x i16> -; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i16> [[TMP5]] to <1 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[A]] to <8 x i8> +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <1 x i64> [[B]] to <8 x i8> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP0]] to <8 x i8> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <1 x i64> [[TMP1]] to <8 x i8> +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne <8 x i8> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <8 x i8> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = icmp ne <8 x i8> [[TMP2]], zeroinitializer +; CHECK-NEXT: 
[[TMP21:%.*]] = icmp ne <8 x i8> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = and <8 x i1> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP11:%.*]] = and <8 x i1> [[TMP17]], [[TMP15]] +; CHECK-NEXT: [[TMP12:%.*]] = and <8 x i1> [[TMP14]], [[TMP21]] +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i1> [[TMP16]], [[TMP11]] +; CHECK-NEXT: [[TMP22:%.*]] = or <8 x i1> [[TMP13]], [[TMP12]] +; CHECK-NEXT: [[TMP7:%.*]] = sext <8 x i1> [[TMP22]] to <8 x i8> +; CHECK-NEXT: [[TMP18:%.*]] = bitcast <8 x i8> [[TMP7]] to <4 x i16> +; CHECK-NEXT: [[TMP24:%.*]] = icmp ne <4 x i16> [[TMP18]], zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = sext <4 x i1> [[TMP24]] to <4 x i16> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <4 x i16> [[TMP23]] to i64 +; CHECK-NEXT: [[TMP20:%.*]] = bitcast i64 [[TMP19]] to <1 x i64> ; CHECK-NEXT: [[C:%.*]] = tail call <1 x i64> @llvm.x86.ssse3.pmadd.ub.sw(<1 x i64> [[A]], <1 x i64> [[B]]) #[[ATTR2]] -; CHECK-NEXT: store <1 x i64> [[TMP6]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: store <1 x i64> [[TMP20]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <1 x i64> [[C]] ; entry: diff --git a/llvm/test/MC/AArch64/arm64-aliases.s b/llvm/test/MC/AArch64/arm64-aliases.s index 3ace7a0f7183..ae157c676c95 100644 --- a/llvm/test/MC/AArch64/arm64-aliases.s +++ b/llvm/test/MC/AArch64/arm64-aliases.s @@ -512,6 +512,20 @@ foo: sys #4, c8, c3, #6 ; CHECK: tlbi vmalls12e1is +; Check that all 5 register bits are set (0x31): +; (from Arm ARM regarding TLBI instructions without operands) +; "Rt should be encoded as 0b11111. If the Rt field is not set to 0b11111, +; it is CONSTRAINED UNPREDICTABLE whether: +; * The instruction is UNDEFINED. +; * The instruction behaves as if the Rt field is set to 0b11111." 
+; +; Do not disassemble this to `tlbi` but a SYS alias instead +; + sys #4, c8, c7, #6, x30 +; CHECK: sys #0x4, c8, c7, #0x6, x30 + sys #4, c8, c7, #6, x31 +; CHECK: tlbi vmalls12e1 + ic ialluis ; CHECK: ic ialluis ; encoding: [0x1f,0x71,0x08,0xd5] ic iallu diff --git a/llvm/test/MC/Hexagon/system-inst.s b/llvm/test/MC/Hexagon/system-inst.s index 7bc153359853..07f7ca0acb2d 100644 --- a/llvm/test/MC/Hexagon/system-inst.s +++ b/llvm/test/MC/Hexagon/system-inst.s @@ -89,6 +89,9 @@ crswap(r12,sgp0) #CHECK: 652dc000 { crswap(r13,sgp1) } crswap(r13,sgp1) +#CHECK: 6d8ec000 { crswap(r15:14,s1:0) } +crswap(r15:14,sgp1:0) + #CHECK: 660fc00e { r14 = getimask(r15) } r14=getimask(r15) diff --git a/llvm/test/MC/LoongArch/Misc/cfi-advance.s b/llvm/test/MC/LoongArch/Misc/cfi-advance.s index 38eba7caf610..86b36a38c3f1 100644 --- a/llvm/test/MC/LoongArch/Misc/cfi-advance.s +++ b/llvm/test/MC/LoongArch/Misc/cfi-advance.s @@ -5,13 +5,13 @@ # RUN: | llvm-readobj -r - | FileCheck --check-prefix=RELAX %s # RELOC: Relocations [ -# RELOC-NEXT: .rela.eh_frame { -# RELOC-NEXT: 0x1C R_LARCH_32_PCREL .text 0x0 +# RELOC: .rela.eh_frame { +# RELOC-NEXT: 0x1C R_LARCH_32_PCREL .L{{.*}} 0x0 # RELOC-NEXT: } # RELOC-NEXT: ] -# DWARFDUMP: DW_CFA_advance_loc: 4 +# DWARFDUMP: DW_CFA_advance_loc: 8 # DWARFDUMP-NEXT: DW_CFA_def_cfa_offset: +8 -# DWARFDUMP-NEXT: DW_CFA_advance_loc: 8 +# DWARFDUMP-NEXT: DW_CFA_advance_loc: 4 # DWARFDUMP-NEXT: DW_CFA_def_cfa_offset: +8 # RELAX: Relocations [ @@ -19,6 +19,8 @@ # RELAX-NEXT: 0x1C R_LARCH_32_PCREL .L{{.*}} 0x0 # RELAX-NEXT: 0x20 R_LARCH_ADD32 .L{{.*}} 0x0 # RELAX-NEXT: 0x20 R_LARCH_SUB32 .L{{.*}} 0x0 +# RELAX-NEXT: 0x25 R_LARCH_ADD6 .L{{.*}} 0x0 +# RELAX-NEXT: 0x25 R_LARCH_SUB6 .L{{.*}} 0x0 # RELAX-NEXT: 0x28 R_LARCH_ADD6 .L{{.*}} 0x0 # RELAX-NEXT: 0x28 R_LARCH_SUB6 .L{{.*}} 0x0 # RELAX-NEXT: } @@ -30,7 +32,7 @@ .type test,@function test: .cfi_startproc - nop + call36 foo .cfi_def_cfa_offset 8 .p2align 3 nop diff --git 
a/llvm/test/MC/LoongArch/Relocations/align-after-relax.s b/llvm/test/MC/LoongArch/Relocations/align-after-relax.s new file mode 100644 index 000000000000..199c3fcfa0af --- /dev/null +++ b/llvm/test/MC/LoongArch/Relocations/align-after-relax.s @@ -0,0 +1,57 @@ +## The file testing R_LARCH_ALIGN emitting when linker-relaxation enabled. + +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=-relax %s -o %t.n +# RUN: llvm-objdump -dr %t.n | FileCheck %s --check-prefix=NORELAX +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+relax %s -o %t.r +# RUN: llvm-objdump -dr %t.r | FileCheck %s --check-prefix=RELAX + +# NORELAX: pcaddu18i $ra, 0 +# NORELAX-NEXT: R_LARCH_CALL36 f +# NORELAX-NEXT: jirl $ra, $ra, 0 +# NORELAX-COUNT-6: nop +# NORELAX: pcaddu18i $ra, 0 +# NORELAX-NEXT: R_LARCH_CALL36 f +# NORELAX-NEXT: jirl $ra, $ra, 0 +# NORELAX-COUNT-6: nop +# NORELAX: pcaddu18i $ra, 0 +# NORELAX-NEXT: R_LARCH_CALL36 f +# NORELAX-NEXT: jirl $ra, $ra, 0 + +# RELAX: pcaddu18i $ra, 0 +# RELAX-NEXT: R_LARCH_CALL36 f +# RELAX-NEXT: R_LARCH_RELAX *ABS* +# RELAX-NEXT: jirl $ra, $ra, 0 +# RELAX-NEXT: nop +# RELAX-NEXT: R_LARCH_ALIGN *ABS*+0x1c +# RELAX-COUNT-6: nop +# RELAX: pcaddu18i $ra, 0 +# RELAX-NEXT: R_LARCH_CALL36 f +# RELAX-NEXT: R_LARCH_RELAX *ABS* +# RELAX-NEXT: jirl $ra, $ra, 0 +# RELAX-NEXT: nop +# RELAX-NEXT: R_LARCH_ALIGN *ABS*+0x1c +# RELAX-COUNT-6: nop +# RELAX: pcaddu18i $ra, 0 +# RELAX-NEXT: R_LARCH_CALL36 f +# RELAX-NEXT: jirl $ra, $ra, 0 + +.text +## No R_LARCH_ALIGN before the first linker-relaxable instruction. +.p2align 5 +foo: + call36 f + +## R_LARCH_ALIGN is required after the first linker-relaxable instruction. +.p2align 5 +bar: + call36 f + +.option push +.option norelax +## R_LARCH_ALIGN is required even if norelax, because it is after a +## linker-relaxable instruction. No R_LARCH_RELAX for call36 because +## of the norelax. 
+.p2align 5 +baz: + call36 f +.option pop diff --git a/llvm/test/MC/LoongArch/Relocations/fde-reloc.s b/llvm/test/MC/LoongArch/Relocations/fde-reloc.s index ab911d1853a8..3b9f4003950f 100644 --- a/llvm/test/MC/LoongArch/Relocations/fde-reloc.s +++ b/llvm/test/MC/LoongArch/Relocations/fde-reloc.s @@ -1,7 +1,7 @@ # RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=-relax < %s \ # RUN: | llvm-readobj -r - | FileCheck %s # RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+relax < %s \ -# RUN: | llvm-readobj -r - | FileCheck %s --check-prefix=RELAX +# RUN: | llvm-readobj -r - | FileCheck %s ## Ensure that the eh_frame records the symbolic difference with ## the R_LARCH_32_PCREL relocation. @@ -11,9 +11,6 @@ func: ret .cfi_endproc -# CHECK: Section (4) .rela.eh_frame { -# CHECK-NEXT: 0x1C R_LARCH_32_PCREL .text 0x0 +# CHECK: Section ({{.*}}) .rela.eh_frame { +# CHECK-NEXT: 0x1C R_LARCH_32_PCREL .L{{.*}} 0x0 # CHECK-NEXT: } -# RELAX: Section ({{.*}}) .rela.eh_frame { -# RELAX-NEXT: 0x1C R_LARCH_32_PCREL .L{{.*}} 0x0 -# RELAX-NEXT: } diff --git a/llvm/test/MC/LoongArch/Relocations/relax-addsub.s b/llvm/test/MC/LoongArch/Relocations/relax-addsub.s index f2524b29d230..67c643c07689 100644 --- a/llvm/test/MC/LoongArch/Relocations/relax-addsub.s +++ b/llvm/test/MC/LoongArch/Relocations/relax-addsub.s @@ -5,34 +5,37 @@ # NORELAX: Relocations [ # NORELAX-NEXT: Section ({{.*}}) .rela.text { -# NORELAX-NEXT: 0x10 R_LARCH_PCALA_HI20 .text 0x0 -# NORELAX-NEXT: 0x14 R_LARCH_PCALA_LO12 .text 0x0 +# NORELAX-NEXT: 0x0 R_LARCH_CALL36 foo 0x0 +# NORELAX-NEXT: 0x10 R_LARCH_PCALA_HI20 .L1 0x0 +# NORELAX-NEXT: 0x14 R_LARCH_PCALA_LO12 .L1 0x0 # NORELAX-NEXT: } # NORELAX-NEXT: Section ({{.*}}) .rela.data { # NORELAX-NEXT: 0x30 R_LARCH_ADD8 foo 0x0 -# NORELAX-NEXT: 0x30 R_LARCH_SUB8 .text 0x10 +# NORELAX-NEXT: 0x30 R_LARCH_SUB8 .L3 0x0 # NORELAX-NEXT: 0x31 R_LARCH_ADD16 foo 0x0 -# NORELAX-NEXT: 0x31 R_LARCH_SUB16 .text 0x10 +# NORELAX-NEXT: 0x31 R_LARCH_SUB16 .L3 0x0 # 
NORELAX-NEXT: 0x33 R_LARCH_ADD32 foo 0x0 -# NORELAX-NEXT: 0x33 R_LARCH_SUB32 .text 0x10 +# NORELAX-NEXT: 0x33 R_LARCH_SUB32 .L3 0x0 # NORELAX-NEXT: 0x37 R_LARCH_ADD64 foo 0x0 -# NORELAX-NEXT: 0x37 R_LARCH_SUB64 .text 0x10 +# NORELAX-NEXT: 0x37 R_LARCH_SUB64 .L3 0x0 # NORELAX-NEXT: } # NORELAX-NEXT: ] # NORELAX: Hex dump of section '.data': # NORELAX-NEXT: 0x00000000 04040004 00000004 00000000 00000004 -# NORELAX-NEXT: 0x00000010 0c0c000c 0000000c 00000000 0000000c +# NORELAX-NEXT: 0x00000010 04040004 00000004 00000000 00000004 # NORELAX-NEXT: 0x00000020 08080008 00000008 00000000 00000008 # NORELAX-NEXT: 0x00000030 00000000 00000000 00000000 000000 # RELAX: Relocations [ # RELAX-NEXT: Section ({{.*}}) .rela.text { -# RELAX-NEXT: 0x4 R_LARCH_ALIGN - 0xC -# RELAX-NEXT: 0x10 R_LARCH_PCALA_HI20 .L1 0x0 -# RELAX-NEXT: 0x10 R_LARCH_RELAX - 0x0 -# RELAX-NEXT: 0x14 R_LARCH_PCALA_LO12 .L1 0x0 -# RELAX-NEXT: 0x14 R_LARCH_RELAX - 0x0 +# RELAX-NEXT: 0x0 R_LARCH_CALL36 foo 0x0 +# RELAX-NEXT: 0x0 R_LARCH_RELAX - 0x0 +# RELAX-NEXT: 0xC R_LARCH_ALIGN - 0xC +# RELAX-NEXT: 0x18 R_LARCH_PCALA_HI20 .L1 0x0 +# RELAX-NEXT: 0x18 R_LARCH_RELAX - 0x0 +# RELAX-NEXT: 0x1C R_LARCH_PCALA_LO12 .L1 0x0 +# RELAX-NEXT: 0x1C R_LARCH_RELAX - 0x0 # RELAX-NEXT: } # RELAX-NEXT: Section ({{.*}}) .rela.data { # RELAX-NEXT: 0x10 R_LARCH_ADD8 .L3 0x0 @@ -73,6 +76,7 @@ # RELAX-NEXT: 0x00000030 00000000 00000000 00000000 000000 .text + call36 foo .L1: nop .L2: diff --git a/llvm/test/MC/LoongArch/Relocations/relax-align-in-subsection.s b/llvm/test/MC/LoongArch/Relocations/relax-align-in-subsection.s new file mode 100644 index 000000000000..92fceeb14fae --- /dev/null +++ b/llvm/test/MC/LoongArch/Relocations/relax-align-in-subsection.s @@ -0,0 +1,35 @@ +## The file testing R_LARCH_ALIGN emitting when linker-relaxation enabled. 
+ +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+relax %s -o %t.n +# RUN: llvm-objdump -dr %t.n | FileCheck %s --check-prefix=RELAX +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+relax --defsym FILL=1 %s -o %t.f +# RUN: llvm-objdump -dr %t.f | FileCheck %s --check-prefixes=RELAX,ALIGN + +# ALIGN: nop +# ALIGN-NEXT: R_LARCH_ALIGN *ABS*+0x1c +# ALIGN-COUNT-6: nop +# RELAX: ret +# RELAX: pcaddu18i $ra, 0 +# RELAX-NEXT: R_LARCH_CALL36 f +# RELAX-NEXT: R_LARCH_RELAX *ABS* +# RELAX-NEXT: jirl $ra, $ra, 0 + +.text +.option push +.option norelax +## When FILL is defined, the order of Alignment directive in this lower-numbered +## subsection will be larger, and even larger than the section order of the first +## linker-relaxable call36 instruction. It should conservatively be treated as +## linker-relaxable even with norelax. +.ifdef FILL + .space 0 +.endif +.p2align 5 +foo: + ret +.option pop + +.text 1 + .space 0 +bar: + call36 f diff --git a/llvm/test/MC/LoongArch/Relocations/relax-align.s b/llvm/test/MC/LoongArch/Relocations/relax-align.s index 477d5ca24ec7..e9d7f31b6839 100644 --- a/llvm/test/MC/LoongArch/Relocations/relax-align.s +++ b/llvm/test/MC/LoongArch/Relocations/relax-align.s @@ -8,8 +8,9 @@ # RUN: llvm-readobj -r %t.r | FileCheck %s --check-prefixes=RELOC,RELAX-RELOC .text -break 0 -# INSTR: break 0 +call36 foo +# INSTR: pcaddu18i $ra, 0 +# INSTR-NEXT: jirl $ra, $ra, 0 ## Not emit R_LARCH_ALIGN if alignment directive is less than or equal to ## minimum code alignment(a.k.a 4). @@ -24,8 +25,8 @@ break 0 ## The behavior is the same as GNU assembler. break 1 .p2align 4, 1 -# INSTR-NEXT: break 1 -# INSTR-COUNT-2: 01 01 01 01 +# INSTR-NEXT: break 1 +# INSTR-NEXT: 01 01 01 01 break 2 .p2align 4, 1, 12 @@ -62,20 +63,25 @@ ret ## Test the symbol index is different from .text.
.section .text2, "ax" +call36 foo .p2align 4 .p2align 4, , 4 break 7 # RELOC: Relocations [ -# RELAX-RELOC-NEXT: Section ({{.*}}) .rela.text { +# RELOC-NEXT: Section ({{.*}}) .rela.text { +# RELOC-NEXT: 0x0 R_LARCH_CALL36 foo 0x0 +# RELAX-RELOC-NEXT: 0x0 R_LARCH_RELAX - 0x0 # RELAX-RELOC-NEXT: 0x24 R_LARCH_ALIGN - 0xC # RELAX-RELOC-NEXT: 0x34 R_LARCH_ALIGN - 0x1C # RELAX-RELOC-NEXT: 0x50 R_LARCH_ALIGN - 0xC # RELAX-RELOC-NEXT: 0x60 R_LARCH_ALIGN .Lla-relax-align0 0xB04 # RELAX-RELOC-NEXT: 0x70 R_LARCH_ALIGN - 0xC -# RELAX-RELOC-NEXT: } -# RELAX-RELOC-NEXT: Section ({{.*}}) .rela.text2 { -# RELAX-RELOC-NEXT: 0x0 R_LARCH_ALIGN - 0xC -# RELAX-RELOC-NEXT: 0xC R_LARCH_ALIGN .Lla-relax-align1 0x404 -# RELAX-RELOC-NEXT: } +# RELOC-NEXT: } +# RELOC-NEXT: Section ({{.*}}) .rela.text2 { +# RELOC-NEXT: 0x0 R_LARCH_CALL36 foo 0x0 +# RELAX-RELOC-NEXT: 0x0 R_LARCH_RELAX - 0x0 +# RELAX-RELOC-NEXT: 0x8 R_LARCH_ALIGN - 0xC +# RELAX-RELOC-NEXT: 0x14 R_LARCH_ALIGN .Lla-relax-align1 0x404 +# RELOC-NEXT: } # RELOC-NEXT: ] diff --git a/llvm/test/MC/LoongArch/Relocations/relax-attr.s b/llvm/test/MC/LoongArch/Relocations/relax-attr.s index b1e648d850bb..7cc8dda07e33 100644 --- a/llvm/test/MC/LoongArch/Relocations/relax-attr.s +++ b/llvm/test/MC/LoongArch/Relocations/relax-attr.s @@ -1,19 +1,24 @@ -# RUN: llvm-mc --filetype=obj --triple=loongarch64 %s -o %t -# RUN: llvm-readobj -r %t | FileCheck %s -# RUN: llvm-mc --filetype=obj --triple=loongarch64 -mattr=+relax %s -o %t -# RUN: llvm-readobj -r %t | FileCheck %s --check-prefix=CHECKR +# RUN: llvm-mc --filetype=obj --triple=loongarch64 -mattr=-relax %s -o %t.n +# RUN: llvm-readobj -r %t.n | FileCheck %s +# RUN: llvm-mc --filetype=obj --triple=loongarch64 -mattr=+relax %s -o %t.r +# RUN: llvm-readobj -r %t.r | FileCheck %s --check-prefix=CHECKR # CHECK: Relocations [ +# CHECK-NEXT: Section ({{.*}}) .rela.text { +# CHECK-NEXT: 0x4 R_LARCH_CALL36 foo 0x0 +# CHECK-NEXT: } # CHECK-NEXT: Section ({{.*}}) .rela.data { -# CHECK-NEXT: 0x0 
R_LARCH_64 .text 0x4 +# CHECK-NEXT: 0x0 R_LARCH_64 .L1 0x0 # CHECK-NEXT: } # CHECK-NEXT: ] # CHECKR: Relocations [ # CHECKR-NEXT: Section ({{.*}}) .rela.text { -# CHECKR-NEXT: 0x8 R_LARCH_B21 .L1 0x0 -# CHECKR-NEXT: 0xC R_LARCH_B16 .L1 0x0 -# CHECKR-NEXT: 0x10 R_LARCH_B26 .L1 0x0 +# CHECKR-NEXT: 0x4 R_LARCH_CALL36 foo 0x0 +# CHECKR-NEXT: 0x4 R_LARCH_RELAX - 0x0 +# CHECKR-NEXT: 0x10 R_LARCH_B21 .L0 0x0 +# CHECKR-NEXT: 0x18 R_LARCH_B16 .L0 0x0 +# CHECKR-NEXT: 0x20 R_LARCH_B26 .L0 0x0 # CHECKR-NEXT: } # CHECKR-NEXT: Section ({{.*}}) .rela.data { # CHECKR-NEXT: 0x0 R_LARCH_64 .L1 0x0 @@ -22,10 +27,21 @@ .text nop + +.L0: + call36 foo + .L1: nop +## Relocations for branches to .L0 must be reserved and be fixed up by linker +## when linker relaxation enabled, because of the relaxable call36 instruction. +## Branches to .L1 can be resolved correctly at compile time, so their +## relocations can simply be removed. + bnez $a0, .L0 beqz $a0, .L1 + beq $a0, $a1, .L0 blt $a0, $a1, .L1 + bl .L0 b .L1 .data diff --git a/llvm/test/MC/LoongArch/Relocations/relocation-specifier.s b/llvm/test/MC/LoongArch/Relocations/relocation-specifier.s index d0898aaab92f..c2526a6ecd70 100644 --- a/llvm/test/MC/LoongArch/Relocations/relocation-specifier.s +++ b/llvm/test/MC/LoongArch/Relocations/relocation-specifier.s @@ -6,10 +6,10 @@ ## This test is similar to test/MC/CSKY/relocation-specifier.s. 
# RELOC32: '.rela.data' -# RELOC32: R_LARCH_32 00000000 .data + 0 +# RELOC32: R_LARCH_32 00000000 local # RELOC64: '.rela.data' -# RELOC64: R_LARCH_32 0000000000000000 .data + 0 +# RELOC64: R_LARCH_32 0000000000000000 local # CHECK: TLS GLOBAL DEFAULT UND gd # CHECK: TLS GLOBAL DEFAULT UND ld diff --git a/llvm/test/MC/LoongArch/Relocations/sub-expr.s b/llvm/test/MC/LoongArch/Relocations/sub-expr.s index 8bf046acc697..455410120081 100644 --- a/llvm/test/MC/LoongArch/Relocations/sub-expr.s +++ b/llvm/test/MC/LoongArch/Relocations/sub-expr.s @@ -1,75 +1,57 @@ # RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=-relax %s \ -# RUN: | llvm-readobj -r - | FileCheck %s +# RUN: | llvm-readobj -r - | FileCheck %s --check-prefixes=CHECK,NORELAX # RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+relax %s \ -# RUN: | llvm-readobj -r - | FileCheck %s --check-prefix=RELAX +# RUN: | llvm-readobj -r - | FileCheck %s --check-prefixes=CHECK,RELAX ## Check that subtraction expressions emit R_LARCH_32_PCREL and R_LARCH_64_PCREL relocations. ## TODO: 1- or 2-byte data relocations are not supported for now. 
-# CHECK: Relocations [ -# CHECK-NEXT: Section ({{.*}}) .rela.sx { -# CHECK-NEXT: 0x4 R_LARCH_PCALA_HI20 z 0x0 -# CHECK-NEXT: 0x8 R_LARCH_PCALA_LO12 z 0x0 -# CHECK-NEXT: 0xC R_LARCH_32_PCREL .sy 0xC -# CHECK-NEXT: } +# CHECK: Relocations [ +# NORELAX-NEXT: Section ({{.*}}) .rela.sx { +# NORELAX-NEXT: 0x4 R_LARCH_PCALA_HI20 z 0x0 +# NORELAX-NEXT: 0x8 R_LARCH_PCALA_LO12 z 0x0 +# NORELAX-NEXT: 0xC R_LARCH_32_PCREL y 0x8 +# NORELAX-NEXT: } +# RELAX-NEXT: Section ({{.*}}) .rela.sx { +# RELAX-NEXT: 0x4 R_LARCH_PCALA_HI20 z 0x0 +# RELAX-NEXT: 0x4 R_LARCH_RELAX - 0x0 +# RELAX-NEXT: 0x8 R_LARCH_PCALA_LO12 z 0x0 +# RELAX-NEXT: 0x8 R_LARCH_RELAX - 0x0 +# RELAX-NEXT: 0xC R_LARCH_ADD32 y 0x0 +# RELAX-NEXT: 0xC R_LARCH_SUB32 x 0x0 +# RELAX-NEXT: } # CHECK-NEXT: Section ({{.*}}) .rela.data { -# CHECK-NEXT: 0x0 R_LARCH_64_PCREL .sx 0x4 -# CHECK-NEXT: 0x8 R_LARCH_64_PCREL .sy 0x4 -# CHECK-NEXT: 0x10 R_LARCH_32_PCREL .sx 0x4 -# CHECK-NEXT: 0x14 R_LARCH_32_PCREL .sy 0x4 -# CHECK-NEXT: 0x18 R_LARCH_ADD64 .sx 0x4 -# CHECK-NEXT: 0x18 R_LARCH_SUB64 .sy 0x4 -# CHECK-NEXT: 0x20 R_LARCH_ADD64 .sy 0x4 -# CHECK-NEXT: 0x20 R_LARCH_SUB64 .sx 0x4 -# CHECK-NEXT: 0x28 R_LARCH_ADD32 .sx 0x4 -# CHECK-NEXT: 0x28 R_LARCH_SUB32 .sy 0x4 -# CHECK-NEXT: 0x2C R_LARCH_ADD32 .sy 0x4 -# CHECK-NEXT: 0x2C R_LARCH_SUB32 .sx 0x4 -# CHECK-NEXT: 0x30 R_LARCH_ADD64 .data 0x30 -# CHECK-NEXT: 0x30 R_LARCH_SUB64 .sx 0x4 -# CHECK-NEXT: 0x38 R_LARCH_ADD32 .data 0x38 -# CHECK-NEXT: 0x38 R_LARCH_SUB32 .sy 0x4 -# CHECK-NEXT: } -# CHECK-NEXT: Section ({{.*}}) .rela.sy { -# CHECK-NEXT: 0x10 R_LARCH_32_PCREL .sx 0x10 +# CHECK-NEXT: 0x0 R_LARCH_64_PCREL x 0x0 +# CHECK-NEXT: 0x8 R_LARCH_64_PCREL y 0x0 +# CHECK-NEXT: 0x10 R_LARCH_32_PCREL x 0x0 +# CHECK-NEXT: 0x14 R_LARCH_32_PCREL y 0x0 +# CHECK-NEXT: 0x18 R_LARCH_ADD64 x 0x0 +# CHECK-NEXT: 0x18 R_LARCH_SUB64 y 0x0 +# CHECK-NEXT: 0x20 R_LARCH_ADD64 y 0x0 +# CHECK-NEXT: 0x20 R_LARCH_SUB64 x 0x0 +# CHECK-NEXT: 0x28 R_LARCH_ADD32 x 0x0 +# CHECK-NEXT: 0x28 R_LARCH_SUB32 y 0x0 +# 
CHECK-NEXT: 0x2C R_LARCH_ADD32 y 0x0 +# CHECK-NEXT: 0x2C R_LARCH_SUB32 x 0x0 +# CHECK-NEXT: 0x30 R_LARCH_ADD64 {{.*}} 0x0 +# CHECK-NEXT: 0x30 R_LARCH_SUB64 x 0x0 +# CHECK-NEXT: 0x38 R_LARCH_ADD32 {{.*}} 0x0 +# CHECK-NEXT: 0x38 R_LARCH_SUB32 y 0x0 # CHECK-NEXT: } +# NORELAX-NEXT: Section ({{.*}}) .rela.sy { +# NORELAX-NEXT: 0x0 R_LARCH_CALL36 foo 0x0 +# NORELAX-NEXT: 0x10 R_LARCH_32_PCREL x 0x8 +# NORELAX-NEXT: } +# RELAX-NEXT: Section ({{.*}}) .rela.sy { +# RELAX-NEXT: 0x0 R_LARCH_CALL36 foo 0x0 +# RELAX-NEXT: 0x0 R_LARCH_RELAX - 0x0 +# RELAX-NEXT: 0x8 R_LARCH_ALIGN - 0xC +# RELAX-NEXT: 0x14 R_LARCH_ADD32 x 0x0 +# RELAX-NEXT: 0x14 R_LARCH_SUB32 y 0x0 +# RELAX-NEXT: } # CHECK-NEXT: ] -# RELAX: Relocations [ -# RELAX-NEXT: Section ({{.*}}) .rela.sx { -# RELAX-NEXT: 0x4 R_LARCH_PCALA_HI20 z 0x0 -# RELAX-NEXT: 0x4 R_LARCH_RELAX - 0x0 -# RELAX-NEXT: 0x8 R_LARCH_PCALA_LO12 z 0x0 -# RELAX-NEXT: 0x8 R_LARCH_RELAX - 0x0 -# RELAX-NEXT: 0xC R_LARCH_ADD32 y 0x0 -# RELAX-NEXT: 0xC R_LARCH_SUB32 x 0x0 -# RELAX-NEXT: } -# RELAX-NEXT: Section ({{.*}}) .rela.data { -# RELAX-NEXT: 0x0 R_LARCH_64_PCREL x 0x0 -# RELAX-NEXT: 0x8 R_LARCH_64_PCREL y 0x0 -# RELAX-NEXT: 0x10 R_LARCH_32_PCREL x 0x0 -# RELAX-NEXT: 0x14 R_LARCH_32_PCREL y 0x0 -# RELAX-NEXT: 0x18 R_LARCH_ADD64 x 0x0 -# RELAX-NEXT: 0x18 R_LARCH_SUB64 y 0x0 -# RELAX-NEXT: 0x20 R_LARCH_ADD64 y 0x0 -# RELAX-NEXT: 0x20 R_LARCH_SUB64 x 0x0 -# RELAX-NEXT: 0x28 R_LARCH_ADD32 x 0x0 -# RELAX-NEXT: 0x28 R_LARCH_SUB32 y 0x0 -# RELAX-NEXT: 0x2C R_LARCH_ADD32 y 0x0 -# RELAX-NEXT: 0x2C R_LARCH_SUB32 x 0x0 -# RELAX-NEXT: 0x30 R_LARCH_ADD64 {{.*}} 0x0 -# RELAX-NEXT: 0x30 R_LARCH_SUB64 x 0x0 -# RELAX-NEXT: 0x38 R_LARCH_ADD32 {{.*}} 0x0 -# RELAX-NEXT: 0x38 R_LARCH_SUB32 y 0x0 -# RELAX-NEXT: } -# RELAX-NEXT: Section ({{.*}}) .rela.sy { -# RELAX-NEXT: 0x4 R_LARCH_ALIGN - 0xC -# RELAX-NEXT: 0x10 R_LARCH_ADD32 x 0x0 -# RELAX-NEXT: 0x10 R_LARCH_SUB32 y 0x0 -# RELAX-NEXT: } -# RELAX-NEXT: ] - .section .sx,"ax" nop x: @@ -89,7 +71,7 @@ la.pcrel $a0, z 
.4byte .-y .section .sy,"ax" -nop +call36 foo y: .p2align 4 .4byte x-y diff --git a/llvm/test/MC/RISCV/rv32p-invalid.s b/llvm/test/MC/RISCV/rv32p-invalid.s index 7dd73d077bf3..b00c39b8811d 100644 --- a/llvm/test/MC/RISCV/rv32p-invalid.s +++ b/llvm/test/MC/RISCV/rv32p-invalid.s @@ -4,7 +4,7 @@ # Imm overflow pli.h a0, 0x400 # CHECK: :[[@LINE]]:11: error: immediate must be an integer in the range [-512, 511] plui.h a1, 0x400 # CHECK: :[[@LINE]]:12: error: immediate must be an integer in the range [-512, 1023] -pli.b a0, 0x200 # CHECK: :[[@LINE]]:11: error: immediate must be an integer in the range [0, 255] +pli.b a0, 0x200 # CHECK: :[[@LINE]]:11: error: immediate must be an integer in the range [-128, 255] pslli.b a6, a7, 100 # CHECK: :[[@LINE]]:17: error: immediate must be an integer in the range [0, 7] pslli.h ra, sp, 100 # CHECK: :[[@LINE]]:17: error: immediate must be an integer in the range [0, 15] @@ -106,3 +106,11 @@ ppack.w t5, a2, a4 # CHECK: :[[@LINE]]:1: error: instruction requires the follow ppackbt.w t5, s0, t5 # CHECK: :[[@LINE]]:1: error: instruction requires the following: RV64I Base Instruction Set ppacktb.w t5, t1, t1 # CHECK: :[[@LINE]]:1: error: instruction requires the following: RV64I Base Instruction Set ppackt.w t3, a0, s2 # CHECK: :[[@LINE]]:1: error: instruction requires the following: RV64I Base Instruction Set + +pli.dh a1, 1 # CHECK: :[[@LINE]]:8: error: register must be even +pli.db s1, 1 # CHECK: :[[@LINE]]:8: error: register must be even +plui.dh t2, 1 # CHECK: :[[@LINE]]:9: error: register must be even + +pli.dh a0, 0x400 # CHECK: :[[@LINE]]:12: error: immediate must be an integer in the range [-512, 511] +pli.db a0, 0x200 # CHECK: :[[@LINE]]:12: error: immediate must be an integer in the range [-128, 255] +plui.dh a0, 0x400 # CHECK: :[[@LINE]]:13: error: immediate must be an integer in the range [-512, 1023] diff --git a/llvm/test/MC/RISCV/rv32p-valid.s b/llvm/test/MC/RISCV/rv32p-valid.s index 0383e4ec1ea6..bc7ec6587c5f 100644 --- 
a/llvm/test/MC/RISCV/rv32p-valid.s +++ b/llvm/test/MC/RISCV/rv32p-valid.s @@ -61,6 +61,9 @@ pli.h a5, 16 # CHECK-ASM-AND-OBJ: pli.b a6, 16 # CHECK-ASM: encoding: [0x1b,0x28,0x10,0xb4] pli.b a6, 16 +# CHECK-ASM-AND-OBJ: pli.b a6, -128 +# CHECK-ASM: encoding: [0x1b,0x28,0x80,0xb4] +pli.b a6, -128 # CHECK-ASM-AND-OBJ: psext.h.b a7, a0 # CHECK-ASM: encoding: [0x9b,0x28,0x45,0xe0] psext.h.b a7, a0 @@ -373,3 +376,19 @@ ppackt.h t3, s0, s0 # CHECK-ASM-AND-OBJ: packt a2, t3, t1 # CHECK-ASM: encoding: [0x3b,0x46,0x6e,0xb2] packt a2, t3, t1 + +# CHECK-ASM-AND-OBJ: pli.dh a4, 16 +# CHECK-ASM: encoding: [0x1b,0x27,0x10,0x30] +pli.dh a4, 16 +# CHECK-ASM-AND-OBJ: pli.db a6, 16 +# CHECK-ASM: encoding: [0x1b,0x28,0x10,0x34] +pli.db a6, 16 +# CHECK-ASM-AND-OBJ: pli.db a6, -128 +# CHECK-ASM: encoding: [0x1b,0x28,0x80,0x34] +pli.db a6, -128 +# CHECK-ASM-AND-OBJ: plui.dh tp, 32 +# CHECK-ASM: encoding: [0x1b,0x22,0x08,0x70] +plui.dh tp, 32 +# CHECK-ASM-AND-OBJ: plui.dh tp, -412 +# CHECK-ASM: encoding: [0x1b,0x22,0x99,0x70] +plui.dh tp, 612 diff --git a/llvm/test/MC/RISCV/rv64p-invalid.s b/llvm/test/MC/RISCV/rv64p-invalid.s index 58f5dfb822de..e18c9ec0e29e 100644 --- a/llvm/test/MC/RISCV/rv64p-invalid.s +++ b/llvm/test/MC/RISCV/rv64p-invalid.s @@ -65,3 +65,8 @@ mulsu.h00 a4, s4, s6 # CHECK: :[[@LINE]]:1: error: instruction requires the foll maccsu.h00 s4, s4, s0 # CHECK: :[[@LINE]]:1: error: instruction requires the following: RV32I Base Instruction Set mulsu.h11 s8, s4, s0 # CHECK: :[[@LINE]]:1: error: instruction requires the following: RV32I Base Instruction Set maccsu.h11 s0, a2, s6 # CHECK: :[[@LINE]]:1: error: instruction requires the following: RV32I Base Instruction Set + +# FIXME: This error doesn't make sense. Should say that we need RV32I. 
+pli.dh a0, 1 # CHECK: :[[@LINE]]:8: error: invalid operand for instruction +pli.db s0, 1 # CHECK: :[[@LINE]]:8: error: invalid operand for instruction +plui.dh t1, 1 # CHECK: :[[@LINE]]:9: error: invalid operand for instruction diff --git a/llvm/test/MC/RISCV/rv64p-valid.s b/llvm/test/MC/RISCV/rv64p-valid.s index 5e0b22759abf..f5dd85f2d338 100644 --- a/llvm/test/MC/RISCV/rv64p-valid.s +++ b/llvm/test/MC/RISCV/rv64p-valid.s @@ -79,6 +79,12 @@ pli.w a5, 5 # CHECK-ASM-AND-OBJ: pli.b a6, 6 # CHECK-ASM: encoding: [0x1b,0x28,0x06,0xb4] pli.b a6, 6 +# CHECK-ASM-AND-OBJ: pli.b a6, -1 +# CHECK-ASM: encoding: [0x1b,0x28,0xff,0xb4] +pli.b a6, -1 +# CHECK-ASM-AND-OBJ: pli.b a6, -1 +# CHECK-ASM: encoding: [0x1b,0x28,0xff,0xb4] +pli.b a6, 255 # CHECK-ASM-AND-OBJ: psext.h.b t3, a2 # CHECK-ASM: encoding: [0x1b,0x2e,0x46,0xe0] psext.h.b t3, a2 diff --git a/llvm/test/MC/RISCV/smctr-ssctr-valid.s b/llvm/test/MC/RISCV/smctr-ssctr-valid.s index 8bbd5a426b8e..072231a9b546 100644 --- a/llvm/test/MC/RISCV/smctr-ssctr-valid.s +++ b/llvm/test/MC/RISCV/smctr-ssctr-valid.s @@ -1,22 +1,22 @@ -# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-smctr -M no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+smctr -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK,CHECK-INST %s -# RUN: llvm-mc %s -triple=riscv64 -mattr=+experimental-smctr -M no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+smctr -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK,CHECK-INST %s -# RUN: llvm-mc %s -triple=riscv32 -mattr=+experimental-ssctr -M no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+ssctr -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK,CHECK-INST %s -# RUN: llvm-mc %s -triple=riscv64 -mattr=+experimental-ssctr -M no-aliases -show-encoding \ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+ssctr -M no-aliases -show-encoding \ # RUN: | FileCheck -check-prefixes=CHECK,CHECK-INST %s -# 
RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-smctr < %s \ -# RUN: | llvm-objdump --mattr=+experimental-smctr -M no-aliases -d - \ +# RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+smctr < %s \ +# RUN: | llvm-objdump --mattr=+smctr -M no-aliases -d - \ # RUN: | FileCheck -check-prefix=CHECK-INST %s -# RUN: llvm-mc -filetype=obj -triple riscv64 -mattr=+experimental-smctr < %s \ -# RUN: | llvm-objdump --mattr=+experimental-smctr -M no-aliases -d - \ +# RUN: llvm-mc -filetype=obj -triple riscv64 -mattr=+smctr < %s \ +# RUN: | llvm-objdump --mattr=+smctr -M no-aliases -d - \ # RUN: | FileCheck -check-prefix=CHECK-INST %s -# RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+experimental-ssctr < %s \ -# RUN: | llvm-objdump --mattr=+experimental-ssctr -M no-aliases -d - \ +# RUN: llvm-mc -filetype=obj -triple riscv32 -mattr=+ssctr < %s \ +# RUN: | llvm-objdump --mattr=+ssctr -M no-aliases -d - \ # RUN: | FileCheck -check-prefix=CHECK-INST %s -# RUN: llvm-mc -filetype=obj -triple riscv64 -mattr=+experimental-ssctr < %s \ -# RUN: | llvm-objdump --mattr=+experimental-ssctr -M no-aliases -d - \ +# RUN: llvm-mc -filetype=obj -triple riscv64 -mattr=+ssctr < %s \ +# RUN: | llvm-objdump --mattr=+ssctr -M no-aliases -d - \ # RUN: | FileCheck -check-prefix=CHECK-INST %s # RUN: not llvm-mc -triple riscv32 -M no-aliases -show-encoding < %s 2>&1 \ diff --git a/llvm/test/MC/RISCV/xsmtvdot-invalid.s b/llvm/test/MC/RISCV/xsmtvdot-invalid.s new file mode 100644 index 000000000000..9dce654ecc7f --- /dev/null +++ b/llvm/test/MC/RISCV/xsmtvdot-invalid.s @@ -0,0 +1,52 @@ +# RUN: not llvm-mc -triple riscv32 -mattr=+xsmtvdot < %s 2>&1 \ +# RUN: | FileCheck %s +# RUN: not llvm-mc -triple riscv64 -mattr=+xsmtvdot < %s 2>&1 \ +# RUN: | FileCheck %s + +# NoSlide +smt.vmadot v1, v2, v2 # CHECK: :[[@LINE]]:14: error: invalid operand for instruction +smt.vmadotu v1, v2, v2 # CHECK: :[[@LINE]]:14: error: invalid operand for instruction +smt.vmadotsu v1, v2, v2 # CHECK: 
:[[@LINE]]:14: error: invalid operand for instruction +smt.vmadotus v1, v2, v2 # CHECK: :[[@LINE]]:14: error: invalid operand for instruction + +# Slide = 1 +smt.vmadot1 v1, v2, v2 # CHECK: :[[@LINE]]:15: error: invalid operand for instruction +smt.vmadot1u v1, v2, v2 # CHECK: :[[@LINE]]:15: error: invalid operand for instruction +smt.vmadot1su v1, v2, v2 # CHECK: :[[@LINE]]:15: error: invalid operand for instruction +smt.vmadot1us v1, v2, v2 # CHECK: :[[@LINE]]:15: error: invalid operand for instruction +smt.vmadot1 v2, v1, v2 # CHECK: :[[@LINE]]:19: error: invalid operand for instruction +smt.vmadot1u v2, v1, v2 # CHECK: :[[@LINE]]:19: error: invalid operand for instruction +smt.vmadot1su v2, v1, v2 # CHECK: :[[@LINE]]:19: error: invalid operand for instruction +smt.vmadot1us v2, v1, v2 # CHECK: :[[@LINE]]:19: error: invalid operand for instruction +smt.vmadot1 v1, v3, v2 # CHECK: :[[@LINE]]:15: error: invalid operand for instruction +smt.vmadot1u v1, v3, v2 # CHECK: :[[@LINE]]:15: error: invalid operand for instruction +smt.vmadot1su v1, v3, v2 # CHECK: :[[@LINE]]:15: error: invalid operand for instruction +smt.vmadot1us v1, v3, v2 # CHECK: :[[@LINE]]:15: error: invalid operand for instruction + +# Slide = 2 +smt.vmadot2 v1, v2, v2 # CHECK: :[[@LINE]]:15: error: invalid operand for instruction +smt.vmadot2u v1, v2, v2 # CHECK: :[[@LINE]]:15: error: invalid operand for instruction +smt.vmadot2su v1, v2, v2 # CHECK: :[[@LINE]]:15: error: invalid operand for instruction +smt.vmadot2us v1, v2, v2 # CHECK: :[[@LINE]]:15: error: invalid operand for instruction +smt.vmadot2 v2, v1, v2 # CHECK: :[[@LINE]]:19: error: invalid operand for instruction +smt.vmadot2u v2, v1, v2 # CHECK: :[[@LINE]]:19: error: invalid operand for instruction +smt.vmadot2su v2, v1, v2 # CHECK: :[[@LINE]]:19: error: invalid operand for instruction +smt.vmadot2us v2, v1, v2 # CHECK: :[[@LINE]]:19: error: invalid operand for instruction +smt.vmadot2 v1, v3, v2 # CHECK: :[[@LINE]]:15: error: invalid 
operand for instruction +smt.vmadot2u v1, v3, v2 # CHECK: :[[@LINE]]:15: error: invalid operand for instruction +smt.vmadot2su v1, v3, v2 # CHECK: :[[@LINE]]:15: error: invalid operand for instruction +smt.vmadot2us v1, v3, v2 # CHECK: :[[@LINE]]:15: error: invalid operand for instruction + +# Slide = 3 +smt.vmadot3 v1, v2, v2 # CHECK: :[[@LINE]]:15: error: invalid operand for instruction +smt.vmadot3u v1, v2, v2 # CHECK: :[[@LINE]]:15: error: invalid operand for instruction +smt.vmadot3su v1, v2, v2 # CHECK: :[[@LINE]]:15: error: invalid operand for instruction +smt.vmadot3us v1, v2, v2 # CHECK: :[[@LINE]]:15: error: invalid operand for instruction +smt.vmadot3 v2, v1, v2 # CHECK: :[[@LINE]]:19: error: invalid operand for instruction +smt.vmadot3u v2, v1, v2 # CHECK: :[[@LINE]]:19: error: invalid operand for instruction +smt.vmadot3su v2, v1, v2 # CHECK: :[[@LINE]]:19: error: invalid operand for instruction +smt.vmadot3us v2, v1, v2 # CHECK: :[[@LINE]]:19: error: invalid operand for instruction +smt.vmadot3 v1, v3, v2 # CHECK: :[[@LINE]]:15: error: invalid operand for instruction +smt.vmadot3u v1, v3, v2 # CHECK: :[[@LINE]]:15: error: invalid operand for instruction +smt.vmadot3su v1, v3, v2 # CHECK: :[[@LINE]]:15: error: invalid operand for instruction +smt.vmadot3us v1, v3, v2 # CHECK: :[[@LINE]]:15: error: invalid operand for instruction \ No newline at end of file diff --git a/llvm/test/MC/RISCV/xsmtvdot-valid.s b/llvm/test/MC/RISCV/xsmtvdot-valid.s new file mode 100644 index 000000000000..9e66419b10e1 --- /dev/null +++ b/llvm/test/MC/RISCV/xsmtvdot-valid.s @@ -0,0 +1,114 @@ +# RUN: llvm-mc -triple=riscv32 -show-encoding --mattr=+xsmtvdot %s \ +# RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +# RUN: llvm-mc -triple=riscv64 -show-encoding --mattr=+xsmtvdot %s \ +# RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +# RUN: not llvm-mc -triple=riscv32 -show-encoding %s 2>&1 \ +# RUN: | FileCheck %s --check-prefix=CHECK-ERROR +# RUN: 
not llvm-mc -triple=riscv64 -show-encoding %s 2>&1 \ +# RUN: | FileCheck %s --check-prefix=CHECK-ERROR +# RUN: llvm-mc -triple=riscv32 -filetype=obj --mattr=+xsmtvdot %s \ +# RUN: | llvm-objdump -d --mattr=+xsmtvdot - \ +# RUN: | FileCheck %s --check-prefix=CHECK-INST +# RUN: llvm-mc -triple=riscv64 -filetype=obj --mattr=+xsmtvdot %s \ +# RUN: | llvm-objdump -d --mattr=+xsmtvdot - \ +# RUN: | FileCheck %s --check-prefix=CHECK-INST +# RUN: llvm-mc -triple=riscv32 -filetype=obj --mattr=+xsmtvdot %s \ +# RUN: | llvm-objdump -d - | FileCheck %s --check-prefix=CHECK-UNKNOWN +# RUN: llvm-mc -triple=riscv64 -filetype=obj --mattr=+xsmtvdot %s \ +# RUN: | llvm-objdump -d - | FileCheck %s --check-prefix=CHECK-UNKNOWN + +# CHECK-INST: smt.vmadot v16, v0, v8 +# CHECK-ENCODING: [0x2b,0x38,0x80,0xe2] +# CHECK-ERROR: instruction requires the following: 'XSMTVDot' (SpacemiT Vector Dot Product Extension){{$}} +# CHECK-UNKNOWN: e280382b +smt.vmadot v16, v0, v8 + +# CHECK-INST: smt.vmadotu v18, v1, v9 +# CHECK-ENCODING: [0x2b,0x89,0x90,0xe2] +# CHECK-ERROR: instruction requires the following: 'XSMTVDot' (SpacemiT Vector Dot Product Extension){{$}} +# CHECK-UNKNOWN: e290892b +smt.vmadotu v18, v1, v9 + +# CHECK-INST: smt.vmadotsu v20, v2, v10 +# CHECK-ENCODING: [0x2b,0x2a,0xa1,0xe2] +# CHECK-ERROR: instruction requires the following: 'XSMTVDot' (SpacemiT Vector Dot Product Extension){{$}} +# CHECK-UNKNOWN: e2a12a2b +smt.vmadotsu v20, v2, v10 + +# CHECK-INST: smt.vmadotus v22, v3, v11 +# CHECK-ENCODING: [0x2b,0x9b,0xb1,0xe2] +# CHECK-ERROR: instruction requires the following: 'XSMTVDot' (SpacemiT Vector Dot Product Extension){{$}} +# CHECK-UNKNOWN: e2b19b2b +smt.vmadotus v22, v3, v11 + +# CHECK-INST: smt.vmadot1 v24, v16, v12 +# CHECK-ENCODING: [0x2b,0x3c,0xc8,0xe6] +# CHECK-ERROR: instruction requires the following: 'XSMTVDot' (SpacemiT Vector Dot Product Extension){{$}} +# CHECK-UNKNOWN: e6c83c2b +smt.vmadot1 v24, v16, v12 + +# CHECK-INST: smt.vmadot1u v26, v18, v13 +# CHECK-ENCODING: 
[0x2b,0x0d,0xd9,0xe6] +# CHECK-ERROR: instruction requires the following: 'XSMTVDot' (SpacemiT Vector Dot Product Extension){{$}} +# CHECK-UNKNOWN: e6d90d2b +smt.vmadot1u v26, v18, v13 + +# CHECK-INST: smt.vmadot1su v28, v20, v14 +# CHECK-ENCODING: [0x2b,0x2e,0xea,0xe6] +# CHECK-ERROR: instruction requires the following: 'XSMTVDot' (SpacemiT Vector Dot Product Extension){{$}} +# CHECK-UNKNOWN: e6ea2e2b +smt.vmadot1su v28, v20, v14 + +# CHECK-INST: smt.vmadot1us v30, v22, v15 +# CHECK-ENCODING: [0x2b,0x1f,0xfb,0xe6] +# CHECK-ERROR: instruction requires the following: 'XSMTVDot' (SpacemiT Vector Dot Product Extension){{$}} +# CHECK-UNKNOWN: e6fb1f2b +smt.vmadot1us v30, v22, v15 + +# CHECK-INST: smt.vmadot2 v0, v24, v4 +# CHECK-ENCODING: [0x2b,0x70,0x4c,0xe6] +# CHECK-ERROR: instruction requires the following: 'XSMTVDot' (SpacemiT Vector Dot Product Extension){{$}} +# CHECK-UNKNOWN: e64c702b +smt.vmadot2 v0, v24, v4 + +# CHECK-INST: smt.vmadot2u v2, v26, v5 +# CHECK-ENCODING: [0x2b,0x41,0x5d,0xe6] +# CHECK-ERROR: instruction requires the following: 'XSMTVDot' (SpacemiT Vector Dot Product Extension){{$}} +# CHECK-UNKNOWN: e65d412b +smt.vmadot2u v2, v26, v5 + +# CHECK-INST: smt.vmadot2su v4, v28, v6 +# CHECK-ENCODING: [0x2b,0x62,0x6e,0xe6] +# CHECK-ERROR: instruction requires the following: 'XSMTVDot' (SpacemiT Vector Dot Product Extension){{$}} +# CHECK-UNKNOWN: e66e622b +smt.vmadot2su v4, v28, v6 + +# CHECK-INST: smt.vmadot2us v6, v30, v7 +# CHECK-ENCODING: [0x2b,0x53,0x7f,0xe6] +# CHECK-ERROR: instruction requires the following: 'XSMTVDot' (SpacemiT Vector Dot Product Extension){{$}} +# CHECK-UNKNOWN: e67f532b +smt.vmadot2us v6, v30, v7 + +# CHECK-INST: smt.vmadot3 v8, v0, v8 +# CHECK-ENCODING: [0x2b,0xb4,0x80,0xe6] +# CHECK-ERROR: instruction requires the following: 'XSMTVDot' (SpacemiT Vector Dot Product Extension){{$}} +# CHECK-UNKNOWN: e680b42b +smt.vmadot3 v8, v0, v8 + +# CHECK-INST: smt.vmadot3u v10, v2, v9 +# CHECK-ENCODING: [0x2b,0x85,0x91,0xe6] +# 
CHECK-ERROR: instruction requires the following: 'XSMTVDot' (SpacemiT Vector Dot Product Extension){{$}} +# CHECK-UNKNOWN: e691852b +smt.vmadot3u v10, v2, v9 + +# CHECK-INST: smt.vmadot3su v12, v4, v10 +# CHECK-ENCODING: [0x2b,0xa6,0xa2,0xe6] +# CHECK-ERROR: instruction requires the following: 'XSMTVDot' (SpacemiT Vector Dot Product Extension){{$}} +# CHECK-UNKNOWN: e6a2a62b +smt.vmadot3su v12, v4, v10 + +# CHECK-INST: smt.vmadot3us v14, v6, v11 +# CHECK-ENCODING: [0x2b,0x97,0xb3,0xe6] +# CHECK-ERROR: instruction requires the following: 'XSMTVDot' (SpacemiT Vector Dot Product Extension){{$}} +# CHECK-UNKNOWN: e6b3972b +smt.vmadot3us v14, v6, v11 \ No newline at end of file diff --git a/llvm/test/TableGen/FixedLenDecoderEmitter/big-filter.td b/llvm/test/TableGen/FixedLenDecoderEmitter/big-filter.td new file mode 100644 index 000000000000..b9da61de469a --- /dev/null +++ b/llvm/test/TableGen/FixedLenDecoderEmitter/big-filter.td @@ -0,0 +1,40 @@ +// RUN: llvm-tblgen -gen-disassembler -I %p/../../../include %s | FileCheck %s + +include "llvm/Target/Target.td" + +class I : Instruction { + let InOperandList = (ins); + let OutOperandList = (outs); + let Size = 16; + bits<128> Inst; +} + +// Check that a 64-bit filter with all bits set does not confuse DecoderEmitter. 
+// +// CHECK-LABEL: static const uint8_t DecoderTable128[] = { +// CHECK-NEXT: MCD::OPC_ExtractField, 0, 64, +// CHECK-NEXT: MCD::OPC_FilterValue, 1, 8, 0, +// CHECK-NEXT: MCD::OPC_CheckFieldOrFail, 127, 1, 1, +// CHECK-NEXT: MCD::OPC_Decode, 187, 2, 0, +// CHECK-NEXT: MCD::OPC_FilterValueOrFail, 255, 255, 255, 255, 255, 255, 255, 255, 255, 1, +// CHECK-NEXT: MCD::OPC_CheckFieldOrFail, 127, 1, 0, +// CHECK-NEXT: MCD::OPC_Decode, 186, 2, 0, +// CHECK-NEXT: MCD::OPC_Fail, +// CHECK-NEXT: 0 +// CHECK-NEXT: }; + +def I1 : I { + let Inst{63...0} = -1; + let Inst{127} = 0; +} + +def I2 : I { + let Inst{63...0} = 1; + let Inst{127} = 1; +} + +def II : InstrInfo; + +def MyTarget : Target { + let InstructionSet = II; +} diff --git a/llvm/test/TableGen/FixedLenDecoderEmitter/conflict.td b/llvm/test/TableGen/FixedLenDecoderEmitter/conflict.td index 7399ef726d0e..853a68d22d1d 100644 --- a/llvm/test/TableGen/FixedLenDecoderEmitter/conflict.td +++ b/llvm/test/TableGen/FixedLenDecoderEmitter/conflict.td @@ -29,7 +29,7 @@ def B : I<(outs GPR32:$dst), (ins GPR32:$src1), []> { } // CHECK: Decoding Conflict: -// CHECK: 00000000000000000000000000000000 -// CHECK: ................................ -// CHECK: A 00000000000000000000000000000000 -// CHECK: B 00000000000000000000000000000000 +// CHECK: ................................ 
+// CHECK: 00000000000000000000000000000000 +// CHECK: 00000000000000000000000000000000 A +// CHECK: 00000000000000000000000000000000 B diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/trunc.ll b/llvm/test/Transforms/CorrelatedValuePropagation/trunc.ll index 9b6604298840..42a89ab0dbc0 100644 --- a/llvm/test/Transforms/CorrelatedValuePropagation/trunc.ll +++ b/llvm/test/Transforms/CorrelatedValuePropagation/trunc.ll @@ -106,3 +106,43 @@ define i1 @overdefined_range_negative(i8 %A, i8 %B) { %trunc = trunc i8 %xor to i1 ret i1 %trunc } + +define i1 @trunc_nuw_infere_false_for_icmp_ne_1(i8 %x) { +; CHECK-LABEL: define i1 @trunc_nuw_infere_false_for_icmp_ne_1( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: [[ICMP:%.*]] = icmp ne i8 [[X]], 1 +; CHECK-NEXT: br i1 [[ICMP]], label %[[IFTRUE:.*]], label %[[IFFALSE:.*]] +; CHECK: [[IFTRUE]]: +; CHECK-NEXT: [[TRUNC:%.*]] = trunc nuw i8 [[X]] to i1 +; CHECK-NEXT: ret i1 false +; CHECK: [[IFFALSE]]: +; CHECK-NEXT: ret i1 true +; + %icmp = icmp ne i8 %x, 1 + br i1 %icmp, label %iftrue, label %iffalse +iftrue: + %trunc = trunc nuw i8 %x to i1 + ret i1 %trunc +iffalse: + ret i1 true +} + +define i1 @neg_trunc_do_not_infere_false_for_icmp_ne_1(i8 %x) { +; CHECK-LABEL: define i1 @neg_trunc_do_not_infere_false_for_icmp_ne_1( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: [[ICMP:%.*]] = icmp ne i8 [[X]], 1 +; CHECK-NEXT: br i1 [[ICMP]], label %[[IFTRUE:.*]], label %[[IFFALSE:.*]] +; CHECK: [[IFTRUE]]: +; CHECK-NEXT: [[TRUNC:%.*]] = trunc i8 [[X]] to i1 +; CHECK-NEXT: ret i1 [[TRUNC]] +; CHECK: [[IFFALSE]]: +; CHECK-NEXT: ret i1 true +; + %icmp = icmp ne i8 %x, 1 + br i1 %icmp, label %iftrue, label %iffalse +iftrue: + %trunc = trunc i8 %x to i1 + ret i1 %trunc +iffalse: + ret i1 true +} diff --git a/llvm/test/Transforms/GlobalOpt/stored-once-addrspacecast.ll b/llvm/test/Transforms/GlobalOpt/stored-once-addrspacecast.ll new file mode 100644 index 000000000000..35678598032d --- /dev/null +++ 
b/llvm/test/Transforms/GlobalOpt/stored-once-addrspacecast.ll @@ -0,0 +1,42 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals +; RUN: opt -passes=globalopt < %s -S | FileCheck %s + +; Test that we do not fold away addresscasts when optimizing once-stored +; globals, as these may be runtime operations. + +@g1 = internal global ptr null +@g2 = addrspace(1) global i32 0 + +;. +; CHECK: @g1 = internal unnamed_addr global ptr null +; CHECK: @g2 = addrspace(1) global i32 0 +; CHECK: @g4 = local_unnamed_addr addrspace(1) global i32 0 +;. +define i64 @test1() { +; CHECK-LABEL: @test1( +; CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) @g2 to ptr +; CHECK-NEXT: store ptr [[TMP1]], ptr @g1, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr @g1, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load i64, ptr [[TMP2]], align 8 +; CHECK-NEXT: ret i64 [[TMP3]] +; + %l1 = addrspacecast ptr addrspace(1) @g2 to ptr + store ptr %l1, ptr @g1, align 8 + %l2 = load ptr, ptr @g1, align 8 + %l3 = load i64, ptr %l2, align 8 + ret i64 %l3 +} + +@g3 = internal global ptr null +@g4 = addrspace(1) global i32 0 + +define i64 @test2() { +; CHECK-LABEL: @test2( +; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr addrspacecast (ptr addrspace(1) @g4 to ptr), align 8 +; CHECK-NEXT: ret i64 [[TMP1]] +; + store ptr addrspacecast (ptr addrspace(1) @g4 to ptr), ptr @g3, align 8 + %l1 = load ptr, ptr @g3, align 8 + %l2 = load i64, ptr %l1, align 8 + ret i64 %l2 +} diff --git a/llvm/test/Transforms/LICM/PR116813-memoryssa-outdated.ll b/llvm/test/Transforms/LICM/PR116813-memoryssa-outdated.ll index 562701420f80..a040c3cc6947 100644 --- a/llvm/test/Transforms/LICM/PR116813-memoryssa-outdated.ll +++ b/llvm/test/Transforms/LICM/PR116813-memoryssa-outdated.ll @@ -18,7 +18,7 @@ define i32 @foo(i1 %arg, ptr %arg1) { ; CHECK: [[BB1]]: ; CHECK-NEXT: [[UNSWITCHED_SELECT_US:%.*]] = phi ptr [ [[ARG1]], %[[BB0]] ] ; CHECK-NEXT: [[I3_US:%.*]] = call i32 
[[UNSWITCHED_SELECT_US]]() -; CHECK-NEXT: br i1 true, label %[[LOOP_US]], label %[[RET_SPLIT_US:.*]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: br i1 true, label %[[LOOP_US]], label %[[RET_SPLIT_US:.*]] ; CHECK: [[RET_SPLIT_US]]: ; CHECK-NEXT: [[I3_LCSSA_US:%.*]] = phi i32 [ [[I3_US]], %[[BB1]] ] ; CHECK-NEXT: br label %[[RET:.*]] diff --git a/llvm/test/Transforms/LICM/hoist-profdata.ll b/llvm/test/Transforms/LICM/hoist-profdata.ll new file mode 100644 index 000000000000..18fa1b9f92e8 --- /dev/null +++ b/llvm/test/Transforms/LICM/hoist-profdata.ll @@ -0,0 +1,45 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals --version 2 +; Test that hoisting conditional branches copies the debug and profiling info +; metadata from the branch being hoisted. +; RUN: opt -S -passes=licm %s -o - | FileCheck %s + +declare i32 @foo() + +; to_hoist should get hoisted, and that should not result +; in a loss of profiling info +define i32 @hoist_select(i1 %cond, i32 %a, i32 %b) nounwind { +; CHECK-LABEL: define i32 @hoist_select +; CHECK-SAME: (i1 [[COND:%.*]], i32 [[A:%.*]], i32 [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TO_HOIST:%.*]] = select i1 [[COND]], i32 [[A]], i32 [[B]], !prof [[PROF0:![0-9]+]] +; CHECK-NEXT: br label [[L0:%.*]] +; CHECK: L0: +; CHECK-NEXT: [[G:%.*]] = call i32 @foo() +; CHECK-NEXT: [[SUM:%.*]] = add i32 [[G]], [[TO_HOIST]] +; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[SUM]], 0 +; CHECK-NEXT: br i1 [[C]], label [[L0]], label [[EXIT:%.*]], !prof [[PROF1:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: [[SUM_LCSSA:%.*]] = phi i32 [ [[SUM]], [[L0]] ] +; CHECK-NEXT: ret i32 [[SUM_LCSSA]] +; +entry: + br label %L0 +L0: + %g = call i32 @foo() + %to_hoist = select i1 %cond, i32 %a, i32 %b, !prof !0 + %sum = add i32 %g, %to_hoist + %c = icmp eq i32 %sum, 0 + br i1 %c, label %L0, label %exit, !prof !1 + +exit: + ret i32 %sum +} + +!0 = !{!"branch_weights", i32 2, i32 5} +!1 = !{!"branch_weights", i32 
101, i32 189} +;. +; CHECK: attributes #[[ATTR0]] = { nounwind } +;. +; CHECK: [[PROF0]] = !{!"branch_weights", i32 2, i32 5} +; CHECK: [[PROF1]] = !{!"branch_weights", i32 101, i32 189} +;. diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll index b7706da36428..f099c22333c3 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/call-costs.ll @@ -72,7 +72,7 @@ exit: define void @powi_call(ptr %P) { ; CHECK-LABEL: define void @powi_call( ; CHECK-SAME: ptr [[P:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -84,10 +84,9 @@ define void @powi_call(ptr %P) { ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds double, ptr [[P]], i64 [[IV]] ; CHECK-NEXT: [[L:%.*]] = load double, ptr [[GEP]], align 8 ; CHECK-NEXT: [[POWI:%.*]] = tail call double @llvm.powi.f64.i32(double [[L]], i32 3) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll b/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll index 511622d28d64..626242667e20 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/clamped-trip-count.ll @@ -15,8 +15,7 @@ define void @clamped_tc_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range(1,1 ; CHECK-NEXT: [[TMP8:%.*]] = call @llvm.stepvector.nxv8i64() ; CHECK-NEXT: 
[[TMP7:%.*]] = mul [[TMP8]], splat (i64 1) ; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP7]] -; CHECK-NEXT: [[TMP12:%.*]] = mul i64 1, [[TMP1]] -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP12]], i64 0 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP1]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: @@ -90,8 +89,7 @@ define void @clamped_tc_max_8(ptr nocapture %dst, i32 %n, i64 %val) vscale_range ; CHECK-NEXT: [[TMP8:%.*]] = call @llvm.stepvector.nxv8i64() ; CHECK-NEXT: [[TMP7:%.*]] = mul [[TMP8]], splat (i64 1) ; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP7]] -; CHECK-NEXT: [[TMP12:%.*]] = mul i64 1, [[TMP1]] -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP12]], i64 0 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP1]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll index 1819d7f56153..bfebbdad5af0 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll @@ -119,8 +119,7 @@ define void @sdiv_feeding_gep_predicated(ptr %dst, i32 %x, i64 %M, i64 %conv6, i ; CHECK-NEXT: [[TMP15:%.*]] = call @llvm.stepvector.nxv2i64() ; CHECK-NEXT: [[TMP17:%.*]] = mul [[TMP15]], splat (i64 1) ; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP17]] -; CHECK-NEXT: [[TMP20:%.*]] = mul i64 1, [[TMP6]] -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP20]], i64 0 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP6]], i64 0 ; CHECK-NEXT: 
[[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: @@ -235,8 +234,7 @@ define void @udiv_urem_feeding_gep(i64 %x, ptr %dst, i64 %N) { ; CHECK-NEXT: [[TMP15:%.*]] = call @llvm.stepvector.nxv2i64() ; CHECK-NEXT: [[TMP17:%.*]] = mul [[TMP15]], splat (i64 1) ; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP17]] -; CHECK-NEXT: [[TMP20:%.*]] = mul i64 1, [[TMP6]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i64 [[TMP20]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i64 [[TMP6]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/drop-poison-generating-flags.ll b/llvm/test/Transforms/LoopVectorize/AArch64/drop-poison-generating-flags.ll index 221d944e1bc2..0cb46e18c536 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/drop-poison-generating-flags.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/drop-poison-generating-flags.ll @@ -8,7 +8,7 @@ target triple = "aarch64-unknown-linux" define void @check_widen_intrinsic_with_nnan(ptr noalias %dst.0, ptr noalias %dst.1, ptr noalias %src.1, ptr %src.2) { ; CHECK-LABEL: define void @check_widen_intrinsic_with_nnan( ; CHECK-SAME: ptr noalias [[DST_0:%.*]], ptr noalias [[DST_1:%.*]], ptr noalias [[SRC_1:%.*]], ptr [[SRC_2:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -70,10 +70,9 @@ define void @check_widen_intrinsic_with_nnan(ptr noalias %dst.0, ptr noalias %ds ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, 
%[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] ; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr inbounds double, ptr [[SRC_1]], i64 [[IV]] ; CHECK-NEXT: [[L_1:%.*]] = load double, ptr [[GEP_SRC_1]], align 8 ; CHECK-NEXT: [[ABS:%.*]] = tail call nnan double @llvm.fabs.f64(double [[L_1]]) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll index 927d1b82bc48..ad184bec2ac7 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll @@ -6,7 +6,7 @@ target triple = "arm64-apple-macosx14.0.0" define double @test_reduction_costs() { ; CHECK-LABEL: define double @test_reduction_costs() { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -19,14 +19,11 @@ define double @test_reduction_costs() { ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ] -; CHECK-NEXT: [[BC_MERGE_RDX2:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP_1:.*]] ; CHECK: [[LOOP_1]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_1]] ] -; CHECK-NEXT: [[R_1:%.*]] = phi double [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[R_1_NEXT:%.*]], %[[LOOP_1]] ] -; CHECK-NEXT: [[R_2:%.*]] = phi double [ [[BC_MERGE_RDX2]], %[[SCALAR_PH]] ], [ 
[[R_2_NEXT:%.*]], %[[LOOP_1]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_1]] ] +; CHECK-NEXT: [[R_1:%.*]] = phi double [ 0.000000e+00, %[[SCALAR_PH]] ], [ [[R_1_NEXT:%.*]], %[[LOOP_1]] ] +; CHECK-NEXT: [[R_2:%.*]] = phi double [ 0.000000e+00, %[[SCALAR_PH]] ], [ [[R_2_NEXT:%.*]], %[[LOOP_1]] ] ; CHECK-NEXT: [[R_1_NEXT]] = fadd double [[R_1]], 3.000000e+00 ; CHECK-NEXT: [[R_2_NEXT]] = fadd double [[R_2]], 9.000000e+00 ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll index aa2ec2de14c2..fed57c919c6c 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll @@ -171,14 +171,11 @@ define i64 @int_and_pointer_iv(ptr %start, i32 %N) { ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[TMP5]], i32 2 ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[START]], [[ENTRY]] ] -; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ 0, [[ENTRY]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[SCALAR_RECUR:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[RECUR_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[START]], [[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[SCALAR_RECUR:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[RECUR_NEXT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[PTR_IV]], 
align 4 ; CHECK-NEXT: [[RECUR_NEXT]] = zext i32 [[L]] to i64 ; CHECK-NEXT: [[PTR_IV_NEXT]] = getelementptr i8, ptr [[PTR_IV]], i64 4 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll index 8d86de521b41..c23695dc5dbe 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll @@ -65,8 +65,7 @@ define i64 @vector_loop_with_remaining_iterations(ptr %src, ptr noalias %dst, i3 ; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer ; CHECK-NEXT: [[TMP26:%.*]] = mul [[TMP25]], splat (i64 1) ; CHECK-NEXT: [[INDUCTION:%.*]] = add [[BROADCAST_SPLAT4]], [[TMP26]] -; CHECK-NEXT: [[TMP27:%.*]] = mul i64 1, [[TMP17]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement poison, i64 [[TMP27]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement poison, i64 [[TMP17]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector [[BROADCAST_SPLATINSERT5]], poison, zeroinitializer ; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] ; CHECK: [[VEC_EPILOG_VECTOR_BODY]]: @@ -201,8 +200,7 @@ define i64 @main_vector_loop_fixed_with_no_remaining_iterations(ptr %src, ptr no ; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer ; CHECK-NEXT: [[TMP38:%.*]] = mul [[TMP25]], splat (i64 1) ; CHECK-NEXT: [[INDUCTION:%.*]] = add [[BROADCAST_SPLAT4]], [[TMP38]] -; CHECK-NEXT: [[TMP39:%.*]] = mul i64 1, [[TMP17]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement poison, i64 [[TMP39]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement poison, i64 [[TMP17]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector [[BROADCAST_SPLATINSERT5]], poison, zeroinitializer ; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] ; CHECK: 
[[VEC_EPILOG_VECTOR_BODY]]: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/invariant-replicate-region.ll b/llvm/test/Transforms/LoopVectorize/AArch64/invariant-replicate-region.ll index d45dbcc8b166..0327334e220f 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/invariant-replicate-region.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/invariant-replicate-region.ll @@ -8,7 +8,7 @@ target triple = "arm64-apple-macosx14.0.0" define i32 @test_invariant_replicate_region(i32 %x, i1 %c) { ; CHECK-LABEL: define i32 @test_invariant_replicate_region( ; CHECK-SAME: i32 [[X:%.*]], i1 [[C:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C]], i64 0 @@ -52,10 +52,9 @@ define i32 @test_invariant_replicate_region(i32 %x, i1 %c) { ; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i32> [[PREDPHI]], i32 3 ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] ; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; CHECK-NEXT: br i1 [[C]], label %[[THEN:.*]], label %[[LOOP_LATCH]] ; CHECK: [[THEN]]: ; CHECK-NEXT: [[REM_1:%.*]] = urem i32 10, [[X]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll index 110685e377dd..1f486fba069c 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll @@ -474,7 +474,7 @@ while.end: define i32 @tc4(ptr noundef readonly captures(none) 
%tmp) vscale_range(1,16) { ; CHECK-LABEL: define i32 @tc4( ; CHECK-SAME: ptr noundef readonly captures(none) [[TMP:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -490,12 +490,10 @@ define i32 @tc4(ptr noundef readonly captures(none) %tmp) vscale_range(1,16) { ; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]]) ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[FOR_BODY:.*]] ; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[SUM_0179:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[SUM_0179:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i32, ptr [[TMP]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 ; CHECK-NEXT: [[ADD]] = add i32 [[SUM_0179]], [[TMP5]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/mul-simplification.ll b/llvm/test/Transforms/LoopVectorize/AArch64/mul-simplification.ll index 1159a641f5ce..2ca117c33dbb 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/mul-simplification.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/mul-simplification.ll @@ -45,7 +45,7 @@ exit: define i32 @add_reduction_select_operand_constant_but_non_uniform() { ; CHECK-LABEL: define i32 @add_reduction_select_operand_constant_but_non_uniform() { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: 
[[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -63,12 +63,10 @@ define i32 @add_reduction_select_operand_constant_but_non_uniform() { ; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 42, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[ADD2_REASS:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[RDX_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[ADD2_REASS:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ 42, %[[SCALAR_PH]] ], [ [[RDX_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[ADD2_REASS]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[RDX_NEXT]] = add i32 0, [[RDX]] ; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD2_REASS]], 64 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll b/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll index 410abfbc2f2b..67e6902b5d32 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/optsize_minsize.ll @@ -17,7 +17,7 @@ target triple = "aarch64-unknown-linux-gnu" define void @always_vectorize(ptr %p, i32 %x) { ; DEFAULT-LABEL: define void @always_vectorize( ; DEFAULT-SAME: ptr [[P:%.*]], i32 [[X:%.*]]) { -; DEFAULT-NEXT: [[ENTRY:.*]]: +; DEFAULT-NEXT: [[ENTRY:.*:]] ; DEFAULT-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; DEFAULT: [[VECTOR_PH]]: ; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i64 0 @@ -31,10 +31,9 @@ define void @always_vectorize(ptr %p, i32 %x) { ; 
DEFAULT: [[MIDDLE_BLOCK]]: ; DEFAULT-NEXT: br label %[[FOR_COND_CLEANUP:.*]] ; DEFAULT: [[SCALAR_PH]]: -; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; DEFAULT-NEXT: br label %[[FOR_BODY:.*]] ; DEFAULT: [[FOR_BODY]]: -; DEFAULT-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; DEFAULT-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] ; DEFAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[INDVARS_IV]] ; DEFAULT-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; DEFAULT-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP4]], [[X]] @@ -47,7 +46,7 @@ define void @always_vectorize(ptr %p, i32 %x) { ; ; OPTSIZE-LABEL: define void @always_vectorize( ; OPTSIZE-SAME: ptr [[P:%.*]], i32 [[X:%.*]]) #[[ATTR0:[0-9]+]] { -; OPTSIZE-NEXT: [[ENTRY:.*]]: +; OPTSIZE-NEXT: [[ENTRY:.*:]] ; OPTSIZE-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; OPTSIZE: [[VECTOR_PH]]: ; OPTSIZE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i64 0 @@ -61,10 +60,9 @@ define void @always_vectorize(ptr %p, i32 %x) { ; OPTSIZE: [[MIDDLE_BLOCK]]: ; OPTSIZE-NEXT: br label %[[FOR_COND_CLEANUP:.*]] ; OPTSIZE: [[SCALAR_PH]]: -; OPTSIZE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; OPTSIZE-NEXT: br label %[[FOR_BODY:.*]] ; OPTSIZE: [[FOR_BODY]]: -; OPTSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; OPTSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] ; OPTSIZE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[INDVARS_IV]] ; OPTSIZE-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; OPTSIZE-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP4]], [[X]] @@ -77,7 +75,7 @@ define void @always_vectorize(ptr %p, i32 %x) { ; ; MINSIZE-LABEL: define 
void @always_vectorize( ; MINSIZE-SAME: ptr [[P:%.*]], i32 [[X:%.*]]) #[[ATTR0:[0-9]+]] { -; MINSIZE-NEXT: [[ENTRY:.*]]: +; MINSIZE-NEXT: [[ENTRY:.*:]] ; MINSIZE-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; MINSIZE: [[VECTOR_PH]]: ; MINSIZE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i64 0 @@ -91,10 +89,9 @@ define void @always_vectorize(ptr %p, i32 %x) { ; MINSIZE: [[MIDDLE_BLOCK]]: ; MINSIZE-NEXT: br label %[[FOR_COND_CLEANUP:.*]] ; MINSIZE: [[SCALAR_PH]]: -; MINSIZE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; MINSIZE-NEXT: br label %[[FOR_BODY:.*]] ; MINSIZE: [[FOR_BODY]]: -; MINSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; MINSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] ; MINSIZE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[INDVARS_IV]] ; MINSIZE-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; MINSIZE-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP4]], [[X]] @@ -510,8 +507,7 @@ define void @sve_tail_predicate_without_minsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 ; DEFAULT-NEXT: [[TMP11:%.*]] = mul [[TMP10]], splat (i8 1) ; DEFAULT-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP11]] ; DEFAULT-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP1]] to i8 -; DEFAULT-NEXT: [[TMP13:%.*]] = mul i8 1, [[TMP12]] -; DEFAULT-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i8 [[TMP13]], i64 0 +; DEFAULT-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i8 [[TMP12]], i64 0 ; DEFAULT-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]] ; DEFAULT: [[VECTOR_BODY]]: @@ -578,8 +574,7 @@ define void @sve_tail_predicate_without_minsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 ; OPTSIZE-NEXT: [[TMP11:%.*]] = mul [[TMP10]], splat (i8 1) ; OPTSIZE-NEXT: [[INDUCTION:%.*]] = add 
zeroinitializer, [[TMP11]] ; OPTSIZE-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP1]] to i8 -; OPTSIZE-NEXT: [[TMP13:%.*]] = mul i8 1, [[TMP12]] -; OPTSIZE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i8 [[TMP13]], i64 0 +; OPTSIZE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i8 [[TMP12]], i64 0 ; OPTSIZE-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; OPTSIZE-NEXT: br label %[[VECTOR_BODY:.*]] ; OPTSIZE: [[VECTOR_BODY]]: @@ -646,8 +641,7 @@ define void @sve_tail_predicate_without_minsize(ptr %p, i8 %a, i8 %b, i8 %c, i32 ; MINSIZE-NEXT: [[TMP11:%.*]] = mul [[TMP10]], splat (i8 1) ; MINSIZE-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP11]] ; MINSIZE-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP1]] to i8 -; MINSIZE-NEXT: [[TMP13:%.*]] = mul i8 1, [[TMP12]] -; MINSIZE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i8 [[TMP13]], i64 0 +; MINSIZE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i8 [[TMP12]], i64 0 ; MINSIZE-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; MINSIZE-NEXT: br label %[[VECTOR_BODY:.*]] ; MINSIZE: [[VECTOR_BODY]]: @@ -721,7 +715,7 @@ for.cond.cleanup: ; FIXME: We currently vectorize with minsize as the trunc cost is incorrect define void @dont_vectorize_with_minsize() { ; DEFAULT-LABEL: define void @dont_vectorize_with_minsize() { -; DEFAULT-NEXT: [[ENTRY:.*]]: +; DEFAULT-NEXT: [[ENTRY:.*:]] ; DEFAULT-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; DEFAULT: [[VECTOR_PH]]: ; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]] @@ -753,10 +747,9 @@ define void @dont_vectorize_with_minsize() { ; DEFAULT: [[MIDDLE_BLOCK]]: ; DEFAULT-NEXT: br label %[[FOR_COND_CLEANUP:.*]] ; DEFAULT: [[SCALAR_PH]]: -; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; DEFAULT-NEXT: br label %[[FOR_BODY:.*]] ; DEFAULT: [[FOR_BODY]]: -; DEFAULT-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], 
%[[FOR_BODY]] ] +; DEFAULT-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] ; DEFAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 [[INDVARS_IV]] ; DEFAULT-NEXT: [[BVAL:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; DEFAULT-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 [[INDVARS_IV]] @@ -775,7 +768,7 @@ define void @dont_vectorize_with_minsize() { ; ; OPTSIZE-LABEL: define void @dont_vectorize_with_minsize( ; OPTSIZE-SAME: ) #[[ATTR0]] { -; OPTSIZE-NEXT: [[ENTRY:.*]]: +; OPTSIZE-NEXT: [[ENTRY:.*:]] ; OPTSIZE-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; OPTSIZE: [[VECTOR_PH]]: ; OPTSIZE-NEXT: br label %[[VECTOR_BODY:.*]] @@ -797,10 +790,9 @@ define void @dont_vectorize_with_minsize() { ; OPTSIZE: [[MIDDLE_BLOCK]]: ; OPTSIZE-NEXT: br label %[[FOR_COND_CLEANUP:.*]] ; OPTSIZE: [[SCALAR_PH]]: -; OPTSIZE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; OPTSIZE-NEXT: br label %[[FOR_BODY:.*]] ; OPTSIZE: [[FOR_BODY]]: -; OPTSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; OPTSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] ; OPTSIZE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 [[INDVARS_IV]] ; OPTSIZE-NEXT: [[BVAL:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; OPTSIZE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 [[INDVARS_IV]] @@ -819,7 +811,7 @@ define void @dont_vectorize_with_minsize() { ; ; MINSIZE-LABEL: define void @dont_vectorize_with_minsize( ; MINSIZE-SAME: ) #[[ATTR0]] { -; MINSIZE-NEXT: [[ENTRY:.*]]: +; MINSIZE-NEXT: [[ENTRY:.*:]] ; MINSIZE-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; MINSIZE: [[VECTOR_PH]]: ; MINSIZE-NEXT: br label %[[VECTOR_BODY:.*]] @@ -841,10 
+833,9 @@ define void @dont_vectorize_with_minsize() { ; MINSIZE: [[MIDDLE_BLOCK]]: ; MINSIZE-NEXT: br label %[[FOR_COND_CLEANUP:.*]] ; MINSIZE: [[SCALAR_PH]]: -; MINSIZE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; MINSIZE-NEXT: br label %[[FOR_BODY:.*]] ; MINSIZE: [[FOR_BODY]]: -; MINSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; MINSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] ; MINSIZE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 [[INDVARS_IV]] ; MINSIZE-NEXT: [[BVAL:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; MINSIZE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 [[INDVARS_IV]] @@ -891,7 +882,7 @@ for.cond.cleanup: ; FIXME: We currently use width 2 as the load/store cost is incorrect. define void @vectorization_forced_minsize_reduce_width() { ; DEFAULT-LABEL: define void @vectorization_forced_minsize_reduce_width() { -; DEFAULT-NEXT: [[ENTRY:.*]]: +; DEFAULT-NEXT: [[ENTRY:.*:]] ; DEFAULT-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; DEFAULT: [[VECTOR_PH]]: ; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]] @@ -923,10 +914,9 @@ define void @vectorization_forced_minsize_reduce_width() { ; DEFAULT: [[MIDDLE_BLOCK]]: ; DEFAULT-NEXT: br label %[[FOR_COND_CLEANUP:.*]] ; DEFAULT: [[SCALAR_PH]]: -; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; DEFAULT-NEXT: br label %[[FOR_BODY:.*]] ; DEFAULT: [[FOR_BODY]]: -; DEFAULT-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; DEFAULT-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] ; DEFAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 [[INDVARS_IV]] ; DEFAULT-NEXT: [[BVAL:%.*]] = load i32, ptr 
[[ARRAYIDX]], align 4 ; DEFAULT-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 [[INDVARS_IV]] @@ -945,7 +935,7 @@ define void @vectorization_forced_minsize_reduce_width() { ; ; OPTSIZE-LABEL: define void @vectorization_forced_minsize_reduce_width( ; OPTSIZE-SAME: ) #[[ATTR0]] { -; OPTSIZE-NEXT: [[ENTRY:.*]]: +; OPTSIZE-NEXT: [[ENTRY:.*:]] ; OPTSIZE-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; OPTSIZE: [[VECTOR_PH]]: ; OPTSIZE-NEXT: br label %[[VECTOR_BODY:.*]] @@ -967,10 +957,9 @@ define void @vectorization_forced_minsize_reduce_width() { ; OPTSIZE: [[MIDDLE_BLOCK]]: ; OPTSIZE-NEXT: br label %[[FOR_COND_CLEANUP:.*]] ; OPTSIZE: [[SCALAR_PH]]: -; OPTSIZE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; OPTSIZE-NEXT: br label %[[FOR_BODY:.*]] ; OPTSIZE: [[FOR_BODY]]: -; OPTSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; OPTSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] ; OPTSIZE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 [[INDVARS_IV]] ; OPTSIZE-NEXT: [[BVAL:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; OPTSIZE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 [[INDVARS_IV]] @@ -989,7 +978,7 @@ define void @vectorization_forced_minsize_reduce_width() { ; ; MINSIZE-LABEL: define void @vectorization_forced_minsize_reduce_width( ; MINSIZE-SAME: ) #[[ATTR0]] { -; MINSIZE-NEXT: [[ENTRY:.*]]: +; MINSIZE-NEXT: [[ENTRY:.*:]] ; MINSIZE-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; MINSIZE: [[VECTOR_PH]]: ; MINSIZE-NEXT: br label %[[VECTOR_BODY:.*]] @@ -1011,10 +1000,9 @@ define void @vectorization_forced_minsize_reduce_width() { ; MINSIZE: [[MIDDLE_BLOCK]]: ; MINSIZE-NEXT: br label %[[FOR_COND_CLEANUP:.*]] ; MINSIZE: [[SCALAR_PH]]: -; MINSIZE-NEXT: [[BC_RESUME_VAL:%.*]] = 
phi i64 [ 0, %[[ENTRY]] ] ; MINSIZE-NEXT: br label %[[FOR_BODY:.*]] ; MINSIZE: [[FOR_BODY]]: -; MINSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; MINSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] ; MINSIZE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 [[INDVARS_IV]] ; MINSIZE-NEXT: [[BVAL:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; MINSIZE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 [[INDVARS_IV]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_prefer_scalable.ll b/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_prefer_scalable.ll index 87a18ba2c18e..50df6fcd3cdc 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_prefer_scalable.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/outer_loop_prefer_scalable.ll @@ -21,33 +21,32 @@ define void @foo() { ; CHECK-NEXT: [[TMP4:%.*]] = call @llvm.stepvector.nxv4i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul [[TMP4]], splat (i64 1) ; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP5]] -; CHECK-NEXT: [[TMP6:%.*]] = mul i64 1, [[TMP3]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP6]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP3]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_LATCH:%.*]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_LATCH]] ] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x float], ptr @A, i64 0, [[VEC_IND]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4f32.nxv4p0( 
[[TMP7]], i32 4, splat (i1 true), poison) +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x float], ptr @A, i64 0, [[VEC_IND]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4f32.nxv4p0( [[TMP6]], i32 4, splat (i1 true), poison) ; CHECK-NEXT: br label [[INNER_LOOP1:%.*]] ; CHECK: inner_loop1: -; CHECK-NEXT: [[TMP8:%.*]] = phi [ zeroinitializer, [[VECTOR_BODY]] ], [ [[TMP12:%.*]], [[INNER_LOOP1]] ] -; CHECK-NEXT: [[TMP9:%.*]] = phi [ [[WIDE_MASKED_GATHER]], [[VECTOR_BODY]] ], [ [[TMP11:%.*]], [[INNER_LOOP1]] ] -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [512 x float], ptr @B, i64 0, [[TMP8]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call @llvm.masked.gather.nxv4f32.nxv4p0( [[TMP10]], i32 4, splat (i1 true), poison) -; CHECK-NEXT: [[TMP11]] = fmul [[TMP9]], [[WIDE_MASKED_GATHER2]] -; CHECK-NEXT: [[TMP12]] = add nuw nsw [[TMP8]], splat (i64 1) -; CHECK-NEXT: [[TMP13:%.*]] = icmp eq [[TMP12]], splat (i64 512) -; CHECK-NEXT: [[TMP14:%.*]] = extractelement [[TMP13]], i32 0 -; CHECK-NEXT: br i1 [[TMP14]], label [[VECTOR_LATCH]], label [[INNER_LOOP1]] +; CHECK-NEXT: [[TMP7:%.*]] = phi [ zeroinitializer, [[VECTOR_BODY]] ], [ [[TMP11:%.*]], [[INNER_LOOP1]] ] +; CHECK-NEXT: [[TMP8:%.*]] = phi [ [[WIDE_MASKED_GATHER]], [[VECTOR_BODY]] ], [ [[TMP10:%.*]], [[INNER_LOOP1]] ] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [512 x float], ptr @B, i64 0, [[TMP7]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call @llvm.masked.gather.nxv4f32.nxv4p0( [[TMP9]], i32 4, splat (i1 true), poison) +; CHECK-NEXT: [[TMP10]] = fmul [[TMP8]], [[WIDE_MASKED_GATHER2]] +; CHECK-NEXT: [[TMP11]] = add nuw nsw [[TMP7]], splat (i64 1) +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq [[TMP11]], splat (i64 512) +; CHECK-NEXT: [[TMP13:%.*]] = extractelement [[TMP12]], i32 0 +; CHECK-NEXT: br i1 [[TMP13]], label [[VECTOR_LATCH]], label [[INNER_LOOP1]] ; CHECK: vector.latch: -; CHECK-NEXT: [[TMP15:%.*]] = phi [ [[TMP11]], [[INNER_LOOP1]] ] -; CHECK-NEXT: call void 
@llvm.masked.scatter.nxv4f32.nxv4p0( [[TMP15]], [[TMP7]], i32 4, splat (i1 true)) +; CHECK-NEXT: [[TMP14:%.*]] = phi [ [[TMP10]], [[INNER_LOOP1]] ] +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4f32.nxv4p0( [[TMP14]], [[TMP6]], i32 4, splat (i1 true)) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll index 62971b5ea3f8..2521ece2eea0 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-avoid-scalarization.ll @@ -29,8 +29,7 @@ define void @test_no_scalarization(ptr %a, ptr noalias %b, i32 %idx, i32 %n) #0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = mul [[TMP8]], splat (i32 1) ; CHECK-NEXT: [[INDUCTION:%.*]] = add [[DOTSPLAT]], [[TMP10]] -; CHECK-NEXT: [[TMP13:%.*]] = mul i32 1, [[TMP5]] -; CHECK-NEXT: [[DOTSPLATINSERT1:%.*]] = insertelement poison, i32 [[TMP13]], i64 0 +; CHECK-NEXT: [[DOTSPLATINSERT1:%.*]] = insertelement poison, i32 [[TMP5]], i64 0 ; CHECK-NEXT: [[DOTSPLAT2:%.*]] = shufflevector [[DOTSPLATINSERT1]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: diff --git 
a/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll b/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll index 1ff59bd3a4c1..b7016ff4abf8 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll @@ -128,10 +128,9 @@ define i64 @same_exit_block_pre_inc_use4() { ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP8]] ; CHECK-NEXT: br label [[LOOP_END]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[P1]], i64 [[INDEX]] ; CHECK-NEXT: [[LD1:%.*]] = load i64, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[CMP3:%.*]] = icmp ult i64 [[INDEX]], [[LD1]] @@ -202,10 +201,9 @@ define i64 @loop_contains_safe_call() #1 { ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP9]] ; CHECK-NEXT: br label [[LOOP_END]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[P1]], i64 [[INDEX]] ; CHECK-NEXT: [[LD1:%.*]] = load float, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[SQRT:%.*]] = tail call fast float @llvm.sqrt.f32(float [[LD1]]) @@ -367,10 +365,9 @@ define i64 @loop_contains_load_after_early_exit(ptr dereferenceable(1024) align( ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP11]] ; CHECK-NEXT: br label 
[[LOOP_END]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[INDEX]] ; CHECK-NEXT: [[LD1:%.*]] = load i32, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[LD1]], 1 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll index 1dd8dd531e17..ef111caafbf0 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll @@ -34,8 +34,7 @@ define void @pointer_induction_used_as_vector(ptr noalias %start.1, ptr noalias ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[START_2]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP7:%.*]] = call @llvm.stepvector.nxv2i64() -; CHECK-NEXT: [[TMP8:%.*]] = mul [[TMP7]], splat (i64 1) -; CHECK-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], [[TMP8]] +; CHECK-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], [[TMP7]] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8 ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START_1]], i64 [[OFFSET_IDX]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, [[VECTOR_GEP]], i64 1 @@ -45,8 +44,7 @@ define void @pointer_induction_used_as_vector(ptr noalias %start.1, ptr noalias ; CHECK-NEXT: [[TMP12:%.*]] = add [[WIDE_LOAD]], splat (i8 1) ; CHECK-NEXT: store [[TMP12]], ptr [[TMP10]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] -; CHECK-NEXT: 
[[TMP11:%.*]] = mul i64 1, [[TMP3]] -; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP11]] +; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP3]] ; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: @@ -116,15 +114,13 @@ define void @pointer_induction(ptr noalias %start, i64 %N) { ; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[START]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP12:%.*]] = call @llvm.stepvector.nxv2i64() -; CHECK-NEXT: [[TMP14:%.*]] = mul [[TMP12]], splat (i64 1) -; CHECK-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], [[TMP14]] +; CHECK-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], [[TMP12]] ; CHECK-NEXT: [[TMP15:%.*]] = extractelement [[VECTOR_GEP]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP15]], align 1 ; CHECK-NEXT: [[TMP17:%.*]] = add [[WIDE_LOAD]], splat (i8 1) ; CHECK-NEXT: store [[TMP17]], ptr [[TMP15]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX2]], [[TMP4]] -; CHECK-NEXT: [[TMP10:%.*]] = mul i64 1, [[TMP4]] -; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP10]] +; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP4]] ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll index 3b04df3fec5e..e450fe7b54d4 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll +++ 
b/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll @@ -319,10 +319,9 @@ define void @test_v4_v4m(ptr noalias %a, ptr readonly %b) #3 { ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 8 ; CHECK-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR1:[0-9]+]] @@ -372,10 +371,9 @@ define void @test_v2_v4m(ptr noalias %a, ptr readonly %b) #3 { ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 8 ; CHECK-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR2:[0-9]+]] @@ -425,10 +423,9 @@ define void @test_v2_v4(ptr noalias %a, ptr readonly %b) #3 { ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; 
CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 8 ; CHECK-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR3:[0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll index 000e09004368..4dbe0d96f4de 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-remove-loop-region.ll @@ -8,7 +8,7 @@ target triple = "arm64-apple-macosx" define void @load_store_interleave_group_tc_2(ptr noalias %data) { ; VF2-LABEL: define void @load_store_interleave_group_tc_2( ; VF2-SAME: ptr noalias [[DATA:%.*]]) { -; VF2-NEXT: [[ENTRY:.*]]: +; VF2-NEXT: [[ENTRY:.*:]] ; VF2-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; VF2: [[VECTOR_PH]]: ; VF2-NEXT: br label %[[VECTOR_BODY:.*]] @@ -23,10 +23,9 @@ define void @load_store_interleave_group_tc_2(ptr noalias %data) { ; VF2: [[MIDDLE_BLOCK]]: ; VF2-NEXT: br label %[[EXIT:.*]] ; VF2: [[SCALAR_PH]]: -; VF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; VF2-NEXT: br label %[[LOOP:.*]] ; VF2: [[LOOP]]: -; VF2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; VF2-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; VF2-NEXT: [[MUL_2:%.*]] = shl nsw i64 [[IV]], 1 ; VF2-NEXT: [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[MUL_2]] ; VF2-NEXT: [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8 @@ -226,7 +225,7 @@ define void @test_complex_add_float_tc_4(ptr %res, ptr noalias %A, ptr noalias % 
; ; VF2-LABEL: define void @test_complex_add_float_tc_4( ; VF2-SAME: ptr [[RES:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) { -; VF2-NEXT: [[ENTRY:.*]]: +; VF2-NEXT: [[ENTRY:.*:]] ; VF2-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; VF2: [[VECTOR_PH]]: ; VF2-NEXT: br label %[[VECTOR_BODY:.*]] @@ -252,10 +251,9 @@ define void @test_complex_add_float_tc_4(ptr %res, ptr noalias %A, ptr noalias % ; VF2: [[MIDDLE_BLOCK]]: ; VF2-NEXT: br label %[[EXIT:.*]] ; VF2: [[SCALAR_PH]]: -; VF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; VF2-NEXT: br label %[[LOOP:.*]] ; VF2: [[LOOP]]: -; VF2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; VF2-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; VF2-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[A]], i64 [[IV]] ; VF2-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[B]], i64 [[IV]] ; VF2-NEXT: [[L_A_0:%.*]] = load float, ptr [[GEP_A_0]], align 4 @@ -278,7 +276,7 @@ define void @test_complex_add_float_tc_4(ptr %res, ptr noalias %A, ptr noalias % ; ; VF4-LABEL: define void @test_complex_add_float_tc_4( ; VF4-SAME: ptr [[RES:%.*]], ptr noalias [[A:%.*]], ptr noalias [[B:%.*]]) { -; VF4-NEXT: [[ENTRY:.*]]: +; VF4-NEXT: [[ENTRY:.*:]] ; VF4-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; VF4: [[VECTOR_PH]]: ; VF4-NEXT: br label %[[VECTOR_BODY:.*]] @@ -298,10 +296,9 @@ define void @test_complex_add_float_tc_4(ptr %res, ptr noalias %A, ptr noalias % ; VF4: [[MIDDLE_BLOCK]]: ; VF4-NEXT: br label %[[EXIT:.*]] ; VF4: [[SCALAR_PH]]: -; VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; VF4-NEXT: br label %[[LOOP:.*]] ; VF4: [[LOOP]]: -; VF4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; VF4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; VF4-NEXT: 
[[GEP_A_0:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[A]], i64 [[IV]] ; VF4-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds nuw { float, float }, ptr [[B]], i64 [[IV]] ; VF4-NEXT: [[L_A_0:%.*]] = load float, ptr [[GEP_A_0]], align 4 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll index 4df02a78a480..ee5f1929f41e 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-unroll.ll @@ -7,7 +7,7 @@ target triple = "arm64-apple-macosx" define void @load_store_interleave_group(ptr noalias %data) { ; CHECK-LABEL: define void @load_store_interleave_group( ; CHECK-SAME: ptr noalias [[DATA:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -28,10 +28,9 @@ define void @load_store_interleave_group(ptr noalias %data) { ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[MUL_2:%.*]] = shl nsw i64 [[IV]], 1 ; CHECK-NEXT: [[DATA_0:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[MUL_2]] ; CHECK-NEXT: [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8 @@ -70,7 +69,7 @@ exit: define void @test_2xi64_with_wide_load(ptr noalias %data, ptr noalias %factor) { ; CHECK-LABEL: define void @test_2xi64_with_wide_load( ; CHECK-SAME: ptr noalias 
[[DATA:%.*]], ptr noalias [[FACTOR:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -107,10 +106,9 @@ define void @test_2xi64_with_wide_load(ptr noalias %data, ptr noalias %factor) { ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]] ; CHECK-NEXT: [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 ; CHECK-NEXT: [[TMP13:%.*]] = shl nsw i64 [[IV]], 1 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-insertelt.ll b/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-insertelt.ll index a2cbf6f9c5a0..0ada7d0f2257 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-insertelt.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-insertelt.ll @@ -50,10 +50,9 @@ define void @test0(ptr noalias %M3, ptr noalias %A, ptr noalias %B) { ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_INC1286_LOOPEXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[IF_THEN1165_US:%.*]] ; CHECK: if.then1165.us: -; CHECK-NEXT: [[INDVARS_IV1783:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT1784:%.*]], [[IF_THEN1165_US]] ] +; CHECK-NEXT: [[INDVARS_IV1783:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT1784:%.*]], [[IF_THEN1165_US]] ] ; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[INDVARS_IV1783]] ; CHECK-NEXT: 
[[L_A:%.*]] = load i16, ptr [[GEP_A]], align 2 ; CHECK-NEXT: [[CONV1177_US:%.*]] = zext i16 [[L_A]] to i32 @@ -143,10 +142,9 @@ define void @test1(ptr noalias %M3, ptr noalias %A, ptr noalias %B, ptr noalias ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_INC1286_LOOPEXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[IF_THEN1165_US:%.*]] ; CHECK: if.then1165.us: -; CHECK-NEXT: [[INDVARS_IV1783:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT1784:%.*]], [[IF_THEN1165_US]] ] +; CHECK-NEXT: [[INDVARS_IV1783:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT1784:%.*]], [[IF_THEN1165_US]] ] ; CHECK-NEXT: [[FPTR:%.*]] = load i32, ptr [[C]], align 4 ; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[INDVARS_IV1783]] ; CHECK-NEXT: [[L_A:%.*]] = load i16, ptr [[GEP_A]], align 2 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vector-loop-backedge-elimination-epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vector-loop-backedge-elimination-epilogue.ll index a431fdd3178b..21928ce71500 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/vector-loop-backedge-elimination-epilogue.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/vector-loop-backedge-elimination-epilogue.ll @@ -34,18 +34,15 @@ define void @test_remove_vector_loop_region_epilogue(ptr %dst, i1 %c) { ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]] ; CHECK: [[VEC_EPILOG_PH]]: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[TC]], 8 -; CHECK-NEXT: [[N_VEC3:%.*]] = sub i64 [[TC]], [[N_MOD_VF2]] ; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] ; CHECK: [[VEC_EPILOG_VECTOR_BODY]]: ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DST]], i64 [[VEC_EPILOG_RESUME_VAL]] ; 
CHECK-NEXT: store <8 x i8> zeroinitializer, ptr [[TMP5]], align 4 ; CHECK-NEXT: br label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]] ; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: -; CHECK-NEXT: [[CMP_N4:%.*]] = icmp eq i64 [[TC]], [[N_VEC3]] -; CHECK-NEXT: br i1 [[CMP_N4]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]] +; CHECK-NEXT: br i1 true, label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]] ; CHECK: [[VEC_EPILOG_SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TC]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll index 664af3c35b5f..d4e5dea3d4aa 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll @@ -81,8 +81,6 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) { ; CHECK-NEXT: Successor(s): ir-bb, ir-bb ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: -; CHECK-NEXT: EMIT vp<%n.mod.vf> = urem ir<1024>, ir<16> -; CHECK-NEXT: EMIT vp<[[VEC_TC:%.+]]> = sub ir<1024>, vp<%n.mod.vf> ; CHECK-NEXT: EMIT vp<[[RDX_START:%.+]]> = reduction-start-vector ir<0>, ir<0>, ir<4> ; CHECK-NEXT: Successor(s): vector.body ; CHECK-EMPTY: @@ -98,13 +96,12 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) { ; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%ext.b>, ir<%ext.a> ; CHECK-NEXT: PARTIAL-REDUCE ir<%add> = add ir<%accum>, ir<%mul> ; CHECK-NEXT: EMIT vp<[[EP_IV_NEXT:%.+]]> = add nuw vp<[[EP_IV]]>, ir<16> -; CHECK-NEXT: EMIT branch-on-count vp<[[EP_IV_NEXT]]>, vp<[[VEC_TC]]> +; CHECK-NEXT: EMIT 
branch-on-count vp<[[EP_IV_NEXT]]>, ir<1024> ; CHECK-NEXT: Successor(s): middle.block, vector.body ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: ; CHECK-NEXT: EMIT vp<[[RED_RESULT:%.+]]> = compute-reduction-result ir<%accum>, ir<%add> -; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<1024>, vp<[[VEC_TC]]> -; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]> +; CHECK-NEXT: EMIT branch-on-cond ir ; CHECK-NEXT: Successor(s): ir-bb, ir-bb ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: @@ -112,7 +109,7 @@ define i32 @print_partial_reduction(ptr %a, ptr %b) { ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb: -; CHECK-NEXT: EMIT-SCALAR vp<[[EP_RESUME:%.+]]> = phi [ vp<[[VEC_TC]]>, middle.block ], [ ir<0>, ir-bb ] +; CHECK-NEXT: EMIT-SCALAR vp<[[EP_RESUME:%.+]]> = phi [ ir<1024>, middle.block ], [ ir<0>, ir-bb ] ; CHECK-NEXT: EMIT-SCALAR vp<[[EP_MERGE:%.+]]> = phi [ vp<[[RED_RESULT]]>, middle.block ], [ ir<0>, ir-bb ] ; CHECK-NEXT: EMIT-SCALAR vp<%6> = resume-for-epilogue vp<%vec.epilog.resume.val> ; CHECK-NEXT: Successor(s): ir-bb diff --git a/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll b/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll index 3f1d0dc2ff2a..dcf4bee728b2 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/optsize_minsize.ll @@ -17,7 +17,7 @@ target triple = "armv7a-none-eabi" define void @always_vectorize(ptr %p, i32 %x) { ; DEFAULT-LABEL: define void @always_vectorize( ; DEFAULT-SAME: ptr [[P:%.*]], i32 [[X:%.*]]) { -; DEFAULT-NEXT: [[ENTRY:.*]]: +; DEFAULT-NEXT: [[ENTRY:.*:]] ; DEFAULT-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; DEFAULT: [[VECTOR_PH]]: ; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i64 0 @@ -31,10 +31,9 @@ define void @always_vectorize(ptr %p, i32 %x) { ; DEFAULT: [[MIDDLE_BLOCK]]: ; DEFAULT-NEXT: br label %[[FOR_COND_CLEANUP:.*]] ; DEFAULT: [[SCALAR_PH]]: -; DEFAULT-NEXT: 
[[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; DEFAULT-NEXT: br label %[[FOR_BODY:.*]] ; DEFAULT: [[FOR_BODY]]: -; DEFAULT-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; DEFAULT-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] ; DEFAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[INDVARS_IV]] ; DEFAULT-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; DEFAULT-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP4]], [[X]] @@ -47,7 +46,7 @@ define void @always_vectorize(ptr %p, i32 %x) { ; ; OPTSIZE-LABEL: define void @always_vectorize( ; OPTSIZE-SAME: ptr [[P:%.*]], i32 [[X:%.*]]) #[[ATTR0:[0-9]+]] { -; OPTSIZE-NEXT: [[ENTRY:.*]]: +; OPTSIZE-NEXT: [[ENTRY:.*:]] ; OPTSIZE-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; OPTSIZE: [[VECTOR_PH]]: ; OPTSIZE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i64 0 @@ -61,10 +60,9 @@ define void @always_vectorize(ptr %p, i32 %x) { ; OPTSIZE: [[MIDDLE_BLOCK]]: ; OPTSIZE-NEXT: br label %[[FOR_COND_CLEANUP:.*]] ; OPTSIZE: [[SCALAR_PH]]: -; OPTSIZE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; OPTSIZE-NEXT: br label %[[FOR_BODY:.*]] ; OPTSIZE: [[FOR_BODY]]: -; OPTSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; OPTSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] ; OPTSIZE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[INDVARS_IV]] ; OPTSIZE-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; OPTSIZE-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP4]], [[X]] @@ -77,7 +75,7 @@ define void @always_vectorize(ptr %p, i32 %x) { ; ; MINSIZE-LABEL: define void @always_vectorize( ; MINSIZE-SAME: ptr [[P:%.*]], i32 [[X:%.*]]) #[[ATTR0:[0-9]+]] { -; MINSIZE-NEXT: [[ENTRY:.*]]: +; 
MINSIZE-NEXT: [[ENTRY:.*:]] ; MINSIZE-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; MINSIZE: [[VECTOR_PH]]: ; MINSIZE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[X]], i64 0 @@ -91,10 +89,9 @@ define void @always_vectorize(ptr %p, i32 %x) { ; MINSIZE: [[MIDDLE_BLOCK]]: ; MINSIZE-NEXT: br label %[[FOR_COND_CLEANUP:.*]] ; MINSIZE: [[SCALAR_PH]]: -; MINSIZE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; MINSIZE-NEXT: br label %[[FOR_BODY:.*]] ; MINSIZE: [[FOR_BODY]]: -; MINSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; MINSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] ; MINSIZE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[INDVARS_IV]] ; MINSIZE-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; MINSIZE-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP4]], [[X]] @@ -484,7 +481,7 @@ for.cond.cleanup: ; we don't account for the addressing mode difference. 
define void @dont_vectorize_with_minsize() { ; DEFAULT-LABEL: define void @dont_vectorize_with_minsize() { -; DEFAULT-NEXT: [[ENTRY:.*]]: +; DEFAULT-NEXT: [[ENTRY:.*:]] ; DEFAULT-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; DEFAULT: [[VECTOR_PH]]: ; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]] @@ -506,10 +503,9 @@ define void @dont_vectorize_with_minsize() { ; DEFAULT: [[MIDDLE_BLOCK]]: ; DEFAULT-NEXT: br label %[[FOR_COND_CLEANUP:.*]] ; DEFAULT: [[SCALAR_PH]]: -; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; DEFAULT-NEXT: br label %[[FOR_BODY:.*]] ; DEFAULT: [[FOR_BODY]]: -; DEFAULT-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; DEFAULT-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] ; DEFAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 [[INDVARS_IV]] ; DEFAULT-NEXT: [[BVAL:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; DEFAULT-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 [[INDVARS_IV]] @@ -528,7 +524,7 @@ define void @dont_vectorize_with_minsize() { ; ; OPTSIZE-LABEL: define void @dont_vectorize_with_minsize( ; OPTSIZE-SAME: ) #[[ATTR0]] { -; OPTSIZE-NEXT: [[ENTRY:.*]]: +; OPTSIZE-NEXT: [[ENTRY:.*:]] ; OPTSIZE-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; OPTSIZE: [[VECTOR_PH]]: ; OPTSIZE-NEXT: br label %[[VECTOR_BODY:.*]] @@ -550,10 +546,9 @@ define void @dont_vectorize_with_minsize() { ; OPTSIZE: [[MIDDLE_BLOCK]]: ; OPTSIZE-NEXT: br label %[[FOR_COND_CLEANUP:.*]] ; OPTSIZE: [[SCALAR_PH]]: -; OPTSIZE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; OPTSIZE-NEXT: br label %[[FOR_BODY:.*]] ; OPTSIZE: [[FOR_BODY]]: -; OPTSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; OPTSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 
0, %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] ; OPTSIZE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 [[INDVARS_IV]] ; OPTSIZE-NEXT: [[BVAL:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; OPTSIZE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 [[INDVARS_IV]] @@ -572,7 +567,7 @@ define void @dont_vectorize_with_minsize() { ; ; MINSIZE-LABEL: define void @dont_vectorize_with_minsize( ; MINSIZE-SAME: ) #[[ATTR0]] { -; MINSIZE-NEXT: [[ENTRY:.*]]: +; MINSIZE-NEXT: [[ENTRY:.*:]] ; MINSIZE-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; MINSIZE: [[VECTOR_PH]]: ; MINSIZE-NEXT: br label %[[VECTOR_BODY:.*]] @@ -594,10 +589,9 @@ define void @dont_vectorize_with_minsize() { ; MINSIZE: [[MIDDLE_BLOCK]]: ; MINSIZE-NEXT: br label %[[FOR_COND_CLEANUP:.*]] ; MINSIZE: [[SCALAR_PH]]: -; MINSIZE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; MINSIZE-NEXT: br label %[[FOR_BODY:.*]] ; MINSIZE: [[FOR_BODY]]: -; MINSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; MINSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] ; MINSIZE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 [[INDVARS_IV]] ; MINSIZE-NEXT: [[BVAL:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; MINSIZE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 [[INDVARS_IV]] @@ -644,7 +638,7 @@ for.cond.cleanup: ; FIXME: We currently use width 2 as the load/store cost is incorrect. 
define void @vectorization_forced() { ; DEFAULT-LABEL: define void @vectorization_forced() { -; DEFAULT-NEXT: [[ENTRY:.*]]: +; DEFAULT-NEXT: [[ENTRY:.*:]] ; DEFAULT-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; DEFAULT: [[VECTOR_PH]]: ; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]] @@ -666,10 +660,9 @@ define void @vectorization_forced() { ; DEFAULT: [[MIDDLE_BLOCK]]: ; DEFAULT-NEXT: br label %[[FOR_COND_CLEANUP:.*]] ; DEFAULT: [[SCALAR_PH]]: -; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; DEFAULT-NEXT: br label %[[FOR_BODY:.*]] ; DEFAULT: [[FOR_BODY]]: -; DEFAULT-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; DEFAULT-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] ; DEFAULT-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 [[INDVARS_IV]] ; DEFAULT-NEXT: [[BVAL:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; DEFAULT-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 [[INDVARS_IV]] @@ -688,7 +681,7 @@ define void @vectorization_forced() { ; ; OPTSIZE-LABEL: define void @vectorization_forced( ; OPTSIZE-SAME: ) #[[ATTR0]] { -; OPTSIZE-NEXT: [[ENTRY:.*]]: +; OPTSIZE-NEXT: [[ENTRY:.*:]] ; OPTSIZE-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; OPTSIZE: [[VECTOR_PH]]: ; OPTSIZE-NEXT: br label %[[VECTOR_BODY:.*]] @@ -710,10 +703,9 @@ define void @vectorization_forced() { ; OPTSIZE: [[MIDDLE_BLOCK]]: ; OPTSIZE-NEXT: br label %[[FOR_COND_CLEANUP:.*]] ; OPTSIZE: [[SCALAR_PH]]: -; OPTSIZE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; OPTSIZE-NEXT: br label %[[FOR_BODY:.*]] ; OPTSIZE: [[FOR_BODY]]: -; OPTSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; OPTSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ 
[[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] ; OPTSIZE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 [[INDVARS_IV]] ; OPTSIZE-NEXT: [[BVAL:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; OPTSIZE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 [[INDVARS_IV]] @@ -732,7 +724,7 @@ define void @vectorization_forced() { ; ; MINSIZE-LABEL: define void @vectorization_forced( ; MINSIZE-SAME: ) #[[ATTR0]] { -; MINSIZE-NEXT: [[ENTRY:.*]]: +; MINSIZE-NEXT: [[ENTRY:.*:]] ; MINSIZE-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; MINSIZE: [[VECTOR_PH]]: ; MINSIZE-NEXT: br label %[[VECTOR_BODY:.*]] @@ -754,10 +746,9 @@ define void @vectorization_forced() { ; MINSIZE: [[MIDDLE_BLOCK]]: ; MINSIZE-NEXT: br label %[[FOR_COND_CLEANUP:.*]] ; MINSIZE: [[SCALAR_PH]]: -; MINSIZE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; MINSIZE-NEXT: br label %[[FOR_BODY:.*]] ; MINSIZE: [[FOR_BODY]]: -; MINSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; MINSIZE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] ; MINSIZE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @B, i64 0, i64 [[INDVARS_IV]] ; MINSIZE-NEXT: [[BVAL:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; MINSIZE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [1000 x i32], ptr @C, i64 0, i64 [[INDVARS_IV]] diff --git a/llvm/test/Transforms/LoopVectorize/LoongArch/defaults.ll b/llvm/test/Transforms/LoopVectorize/LoongArch/defaults.ll index 17eeafa574ae..8072a3d97313 100644 --- a/llvm/test/Transforms/LoopVectorize/LoongArch/defaults.ll +++ b/llvm/test/Transforms/LoopVectorize/LoongArch/defaults.ll @@ -35,10 +35,9 @@ define void @vector_add(ptr noalias nocapture %a, i64 %v) { ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_END:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: 
[[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] ; CHECK-NEXT: [[ELEM:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 ; CHECK-NEXT: [[ADD:%.*]] = add i64 [[ELEM]], [[V]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll index 983b36caabfc..3af328fb6568 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll @@ -63,10 +63,9 @@ define void @vector_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) { ; FIXED: middle.block: ; FIXED-NEXT: br label [[FOR_END:%.*]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; FIXED-NEXT: br label [[FOR_BODY:%.*]] ; FIXED: for.body: -; FIXED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; FIXED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; FIXED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] ; FIXED-NEXT: [[ELEM:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 ; FIXED-NEXT: [[DIVREM:%.*]] = udiv i64 [[ELEM]], [[V]] @@ -150,10 +149,9 @@ define void @vector_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) { ; FIXED: middle.block: ; FIXED-NEXT: br label [[FOR_END:%.*]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; FIXED-NEXT: br label [[FOR_BODY:%.*]] ; FIXED: for.body: -; FIXED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; FIXED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; FIXED-NEXT: 
[[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] ; FIXED-NEXT: [[ELEM:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 ; FIXED-NEXT: [[DIVREM:%.*]] = sdiv i64 [[ELEM]], [[V]] @@ -237,10 +235,9 @@ define void @vector_urem(ptr noalias nocapture %a, i64 %v, i64 %n) { ; FIXED: middle.block: ; FIXED-NEXT: br label [[FOR_END:%.*]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; FIXED-NEXT: br label [[FOR_BODY:%.*]] ; FIXED: for.body: -; FIXED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; FIXED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; FIXED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] ; FIXED-NEXT: [[ELEM:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 ; FIXED-NEXT: [[DIVREM:%.*]] = urem i64 [[ELEM]], [[V]] @@ -324,10 +321,9 @@ define void @vector_srem(ptr noalias nocapture %a, i64 %v, i64 %n) { ; FIXED: middle.block: ; FIXED-NEXT: br label [[FOR_END:%.*]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; FIXED-NEXT: br label [[FOR_BODY:%.*]] ; FIXED: for.body: -; FIXED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; FIXED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; FIXED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] ; FIXED-NEXT: [[ELEM:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 ; FIXED-NEXT: [[DIVREM:%.*]] = srem i64 [[ELEM]], [[V]] @@ -428,10 +424,9 @@ define void @predicated_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) { ; FIXED: middle.block: ; FIXED-NEXT: br label [[FOR_END:%.*]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; FIXED-NEXT: br label [[FOR_BODY:%.*]] ; FIXED: for.body: -; FIXED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], 
[[LATCH:%.*]] ] +; FIXED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] ; FIXED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] ; FIXED-NEXT: [[ELEM:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 ; FIXED-NEXT: [[C:%.*]] = icmp ne i64 [[V]], 0 @@ -544,10 +539,9 @@ define void @predicated_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) { ; FIXED: middle.block: ; FIXED-NEXT: br label [[FOR_END:%.*]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; FIXED-NEXT: br label [[FOR_BODY:%.*]] ; FIXED: for.body: -; FIXED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] +; FIXED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] ; FIXED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] ; FIXED-NEXT: [[ELEM:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 ; FIXED-NEXT: [[C:%.*]] = icmp ne i64 [[V]], 0 @@ -654,10 +648,9 @@ define void @predicated_udiv_by_constant(ptr noalias nocapture %a, i64 %n) { ; FIXED: middle.block: ; FIXED-NEXT: br label [[FOR_END:%.*]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; FIXED-NEXT: br label [[FOR_BODY:%.*]] ; FIXED: for.body: -; FIXED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] +; FIXED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] ; FIXED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] ; FIXED-NEXT: [[ELEM:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 ; FIXED-NEXT: [[C:%.*]] = icmp ne i64 [[ELEM]], 42 @@ -764,10 +757,9 @@ define void @predicated_sdiv_by_constant(ptr noalias nocapture %a, i64 %n) { ; FIXED: middle.block: ; FIXED-NEXT: br label [[FOR_END:%.*]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; FIXED-NEXT: br label [[FOR_BODY:%.*]] 
; FIXED: for.body: -; FIXED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] +; FIXED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] ; FIXED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] ; FIXED-NEXT: [[ELEM:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 ; FIXED-NEXT: [[C:%.*]] = icmp ne i64 [[ELEM]], 42 @@ -876,10 +868,9 @@ define void @predicated_sdiv_by_minus_one(ptr noalias nocapture %a, i64 %n) { ; FIXED: middle.block: ; FIXED-NEXT: br label [[FOR_END:%.*]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; FIXED-NEXT: br label [[FOR_BODY:%.*]] ; FIXED: for.body: -; FIXED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] +; FIXED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] ; FIXED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]] ; FIXED-NEXT: [[ELEM:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 ; FIXED-NEXT: [[C:%.*]] = icmp ne i8 [[ELEM]], -128 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll b/llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll index 86afa5541044..be6f32a6f4ea 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll @@ -20,8 +20,7 @@ define void @test_wide_integer_induction(ptr noalias %a, i64 %N) { ; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ [[N]], [[ENTRY]] ], [ [[AVL_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) ; CHECK-NEXT: [[TMP12:%.*]] = zext i32 [[TMP11]] to i64 -; CHECK-NEXT: [[TMP13:%.*]] = mul i64 1, [[TMP12]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP13]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = 
insertelement poison, i64 [[TMP12]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[EVL_BASED_IV]] ; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[VEC_IND]], ptr align 8 [[TMP14]], splat (i1 true), i32 [[TMP11]]) diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll index 378478c00cd5..dc963f1bf264 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll @@ -79,10 +79,9 @@ define void @load_store_factor2_i32(ptr %p) { ; FIXED: middle.block: ; FIXED-NEXT: br label [[EXIT:%.*]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] ; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 ; FIXED-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] ; FIXED-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 @@ -251,10 +250,9 @@ define void @load_store_factor2_i64(ptr %p) { ; FIXED: middle.block: ; FIXED-NEXT: br label [[EXIT:%.*]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] ; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 ; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] ; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 @@ -434,10 +432,9 @@ define void 
@load_store_factor3_i32(ptr %p) { ; FIXED: middle.block: ; FIXED-NEXT: br label [[EXIT:%.*]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] ; FIXED-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3 ; FIXED-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] ; FIXED-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 @@ -635,10 +632,9 @@ define void @load_store_factor3_i64(ptr %p) { ; FIXED: middle.block: ; FIXED-NEXT: br label [[EXIT:%.*]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] ; FIXED-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3 ; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] ; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 @@ -845,10 +841,9 @@ define void @load_store_factor4(ptr %p) { ; FIXED: middle.block: ; FIXED-NEXT: br label [[EXIT:%.*]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] ; FIXED-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 4 ; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] ; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 @@ -1084,10 +1079,9 @@ define void @load_store_factor5(ptr %p) { ; FIXED: middle.block: ; FIXED-NEXT: br label [[EXIT:%.*]] ; FIXED: scalar.ph: -; 
FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] ; FIXED-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 5 ; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] ; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 @@ -1351,10 +1345,9 @@ define void @load_store_factor6(ptr %p) { ; FIXED: middle.block: ; FIXED-NEXT: br label [[EXIT:%.*]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] ; FIXED-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 6 ; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] ; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 @@ -1647,10 +1640,9 @@ define void @load_store_factor7(ptr %p) { ; FIXED: middle.block: ; FIXED-NEXT: br label [[EXIT:%.*]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] ; FIXED-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 7 ; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] ; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 @@ -1969,10 +1961,9 @@ define void @load_store_factor8(ptr %p) { ; FIXED: middle.block: ; FIXED-NEXT: br label [[EXIT:%.*]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: loop: 
-; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] ; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 3 ; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] ; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 @@ -2244,10 +2235,9 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) { ; FIXED: middle.block: ; FIXED-NEXT: br label [[EXIT:%.*]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] ; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 ; FIXED-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] ; FIXED-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 @@ -2408,10 +2398,9 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) { ; FIXED: middle.block: ; FIXED-NEXT: br label [[EXIT:%.*]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] ; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 ; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] ; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll index 1bceb871bd99..48e8a1dac348 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll +++ 
b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll @@ -115,8 +115,6 @@ define void @masked_strided_factor2(ptr noalias nocapture readonly %p, ptr noali ; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP1:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 16, i1 true) ; PREDICATED_DATA-WITH-EVL-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i32 [[TMP1]], i64 0 ; PREDICATED_DATA-WITH-EVL-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer -; PREDICATED_DATA-WITH-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[TMP1]], i64 0 -; PREDICATED_DATA-WITH-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP2:%.*]] = call @llvm.stepvector.nxv16i32() ; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP3:%.*]] = icmp ult [[TMP2]], [[BROADCAST_SPLAT4]] ; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP4:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] @@ -138,7 +136,7 @@ define void @masked_strided_factor2(ptr noalias nocapture readonly %p, ptr noali ; PREDICATED_DATA-WITH-EVL-NEXT: call void @llvm.masked.store.nxv32i8.p0( [[INTERLEAVED_VEC]], ptr [[TMP13]], i32 1, [[INTERLEAVED_MASK5]]) ; PREDICATED_DATA-WITH-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i32 [[TMP1]], [[EVL_BASED_IV]] ; PREDICATED_DATA-WITH-EVL-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP1]] -; PREDICATED_DATA-WITH-EVL-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT2]] +; PREDICATED_DATA-WITH-EVL-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT4]] ; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP15:%.*]] = icmp eq i32 [[INDEX_EVL_NEXT]], 1024 ; PREDICATED_DATA-WITH-EVL-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; PREDICATED_DATA-WITH-EVL: middle.block: @@ -298,8 +296,6 @@ define void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noali ; 
PREDICATED_DATA-WITH-EVL-NEXT: [[TMP1:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 16, i1 true) ; PREDICATED_DATA-WITH-EVL-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i32 [[TMP1]], i64 0 ; PREDICATED_DATA-WITH-EVL-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer -; PREDICATED_DATA-WITH-EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[TMP1]], i64 0 -; PREDICATED_DATA-WITH-EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP2:%.*]] = call @llvm.stepvector.nxv16i32() ; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP3:%.*]] = icmp ult [[TMP2]], [[BROADCAST_SPLAT4]] ; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP4:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] @@ -325,7 +321,7 @@ define void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noali ; PREDICATED_DATA-WITH-EVL-NEXT: call void @llvm.masked.store.nxv64i8.p0( [[INTERLEAVED_VEC]], ptr [[TMP18]], i32 1, [[INTERLEAVED_MASK5]]) ; PREDICATED_DATA-WITH-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i32 [[TMP1]], [[EVL_BASED_IV]] ; PREDICATED_DATA-WITH-EVL-NEXT: [[AVL_NEXT]] = sub nuw i32 [[AVL]], [[TMP1]] -; PREDICATED_DATA-WITH-EVL-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT2]] +; PREDICATED_DATA-WITH-EVL-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT4]] ; PREDICATED_DATA-WITH-EVL-NEXT: [[TMP19:%.*]] = icmp eq i32 [[INDEX_EVL_NEXT]], 1024 ; PREDICATED_DATA-WITH-EVL-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; PREDICATED_DATA-WITH-EVL: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll b/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll index 1c4a47a4815f..a46d877825f8 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll +++ 
b/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll @@ -27,8 +27,7 @@ define void @test(ptr noalias nocapture %a, ptr noalias nocapture %b, i32 %v) { ; VLENUNK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i32 [[TMP7]], i64 0 ; VLENUNK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer ; VLENUNK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP7]] to i64 -; VLENUNK-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP15]] -; VLENUNK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TMP9]], i64 0 +; VLENUNK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TMP15]], i64 0 ; VLENUNK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; VLENUNK-NEXT: [[TMP10:%.*]] = call @llvm.stepvector.nxv4i32() ; VLENUNK-NEXT: [[TMP11:%.*]] = icmp ult [[TMP10]], [[BROADCAST_SPLAT4]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/pr87378-vpinstruction-or-drop-poison-generating-flags.ll b/llvm/test/Transforms/LoopVectorize/RISCV/pr87378-vpinstruction-or-drop-poison-generating-flags.ll index edc2b43d132c..37a0a8b4d7c8 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/pr87378-vpinstruction-or-drop-poison-generating-flags.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/pr87378-vpinstruction-or-drop-poison-generating-flags.ll @@ -29,8 +29,7 @@ define void @pr87378_vpinstruction_or_drop_poison_generating_flags(ptr %arg, i64 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement poison, i32 [[TMP25]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector [[BROADCAST_SPLATINSERT7]], poison, zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP25]] to i64 -; CHECK-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP8]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement poison, i64 [[TMP9]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement poison, i64 [[TMP8]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector 
[[BROADCAST_SPLATINSERT5]], poison, zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = call @llvm.stepvector.nxv8i32() ; CHECK-NEXT: [[TMP11:%.*]] = icmp ult [[TMP10]], [[BROADCAST_SPLAT8]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll b/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll index 68afe686c606..5820aaee1677 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/pr88802.ll @@ -26,13 +26,10 @@ define void @test(ptr %p, i64 %a, i8 %b) { ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR_COND]] ] ; CHECK-NEXT: [[AVL:%.*]] = phi i32 [ 9, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[FOR_COND]] ] ; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 2, i1 true) -; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement poison, i32 [[TMP11]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector [[BROADCAST_SPLATINSERT5]], poison, zeroinitializer -; CHECK-NEXT: [[TMP20:%.*]] = mul i32 1, [[TMP11]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement poison, i32 [[TMP20]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement poison, i32 [[TMP11]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector [[BROADCAST_SPLATINSERT7]], poison, zeroinitializer ; CHECK-NEXT: [[TMP19:%.*]] = call @llvm.stepvector.nxv2i32() -; CHECK-NEXT: [[TMP13:%.*]] = icmp ult [[TMP19]], [[BROADCAST_SPLAT6]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp ult [[TMP19]], [[BROADCAST_SPLAT8]] ; CHECK-NEXT: [[TMP14:%.*]] = icmp sge [[VEC_IND]], splat (i32 2) ; CHECK-NEXT: [[TMP15:%.*]] = select [[TMP13]], [[TMP14]], zeroinitializer ; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP15]], [[TMP7]], [[TMP8]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll b/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll index b7a51662a578..e0436e8eb002 100644 --- 
a/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/safe-dep-distance.ll @@ -83,10 +83,9 @@ define void @test_may_clobber(ptr %p) { ; CHECK: middle.block: ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[A1:%.*]] = getelementptr i64, ptr [[P]], i64 [[IV]] ; CHECK-NEXT: [[V:%.*]] = load i64, ptr [[A1]], align 32 ; CHECK-NEXT: [[OFFSET:%.*]] = add i64 [[IV]], 100 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll index 59f1f4a6e54c..47979d358690 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll @@ -18,8 +18,7 @@ define void @single_constant_stride_int_scaled(ptr %p) { ; CHECK-NEXT: [[AVL:%.*]] = phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) ; CHECK-NEXT: [[TMP12:%.*]] = zext i32 [[TMP11]] to i64 -; CHECK-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP12]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP9]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP12]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[TMP14:%.*]] = mul nuw nsw [[VEC_IND]], splat (i64 8) ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[P:%.*]], [[TMP14]] @@ -496,8 +495,7 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) { ; STRIDED-NEXT: [[AVL:%.*]] = 
phi i64 [ 1024, [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; STRIDED-NEXT: [[TMP43:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) ; STRIDED-NEXT: [[TMP44:%.*]] = zext i32 [[TMP43]] to i64 -; STRIDED-NEXT: [[TMP45:%.*]] = mul i64 1, [[TMP44]] -; STRIDED-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement poison, i64 [[TMP45]], i64 0 +; STRIDED-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement poison, i64 [[TMP44]], i64 0 ; STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT9]], poison, zeroinitializer ; STRIDED-NEXT: [[TMP18:%.*]] = mul nuw nsw [[VEC_IND]], [[BROADCAST_SPLAT1]] ; STRIDED-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[P]], [[TMP18]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cast-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cast-intrinsics.ll index fb71f6c187a3..7ea462eed42d 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cast-intrinsics.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cast-intrinsics.ll @@ -1203,8 +1203,7 @@ define void @vp_ptrtoint(ptr %a, ptr %b, i64 %N) { ; IF-EVL-NEXT: [[AVL:%.*]] = phi i64 [ [[N]], %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) ; IF-EVL-NEXT: [[TMP12:%.*]] = zext i32 [[TMP11]] to i64 -; IF-EVL-NEXT: [[TMP13:%.*]] = mul i64 1, [[TMP12]] -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP13]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP12]], i64 0 ; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[B]], [[VEC_IND]] ; IF-EVL-NEXT: [[TMP15:%.*]] = ptrtoint [[TMP14]] to @@ -1247,8 +1246,7 @@ define void @vp_ptrtoint(ptr %a, ptr %b, i64 %N) { ; 
NO-VP-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv2i64() ; NO-VP-NEXT: [[TMP7:%.*]] = mul [[TMP6]], splat (i64 1) ; NO-VP-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP7]] -; NO-VP-NEXT: [[TMP8:%.*]] = mul i64 1, [[TMP3]] -; NO-VP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP8]], i64 0 +; NO-VP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP3]], i64 0 ; NO-VP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; NO-VP-NEXT: br label %[[VECTOR_BODY:.*]] ; NO-VP: [[VECTOR_BODY]]: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cond-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cond-reduction.ll index 8bfd93b53bc7..fe3a723a9bbe 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cond-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-cond-reduction.ll @@ -458,8 +458,7 @@ define i32 @step_cond_add(ptr %a, i64 %n, i32 %start) { ; IF-EVL-OUTLOOP-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-OUTLOOP-NEXT: [[AVL:%.*]] = phi i64 [ [[N]], [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-OUTLOOP-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) -; IF-EVL-OUTLOOP-NEXT: [[TMP13:%.*]] = mul i32 1, [[TMP12]] -; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP13]], i64 0 +; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP12]], i64 0 ; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; IF-EVL-OUTLOOP-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[EVL_BASED_IV1]] ; IF-EVL-OUTLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP14]], splat (i1 true), i32 [[TMP12]]) @@ -509,8 +508,7 @@ define i32 
@step_cond_add(ptr %a, i64 %n, i32 %start) { ; IF-EVL-INLOOP-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-INLOOP-NEXT: [[AVL:%.*]] = phi i64 [ [[N]], [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-INLOOP-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) -; IF-EVL-INLOOP-NEXT: [[TMP12:%.*]] = mul i32 1, [[TMP11]] -; IF-EVL-INLOOP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP12]], i64 0 +; IF-EVL-INLOOP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP11]], i64 0 ; IF-EVL-INLOOP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; IF-EVL-INLOOP-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[EVL_BASED_IV1]] ; IF-EVL-INLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP13]], splat (i1 true), i32 [[TMP11]]) @@ -561,8 +559,7 @@ define i32 @step_cond_add(ptr %a, i64 %n, i32 %start) { ; NO-VP-OUTLOOP-NEXT: [[TMP14:%.*]] = mul [[TMP12]], splat (i32 1) ; NO-VP-OUTLOOP-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP14]] ; NO-VP-OUTLOOP-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP7]] to i32 -; NO-VP-OUTLOOP-NEXT: [[TMP17:%.*]] = mul i32 1, [[TMP16]] -; NO-VP-OUTLOOP-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP17]], i64 0 +; NO-VP-OUTLOOP-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP16]], i64 0 ; NO-VP-OUTLOOP-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; NO-VP-OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]] ; NO-VP-OUTLOOP: vector.body: @@ -618,8 +615,7 @@ define i32 @step_cond_add(ptr %a, i64 %n, i32 %start) { ; NO-VP-INLOOP-NEXT: [[TMP8:%.*]] = mul [[TMP6]], splat (i32 1) ; NO-VP-INLOOP-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP8]] ; NO-VP-INLOOP-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP3]] to i32 -; NO-VP-INLOOP-NEXT: [[TMP11:%.*]] = 
mul i32 1, [[TMP10]] -; NO-VP-INLOOP-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP11]], i64 0 +; NO-VP-INLOOP-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP10]], i64 0 ; NO-VP-INLOOP-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; NO-VP-INLOOP-NEXT: br label [[VECTOR_BODY:%.*]] ; NO-VP-INLOOP: vector.body: @@ -696,13 +692,10 @@ define i32 @step_cond_add_pred(ptr %a, i64 %n, i32 %start) { ; IF-EVL-OUTLOOP-NEXT: [[VEC_IND2:%.*]] = phi [ [[INDUCTION1]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT7:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-OUTLOOP-NEXT: [[AVL:%.*]] = phi i64 [ [[N]], [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-OUTLOOP-NEXT: [[TMP14:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) -; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i32 [[TMP14]], i64 0 -; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer -; IF-EVL-OUTLOOP-NEXT: [[TMP11:%.*]] = mul i32 1, [[TMP14]] -; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[TMP11]], i64 0 +; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[TMP14]], i64 0 ; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; IF-EVL-OUTLOOP-NEXT: [[TMP12:%.*]] = call @llvm.stepvector.nxv4i32() -; IF-EVL-OUTLOOP-NEXT: [[TMP18:%.*]] = icmp ult [[TMP12]], [[BROADCAST_SPLAT4]] +; IF-EVL-OUTLOOP-NEXT: [[TMP18:%.*]] = icmp ult [[TMP12]], [[BROADCAST_SPLAT2]] ; IF-EVL-OUTLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] ; IF-EVL-OUTLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[ARRAYIDX]], splat (i1 true), i32 [[TMP14]]) ; IF-EVL-OUTLOOP-NEXT: [[TMP21:%.*]] = icmp sle [[VP_OP_LOAD]], [[VEC_IND2]] @@ -756,8 +749,7 @@ define i32 @step_cond_add_pred(ptr %a, 
i64 %n, i32 %start) { ; IF-EVL-INLOOP-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-INLOOP-NEXT: [[AVL:%.*]] = phi i64 [ [[N]], [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-INLOOP-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) -; IF-EVL-INLOOP-NEXT: [[TMP12:%.*]] = mul i32 1, [[TMP11]] -; IF-EVL-INLOOP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP12]], i64 0 +; IF-EVL-INLOOP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP11]], i64 0 ; IF-EVL-INLOOP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; IF-EVL-INLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] ; IF-EVL-INLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[ARRAYIDX]], splat (i1 true), i32 [[TMP11]]) @@ -811,8 +803,7 @@ define i32 @step_cond_add_pred(ptr %a, i64 %n, i32 %start) { ; NO-VP-OUTLOOP-NEXT: [[TMP14:%.*]] = mul [[TMP12]], splat (i32 1) ; NO-VP-OUTLOOP-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP14]] ; NO-VP-OUTLOOP-NEXT: [[TMP16:%.*]] = trunc i64 [[TMP7]] to i32 -; NO-VP-OUTLOOP-NEXT: [[TMP17:%.*]] = mul i32 1, [[TMP16]] -; NO-VP-OUTLOOP-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP17]], i64 0 +; NO-VP-OUTLOOP-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP16]], i64 0 ; NO-VP-OUTLOOP-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; NO-VP-OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]] ; NO-VP-OUTLOOP: vector.body: @@ -872,8 +863,7 @@ define i32 @step_cond_add_pred(ptr %a, i64 %n, i32 %start) { ; NO-VP-INLOOP-NEXT: [[TMP8:%.*]] = mul [[TMP6]], splat (i32 1) ; NO-VP-INLOOP-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP8]] ; NO-VP-INLOOP-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP3]] to i32 -; NO-VP-INLOOP-NEXT: [[TMP11:%.*]] = mul i32 1, 
[[TMP10]] -; NO-VP-INLOOP-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP11]], i64 0 +; NO-VP-INLOOP-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP10]], i64 0 ; NO-VP-INLOOP-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; NO-VP-INLOOP-NEXT: br label [[VECTOR_BODY:%.*]] ; NO-VP-INLOOP: vector.body: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-fixed-order-recurrence.ll index c0988380f8f1..bf54f669b65f 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-fixed-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-fixed-order-recurrence.ll @@ -603,8 +603,7 @@ define void @first_order_recurrence_indvar(ptr noalias %A, i64 %TC) { ; IF-EVL-NEXT: [[PREV_EVL:%.*]] = phi i32 [ [[TMP5]], %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[TMP11]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) ; IF-EVL-NEXT: [[TMP7:%.*]] = zext i32 [[TMP11]] to i64 -; IF-EVL-NEXT: [[TMP8:%.*]] = mul i64 1, [[TMP7]] -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP8]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP7]], i64 0 ; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; IF-EVL-NEXT: [[TMP20]] = add [[VEC_IND]], splat (i64 42) ; IF-EVL-NEXT: [[TMP15:%.*]] = call @llvm.experimental.vp.splice.nxv2i64( [[VECTOR_RECUR]], [[TMP20]], i32 -1, splat (i1 true), i32 [[PREV_EVL]], i32 [[TMP11]]) @@ -647,8 +646,7 @@ define void @first_order_recurrence_indvar(ptr noalias %A, i64 %TC) { ; NO-VP-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv2i64() ; NO-VP-NEXT: [[TMP7:%.*]] = mul [[TMP6]], splat (i64 1) ; NO-VP-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP7]] -; NO-VP-NEXT: [[TMP10:%.*]] = mul i64 1, [[TMP3]] -; 
NO-VP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP10]], i64 0 +; NO-VP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP3]], i64 0 ; NO-VP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; NO-VP-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32() ; NO-VP-NEXT: [[TMP16:%.*]] = mul nuw i32 [[TMP14]], 2 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-gather-scatter.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-gather-scatter.ll index e16bb64073a0..6f723c268914 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-gather-scatter.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-gather-scatter.ll @@ -22,8 +22,7 @@ define void @gather_scatter(ptr noalias %in, ptr noalias %out, ptr noalias %inde ; IF-EVL-NEXT: [[AVL:%.*]] = phi i64 [ [[N:%.*]], [[ENTRY]] ], [ [[AVL_NEXT:%.*]], [[FOR_BODY]] ] ; IF-EVL-NEXT: [[TMP11:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) ; IF-EVL-NEXT: [[TMP12:%.*]] = zext i32 [[TMP11]] to i64 -; IF-EVL-NEXT: [[TMP13:%.*]] = mul i64 1, [[TMP12]] -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP13]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP12]], i64 0 ; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[INDEX:%.*]], [[VEC_IND]] ; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.vp.gather.nxv2i64.nxv2p0( align 8 [[TMP14]], splat (i1 true), i32 [[TMP11]]) @@ -69,8 +68,7 @@ define void @gather_scatter(ptr noalias %in, ptr noalias %out, ptr noalias %inde ; NO-VP-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv2i64() ; NO-VP-NEXT: [[TMP7:%.*]] = mul [[TMP6]], splat (i64 1) ; NO-VP-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP7]] -; NO-VP-NEXT: [[TMP8:%.*]] = mul i64 1, 
[[TMP3]] -; NO-VP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP8]], i64 0 +; NO-VP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP3]], i64 0 ; NO-VP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; NO-VP-NEXT: br label [[FOR_BODY:%.*]] ; NO-VP: vector.body: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll index fbd4658e18b5..332c16e8eb65 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-interleave.ll @@ -148,8 +148,7 @@ define i32 @load_factor_4_with_gap(i64 %n, ptr noalias %a) { ; IF-EVL-NEXT: [[AVL:%.*]] = phi i64 [ [[N:%.*]], [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[TMP4:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) ; IF-EVL-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64 -; IF-EVL-NEXT: [[TMP6:%.*]] = mul i64 1, [[TMP5]] -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP6]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP5]], i64 0 ; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; IF-EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds [4 x i32], ptr [[A:%.*]], [[VEC_IND]], i32 0 ; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.vp.gather.nxv4i32.nxv4p0( align 4 [[TMP7]], splat (i1 true), i32 [[TMP4]]) @@ -206,8 +205,7 @@ define i32 @load_factor_4_with_gap(i64 %n, ptr noalias %a) { ; NO-VP-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv4i64() ; NO-VP-NEXT: [[TMP7:%.*]] = mul [[TMP6]], splat (i64 1) ; NO-VP-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP7]] -; NO-VP-NEXT: [[TMP8:%.*]] = mul i64 1, [[TMP3]] -; NO-VP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 
[[TMP8]], i64 0 +; NO-VP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP3]], i64 0 ; NO-VP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] ; NO-VP: vector.body: @@ -299,8 +297,7 @@ define void @store_factor_4_with_gap(i32 %n, ptr noalias %a) { ; IF-EVL-NEXT: [[VEC_IND2:%.*]] = phi [ [[INDUCTION1]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT5:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[AVL:%.*]] = phi i32 [ [[N:%.*]], [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 4, i1 true) -; IF-EVL-NEXT: [[TMP7:%.*]] = mul i32 1, [[TMP6]] -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP7]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP6]], i64 0 ; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds [4 x i32], ptr [[A:%.*]], [[VEC_IND2]], i32 0 ; IF-EVL-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0( [[VEC_IND2]], align 4 [[TMP10]], splat (i1 true), i32 [[TMP6]]) @@ -345,8 +342,7 @@ define void @store_factor_4_with_gap(i32 %n, ptr noalias %a) { ; NO-VP-NEXT: [[TMP7:%.*]] = call @llvm.stepvector.nxv4i32() ; NO-VP-NEXT: [[TMP8:%.*]] = mul [[TMP7]], splat (i32 1) ; NO-VP-NEXT: [[INDUCTION1:%.*]] = add zeroinitializer, [[TMP8]] -; NO-VP-NEXT: [[TMP10:%.*]] = mul i32 1, [[TMP9]] -; NO-VP-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement poison, i32 [[TMP10]], i64 0 +; NO-VP-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement poison, i32 [[TMP9]], i64 0 ; NO-VP-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector [[BROADCAST_SPLATINSERT2]], poison, zeroinitializer ; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] ; NO-VP: vector.body: @@ -428,8 +424,7 @@ define i32 @load_factor_4_with_tail_gap(i64 %n, ptr noalias 
%a) { ; IF-EVL-NEXT: [[AVL:%.*]] = phi i64 [ [[N:%.*]], [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[TMP4:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 4, i1 true) ; IF-EVL-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64 -; IF-EVL-NEXT: [[TMP6:%.*]] = mul i64 1, [[TMP5]] -; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP6]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP5]], i64 0 ; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; IF-EVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds [4 x i32], ptr [[A:%.*]], [[VEC_IND]], i32 0 ; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.vp.gather.nxv4i32.nxv4p0( align 4 [[TMP7]], splat (i1 true), i32 [[TMP4]]) @@ -486,8 +481,7 @@ define i32 @load_factor_4_with_tail_gap(i64 %n, ptr noalias %a) { ; NO-VP-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv4i64() ; NO-VP-NEXT: [[TMP7:%.*]] = mul [[TMP6]], splat (i64 1) ; NO-VP-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP7]] -; NO-VP-NEXT: [[TMP8:%.*]] = mul i64 1, [[TMP3]] -; NO-VP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP8]], i64 0 +; NO-VP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP3]], i64 0 ; NO-VP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] ; NO-VP: vector.body: @@ -580,8 +574,7 @@ define void @store_factor_4_with_tail_gap(i32 %n, ptr noalias %a) { ; IF-EVL-NEXT: [[VEC_IND2:%.*]] = phi [ [[INDUCTION1]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT5:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[AVL:%.*]] = phi i32 [ [[N:%.*]], [[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], [[VECTOR_BODY]] ] ; IF-EVL-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 4, i1 true) -; IF-EVL-NEXT: [[TMP7:%.*]] = mul i32 1, [[TMP6]] -; IF-EVL-NEXT: 
[[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP7]], i64 0 +; IF-EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[TMP6]], i64 0 ; IF-EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; IF-EVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds [4 x i32], ptr [[A:%.*]], [[VEC_IND2]], i32 0 ; IF-EVL-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0( [[VEC_IND2]], align 4 [[TMP10]], splat (i1 true), i32 [[TMP6]]) @@ -626,8 +619,7 @@ define void @store_factor_4_with_tail_gap(i32 %n, ptr noalias %a) { ; NO-VP-NEXT: [[TMP7:%.*]] = call @llvm.stepvector.nxv4i32() ; NO-VP-NEXT: [[TMP8:%.*]] = mul [[TMP7]], splat (i32 1) ; NO-VP-NEXT: [[INDUCTION1:%.*]] = add zeroinitializer, [[TMP8]] -; NO-VP-NEXT: [[TMP10:%.*]] = mul i32 1, [[TMP9]] -; NO-VP-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement poison, i32 [[TMP10]], i64 0 +; NO-VP-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement poison, i32 [[TMP9]], i64 0 ; NO-VP-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector [[BROADCAST_SPLATINSERT2]], poison, zeroinitializer ; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] ; NO-VP: vector.body: diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-safe-dep-distance.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-safe-dep-distance.ll index 6e810f71102d..21e87fdc7586 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-safe-dep-distance.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-safe-dep-distance.ll @@ -127,10 +127,9 @@ define void @test_may_clobber1(ptr %p) { ; IF-EVL: middle.block: ; IF-EVL-NEXT: br label [[EXIT:%.*]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; IF-EVL-NEXT: br label [[LOOP:%.*]] ; IF-EVL: loop: -; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] ; 
IF-EVL-NEXT: [[A1:%.*]] = getelementptr i64, ptr [[P]], i64 [[IV]] ; IF-EVL-NEXT: [[V:%.*]] = load i64, ptr [[A1]], align 32 ; IF-EVL-NEXT: [[OFFSET:%.*]] = add i64 [[IV]], 100 @@ -160,10 +159,9 @@ define void @test_may_clobber1(ptr %p) { ; NO-VP: middle.block: ; NO-VP-NEXT: br label [[EXIT:%.*]] ; NO-VP: scalar.ph: -; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; NO-VP-NEXT: br label [[LOOP:%.*]] ; NO-VP: loop: -; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] ; NO-VP-NEXT: [[A1:%.*]] = getelementptr i64, ptr [[P]], i64 [[IV]] ; NO-VP-NEXT: [[V:%.*]] = load i64, ptr [[A1]], align 32 ; NO-VP-NEXT: [[OFFSET:%.*]] = add i64 [[IV]], 100 @@ -263,10 +261,9 @@ define void @test_may_clobber3(ptr %p) { ; IF-EVL: middle.block: ; IF-EVL-NEXT: br label [[EXIT:%.*]] ; IF-EVL: scalar.ph: -; IF-EVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; IF-EVL-NEXT: br label [[LOOP:%.*]] ; IF-EVL: loop: -; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; IF-EVL-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] ; IF-EVL-NEXT: [[A1:%.*]] = getelementptr i64, ptr [[P]], i64 [[IV]] ; IF-EVL-NEXT: [[V:%.*]] = load i64, ptr [[A1]], align 32 ; IF-EVL-NEXT: [[OFFSET:%.*]] = add i64 [[IV]], 10 @@ -296,10 +293,9 @@ define void @test_may_clobber3(ptr %p) { ; NO-VP: middle.block: ; NO-VP-NEXT: br label [[EXIT:%.*]] ; NO-VP: scalar.ph: -; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; NO-VP-NEXT: br label [[LOOP:%.*]] ; NO-VP: loop: -; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; NO-VP-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] ; NO-VP-NEXT: [[A1:%.*]] = getelementptr i64, ptr [[P]], i64 [[IV]] ; NO-VP-NEXT: [[V:%.*]] = load i64, 
ptr [[A1]], align 32 ; NO-VP-NEXT: [[OFFSET:%.*]] = add i64 [[IV]], 10 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll index 7e10ce6def11..e3a93cbf450a 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll @@ -282,8 +282,7 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca ; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i32 [[TMP17]], i64 0 ; SCALABLE-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer ; SCALABLE-NEXT: [[TMP8:%.*]] = zext i32 [[TMP17]] to i64 -; SCALABLE-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP8]] -; SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP9]], i64 0 +; SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP8]], i64 0 ; SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; SCALABLE-NEXT: [[TMP18:%.*]] = call @llvm.stepvector.nxv4i32() ; SCALABLE-NEXT: [[TMP11:%.*]] = icmp ult [[TMP18]], [[BROADCAST_SPLAT4]] @@ -387,8 +386,7 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca ; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i32 [[TMP7]], i64 0 ; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer ; TF-SCALABLE-NEXT: [[TMP11:%.*]] = zext i32 [[TMP7]] to i64 -; TF-SCALABLE-NEXT: [[TMP8:%.*]] = mul i64 1, [[TMP11]] -; TF-SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP8]], i64 0 +; TF-SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP11]], i64 0 ; TF-SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; TF-SCALABLE-NEXT: [[TMP16:%.*]] = call @llvm.stepvector.nxv4i32() ; TF-SCALABLE-NEXT: 
[[ACTIVE_LANE_MASK:%.*]] = icmp ult [[TMP16]], [[BROADCAST_SPLAT4]] @@ -710,8 +708,7 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias ; SCALABLE-NEXT: [[AVL:%.*]] = phi i64 [ 1025, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] ; SCALABLE-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) ; SCALABLE-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 -; SCALABLE-NEXT: [[INDEX:%.*]] = mul i64 1, [[TMP8]] -; SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[INDEX]], i64 0 +; SCALABLE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP8]], i64 0 ; SCALABLE-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; SCALABLE-NEXT: call void @llvm.vp.scatter.nxv2i64.nxv2p0( [[VEC_IND]], align 8 [[BROADCAST_SPLAT1]], splat (i1 true), i32 [[TMP7]]) ; SCALABLE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP10]] @@ -794,8 +791,7 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias ; TF-SCALABLE-NEXT: [[AVL:%.*]] = phi i64 [ 1025, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] ; TF-SCALABLE-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) ; TF-SCALABLE-NEXT: [[TMP13:%.*]] = zext i32 [[TMP9]] to i64 -; TF-SCALABLE-NEXT: [[TMP8:%.*]] = mul i64 1, [[TMP13]] -; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TMP8]], i64 0 +; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TMP13]], i64 0 ; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; TF-SCALABLE-NEXT: call void @llvm.vp.scatter.nxv2i64.nxv2p0( [[VEC_IND]], align 8 [[BROADCAST_SPLAT]], splat (i1 true), i32 [[TMP9]]) ; TF-SCALABLE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] @@ -857,8 +853,7 @@ define void 
@conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc ; SCALABLE-NEXT: [[AVL:%.*]] = phi i64 [ 1025, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] ; SCALABLE-NEXT: [[TMP7:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) ; SCALABLE-NEXT: [[TMP14:%.*]] = zext i32 [[TMP7]] to i64 -; SCALABLE-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP14]] -; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP9]], i64 0 +; SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP14]], i64 0 ; SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; SCALABLE-NEXT: [[TMP10:%.*]] = icmp ugt [[VEC_IND]], splat (i64 10) ; SCALABLE-NEXT: call void @llvm.vp.scatter.nxv2i64.nxv2p0( [[BROADCAST_SPLAT1]], align 8 [[BROADCAST_SPLAT2]], [[TMP10]], i32 [[TMP7]]) @@ -956,8 +951,7 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc ; TF-SCALABLE-NEXT: [[AVL:%.*]] = phi i64 [ 1025, %[[VECTOR_PH]] ], [ [[AVL_NEXT:%.*]], %[[VECTOR_BODY]] ] ; TF-SCALABLE-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) ; TF-SCALABLE-NEXT: [[TMP11:%.*]] = zext i32 [[TMP9]] to i64 -; TF-SCALABLE-NEXT: [[TMP8:%.*]] = mul i64 1, [[TMP11]] -; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP8]], i64 0 +; TF-SCALABLE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP11]], i64 0 ; TF-SCALABLE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; TF-SCALABLE-NEXT: [[TMP10:%.*]] = icmp ugt [[VEC_IND]], splat (i64 10) ; TF-SCALABLE-NEXT: call void @llvm.vp.scatter.nxv2i64.nxv2p0( [[BROADCAST_SPLAT1]], align 8 [[BROADCAST_SPLAT2]], [[TMP10]], i32 [[TMP9]]) diff --git a/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll 
b/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll index a91bc656cc7e..87f81881be32 100644 --- a/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll +++ b/llvm/test/Transforms/LoopVectorize/SystemZ/scalar-steps-with-users-demanding-all-lanes-and-first-lane-only.ll @@ -11,7 +11,7 @@ target datalayout = "E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-v128:64-a:8:16-n32:64" define void @test_scalar_iv_steps_used_by_replicate_and_first_lane_only_vpinst(ptr noalias %dst, ptr noalias %src.1) { ; CHECK-LABEL: define void @test_scalar_iv_steps_used_by_replicate_and_first_lane_only_vpinst( ; CHECK-SAME: ptr noalias [[DST:%.*]], ptr noalias [[SRC_1:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -67,10 +67,9 @@ define void @test_scalar_iv_steps_used_by_replicate_and_first_lane_only_vpinst(p ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] ; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; CHECK-NEXT: [[MUL_IV:%.*]] = mul nsw i64 [[IV]], 4 ; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr inbounds i8, ptr [[SRC_1]], i64 [[MUL_IV]] ; CHECK-NEXT: [[L_1:%.*]] = load i8, ptr [[GEP_SRC_1]], align 1 diff --git a/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll b/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll index c7a0bcb71d11..a614d9a17550 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll +++ 
b/llvm/test/Transforms/LoopVectorize/X86/constant-fold.ll @@ -24,10 +24,9 @@ define void @f1() { ; CHECK: middle.block: ; CHECK-NEXT: br label [[BB3:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 0, [[BB1:%.*]] ] ; CHECK-NEXT: br label [[BB2:%.*]] ; CHECK: bb2: -; CHECK-NEXT: [[C_1_0:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[_TMP9:%.*]], [[BB2]] ] +; CHECK-NEXT: [[C_1_0:%.*]] = phi i16 [ 0, [[SCALAR_PH]] ], [ [[_TMP9:%.*]], [[BB2]] ] ; CHECK-NEXT: [[_TMP1:%.*]] = zext i16 0 to i64 ; CHECK-NEXT: [[_TMP2:%.*]] = getelementptr [1 x %rec8], ptr @a, i16 0, i64 [[_TMP1]] ; CHECK-NEXT: [[_TMP6:%.*]] = sext i16 [[C_1_0]] to i64 diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-constant-known-via-scev.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-constant-known-via-scev.ll index 04e0dafba6b8..aecfc668cf29 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/cost-constant-known-via-scev.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/cost-constant-known-via-scev.ll @@ -62,7 +62,7 @@ exit: ; Test case for https://github.com/llvm/llvm-project/issues/109528. 
define i64 @second_lshr_operand_zero_via_scev() { ; CHECK-LABEL: define i64 @second_lshr_operand_zero_via_scev() { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[EXT_0:%.*]] = sext i8 0 to i32 ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: @@ -97,12 +97,10 @@ define i64 @second_lshr_operand_zero_via_scev() { ; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> [[BIN_RDX]]) ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOPS:.*]] ; CHECK: [[LOOPS]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOPS]] ] -; CHECK-NEXT: [[RED:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOPS]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOPS]] ] +; CHECK-NEXT: [[RED:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOPS]] ] ; CHECK-NEXT: [[C:%.*]] = icmp eq i64 [[IV]], 0 ; CHECK-NEXT: [[AND:%.*]] = and i64 [[IV]], 0 ; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[IV]] to i32 diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll index a30a8c9e6a02..472aa0b5b716 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll @@ -1108,12 +1108,10 @@ define i64 @cost_loop_invariant_recipes(i1 %x, i64 %y) { ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> [[TMP3]]) ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 1, [[ENTRY]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 
[[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT_I_I_I:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[RED:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RED_MUL:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT_I_I_I:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[RED:%.*]] = phi i64 [ 1, [[SCALAR_PH]] ], [ [[RED_MUL:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[NOT_X:%.*]] = xor i1 [[X]], true ; CHECK-NEXT: [[EXT:%.*]] = zext i1 [[NOT_X]] to i64 ; CHECK-NEXT: [[SHL:%.*]] = shl i64 [[Y]], [[EXT]] @@ -1165,12 +1163,10 @@ define i32 @narrowed_reduction(ptr %a, i1 %cmp) #0 { ; CHECK-NEXT: [[TMP21:%.*]] = zext i1 [[TMP20]] to i32 ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ] ; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INC:%.*]], [[LOOP1]] ] -; CHECK-NEXT: [[OR13:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[VEC_EPILOG_PH]] ], [ [[OR:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 1, [[VEC_EPILOG_PH]] ], [ [[INC:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OR13:%.*]] = phi i32 [ 0, [[VEC_EPILOG_PH]] ], [ [[OR:%.*]], [[LOOP1]] ] ; CHECK-NEXT: [[AND:%.*]] = and i32 [[OR13]], 1 ; CHECK-NEXT: [[OR]] = or i32 [[AND]], [[CONV]] ; CHECK-NEXT: [[INC]] = add i32 [[IV]], 1 diff --git a/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll b/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll index 7fe4c14781e8..8164c10ac371 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/imprecise-through-phis.ll @@ -92,12 +92,10 @@ define double @sumIfVector(ptr nocapture readonly %arr) { ; SSE-NEXT: [[TMP11:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> [[BIN_RDX]]) ; SSE-NEXT: br label [[DONE:%.*]] ; SSE: scalar.ph: -; 
SSE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ] -; SSE-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ] ; SSE-NEXT: br label [[LOOP:%.*]] ; SSE: loop: -; SSE-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[NEXT_ITER:%.*]] ] -; SSE-NEXT: [[TOT:%.*]] = phi double [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TOT_NEXT:%.*]], [[NEXT_ITER]] ] +; SSE-NEXT: [[I:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[NEXT_ITER:%.*]] ] +; SSE-NEXT: [[TOT:%.*]] = phi double [ 0.000000e+00, [[SCALAR_PH]] ], [ [[TOT_NEXT:%.*]], [[NEXT_ITER]] ] ; SSE-NEXT: [[ADDR:%.*]] = getelementptr double, ptr [[ARR]], i32 [[I]] ; SSE-NEXT: [[NEXTVAL:%.*]] = load double, ptr [[ADDR]], align 8 ; SSE-NEXT: [[TST:%.*]] = fcmp fast une double [[NEXTVAL]], 4.200000e+01 @@ -157,12 +155,10 @@ define double @sumIfVector(ptr nocapture readonly %arr) { ; AVX-NEXT: [[TMP21:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[BIN_RDX11]]) ; AVX-NEXT: br label [[DONE:%.*]] ; AVX: scalar.ph: -; AVX-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ] -; AVX-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ] ; AVX-NEXT: br label [[LOOP:%.*]] ; AVX: loop: -; AVX-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[NEXT_ITER:%.*]] ] -; AVX-NEXT: [[TOT:%.*]] = phi double [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TOT_NEXT:%.*]], [[NEXT_ITER]] ] +; AVX-NEXT: [[I:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[NEXT_ITER:%.*]] ] +; AVX-NEXT: [[TOT:%.*]] = phi double [ 0.000000e+00, [[SCALAR_PH]] ], [ [[TOT_NEXT:%.*]], [[NEXT_ITER]] ] ; AVX-NEXT: [[ADDR:%.*]] = getelementptr double, ptr [[ARR]], i32 [[I]] ; AVX-NEXT: [[NEXTVAL:%.*]] = load double, ptr [[ADDR]], align 8 ; AVX-NEXT: [[TST:%.*]] = fcmp fast une double [[NEXTVAL]], 4.200000e+01 diff --git a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll 
b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll index 116a3822eac6..ff2846f235c9 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll @@ -413,12 +413,10 @@ define i16 @iv_and_step_trunc() { ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <2 x i16> [[TMP2]], i32 0 ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ 0, [[ENTRY]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[SCALAR_RECUR:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[REC_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[SCALAR_RECUR:%.*]] = phi i16 [ 0, [[SCALAR_PH]] ], [ [[REC_NEXT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[IV]] to i16 ; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[IV_NEXT]] to i16 diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll index 6d562be03a9b..361482e4cb74 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll @@ -7,7 +7,7 @@ target triple = "x86_64-apple-macosx10.15.0" define void @test_free_instructions_feeding_geps_for_interleave_groups(ptr noalias %p.invar, ptr noalias %dst.1, ptr noalias %dst.2) { ; CHECK-LABEL: define void @test_free_instructions_feeding_geps_for_interleave_groups( ; CHECK-SAME: ptr noalias [[P_INVAR:%.*]], ptr noalias [[DST_1:%.*]], ptr noalias [[DST_2:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label 
%[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -39,10 +39,9 @@ define void @test_free_instructions_feeding_geps_for_interleave_groups(ptr noali ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[L_0:%.*]] = load float, ptr [[P_INVAR]], align 4 ; CHECK-NEXT: [[IV_MUL:%.*]] = shl i64 [[IV]], 2 ; CHECK-NEXT: [[GEP_DST_19:%.*]] = getelementptr float, ptr [[DST_1]], i64 [[IV_MUL]] @@ -507,7 +506,7 @@ exit: define void @interleave_store_double_i64(ptr %dst) { ; CHECK-LABEL: define void @interleave_store_double_i64( ; CHECK-SAME: ptr [[DST:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -525,10 +524,9 @@ define void @interleave_store_double_i64(ptr %dst) { ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr { double, i64 }, ptr [[DST]], i64 [[IV]], i32 1 ; CHECK-NEXT: store i64 [[IV]], ptr [[GEP_1]], align 8 ; CHECK-NEXT: [[GEP_0:%.*]] = getelementptr { double, i64 }, ptr [[DST]], i64 [[IV]] @@ -628,7 +626,7 @@ exit: define void @interleave_store_i64_double_2(ptr %dst) { ; CHECK-LABEL: define void 
@interleave_store_i64_double_2( ; CHECK-SAME: ptr [[DST:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -646,10 +644,9 @@ define void @interleave_store_i64_double_2(ptr %dst) { ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[GEP_0:%.*]] = getelementptr { i64, double }, ptr [[DST]], i64 [[IV]] ; CHECK-NEXT: store i64 [[IV]], ptr [[GEP_0]], align 8 ; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr { i64, double }, ptr [[DST]], i64 [[IV]], i32 1 diff --git a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll index f615e23bcb8b..452868ddd59c 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll @@ -65,12 +65,10 @@ define i32 @test_explicit_pred(i64 %len) { ; CHECK-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX14]]) ; CHECK-NEXT: br label [[LOOP_EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] -; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] +; 
CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[EARLYCND:%.*]] = icmp slt i64 [[IV]], [[LEN]] ; CHECK-NEXT: br i1 [[EARLYCND]], label [[PRED:%.*]], label [[LATCH]] @@ -216,12 +214,10 @@ define i32 @test_explicit_pred_generic(i64 %len, ptr %test_base) { ; CHECK-NEXT: [[TMP77:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) ; CHECK-NEXT: br label [[LOOP_EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] -; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] +; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[IV]] ; CHECK-NEXT: [[EARLYCND:%.*]] = load i1, ptr [[TEST_ADDR]], align 1 @@ -396,12 +392,10 @@ define i32 @test_invariant_address(i64 %len, ptr %test_base) { ; CHECK-NEXT: [[TMP101:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX8]]) ; CHECK-NEXT: br label [[LOOP_EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] -; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, 
[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] +; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[IV]] ; CHECK-NEXT: [[EARLYCND:%.*]] = load i1, ptr [[TEST_ADDR]], align 1 @@ -667,12 +661,10 @@ define i32 @test_step_narrower_than_access(i64 %len, ptr %test_base) { ; CHECK-NEXT: [[TMP149:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX38]]) ; CHECK-NEXT: br label [[LOOP_EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] -; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] +; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[IV]] ; CHECK-NEXT: [[EARLYCND:%.*]] = load i1, ptr [[TEST_ADDR]], align 1 @@ -984,12 +976,10 @@ define i32 @test_non_zero_start(i64 %len, ptr %test_base) { ; CHECK-NEXT: [[TMP77:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) ; CHECK-NEXT: br label [[LOOP_EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] -; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], 
[[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 1024, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] +; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[IV]] ; CHECK-NEXT: [[EARLYCND:%.*]] = load i1, ptr [[TEST_ADDR]], align 1 @@ -1228,12 +1218,10 @@ define i32 @test_non_unit_stride(i64 %len, ptr %test_base) { ; CHECK-NEXT: [[TMP117:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX8]]) ; CHECK-NEXT: br label [[LOOP_EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] -; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] +; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 2 ; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[IV]] ; CHECK-NEXT: [[EARLYCND:%.*]] = load i1, ptr [[TEST_ADDR]], align 1 @@ -1380,12 +1368,10 @@ define i32 @neg_off_by_many(i64 %len, ptr %test_base) { ; CHECK-NEXT: [[TMP77:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) ; CHECK-NEXT: br label [[LOOP_EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], 
[[LATCH:%.*]] ] -; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] +; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[IV]] ; CHECK-NEXT: [[EARLYCND:%.*]] = load i1, ptr [[TEST_ADDR]], align 1 @@ -1532,12 +1518,10 @@ define i32 @neg_off_by_one_iteration(i64 %len, ptr %test_base) { ; CHECK-NEXT: [[TMP77:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) ; CHECK-NEXT: br label [[LOOP_EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] -; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] +; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[IV]] ; CHECK-NEXT: [[EARLYCND:%.*]] = load i1, ptr [[TEST_ADDR]], align 1 @@ -1684,12 +1668,10 @@ define i32 @neg_off_by_one_byte(i64 %len, ptr %test_base) { ; CHECK-NEXT: [[TMP77:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) ; CHECK-NEXT: br label [[LOOP_EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: 
[[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] -; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] +; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[IV]] ; CHECK-NEXT: [[EARLYCND:%.*]] = load i1, ptr [[TEST_ADDR]], align 1 @@ -2005,12 +1987,10 @@ define i32 @test_allocsize(i64 %len, ptr %test_base) nofree nosync { ; CHECK-NEXT: [[TMP77:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) ; CHECK-NEXT: br label [[LOOP_EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] -; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] +; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[IV]] ; CHECK-NEXT: [[EARLYCND:%.*]] = load i1, ptr [[TEST_ADDR]], align 1 @@ -2158,12 +2138,10 @@ define i32 @test_allocsize_array(i64 %len, ptr %test_base) nofree nosync { ; CHECK-NEXT: [[TMP77:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) ; CHECK-NEXT: br label [[LOOP_EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] 
= phi i32 [ 0, [[ENTRY]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] -; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] +; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[IV]] ; CHECK-NEXT: [[EARLYCND:%.*]] = load i1, ptr [[TEST_ADDR]], align 1 @@ -2321,12 +2299,10 @@ define i32 @test_allocsize_cond_deref(i1 %allzero, ptr %test_base) { ; CHECK-NEXT: [[TMP77:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]]) ; CHECK-NEXT: br label [[LOOP_EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[PREHEADER]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[PREHEADER]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] -; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] +; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i64 [[IV]] ; CHECK-NEXT: [[EARLYCND:%.*]] = load i1, ptr [[TEST_ADDR]], align 1 diff --git a/llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll b/llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll index f26064a4a81d..6a90f03c2f7f 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll +++ 
b/llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll @@ -1201,10 +1201,9 @@ define i32 @nopragma(ptr noalias nocapture %a, ptr noalias nocapture readonly %b ; O1VEC2: middle.block: ; O1VEC2-NEXT: br label [[FOR_END:%.*]] ; O1VEC2: scalar.ph: -; O1VEC2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; O1VEC2-NEXT: br label [[FOR_BODY:%.*]] ; O1VEC2: for.body: -; O1VEC2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; O1VEC2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; O1VEC2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[INDVARS_IV]] ; O1VEC2-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; O1VEC2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP10]], [[N]] @@ -1242,10 +1241,9 @@ define i32 @nopragma(ptr noalias nocapture %a, ptr noalias nocapture readonly %b ; OzVEC2: middle.block: ; OzVEC2-NEXT: br label [[FOR_END:%.*]] ; OzVEC2: scalar.ph: -; OzVEC2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; OzVEC2-NEXT: br label [[FOR_BODY:%.*]] ; OzVEC2: for.body: -; OzVEC2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; OzVEC2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; OzVEC2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[B]], i64 [[INDVARS_IV]] ; OzVEC2-NEXT: [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; OzVEC2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP10]], [[N]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/optsize.ll b/llvm/test/Transforms/LoopVectorize/X86/optsize.ll index 056b8ecddd88..6d7b8a222c51 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/optsize.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/optsize.ll @@ -222,10 +222,9 @@ define void @scev4stride1(ptr noalias nocapture %a, ptr noalias nocapture readon ; CHECK: middle.block: 
; CHECK-NEXT: br label [[FOR_END_LOOPEXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[I_07:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[I_07:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[SCALAR_PH]] ] ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[I_07]], [[K]] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[MUL]] ; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 @@ -259,10 +258,9 @@ define void @scev4stride1(ptr noalias nocapture %a, ptr noalias nocapture readon ; AUTOVF: middle.block: ; AUTOVF-NEXT: br label [[FOR_END_LOOPEXIT:%.*]] ; AUTOVF: scalar.ph: -; AUTOVF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER:%.*]] ] ; AUTOVF-NEXT: br label [[FOR_BODY:%.*]] ; AUTOVF: for.body: -; AUTOVF-NEXT: [[I_07:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; AUTOVF-NEXT: [[I_07:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[SCALAR_PH]] ] ; AUTOVF-NEXT: [[MUL:%.*]] = mul nsw i32 [[I_07]], [[K]] ; AUTOVF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[MUL]] ; AUTOVF-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr141968-instsimplifyfolder.ll b/llvm/test/Transforms/LoopVectorize/X86/pr141968-instsimplifyfolder.ll index c1d08e152fc5..9181cce613ad 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr141968-instsimplifyfolder.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr141968-instsimplifyfolder.ll @@ -6,7 +6,7 @@ target triple = "x86_64" define i8 @pr141968(i1 %cond, i8 %v) { ; CHECK-LABEL: define i8 @pr141968( ; CHECK-SAME: i1 [[COND:%.*]], i8 [[V:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[ZEXT_TRUE:%.*]] = zext i1 true to i16 ; CHECK-NEXT: 
[[SEXT:%.*]] = sext i8 [[V]] to i16 ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] @@ -107,10 +107,9 @@ define i8 @pr141968(i1 %cond, i8 %v) { ; CHECK-NEXT: [[TMP18:%.*]] = extractelement <16 x i8> [[PREDPHI]], i32 15 ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] ; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i8 [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i8 [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ], [ 0, %[[SCALAR_PH]] ] ; CHECK-NEXT: br i1 [[COND]], label %[[LOOP_LATCH]], label %[[COND_FALSE:.*]] ; CHECK: [[COND_FALSE]]: ; CHECK-NEXT: [[SDIV:%.*]] = sdiv i16 [[SEXT]], [[ZEXT_TRUE]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll b/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll index df2e35d3922d..204271173da0 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr34438.ll @@ -24,10 +24,9 @@ define void @small_tc(ptr noalias nocapture %A, ptr noalias nocapture readonly % ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_END:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TMP0]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP0]] ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i64 [[INDVARS_IV]] diff --git 
a/llvm/test/Transforms/LoopVectorize/X86/pr51366-sunk-instruction-used-outside-of-loop.ll b/llvm/test/Transforms/LoopVectorize/X86/pr51366-sunk-instruction-used-outside-of-loop.ll index 36f0f14e732c..2c97863c182b 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr51366-sunk-instruction-used-outside-of-loop.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr51366-sunk-instruction-used-outside-of-loop.ll @@ -4,7 +4,7 @@ define ptr @test(ptr noalias %src, ptr noalias %dst) { ; CHECK-LABEL: define ptr @test( ; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -45,10 +45,9 @@ define ptr @test(ptr noalias %src, ptr noalias %dst) { ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] ; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[IV]] ; CHECK-NEXT: [[CMP_1:%.*]] = icmp eq i64 [[IV]], 0 ; CHECK-NEXT: br i1 [[CMP_1]], label %[[LOOP_LATCH]], label %[[THEN:.*]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll b/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll index 0e83cf374fc3..04cdc759d812 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll @@ -73,12 +73,10 @@ define float @reduction_sum_float_fastmath(i32 %n, ptr %array) { ; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x 
float> [[BIN_RDX]]) ; CHECK-NEXT: br label [[LOOP_EXIT_LOOPEXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[LOOP_PREHEADER]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[LOOP_PREHEADER]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IDX:%.*]] = phi i32 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[SUM:%.*]] = phi float [ [[SUM_INC:%.*]], [[LOOP]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[IDX:%.*]] = phi i32 [ [[IDX_INC:%.*]], [[LOOP]] ], [ 0, [[SCALAR_PH]] ] +; CHECK-NEXT: [[SUM:%.*]] = phi float [ [[SUM_INC:%.*]], [[LOOP]] ], [ 0.000000e+00, [[SCALAR_PH]] ] ; CHECK-NEXT: [[ADDRESS:%.*]] = getelementptr float, ptr [[ARRAY]], i32 [[IDX]] ; CHECK-NEXT: [[VALUE:%.*]] = load float, ptr [[ADDRESS]], align 4 ; CHECK-NEXT: [[SUM_INC]] = fadd fast float [[SUM]], [[VALUE]] @@ -138,12 +136,10 @@ define float @reduction_sum_float_only_reassoc(i32 %n, ptr %array) { ; CHECK-NEXT: [[TMP9:%.*]] = call reassoc float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX]]) ; CHECK-NEXT: br label [[LOOP_EXIT_LOOPEXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[LOOP_PREHEADER]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ -0.000000e+00, [[LOOP_PREHEADER]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IDX:%.*]] = phi i32 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[SUM:%.*]] = phi float [ [[SUM_INC:%.*]], [[LOOP]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[IDX:%.*]] = phi i32 [ [[IDX_INC:%.*]], [[LOOP]] ], [ 0, [[SCALAR_PH]] ] +; CHECK-NEXT: [[SUM:%.*]] = phi float [ [[SUM_INC:%.*]], [[LOOP]] ], [ -0.000000e+00, [[SCALAR_PH]] ] ; CHECK-NEXT: [[ADDRESS:%.*]] = getelementptr float, ptr [[ARRAY]], i32 [[IDX]] ; CHECK-NEXT: [[VALUE:%.*]] = load float, ptr [[ADDRESS]], align 4 ; CHECK-NEXT: [[SUM_INC]] = fadd reassoc 
float [[SUM]], [[VALUE]] @@ -203,12 +199,10 @@ define float @reduction_sum_float_only_reassoc_and_contract(i32 %n, ptr %array) ; CHECK-NEXT: [[TMP9:%.*]] = call reassoc contract float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX]]) ; CHECK-NEXT: br label [[LOOP_EXIT_LOOPEXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[LOOP_PREHEADER]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ -0.000000e+00, [[LOOP_PREHEADER]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IDX:%.*]] = phi i32 [ [[IDX_INC:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[SUM:%.*]] = phi float [ [[SUM_INC:%.*]], [[LOOP]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[IDX:%.*]] = phi i32 [ [[IDX_INC:%.*]], [[LOOP]] ], [ 0, [[SCALAR_PH]] ] +; CHECK-NEXT: [[SUM:%.*]] = phi float [ [[SUM_INC:%.*]], [[LOOP]] ], [ -0.000000e+00, [[SCALAR_PH]] ] ; CHECK-NEXT: [[ADDRESS:%.*]] = getelementptr float, ptr [[ARRAY]], i32 [[IDX]] ; CHECK-NEXT: [[VALUE:%.*]] = load float, ptr [[ADDRESS]], align 4 ; CHECK-NEXT: [[SUM_INC]] = fadd reassoc contract float [[SUM]], [[VALUE]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/replicate-uniform-call.ll b/llvm/test/Transforms/LoopVectorize/X86/replicate-uniform-call.ll index 0b61f207d041..28435d4f34ac 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/replicate-uniform-call.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/replicate-uniform-call.ll @@ -8,7 +8,7 @@ target triple = "x86_64-unknown-linux-gnu" define void @smax_call_uniform(ptr %dst, i64 %x) { ; CHECK-LABEL: define void @smax_call_uniform( ; CHECK-SAME: ptr [[DST:%.*]], i64 [[X:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[C:%.*]] = icmp ult i8 -68, -69 ; CHECK-NEXT: [[MUL:%.*]] = mul nuw nsw i64 [[X]], 0 ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] @@ -55,10 +55,9 @@ define void @smax_call_uniform(ptr %dst, i64 %x) 
{ ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] ; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], %[[LOOP_LATCH:.*]] ] ; CHECK-NEXT: br i1 [[C]], label %[[LOOP_LATCH]], label %[[ELSE:.*]] ; CHECK: [[ELSE]]: ; CHECK-NEXT: [[REM1:%.*]] = urem i64 [[MUL]], [[X]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll b/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll index 33b173d3a700..35f61b2aa838 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/strided_load_cost.ll @@ -556,10 +556,9 @@ define void @test(ptr %A, ptr noalias %B) #0 { ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[IV_0:%.*]] = add nuw nsw i64 [[IV]], 0 ; CHECK-NEXT: [[IV_1:%.*]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[IN0:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i64 0, i64 [[IV_0]] @@ -676,10 +675,9 @@ define void @test(ptr %A, ptr noalias %B) #0 { ; MAX-BW: middle.block: ; MAX-BW-NEXT: br label [[FOR_COND_CLEANUP:%.*]] ; MAX-BW: scalar.ph: -; MAX-BW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; MAX-BW-NEXT: br label [[FOR_BODY:%.*]] ; MAX-BW: for.body: -; MAX-BW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; MAX-BW-NEXT: 
[[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; MAX-BW-NEXT: [[IV_0:%.*]] = add nuw nsw i64 [[IV]], 0 ; MAX-BW-NEXT: [[IV_1:%.*]] = add nuw nsw i64 [[IV]], 1 ; MAX-BW-NEXT: [[IN0:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i64 0, i64 [[IV_0]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll b/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll index 68b4f202e106..a491a6233fda 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll @@ -406,12 +406,10 @@ define i32 @test_count_bits(ptr %test_base) { ; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX14]]) ; CHECK-NEXT: br label [[LOOP_EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[BYTE:%.*]] = udiv i64 [[IV]], 8 ; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i8, ptr [[TEST_BASE]], i64 [[BYTE]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll b/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll index 6979f4fc199a..52f491eed030 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll @@ -204,10 +204,9 @@ define void @vectorized2(ptr noalias nocapture %A, ptr noalias nocapture 
readonl ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_END:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX]], align 4, !llvm.access.group [[ACC_GRP7]] ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/widened-value-used-as-scalar-and-first-lane.ll b/llvm/test/Transforms/LoopVectorize/X86/widened-value-used-as-scalar-and-first-lane.ll index 05d08a4e3635..c49d36962796 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/widened-value-used-as-scalar-and-first-lane.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/widened-value-used-as-scalar-and-first-lane.ll @@ -50,10 +50,9 @@ define void @iv.4_used_as_vector_and_first_lane(ptr %src, ptr noalias %dst) { ; CHECK: middle.block: ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] ; CHECK: loop.header: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] ; CHECK-NEXT: [[G_SRC:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[IV]] ; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[G_SRC]], align 8 ; CHECK-NEXT: [[IV_4:%.*]] = add nuw nsw i64 [[IV]], 4 @@ -134,10 +133,9 @@ define void @iv.4_used_as_first_lane(ptr %src, ptr noalias %dst) { ; CHECK: middle.block: ; CHECK-NEXT: br label 
[[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] ; CHECK: loop.header: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] ; CHECK-NEXT: [[G_SRC:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[IV]] ; CHECK-NEXT: [[L:%.*]] = load i64, ptr [[G_SRC]], align 8 ; CHECK-NEXT: [[IV_4:%.*]] = add nuw nsw i64 [[IV]], 4 diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll index b0ae40cafbde..34c6384b63c8 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/x86-predication.ll @@ -136,12 +136,10 @@ define i32 @predicated_sdiv_masked_load(ptr %a, ptr %b, i32 %x, i1 %c) { ; SINK-GATHER-NEXT: [[TMP49:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP47]]) ; SINK-GATHER-NEXT: br label [[FOR_END:%.*]] ; SINK-GATHER: scalar.ph: -; SINK-GATHER-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] -; SINK-GATHER-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ] ; SINK-GATHER-NEXT: br label [[FOR_BODY:%.*]] ; SINK-GATHER: for.body: -; SINK-GATHER-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[FOR_INC:%.*]] ] -; SINK-GATHER-NEXT: [[R:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[T7:%.*]], [[FOR_INC]] ] +; SINK-GATHER-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[FOR_INC:%.*]] ] +; SINK-GATHER-NEXT: [[R:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[T7:%.*]], [[FOR_INC]] ] ; SINK-GATHER-NEXT: [[T0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I]] ; SINK-GATHER-NEXT: [[T1:%.*]] = load i32, ptr [[T0]], align 4 ; SINK-GATHER-NEXT: br i1 [[C]], label [[IF_THEN:%.*]], label [[FOR_INC]] diff --git 
a/llvm/test/Transforms/LoopVectorize/constantfolder-infer-correct-gepty.ll b/llvm/test/Transforms/LoopVectorize/constantfolder-infer-correct-gepty.ll index af528eee503d..e629560354f2 100644 --- a/llvm/test/Transforms/LoopVectorize/constantfolder-infer-correct-gepty.ll +++ b/llvm/test/Transforms/LoopVectorize/constantfolder-infer-correct-gepty.ll @@ -6,7 +6,7 @@ define void @test(ptr %data) { ; CHECK-LABEL: define void @test( ; CHECK-SAME: ptr [[DATA:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -20,10 +20,9 @@ define void @test(ptr %data) { ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[END:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[OR_IV_1:%.*]] = or disjoint i64 [[IV]], 1 ; CHECK-NEXT: [[GEP_POSTSCALE:%.*]] = getelementptr [64 x float], ptr @postscale, i64 0, i64 [[OR_IV_1]] ; CHECK-NEXT: [[LOAD_POSTSCALE:%.*]] = load float, ptr [[GEP_POSTSCALE]], align 4, !tbaa [[TBAA0]] diff --git a/llvm/test/Transforms/LoopVectorize/constantfolder.ll b/llvm/test/Transforms/LoopVectorize/constantfolder.ll index cfd36bfe3652..9fbd1330de74 100644 --- a/llvm/test/Transforms/LoopVectorize/constantfolder.ll +++ b/llvm/test/Transforms/LoopVectorize/constantfolder.ll @@ -4,7 +4,7 @@ define void @const_fold_ptradd(ptr %dst, i64 %d) { ; CHECK-LABEL: define void @const_fold_ptradd( ; CHECK-SAME: ptr [[DST:%.*]], i64 [[D:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label 
%[[VECTOR_BODY:.*]] @@ -17,10 +17,9 @@ define void @const_fold_ptradd(ptr %dst, i64 %d) { ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] ; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; CHECK-NEXT: br i1 true, label %[[LOOP_LATCH]], label %[[ELSE:.*]] ; CHECK: [[ELSE]]: ; CHECK-NEXT: br label %[[LOOP_LATCH]] @@ -59,7 +58,7 @@ exit: define void @const_fold_inbounds_ptradd(ptr %dst, i64 %d) { ; CHECK-LABEL: define void @const_fold_inbounds_ptradd( ; CHECK-SAME: ptr [[DST:%.*]], i64 [[D:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -72,10 +71,9 @@ define void @const_fold_inbounds_ptradd(ptr %dst, i64 %d) { ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] ; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; CHECK-NEXT: br i1 true, label %[[LOOP_LATCH]], label %[[ELSE:.*]] ; CHECK: [[ELSE]]: ; CHECK-NEXT: br label %[[LOOP_LATCH]] @@ -114,7 +112,7 @@ exit: define void @const_fold_select(ptr %dst, i64 %d) { ; CHECK-LABEL: define void @const_fold_select( ; CHECK-SAME: ptr [[DST:%.*]], i64 [[D:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: 
[[VECTOR_PH]]: ; CHECK-NEXT: [[TMP3:%.*]] = or i64 [[D]], 1 @@ -128,10 +126,9 @@ define void @const_fold_select(ptr %dst, i64 %d) { ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] ; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; CHECK-NEXT: br i1 true, label %[[LOOP_LATCH]], label %[[ELSE:.*]] ; CHECK: [[ELSE]]: ; CHECK-NEXT: br label %[[LOOP_LATCH]] @@ -170,7 +167,7 @@ exit: define void @const_fold_add_sub_mul_ashr_lshr(ptr %dst, i64 %d) { ; CHECK-LABEL: define void @const_fold_add_sub_mul_ashr_lshr( ; CHECK-SAME: ptr [[DST:%.*]], i64 [[D:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -183,10 +180,9 @@ define void @const_fold_add_sub_mul_ashr_lshr(ptr %dst, i64 %d) { ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] ; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; CHECK-NEXT: br i1 true, label %[[LOOP_LATCH]], label %[[ELSE:.*]] ; CHECK: [[ELSE]]: ; CHECK-NEXT: br label %[[LOOP_LATCH]] @@ -233,7 +229,7 @@ exit: define void @const_fold_and_or_xor(ptr %dst, i64 %d) { ; CHECK-LABEL: define void @const_fold_and_or_xor( ; CHECK-SAME: ptr [[DST:%.*]], i64 [[D:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br 
i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -246,10 +242,9 @@ define void @const_fold_and_or_xor(ptr %dst, i64 %d) { ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] ; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; CHECK-NEXT: br i1 true, label %[[LOOP_LATCH]], label %[[ELSE:.*]] ; CHECK: [[ELSE]]: ; CHECK-NEXT: br label %[[LOOP_LATCH]] @@ -292,7 +287,7 @@ exit: define void @const_fold_cmp_zext(ptr %dst, i64 %d) { ; CHECK-LABEL: define void @const_fold_cmp_zext( ; CHECK-SAME: ptr [[DST:%.*]], i64 [[D:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -305,10 +300,9 @@ define void @const_fold_cmp_zext(ptr %dst, i64 %d) { ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] ; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; CHECK-NEXT: br i1 true, label %[[LOOP_LATCH]], label %[[ELSE:.*]] ; CHECK: [[ELSE]]: ; CHECK-NEXT: br label %[[LOOP_LATCH]] @@ -349,7 +343,7 @@ exit: define void @const_fold_trunc(ptr %dst, i64 %d) { ; CHECK-LABEL: define void @const_fold_trunc( ; CHECK-SAME: ptr [[DST:%.*]], i64 [[D:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: 
[[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -362,10 +356,9 @@ define void @const_fold_trunc(ptr %dst, i64 %d) { ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] ; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; CHECK-NEXT: br i1 true, label %[[LOOP_LATCH]], label %[[ELSE:.*]] ; CHECK: [[ELSE]]: ; CHECK-NEXT: br label %[[LOOP_LATCH]] diff --git a/llvm/test/Transforms/LoopVectorize/create-induction-resume.ll b/llvm/test/Transforms/LoopVectorize/create-induction-resume.ll index e9c7f75cb337..fbdc11dd9847 100644 --- a/llvm/test/Transforms/LoopVectorize/create-induction-resume.ll +++ b/llvm/test/Transforms/LoopVectorize/create-induction-resume.ll @@ -69,12 +69,10 @@ define void @test(i32 %arg, i32 %L1.limit, i32 %L2.switch, i1 %c, ptr %dst) { ; CHECK: middle.block: ; CHECK-NEXT: br label [[L2_HEADER_LOOPEXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1, [[L2_INNER_HEADER_PREHEADER]] ] -; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i64 [ 1, [[L2_INNER_HEADER_PREHEADER]] ] ; CHECK-NEXT: br label [[L2_INNER_HEADER:%.*]] ; CHECK: L2.Inner.header: -; CHECK-NEXT: [[L2_ACCUM:%.*]] = phi i32 [ [[L2_ACCUM_NEXT:%.*]], [[L2_INNER_HEADER]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[L2_IV:%.*]] = phi i64 [ [[L2_IV_NEXT:%.*]], [[L2_INNER_HEADER]] ], [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[L2_ACCUM:%.*]] = phi i32 [ [[L2_ACCUM_NEXT:%.*]], [[L2_INNER_HEADER]] ], [ 1, [[SCALAR_PH]] ] +; CHECK-NEXT: [[L2_IV:%.*]] = phi i64 [ [[L2_IV_NEXT:%.*]], [[L2_INNER_HEADER]] ], [ 1, [[SCALAR_PH]] 
] ; CHECK-NEXT: [[L2_ACCUM_NEXT]] = sub i32 [[L2_ACCUM]], [[L1_EXIT_VAL]] ; CHECK-NEXT: [[L2_DUMMY_BUT_NEED_IT:%.*]] = sext i32 [[L2_ACCUM_NEXT]] to i64 ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[L2_IV]] diff --git a/llvm/test/Transforms/LoopVectorize/dbg-outer-loop-vect.ll b/llvm/test/Transforms/LoopVectorize/dbg-outer-loop-vect.ll index aa1b6cee0987..e3a8ca777ddd 100644 --- a/llvm/test/Transforms/LoopVectorize/dbg-outer-loop-vect.ll +++ b/llvm/test/Transforms/LoopVectorize/dbg-outer-loop-vect.ll @@ -13,7 +13,7 @@ define void @foo(ptr %h) !dbg !4 { ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]], !dbg [[DBG21]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_LATCH:.*]] ], !dbg [[DBG22:![0-9]+]] -; CHECK-NEXT: br label %[[FOR_COND5_PREHEADER1:.*]] +; CHECK-NEXT: br label %[[FOR_COND5_PREHEADER1:.*]], !dbg [[DBG21]] ; CHECK: [[FOR_COND5_PREHEADER1]]: ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_BODY]] ], [ [[TMP5:%.*]], %[[FOR_COND5_PREHEADER1]] ], !dbg [[DBG23:![0-9]+]] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, ptr [[H]], <4 x i64> [[VEC_PHI]] diff --git a/llvm/test/Transforms/LoopVectorize/debugloc-optimize-vfuf-term.ll b/llvm/test/Transforms/LoopVectorize/debugloc-optimize-vfuf-term.ll index 9ade6e9a8980..ab9a84dddf92 100644 --- a/llvm/test/Transforms/LoopVectorize/debugloc-optimize-vfuf-term.ll +++ b/llvm/test/Transforms/LoopVectorize/debugloc-optimize-vfuf-term.ll @@ -6,7 +6,7 @@ define i32 @foo(ptr %p) { ; CHECK-LABEL: define i32 @foo( ; CHECK-SAME: ptr [[P:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -16,10 +16,9 @@ define i32 @foo(ptr %p) { ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]], !dbg [[DBG3]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: 
[[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], !dbg [[DBG7:![0-9]+]] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], !dbg [[DBG7]] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[SCALAR_PH]] ], !dbg [[DBG7:![0-9]+]] ; CHECK-NEXT: [[CONV:%.*]] = trunc i64 0 to i8, !dbg [[DBG8:![0-9]+]] ; CHECK-NEXT: store i8 [[CONV]], ptr [[P]], align 1, !dbg [[DBG3]] ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1, !dbg [[DBG9:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll b/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll index 373c8e0b385c..d24e7e871239 100644 --- a/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll +++ b/llvm/test/Transforms/LoopVectorize/dereferenceable-info-from-assumption-constant-size.ll @@ -6,7 +6,7 @@ declare void @llvm.assume(i1) define void @deref_assumption_in_header_constant_trip_count(ptr noalias noundef %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_in_header_constant_trip_count( ; CHECK-SAME: ptr noalias noundef [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1:[0-9]+]] { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -50,10 +50,9 @@ define void @deref_assumption_in_header_constant_trip_count(ptr noalias noundef ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] ; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; 
CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr i32, ptr [[A]], i64 [[IV]] ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[GEP_A]], i64 4), "dereferenceable"(ptr [[GEP_A]], i64 4) ] ; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] @@ -104,7 +103,7 @@ exit: define void @align_deref_assumption_in_header_constant_trip_count_loop_invariant_ptr(ptr noalias noundef %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @align_deref_assumption_in_header_constant_trip_count_loop_invariant_ptr( ; CHECK-SAME: ptr noalias noundef [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 4), "dereferenceable"(ptr [[A]], i64 4) ] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: @@ -127,10 +126,9 @@ define void @align_deref_assumption_in_header_constant_trip_count_loop_invariant ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] ; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] ; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4 ; CHECK-NEXT: [[C_1:%.*]] = icmp sge i32 [[L_B]], 0 @@ -178,7 +176,7 @@ exit: define void @deref_assumption_too_small_in_header_constant_trip_count(ptr noalias noundef %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void 
@deref_assumption_too_small_in_header_constant_trip_count( ; CHECK-SAME: ptr noalias noundef [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -222,10 +220,9 @@ define void @deref_assumption_too_small_in_header_constant_trip_count(ptr noalia ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] ; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr i32, ptr [[A]], i64 [[IV]] ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[GEP_A]], i64 4), "dereferenceable"(ptr [[GEP_A]], i64 2) ] ; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] @@ -276,7 +273,7 @@ exit: define void @deref_assumption_in_header_constant_trip_count_align_1(ptr noalias noundef %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_in_header_constant_trip_count_align_1( ; CHECK-SAME: ptr noalias noundef [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -320,10 +317,9 @@ define void @deref_assumption_in_header_constant_trip_count_align_1(ptr noalias ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br 
label %[[LOOP_HEADER:.*]] ; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr i32, ptr [[A]], i64 [[IV]] ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[GEP_A]], i64 4) ] ; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] @@ -374,7 +370,7 @@ exit: define void @deref_assumption_in_header_constant_trip_count_align_via_arg_attribute(ptr noalias align 4 %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_in_header_constant_trip_count_align_via_arg_attribute( ; CHECK-SAME: ptr noalias align 4 [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -418,10 +414,9 @@ define void @deref_assumption_in_header_constant_trip_count_align_via_arg_attrib ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] ; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr i32, ptr [[A]], i64 [[IV]] ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[GEP_A]], i64 4) ] ; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] @@ -472,7 +467,7 @@ exit: define void @deref_assumption_in_header_constant_trip_count_align_not_known(ptr noalias noundef %a, 
ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_in_header_constant_trip_count_align_not_known( ; CHECK-SAME: ptr noalias noundef [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -516,10 +511,9 @@ define void @deref_assumption_in_header_constant_trip_count_align_not_known(ptr ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] ; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr i32, ptr [[A]], i64 [[IV]] ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[GEP_A]], i64 4) ] ; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] @@ -570,7 +564,7 @@ exit: define void @deref_assumption_in_then_constant_trip_count(ptr noalias noundef %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_in_then_constant_trip_count( ; CHECK-SAME: ptr noalias noundef [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -610,10 +604,9 @@ define void @deref_assumption_in_then_constant_trip_count(ptr noalias noundef %a ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, 
%[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] ; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] ; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4 ; CHECK-NEXT: [[C_1:%.*]] = icmp sge i32 [[L_B]], 0 @@ -664,7 +657,7 @@ exit: define void @deref_assumption_in_latch_constant_trip_count(ptr noalias noundef %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_in_latch_constant_trip_count( ; CHECK-SAME: ptr noalias noundef [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -710,10 +703,9 @@ define void @deref_assumption_in_latch_constant_trip_count(ptr noalias noundef % ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] ; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr i32, ptr [[A]], i64 [[IV]] ; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] ; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4 @@ -866,7 +858,7 @@ exit: define void @deref_assumption_in_preheader_constant_trip_count_align_1(ptr noalias noundef %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void 
@deref_assumption_in_preheader_constant_trip_count_align_1( ; CHECK-SAME: ptr noalias noundef [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[A]], i64 4000) ] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: @@ -887,10 +879,9 @@ define void @deref_assumption_in_preheader_constant_trip_count_align_1(ptr noali ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] ; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] ; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4 ; CHECK-NEXT: [[C_1:%.*]] = icmp sge i32 [[L_B]], 0 @@ -940,7 +931,7 @@ exit: define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_1(ptr noalias noundef %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_1( ; CHECK-SAME: ptr noalias noundef [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[A]], i64 3999) ] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: @@ -980,10 +971,9 @@ define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_1 ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi 
i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] ; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] ; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4 ; CHECK-NEXT: [[C_1:%.*]] = icmp sge i32 [[L_B]], 0 @@ -1033,7 +1023,7 @@ exit: define void @align_and_deref_assumption_in_preheader_constant_trip_count_align_4(ptr noalias noundef %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @align_and_deref_assumption_in_preheader_constant_trip_count_align_4( ; CHECK-SAME: ptr noalias noundef [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 4), "dereferenceable"(ptr [[A]], i64 4000) ] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: @@ -1054,10 +1044,9 @@ define void @align_and_deref_assumption_in_preheader_constant_trip_count_align_4 ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] ; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] ; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4 ; CHECK-NEXT: [[C_1:%.*]] = icmp sge i32 [[L_B]], 0 @@ -1108,7 +1097,7 @@ exit: define void 
@deref_assumption_in_preheader_constant_trip_count_align_4_known_via_argument_attr(ptr noalias noundef align 4 %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_in_preheader_constant_trip_count_align_4_known_via_argument_attr( ; CHECK-SAME: ptr noalias noundef align 4 [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[A]], i64 4000) ] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: @@ -1129,10 +1118,9 @@ define void @deref_assumption_in_preheader_constant_trip_count_align_4_known_via ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] ; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] ; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4 ; CHECK-NEXT: [[C_1:%.*]] = icmp sge i32 [[L_B]], 0 @@ -1182,7 +1170,7 @@ exit: define void @deref_assumption_in_preheader_constant_trip_count_align_4_not_known(ptr noalias noundef %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_in_preheader_constant_trip_count_align_4_not_known( ; CHECK-SAME: ptr noalias noundef [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[A]], i64 4000) ] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: @@ 
-1222,10 +1210,9 @@ define void @deref_assumption_in_preheader_constant_trip_count_align_4_not_known ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] ; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] ; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4 ; CHECK-NEXT: [[C_1:%.*]] = icmp sge i32 [[L_B]], 0 @@ -1275,7 +1262,7 @@ exit: define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_4(ptr noalias noundef %a, ptr noalias %b, ptr noalias %c) nofree nosync{ ; CHECK-LABEL: define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_4( ; CHECK-SAME: ptr noalias noundef [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[A]], i64 3999) ] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: @@ -1315,10 +1302,9 @@ define void @deref_assumption_too_small_in_preheader_constant_trip_count_align_4 ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] ; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] ; CHECK-NEXT: 
[[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4 ; CHECK-NEXT: [[C_1:%.*]] = icmp sge i32 [[L_B]], 0 @@ -1369,7 +1355,7 @@ exit: define void @may_free_align_deref_assumption_in_header_constant_trip_count_loop_invariant_ptr(ptr noalias noundef %a, ptr noalias %b, ptr noalias %c) { ; CHECK-LABEL: define void @may_free_align_deref_assumption_in_header_constant_trip_count_loop_invariant_ptr( ; CHECK-SAME: ptr noalias noundef [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[A]], i64 4), "dereferenceable"(ptr [[A]], i64 4) ] ; CHECK-NEXT: call void @may_free() ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] @@ -1406,10 +1392,9 @@ define void @may_free_align_deref_assumption_in_header_constant_trip_count_loop_ ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] ; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] ; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4 ; CHECK-NEXT: [[C_1:%.*]] = icmp sge i32 [[L_B]], 0 @@ -1459,7 +1444,7 @@ exit: define void @may_free_local_ptr_align_deref_assumption_in_header_constant_trip_count_loop_invariant_ptr(ptr noalias %b, ptr noalias %c) nofree nosync { ; CHECK-LABEL: define void @may_free_local_ptr_align_deref_assumption_in_header_constant_trip_count_loop_invariant_ptr( ; CHECK-SAME: ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[A:%.*]] = call ptr @get_ptr() ; CHECK-NEXT: call void 
@llvm.assume(i1 true) [ "align"(ptr [[A]], i64 4), "dereferenceable"(ptr [[A]], i64 4) ] ; CHECK-NEXT: call void @may_free() @@ -1497,10 +1482,9 @@ define void @may_free_local_ptr_align_deref_assumption_in_header_constant_trip_c ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] ; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] ; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4 ; CHECK-NEXT: [[C_1:%.*]] = icmp sge i32 [[L_B]], 0 diff --git a/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-const-TC.ll b/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-const-TC.ll index 4f95bddc4b4c..dae2cd3cacd0 100644 --- a/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-const-TC.ll +++ b/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-const-TC.ll @@ -26,10 +26,9 @@ define dso_local void @constTC(ptr noalias nocapture %A) optsize { ; CHECK: middle.block: ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[RIV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[RIVPLUS1:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[RIV:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[RIVPLUS1:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[RIV]] ; CHECK-NEXT: store i32 13, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[RIVPLUS1]] = add nuw nsw i32 [[RIV]], 1 diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-dead-instructions.ll 
b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-dead-instructions.ll index e6a81b6f9f6d..414773cb00d7 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-dead-instructions.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-dead-instructions.ll @@ -87,7 +87,7 @@ exit: define i32 @sink_after_dead_inst(ptr %A.ptr) { ; CHECK-LABEL: define i32 @sink_after_dead_inst( ; CHECK-SAME: ptr [[A_PTR:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -111,12 +111,10 @@ define i32 @sink_after_dead_inst(ptr %A.ptr) { ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2 ; CHECK-NEXT: br label %[[FOR_END:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[FOR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[FOR_PREV:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i16 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[FOR:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[FOR_PREV:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[FOR]], 15 ; CHECK-NEXT: [[C:%.*]] = icmp eq i1 [[CMP]], true ; CHECK-NEXT: [[VEC_DEAD:%.*]] = and i1 [[C]], true diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-interleave-only.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-interleave-only.ll index d95c48717819..c13d3421ba7f 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-interleave-only.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-interleave-only.ll @@ -4,7 
+4,7 @@ define float @for_load_interleave_only(ptr %src) { ; CHECK-LABEL: define float @for_load_interleave_only( ; CHECK-SAME: ptr [[SRC:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -23,14 +23,11 @@ define float @for_load_interleave_only(ptr %src) { ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1, %[[ENTRY]] ] -; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi ptr [ [[SRC]], %[[ENTRY]] ] -; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL2]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[FOR:%.*]] = phi float [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[L:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 1, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[SRC]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[FOR:%.*]] = phi float [ 0.000000e+00, %[[SCALAR_PH]] ], [ [[L:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[PTR_IV_NEXT]] = getelementptr i8, ptr [[PTR_IV]], i64 16 ; CHECK-NEXT: [[L]] = load float, ptr [[PTR_IV]], align 4 diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-multiply-recurrences.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-multiply-recurrences.ll index 715ea1c51aba..899c20ab30c6 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-multiply-recurrences.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-multiply-recurrences.ll @@ -121,14 +121,11 @@ 
define void @test_pr54223_sink_after_insertion_order(ptr noalias %a, ptr noalias ; CHECK: middle.block: ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ] -; CHECK-NEXT: [[SCALAR_RECUR_INIT5:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[SCALAR_RECUR:%.*]] = phi float [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_1_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[SCALAR_RECUR6:%.*]] = phi float [ [[SCALAR_RECUR_INIT5]], [[SCALAR_PH]] ], [ [[FOR_2_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[SCALAR_RECUR:%.*]] = phi float [ 0.000000e+00, [[SCALAR_PH]] ], [ [[FOR_1_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[SCALAR_RECUR6:%.*]] = phi float [ 0.000000e+00, [[SCALAR_PH]] ], [ [[FOR_2_NEXT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[NEG:%.*]] = fneg float [[SCALAR_RECUR6]] ; CHECK-NEXT: [[MULADD:%.*]] = call float @llvm.fmuladd.f32(float [[SCALAR_RECUR]], float [[NEG]], float 0.000000e+00) ; CHECK-NEXT: [[DST_GEP:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV]] diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll index 0526f1b4ed1e..10cbf66c783d 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll @@ -1195,12 +1195,10 @@ define i64 @constant_folded_previous_value() { ; UNROLL-NO-IC: middle.block: ; UNROLL-NO-IC-NEXT: br label [[FOR_END:%.*]] ; UNROLL-NO-IC: scalar.ph: -; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] -; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ 0, 
[[ENTRY]] ] ; UNROLL-NO-IC-NEXT: br label [[SCALAR_BODY:%.*]] ; UNROLL-NO-IC: scalar.body: -; UNROLL-NO-IC-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[SCALAR_BODY]] ] -; UNROLL-NO-IC-NEXT: [[VAR2:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[VAR3:%.*]], [[SCALAR_BODY]] ] +; UNROLL-NO-IC-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[SCALAR_BODY]] ] +; UNROLL-NO-IC-NEXT: [[VAR2:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[VAR3:%.*]], [[SCALAR_BODY]] ] ; UNROLL-NO-IC-NEXT: [[VAR3]] = add i64 0, 1 ; UNROLL-NO-IC-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; UNROLL-NO-IC-NEXT: [[COND:%.*]] = icmp eq i64 [[I_NEXT]], 1000 @@ -1222,12 +1220,10 @@ define i64 @constant_folded_previous_value() { ; UNROLL-NO-VF: middle.block: ; UNROLL-NO-VF-NEXT: br label [[FOR_END:%.*]] ; UNROLL-NO-VF: scalar.ph: -; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] -; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ 0, [[ENTRY]] ] ; UNROLL-NO-VF-NEXT: br label [[SCALAR_BODY:%.*]] ; UNROLL-NO-VF: scalar.body: -; UNROLL-NO-VF-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[SCALAR_BODY]] ] -; UNROLL-NO-VF-NEXT: [[VAR2:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[VAR3:%.*]], [[SCALAR_BODY]] ] +; UNROLL-NO-VF-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[SCALAR_BODY]] ] +; UNROLL-NO-VF-NEXT: [[VAR2:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[VAR3:%.*]], [[SCALAR_BODY]] ] ; UNROLL-NO-VF-NEXT: [[VAR3]] = add i64 0, 1 ; UNROLL-NO-VF-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; UNROLL-NO-VF-NEXT: [[COND:%.*]] = icmp eq i64 [[I_NEXT]], 1000 @@ -1249,12 +1245,10 @@ define i64 @constant_folded_previous_value() { ; SINK-AFTER: middle.block: ; SINK-AFTER-NEXT: br label [[FOR_END:%.*]] ; SINK-AFTER: scalar.ph: -; SINK-AFTER-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] -; SINK-AFTER-NEXT: [[SCALAR_RECUR_INIT:%.*]] = 
phi i64 [ 0, [[ENTRY]] ] ; SINK-AFTER-NEXT: br label [[SCALAR_BODY:%.*]] ; SINK-AFTER: scalar.body: -; SINK-AFTER-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[SCALAR_BODY]] ] -; SINK-AFTER-NEXT: [[VAR2:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[VAR3:%.*]], [[SCALAR_BODY]] ] +; SINK-AFTER-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[SCALAR_BODY]] ] +; SINK-AFTER-NEXT: [[VAR2:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[VAR3:%.*]], [[SCALAR_BODY]] ] ; SINK-AFTER-NEXT: [[VAR3]] = add i64 0, 1 ; SINK-AFTER-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; SINK-AFTER-NEXT: [[COND:%.*]] = icmp eq i64 [[I_NEXT]], 1000 @@ -3358,12 +3352,10 @@ define i32 @sink_after_dead_inst(ptr %A.ptr, i32 %n) { ; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2 ; UNROLL-NO-IC-NEXT: br label [[FOR_END:%.*]] ; UNROLL-NO-IC: scalar.ph: -; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ] -; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[ENTRY]] ] ; UNROLL-NO-IC-NEXT: br label [[LOOP:%.*]] ; UNROLL-NO-IC: loop: -; UNROLL-NO-IC-NEXT: [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] -; UNROLL-NO-IC-NEXT: [[FOR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_PREV:%.*]], [[LOOP]] ] +; UNROLL-NO-IC-NEXT: [[IV:%.*]] = phi i16 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; UNROLL-NO-IC-NEXT: [[FOR:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[FOR_PREV:%.*]], [[LOOP]] ] ; UNROLL-NO-IC-NEXT: [[CMP:%.*]] = icmp eq i32 [[FOR]], 15 ; UNROLL-NO-IC-NEXT: [[C:%.*]] = icmp eq i1 [[CMP]], true ; UNROLL-NO-IC-NEXT: [[VEC_DEAD:%.*]] = and i1 [[C]], true @@ -3401,12 +3393,10 @@ define i32 @sink_after_dead_inst(ptr %A.ptr, i32 %n) { ; UNROLL-NO-VF: middle.block: ; UNROLL-NO-VF-NEXT: br label [[FOR_END:%.*]] ; UNROLL-NO-VF: scalar.ph: -; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 0, 
[[ENTRY:%.*]] ] -; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[ENTRY]] ] ; UNROLL-NO-VF-NEXT: br label [[LOOP:%.*]] ; UNROLL-NO-VF: loop: -; UNROLL-NO-VF-NEXT: [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] -; UNROLL-NO-VF-NEXT: [[FOR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_PREV:%.*]], [[LOOP]] ] +; UNROLL-NO-VF-NEXT: [[IV:%.*]] = phi i16 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; UNROLL-NO-VF-NEXT: [[FOR:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[FOR_PREV:%.*]], [[LOOP]] ] ; UNROLL-NO-VF-NEXT: [[CMP:%.*]] = icmp eq i32 [[FOR]], 15 ; UNROLL-NO-VF-NEXT: [[C:%.*]] = icmp eq i1 [[CMP]], true ; UNROLL-NO-VF-NEXT: [[VEC_DEAD:%.*]] = and i1 [[C]], true @@ -3444,12 +3434,10 @@ define i32 @sink_after_dead_inst(ptr %A.ptr, i32 %n) { ; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2 ; SINK-AFTER-NEXT: br label [[FOR_END:%.*]] ; SINK-AFTER: scalar.ph: -; SINK-AFTER-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ] -; SINK-AFTER-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[ENTRY]] ] ; SINK-AFTER-NEXT: br label [[LOOP:%.*]] ; SINK-AFTER: loop: -; SINK-AFTER-NEXT: [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] -; SINK-AFTER-NEXT: [[FOR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_PREV:%.*]], [[LOOP]] ] +; SINK-AFTER-NEXT: [[IV:%.*]] = phi i16 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; SINK-AFTER-NEXT: [[FOR:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[FOR_PREV:%.*]], [[LOOP]] ] ; SINK-AFTER-NEXT: [[CMP:%.*]] = icmp eq i32 [[FOR]], 15 ; SINK-AFTER-NEXT: [[C:%.*]] = icmp eq i1 [[CMP]], true ; SINK-AFTER-NEXT: [[VEC_DEAD:%.*]] = and i1 [[C]], true diff --git a/llvm/test/Transforms/LoopVectorize/float-minmax-instruction-flag.ll b/llvm/test/Transforms/LoopVectorize/float-minmax-instruction-flag.ll index 2c02f839edff..5f4214c5d632 100644 --- 
a/llvm/test/Transforms/LoopVectorize/float-minmax-instruction-flag.ll +++ b/llvm/test/Transforms/LoopVectorize/float-minmax-instruction-flag.ll @@ -68,12 +68,10 @@ define float @minloopattr(ptr nocapture readonly %arg) #0 { ; CHECK-NEXT: [[TMP6:%.*]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[TMP4]]) ; CHECK-NEXT: br label [[OUT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1, [[TOP:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[T]], [[TOP]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[T1:%.*]] = phi i64 [ [[T7:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[T2:%.*]] = phi float [ [[T6:%.*]], [[LOOP]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[T1:%.*]] = phi i64 [ [[T7:%.*]], [[LOOP]] ], [ 1, [[SCALAR_PH]] ] +; CHECK-NEXT: [[T2:%.*]] = phi float [ [[T6:%.*]], [[LOOP]] ], [ [[T]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[T3:%.*]] = getelementptr float, ptr [[ARG]], i64 [[T1]] ; CHECK-NEXT: [[T4:%.*]] = load float, ptr [[T3]], align 4 ; CHECK-NEXT: [[T5:%.*]] = fcmp olt float [[T2]], [[T4]] diff --git a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll index b971400c662b..ade90894ba90 100644 --- a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll +++ b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll @@ -69,10 +69,9 @@ define i32 @test(ptr nocapture %f) #0 { ; UNROLL-NOSIMPLIFY: middle.block: ; UNROLL-NOSIMPLIFY-NEXT: br label [[FOR_END:%.*]] ; UNROLL-NOSIMPLIFY: scalar.ph: -; UNROLL-NOSIMPLIFY-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; UNROLL-NOSIMPLIFY-NEXT: br label [[FOR_BODY:%.*]] ; UNROLL-NOSIMPLIFY: for.body: -; UNROLL-NOSIMPLIFY-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; UNROLL-NOSIMPLIFY-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; 
UNROLL-NOSIMPLIFY-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[F]], i64 [[INDVARS_IV]] ; UNROLL-NOSIMPLIFY-NEXT: [[TMP11:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; UNROLL-NOSIMPLIFY-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP11]], 100 @@ -452,12 +451,10 @@ define void @minimal_bit_widths(i1 %c) { ; UNROLL-NOSIMPLIFY: middle.block: ; UNROLL-NOSIMPLIFY-NEXT: br label [[FOR_END:%.*]] ; UNROLL-NOSIMPLIFY: scalar.ph: -; UNROLL-NOSIMPLIFY-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] -; UNROLL-NOSIMPLIFY-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i64 [ 1000, [[ENTRY]] ] ; UNROLL-NOSIMPLIFY-NEXT: br label [[FOR_BODY:%.*]] ; UNROLL-NOSIMPLIFY: for.body: -; UNROLL-NOSIMPLIFY-NEXT: [[TMP1:%.*]] = phi i64 [ [[TMP9:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; UNROLL-NOSIMPLIFY-NEXT: [[TMP2:%.*]] = phi i64 [ [[TMP7:%.*]], [[FOR_INC]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ] +; UNROLL-NOSIMPLIFY-NEXT: [[TMP1:%.*]] = phi i64 [ [[TMP9:%.*]], [[FOR_INC:%.*]] ], [ 0, [[SCALAR_PH]] ] +; UNROLL-NOSIMPLIFY-NEXT: [[TMP2:%.*]] = phi i64 [ [[TMP7:%.*]], [[FOR_INC]] ], [ 1000, [[SCALAR_PH]] ] ; UNROLL-NOSIMPLIFY-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr undef, i64 [[TMP1]] ; UNROLL-NOSIMPLIFY-NEXT: [[TMP4:%.*]] = load i8, ptr [[TMP3]], align 1 ; UNROLL-NOSIMPLIFY-NEXT: br i1 [[C]], label [[IF_THEN:%.*]], label [[FOR_INC]] @@ -580,12 +577,10 @@ define void @minimal_bit_widths_with_aliasing_store(i1 %c, ptr %ptr) { ; UNROLL-NOSIMPLIFY: middle.block: ; UNROLL-NOSIMPLIFY-NEXT: br label [[FOR_END:%.*]] ; UNROLL-NOSIMPLIFY: scalar.ph: -; UNROLL-NOSIMPLIFY-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] -; UNROLL-NOSIMPLIFY-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i64 [ 1000, [[ENTRY]] ] ; UNROLL-NOSIMPLIFY-NEXT: br label [[FOR_BODY:%.*]] ; UNROLL-NOSIMPLIFY: for.body: -; UNROLL-NOSIMPLIFY-NEXT: [[TMP1:%.*]] = phi i64 [ [[TMP9:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; UNROLL-NOSIMPLIFY-NEXT: [[TMP2:%.*]] = phi i64 [ 
[[TMP7:%.*]], [[FOR_INC]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ] +; UNROLL-NOSIMPLIFY-NEXT: [[TMP1:%.*]] = phi i64 [ [[TMP9:%.*]], [[FOR_INC:%.*]] ], [ 0, [[SCALAR_PH]] ] +; UNROLL-NOSIMPLIFY-NEXT: [[TMP2:%.*]] = phi i64 [ [[TMP7:%.*]], [[FOR_INC]] ], [ 1000, [[SCALAR_PH]] ] ; UNROLL-NOSIMPLIFY-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP1]] ; UNROLL-NOSIMPLIFY-NEXT: [[TMP4:%.*]] = load i8, ptr [[TMP3]], align 1 ; UNROLL-NOSIMPLIFY-NEXT: store i8 0, ptr [[TMP3]], align 1 diff --git a/llvm/test/Transforms/LoopVectorize/induction-multiple-uses-in-same-instruction.ll b/llvm/test/Transforms/LoopVectorize/induction-multiple-uses-in-same-instruction.ll index 9cff1cfae0b1..ff7594a5d3a8 100644 --- a/llvm/test/Transforms/LoopVectorize/induction-multiple-uses-in-same-instruction.ll +++ b/llvm/test/Transforms/LoopVectorize/induction-multiple-uses-in-same-instruction.ll @@ -26,10 +26,9 @@ define void @multiple_iv_uses_in_same_instruction(ptr %ptr) { ; CHECK: middle.block: ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [100 x [100 x i32]], ptr [[PTR]], i64 0, i64 [[IV]], i64 [[IV]] ; CHECK-NEXT: [[T:%.*]] = trunc i64 [[IV]] to i32 ; CHECK-NEXT: store i32 [[T]], ptr [[GEP]], align 4 diff --git a/llvm/test/Transforms/LoopVectorize/induction-step.ll b/llvm/test/Transforms/LoopVectorize/induction-step.ll index 59f6e8b04f54..0dab78039ea6 100644 --- a/llvm/test/Transforms/LoopVectorize/induction-step.ll +++ b/llvm/test/Transforms/LoopVectorize/induction-step.ll @@ -276,7 +276,7 @@ for.end: define void @iv_no_binary_op_in_descriptor(i1 %c, ptr %dst) { ; CHECK-LABEL: define void 
@iv_no_binary_op_in_descriptor( ; CHECK-SAME: i1 [[C:%.*]], ptr [[DST:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -292,10 +292,9 @@ define void @iv_no_binary_op_in_descriptor(i1 %c, ptr %dst) { ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] ; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT_P:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT_P:%.*]], %[[LOOP_LATCH:.*]] ] ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[IV]] ; CHECK-NEXT: store i64 [[IV]], ptr [[GEP]], align 8 ; CHECK-NEXT: [[IV_NEXT:%.*]] = add i64 [[IV]], 1 diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll index 77b91ccb913c..343facb2ef69 100644 --- a/llvm/test/Transforms/LoopVectorize/induction.ll +++ b/llvm/test/Transforms/LoopVectorize/induction.ll @@ -2747,12 +2747,10 @@ define i32 @i8_loop() nounwind readnone ssp uwtable { ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> [[TMP0]]) ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ 0, [[ENTRY]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[A_0:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[A_0_AND:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[B_0:%.*]] = phi i8 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[B_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[A_0:%.*]] = phi i32 [ 1, [[SCALAR_PH]] ], [ [[A_0_AND:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[B_0:%.*]] = phi i8 [ 0, [[SCALAR_PH]] 
], [ [[B_NEXT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[A_0_AND]] = and i32 [[A_0]], 4 ; CHECK-NEXT: [[B_NEXT]] = add i8 [[B_0]], -1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i8 [[B_NEXT]], 0 @@ -2818,12 +2816,10 @@ define i32 @i8_loop() nounwind readnone ssp uwtable { ; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> [[BIN_RDX]]) ; UNROLL-NO-IC-NEXT: br label [[EXIT:%.*]] ; UNROLL-NO-IC: scalar.ph: -; UNROLL-NO-IC-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ] -; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ 0, [[ENTRY]] ] ; UNROLL-NO-IC-NEXT: br label [[LOOP:%.*]] ; UNROLL-NO-IC: loop: -; UNROLL-NO-IC-NEXT: [[A_0:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[A_0_AND:%.*]], [[LOOP]] ] -; UNROLL-NO-IC-NEXT: [[B_0:%.*]] = phi i8 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[B_NEXT:%.*]], [[LOOP]] ] +; UNROLL-NO-IC-NEXT: [[A_0:%.*]] = phi i32 [ 1, [[SCALAR_PH]] ], [ [[A_0_AND:%.*]], [[LOOP]] ] +; UNROLL-NO-IC-NEXT: [[B_0:%.*]] = phi i8 [ 0, [[SCALAR_PH]] ], [ [[B_NEXT:%.*]], [[LOOP]] ] ; UNROLL-NO-IC-NEXT: [[A_0_AND]] = and i32 [[A_0]], 4 ; UNROLL-NO-IC-NEXT: [[B_NEXT]] = add i8 [[B_0]], -1 ; UNROLL-NO-IC-NEXT: [[EC:%.*]] = icmp eq i8 [[B_NEXT]], 0 @@ -2884,12 +2880,10 @@ define i32 @i16_loop() nounwind readnone ssp uwtable { ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> [[TMP0]]) ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 0, [[ENTRY]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[A_0:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[A_0_AND:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[B_0:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[B_0_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[A_0:%.*]] = phi i32 [ 1, [[SCALAR_PH]] ], [ [[A_0_AND:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[B_0:%.*]] = phi i16 [ 0, [[SCALAR_PH]] ], [ [[B_0_NEXT:%.*]], 
[[LOOP]] ] ; CHECK-NEXT: [[A_0_AND]] = and i32 [[A_0]], 4 ; CHECK-NEXT: [[B_0_NEXT]] = add i16 [[B_0]], -1 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i16 [[B_0_NEXT]], 0 @@ -2955,12 +2949,10 @@ define i32 @i16_loop() nounwind readnone ssp uwtable { ; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> [[BIN_RDX]]) ; UNROLL-NO-IC-NEXT: br label [[EXIT:%.*]] ; UNROLL-NO-IC: scalar.ph: -; UNROLL-NO-IC-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ] -; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 0, [[ENTRY]] ] ; UNROLL-NO-IC-NEXT: br label [[LOOP:%.*]] ; UNROLL-NO-IC: loop: -; UNROLL-NO-IC-NEXT: [[A_0:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[A_0_AND:%.*]], [[LOOP]] ] -; UNROLL-NO-IC-NEXT: [[B_0:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[B_0_NEXT:%.*]], [[LOOP]] ] +; UNROLL-NO-IC-NEXT: [[A_0:%.*]] = phi i32 [ 1, [[SCALAR_PH]] ], [ [[A_0_AND:%.*]], [[LOOP]] ] +; UNROLL-NO-IC-NEXT: [[B_0:%.*]] = phi i16 [ 0, [[SCALAR_PH]] ], [ [[B_0_NEXT:%.*]], [[LOOP]] ] ; UNROLL-NO-IC-NEXT: [[A_0_AND]] = and i32 [[A_0]], 4 ; UNROLL-NO-IC-NEXT: [[B_0_NEXT]] = add i16 [[B_0]], -1 ; UNROLL-NO-IC-NEXT: [[EC:%.*]] = icmp eq i16 [[B_0_NEXT]], 0 @@ -5017,12 +5009,10 @@ define i32 @PR32419(i32 %a, i16 %b) { ; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> [[TMP15]]) ; CHECK-NEXT: br label [[FOR_END:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ -20, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[A]], [[ENTRY]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[FOR_INC:%.*]] ] -; CHECK-NEXT: [[VAR0:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[VAR6:%.*]], [[FOR_INC]] ] +; CHECK-NEXT: [[I:%.*]] = phi i32 [ -20, [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[FOR_INC:%.*]] ] +; CHECK-NEXT: [[VAR0:%.*]] = phi i32 [ [[A]], [[SCALAR_PH]] ], [ 
[[VAR6:%.*]], [[FOR_INC]] ] ; CHECK-NEXT: [[VAR1:%.*]] = trunc i32 [[I]] to i16 ; CHECK-NEXT: [[VAR2:%.*]] = icmp eq i16 [[VAR1]], 0 ; CHECK-NEXT: br i1 [[VAR2]], label [[FOR_INC]], label [[FOR_COND:%.*]] @@ -5237,12 +5227,10 @@ define i32 @PR32419(i32 %a, i16 %b) { ; UNROLL-NO-IC-NEXT: [[TMP31:%.*]] = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> [[BIN_RDX]]) ; UNROLL-NO-IC-NEXT: br label [[FOR_END:%.*]] ; UNROLL-NO-IC: scalar.ph: -; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ -20, [[ENTRY:%.*]] ] -; UNROLL-NO-IC-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[A]], [[ENTRY]] ] ; UNROLL-NO-IC-NEXT: br label [[FOR_BODY:%.*]] ; UNROLL-NO-IC: for.body: -; UNROLL-NO-IC-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[FOR_INC:%.*]] ] -; UNROLL-NO-IC-NEXT: [[VAR0:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[VAR6:%.*]], [[FOR_INC]] ] +; UNROLL-NO-IC-NEXT: [[I:%.*]] = phi i32 [ -20, [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[FOR_INC:%.*]] ] +; UNROLL-NO-IC-NEXT: [[VAR0:%.*]] = phi i32 [ [[A]], [[SCALAR_PH]] ], [ [[VAR6:%.*]], [[FOR_INC]] ] ; UNROLL-NO-IC-NEXT: [[VAR1:%.*]] = trunc i32 [[I]] to i16 ; UNROLL-NO-IC-NEXT: [[VAR2:%.*]] = icmp eq i16 [[VAR1]], 0 ; UNROLL-NO-IC-NEXT: br i1 [[VAR2]], label [[FOR_INC]], label [[FOR_COND:%.*]] @@ -5833,14 +5821,11 @@ define void @pr52460_first_order_recurrence_truncated_iv(ptr noalias %src, ptr % ; CHECK: middle.block: ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 0, [[ENTRY]] ] -; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[ENTRY]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[TRUNC_IV:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[TRUNC_IV_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[RECUR:%.*]] = phi i32 [ 
[[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[IV_TRUNC:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TRUNC_IV:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[TRUNC_IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[RECUR:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[IV_TRUNC:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[LV:%.*]] = load i32, ptr [[SRC]], align 4 ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[LV]], [[RECUR]] ; CHECK-NEXT: [[TRUNC_IV_NEXT]] = add i32 [[TRUNC_IV]], 1 @@ -5955,14 +5940,11 @@ define void @pr52460_first_order_recurrence_truncated_iv(ptr noalias %src, ptr % ; UNROLL-NO-IC: middle.block: ; UNROLL-NO-IC-NEXT: br label [[EXIT:%.*]] ; UNROLL-NO-IC: scalar.ph: -; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] -; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 0, [[ENTRY]] ] -; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[ENTRY]] ] ; UNROLL-NO-IC-NEXT: br label [[LOOP:%.*]] ; UNROLL-NO-IC: loop: -; UNROLL-NO-IC-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] -; UNROLL-NO-IC-NEXT: [[TRUNC_IV:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[TRUNC_IV_NEXT:%.*]], [[LOOP]] ] -; UNROLL-NO-IC-NEXT: [[RECUR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[IV_TRUNC:%.*]], [[LOOP]] ] +; UNROLL-NO-IC-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; UNROLL-NO-IC-NEXT: [[TRUNC_IV:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[TRUNC_IV_NEXT:%.*]], [[LOOP]] ] +; UNROLL-NO-IC-NEXT: [[RECUR:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[IV_TRUNC:%.*]], [[LOOP]] ] ; UNROLL-NO-IC-NEXT: [[LV:%.*]] = load i32, ptr [[SRC]], align 4 ; UNROLL-NO-IC-NEXT: [[MUL:%.*]] = mul nsw i32 [[LV]], [[RECUR]] ; UNROLL-NO-IC-NEXT: [[TRUNC_IV_NEXT]] = add i32 [[TRUNC_IV]], 1 diff --git a/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll 
b/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll index 3330f2bfe661..acd10a57e0ce 100644 --- a/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll +++ b/llvm/test/Transforms/LoopVectorize/instruction-only-used-outside-of-loop.ll @@ -20,10 +20,9 @@ define i32 @one_direct_branch(ptr %src) { ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3 ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] ; CHECK-NEXT: [[SRC_GEP:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[IV]] ; CHECK-NEXT: [[LV:%.*]] = load i32, ptr [[SRC_GEP]], align 4 ; CHECK-NEXT: [[XOR:%.*]] = xor i32 25500, [[LV]] @@ -76,10 +75,9 @@ define i32 @two_direct_branch(ptr %src) { ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3 ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] ; CHECK-NEXT: [[SRC_GEP:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[IV]] ; CHECK-NEXT: [[LV:%.*]] = load i32, ptr [[SRC_GEP]], align 4 ; CHECK-NEXT: [[XOR:%.*]] = xor i32 25500, [[LV]] @@ -145,10 +143,9 @@ define i32 @cond_branch(i32 %a, ptr %src) { ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[PREDPHI]], i32 3 ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label 
[[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] ; CHECK-NEXT: [[SRC_GEP:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[IV]] ; CHECK-NEXT: [[LV:%.*]] = load i32, ptr [[SRC_GEP]], align 4 ; CHECK-NEXT: [[XOR:%.*]] = xor i32 25500, [[LV]] @@ -210,10 +207,9 @@ define i32 @optimizable_trunc_used_outside() { ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[VEC_IND]], i32 3 ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i32 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT_I_I:%.*]] = icmp eq i64 [[IV_NEXT]], 1000 diff --git a/llvm/test/Transforms/LoopVectorize/interleave-with-i65-induction.ll b/llvm/test/Transforms/LoopVectorize/interleave-with-i65-induction.ll index 8a4820949af1..3eb16e9a2d78 100644 --- a/llvm/test/Transforms/LoopVectorize/interleave-with-i65-induction.ll +++ b/llvm/test/Transforms/LoopVectorize/interleave-with-i65-induction.ll @@ -5,7 +5,7 @@ define void @i65_induction_with_negative_step(ptr %dst) { ; CHECK-LABEL: define void @i65_induction_with_negative_step( ; CHECK-SAME: ptr [[DST:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -34,14 +34,11 @@ define void @i65_induction_with_negative_step(ptr %dst) { ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: 
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i65 [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[IV_I65:%.*]] = phi i65 [ [[BC_RESUME_VAL2]], %[[SCALAR_PH]] ], [ [[IV_I65_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[FOR:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[TRUNC:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV_I65:%.*]] = phi i65 [ 0, %[[SCALAR_PH]] ], [ [[IV_I65_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[FOR:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[TRUNC:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[TRUNC]] = trunc i65 [[IV_I65]] to i64 ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TRUNC]] ; CHECK-NEXT: store i64 [[FOR]], ptr [[GEP]], align 8 diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-different-insert-position.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-different-insert-position.ll index 651210df823d..fa339f45fcdd 100644 --- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-different-insert-position.ll +++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-different-insert-position.ll @@ -6,7 +6,7 @@ target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128-Fn32" define void @gep_for_first_member_does_not_dominate_insert_point(ptr %str, ptr noalias %dst) { ; CHECK-LABEL: define void @gep_for_first_member_does_not_dominate_insert_point( ; CHECK-SAME: ptr [[STR:%.*]], ptr noalias [[DST:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -28,12 +28,10 @@ define 
void @gep_for_first_member_does_not_dominate_insert_point(ptr %str, ptr n ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[IV2:%.*]] = phi i64 [ [[BC_RESUME_VAL2]], %[[SCALAR_PH]] ], [ [[IV2_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV2:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV2_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[OR_1:%.*]] = or disjoint i64 [[IV2]], 1 ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr [[STR]], i64 [[OR_1]] ; CHECK-NEXT: [[TMP9:%.*]] = load i8, ptr [[GEP1]], align 1 diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-metadata.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-metadata.ll index 65148b0babcd..4fe7c97ccd66 100644 --- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-metadata.ll +++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-metadata.ll @@ -14,7 +14,7 @@ target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" define void @merge_tbaa_interleave_group(ptr nocapture readonly %p, ptr noalias %cp, i32 %i) ; CHECK-LABEL: define void @merge_tbaa_interleave_group( ; CHECK-SAME: ptr readonly captures(none) [[P:%.*]], ptr noalias [[CP:%.*]], i32 [[I:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -46,10 +46,9 @@ define void @merge_tbaa_interleave_group(ptr nocapture readonly %p, ptr noalias ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: 
[[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_VEC4R]], ptr [[P]], i64 [[IV]], i32 0 ; CHECK-NEXT: [[TMP19:%.*]] = load double, ptr [[X]], align 8, !tbaa [[TBAA0]] ; CHECK-NEXT: [[MUL:%.*]] = fmul double [[TMP19]], 2.000000e+00 diff --git a/llvm/test/Transforms/LoopVectorize/is_fpclass.ll b/llvm/test/Transforms/LoopVectorize/is_fpclass.ll index 6eeeace80aa8..42f6c0532caa 100644 --- a/llvm/test/Transforms/LoopVectorize/is_fpclass.ll +++ b/llvm/test/Transforms/LoopVectorize/is_fpclass.ll @@ -22,10 +22,9 @@ define void @d() { ; CHECK: middle.block: ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I7:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[I7:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[I3:%.*]] = load float, ptr null, align 4 ; CHECK-NEXT: [[I4:%.*]] = getelementptr float, ptr @d, i64 [[I]] ; CHECK-NEXT: [[I5:%.*]] = tail call i1 @llvm.is.fpclass.f32(float [[I3]], i32 0) diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll index 1ad1094fe236..615f50124b41 100644 --- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll +++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll @@ -6,7 +6,7 @@ define i64 @select_decreasing_induction_icmp_const_start(ptr %a) { ; IC1VF4-LABEL: define i64 @select_decreasing_induction_icmp_const_start( ; IC1VF4-SAME: ptr [[A:%.*]]) { -; IC1VF4-NEXT: [[ENTRY:.*]]: +; IC1VF4-NEXT: [[ENTRY:.*:]] ; 
IC1VF4-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; IC1VF4: [[VECTOR_PH]]: ; IC1VF4-NEXT: br label %[[VECTOR_BODY:.*]] @@ -32,12 +32,10 @@ define i64 @select_decreasing_induction_icmp_const_start(ptr %a) { ; IC1VF4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP6]], i64 331 ; IC1VF4-NEXT: br label %[[EXIT:.*]] ; IC1VF4: [[SCALAR_PH]]: -; IC1VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 19999, %[[ENTRY]] ] -; IC1VF4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 331, %[[ENTRY]] ] ; IC1VF4-NEXT: br label %[[LOOP:.*]] ; IC1VF4: [[LOOP]]: -; IC1VF4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; IC1VF4-NEXT: [[RDX:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[LOOP]] ] +; IC1VF4-NEXT: [[IV:%.*]] = phi i64 [ 19999, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; IC1VF4-NEXT: [[RDX:%.*]] = phi i64 [ 331, %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[LOOP]] ] ; IC1VF4-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] ; IC1VF4-NEXT: [[LD_A:%.*]] = load i64, ptr [[GEP_A_IV]], align 8 ; IC1VF4-NEXT: [[CMP_A_3:%.*]] = icmp sgt i64 [[LD_A]], 3 @@ -51,7 +49,7 @@ define i64 @select_decreasing_induction_icmp_const_start(ptr %a) { ; ; IC4VF4-LABEL: define i64 @select_decreasing_induction_icmp_const_start( ; IC4VF4-SAME: ptr [[A:%.*]]) { -; IC4VF4-NEXT: [[ENTRY:.*]]: +; IC4VF4-NEXT: [[ENTRY:.*:]] ; IC4VF4-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; IC4VF4: [[VECTOR_PH]]: ; IC4VF4-NEXT: br label %[[VECTOR_BODY:.*]] @@ -104,12 +102,10 @@ define i64 @select_decreasing_induction_icmp_const_start(ptr %a) { ; IC4VF4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP18]], i64 331 ; IC4VF4-NEXT: br label %[[EXIT:.*]] ; IC4VF4: [[SCALAR_PH]]: -; IC4VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 19999, %[[ENTRY]] ] -; IC4VF4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 331, %[[ENTRY]] ] ; IC4VF4-NEXT: br label 
%[[LOOP:.*]] ; IC4VF4: [[LOOP]]: -; IC4VF4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; IC4VF4-NEXT: [[RDX:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[LOOP]] ] +; IC4VF4-NEXT: [[IV:%.*]] = phi i64 [ 19999, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; IC4VF4-NEXT: [[RDX:%.*]] = phi i64 [ 331, %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[LOOP]] ] ; IC4VF4-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] ; IC4VF4-NEXT: [[LD_A:%.*]] = load i64, ptr [[GEP_A_IV]], align 8 ; IC4VF4-NEXT: [[CMP_A_3:%.*]] = icmp sgt i64 [[LD_A]], 3 @@ -123,7 +119,7 @@ define i64 @select_decreasing_induction_icmp_const_start(ptr %a) { ; ; IC4VF1-LABEL: define i64 @select_decreasing_induction_icmp_const_start( ; IC4VF1-SAME: ptr [[A:%.*]]) { -; IC4VF1-NEXT: [[ENTRY:.*]]: +; IC4VF1-NEXT: [[ENTRY:.*:]] ; IC4VF1-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; IC4VF1: [[VECTOR_PH]]: ; IC4VF1-NEXT: br label %[[VECTOR_BODY:.*]] @@ -164,12 +160,10 @@ define i64 @select_decreasing_induction_icmp_const_start(ptr %a) { ; IC4VF1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[RDX_MINMAX5]], i64 331 ; IC4VF1-NEXT: br label %[[EXIT:.*]] ; IC4VF1: [[SCALAR_PH]]: -; IC4VF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 19999, %[[ENTRY]] ] -; IC4VF1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 331, %[[ENTRY]] ] ; IC4VF1-NEXT: br label %[[LOOP:.*]] ; IC4VF1: [[LOOP]]: -; IC4VF1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; IC4VF1-NEXT: [[RDX:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[LOOP]] ] +; IC4VF1-NEXT: [[IV:%.*]] = phi i64 [ 19999, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; IC4VF1-NEXT: [[RDX:%.*]] = phi i64 [ 331, %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[LOOP]] ] ; IC4VF1-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] ; 
IC4VF1-NEXT: [[LD_A:%.*]] = load i64, ptr [[GEP_A_IV]], align 8 ; IC4VF1-NEXT: [[CMP_A_3:%.*]] = icmp sgt i64 [[LD_A]], 3 @@ -204,7 +198,7 @@ exit: ; preds = %loop define i16 @select_decreasing_induction_icmp_table_i16(i16 noundef %val) { ; IC1VF4-LABEL: define i16 @select_decreasing_induction_icmp_table_i16( ; IC1VF4-SAME: i16 noundef [[VAL:%.*]]) { -; IC1VF4-NEXT: [[ENTRY:.*]]: +; IC1VF4-NEXT: [[ENTRY:.*:]] ; IC1VF4-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; IC1VF4: [[VECTOR_PH]]: ; IC1VF4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[VAL]], i64 0 @@ -234,12 +228,10 @@ define i16 @select_decreasing_induction_icmp_table_i16(i16 noundef %val) { ; IC1VF4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i16 [[TMP7]], i16 0 ; IC1VF4-NEXT: br label %[[EXIT:.*]] ; IC1VF4: [[SCALAR_PH]]: -; IC1VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 12, %[[ENTRY]] ] -; IC1VF4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i16 [ 0, %[[ENTRY]] ] ; IC1VF4-NEXT: br label %[[LOOP:.*]] ; IC1VF4: [[LOOP]]: -; IC1VF4-NEXT: [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; IC1VF4-NEXT: [[RDX:%.*]] = phi i16 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[LOOP]] ] +; IC1VF4-NEXT: [[IV:%.*]] = phi i16 [ 12, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; IC1VF4-NEXT: [[RDX:%.*]] = phi i16 [ 0, %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[LOOP]] ] ; IC1VF4-NEXT: [[GEP_TABLE_IV:%.*]] = getelementptr inbounds [13 x i16], ptr @table, i16 0, i16 [[IV]] ; IC1VF4-NEXT: [[LD_TABLE:%.*]] = load i16, ptr [[GEP_TABLE_IV]], align 1 ; IC1VF4-NEXT: [[CMP_TABLE_VAL:%.*]] = icmp ugt i16 [[LD_TABLE]], [[VAL]] @@ -486,7 +478,7 @@ define i16 @select_decreasing_induction_icmp_table_i16(i16 noundef %val) { ; ; IC4VF1-LABEL: define i16 @select_decreasing_induction_icmp_table_i16( ; IC4VF1-SAME: i16 noundef [[VAL:%.*]]) { -; IC4VF1-NEXT: [[ENTRY:.*]]: +; IC4VF1-NEXT: [[ENTRY:.*:]] ; IC4VF1-NEXT: br i1 
false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; IC4VF1: [[VECTOR_PH]]: ; IC4VF1-NEXT: br label %[[VECTOR_BODY:.*]] @@ -532,12 +524,10 @@ define i16 @select_decreasing_induction_icmp_table_i16(i16 noundef %val) { ; IC4VF1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i16 [[RDX_MINMAX5]], i16 0 ; IC4VF1-NEXT: br label %[[EXIT:.*]] ; IC4VF1: [[SCALAR_PH]]: -; IC4VF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 12, %[[ENTRY]] ] -; IC4VF1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i16 [ 0, %[[ENTRY]] ] ; IC4VF1-NEXT: br label %[[LOOP:.*]] ; IC4VF1: [[LOOP]]: -; IC4VF1-NEXT: [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; IC4VF1-NEXT: [[RDX:%.*]] = phi i16 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[LOOP]] ] +; IC4VF1-NEXT: [[IV:%.*]] = phi i16 [ 12, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; IC4VF1-NEXT: [[RDX:%.*]] = phi i16 [ 0, %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[LOOP]] ] ; IC4VF1-NEXT: [[GEP_TABLE_IV:%.*]] = getelementptr inbounds [13 x i16], ptr @table, i16 0, i16 [[IV]] ; IC4VF1-NEXT: [[LD_TABLE:%.*]] = load i16, ptr [[GEP_TABLE_IV]], align 1 ; IC4VF1-NEXT: [[CMP_TABLE_VAL:%.*]] = icmp ugt i16 [[LD_TABLE]], [[VAL]] @@ -573,7 +563,7 @@ exit: ; preds = %loop define i16 @select_decreasing_induction_icmp_table_half(half noundef %val) { ; IC1VF4-LABEL: define i16 @select_decreasing_induction_icmp_table_half( ; IC1VF4-SAME: half noundef [[VAL:%.*]]) { -; IC1VF4-NEXT: [[ENTRY:.*]]: +; IC1VF4-NEXT: [[ENTRY:.*:]] ; IC1VF4-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; IC1VF4: [[VECTOR_PH]]: ; IC1VF4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x half> poison, half [[VAL]], i64 0 @@ -603,12 +593,10 @@ define i16 @select_decreasing_induction_icmp_table_half(half noundef %val) { ; IC1VF4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i16 [[TMP7]], i16 0 ; IC1VF4-NEXT: br label %[[EXIT:.*]] ; IC1VF4: [[SCALAR_PH]]: -; IC1VF4-NEXT: 
[[BC_RESUME_VAL:%.*]] = phi i16 [ 12, %[[ENTRY]] ] -; IC1VF4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i16 [ 0, %[[ENTRY]] ] ; IC1VF4-NEXT: br label %[[LOOP:.*]] ; IC1VF4: [[LOOP]]: -; IC1VF4-NEXT: [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; IC1VF4-NEXT: [[RDX:%.*]] = phi i16 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[LOOP]] ] +; IC1VF4-NEXT: [[IV:%.*]] = phi i16 [ 12, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; IC1VF4-NEXT: [[RDX:%.*]] = phi i16 [ 0, %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[LOOP]] ] ; IC1VF4-NEXT: [[GEP_TABLE_IV:%.*]] = getelementptr inbounds [13 x i16], ptr @table, i16 0, i16 [[IV]] ; IC1VF4-NEXT: [[LD_TABLE:%.*]] = load half, ptr [[GEP_TABLE_IV]], align 1 ; IC1VF4-NEXT: [[CMP_TABLE_VAL:%.*]] = fcmp ugt half [[LD_TABLE]], [[VAL]] @@ -855,7 +843,7 @@ define i16 @select_decreasing_induction_icmp_table_half(half noundef %val) { ; ; IC4VF1-LABEL: define i16 @select_decreasing_induction_icmp_table_half( ; IC4VF1-SAME: half noundef [[VAL:%.*]]) { -; IC4VF1-NEXT: [[ENTRY:.*]]: +; IC4VF1-NEXT: [[ENTRY:.*:]] ; IC4VF1-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; IC4VF1: [[VECTOR_PH]]: ; IC4VF1-NEXT: br label %[[VECTOR_BODY:.*]] @@ -901,12 +889,10 @@ define i16 @select_decreasing_induction_icmp_table_half(half noundef %val) { ; IC4VF1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i16 [[RDX_MINMAX5]], i16 0 ; IC4VF1-NEXT: br label %[[EXIT:.*]] ; IC4VF1: [[SCALAR_PH]]: -; IC4VF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 12, %[[ENTRY]] ] -; IC4VF1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i16 [ 0, %[[ENTRY]] ] ; IC4VF1-NEXT: br label %[[LOOP:.*]] ; IC4VF1: [[LOOP]]: -; IC4VF1-NEXT: [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; IC4VF1-NEXT: [[RDX:%.*]] = phi i16 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[LOOP]] ] +; IC4VF1-NEXT: [[IV:%.*]] = phi i16 [ 12, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], 
%[[LOOP]] ] +; IC4VF1-NEXT: [[RDX:%.*]] = phi i16 [ 0, %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[LOOP]] ] ; IC4VF1-NEXT: [[GEP_TABLE_IV:%.*]] = getelementptr inbounds [13 x i16], ptr @table, i16 0, i16 [[IV]] ; IC4VF1-NEXT: [[LD_TABLE:%.*]] = load half, ptr [[GEP_TABLE_IV]], align 1 ; IC4VF1-NEXT: [[CMP_TABLE_VAL:%.*]] = fcmp ugt half [[LD_TABLE]], [[VAL]] @@ -943,7 +929,7 @@ exit: ; preds = %loop define i64 @select_decreasing_induction_icmp_iv_unsigned(ptr %a) { ; IC1VF4-LABEL: define i64 @select_decreasing_induction_icmp_iv_unsigned( ; IC1VF4-SAME: ptr [[A:%.*]]) { -; IC1VF4-NEXT: [[ENTRY:.*]]: +; IC1VF4-NEXT: [[ENTRY:.*:]] ; IC1VF4-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; IC1VF4: [[VECTOR_PH]]: ; IC1VF4-NEXT: br label %[[VECTOR_BODY:.*]] @@ -969,12 +955,10 @@ define i64 @select_decreasing_induction_icmp_iv_unsigned(ptr %a) { ; IC1VF4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP6]], i64 331 ; IC1VF4-NEXT: br label %[[EXIT:.*]] ; IC1VF4: [[SCALAR_PH]]: -; IC1VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 9223372036854775807, %[[ENTRY]] ] -; IC1VF4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 331, %[[ENTRY]] ] ; IC1VF4-NEXT: br label %[[LOOP:.*]] ; IC1VF4: [[LOOP]]: -; IC1VF4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; IC1VF4-NEXT: [[RDX:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[LOOP]] ] +; IC1VF4-NEXT: [[IV:%.*]] = phi i64 [ 9223372036854775807, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; IC1VF4-NEXT: [[RDX:%.*]] = phi i64 [ 331, %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[LOOP]] ] ; IC1VF4-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] ; IC1VF4-NEXT: [[LD_A:%.*]] = load i64, ptr [[GEP_A_IV]], align 8 ; IC1VF4-NEXT: [[CMP_A_3:%.*]] = icmp sgt i64 [[LD_A]], 3 @@ -988,7 +972,7 @@ define i64 @select_decreasing_induction_icmp_iv_unsigned(ptr %a) { ; ; IC4VF4-LABEL: define i64 
@select_decreasing_induction_icmp_iv_unsigned( ; IC4VF4-SAME: ptr [[A:%.*]]) { -; IC4VF4-NEXT: [[ENTRY:.*]]: +; IC4VF4-NEXT: [[ENTRY:.*:]] ; IC4VF4-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; IC4VF4: [[VECTOR_PH]]: ; IC4VF4-NEXT: br label %[[VECTOR_BODY:.*]] @@ -1041,12 +1025,10 @@ define i64 @select_decreasing_induction_icmp_iv_unsigned(ptr %a) { ; IC4VF4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP18]], i64 331 ; IC4VF4-NEXT: br label %[[EXIT:.*]] ; IC4VF4: [[SCALAR_PH]]: -; IC4VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 9223372036854775807, %[[ENTRY]] ] -; IC4VF4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 331, %[[ENTRY]] ] ; IC4VF4-NEXT: br label %[[LOOP:.*]] ; IC4VF4: [[LOOP]]: -; IC4VF4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; IC4VF4-NEXT: [[RDX:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[LOOP]] ] +; IC4VF4-NEXT: [[IV:%.*]] = phi i64 [ 9223372036854775807, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; IC4VF4-NEXT: [[RDX:%.*]] = phi i64 [ 331, %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[LOOP]] ] ; IC4VF4-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] ; IC4VF4-NEXT: [[LD_A:%.*]] = load i64, ptr [[GEP_A_IV]], align 8 ; IC4VF4-NEXT: [[CMP_A_3:%.*]] = icmp sgt i64 [[LD_A]], 3 @@ -1060,7 +1042,7 @@ define i64 @select_decreasing_induction_icmp_iv_unsigned(ptr %a) { ; ; IC4VF1-LABEL: define i64 @select_decreasing_induction_icmp_iv_unsigned( ; IC4VF1-SAME: ptr [[A:%.*]]) { -; IC4VF1-NEXT: [[ENTRY:.*]]: +; IC4VF1-NEXT: [[ENTRY:.*:]] ; IC4VF1-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; IC4VF1: [[VECTOR_PH]]: ; IC4VF1-NEXT: br label %[[VECTOR_BODY:.*]] @@ -1101,12 +1083,10 @@ define i64 @select_decreasing_induction_icmp_iv_unsigned(ptr %a) { ; IC4VF1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[RDX_MINMAX5]], i64 331 ; IC4VF1-NEXT: br label %[[EXIT:.*]] ; 
IC4VF1: [[SCALAR_PH]]: -; IC4VF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 9223372036854775807, %[[ENTRY]] ] -; IC4VF1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 331, %[[ENTRY]] ] ; IC4VF1-NEXT: br label %[[LOOP:.*]] ; IC4VF1: [[LOOP]]: -; IC4VF1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; IC4VF1-NEXT: [[RDX:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[LOOP]] ] +; IC4VF1-NEXT: [[IV:%.*]] = phi i64 [ 9223372036854775807, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; IC4VF1-NEXT: [[RDX:%.*]] = phi i64 [ 331, %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[LOOP]] ] ; IC4VF1-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] ; IC4VF1-NEXT: [[LD_A:%.*]] = load i64, ptr [[GEP_A_IV]], align 8 ; IC4VF1-NEXT: [[CMP_A_3:%.*]] = icmp sgt i64 [[LD_A]], 3 diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll index 1054482fb80d..80c5bb359cb4 100644 --- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll +++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-trunc.ll @@ -240,7 +240,7 @@ exit: ; preds = %for.body, %entry define i32 @select_icmp_const_truncated_iv_const_exit(ptr %a) { ; CHECK-VF4IC1-LABEL: define i32 @select_icmp_const_truncated_iv_const_exit( ; CHECK-VF4IC1-SAME: ptr [[A:%.*]]) { -; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: [[ENTRY:.*:]] ; CHECK-VF4IC1-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK-VF4IC1: [[VECTOR_PH]]: ; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] @@ -262,12 +262,10 @@ define i32 @select_icmp_const_truncated_iv_const_exit(ptr %a) { ; CHECK-VF4IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[TMP6]], i32 331 ; CHECK-VF4IC1-NEXT: br label %[[EXIT:.*]] ; CHECK-VF4IC1: [[SCALAR_PH]]: -; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] -; CHECK-VF4IC1-NEXT: 
[[BC_MERGE_RDX:%.*]] = phi i32 [ 331, %[[ENTRY]] ] ; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF4IC1: [[FOR_BODY]]: -; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ 331, %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] ; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] ; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 ; CHECK-VF4IC1-NEXT: [[CMP:%.*]] = icmp sgt i64 [[TMP7]], 3 @@ -282,7 +280,7 @@ define i32 @select_icmp_const_truncated_iv_const_exit(ptr %a) { ; ; CHECK-VF4IC4-LABEL: define i32 @select_icmp_const_truncated_iv_const_exit( ; CHECK-VF4IC4-SAME: ptr [[A:%.*]]) { -; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC4-NEXT: [[ENTRY:.*:]] ; CHECK-VF4IC4-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK-VF4IC4: [[VECTOR_PH]]: ; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] @@ -325,12 +323,10 @@ define i32 @select_icmp_const_truncated_iv_const_exit(ptr %a) { ; CHECK-VF4IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[TMP15]], i32 331 ; CHECK-VF4IC4-NEXT: br label %[[EXIT:.*]] ; CHECK-VF4IC4: [[SCALAR_PH]]: -; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] -; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 331, %[[ENTRY]] ] ; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF4IC4: [[FOR_BODY]]: -; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] 
], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ 331, %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] ; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] ; CHECK-VF4IC4-NEXT: [[TMP16:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 ; CHECK-VF4IC4-NEXT: [[CMP:%.*]] = icmp sgt i64 [[TMP16]], 3 @@ -345,7 +341,7 @@ define i32 @select_icmp_const_truncated_iv_const_exit(ptr %a) { ; ; CHECK-VF1IC4-LABEL: define i32 @select_icmp_const_truncated_iv_const_exit( ; CHECK-VF1IC4-SAME: ptr [[A:%.*]]) { -; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF1IC4-NEXT: [[ENTRY:.*:]] ; CHECK-VF1IC4-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK-VF1IC4: [[VECTOR_PH]]: ; CHECK-VF1IC4-NEXT: br label %[[VECTOR_BODY:.*]] @@ -389,12 +385,10 @@ define i32 @select_icmp_const_truncated_iv_const_exit(ptr %a) { ; CHECK-VF1IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[RDX_MINMAX5]], i32 331 ; CHECK-VF1IC4-NEXT: br label %[[EXIT:.*]] ; CHECK-VF1IC4: [[SCALAR_PH]]: -; CHECK-VF1IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] -; CHECK-VF1IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 331, %[[ENTRY]] ] ; CHECK-VF1IC4-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF1IC4: [[FOR_BODY]]: -; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-VF1IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[RDX:%.*]] = phi i32 [ 331, %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] ; CHECK-VF1IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] ; CHECK-VF1IC4-NEXT: [[TMP26:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 ; CHECK-VF1IC4-NEXT: [[CMP:%.*]] = icmp sgt i64 [[TMP26]], 3 @@ -431,7 +425,7 @@ exit: ; preds = %for.body 
define i32 @select_fcmp_max_valid_const_ub(ptr %a) { ; CHECK-VF4IC1-LABEL: define i32 @select_fcmp_max_valid_const_ub( ; CHECK-VF4IC1-SAME: ptr [[A:%.*]]) { -; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: [[ENTRY:.*:]] ; CHECK-VF4IC1-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK-VF4IC1: [[VECTOR_PH]]: ; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] @@ -453,12 +447,10 @@ define i32 @select_fcmp_max_valid_const_ub(ptr %a) { ; CHECK-VF4IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[TMP6]], i32 -1 ; CHECK-VF4IC1-NEXT: br label %[[EXIT:.*]] ; CHECK-VF4IC1: [[SCALAR_PH]]: -; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] -; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ -1, %[[ENTRY]] ] ; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF4IC1: [[FOR_BODY]]: -; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ -1, %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] ; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] ; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX]], align 4 ; CHECK-VF4IC1-NEXT: [[CMP:%.*]] = fcmp fast olt float [[TMP7]], 0.000000e+00 @@ -473,7 +465,7 @@ define i32 @select_fcmp_max_valid_const_ub(ptr %a) { ; ; CHECK-VF4IC4-LABEL: define i32 @select_fcmp_max_valid_const_ub( ; CHECK-VF4IC4-SAME: ptr [[A:%.*]]) { -; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC4-NEXT: [[ENTRY:.*:]] ; CHECK-VF4IC4-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK-VF4IC4: [[VECTOR_PH]]: ; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] @@ -516,12 +508,10 @@ define i32 
@select_fcmp_max_valid_const_ub(ptr %a) { ; CHECK-VF4IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[TMP15]], i32 -1 ; CHECK-VF4IC4-NEXT: br label %[[EXIT:.*]] ; CHECK-VF4IC4: [[SCALAR_PH]]: -; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] -; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ -1, %[[ENTRY]] ] ; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF4IC4: [[FOR_BODY]]: -; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ -1, %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] ; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] ; CHECK-VF4IC4-NEXT: [[TMP16:%.*]] = load float, ptr [[ARRAYIDX]], align 4 ; CHECK-VF4IC4-NEXT: [[CMP:%.*]] = fcmp fast olt float [[TMP16]], 0.000000e+00 @@ -536,7 +526,7 @@ define i32 @select_fcmp_max_valid_const_ub(ptr %a) { ; ; CHECK-VF1IC4-LABEL: define i32 @select_fcmp_max_valid_const_ub( ; CHECK-VF1IC4-SAME: ptr [[A:%.*]]) { -; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF1IC4-NEXT: [[ENTRY:.*:]] ; CHECK-VF1IC4-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK-VF1IC4: [[VECTOR_PH]]: ; CHECK-VF1IC4-NEXT: br label %[[VECTOR_BODY:.*]] @@ -580,12 +570,10 @@ define i32 @select_fcmp_max_valid_const_ub(ptr %a) { ; CHECK-VF1IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[RDX_MINMAX5]], i32 -1 ; CHECK-VF1IC4-NEXT: br label %[[EXIT:.*]] ; CHECK-VF1IC4: [[SCALAR_PH]]: -; CHECK-VF1IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] -; CHECK-VF1IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ -1, %[[ENTRY]] ] ; CHECK-VF1IC4-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF1IC4: [[FOR_BODY]]: 
-; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-VF1IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[RDX:%.*]] = phi i32 [ -1, %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] ; CHECK-VF1IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[IV]] ; CHECK-VF1IC4-NEXT: [[TMP26:%.*]] = load float, ptr [[ARRAYIDX]], align 4 ; CHECK-VF1IC4-NEXT: [[CMP:%.*]] = fcmp fast olt float [[TMP26]], 0.000000e+00 @@ -626,7 +614,7 @@ exit: ; preds = %for.body define i32 @select_icmp_truncated_unsigned_iv_range(ptr %a) { ; CHECK-VF4IC1-LABEL: define i32 @select_icmp_truncated_unsigned_iv_range( ; CHECK-VF4IC1-SAME: ptr [[A:%.*]]) { -; CHECK-VF4IC1-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC1-NEXT: [[ENTRY:.*:]] ; CHECK-VF4IC1-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK-VF4IC1: [[VECTOR_PH]]: ; CHECK-VF4IC1-NEXT: br label %[[VECTOR_BODY:.*]] @@ -649,12 +637,10 @@ define i32 @select_icmp_truncated_unsigned_iv_range(ptr %a) { ; CHECK-VF4IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[TMP5]], i32 331 ; CHECK-VF4IC1-NEXT: br label %[[EXIT:.*]] ; CHECK-VF4IC1: [[SCALAR_PH]]: -; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 2147483646, %[[ENTRY]] ] -; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 331, %[[ENTRY]] ] ; CHECK-VF4IC1-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF4IC1: [[FOR_BODY]]: -; CHECK-VF4IC1-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC1-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[IV1:%.*]] = phi i64 [ 2147483646, %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC1-NEXT: [[RDX:%.*]] 
= phi i32 [ 331, %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] ; CHECK-VF4IC1-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV1]] ; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 ; CHECK-VF4IC1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP0]], 3 @@ -669,7 +655,7 @@ define i32 @select_icmp_truncated_unsigned_iv_range(ptr %a) { ; ; CHECK-VF4IC4-LABEL: define i32 @select_icmp_truncated_unsigned_iv_range( ; CHECK-VF4IC4-SAME: ptr [[A:%.*]]) { -; CHECK-VF4IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF4IC4-NEXT: [[ENTRY:.*:]] ; CHECK-VF4IC4-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK-VF4IC4: [[VECTOR_PH]]: ; CHECK-VF4IC4-NEXT: br label %[[VECTOR_BODY:.*]] @@ -713,12 +699,10 @@ define i32 @select_icmp_truncated_unsigned_iv_range(ptr %a) { ; CHECK-VF4IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[TMP14]], i32 331 ; CHECK-VF4IC4-NEXT: br label %[[EXIT:.*]] ; CHECK-VF4IC4: [[SCALAR_PH]]: -; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 2147483646, %[[ENTRY]] ] -; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 331, %[[ENTRY]] ] ; CHECK-VF4IC4-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF4IC4: [[FOR_BODY]]: -; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ 2147483646, %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF4IC4-NEXT: [[RDX:%.*]] = phi i32 [ 331, %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] ; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] ; CHECK-VF4IC4-NEXT: [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; CHECK-VF4IC4-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP15]], 3 @@ -733,7 +717,7 @@ define i32 @select_icmp_truncated_unsigned_iv_range(ptr %a) { ; ; CHECK-VF1IC4-LABEL: 
define i32 @select_icmp_truncated_unsigned_iv_range( ; CHECK-VF1IC4-SAME: ptr [[A:%.*]]) { -; CHECK-VF1IC4-NEXT: [[ENTRY:.*]]: +; CHECK-VF1IC4-NEXT: [[ENTRY:.*:]] ; CHECK-VF1IC4-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK-VF1IC4: [[VECTOR_PH]]: ; CHECK-VF1IC4-NEXT: br label %[[VECTOR_BODY:.*]] @@ -779,12 +763,10 @@ define i32 @select_icmp_truncated_unsigned_iv_range(ptr %a) { ; CHECK-VF1IC4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[RDX_MINMAX6]], i32 331 ; CHECK-VF1IC4-NEXT: br label %[[EXIT:.*]] ; CHECK-VF1IC4: [[SCALAR_PH]]: -; CHECK-VF1IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 2147483646, %[[ENTRY]] ] -; CHECK-VF1IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 331, %[[ENTRY]] ] ; CHECK-VF1IC4-NEXT: br label %[[FOR_BODY:.*]] ; CHECK-VF1IC4: [[FOR_BODY]]: -; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-VF1IC4-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[IV:%.*]] = phi i64 [ 2147483646, %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-VF1IC4-NEXT: [[RDX:%.*]] = phi i32 [ 331, %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[FOR_BODY]] ] ; CHECK-VF1IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] ; CHECK-VF1IC4-NEXT: [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; CHECK-VF1IC4-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP24]], 3 diff --git a/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll b/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll index 97d33858bd83..766e7acdfd1c 100644 --- a/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll +++ b/llvm/test/Transforms/LoopVectorize/iv_outside_user.ll @@ -91,7 +91,7 @@ for.end: define i32 @constpre() { ; CHECK-LABEL: define i32 @constpre() { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; 
CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -103,10 +103,9 @@ define i32 @constpre() { ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[FOR_END:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 32, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[FOR_BODY:.*]] ; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[INC_PHI:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[INC_PHI:%.*]] = phi i32 [ 32, %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] ; CHECK-NEXT: [[INC]] = sub nsw i32 [[INC_PHI]], 2 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[INC]], 0 ; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_END]], label %[[FOR_BODY]], {{!llvm.loop ![0-9]+}} @@ -130,7 +129,7 @@ for.end: define ptr @geppre(ptr %ptr) { ; CHECK-LABEL: define ptr @geppre( ; CHECK-SAME: ptr [[PTR:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[PTR]], i64 512 @@ -144,12 +143,10 @@ define ptr @geppre(ptr %ptr) { ; CHECK-NEXT: [[IND_ESCAPE:%.*]] = getelementptr i8, ptr [[TMP0]], i64 -16 ; CHECK-NEXT: br label %[[FOR_END:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi ptr [ [[PTR]], %[[ENTRY]] ] ; CHECK-NEXT: br label %[[FOR_BODY:.*]] ; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[INC_PHI:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[PTR_PHI:%.*]] = phi ptr [ [[BC_RESUME_VAL1]], %[[SCALAR_PH]] ], [ [[INC_PTR:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[INC_PHI:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[PTR_PHI:%.*]] = phi ptr [ [[PTR]], %[[SCALAR_PH]] ], [ [[INC_PTR:%.*]], %[[FOR_BODY]] ] ; CHECK-NEXT: [[INC]] = add nsw i32 [[INC_PHI]], 1 ; CHECK-NEXT: [[INC_PTR]] = getelementptr 
i32, ptr [[PTR_PHI]], i32 4 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[INC]], 32 @@ -399,7 +396,7 @@ BB4: define i64 @iv_scalar_steps_and_outside_users(ptr %ptr) { ; VEC-LABEL: define i64 @iv_scalar_steps_and_outside_users( ; VEC-SAME: ptr [[PTR:%.*]]) { -; VEC-NEXT: [[ENTRY:.*]]: +; VEC-NEXT: [[ENTRY:.*:]] ; VEC-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; VEC: [[VECTOR_PH]]: ; VEC-NEXT: br label %[[VECTOR_BODY:.*]] @@ -415,10 +412,9 @@ define i64 @iv_scalar_steps_and_outside_users(ptr %ptr) { ; VEC: [[MIDDLE_BLOCK]]: ; VEC-NEXT: br label %[[EXIT:.*]] ; VEC: [[SCALAR_PH]]: -; VEC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; VEC-NEXT: br label %[[LOOP:.*]] ; VEC: [[LOOP]]: -; VEC-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; VEC-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; VEC-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 1 ; VEC-NEXT: [[GEP_PTR:%.*]] = getelementptr inbounds i64, ptr [[PTR]], i64 [[IV]] ; VEC-NEXT: store i64 [[IV]], ptr [[GEP_PTR]], align 4 @@ -430,7 +426,7 @@ define i64 @iv_scalar_steps_and_outside_users(ptr %ptr) { ; ; INTERLEAVE-LABEL: define i64 @iv_scalar_steps_and_outside_users( ; INTERLEAVE-SAME: ptr [[PTR:%.*]]) { -; INTERLEAVE-NEXT: [[ENTRY:.*]]: +; INTERLEAVE-NEXT: [[ENTRY:.*:]] ; INTERLEAVE-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; INTERLEAVE: [[VECTOR_PH]]: ; INTERLEAVE-NEXT: br label %[[VECTOR_BODY:.*]] @@ -447,10 +443,9 @@ define i64 @iv_scalar_steps_and_outside_users(ptr %ptr) { ; INTERLEAVE: [[MIDDLE_BLOCK]]: ; INTERLEAVE-NEXT: br label %[[EXIT:.*]] ; INTERLEAVE: [[SCALAR_PH]]: -; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; INTERLEAVE-NEXT: br label %[[LOOP:.*]] ; INTERLEAVE: [[LOOP]]: -; INTERLEAVE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; INTERLEAVE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ 
[[IV_NEXT:%.*]], %[[LOOP]] ] ; INTERLEAVE-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 1 ; INTERLEAVE-NEXT: [[GEP_PTR:%.*]] = getelementptr inbounds i64, ptr [[PTR]], i64 [[IV]] ; INTERLEAVE-NEXT: store i64 [[IV]], ptr [[GEP_PTR]], align 4 @@ -481,7 +476,7 @@ exit: define i32 @iv_2_dead_in_loop_only_used_outside(ptr %ptr) { ; VEC-LABEL: define i32 @iv_2_dead_in_loop_only_used_outside( ; VEC-SAME: ptr [[PTR:%.*]]) { -; VEC-NEXT: [[ENTRY:.*]]: +; VEC-NEXT: [[ENTRY:.*:]] ; VEC-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; VEC: [[VECTOR_PH]]: ; VEC-NEXT: br label %[[VECTOR_BODY:.*]] @@ -497,12 +492,10 @@ define i32 @iv_2_dead_in_loop_only_used_outside(ptr %ptr) { ; VEC: [[MIDDLE_BLOCK]]: ; VEC-NEXT: br label %[[EXIT:.*]] ; VEC: [[SCALAR_PH]]: -; VEC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] -; VEC-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 0, %[[ENTRY]] ] ; VEC-NEXT: br label %[[LOOP:.*]] ; VEC: [[LOOP]]: -; VEC-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; VEC-NEXT: [[IV_2:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], %[[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], %[[LOOP]] ] +; VEC-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; VEC-NEXT: [[IV_2:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], %[[LOOP]] ] ; VEC-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 1 ; VEC-NEXT: [[IV_2_NEXT]] = add nuw i32 [[IV_2]], 2 ; VEC-NEXT: [[GEP_PTR:%.*]] = getelementptr inbounds i64, ptr [[PTR]], i64 [[IV]] @@ -515,7 +508,7 @@ define i32 @iv_2_dead_in_loop_only_used_outside(ptr %ptr) { ; ; INTERLEAVE-LABEL: define i32 @iv_2_dead_in_loop_only_used_outside( ; INTERLEAVE-SAME: ptr [[PTR:%.*]]) { -; INTERLEAVE-NEXT: [[ENTRY:.*]]: +; INTERLEAVE-NEXT: [[ENTRY:.*:]] ; INTERLEAVE-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; INTERLEAVE: [[VECTOR_PH]]: ; INTERLEAVE-NEXT: br label %[[VECTOR_BODY:.*]] @@ -532,12 +525,10 @@ define i32 
@iv_2_dead_in_loop_only_used_outside(ptr %ptr) { ; INTERLEAVE: [[MIDDLE_BLOCK]]: ; INTERLEAVE-NEXT: br label %[[EXIT:.*]] ; INTERLEAVE: [[SCALAR_PH]]: -; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] -; INTERLEAVE-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 0, %[[ENTRY]] ] ; INTERLEAVE-NEXT: br label %[[LOOP:.*]] ; INTERLEAVE: [[LOOP]]: -; INTERLEAVE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; INTERLEAVE-NEXT: [[IV_2:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], %[[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], %[[LOOP]] ] +; INTERLEAVE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; INTERLEAVE-NEXT: [[IV_2:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], %[[LOOP]] ] ; INTERLEAVE-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 1 ; INTERLEAVE-NEXT: [[IV_2_NEXT]] = add nuw i32 [[IV_2]], 2 ; INTERLEAVE-NEXT: [[GEP_PTR:%.*]] = getelementptr inbounds i64, ptr [[PTR]], i64 [[IV]] @@ -1083,7 +1074,7 @@ exit: define i32 @test_iv_uniform_with_outside_use_scev_simplification(ptr %dst) { ; VEC-LABEL: define i32 @test_iv_uniform_with_outside_use_scev_simplification( ; VEC-SAME: ptr [[DST:%.*]]) { -; VEC-NEXT: [[ENTRY:.*]]: +; VEC-NEXT: [[ENTRY:.*:]] ; VEC-NEXT: [[STEP_1:%.*]] = sext i8 0 to i32 ; VEC-NEXT: [[STEP_2:%.*]] = add nsw i32 [[STEP_1]], 1 ; VEC-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] @@ -1102,10 +1093,9 @@ define i32 @test_iv_uniform_with_outside_use_scev_simplification(ptr %dst) { ; VEC: [[MIDDLE_BLOCK]]: ; VEC-NEXT: br label %[[E_EXIT:.*]] ; VEC: [[SCALAR_PH]]: -; VEC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, %[[ENTRY]] ] ; VEC-NEXT: br label %[[LOOP:.*]] ; VEC: [[LOOP]]: -; VEC-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; VEC-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; VEC-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[IV]] ; 
VEC-NEXT: store i16 0, ptr [[GEP_DST]], align 2 ; VEC-NEXT: [[IV_NEXT]] = add i32 [[STEP_2]], [[IV]] @@ -1117,7 +1107,7 @@ define i32 @test_iv_uniform_with_outside_use_scev_simplification(ptr %dst) { ; ; INTERLEAVE-LABEL: define i32 @test_iv_uniform_with_outside_use_scev_simplification( ; INTERLEAVE-SAME: ptr [[DST:%.*]]) { -; INTERLEAVE-NEXT: [[ENTRY:.*]]: +; INTERLEAVE-NEXT: [[ENTRY:.*:]] ; INTERLEAVE-NEXT: [[STEP_1:%.*]] = sext i8 0 to i32 ; INTERLEAVE-NEXT: [[STEP_2:%.*]] = add nsw i32 [[STEP_1]], 1 ; INTERLEAVE-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] @@ -1137,10 +1127,9 @@ define i32 @test_iv_uniform_with_outside_use_scev_simplification(ptr %dst) { ; INTERLEAVE: [[MIDDLE_BLOCK]]: ; INTERLEAVE-NEXT: br label %[[E_EXIT:.*]] ; INTERLEAVE: [[SCALAR_PH]]: -; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, %[[ENTRY]] ] ; INTERLEAVE-NEXT: br label %[[LOOP:.*]] ; INTERLEAVE: [[LOOP]]: -; INTERLEAVE-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; INTERLEAVE-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; INTERLEAVE-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[IV]] ; INTERLEAVE-NEXT: store i16 0, ptr [[GEP_DST]], align 2 ; INTERLEAVE-NEXT: [[IV_NEXT]] = add i32 [[STEP_2]], [[IV]] @@ -1171,7 +1160,7 @@ e.exit: define i32 @test_iv_uniform_with_outside_use_scev_simplification_2(ptr %dst) { ; VEC-LABEL: define i32 @test_iv_uniform_with_outside_use_scev_simplification_2( ; VEC-SAME: ptr [[DST:%.*]]) { -; VEC-NEXT: [[ENTRY:.*]]: +; VEC-NEXT: [[ENTRY:.*:]] ; VEC-NEXT: [[STEP_1:%.*]] = sext i8 0 to i32 ; VEC-NEXT: [[STEP_2:%.*]] = add nsw i32 [[STEP_1]], 1 ; VEC-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] @@ -1199,10 +1188,9 @@ define i32 @test_iv_uniform_with_outside_use_scev_simplification_2(ptr %dst) { ; VEC-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1 ; VEC-NEXT: br label %[[E_EXIT:.*]] ; 
VEC: [[SCALAR_PH]]: -; VEC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, %[[ENTRY]] ] ; VEC-NEXT: br label %[[LOOP:.*]] ; VEC: [[LOOP]]: -; VEC-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; VEC-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; VEC-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[IV]] ; VEC-NEXT: store i16 0, ptr [[GEP_DST]], align 2 ; VEC-NEXT: [[INC:%.*]] = add i32 [[IV]], 1 @@ -1215,7 +1203,7 @@ define i32 @test_iv_uniform_with_outside_use_scev_simplification_2(ptr %dst) { ; ; INTERLEAVE-LABEL: define i32 @test_iv_uniform_with_outside_use_scev_simplification_2( ; INTERLEAVE-SAME: ptr [[DST:%.*]]) { -; INTERLEAVE-NEXT: [[ENTRY:.*]]: +; INTERLEAVE-NEXT: [[ENTRY:.*:]] ; INTERLEAVE-NEXT: [[STEP_1:%.*]] = sext i8 0 to i32 ; INTERLEAVE-NEXT: [[STEP_2:%.*]] = add nsw i32 [[STEP_1]], 1 ; INTERLEAVE-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] @@ -1237,10 +1225,9 @@ define i32 @test_iv_uniform_with_outside_use_scev_simplification_2(ptr %dst) { ; INTERLEAVE: [[MIDDLE_BLOCK]]: ; INTERLEAVE-NEXT: br label %[[E_EXIT:.*]] ; INTERLEAVE: [[SCALAR_PH]]: -; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, %[[ENTRY]] ] ; INTERLEAVE-NEXT: br label %[[LOOP:.*]] ; INTERLEAVE: [[LOOP]]: -; INTERLEAVE-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; INTERLEAVE-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; INTERLEAVE-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i16, ptr [[DST]], i32 [[IV]] ; INTERLEAVE-NEXT: store i16 0, ptr [[GEP_DST]], align 2 ; INTERLEAVE-NEXT: [[INC:%.*]] = add i32 [[IV]], 1 @@ -1363,7 +1350,7 @@ exit: define i64 @test_iv_increment_incremented(ptr %dst) { ; VEC-LABEL: define i64 @test_iv_increment_incremented( ; VEC-SAME: ptr [[DST:%.*]]) { -; VEC-NEXT: [[ENTRY:.*]]: +; VEC-NEXT: [[ENTRY:.*:]] ; VEC-NEXT: br i1 false, label 
%[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; VEC: [[VECTOR_PH]]: ; VEC-NEXT: br label %[[VECTOR_BODY:.*]] @@ -1378,12 +1365,10 @@ define i64 @test_iv_increment_incremented(ptr %dst) { ; VEC: [[MIDDLE_BLOCK]]: ; VEC-NEXT: br label %[[EXIT:.*]] ; VEC: [[SCALAR_PH]]: -; VEC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, %[[ENTRY]] ] -; VEC-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ 2, %[[ENTRY]] ] ; VEC-NEXT: br label %[[LOOP:.*]] ; VEC: [[LOOP]]: -; VEC-NEXT: [[IV_1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], %[[LOOP]] ] -; VEC-NEXT: [[IV_2:%.*]] = phi i64 [ [[BC_RESUME_VAL1]], %[[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], %[[LOOP]] ] +; VEC-NEXT: [[IV_1:%.*]] = phi i64 [ 3, %[[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], %[[LOOP]] ] +; VEC-NEXT: [[IV_2:%.*]] = phi i64 [ 2, %[[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], %[[LOOP]] ] ; VEC-NEXT: [[GEP:%.*]] = getelementptr i16, ptr [[DST]], i64 [[IV_1]] ; VEC-NEXT: store i16 1, ptr [[GEP]], align 2 ; VEC-NEXT: [[IV_2_NEXT]] = add i64 [[IV_2]], -1 @@ -1396,7 +1381,7 @@ define i64 @test_iv_increment_incremented(ptr %dst) { ; ; INTERLEAVE-LABEL: define i64 @test_iv_increment_incremented( ; INTERLEAVE-SAME: ptr [[DST:%.*]]) { -; INTERLEAVE-NEXT: [[ENTRY:.*]]: +; INTERLEAVE-NEXT: [[ENTRY:.*:]] ; INTERLEAVE-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; INTERLEAVE: [[VECTOR_PH]]: ; INTERLEAVE-NEXT: br label %[[VECTOR_BODY:.*]] @@ -1411,12 +1396,10 @@ define i64 @test_iv_increment_incremented(ptr %dst) { ; INTERLEAVE: [[MIDDLE_BLOCK]]: ; INTERLEAVE-NEXT: br label %[[EXIT:.*]] ; INTERLEAVE: [[SCALAR_PH]]: -; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, %[[ENTRY]] ] -; INTERLEAVE-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ 2, %[[ENTRY]] ] ; INTERLEAVE-NEXT: br label %[[LOOP:.*]] ; INTERLEAVE: [[LOOP]]: -; INTERLEAVE-NEXT: [[IV_1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], %[[LOOP]] ] -; INTERLEAVE-NEXT: [[IV_2:%.*]] = phi i64 [ [[BC_RESUME_VAL1]], %[[SCALAR_PH]] ], 
[ [[IV_2_NEXT:%.*]], %[[LOOP]] ] +; INTERLEAVE-NEXT: [[IV_1:%.*]] = phi i64 [ 3, %[[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], %[[LOOP]] ] +; INTERLEAVE-NEXT: [[IV_2:%.*]] = phi i64 [ 2, %[[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], %[[LOOP]] ] ; INTERLEAVE-NEXT: [[GEP:%.*]] = getelementptr i16, ptr [[DST]], i64 [[IV_1]] ; INTERLEAVE-NEXT: store i16 1, ptr [[GEP]], align 2 ; INTERLEAVE-NEXT: [[IV_2_NEXT]] = add i64 [[IV_2]], -1 diff --git a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll index 2c7d1bd3a134..b7b67c263d44 100644 --- a/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll +++ b/llvm/test/Transforms/LoopVectorize/load-deref-pred-align.ll @@ -50,12 +50,10 @@ define i16 @test_access_size_not_multiple_of_align(i64 %len, ptr %test_base) { ; CHECK-NEXT: [[TMP17:%.*]] = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> [[TMP15]]) ; CHECK-NEXT: br label [[LOOP_EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i16 [ 0, [[ENTRY]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] -; CHECK-NEXT: [[ACCUM:%.*]] = phi i16 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] +; CHECK-NEXT: [[ACCUM:%.*]] = phi i16 [ 0, [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i8, ptr [[TEST_BASE]], i64 [[IV]] ; CHECK-NEXT: [[L_T:%.*]] = load i8, ptr [[TEST_ADDR]], align 1 @@ -146,12 +144,10 @@ define i32 @test_access_size_multiple_of_align_but_offset_by_1(i64 %len, ptr %te ; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP15]]) ; CHECK-NEXT: br label [[LOOP_EXIT:%.*]] 
; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] -; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] +; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i8, ptr [[TEST_BASE]], i64 [[IV]] ; CHECK-NEXT: [[L_T:%.*]] = load i8, ptr [[TEST_ADDR]], align 1 @@ -376,10 +372,9 @@ define void @test_rev_loops_deref_loads(ptr nocapture noundef writeonly %dest) { ; CHECK: middle.block: ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1023, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 1023, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_CMP]], i64 0, i64 [[IV]] ; CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[CMP3_NOT:%.*]] = icmp eq i32 [[TMP19]], 3 @@ -488,10 +483,9 @@ define void @test_rev_loops_non_deref_loads(ptr nocapture noundef writeonly %des ; CHECK: middle.block: ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1023, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; CHECK-NEXT: 
[[IV:%.*]] = phi i64 [ 1023, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; CHECK-NEXT: [[OFF:%.*]] = add i64 [[IV]], -1 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_CMP]], i64 0, i64 [[OFF]] ; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 @@ -582,12 +576,10 @@ define i16 @test_strided_access(i64 %len, ptr %test_base) { ; CHECK-NEXT: [[TMP15:%.*]] = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> [[TMP13]]) ; CHECK-NEXT: br label [[LOOP_EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i16 [ 0, [[ENTRY]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] -; CHECK-NEXT: [[ACCUM:%.*]] = phi i16 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] +; CHECK-NEXT: [[ACCUM:%.*]] = phi i16 [ 0, [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i8, ptr [[TEST_BASE]], i64 [[IV]] ; CHECK-NEXT: [[L_T:%.*]] = load i8, ptr [[TEST_ADDR]], align 1 @@ -691,10 +683,9 @@ define void @test_rev_loops_strided_deref_loads(ptr nocapture noundef writeonly ; CHECK: middle.block: ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 511, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 511, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i32], ptr [[LOCAL_CMP]], i64 0, i64 [[IV]] ; CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr 
[[ARRAYIDX]], align 4 ; CHECK-NEXT: [[CMP3_NOT:%.*]] = icmp eq i32 [[TMP21]], 3 diff --git a/llvm/test/Transforms/LoopVectorize/load-deref-pred-neg-off.ll b/llvm/test/Transforms/LoopVectorize/load-deref-pred-neg-off.ll index 34c04de22755..468e6823e9b5 100644 --- a/llvm/test/Transforms/LoopVectorize/load-deref-pred-neg-off.ll +++ b/llvm/test/Transforms/LoopVectorize/load-deref-pred-neg-off.ll @@ -54,12 +54,10 @@ define i8 @test_negative_off(i16 %len, ptr %test_base) { ; CHECK-NEXT: [[TMP20:%.*]] = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> [[TMP18]]) ; CHECK-NEXT: br label [[LOOP_EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ -1000, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i8 [ 0, [[ENTRY]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] -; CHECK-NEXT: [[ACCUM:%.*]] = phi i8 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i16 [ -1000, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ] +; CHECK-NEXT: [[ACCUM:%.*]] = phi i8 [ 0, [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LATCH]] ] ; CHECK-NEXT: [[IV_NEXT]] = add i16 [[IV]], 1 ; CHECK-NEXT: [[TEST_ADDR:%.*]] = getelementptr inbounds i1, ptr [[TEST_BASE]], i16 [[IV]] ; CHECK-NEXT: [[EARLYCND:%.*]] = load i1, ptr [[TEST_ADDR]], align 1 diff --git a/llvm/test/Transforms/LoopVectorize/load-of-struct-deref-pred.ll b/llvm/test/Transforms/LoopVectorize/load-of-struct-deref-pred.ll index f99e883c045d..fbe57c81053f 100644 --- a/llvm/test/Transforms/LoopVectorize/load-of-struct-deref-pred.ll +++ b/llvm/test/Transforms/LoopVectorize/load-of-struct-deref-pred.ll @@ -31,10 +31,9 @@ define void @accesses_to_struct_dereferenceable(ptr noalias %dst) { ; CHECK: middle.block: ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label 
[[LOOP_HEADER:%.*]] ; CHECK: loop.header: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] ; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[IV]] ; CHECK-NEXT: [[D:%.*]] = load i32, ptr [[GEP_DST]], align 4 ; CHECK-NEXT: [[CMP3:%.*]] = icmp ult i32 [[D]], 0 @@ -270,10 +269,9 @@ define void @accesses_to_struct_may_not_be_dereferenceable_access_size(ptr noali ; CHECK: middle.block: ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] ; CHECK: loop.header: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] ; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[IV]] ; CHECK-NEXT: [[D:%.*]] = load i32, ptr [[GEP_DST]], align 4 ; CHECK-NEXT: [[CMP3:%.*]] = icmp ult i32 [[D]], 0 diff --git a/llvm/test/Transforms/LoopVectorize/make-followup-loop-id.ll b/llvm/test/Transforms/LoopVectorize/make-followup-loop-id.ll index 14a091feb58b..3190d239e047 100644 --- a/llvm/test/Transforms/LoopVectorize/make-followup-loop-id.ll +++ b/llvm/test/Transforms/LoopVectorize/make-followup-loop-id.ll @@ -15,7 +15,7 @@ define void @f(ptr noundef captures(none) %a, float noundef %x) { ; CHECK-LABEL: define void @f( ; CHECK-SAME: ptr noundef captures(none) [[A:%.*]], float noundef [[X:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[X]], i64 0 @@ -68,10 +68,9 @@ define void @f(ptr noundef captures(none) %a, 
float noundef %x) { ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[FOR_BODY:.*]] ; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT_7:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT_7:%.*]], %[[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[IV]] ; CHECK-NEXT: [[LOAD:%.*]] = load float, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[MUL:%.*]] = fmul float [[X]], [[LOAD]] diff --git a/llvm/test/Transforms/LoopVectorize/metadata.ll b/llvm/test/Transforms/LoopVectorize/metadata.ll index ce9c62408633..54779ed55cff 100644 --- a/llvm/test/Transforms/LoopVectorize/metadata.ll +++ b/llvm/test/Transforms/LoopVectorize/metadata.ll @@ -126,7 +126,7 @@ exit: define void @widen_call_range(ptr noalias %a, ptr readonly %b) { ; CHECK-LABEL: define void @widen_call_range( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr readonly [[B:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -143,10 +143,9 @@ define void @widen_call_range(ptr noalias %a, ptr readonly %b) { ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[IV]] ; CHECK-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 4, !tbaa [[TBAA0]], !range [[RNG9:![0-9]+]] ; CHECK-NEXT: [[CALL:%.*]] = 
call i64 @foo(i64 [[LOAD]]) #[[ATTR1:[0-9]+]], !range [[RNG9]] @@ -160,7 +159,7 @@ define void @widen_call_range(ptr noalias %a, ptr readonly %b) { ; ; INTERLEAVE-LABEL: define void @widen_call_range( ; INTERLEAVE-SAME: ptr noalias [[A:%.*]], ptr readonly [[B:%.*]]) { -; INTERLEAVE-NEXT: [[ENTRY:.*]]: +; INTERLEAVE-NEXT: [[ENTRY:.*:]] ; INTERLEAVE-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; INTERLEAVE: [[VECTOR_PH]]: ; INTERLEAVE-NEXT: br label %[[VECTOR_BODY:.*]] @@ -182,10 +181,9 @@ define void @widen_call_range(ptr noalias %a, ptr readonly %b) { ; INTERLEAVE: [[MIDDLE_BLOCK]]: ; INTERLEAVE-NEXT: br label %[[EXIT:.*]] ; INTERLEAVE: [[SCALAR_PH]]: -; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; INTERLEAVE-NEXT: br label %[[LOOP:.*]] ; INTERLEAVE: [[LOOP]]: -; INTERLEAVE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; INTERLEAVE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; INTERLEAVE-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[IV]] ; INTERLEAVE-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 4, !tbaa [[TBAA0]], !range [[RNG9:![0-9]+]] ; INTERLEAVE-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR1:[0-9]+]], !range [[RNG9]] @@ -218,7 +216,7 @@ exit: define void @widen_call_fpmath(ptr noalias %a, ptr readonly %b) { ; CHECK-LABEL: define void @widen_call_fpmath( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr readonly [[B:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -235,10 +233,9 @@ define void @widen_call_fpmath(ptr noalias %a, ptr readonly %b) { ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: 
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[GEP:%.*]] = getelementptr double, ptr [[B]], i64 [[IV]] ; CHECK-NEXT: [[LOAD:%.*]] = load double, ptr [[GEP]], align 8, !tbaa [[TBAA0]] ; CHECK-NEXT: [[CALL:%.*]] = call double @bar(double [[LOAD]]) #[[ATTR2:[0-9]+]], !fpmath [[META3]] @@ -252,7 +249,7 @@ define void @widen_call_fpmath(ptr noalias %a, ptr readonly %b) { ; ; INTERLEAVE-LABEL: define void @widen_call_fpmath( ; INTERLEAVE-SAME: ptr noalias [[A:%.*]], ptr readonly [[B:%.*]]) { -; INTERLEAVE-NEXT: [[ENTRY:.*]]: +; INTERLEAVE-NEXT: [[ENTRY:.*:]] ; INTERLEAVE-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; INTERLEAVE: [[VECTOR_PH]]: ; INTERLEAVE-NEXT: br label %[[VECTOR_BODY:.*]] @@ -274,10 +271,9 @@ define void @widen_call_fpmath(ptr noalias %a, ptr readonly %b) { ; INTERLEAVE: [[MIDDLE_BLOCK]]: ; INTERLEAVE-NEXT: br label %[[EXIT:.*]] ; INTERLEAVE: [[SCALAR_PH]]: -; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; INTERLEAVE-NEXT: br label %[[LOOP:.*]] ; INTERLEAVE: [[LOOP]]: -; INTERLEAVE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; INTERLEAVE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; INTERLEAVE-NEXT: [[GEP:%.*]] = getelementptr double, ptr [[B]], i64 [[IV]] ; INTERLEAVE-NEXT: [[LOAD:%.*]] = load double, ptr [[GEP]], align 8, !tbaa [[TBAA0]] ; INTERLEAVE-NEXT: [[CALL:%.*]] = call double @bar(double [[LOAD]]) #[[ATTR2:[0-9]+]], !fpmath [[META3]] @@ -310,7 +306,7 @@ exit: define void @widen_intrinsic(ptr noalias %a, ptr readonly %b) { ; CHECK-LABEL: define void @widen_intrinsic( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr readonly [[B:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label 
%[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -327,10 +323,9 @@ define void @widen_intrinsic(ptr noalias %a, ptr readonly %b) { ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[IV]] ; CHECK-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 4 ; CHECK-NEXT: [[CALL:%.*]] = call i64 @llvm.abs.i64(i64 [[LOAD]], i1 true), !range [[RNG9]] @@ -344,7 +339,7 @@ define void @widen_intrinsic(ptr noalias %a, ptr readonly %b) { ; ; INTERLEAVE-LABEL: define void @widen_intrinsic( ; INTERLEAVE-SAME: ptr noalias [[A:%.*]], ptr readonly [[B:%.*]]) { -; INTERLEAVE-NEXT: [[ENTRY:.*]]: +; INTERLEAVE-NEXT: [[ENTRY:.*:]] ; INTERLEAVE-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; INTERLEAVE: [[VECTOR_PH]]: ; INTERLEAVE-NEXT: br label %[[VECTOR_BODY:.*]] @@ -366,10 +361,9 @@ define void @widen_intrinsic(ptr noalias %a, ptr readonly %b) { ; INTERLEAVE: [[MIDDLE_BLOCK]]: ; INTERLEAVE-NEXT: br label %[[EXIT:.*]] ; INTERLEAVE: [[SCALAR_PH]]: -; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; INTERLEAVE-NEXT: br label %[[LOOP:.*]] ; INTERLEAVE: [[LOOP]]: -; INTERLEAVE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; INTERLEAVE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; INTERLEAVE-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[IV]] ; INTERLEAVE-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 4 ; INTERLEAVE-NEXT: [[CALL:%.*]] = call i64 @llvm.abs.i64(i64 [[LOAD]], i1 true), !range [[RNG9]] @@ -402,7 
+396,7 @@ exit: define void @widen_intrinsic_fpmath(ptr noalias %a, ptr readonly %b) { ; CHECK-LABEL: define void @widen_intrinsic_fpmath( ; CHECK-SAME: ptr noalias [[A:%.*]], ptr readonly [[B:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -419,10 +413,9 @@ define void @widen_intrinsic_fpmath(ptr noalias %a, ptr readonly %b) { ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[GEP:%.*]] = getelementptr double, ptr [[B]], i64 [[IV]] ; CHECK-NEXT: [[LOAD:%.*]] = load double, ptr [[GEP]], align 8, !tbaa [[TBAA0]] ; CHECK-NEXT: [[CALL:%.*]] = call double @llvm.sin.f64(double [[LOAD]]) #[[ATTR2]], !fpmath [[META3]] @@ -436,7 +429,7 @@ define void @widen_intrinsic_fpmath(ptr noalias %a, ptr readonly %b) { ; ; INTERLEAVE-LABEL: define void @widen_intrinsic_fpmath( ; INTERLEAVE-SAME: ptr noalias [[A:%.*]], ptr readonly [[B:%.*]]) { -; INTERLEAVE-NEXT: [[ENTRY:.*]]: +; INTERLEAVE-NEXT: [[ENTRY:.*:]] ; INTERLEAVE-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; INTERLEAVE: [[VECTOR_PH]]: ; INTERLEAVE-NEXT: br label %[[VECTOR_BODY:.*]] @@ -458,10 +451,9 @@ define void @widen_intrinsic_fpmath(ptr noalias %a, ptr readonly %b) { ; INTERLEAVE: [[MIDDLE_BLOCK]]: ; INTERLEAVE-NEXT: br label %[[EXIT:.*]] ; INTERLEAVE: [[SCALAR_PH]]: -; INTERLEAVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; INTERLEAVE-NEXT: br label %[[LOOP:.*]] ; INTERLEAVE: [[LOOP]]: -; INTERLEAVE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], 
%[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; INTERLEAVE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; INTERLEAVE-NEXT: [[GEP:%.*]] = getelementptr double, ptr [[B]], i64 [[IV]] ; INTERLEAVE-NEXT: [[LOAD:%.*]] = load double, ptr [[GEP]], align 8, !tbaa [[TBAA0]] ; INTERLEAVE-NEXT: [[CALL:%.*]] = call double @llvm.sin.f64(double [[LOAD]]) #[[ATTR2]], !fpmath [[META3]] diff --git a/llvm/test/Transforms/LoopVectorize/minimumnum-maximumnum-reductions.ll b/llvm/test/Transforms/LoopVectorize/minimumnum-maximumnum-reductions.ll index e26fef4f02ee..536dffa920c6 100644 --- a/llvm/test/Transforms/LoopVectorize/minimumnum-maximumnum-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/minimumnum-maximumnum-reductions.ll @@ -5,7 +5,7 @@ define float @maximumnum_intrinsic(ptr readonly %x) { ; CHECK-LABEL: define float @maximumnum_intrinsic( ; CHECK-SAME: ptr readonly [[X:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -27,12 +27,10 @@ define float @maximumnum_intrinsic(ptr readonly %x) { ; CHECK-NEXT: [[TMP6:%.*]] = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> [[RDX_MINMAX]]) ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV1:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[RED:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV1:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RED:%.*]] = phi float [ 0.000000e+00, %[[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[GEP1:%.*]] = 
getelementptr inbounds float, ptr [[X]], i32 [[IV1]] ; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP1]], align 4 ; CHECK-NEXT: [[RED_NEXT]] = tail call float @llvm.maximumnum.f32(float [[RED]], float [[L]]) @@ -63,7 +61,7 @@ exit: define float @maximumnum_intrinsic_fast(ptr readonly %x) { ; CHECK-LABEL: define float @maximumnum_intrinsic_fast( ; CHECK-SAME: ptr readonly [[X:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -85,12 +83,10 @@ define float @maximumnum_intrinsic_fast(ptr readonly %x) { ; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fmax.v2f32(<2 x float> [[RDX_MINMAX]]) ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV1:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[RED:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV1:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RED:%.*]] = phi float [ 0.000000e+00, %[[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds float, ptr [[X]], i32 [[IV1]] ; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP1]], align 4 ; CHECK-NEXT: [[RED_NEXT]] = tail call fast float @llvm.maximumnum.f32(float [[RED]], float [[L]]) @@ -121,7 +117,7 @@ exit: define float @minimumnum_intrinsic(ptr readonly %x) { ; CHECK-LABEL: define float @minimumnum_intrinsic( ; CHECK-SAME: ptr readonly [[X:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: 
[[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -143,12 +139,10 @@ define float @minimumnum_intrinsic(ptr readonly %x) { ; CHECK-NEXT: [[TMP6:%.*]] = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> [[RDX_MINMAX]]) ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV1:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[RED:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV1:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RED:%.*]] = phi float [ 0.000000e+00, %[[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds float, ptr [[X]], i32 [[IV1]] ; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP1]], align 4 ; CHECK-NEXT: [[RED_NEXT]] = tail call float @llvm.minimumnum.f32(float [[RED]], float [[L]]) @@ -179,7 +173,7 @@ exit: define float @minimumnum_intrinsic_fast(ptr readonly %x) { ; CHECK-LABEL: define float @minimumnum_intrinsic_fast( ; CHECK-SAME: ptr readonly [[X:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -201,12 +195,10 @@ define float @minimumnum_intrinsic_fast(ptr readonly %x) { ; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fmin.v2f32(<2 x float> [[RDX_MINMAX]]) ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV1:%.*]] = phi i32 [ 
[[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[RED:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV1:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RED:%.*]] = phi float [ 0.000000e+00, %[[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds float, ptr [[X]], i32 [[IV1]] ; CHECK-NEXT: [[L:%.*]] = load float, ptr [[GEP1]], align 4 ; CHECK-NEXT: [[RED_NEXT]] = tail call fast float @llvm.minimumnum.f32(float [[RED]], float [[L]]) diff --git a/llvm/test/Transforms/LoopVectorize/noalias-scope-decl.ll b/llvm/test/Transforms/LoopVectorize/noalias-scope-decl.ll index 7fef13af8d4a..1fe0bf2713dd 100644 --- a/llvm/test/Transforms/LoopVectorize/noalias-scope-decl.ll +++ b/llvm/test/Transforms/LoopVectorize/noalias-scope-decl.ll @@ -4,7 +4,7 @@ define void @test1(ptr noalias nocapture %a, ptr noalias nocapture readonly %b) { ; CHECK-LABEL: define void @test1( ; CHECK-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias readonly captures(none) [[B:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -27,10 +27,9 @@ define void @test1(ptr noalias nocapture %a, ptr noalias nocapture readonly %b) ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[FOR_END:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[FOR_BODY:.*]] ; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[INDVARS_IV]] ; CHECK-NEXT: 
[[TMP7:%.*]] = load float, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP7]], 1.000000e+02 diff --git a/llvm/test/Transforms/LoopVectorize/optsize.ll b/llvm/test/Transforms/LoopVectorize/optsize.ll index cdb9e9952586..a843aeb1ee8a 100644 --- a/llvm/test/Transforms/LoopVectorize/optsize.ll +++ b/llvm/test/Transforms/LoopVectorize/optsize.ll @@ -253,7 +253,7 @@ define void @pr43371() optsize { ; ; CHECK-LABEL: define void @pr43371( ; CHECK-SAME: ) #[[ATTR0]] { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -275,12 +275,11 @@ define void @pr43371() optsize { ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[FOR_COND_CLEANUP28:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[FOR_BODY29:.*]] ; CHECK: [[FOR_COND_CLEANUP28]]: ; CHECK-NEXT: unreachable ; CHECK: [[FOR_BODY29]]: -; CHECK-NEXT: [[I24_0170:%.*]] = phi i16 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC37:%.*]], %[[FOR_BODY29]] ] +; CHECK-NEXT: [[I24_0170:%.*]] = phi i16 [ 0, %[[SCALAR_PH]] ], [ [[INC37:%.*]], %[[FOR_BODY29]] ] ; CHECK-NEXT: [[ADD33:%.*]] = add i16 undef, [[I24_0170]] ; CHECK-NEXT: [[IDXPROM34:%.*]] = zext i16 [[ADD33]] to i32 ; CHECK-NEXT: [[ARRAYIDX35:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[IDXPROM34]] @@ -291,7 +290,7 @@ define void @pr43371() optsize { ; ; PGSO-LABEL: define void @pr43371( ; PGSO-SAME: ) #[[ATTR0]] { -; PGSO-NEXT: [[ENTRY:.*]]: +; PGSO-NEXT: [[ENTRY:.*:]] ; PGSO-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; PGSO: [[VECTOR_PH]]: ; PGSO-NEXT: br label %[[VECTOR_BODY:.*]] @@ -313,12 +312,11 @@ define void @pr43371() optsize { ; PGSO: [[MIDDLE_BLOCK]]: ; PGSO-NEXT: br label %[[FOR_COND_CLEANUP28:.*]] ; PGSO: [[SCALAR_PH]]: -; PGSO-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 0, 
%[[ENTRY]] ] ; PGSO-NEXT: br label %[[FOR_BODY29:.*]] ; PGSO: [[FOR_COND_CLEANUP28]]: ; PGSO-NEXT: unreachable ; PGSO: [[FOR_BODY29]]: -; PGSO-NEXT: [[I24_0170:%.*]] = phi i16 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC37:%.*]], %[[FOR_BODY29]] ] +; PGSO-NEXT: [[I24_0170:%.*]] = phi i16 [ 0, %[[SCALAR_PH]] ], [ [[INC37:%.*]], %[[FOR_BODY29]] ] ; PGSO-NEXT: [[ADD33:%.*]] = add i16 undef, [[I24_0170]] ; PGSO-NEXT: [[IDXPROM34:%.*]] = zext i16 [[ADD33]] to i32 ; PGSO-NEXT: [[ARRAYIDX35:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[IDXPROM34]] @@ -329,7 +327,7 @@ define void @pr43371() optsize { ; ; NPGSO-LABEL: define void @pr43371( ; NPGSO-SAME: ) #[[ATTR0]] { -; NPGSO-NEXT: [[ENTRY:.*]]: +; NPGSO-NEXT: [[ENTRY:.*:]] ; NPGSO-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; NPGSO: [[VECTOR_PH]]: ; NPGSO-NEXT: br label %[[VECTOR_BODY:.*]] @@ -351,12 +349,11 @@ define void @pr43371() optsize { ; NPGSO: [[MIDDLE_BLOCK]]: ; NPGSO-NEXT: br label %[[FOR_COND_CLEANUP28:.*]] ; NPGSO: [[SCALAR_PH]]: -; NPGSO-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 0, %[[ENTRY]] ] ; NPGSO-NEXT: br label %[[FOR_BODY29:.*]] ; NPGSO: [[FOR_COND_CLEANUP28]]: ; NPGSO-NEXT: unreachable ; NPGSO: [[FOR_BODY29]]: -; NPGSO-NEXT: [[I24_0170:%.*]] = phi i16 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC37:%.*]], %[[FOR_BODY29]] ] +; NPGSO-NEXT: [[I24_0170:%.*]] = phi i16 [ 0, %[[SCALAR_PH]] ], [ [[INC37:%.*]], %[[FOR_BODY29]] ] ; NPGSO-NEXT: [[ADD33:%.*]] = add i16 undef, [[I24_0170]] ; NPGSO-NEXT: [[IDXPROM34:%.*]] = zext i16 [[ADD33]] to i32 ; NPGSO-NEXT: [[ARRAYIDX35:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[IDXPROM34]] @@ -390,7 +387,7 @@ define void @pr43371_pgso() !prof !14 { ; ; CHECK-LABEL: define void @pr43371_pgso( ; CHECK-SAME: ) !prof [[PROF14]] { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label 
%[[VECTOR_BODY:.*]] @@ -412,12 +409,11 @@ define void @pr43371_pgso() !prof !14 { ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[FOR_COND_CLEANUP28:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[FOR_BODY29:.*]] ; CHECK: [[FOR_COND_CLEANUP28]]: ; CHECK-NEXT: unreachable ; CHECK: [[FOR_BODY29]]: -; CHECK-NEXT: [[I24_0170:%.*]] = phi i16 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC37:%.*]], %[[FOR_BODY29]] ] +; CHECK-NEXT: [[I24_0170:%.*]] = phi i16 [ 0, %[[SCALAR_PH]] ], [ [[INC37:%.*]], %[[FOR_BODY29]] ] ; CHECK-NEXT: [[ADD33:%.*]] = add i16 undef, [[I24_0170]] ; CHECK-NEXT: [[IDXPROM34:%.*]] = zext i16 [[ADD33]] to i32 ; CHECK-NEXT: [[ARRAYIDX35:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[IDXPROM34]] @@ -428,7 +424,7 @@ define void @pr43371_pgso() !prof !14 { ; ; PGSO-LABEL: define void @pr43371_pgso( ; PGSO-SAME: ) !prof [[PROF14]] { -; PGSO-NEXT: [[ENTRY:.*]]: +; PGSO-NEXT: [[ENTRY:.*:]] ; PGSO-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; PGSO: [[VECTOR_PH]]: ; PGSO-NEXT: br label %[[VECTOR_BODY:.*]] @@ -450,12 +446,11 @@ define void @pr43371_pgso() !prof !14 { ; PGSO: [[MIDDLE_BLOCK]]: ; PGSO-NEXT: br label %[[FOR_COND_CLEANUP28:.*]] ; PGSO: [[SCALAR_PH]]: -; PGSO-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 0, %[[ENTRY]] ] ; PGSO-NEXT: br label %[[FOR_BODY29:.*]] ; PGSO: [[FOR_COND_CLEANUP28]]: ; PGSO-NEXT: unreachable ; PGSO: [[FOR_BODY29]]: -; PGSO-NEXT: [[I24_0170:%.*]] = phi i16 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INC37:%.*]], %[[FOR_BODY29]] ] +; PGSO-NEXT: [[I24_0170:%.*]] = phi i16 [ 0, %[[SCALAR_PH]] ], [ [[INC37:%.*]], %[[FOR_BODY29]] ] ; PGSO-NEXT: [[ADD33:%.*]] = add i16 undef, [[I24_0170]] ; PGSO-NEXT: [[IDXPROM34:%.*]] = zext i16 [[ADD33]] to i32 ; PGSO-NEXT: [[ARRAYIDX35:%.*]] = getelementptr [2592 x i16], ptr @cm_array, i32 0, i32 [[IDXPROM34]] diff --git a/llvm/test/Transforms/LoopVectorize/outer_loop_scalable.ll 
b/llvm/test/Transforms/LoopVectorize/outer_loop_scalable.ll index 84f19a19f362..70ce7a7f33ab 100644 --- a/llvm/test/Transforms/LoopVectorize/outer_loop_scalable.ll +++ b/llvm/test/Transforms/LoopVectorize/outer_loop_scalable.ll @@ -27,8 +27,7 @@ define void @foo() { ; CHECK-NEXT: [[TMP4:%.*]] = call @llvm.stepvector.nxv4i64() ; CHECK-NEXT: [[TMP6:%.*]] = mul [[TMP4]], splat (i64 1) ; CHECK-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP3]] -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP9]], i64 0 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP3]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: diff --git a/llvm/test/Transforms/LoopVectorize/pr36983-multiple-lcssa.ll b/llvm/test/Transforms/LoopVectorize/pr36983-multiple-lcssa.ll index 2b21eb21bb94..356a344c15bd 100644 --- a/llvm/test/Transforms/LoopVectorize/pr36983-multiple-lcssa.ll +++ b/llvm/test/Transforms/LoopVectorize/pr36983-multiple-lcssa.ll @@ -5,7 +5,7 @@ define i16 @duplicate_lcssa(i16 %val) { ; CHECK-LABEL: define i16 @duplicate_lcssa( ; CHECK-SAME: i16 [[VAL:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -22,12 +22,10 @@ define i16 @duplicate_lcssa(i16 %val) { ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI1:%.*]] = extractelement <4 x i16> [[TMP0]], i32 2 ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[VAL]], %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: 
[[RES:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[IV_NEXT]], %[[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i16 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RES:%.*]] = phi i16 [ [[VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT]], %[[LOOP]] ] ; CHECK-NEXT: [[IV_NEXT]] = sub nsw i16 [[IV]], 1 ; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp ne i16 [[IV_NEXT]], 0 ; CHECK-NEXT: br i1 [[EXIT_COND]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP3:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/pr44488-predication.ll b/llvm/test/Transforms/LoopVectorize/pr44488-predication.ll index 315ea12f7551..c5dc81b28db2 100644 --- a/llvm/test/Transforms/LoopVectorize/pr44488-predication.ll +++ b/llvm/test/Transforms/LoopVectorize/pr44488-predication.ll @@ -46,10 +46,9 @@ define i16 @test_true_and_false_branch_equal() { ; CHECK: middle.block: ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 99, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[I_07:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC7:%.*]], [[FOR_LATCH:%.*]] ] +; CHECK-NEXT: [[I_07:%.*]] = phi i16 [ 99, [[SCALAR_PH]] ], [ [[INC7:%.*]], [[FOR_LATCH:%.*]] ] ; CHECK-NEXT: [[LV:%.*]] = load i16, ptr @v_38, align 1 ; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i16 [[LV]], 32767 ; CHECK-NEXT: br i1 [[CMP1]], label [[COND_END:%.*]], label [[COND_END]] diff --git a/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll b/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll index a28bdb838405..9f811f834818 100644 --- a/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll +++ b/llvm/test/Transforms/LoopVectorize/pr55167-fold-tail-live-out.ll @@ -23,8 +23,7 @@ define i32 @test(i32 %a, i1 %c.1, i1 %c.2 ) #0 { ; CHECK-NEXT: [[TMP0:%.*]] = add <2 x i32> [[VEC_PHI]], splat (i32 10) ; CHECK-NEXT: [[TMP1:%.*]] = add <2 x i32> [[TMP0]], splat (i32 20) ; CHECK-NEXT: 
[[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP5]], <2 x i32> splat (i32 9), <2 x i32> splat (i32 9) -; CHECK-NEXT: [[PREDPHI5:%.*]] = select <2 x i1> [[BROADCAST_SPLAT4]], <2 x i32> [[VEC_IND]], <2 x i32> [[PREDPHI]] +; CHECK-NEXT: [[PREDPHI5:%.*]] = select <2 x i1> [[BROADCAST_SPLAT4]], <2 x i32> [[VEC_IND]], <2 x i32> splat (i32 9) ; CHECK-NEXT: [[PREDPHI6:%.*]] = select <2 x i1> [[TMP5]], <2 x i32> [[TMP0]], <2 x i32> [[TMP3]] ; CHECK-NEXT: [[PREDPHI7]] = select <2 x i1> [[BROADCAST_SPLAT4]], <2 x i32> [[VEC_PHI]], <2 x i32> [[PREDPHI6]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 @@ -36,12 +35,10 @@ define i32 @test(i32 %a, i1 %c.1, i1 %c.2 ) #0 { ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[PREDPHI5]], i32 1 ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 6, [[BB:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 35902, [[BB]] ] ; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] ; CHECK: loop.header: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] -; CHECK-NEXT: [[V_2:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[P_2:%.*]], [[LOOP_LATCH]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 6, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: [[V_2:%.*]] = phi i32 [ 35902, [[SCALAR_PH]] ], [ [[P_2:%.*]], [[LOOP_LATCH]] ] ; CHECK-NEXT: br i1 [[C_2]], label [[LOOP_LATCH]], label [[BODY_1:%.*]] ; CHECK: body.1: ; CHECK-NEXT: [[V_2_ADD:%.*]] = add i32 [[V_2]], 10 diff --git a/llvm/test/Transforms/LoopVectorize/pr66616.ll b/llvm/test/Transforms/LoopVectorize/pr66616.ll index a39fd471080a..59c64764b901 100644 --- a/llvm/test/Transforms/LoopVectorize/pr66616.ll +++ b/llvm/test/Transforms/LoopVectorize/pr66616.ll @@ -23,10 +23,9 @@ define void @pr66616(ptr %ptr) { ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 ; CHECK-NEXT: br 
label [[PREHEADER:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP_1:%.*]] ; CHECK: loop.1: -; CHECK-NEXT: [[IV_1:%.*]] = phi i8 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[LOOP_1]] ] +; CHECK-NEXT: [[IV_1:%.*]] = phi i8 [ 0, [[SCALAR_PH]] ], [ [[INC:%.*]], [[LOOP_1]] ] ; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[PTR]], align 4 ; CHECK-NEXT: [[ADD3:%.*]] = add i32 [[LOAD]], 1 ; CHECK-NEXT: [[INC]] = add i8 [[IV_1]], 1 diff --git a/llvm/test/Transforms/LoopVectorize/predicate-switch.ll b/llvm/test/Transforms/LoopVectorize/predicate-switch.ll index 724aed888add..f59d4aa99918 100644 --- a/llvm/test/Transforms/LoopVectorize/predicate-switch.ll +++ b/llvm/test/Transforms/LoopVectorize/predicate-switch.ll @@ -338,21 +338,21 @@ define void @switch_to_header(ptr %start) { ; IC1-NEXT: [[ENTRY:.*]]: ; IC1-NEXT: br label %[[LOOP_HEADER:.*]] ; IC1: [[LOOP_HEADER]]: -; IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[IF_THEN1:.*]] ] +; IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[IF_THEN:.*]] ] ; IC1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; IC1-NEXT: switch i64 [[IV]], label %[[LOOP_LATCH:.*]] [ -; IC1-NEXT: i64 120, label %[[IF_THEN1]] +; IC1-NEXT: i64 120, label %[[IF_THEN]] ; IC1-NEXT: i64 100, label %[[LOOP_LATCH]] ; IC1-NEXT: ] -; IC1: [[IF_THEN1]]: +; IC1: [[IF_THEN]]: ; IC1-NEXT: br label %[[LOOP_HEADER]] -; IC1: [[IF_THEN:.*:]] +; IC1: [[IF_THEN1:.*:]] ; IC1-NEXT: [[GEP:%.*]] = getelementptr inbounds i64, ptr [[START]], i64 poison ; IC1-NEXT: store i64 42, ptr [[GEP]], align 1 ; IC1-NEXT: unreachable ; IC1: [[LOOP_LATCH]]: ; IC1-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 100 -; IC1-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[IF_THEN1]] +; IC1-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[IF_THEN]] ; IC1: [[EXIT]]: ; IC1-NEXT: ret void ; @@ -361,21 +361,21 @@ define void @switch_to_header(ptr %start) { ; IC2-NEXT: 
[[ENTRY:.*]]: ; IC2-NEXT: br label %[[LOOP_HEADER:.*]] ; IC2: [[LOOP_HEADER]]: -; IC2-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[IF_THEN1:.*]] ] +; IC2-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[IF_THEN:.*]] ] ; IC2-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; IC2-NEXT: switch i64 [[IV]], label %[[LOOP_LATCH:.*]] [ -; IC2-NEXT: i64 120, label %[[IF_THEN1]] +; IC2-NEXT: i64 120, label %[[IF_THEN]] ; IC2-NEXT: i64 100, label %[[LOOP_LATCH]] ; IC2-NEXT: ] -; IC2: [[IF_THEN1]]: +; IC2: [[IF_THEN]]: ; IC2-NEXT: br label %[[LOOP_HEADER]] -; IC2: [[IF_THEN:.*:]] +; IC2: [[IF_THEN1:.*:]] ; IC2-NEXT: [[GEP:%.*]] = getelementptr inbounds i64, ptr [[START]], i64 poison ; IC2-NEXT: store i64 42, ptr [[GEP]], align 1 ; IC2-NEXT: unreachable ; IC2: [[LOOP_LATCH]]: ; IC2-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 100 -; IC2-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[IF_THEN1]] +; IC2-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[IF_THEN]] ; IC2: [[EXIT]]: ; IC2-NEXT: ret void ; @@ -406,7 +406,7 @@ exit: define void @switch_all_to_default(ptr %start) { ; IC1-LABEL: define void @switch_all_to_default( ; IC1-SAME: ptr [[START:%.*]]) { -; IC1-NEXT: [[ENTRY:.*]]: +; IC1-NEXT: [[ENTRY:.*:]] ; IC1-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; IC1: [[VECTOR_PH]]: ; IC1-NEXT: br label %[[VECTOR_BODY:.*]] @@ -420,10 +420,9 @@ define void @switch_all_to_default(ptr %start) { ; IC1: [[MIDDLE_BLOCK]]: ; IC1-NEXT: br label %[[EXIT:.*]] ; IC1: [[SCALAR_PH]]: -; IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; IC1-NEXT: br label %[[LOOP_HEADER:.*]] ; IC1: [[LOOP_HEADER]]: -; IC1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; IC1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; IC1-NEXT: switch i64 [[IV]], label %[[LOOP_LATCH]] [ ; IC1-NEXT: i64 120, label 
%[[LOOP_LATCH]] @@ -439,7 +438,7 @@ define void @switch_all_to_default(ptr %start) { ; ; IC2-LABEL: define void @switch_all_to_default( ; IC2-SAME: ptr [[START:%.*]]) { -; IC2-NEXT: [[ENTRY:.*]]: +; IC2-NEXT: [[ENTRY:.*:]] ; IC2-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; IC2: [[VECTOR_PH]]: ; IC2-NEXT: br label %[[VECTOR_BODY:.*]] @@ -455,10 +454,9 @@ define void @switch_all_to_default(ptr %start) { ; IC2: [[MIDDLE_BLOCK]]: ; IC2-NEXT: br label %[[EXIT:.*]] ; IC2: [[SCALAR_PH]]: -; IC2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; IC2-NEXT: br label %[[LOOP_HEADER:.*]] ; IC2: [[LOOP_HEADER]]: -; IC2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; IC2-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; IC2-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; IC2-NEXT: switch i64 [[IV]], label %[[LOOP_LATCH]] [ ; IC2-NEXT: i64 120, label %[[LOOP_LATCH]] diff --git a/llvm/test/Transforms/LoopVectorize/preserve-dbg-loc-and-loop-metadata.ll b/llvm/test/Transforms/LoopVectorize/preserve-dbg-loc-and-loop-metadata.ll index f11b2f25e34b..372c703f4cb2 100644 --- a/llvm/test/Transforms/LoopVectorize/preserve-dbg-loc-and-loop-metadata.ll +++ b/llvm/test/Transforms/LoopVectorize/preserve-dbg-loc-and-loop-metadata.ll @@ -12,7 +12,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" define void @_Z3fooPf(ptr %a) { ; CHECK-LABEL: define void @_Z3fooPf( ; CHECK-SAME: ptr [[A:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -28,10 +28,9 @@ define void @_Z3fooPf(ptr %a) { ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[FOR_END:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[FOR_BODY:.*]] ; CHECK: [[FOR_BODY]]: 
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[P:%.*]] = load float, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[MUL:%.*]] = fmul float [[P]], 2.000000e+00 @@ -44,7 +43,7 @@ define void @_Z3fooPf(ptr %a) { ; ; DEBUGLOC-LABEL: define void @_Z3fooPf( ; DEBUGLOC-SAME: ptr [[A:%.*]]) !dbg [[DBG5:![0-9]+]] { -; DEBUGLOC-NEXT: [[ENTRY:.*]]: +; DEBUGLOC-NEXT: [[ENTRY:.*:]] ; DEBUGLOC-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]], !dbg [[DBG18:![0-9]+]] ; DEBUGLOC: [[VECTOR_PH]]: ; DEBUGLOC-NEXT: br label %[[VECTOR_BODY:.*]], !dbg [[DBG18]] @@ -60,10 +59,9 @@ define void @_Z3fooPf(ptr %a) { ; DEBUGLOC: [[MIDDLE_BLOCK]]: ; DEBUGLOC-NEXT: br label %[[FOR_END:.*]], !dbg [[DBG24]] ; DEBUGLOC: [[SCALAR_PH]]: -; DEBUGLOC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], !dbg [[DBG19]] ; DEBUGLOC-NEXT: br label %[[FOR_BODY:.*]], !dbg [[DBG18]] ; DEBUGLOC: [[FOR_BODY]]: -; DEBUGLOC-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], !dbg [[DBG19]] +; DEBUGLOC-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], !dbg [[DBG19]] ; DEBUGLOC-NEXT: #dbg_value(i64 [[INDVARS_IV]], [[META9:![0-9]+]], !DIExpression(), [[DBG19]]) ; DEBUGLOC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]], !dbg [[DBG20]] ; DEBUGLOC-NEXT: #dbg_value(ptr [[ARRAYIDX]], [[META11:![0-9]+]], !DIExpression(), [[DBG20]]) diff --git a/llvm/test/Transforms/LoopVectorize/remarks-reduction-inloop.ll b/llvm/test/Transforms/LoopVectorize/remarks-reduction-inloop.ll index 5c52b1ab2778..07643a1c6e83 100644 --- 
a/llvm/test/Transforms/LoopVectorize/remarks-reduction-inloop.ll +++ b/llvm/test/Transforms/LoopVectorize/remarks-reduction-inloop.ll @@ -8,7 +8,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 define i32 @reduction_sum(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK-LABEL: define i32 @reduction_sum( ; CHECK-SAME: ptr noalias captures(none) [[A:%.*]], ptr noalias captures(none) [[B:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -25,12 +25,10 @@ define i32 @reduction_sum(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[BODY:.*]] ; CHECK: [[BODY]]: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] -; CHECK-NEXT: [[SUM_TMP:%.*]] = phi i32 [ [[SUM:%.*]], %[[BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[BODY]] ], [ 0, %[[SCALAR_PH]] ] +; CHECK-NEXT: [[SUM_TMP:%.*]] = phi i32 [ [[SUM:%.*]], %[[BODY]] ], [ 0, %[[SCALAR_PH]] ] ; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[LOAD0:%.*]] = load i32, ptr [[GEP0]], align 4 ; CHECK-NEXT: [[SUM]] = add i32 [[SUM_TMP]], [[LOAD0]] diff --git a/llvm/test/Transforms/LoopVectorize/reverse_induction.ll b/llvm/test/Transforms/LoopVectorize/reverse_induction.ll index cea16c9eb751..4895f6a83d56 100644 --- a/llvm/test/Transforms/LoopVectorize/reverse_induction.ll +++ b/llvm/test/Transforms/LoopVectorize/reverse_induction.ll @@ -9,7 +9,7 @@ target datalayout = 
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 define i32 @reverse_induction_i64(i64 %startval, ptr %ptr) { ; CHECK-LABEL: define i32 @reverse_induction_i64( ; CHECK-SAME: i64 [[STARTVAL:%.*]], ptr [[PTR:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -38,14 +38,11 @@ define i32 @reverse_induction_i64(i64 %startval, ptr %ptr) { ; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) ; CHECK-NEXT: br label %[[LOOPEND:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[STARTVAL]], %[[ENTRY]] ] -; CHECK-NEXT: [[BC_RESUME_VAL4:%.*]] = phi i32 [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[FOR_BODY:.*]] ; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[ADD_I7:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[ADD_I:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[I_06:%.*]] = phi i32 [ [[BC_RESUME_VAL4]], %[[SCALAR_PH]] ], [ [[INC4:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[REDUX5:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[INC_REDUX:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ADD_I7:%.*]] = phi i64 [ [[STARTVAL]], %[[SCALAR_PH]] ], [ [[ADD_I:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[I_06:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[INC4:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[REDUX5:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[INC_REDUX:%.*]], %[[FOR_BODY]] ] ; CHECK-NEXT: [[ADD_I]] = add i64 [[ADD_I7]], -1 ; CHECK-NEXT: [[KIND__I:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 [[ADD_I]] ; CHECK-NEXT: [[TMP_I1:%.*]] = load i32, ptr [[KIND__I]], align 4 @@ -80,7 +77,7 @@ loopend: define i32 @reverse_induction_i128(i128 %startval, ptr %ptr) { ; CHECK-LABEL: define i32 @reverse_induction_i128( ; CHECK-SAME: i128 [[STARTVAL:%.*]], ptr [[PTR:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: 
+; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -109,14 +106,11 @@ define i32 @reverse_induction_i128(i128 %startval, ptr %ptr) { ; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) ; CHECK-NEXT: br label %[[LOOPEND:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i128 [ [[STARTVAL]], %[[ENTRY]] ] -; CHECK-NEXT: [[BC_RESUME_VAL4:%.*]] = phi i32 [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[FOR_BODY:.*]] ; CHECK: [[FOR_BODY]]: -; CHECK-NEXT: [[ADD_I7:%.*]] = phi i128 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[ADD_I:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[I_06:%.*]] = phi i32 [ [[BC_RESUME_VAL4]], %[[SCALAR_PH]] ], [ [[INC4:%.*]], %[[FOR_BODY]] ] -; CHECK-NEXT: [[REDUX5:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[INC_REDUX:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[ADD_I7:%.*]] = phi i128 [ [[STARTVAL]], %[[SCALAR_PH]] ], [ [[ADD_I:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[I_06:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[INC4:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[REDUX5:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[INC_REDUX:%.*]], %[[FOR_BODY]] ] ; CHECK-NEXT: [[ADD_I]] = add i128 [[ADD_I7]], -1 ; CHECK-NEXT: [[KIND__I:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i128 [[ADD_I]] ; CHECK-NEXT: [[TMP_I1:%.*]] = load i32, ptr [[KIND__I]], align 4 @@ -248,7 +242,7 @@ loopend: define void @reverse_forward_induction_i64_i8() { ; CHECK-LABEL: define void @reverse_forward_induction_i64_i8() { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -277,12 +271,10 @@ define void @reverse_forward_induction_i64_i8() { ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label 
%[[WHILE_END:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1023, %[[ENTRY]] ] -; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i8 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[WHILE_BODY:.*]] ; CHECK: [[WHILE_BODY]]: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[WHILE_BODY]] ] -; CHECK-NEXT: [[FORWARD_INDUCTION_05:%.*]] = phi i8 [ [[BC_RESUME_VAL2]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[WHILE_BODY]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 1023, %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[WHILE_BODY]] ] +; CHECK-NEXT: [[FORWARD_INDUCTION_05:%.*]] = phi i8 [ 0, %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[WHILE_BODY]] ] ; CHECK-NEXT: [[INC]] = add i8 [[FORWARD_INDUCTION_05]], 1 ; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[INC]] to i32 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i32], ptr @a, i64 0, i64 [[INDVARS_IV]] @@ -316,7 +308,7 @@ while.end: define void @reverse_forward_induction_i64_i8_signed() { ; CHECK-LABEL: define void @reverse_forward_induction_i64_i8_signed() { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -345,12 +337,10 @@ define void @reverse_forward_induction_i64_i8_signed() { ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[WHILE_END:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1023, %[[ENTRY]] ] -; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i8 [ -127, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[WHILE_BODY:.*]] ; CHECK: [[WHILE_BODY]]: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[WHILE_BODY]] ] -; CHECK-NEXT: [[FORWARD_INDUCTION_05:%.*]] = phi i8 [ [[BC_RESUME_VAL2]], %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[WHILE_BODY]] ] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 1023, %[[SCALAR_PH]] ], [ 
[[INDVARS_IV_NEXT:%.*]], %[[WHILE_BODY]] ] +; CHECK-NEXT: [[FORWARD_INDUCTION_05:%.*]] = phi i8 [ -127, %[[SCALAR_PH]] ], [ [[INC:%.*]], %[[WHILE_BODY]] ] ; CHECK-NEXT: [[INC]] = add i8 [[FORWARD_INDUCTION_05]], 1 ; CHECK-NEXT: [[CONV:%.*]] = sext i8 [[INC]] to i32 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i32], ptr @a, i64 0, i64 [[INDVARS_IV]] diff --git a/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll index 37ed28993cf6..60da3368b664 100644 --- a/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/scalable-first-order-recurrence.ll @@ -635,8 +635,7 @@ define i32 @extract_second_last_iteration(ptr %cval, i32 %x) { ; CHECK-VF4UF1-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv4i32() ; CHECK-VF4UF1-NEXT: [[TMP7:%.*]] = mul [[TMP6]], splat (i32 1) ; CHECK-VF4UF1-NEXT: [[INDUCTION:%.*]] = add zeroinitializer, [[TMP7]] -; CHECK-VF4UF1-NEXT: [[TMP8:%.*]] = mul i32 1, [[TMP3]] -; CHECK-VF4UF1-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP8]], i64 0 +; CHECK-VF4UF1-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP3]], i64 0 ; CHECK-VF4UF1-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-VF4UF1-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK-VF4UF1: [[VECTOR_BODY]]: diff --git a/llvm/test/Transforms/LoopVectorize/scev-exit-phi-invalidation.ll b/llvm/test/Transforms/LoopVectorize/scev-exit-phi-invalidation.ll index 70772dcd0cdf..89f15ea5e918 100644 --- a/llvm/test/Transforms/LoopVectorize/scev-exit-phi-invalidation.ll +++ b/llvm/test/Transforms/LoopVectorize/scev-exit-phi-invalidation.ll @@ -21,10 +21,9 @@ define void @test_pr63368(i1 %c, ptr %A) { ; CHECK: middle.block: ; CHECK-NEXT: br label [[EXIT_1:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label 
[[LOOP_1_HEADER:%.*]] ; CHECK: loop.1.header: -; CHECK-NEXT: [[IV_1:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], [[LOOP_1_LATCH:%.*]] ] +; CHECK-NEXT: [[IV_1:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], [[LOOP_1_LATCH:%.*]] ] ; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[A]], align 4 ; CHECK-NEXT: br i1 [[C]], label [[LOOP_1_LATCH]], label [[LOOP_1_LATCH]] ; CHECK: loop.1.latch: @@ -65,10 +64,10 @@ define void @test_pr63368(i1 %c, ptr %A) { ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT_2:%.*]], label [[SCALAR_PH2]] ; CHECK: scalar.ph2: -; CHECK-NEXT: [[BC_RESUME_VAL8:%.*]] = phi i8 [ [[TMP9]], [[MIDDLE_BLOCK7]] ], [ 0, [[EXIT_1]] ], [ 0, [[VECTOR_SCEVCHECK]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[TMP9]], [[MIDDLE_BLOCK7]] ], [ 0, [[EXIT_1]] ], [ 0, [[VECTOR_SCEVCHECK]] ] ; CHECK-NEXT: br label [[LOOP_2:%.*]] ; CHECK: loop.2: -; CHECK-NEXT: [[IV_2:%.*]] = phi i8 [ [[BC_RESUME_VAL8]], [[SCALAR_PH2]] ], [ [[IV_2_NEXT:%.*]], [[LOOP_2]] ] +; CHECK-NEXT: [[IV_2:%.*]] = phi i8 [ [[BC_RESUME_VAL]], [[SCALAR_PH2]] ], [ [[IV_2_NEXT:%.*]], [[LOOP_2]] ] ; CHECK-NEXT: [[IV_2_NEXT]] = add i8 [[IV_2]], 1 ; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i8 [[IV_2_NEXT]] ; CHECK-NEXT: store i8 0, ptr [[GEP_A]], align 1 diff --git a/llvm/test/Transforms/LoopVectorize/select-neg-cond.ll b/llvm/test/Transforms/LoopVectorize/select-neg-cond.ll index 285c6742a7f5..4b080ddaa119 100644 --- a/llvm/test/Transforms/LoopVectorize/select-neg-cond.ll +++ b/llvm/test/Transforms/LoopVectorize/select-neg-cond.ll @@ -4,7 +4,7 @@ define void @neg_cond(ptr noalias %p, ptr noalias %q) { ; CHECK-LABEL: define void @neg_cond( ; CHECK-SAME: ptr noalias [[P:%.*]], ptr noalias [[Q:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label 
%[[VECTOR_BODY:.*]] @@ -21,10 +21,9 @@ define void @neg_cond(ptr noalias %p, ptr noalias %q) { ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[P_GEP:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]] ; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[P_GEP]], align 4 ; CHECK-NEXT: [[Q_GEP:%.*]] = getelementptr i32, ptr [[Q]], i32 [[IV]] diff --git a/llvm/test/Transforms/LoopVectorize/select-reduction-start-value-may-be-undef-or-poison.ll b/llvm/test/Transforms/LoopVectorize/select-reduction-start-value-may-be-undef-or-poison.ll index 37d75ffe2c2f..30585d07a8e6 100644 --- a/llvm/test/Transforms/LoopVectorize/select-reduction-start-value-may-be-undef-or-poison.ll +++ b/llvm/test/Transforms/LoopVectorize/select-reduction-start-value-may-be-undef-or-poison.ll @@ -26,12 +26,10 @@ define i64 @pr62565_incoming_value_known_undef(i64 %a, ptr %src) { ; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP5]], i64 [[A]], i64 undef ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ undef, [[ENTRY]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[RED:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SELECT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 1, [[SCALAR_PH]] ], [ [[ADD:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[RED:%.*]] = phi i64 [ undef, [[SCALAR_PH]] ], [ [[SELECT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[IV]] ; 
CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP]], align 4 ; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[L]], 1 @@ -86,12 +84,10 @@ define i64 @pr62565_incoming_value_known_poison(i64 %a, ptr %src) { ; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP5]], i64 [[A]], i64 poison ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ poison, [[ENTRY]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[RED:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SELECT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 1, [[SCALAR_PH]] ], [ [[ADD:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[RED:%.*]] = phi i64 [ poison, [[SCALAR_PH]] ], [ [[SELECT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[IV]] ; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP]], align 4 ; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[L]], 1 @@ -146,12 +142,10 @@ define i64 @pr62565_incoming_value_may_be_poison(i64 %a, ptr %src, i64 %start) { ; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP5]], i64 [[A]], i64 [[START]] ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[START]], [[ENTRY]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[RED:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SELECT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 1, [[SCALAR_PH]] ], [ [[ADD:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[RED:%.*]] = phi i64 [ [[START]], [[SCALAR_PH]] ], [ [[SELECT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[IV]] ; CHECK-NEXT: [[L:%.*]] = load 
i32, ptr [[GEP]], align 4 ; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[L]], 1 diff --git a/llvm/test/Transforms/LoopVectorize/single-early-exit-cond-poison.ll b/llvm/test/Transforms/LoopVectorize/single-early-exit-cond-poison.ll new file mode 100644 index 000000000000..660212378ae6 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/single-early-exit-cond-poison.ll @@ -0,0 +1,128 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5 +; RUN: opt -p loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -S %s | FileCheck --check-prefix=VF4IC2 %s +; RUN: opt -p loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -S %s | FileCheck --check-prefix=VF8IC1 %s + +; Test case from https://github.com/llvm/llvm-project/issues/153946. +; %shr and thus %early.cond will be poison from %iv == 4 onwards. +; TODO: Make sure the mask being poison does not propagate across lanes in the +; OR reduction when computing the early exit condition in the vector loop. 
+define noundef i32 @f(i32 noundef %g) { +; VF4IC2-LABEL: define noundef i32 @f( +; VF4IC2-SAME: i32 noundef [[G:%.*]]) { +; VF4IC2-NEXT: [[ENTRY:.*:]] +; VF4IC2-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; VF4IC2: [[VECTOR_PH]]: +; VF4IC2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[G]], i64 0 +; VF4IC2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; VF4IC2-NEXT: br label %[[VECTOR_BODY:.*]] +; VF4IC2: [[VECTOR_BODY]]: +; VF4IC2-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF4IC2-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF4IC2-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], splat (i32 4) +; VF4IC2-NEXT: [[TMP0:%.*]] = shl nuw nsw <4 x i32> [[VEC_IND]], splat (i32 3) +; VF4IC2-NEXT: [[TMP1:%.*]] = shl nuw nsw <4 x i32> [[STEP_ADD]], splat (i32 3) +; VF4IC2-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[BROADCAST_SPLAT]], [[TMP0]] +; VF4IC2-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[BROADCAST_SPLAT]], [[TMP1]] +; VF4IC2-NEXT: [[TMP4:%.*]] = icmp ne <4 x i32> [[TMP2]], zeroinitializer +; VF4IC2-NEXT: [[TMP5:%.*]] = icmp ne <4 x i32> [[TMP3]], zeroinitializer +; VF4IC2-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 +; VF4IC2-NEXT: [[TMP6:%.*]] = or <4 x i1> [[TMP4]], [[TMP5]] +; VF4IC2-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP6]]) +; VF4IC2-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], splat (i32 4) +; VF4IC2-NEXT: br i1 true, label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; VF4IC2: [[MIDDLE_SPLIT]]: +; VF4IC2-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3 +; VF4IC2-NEXT: br i1 [[TMP7]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]] +; VF4IC2: [[MIDDLE_BLOCK]]: +; VF4IC2-NEXT: br label %[[RETURN:.*]] +; VF4IC2: 
[[VECTOR_EARLY_EXIT]]: +; VF4IC2-NEXT: [[TMP9:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP5]], i1 true) +; VF4IC2-NEXT: [[TMP10:%.*]] = add i64 4, [[TMP9]] +; VF4IC2-NEXT: [[TMP11:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 true) +; VF4IC2-NEXT: [[TMP12:%.*]] = add i64 0, [[TMP11]] +; VF4IC2-NEXT: [[TMP13:%.*]] = icmp ne i64 [[TMP11]], 4 +; VF4IC2-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[TMP12]], i64 [[TMP10]] +; VF4IC2-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32 +; VF4IC2-NEXT: [[TMP16:%.*]] = add i32 [[INDEX]], [[TMP15]] +; VF4IC2-NEXT: br label %[[RETURN]] +; VF4IC2: [[SCALAR_PH]]: +; VF4IC2-NEXT: br label %[[LOOP_HEADER:.*]] +; VF4IC2: [[LOOP_HEADER]]: +; VF4IC2-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; VF4IC2-NEXT: [[MUL:%.*]] = shl nuw nsw i32 [[IV]], 3 +; VF4IC2-NEXT: [[SHR:%.*]] = ashr i32 [[G]], [[MUL]] +; VF4IC2-NEXT: [[EARLY_COND:%.*]] = icmp eq i32 [[SHR]], 0 +; VF4IC2-NEXT: br i1 [[EARLY_COND]], label %[[LOOP_LATCH]], label %[[RETURN]] +; VF4IC2: [[LOOP_LATCH]]: +; VF4IC2-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; VF4IC2-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 8 +; VF4IC2-NEXT: br i1 [[EC]], label %[[RETURN]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]] +; VF4IC2: [[RETURN]]: +; VF4IC2-NEXT: [[RES:%.*]] = phi i32 [ [[SHR]], %[[LOOP_LATCH]] ], [ [[IV]], %[[LOOP_HEADER]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ [[TMP16]], %[[VECTOR_EARLY_EXIT]] ] +; VF4IC2-NEXT: ret i32 [[RES]] +; +; VF8IC1-LABEL: define noundef i32 @f( +; VF8IC1-SAME: i32 noundef [[G:%.*]]) { +; VF8IC1-NEXT: [[ENTRY:.*:]] +; VF8IC1-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; VF8IC1: [[VECTOR_PH]]: +; VF8IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> poison, i32 [[G]], i64 0 +; VF8IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> poison, <8 x i32> 
zeroinitializer +; VF8IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; VF8IC1: [[VECTOR_BODY]]: +; VF8IC1-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF8IC1-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; VF8IC1-NEXT: [[TMP0:%.*]] = shl nuw nsw <8 x i32> [[VEC_IND]], splat (i32 3) +; VF8IC1-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[BROADCAST_SPLAT]], [[TMP0]] +; VF8IC1-NEXT: [[TMP2:%.*]] = icmp ne <8 x i32> [[TMP1]], zeroinitializer +; VF8IC1-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 +; VF8IC1-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP2]]) +; VF8IC1-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], splat (i32 8) +; VF8IC1-NEXT: br i1 true, label %[[MIDDLE_SPLIT:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; VF8IC1: [[MIDDLE_SPLIT]]: +; VF8IC1-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP1]], i32 7 +; VF8IC1-NEXT: br i1 [[TMP3]], label %[[VECTOR_EARLY_EXIT:.*]], label %[[MIDDLE_BLOCK:.*]] +; VF8IC1: [[MIDDLE_BLOCK]]: +; VF8IC1-NEXT: br label %[[RETURN:.*]] +; VF8IC1: [[VECTOR_EARLY_EXIT]]: +; VF8IC1-NEXT: [[TMP5:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> [[TMP2]], i1 true) +; VF8IC1-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32 +; VF8IC1-NEXT: [[TMP7:%.*]] = add i32 [[INDEX]], [[TMP6]] +; VF8IC1-NEXT: br label %[[RETURN]] +; VF8IC1: [[SCALAR_PH]]: +; VF8IC1-NEXT: br label %[[LOOP_HEADER:.*]] +; VF8IC1: [[LOOP_HEADER]]: +; VF8IC1-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; VF8IC1-NEXT: [[MUL:%.*]] = shl nuw nsw i32 [[IV]], 3 +; VF8IC1-NEXT: [[SHR:%.*]] = ashr i32 [[G]], [[MUL]] +; VF8IC1-NEXT: [[EARLY_COND:%.*]] = icmp eq i32 [[SHR]], 0 +; VF8IC1-NEXT: br i1 [[EARLY_COND]], label %[[LOOP_LATCH]], label %[[RETURN]] +; VF8IC1: [[LOOP_LATCH]]: +; VF8IC1-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; VF8IC1-NEXT: [[EC:%.*]] = icmp eq i32 
[[IV_NEXT]], 8 +; VF8IC1-NEXT: br i1 [[EC]], label %[[RETURN]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]] +; VF8IC1: [[RETURN]]: +; VF8IC1-NEXT: [[RES:%.*]] = phi i32 [ [[SHR]], %[[LOOP_LATCH]] ], [ [[IV]], %[[LOOP_HEADER]] ], [ [[TMP4]], %[[MIDDLE_BLOCK]] ], [ [[TMP7]], %[[VECTOR_EARLY_EXIT]] ] +; VF8IC1-NEXT: ret i32 [[RES]] +; +entry: + br label %loop.header + +loop.header: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %mul = shl nuw nsw i32 %iv, 3 + %shr = ashr i32 %g, %mul + %early.cond = icmp eq i32 %shr, 0 + br i1 %early.cond, label %loop.latch, label %return + +loop.latch: + %iv.next = add nuw nsw i32 %iv, 1 + %ec = icmp eq i32 %iv.next, 8 + br i1 %ec, label %return, label %loop.header + +return: + %res = phi i32 [ %shr, %loop.latch ], [ %iv, %loop.header ] + ret i32 %res +} + + diff --git a/llvm/test/Transforms/LoopVectorize/single-early-exit-deref-assumptions.ll b/llvm/test/Transforms/LoopVectorize/single-early-exit-deref-assumptions.ll index f329a18f3eae..df40ba0d4ac3 100644 --- a/llvm/test/Transforms/LoopVectorize/single-early-exit-deref-assumptions.ll +++ b/llvm/test/Transforms/LoopVectorize/single-early-exit-deref-assumptions.ll @@ -31,10 +31,9 @@ define i64 @early_exit_alignment_and_deref_known_via_assumption_with_constant_si ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], [[TMP8]] ; CHECK-NEXT: br label [[LOOP_END]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 0, [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] ; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] diff --git 
a/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave-hint.ll b/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave-hint.ll index c648bedabc05..5b9e75a9f7eb 100644 --- a/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave-hint.ll +++ b/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave-hint.ll @@ -5,7 +5,7 @@ declare void @init_mem(ptr, i64); define i64 @multi_exiting_to_different_exits_live_in_exit_values() { ; VF4IC4-LABEL: define i64 @multi_exiting_to_different_exits_live_in_exit_values() { -; VF4IC4-NEXT: [[ENTRY:.*]]: +; VF4IC4-NEXT: [[ENTRY:.*:]] ; VF4IC4-NEXT: [[SRC:%.*]] = alloca [128 x i32], align 4 ; VF4IC4-NEXT: call void @init_mem(ptr [[SRC]]) ; VF4IC4-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] @@ -40,10 +40,9 @@ define i64 @multi_exiting_to_different_exits_live_in_exit_values() { ; VF4IC4: [[VECTOR_EARLY_EXIT]]: ; VF4IC4-NEXT: br label %[[E1:.*]] ; VF4IC4: [[SCALAR_PH]]: -; VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; VF4IC4-NEXT: br label %[[LOOP_HEADER:.*]] ; VF4IC4: [[LOOP_HEADER]]: -; VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[LOOP_LATCH:.*]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], %[[LOOP_LATCH:.*]] ], [ 0, %[[SCALAR_PH]] ] ; VF4IC4-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[IV]] ; VF4IC4-NEXT: [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 4 ; VF4IC4-NEXT: [[C_1:%.*]] = icmp eq i32 [[L]], 10 diff --git a/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave.ll b/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave.ll index 3f51c72a6d3d..678b171832c3 100644 --- a/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave.ll +++ b/llvm/test/Transforms/LoopVectorize/single-early-exit-interleave.ll @@ -40,10 +40,9 @@ define i64 @multi_exiting_to_different_exits_live_in_exit_values() { ; VF4IC4: vector.early.exit: ; VF4IC4-NEXT: br label 
[[E1:%.*]] ; VF4IC4: scalar.ph: -; VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; VF4IC4-NEXT: br label [[LOOP_HEADER:%.*]] ; VF4IC4: loop.header: -; VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], [[LOOP_LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], [[LOOP_LATCH:%.*]] ], [ 0, [[SCALAR_PH]] ] ; VF4IC4-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[IV]] ; VF4IC4-NEXT: [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 4 ; VF4IC4-NEXT: [[C_1:%.*]] = icmp eq i32 [[L]], 10 @@ -149,10 +148,9 @@ define i64 @same_exit_block_pre_inc_use1() { ; VF4IC4-NEXT: [[TMP10:%.*]] = add i64 3, [[TMP9]] ; VF4IC4-NEXT: br label [[LOOP_END]] ; VF4IC4: scalar.ph: -; VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[ENTRY:%.*]] ] ; VF4IC4-NEXT: br label [[LOOP:%.*]] ; VF4IC4: loop: -; VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH]] ] ; VF4IC4-NEXT: [[GEP_P1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IV]] ; VF4IC4-NEXT: [[LD1:%.*]] = load i8, ptr [[GEP_P1]], align 1 ; VF4IC4-NEXT: [[GEP_P2:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IV]] @@ -247,10 +245,9 @@ define ptr @same_exit_block_pre_inc_use1_ivptr() { ; VF4IC4-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[P1]], i64 [[TMP7]] ; VF4IC4-NEXT: br label [[LOOP_END]] ; VF4IC4: scalar.ph: -; VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[P1]], [[ENTRY:%.*]] ] ; VF4IC4-NEXT: br label [[LOOP:%.*]] ; VF4IC4: loop: -; VF4IC4-NEXT: [[PTR:%.*]] = phi ptr [ [[PTR_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; VF4IC4-NEXT: [[PTR:%.*]] = phi ptr [ [[PTR_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[P1]], [[SCALAR_PH]] ] ; VF4IC4-NEXT: [[LD1:%.*]] = load i8, ptr [[PTR]], align 1 ; VF4IC4-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], 72 ; VF4IC4-NEXT: br i1 [[CMP3]], 
label [[LOOP_INC]], label [[LOOP_END]] @@ -348,10 +345,9 @@ define i64 @same_exit_block_post_inc_use() { ; VF4IC4-NEXT: [[TMP10:%.*]] = add i64 3, [[TMP9]] ; VF4IC4-NEXT: br label [[LOOP_END]] ; VF4IC4: scalar.ph: -; VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[ENTRY:%.*]] ] ; VF4IC4-NEXT: br label [[LOOP:%.*]] ; VF4IC4: loop: -; VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH]] ] ; VF4IC4-NEXT: [[GEP_P1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IV]] ; VF4IC4-NEXT: [[LD1:%.*]] = load i8, ptr [[GEP_P1]], align 1 ; VF4IC4-NEXT: [[GEP_P2:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IV]] @@ -456,10 +452,9 @@ define i64 @diff_exit_block_pre_inc_use1() { ; VF4IC4-NEXT: [[TMP10:%.*]] = add i64 3, [[TMP9]] ; VF4IC4-NEXT: br label [[LOOP_EARLY_EXIT:%.*]] ; VF4IC4: scalar.ph: -; VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[ENTRY:%.*]] ] ; VF4IC4-NEXT: br label [[LOOP:%.*]] ; VF4IC4: loop: -; VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH]] ] ; VF4IC4-NEXT: [[GEP_P1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IV]] ; VF4IC4-NEXT: [[LD1:%.*]] = load i8, ptr [[GEP_P1]], align 1 ; VF4IC4-NEXT: [[GEP_P2:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IV]] @@ -571,10 +566,9 @@ define i64 @diff_exit_block_post_inc_use1() { ; VF4IC4-NEXT: [[TMP10:%.*]] = add i64 3, [[TMP9]] ; VF4IC4-NEXT: br label [[LOOP_EARLY_EXIT:%.*]] ; VF4IC4: scalar.ph: -; VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[ENTRY:%.*]] ] ; VF4IC4-NEXT: br label [[LOOP:%.*]] ; VF4IC4: loop: -; VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], 
[[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH]] ] ; VF4IC4-NEXT: [[GEP_P1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IV]] ; VF4IC4-NEXT: [[LD1:%.*]] = load i8, ptr [[GEP_P1]], align 1 ; VF4IC4-NEXT: [[GEP_P2:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IV]] @@ -823,10 +817,9 @@ define i8 @same_exit_block_use_loaded_value() { ; VF4IC4-NEXT: [[TMP42:%.*]] = select i1 [[TMP41]], i8 [[TMP40]], i8 [[TMP38]] ; VF4IC4-NEXT: br label [[LOOP_END]] ; VF4IC4: scalar.ph: -; VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; VF4IC4-NEXT: br label [[LOOP:%.*]] ; VF4IC4: loop: -; VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; VF4IC4-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 0, [[SCALAR_PH]] ] ; VF4IC4-NEXT: [[GEP_P1:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IV]] ; VF4IC4-NEXT: [[LD1:%.*]] = load i8, ptr [[GEP_P1]], align 1 ; VF4IC4-NEXT: [[GEP_P2:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IV]] diff --git a/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll b/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll index 842ff910c89d..dd3521fd99c8 100644 --- a/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll +++ b/llvm/test/Transforms/LoopVectorize/single-value-blend-phis.ll @@ -31,10 +31,9 @@ define void @single_incoming_phi_no_blend_mask(i64 %a, i64 %b) { ; CHECK: middle.block: ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] ; CHECK: loop.header: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] ; CHECK-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i16 ; CHECK-NEXT: br label [[LOOP_COND:%.*]] ; CHECK: loop.cond: @@ -104,9 +103,8 @@ define 
void @single_incoming_phi_with_blend_mask(i64 %a, i64 %b) { ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i16>, ptr [[TMP4]], align 1 ; CHECK-NEXT: [[TMP6:%.*]] = icmp sle <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP7:%.*]] = select <2 x i1> [[TMP3]], <2 x i1> [[TMP6]], <2 x i1> zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i1> [[TMP3]], splat (i1 true) ; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP7]], <2 x i16> [[WIDE_LOAD]], <2 x i16> splat (i16 1) -; CHECK-NEXT: [[PREDPHI1:%.*]] = select <2 x i1> [[TMP8]], <2 x i16> zeroinitializer, <2 x i16> [[PREDPHI]] +; CHECK-NEXT: [[PREDPHI1:%.*]] = select <2 x i1> [[TMP3]], <2 x i16> [[PREDPHI]], <2 x i16> zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[INDEX]] ; CHECK-NEXT: store <2 x i16> [[PREDPHI1]], ptr [[TMP9]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 @@ -116,10 +114,9 @@ define void @single_incoming_phi_with_blend_mask(i64 %a, i64 %b) { ; CHECK: middle.block: ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] ; CHECK: loop.header: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] ; CHECK-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i16 ; CHECK-NEXT: [[CMP_A:%.*]] = icmp ugt i64 [[IV]], [[A]] ; CHECK-NEXT: br i1 [[CMP_A]], label [[LOOP_COND:%.*]], label [[LOOP_LATCH]] @@ -206,10 +203,9 @@ define void @multiple_incoming_phi_with_blend_mask(i64 %a, ptr noalias %dst) { ; CHECK: middle.block: ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] ; CHECK: loop.header: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], 
[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] ; CHECK-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i16 ; CHECK-NEXT: [[IV_TRUNC_2:%.*]] = trunc i64 [[IV]] to i16 ; CHECK-NEXT: [[CMP_A:%.*]] = icmp ugt i64 [[IV]], [[A]] @@ -292,9 +288,8 @@ define void @single_incoming_needs_predication(i64 %a, i64 %b) { ; CHECK-NEXT: [[TMP14:%.*]] = phi <2 x i16> [ [[TMP8]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP13]], [[PRED_LOAD_IF1]] ] ; CHECK-NEXT: [[TMP15:%.*]] = icmp sle <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP16:%.*]] = select <2 x i1> [[TMP2]], <2 x i1> [[TMP15]], <2 x i1> zeroinitializer -; CHECK-NEXT: [[TMP17:%.*]] = xor <2 x i1> [[TMP2]], splat (i1 true) ; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP16]], <2 x i16> [[TMP14]], <2 x i16> splat (i16 1) -; CHECK-NEXT: [[PREDPHI3:%.*]] = select <2 x i1> [[TMP17]], <2 x i16> zeroinitializer, <2 x i16> [[PREDPHI]] +; CHECK-NEXT: [[PREDPHI3:%.*]] = select <2 x i1> [[TMP2]], <2 x i16> [[PREDPHI]], <2 x i16> zeroinitializer ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [32 x i16], ptr @dst, i16 0, i64 [[INDEX]] ; CHECK-NEXT: store <2 x i16> [[PREDPHI3]], ptr [[TMP18]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 @@ -304,10 +299,9 @@ define void @single_incoming_needs_predication(i64 %a, i64 %b) { ; CHECK: middle.block: ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] ; CHECK: loop.header: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] ; CHECK-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i16 ; CHECK-NEXT: [[CMP_A:%.*]] = icmp ugt i64 [[IV]], [[A]] ; CHECK-NEXT: br i1 [[CMP_A]], label 
[[LOOP_COND:%.*]], label [[LOOP_LATCH]] @@ -379,10 +373,9 @@ define void @duplicated_incoming_blocks_blend(i32 %x, ptr %ptr) { ; CHECK: middle.block: ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] ; CHECK: loop.header: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD_I:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[ADD_I:%.*]], [[LOOP_LATCH:%.*]] ] ; CHECK-NEXT: [[C_0:%.*]] = icmp ugt i32 [[IV]], [[X:%.*]] ; CHECK-NEXT: br i1 [[C_0]], label [[LOOP_LATCH]], label [[LOOP_LATCH]] ; CHECK: loop.latch: diff --git a/llvm/test/Transforms/LoopVectorize/single_early_exit.ll b/llvm/test/Transforms/LoopVectorize/single_early_exit.ll index 2c0a6f1b032c..b3451704ea51 100644 --- a/llvm/test/Transforms/LoopVectorize/single_early_exit.ll +++ b/llvm/test/Transforms/LoopVectorize/single_early_exit.ll @@ -34,10 +34,9 @@ define i64 @same_exit_block_phi_of_consts() { ; CHECK: vector.early.exit: ; CHECK-NEXT: br label [[LOOP_END]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] ; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] @@ -109,10 +108,9 @@ define i64 @diff_exit_block_phi_of_consts() { ; CHECK: vector.early.exit: ; CHECK-NEXT: br label [[LOOP_EARLY_EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; 
CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] ; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] @@ -292,10 +290,9 @@ define i32 @diff_blocks_invariant_early_exit_cond(ptr %s) { ; CHECK: vector.early.exit: ; CHECK-NEXT: br label [[EARLY_EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ -10, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[IND:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IND_NEXT:%.*]], [[FOR_INC:%.*]] ] +; CHECK-NEXT: [[IND:%.*]] = phi i32 [ -10, [[SCALAR_PH]] ], [ [[IND_NEXT:%.*]], [[FOR_INC:%.*]] ] ; CHECK-NEXT: br i1 [[COND]], label [[FOR_INC]], label [[EARLY_EXIT]] ; CHECK: for.inc: ; CHECK-NEXT: [[IND_NEXT]] = add nsw i32 [[IND]], 1 diff --git a/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll b/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll index 940e3980a01a..f4b35c779a4b 100644 --- a/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll +++ b/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll @@ -36,10 +36,9 @@ define i64 @same_exit_block_pre_inc_use1() { ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP10]] ; CHECK-NEXT: br label [[LOOP_END]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = 
getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] ; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] @@ -116,12 +115,10 @@ define i32 @same_exit_block_pre_inc_use1_iv64_endi32_step2() { ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i32 9, [[TMP11]] ; CHECK-NEXT: br label [[LOOP_END]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_RESUME_VAL4:%.*]] = phi i32 [ 9, [[ENTRY]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[INDEX2:%.*]] = phi i32 [ [[INDEX2_NEXT:%.*]], [[LOOP_INC]] ], [ [[BC_RESUME_VAL4]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH]] ] +; CHECK-NEXT: [[INDEX2:%.*]] = phi i32 [ [[INDEX2_NEXT:%.*]], [[LOOP_INC]] ], [ 9, [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] ; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] @@ -198,14 +195,11 @@ define i32 @same_exit_block_pre_inc_use1_iv128_endi32_step2() { ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i32 9, [[TMP10]] ; CHECK-NEXT: br label [[LOOP_END]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i128 [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32 [ 9, [[ENTRY]] ] -; CHECK-NEXT: [[BC_RESUME_VAL4:%.*]] = phi ptr [ [[P1]], [[ENTRY]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i128 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[INDEX2:%.*]] = phi i32 [ [[INDEX2_NEXT:%.*]], [[LOOP_INC]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[PTR:%.*]] = phi ptr [ 
[[PTR_NEXT:%.*]], [[LOOP_INC]] ], [ [[BC_RESUME_VAL4]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i128 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH]] ] +; CHECK-NEXT: [[INDEX2:%.*]] = phi i32 [ [[INDEX2_NEXT:%.*]], [[LOOP_INC]] ], [ 9, [[SCALAR_PH]] ] +; CHECK-NEXT: [[PTR:%.*]] = phi ptr [ [[PTR_NEXT:%.*]], [[LOOP_INC]] ], [ [[P1]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[PTR]], align 1 ; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], 3 ; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]] @@ -280,12 +274,10 @@ define float @same_exit_block_pre_inc_use1_iv64_endf32() { ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = fadd fast float 9.000000e+00, [[TMP11]] ; CHECK-NEXT: br label [[LOOP_END]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_RESUME_VAL4:%.*]] = phi float [ 9.000000e+00, [[ENTRY]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[INDEX2:%.*]] = phi float [ [[INDEX2_NEXT:%.*]], [[LOOP_INC]] ], [ [[BC_RESUME_VAL4]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH]] ] +; CHECK-NEXT: [[INDEX2:%.*]] = phi float [ [[INDEX2_NEXT:%.*]], [[LOOP_INC]] ], [ 9.000000e+00, [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] ; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] @@ -364,12 +356,10 @@ define ptr @same_exit_block_pre_inc_use1_iv64_endptr() { ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = getelementptr i8, ptr [[P2]], i64 [[TMP20]] ; CHECK-NEXT: br label [[LOOP_END]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_RESUME_VAL8:%.*]] = phi ptr [ 
[[P2]], [[ENTRY]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[INDEX2:%.*]] = phi ptr [ [[INDEX2_NEXT:%.*]], [[LOOP_INC]] ], [ [[BC_RESUME_VAL8]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH]] ] +; CHECK-NEXT: [[INDEX2:%.*]] = phi ptr [ [[INDEX2_NEXT:%.*]], [[LOOP_INC]] ], [ [[P2]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] ; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] @@ -443,10 +433,9 @@ define ptr @same_exit_block_pre_inc_use1_ivptr() { ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = getelementptr i8, ptr [[P1]], i64 [[TMP8]] ; CHECK-NEXT: br label [[LOOP_END]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[P1]], [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[PTR:%.*]] = phi ptr [ [[PTR_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[PTR:%.*]] = phi ptr [ [[PTR_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[P1]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[PTR]], align 1 ; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], 72 ; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END]] @@ -517,10 +506,9 @@ define i64 @same_exit_block_pre_inc1_use_inv_cond(i1 %cond) { ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP11]] ; CHECK-NEXT: br label [[LOOP_END]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], 
[ 3, [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] ; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] @@ -597,10 +585,9 @@ define i64 @same_exit_block_pre_inc_use1_gep_two_indices() { ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP10]] ; CHECK-NEXT: br label [[LOOP_END]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i8], ptr [[P1]], i64 0, i64 [[INDEX]] ; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [1024 x i8], ptr [[P2]], i64 0, i64 [[INDEX]] @@ -675,10 +662,9 @@ define i64 @same_exit_block_pre_inc_use1_alloca_diff_type() { ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP10]] ; CHECK-NEXT: br label [[LOOP_END]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] ; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] @@ -750,10 +736,9 @@ define i64 @same_exit_block_pre_inc_use2() { ; CHECK: vector.early.exit: ; CHECK-NEXT: br label [[LOOP_END]] ; CHECK: scalar.ph: -; 
CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] ; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] @@ -828,10 +813,9 @@ define i64 @same_exit_block_pre_inc_use3() { ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP10]] ; CHECK-NEXT: br label [[LOOP_END]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] ; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] @@ -907,10 +891,9 @@ define i64 @same_exit_block_pre_inc_use4() { ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP8]] ; CHECK-NEXT: br label [[LOOP_END]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[P1]], i64 [[INDEX]] ; CHECK-NEXT: [[LD1:%.*]] = load i64, ptr [[ARRAYIDX]], 
align 1 ; CHECK-NEXT: [[CMP3:%.*]] = icmp ult i64 [[INDEX]], [[LD1]] @@ -981,10 +964,9 @@ define i64 @same_exit_block_post_inc_use() { ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP10]] ; CHECK-NEXT: br label [[LOOP_END]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] ; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] @@ -1056,10 +1038,9 @@ define ptr @same_exit_block_post_inc_use1_ivptr() { ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = getelementptr i8, ptr [[P1]], i64 [[TMP9]] ; CHECK-NEXT: br label [[LOOP_END]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[P1]], [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[PTR:%.*]] = phi ptr [ [[PTR_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[PTR:%.*]] = phi ptr [ [[PTR_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[P1]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[PTR]], align 1 ; CHECK-NEXT: [[PTR_NEXT]] = getelementptr inbounds i8, ptr [[PTR]], i64 1 ; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], 72 @@ -1128,10 +1109,9 @@ define i64 @same_exit_block_post_inc_use2() { ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP11]] ; CHECK-NEXT: br label [[LOOP_END]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; 
CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] ; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] @@ -1206,10 +1186,9 @@ define i64 @diff_exit_block_pre_inc_use1() { ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP10]] ; CHECK-NEXT: br label [[LOOP_EARLY_EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] ; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] @@ -1288,10 +1267,9 @@ define i64 @diff_exit_block_pre_inc_use2() { ; CHECK: vector.early.exit: ; CHECK-NEXT: br label [[LOOP_EARLY_EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] ; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] @@ -1373,10 +1351,9 @@ define i64 @diff_exit_block_pre_inc_use3() { ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP10]] ; CHECK-NEXT: br label 
[[LOOP_EARLY_EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] ; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] @@ -1456,10 +1433,9 @@ define i64 @diff_exit_block_post_inc_use1() { ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP10]] ; CHECK-NEXT: br label [[LOOP_EARLY_EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] ; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] @@ -1542,10 +1518,9 @@ define i64 @diff_exit_block_post_inc_use2() { ; CHECK-NEXT: [[TMP21:%.*]] = add i64 3, [[TMP11]] ; CHECK-NEXT: br label [[LOOP_EARLY_EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH]] ] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: 
[[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] ; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 @@ -1630,12 +1605,10 @@ define i64 @diff_exit_block_post_inc_use3(i64 %start) { ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 [[START]], [[TMP12]] ; CHECK-NEXT: br label [[LOOP_EARLY_EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[BC_RESUME_VAL5:%.*]] = phi i64 [ [[START]], [[ENTRY]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[INDEX2_NEXT:%.*]], [[LOOP_INC]] ], [ [[BC_RESUME_VAL5]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH]] ] +; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[INDEX2_NEXT:%.*]], [[LOOP_INC]] ], [ [[START]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[INDEX2_NEXT]] = add i64 [[INDEX2]], 1 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] @@ -1719,10 +1692,9 @@ define i64 @loop_contains_safe_call() { ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP9]] ; CHECK-NEXT: br label [[LOOP_END]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[P1]], i64 [[INDEX]] ; CHECK-NEXT: [[LD1:%.*]] = load float, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[SQRT:%.*]] = tail call fast float @llvm.sqrt.f32(float [[LD1]]) @@ -1794,10 +1766,9 @@ define i64 @loop_contains_safe_div() { ; 
CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP9]] ; CHECK-NEXT: br label [[LOOP_END]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[INDEX]] ; CHECK-NEXT: [[LD1:%.*]] = load i32, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[DIV:%.*]] = udiv i32 [[LD1]], 20000 @@ -1870,10 +1841,9 @@ define i64 @loop_contains_load_after_early_exit(ptr dereferenceable(1024) align( ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP11]] ; CHECK-NEXT: br label [[LOOP_END]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[INDEX]] ; CHECK-NEXT: [[LD1:%.*]] = load i32, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[LD1]], 1 @@ -2077,10 +2047,9 @@ define i64 @same_exit_block_pre_inc_use1_deref_ptrs(ptr dereferenceable(1024) %p ; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP10]] ; CHECK-NEXT: br label [[LOOP_END]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 3, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[SCALAR_PH]] ] ; CHECK-NEXT: 
[[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] ; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] diff --git a/llvm/test/Transforms/LoopVectorize/strided-accesses-interleave-only.ll b/llvm/test/Transforms/LoopVectorize/strided-accesses-interleave-only.ll index 971921a9c1d6..8e47f19efa24 100644 --- a/llvm/test/Transforms/LoopVectorize/strided-accesses-interleave-only.ll +++ b/llvm/test/Transforms/LoopVectorize/strided-accesses-interleave-only.ll @@ -23,10 +23,9 @@ define void @test_variable_stride(ptr %dst, i32 %scale) { ; CHECK: middle.block: ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[IDX:%.*]] = mul i32 [[IV]], [[SCALE]] ; CHECK-NEXT: [[GEP:%.*]] = getelementptr i16, ptr [[DST]], i32 [[IDX]] ; CHECK-NEXT: store i32 [[IV]], ptr [[GEP]], align 2 diff --git a/llvm/test/Transforms/LoopVectorize/trunc-extended-icmps.ll b/llvm/test/Transforms/LoopVectorize/trunc-extended-icmps.ll index a687ecc33af5..9e94768fc2cb 100644 --- a/llvm/test/Transforms/LoopVectorize/trunc-extended-icmps.ll +++ b/llvm/test/Transforms/LoopVectorize/trunc-extended-icmps.ll @@ -145,10 +145,9 @@ define void @ext_cmp(ptr %src.1, ptr %src.2, ptr noalias %dst) { ; CHECK: middle.block: ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] ; 
CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr inbounds i16, ptr [[SRC_1]], i64 [[IV]] ; CHECK-NEXT: [[I2:%.*]] = load i16, ptr [[GEP_SRC_1]], align 2 ; CHECK-NEXT: [[I3:%.*]] = sext i16 [[I2]] to i32 diff --git a/llvm/test/Transforms/LoopVectorize/trunc-loads-p16.ll b/llvm/test/Transforms/LoopVectorize/trunc-loads-p16.ll index 66dc785d95f4..ac9b03567aab 100644 --- a/llvm/test/Transforms/LoopVectorize/trunc-loads-p16.ll +++ b/llvm/test/Transforms/LoopVectorize/trunc-loads-p16.ll @@ -28,10 +28,9 @@ define void @pr77468(ptr noalias %src, ptr noalias %dst, i1 %x) { ; CHECK: middle.block: ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i16 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr i32, ptr [[SRC]], i16 [[IV]] ; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 1 ; CHECK-NEXT: [[X_EXT:%.*]] = zext i1 [[X]] to i32 diff --git a/llvm/test/Transforms/LoopVectorize/trunc-shifts.ll b/llvm/test/Transforms/LoopVectorize/trunc-shifts.ll index 10e9ae80beb7..9e710763be13 100644 --- a/llvm/test/Transforms/LoopVectorize/trunc-shifts.ll +++ b/llvm/test/Transforms/LoopVectorize/trunc-shifts.ll @@ -26,10 +26,9 @@ define void @test_pr47927_lshr_const_shift_ops(ptr %dst, i32 %f) { ; CHECK: middle.block: ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i8 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i8 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[L:%.*]] = lshr i32 [[F]], 18 ; CHECK-NEXT: [[L_T:%.*]] = trunc i32 [[L]] to i8 ; CHECK-NEXT: 
[[IV_EXT:%.*]] = zext i8 [[IV]] to i64 @@ -84,10 +83,9 @@ define void @test_shl_const_shift_ops(ptr %dst, i32 %f) { ; CHECK: middle.block: ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i8 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i8 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[L:%.*]] = shl i32 [[F]], 18 ; CHECK-NEXT: [[L_T:%.*]] = trunc i32 [[L]] to i8 ; CHECK-NEXT: [[IV_EXT:%.*]] = zext i8 [[IV]] to i64 @@ -142,10 +140,9 @@ define void @test_ashr_const_shift_ops(ptr %dst, i32 %f) { ; CHECK: middle.block: ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i8 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i8 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[L:%.*]] = ashr i32 [[F]], 18 ; CHECK-NEXT: [[L_T:%.*]] = trunc i32 [[L]] to i8 ; CHECK-NEXT: [[IV_EXT:%.*]] = zext i8 [[IV]] to i64 @@ -200,10 +197,9 @@ define void @test_shl_const_shifted_op(ptr %dst, i32 %f) { ; CHECK: middle.block: ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i8 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i8 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[IV_EXT:%.*]] = zext i8 [[IV]] to i64 ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[IV_EXT]] ; CHECK-NEXT: [[LV:%.*]] = load i8, ptr [[GEP]], align 1 @@ -263,10 +259,9 @@ define void @test_lshr_by_18(ptr %A) { ; CHECK: 
middle.block: ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i8 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i8 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[IV_EXT:%.*]] = zext i8 [[IV]] to i64 ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV_EXT]] ; CHECK-NEXT: [[LV:%.*]] = load i8, ptr [[GEP]], align 1 @@ -325,10 +320,9 @@ define void @test_lshr_by_4(ptr %A) { ; CHECK: middle.block: ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i8 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i8 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[IV_EXT:%.*]] = zext i8 [[IV]] to i64 ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV_EXT]] ; CHECK-NEXT: [[LV:%.*]] = load i8, ptr [[GEP]], align 1 diff --git a/llvm/test/Transforms/LoopVectorize/uitofp-preserve-nneg.ll b/llvm/test/Transforms/LoopVectorize/uitofp-preserve-nneg.ll index c67817556c16..c92dc1a641c6 100644 --- a/llvm/test/Transforms/LoopVectorize/uitofp-preserve-nneg.ll +++ b/llvm/test/Transforms/LoopVectorize/uitofp-preserve-nneg.ll @@ -24,10 +24,9 @@ define void @uitofp_preserve_nneg(ptr %result, i32 %size, float %y) { ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[TMP4:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[FOR_BODY_PREHEADER4]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER4]] ], [ 
[[INC:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[CONV:%.*]] = uitofp nneg i32 [[TMP4]] to float ; CHECK-NEXT: [[TMP5:%.*]] = fmul float [[CONV]], [[Y]] ; CHECK-NEXT: [[INDVARS_IV:%.*]] = zext nneg i32 [[TMP4]] to i64 diff --git a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll index b500acb79782..5c464419f36d 100644 --- a/llvm/test/Transforms/LoopVectorize/uniform-blend.ll +++ b/llvm/test/Transforms/LoopVectorize/uniform-blend.ll @@ -6,7 +6,7 @@ define void @blend_uniform_iv_trunc(i1 %c) { ; CHECK-LABEL: define void @blend_uniform_iv_trunc( ; CHECK-SAME: i1 [[C:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -22,10 +22,9 @@ define void @blend_uniform_iv_trunc(i1 %c) { ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] ; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; CHECK-NEXT: [[IV_TRUNC_2:%.*]] = trunc i64 [[IV]] to i16 ; CHECK-NEXT: br i1 [[C]], label %[[LOOP_NEXT:.*]], label %[[LOOP_LATCH]] ; CHECK: [[LOOP_NEXT]]: @@ -66,7 +65,7 @@ exit: ; preds = %loop.latch define void @blend_uniform_iv(i1 %c) { ; CHECK-LABEL: define void @blend_uniform_iv( ; CHECK-SAME: i1 [[C:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -81,10 +80,9 @@ define void @blend_uniform_iv(i1 %c) { ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; 
CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] ; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; CHECK-NEXT: br i1 [[C]], label %[[LOOP_NEXT:.*]], label %[[LOOP_LATCH]] ; CHECK: [[LOOP_NEXT]]: ; CHECK-NEXT: br label %[[LOOP_LATCH]] @@ -124,7 +122,7 @@ exit: ; preds = %loop.latch define void @blend_chain_iv(i1 %c) { ; CHECK-LABEL: define void @blend_chain_iv( ; CHECK-SAME: i1 [[C:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C]], i64 0 @@ -156,10 +154,9 @@ define void @blend_chain_iv(i1 %c) { ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] ; CHECK: [[LOOP_HEADER]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; CHECK-NEXT: br i1 [[C]], label %[[LOOP_NEXT:.*]], label %[[LOOP_LATCH]] ; CHECK: [[LOOP_NEXT]]: ; CHECK-NEXT: br i1 [[C]], label %[[LOOP_NEXT_2:.*]], label %[[LOOP_NEXT_3:.*]] diff --git a/llvm/test/Transforms/LoopVectorize/unused-blend-mask-for-first-operand.ll b/llvm/test/Transforms/LoopVectorize/unused-blend-mask-for-first-operand.ll index 0541c9d92240..ef1acc0349a6 100644 --- a/llvm/test/Transforms/LoopVectorize/unused-blend-mask-for-first-operand.ll +++ b/llvm/test/Transforms/LoopVectorize/unused-blend-mask-for-first-operand.ll @@ -25,10 +25,9 @@ define void @test_not_first_lane_only_constant(ptr %A, 
ptr noalias %B) { ; CHECK: middle.block: ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] ; CHECK: loop.header: -; CHECK-NEXT: [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i16 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] ; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i16, ptr [[A]], i16 [[IV]] ; CHECK-NEXT: br i1 false, label [[LOOP_LATCH]], label [[ELSE_1:%.*]] ; CHECK: else.1: @@ -101,10 +100,9 @@ define void @test_not_first_lane_only_wide_compare(ptr %A, ptr noalias %B, i16 % ; CHECK: middle.block: ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] ; CHECK: loop.header: -; CHECK-NEXT: [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i16 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] ; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i16, ptr [[A]], i16 [[IV]] ; CHECK-NEXT: [[L_0:%.*]] = load i16, ptr [[GEP_A]], align 2 ; CHECK-NEXT: [[C_0:%.*]] = icmp ult i16 [[L_0]], [[X]] @@ -183,10 +181,9 @@ define void @test_not_first_lane_only_wide_compare_incoming_order_swapped(ptr %A ; CHECK: middle.block: ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] ; CHECK: loop.header: -; CHECK-NEXT: [[IV:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i16 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] ; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i16, ptr [[A]], i16 [[IV]] ; CHECK-NEXT: [[L_0:%.*]] = load i16, ptr [[GEP_A]], align 2 
; CHECK-NEXT: [[C_0:%.*]] = icmp ult i16 [[L_0]], [[X]] diff --git a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll index e1185207813e..f1e68d47848a 100644 --- a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll +++ b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-early-exit.ll @@ -10,7 +10,7 @@ target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" define i8 @test_early_exit_max_tc_less_than_16(ptr dereferenceable(16) %A) nosync nofree { ; VF8UF1-LABEL: define i8 @test_early_exit_max_tc_less_than_16( ; VF8UF1-SAME: ptr dereferenceable(16) [[A:%.*]]) #[[ATTR0:[0-9]+]] { -; VF8UF1-NEXT: [[ENTRY:.*]]: +; VF8UF1-NEXT: [[ENTRY:.*:]] ; VF8UF1-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; VF8UF1: [[VECTOR_PH]]: ; VF8UF1-NEXT: br label %[[VECTOR_BODY:.*]] @@ -31,10 +31,9 @@ define i8 @test_early_exit_max_tc_less_than_16(ptr dereferenceable(16) %A) nosyn ; VF8UF1: [[VECTOR_EARLY_EXIT]]: ; VF8UF1-NEXT: br label %[[EXIT]] ; VF8UF1: [[SCALAR_PH]]: -; VF8UF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; VF8UF1-NEXT: br label %[[LOOP_HEADER:.*]] ; VF8UF1: [[LOOP_HEADER]]: -; VF8UF1-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; VF8UF1-NEXT: [[IV1:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; VF8UF1-NEXT: [[P_SRC1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV1]] ; VF8UF1-NEXT: [[L:%.*]] = load i8, ptr [[P_SRC1]], align 1 ; VF8UF1-NEXT: [[C:%.*]] = icmp eq i8 [[L]], 0 @@ -49,7 +48,7 @@ define i8 @test_early_exit_max_tc_less_than_16(ptr dereferenceable(16) %A) nosyn ; ; VF8UF2-LABEL: define i8 @test_early_exit_max_tc_less_than_16( ; VF8UF2-SAME: ptr dereferenceable(16) [[A:%.*]]) #[[ATTR0:[0-9]+]] { -; VF8UF2-NEXT: [[ENTRY:.*]]: +; VF8UF2-NEXT: [[ENTRY:.*:]] ; 
VF8UF2-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; VF8UF2: [[VECTOR_PH]]: ; VF8UF2-NEXT: br label %[[VECTOR_BODY:.*]] @@ -69,10 +68,9 @@ define i8 @test_early_exit_max_tc_less_than_16(ptr dereferenceable(16) %A) nosyn ; VF8UF2: [[VECTOR_EARLY_EXIT]]: ; VF8UF2-NEXT: br label %[[EXIT]] ; VF8UF2: [[SCALAR_PH]]: -; VF8UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; VF8UF2-NEXT: br label %[[LOOP_HEADER:.*]] ; VF8UF2: [[LOOP_HEADER]]: -; VF8UF2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; VF8UF2-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; VF8UF2-NEXT: [[P_SRC:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]] ; VF8UF2-NEXT: [[L:%.*]] = load i8, ptr [[P_SRC]], align 1 ; VF8UF2-NEXT: [[C:%.*]] = icmp eq i8 [[L]], 0 @@ -87,7 +85,7 @@ define i8 @test_early_exit_max_tc_less_than_16(ptr dereferenceable(16) %A) nosyn ; ; VF16UF1-LABEL: define i8 @test_early_exit_max_tc_less_than_16( ; VF16UF1-SAME: ptr dereferenceable(16) [[A:%.*]]) #[[ATTR0:[0-9]+]] { -; VF16UF1-NEXT: [[ENTRY:.*]]: +; VF16UF1-NEXT: [[ENTRY:.*:]] ; VF16UF1-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; VF16UF1: [[VECTOR_PH]]: ; VF16UF1-NEXT: br label %[[VECTOR_BODY:.*]] @@ -103,10 +101,9 @@ define i8 @test_early_exit_max_tc_less_than_16(ptr dereferenceable(16) %A) nosyn ; VF16UF1: [[VECTOR_EARLY_EXIT]]: ; VF16UF1-NEXT: br label %[[EXIT]] ; VF16UF1: [[SCALAR_PH]]: -; VF16UF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; VF16UF1-NEXT: br label %[[LOOP_HEADER:.*]] ; VF16UF1: [[LOOP_HEADER]]: -; VF16UF1-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; VF16UF1-NEXT: [[IV1:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; VF16UF1-NEXT: [[P_SRC1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV1]] ; VF16UF1-NEXT: [[L:%.*]] = load i8, 
ptr [[P_SRC1]], align 1 ; VF16UF1-NEXT: [[C:%.*]] = icmp eq i8 [[L]], 0 @@ -142,7 +139,7 @@ exit: define i64 @test_early_exit_max_tc_less_than_16_with_iv_used_outside(ptr dereferenceable(16) %A) nosync nofree { ; VF8UF1-LABEL: define i64 @test_early_exit_max_tc_less_than_16_with_iv_used_outside( ; VF8UF1-SAME: ptr dereferenceable(16) [[A:%.*]]) #[[ATTR0]] { -; VF8UF1-NEXT: [[ENTRY:.*]]: +; VF8UF1-NEXT: [[ENTRY:.*:]] ; VF8UF1-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; VF8UF1: [[VECTOR_PH]]: ; VF8UF1-NEXT: br label %[[VECTOR_BODY:.*]] @@ -165,10 +162,9 @@ define i64 @test_early_exit_max_tc_less_than_16_with_iv_used_outside(ptr derefer ; VF8UF1-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], [[FIRST_ACTIVE_LANE]] ; VF8UF1-NEXT: br label %[[EXIT]] ; VF8UF1: [[SCALAR_PH]]: -; VF8UF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; VF8UF1-NEXT: br label %[[LOOP_HEADER:.*]] ; VF8UF1: [[LOOP_HEADER]]: -; VF8UF1-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; VF8UF1-NEXT: [[IV1:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; VF8UF1-NEXT: [[P_SRC1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV1]] ; VF8UF1-NEXT: [[L:%.*]] = load i8, ptr [[P_SRC1]], align 1 ; VF8UF1-NEXT: [[C:%.*]] = icmp eq i8 [[L]], 0 @@ -183,7 +179,7 @@ define i64 @test_early_exit_max_tc_less_than_16_with_iv_used_outside(ptr derefer ; ; VF8UF2-LABEL: define i64 @test_early_exit_max_tc_less_than_16_with_iv_used_outside( ; VF8UF2-SAME: ptr dereferenceable(16) [[A:%.*]]) #[[ATTR0]] { -; VF8UF2-NEXT: [[ENTRY:.*]]: +; VF8UF2-NEXT: [[ENTRY:.*:]] ; VF8UF2-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; VF8UF2: [[VECTOR_PH]]: ; VF8UF2-NEXT: br label %[[VECTOR_BODY:.*]] @@ -210,10 +206,9 @@ define i64 @test_early_exit_max_tc_less_than_16_with_iv_used_outside(ptr derefer ; VF8UF2-NEXT: [[TMP12:%.*]] = add i64 0, [[TMP11]] ; VF8UF2-NEXT: br label %[[EXIT]] ; VF8UF2: 
[[SCALAR_PH]]: -; VF8UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; VF8UF2-NEXT: br label %[[LOOP_HEADER:.*]] ; VF8UF2: [[LOOP_HEADER]]: -; VF8UF2-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; VF8UF2-NEXT: [[IV1:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; VF8UF2-NEXT: [[P_SRC1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV1]] ; VF8UF2-NEXT: [[L:%.*]] = load i8, ptr [[P_SRC1]], align 1 ; VF8UF2-NEXT: [[C:%.*]] = icmp eq i8 [[L]], 0 @@ -228,7 +223,7 @@ define i64 @test_early_exit_max_tc_less_than_16_with_iv_used_outside(ptr derefer ; ; VF16UF1-LABEL: define i64 @test_early_exit_max_tc_less_than_16_with_iv_used_outside( ; VF16UF1-SAME: ptr dereferenceable(16) [[A:%.*]]) #[[ATTR0]] { -; VF16UF1-NEXT: [[ENTRY:.*]]: +; VF16UF1-NEXT: [[ENTRY:.*:]] ; VF16UF1-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; VF16UF1: [[VECTOR_PH]]: ; VF16UF1-NEXT: br label %[[VECTOR_BODY:.*]] @@ -246,10 +241,9 @@ define i64 @test_early_exit_max_tc_less_than_16_with_iv_used_outside(ptr derefer ; VF16UF1-NEXT: [[TMP5:%.*]] = add i64 0, [[FIRST_ACTIVE_LANE]] ; VF16UF1-NEXT: br label %[[EXIT]] ; VF16UF1: [[SCALAR_PH]]: -; VF16UF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; VF16UF1-NEXT: br label %[[LOOP_HEADER:.*]] ; VF16UF1: [[LOOP_HEADER]]: -; VF16UF1-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; VF16UF1-NEXT: [[IV1:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; VF16UF1-NEXT: [[P_SRC1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV1]] ; VF16UF1-NEXT: [[L:%.*]] = load i8, ptr [[P_SRC1]], align 1 ; VF16UF1-NEXT: [[C:%.*]] = icmp eq i8 [[L]], 0 diff --git a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-outside-iv-users.ll b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-outside-iv-users.ll index 
5f1cee887fda..ada59e90b881 100644 --- a/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-outside-iv-users.ll +++ b/llvm/test/Transforms/LoopVectorize/vector-loop-backedge-elimination-outside-iv-users.ll @@ -6,7 +6,7 @@ target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" define i64 @remove_loop_region_int_iv_used_outside(ptr %dst) { ; CHECK-LABEL: define i64 @remove_loop_region_int_iv_used_outside( ; CHECK-SAME: ptr [[DST:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -18,10 +18,9 @@ define i64 @remove_loop_region_int_iv_used_outside(ptr %dst) { ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[GEP:%.*]] = getelementptr ptr, ptr [[DST]], i64 [[IV]] ; CHECK-NEXT: store ptr null, ptr [[GEP]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 @@ -50,7 +49,7 @@ exit: define i64 @remove_loop_region_int_iv_inc_used_outside(ptr %dst) { ; CHECK-LABEL: define i64 @remove_loop_region_int_iv_inc_used_outside( ; CHECK-SAME: ptr [[DST:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -62,10 +61,9 @@ define i64 @remove_loop_region_int_iv_inc_used_outside(ptr %dst) { ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: 
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[GEP:%.*]] = getelementptr ptr, ptr [[DST]], i64 [[IV]] ; CHECK-NEXT: store ptr null, ptr [[GEP]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 @@ -94,7 +92,7 @@ exit: define ptr @remove_loop_region_ptr_iv_used_outside(ptr %dst) { ; CHECK-LABEL: define ptr @remove_loop_region_ptr_iv_used_outside( ; CHECK-SAME: ptr [[DST:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[DST]], i64 128 @@ -108,12 +106,10 @@ define ptr @remove_loop_region_ptr_iv_used_outside(ptr %dst) { ; CHECK-NEXT: [[IND_ESCAPE:%.*]] = getelementptr i8, ptr [[TMP0]], i64 -8 ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[DST]], %[[ENTRY]] ] -; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[INT_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL1]], %[[SCALAR_PH]] ], [ [[INT_IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[DST]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[INT_IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INT_IV_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: store ptr null, ptr [[PTR_IV]], align 8 ; CHECK-NEXT: [[INT_IV_NEXT]] = add i64 [[INT_IV]], 1 ; CHECK-NEXT: [[PTR_IV_NEXT]] = getelementptr i8, ptr [[PTR_IV]], i64 8 @@ -143,7 +139,7 @@ exit: define ptr @remove_loop_region_ptr_iv_inc_used_outside(ptr %dst) { ; CHECK-LABEL: define ptr @remove_loop_region_ptr_iv_inc_used_outside( ; CHECK-SAME: ptr [[DST:%.*]]) { -; 
CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[DST]], i64 128 @@ -156,12 +152,10 @@ define ptr @remove_loop_region_ptr_iv_inc_used_outside(ptr %dst) { ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[DST]], %[[ENTRY]] ] -; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[INT_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL1]], %[[SCALAR_PH]] ], [ [[INT_IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[DST]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[INT_IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[INT_IV_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: store ptr null, ptr [[PTR_IV]], align 8 ; CHECK-NEXT: [[INT_IV_NEXT]] = add i64 [[INT_IV]], 1 ; CHECK-NEXT: [[PTR_IV_NEXT]] = getelementptr i8, ptr [[PTR_IV]], i64 8 diff --git a/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll b/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll index e779233e6c67..cae5c4af1379 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll @@ -96,8 +96,7 @@ define void @iv_expand(ptr %p, i64 %n) { ; CHECK-NEXT: EMIT vp<[[BROADCAST_1:%.+]]> = broadcast ir<1> ; CHECK-NEXT: EMIT vp<[[MUL:%.+]]> = mul vp<[[STEP_VECTOR]]>, vp<[[BROADCAST_1]]> ; CHECK-NEXT: EMIT vp<[[INDUCTION:%.+]]> = add vp<[[BROADCAST_0]]>, vp<[[MUL]]> -; CHECK-NEXT: EMIT vp<[[INC:%.+]]> = mul ir<1>, ir<8> -; CHECK-NEXT: EMIT vp<[[BROADCAST_INC:%.+]]> = broadcast vp<[[INC]]> +; CHECK-NEXT: EMIT vp<[[BROADCAST_INC:%.+]]> = broadcast ir<8> ; CHECK-NEXT: Successor(s): 
vector.body ; CHECK-EMPTY: ; CHECK-NEXT: vector.body: diff --git a/llvm/test/Transforms/LoopVectorize/widen-gep-all-indices-invariant.ll b/llvm/test/Transforms/LoopVectorize/widen-gep-all-indices-invariant.ll index 5d0d391e5b99..19cf1069f05b 100644 --- a/llvm/test/Transforms/LoopVectorize/widen-gep-all-indices-invariant.ll +++ b/llvm/test/Transforms/LoopVectorize/widen-gep-all-indices-invariant.ll @@ -22,10 +22,9 @@ define void @pr63340(ptr %A, ptr %B) { ; CHECK: middle.block: ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] ; CHECK: loop.header: -; CHECK-NEXT: [[IV:%.*]] = phi i8 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i8 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] ; CHECK-NEXT: br label [[LOOP_LATCH]] ; CHECK: loop.latch: ; CHECK-NEXT: [[F_0_I:%.*]] = phi ptr [ [[A]], [[LOOP_HEADER]] ] @@ -80,10 +79,9 @@ define void @wide_gep_index_invariant(ptr noalias %dst, ptr noalias %src, i64 %n ; CHECK: middle.block: ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[L:%.*]] = load ptr, ptr [[SRC]], align 8 ; CHECK-NEXT: [[GEP_L:%.*]] = getelementptr float, ptr [[L]], i64 [[N]] ; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr ptr, ptr [[DST]], i64 [[IV]] @@ -134,10 +132,9 @@ define void @wide_gep_multiple_indices_some_invariant(ptr noalias %dst, ptr noal ; CHECK: middle.block: ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: 
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[L:%.*]] = load ptr, ptr [[SRC]], align 8 ; CHECK-NEXT: [[GEP_L:%.*]] = getelementptr [10 x float], ptr [[L]], i32 [[X]], i64 [[IV]] ; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr ptr, ptr [[DST]], i64 [[IV]] diff --git a/llvm/test/Transforms/LoopVectorize/widen-intrinsic.ll b/llvm/test/Transforms/LoopVectorize/widen-intrinsic.ll index c23d2b38659f..1cc2e871925b 100644 --- a/llvm/test/Transforms/LoopVectorize/widen-intrinsic.ll +++ b/llvm/test/Transforms/LoopVectorize/widen-intrinsic.ll @@ -5,7 +5,7 @@ define void @powi_only_first_lane_used_of_second_arg(ptr %p, i32 %pow) { ; CHECK-LABEL: define void @powi_only_first_lane_used_of_second_arg( ; CHECK-SAME: ptr [[P:%.*]], i32 [[POW:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] @@ -21,10 +21,9 @@ define void @powi_only_first_lane_used_of_second_arg(ptr %p, i32 %pow) { ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, %[[ENTRY]] ] ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[P_GEP:%.*]] = getelementptr float, ptr [[P]], i32 [[IV]] ; CHECK-NEXT: [[X:%.*]] = load float, ptr [[P_GEP]], align 4 ; CHECK-NEXT: [[Y:%.*]] = call float @llvm.powi.f32.i32(float [[X]], i32 [[POW]]) diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/block_scaling_decompr_8bit.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/block_scaling_decompr_8bit.ll index 
05674b9efc39..7175816963ed 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/block_scaling_decompr_8bit.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/block_scaling_decompr_8bit.ll @@ -94,7 +94,7 @@ define dso_local noundef i32 @_Z33block_scaling_decompr_8bitjPK27compressed_data ; CHECK-NEXT: [[DST_ADDR_1]] = getelementptr inbounds nuw i8, ptr [[DST_ADDR_052]], i64 48 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT58]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END]], label %[[FOR_BODY]], !llvm.loop [[LOOP4]] ; CHECK: [[FOR_END]]: ; CHECK-NEXT: ret i32 0 ; @@ -801,8 +801,6 @@ attributes #2 = { nocallback nofree nosync nounwind willreturn memory(none) } !4 = distinct !{!4, !5} !5 = !{!"llvm.loop.mustprogress"} ;. -; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META5:![0-9]+]], [[META6:![0-9]+]]} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META5:![0-9]+]]} ; CHECK: [[META5]] = !{!"llvm.loop.mustprogress"} -; CHECK: [[META6]] = !{!"llvm.loop.unswitch.nontrivial.disable"} -; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META5]]} ;. 
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/interleave_vec.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/interleave_vec.ll index eec0b5c3e276..afe7d7498fc1 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/interleave_vec.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/interleave_vec.ll @@ -926,53 +926,17 @@ define void @same_op8_splat(ptr noalias noundef %a, ptr noundef %b, ptr noundef ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[C]], align 4 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i64 0 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT]], <2 x float> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT]], <2 x float> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT]], <2 x float> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT]], <2 x float> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT]], <2 x float> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 3 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[OFFSET_IDX]] ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x float>, ptr [[TMP5]], align 4 -; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x float> [[WIDE_VEC]], <16 x float> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC12:%.*]] = shufflevector <16 x float> [[WIDE_VEC]], <16 x float> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC13:%.*]] = shufflevector <16 x float> [[WIDE_VEC]], <16 x float> poison, <2 x i32> -; CHECK-NEXT: 
[[STRIDED_VEC14:%.*]] = shufflevector <16 x float> [[WIDE_VEC]], <16 x float> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC15:%.*]] = shufflevector <16 x float> [[WIDE_VEC]], <16 x float> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC16:%.*]] = shufflevector <16 x float> [[WIDE_VEC]], <16 x float> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC17:%.*]] = shufflevector <16 x float> [[WIDE_VEC]], <16 x float> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC18:%.*]] = shufflevector <16 x float> [[WIDE_VEC]], <16 x float> poison, <2 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[OFFSET_IDX]] ; CHECK-NEXT: [[WIDE_VEC19:%.*]] = load <16 x float>, ptr [[TMP6]], align 4 -; CHECK-NEXT: [[STRIDED_VEC20:%.*]] = shufflevector <16 x float> [[WIDE_VEC19]], <16 x float> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC21:%.*]] = shufflevector <16 x float> [[WIDE_VEC19]], <16 x float> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC22:%.*]] = shufflevector <16 x float> [[WIDE_VEC19]], <16 x float> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC23:%.*]] = shufflevector <16 x float> [[WIDE_VEC19]], <16 x float> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC24:%.*]] = shufflevector <16 x float> [[WIDE_VEC19]], <16 x float> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC25:%.*]] = shufflevector <16 x float> [[WIDE_VEC19]], <16 x float> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC26:%.*]] = shufflevector <16 x float> [[WIDE_VEC19]], <16 x float> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC27:%.*]] = shufflevector <16 x float> [[WIDE_VEC19]], <16 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[STRIDED_VEC20]], <2 x float> [[STRIDED_VEC21]], <4 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[STRIDED_VEC]], <2 x float> [[STRIDED_VEC12]], <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = fmul fast <4 x float> [[TMP8]], [[TMP1]] -; CHECK-NEXT: [[TMP10:%.*]] = fadd fast <4 x float> [[TMP7]], [[TMP9]] -; 
CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x float> [[STRIDED_VEC22]], <2 x float> [[STRIDED_VEC23]], <4 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[STRIDED_VEC13]], <2 x float> [[STRIDED_VEC14]], <4 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = fmul fast <4 x float> [[TMP12]], [[TMP2]] -; CHECK-NEXT: [[TMP14:%.*]] = fadd fast <4 x float> [[TMP11]], [[TMP13]] -; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <2 x float> [[STRIDED_VEC24]], <2 x float> [[STRIDED_VEC25]], <4 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <2 x float> [[STRIDED_VEC15]], <2 x float> [[STRIDED_VEC16]], <4 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = fmul fast <4 x float> [[TMP16]], [[TMP3]] -; CHECK-NEXT: [[TMP18:%.*]] = fadd fast <4 x float> [[TMP15]], [[TMP17]] -; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <2 x float> [[STRIDED_VEC26]], <2 x float> [[STRIDED_VEC27]], <4 x i32> -; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x float> [[STRIDED_VEC17]], <2 x float> [[STRIDED_VEC18]], <4 x i32> -; CHECK-NEXT: [[TMP21:%.*]] = fmul fast <4 x float> [[TMP20]], [[TMP4]] -; CHECK-NEXT: [[TMP22:%.*]] = fadd fast <4 x float> [[TMP19]], [[TMP21]] -; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x float> [[TMP10]], <4 x float> [[TMP14]], <8 x i32> -; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <4 x float> [[TMP18]], <4 x float> [[TMP22]], <8 x i32> -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x float> [[TMP23]], <8 x float> [[TMP24]], <16 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <16 x float> [[WIDE_VEC]], [[TMP1]] +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = fadd fast <16 x float> [[WIDE_VEC19]], [[TMP4]] ; CHECK-NEXT: store <16 x float> [[INTERLEAVED_VEC]], ptr [[TMP6]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 144 diff --git a/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll b/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll index 798df4cd4ff5..63f8250b5f3d 100644 --- 
a/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll @@ -404,13 +404,13 @@ define <16 x i16> @add_v16i16_FEuCBA98765432u0(<16 x i16> %a, <16 x i16> %b) { ; SSE4-LABEL: @add_v16i16_FEuCBA98765432u0( ; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> ; SSE4-NEXT: [[TMP10:%.*]] = shufflevector <16 x i16> [[TMP2]], <16 x i16> [[A]], <16 x i32> -; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> -; SSE4-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[TMP10]], <16 x i16> [[A]], <16 x i32> +; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> +; SSE4-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[TMP10]], <16 x i16> [[A]], <16 x i32> ; SSE4-NEXT: [[TMP6:%.*]] = add <16 x i16> [[TMP4]], [[TMP5]] -; SSE4-NEXT: [[TMP7:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> -; SSE4-NEXT: [[TMP8:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> +; SSE4-NEXT: [[TMP7:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> +; SSE4-NEXT: [[TMP8:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> ; SSE4-NEXT: [[TMP9:%.*]] = add <16 x i16> [[TMP7]], [[TMP8]] -; SSE4-NEXT: [[RESULT:%.*]] = shufflevector <16 x i16> [[TMP9]], <16 x i16> [[TMP6]], <16 x i32> +; SSE4-NEXT: [[RESULT:%.*]] = shufflevector <16 x i16> [[TMP9]], <16 x i16> [[TMP6]], <16 x i32> ; SSE4-NEXT: ret <16 x i16> [[RESULT]] ; ; AVX2-LABEL: @add_v16i16_FEuCBA98765432u0( diff --git a/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll b/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll index fd160b7c5702..bbfe844400b0 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll @@ -398,13 +398,13 @@ define <16 x i16> @sub_v16i16_FEuCBA98765432u0(<16 x i16> %a, <16 x i16> %b) { ; SSE4-LABEL: @sub_v16i16_FEuCBA98765432u0( ; SSE4-NEXT: [[TMP2:%.*]] = 
shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> ; SSE4-NEXT: [[TMP10:%.*]] = shufflevector <16 x i16> [[TMP2]], <16 x i16> [[A]], <16 x i32> -; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> -; SSE4-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[TMP10]], <16 x i16> [[A]], <16 x i32> +; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> +; SSE4-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[TMP10]], <16 x i16> [[A]], <16 x i32> ; SSE4-NEXT: [[TMP6:%.*]] = sub <16 x i16> [[TMP4]], [[TMP5]] -; SSE4-NEXT: [[TMP7:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> -; SSE4-NEXT: [[TMP8:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> +; SSE4-NEXT: [[TMP7:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> +; SSE4-NEXT: [[TMP8:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> ; SSE4-NEXT: [[TMP9:%.*]] = sub <16 x i16> [[TMP7]], [[TMP8]] -; SSE4-NEXT: [[RESULT:%.*]] = shufflevector <16 x i16> [[TMP9]], <16 x i16> [[TMP6]], <16 x i32> +; SSE4-NEXT: [[RESULT:%.*]] = shufflevector <16 x i16> [[TMP9]], <16 x i16> [[TMP6]], <16 x i32> ; SSE4-NEXT: ret <16 x i16> [[RESULT]] ; ; AVX2-LABEL: @sub_v16i16_FEuCBA98765432u0( diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/exponential-nontrivial-unswitch-nested.ll b/llvm/test/Transforms/SimpleLoopUnswitch/exponential-nontrivial-unswitch-nested.ll index 6f2833b4f4e7..f82d7309f6d0 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/exponential-nontrivial-unswitch-nested.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/exponential-nontrivial-unswitch-nested.ll @@ -45,7 +45,7 @@ ; ; RUN: opt < %s -enable-unswitch-cost-multiplier=false \ ; RUN: -passes='loop-mssa(licm,simple-loop-unswitch),print' -disable-output 2>&1 | \ -; RUN: sort -b -k 1 | FileCheck %s --check-prefixes=LOOP6 +; RUN: sort -b -k 1 | FileCheck %s --check-prefixes=LOOP32 ; ; Single loop nest, not 
unswitched ; LOOP1: Loop at depth 1 containing: @@ -55,23 +55,23 @@ ; ; Half unswitched loop nests, with unscaled4 and div1 it gets less depth1 loops unswitched ; since they have more cost. -; LOOP-UNSCALE4-DIV1-COUNT-4: Loop at depth 1 containing: -; LOOP-UNSCALE4-DIV1-COUNT-4: Loop at depth 2 containing: -; LOOP-UNSCALE4-DIV1-COUNT-4: Loop at depth 3 containing: +; LOOP-UNSCALE4-DIV1-COUNT-6: Loop at depth 1 containing: +; LOOP-UNSCALE4-DIV1-COUNT-19: Loop at depth 2 containing: +; LOOP-UNSCALE4-DIV1-COUNT-29: Loop at depth 3 containing: ; LOOP-UNSCALE4-DIV1-NOT: Loop at depth {{[0-9]+}} containing: ; ; Half unswitched loop nests, with unscaled4 and div2 it gets more depth1 loops unswitched ; as div2 kicks in. -; LOOP-UNSCALE4-DIV2-COUNT-4: Loop at depth 1 containing: -; LOOP-UNSCALE4-DIV2-COUNT-4: Loop at depth 2 containing: -; LOOP-UNSCALE4-DIV2-COUNT-4: Loop at depth 3 containing: +; LOOP-UNSCALE4-DIV2-COUNT-11: Loop at depth 1 containing: +; LOOP-UNSCALE4-DIV2-COUNT-22: Loop at depth 2 containing: +; LOOP-UNSCALE4-DIV2-COUNT-29: Loop at depth 3 containing: ; LOOP-UNSCALE4-DIV2-NOT: Loop at depth {{[0-9]+}} containing: ; -; 6 loop nests, fully unswitched -; LOOP6-COUNT-6: Loop at depth 1 containing: -; LOOP6-COUNT-6: Loop at depth 2 containing: -; LOOP6-COUNT-6: Loop at depth 3 containing: -; LOOP6-NOT: Loop at depth {{[0-9]+}} containing: +; 32 loop nests, fully unswitched +; LOOP32-COUNT-32: Loop at depth 1 containing: +; LOOP32-COUNT-32: Loop at depth 2 containing: +; LOOP32-COUNT-32: Loop at depth 3 containing: +; LOOP32-NOT: Loop at depth {{[0-9]+}} containing: declare void @bar() diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/exponential-nontrivial-unswitch-nested2.ll b/llvm/test/Transforms/SimpleLoopUnswitch/exponential-nontrivial-unswitch-nested2.ll index ab3b3d26d997..63d2789da5a8 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/exponential-nontrivial-unswitch-nested2.ll +++ 
b/llvm/test/Transforms/SimpleLoopUnswitch/exponential-nontrivial-unswitch-nested2.ll @@ -60,7 +60,7 @@ ; ; Half unswitched loop nests, with unscaled3 and div1 it gets less depth1 loops unswitched ; since they have more cost. -; LOOP-UNSCALE3-DIV1-COUNT-2: Loop at depth 1 containing: +; LOOP-UNSCALE3-DIV1-COUNT-4: Loop at depth 1 containing: ; LOOP-UNSCALE3-DIV1-NOT: Loop at depth 1 containing: ; LOOP-UNSCALE3-DIV1-COUNT-1: Loop at depth 2 containing: ; LOOP-UNSCALE3-DIV1-NOT: Loop at depth 2 containing: @@ -69,7 +69,7 @@ ; ; Half unswitched loop nests, with unscaled3 and div2 it gets more depth1 loops unswitched ; as div2 kicks in. -; LOOP-UNSCALE3-DIV2-COUNT-2: Loop at depth 1 containing: +; LOOP-UNSCALE3-DIV2-COUNT-6: Loop at depth 1 containing: ; LOOP-UNSCALE3-DIV2-NOT: Loop at depth 1 containing: ; LOOP-UNSCALE3-DIV2-COUNT-1: Loop at depth 2 containing: ; LOOP-UNSCALE3-DIV2-NOT: Loop at depth 2 containing: @@ -77,7 +77,7 @@ ; LOOP-UNSCALE3-DIV2-NOT: Loop at depth 3 containing: ; ; Maximally unswitched (copy of the outer loop per each condition) -; LOOP-MAX-COUNT-2: Loop at depth 1 containing: +; LOOP-MAX-COUNT-6: Loop at depth 1 containing: ; LOOP-MAX-NOT: Loop at depth 1 containing: ; LOOP-MAX-COUNT-1: Loop at depth 2 containing: ; LOOP-MAX-NOT: Loop at depth 2 containing: diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/exponential-nontrivial-unswitch.ll b/llvm/test/Transforms/SimpleLoopUnswitch/exponential-nontrivial-unswitch.ll index 7515cbbcbf1d..a2a745f46bca 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/exponential-nontrivial-unswitch.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/exponential-nontrivial-unswitch.ll @@ -25,37 +25,46 @@ ; ; RUN: opt < %s -enable-unswitch-cost-multiplier=true \ ; RUN: -unswitch-num-initial-unscaled-candidates=8 -unswitch-siblings-toplevel-div=1 \ -; RUN: -passes='loop(simple-loop-unswitch),print' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP4 +; RUN: -passes='loop(simple-loop-unswitch),print' 
-disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP5 ; ; RUN: opt < %s -enable-unswitch-cost-multiplier=true \ ; RUN: -unswitch-num-initial-unscaled-candidates=8 -unswitch-siblings-toplevel-div=1 \ -; RUN: -passes='loop-mssa(simple-loop-unswitch),print' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP4 +; RUN: -passes='loop-mssa(simple-loop-unswitch),print' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP5 +; +; With relaxed candidates multiplier (unscaled candidates == 8) and with relaxed +; siblings multiplier for top-level loops (toplevel-div == 8) we should get +; 2^(num conds) == 2^5 == 32 +; copies of the loop: ; ; RUN: opt < %s -enable-unswitch-cost-multiplier=true \ ; RUN: -unswitch-num-initial-unscaled-candidates=8 -unswitch-siblings-toplevel-div=8 \ -; RUN: -passes='loop(simple-loop-unswitch),print' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP6 +; RUN: -passes='loop(simple-loop-unswitch),print' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP32 ; ; RUN: opt < %s -enable-unswitch-cost-multiplier=true \ ; RUN: -unswitch-num-initial-unscaled-candidates=8 -unswitch-siblings-toplevel-div=8 \ -; RUN: -passes='loop-mssa(simple-loop-unswitch),print' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP6 +; RUN: -passes='loop-mssa(simple-loop-unswitch),print' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP32 +; +; Similarly get +; 2^(num conds) == 2^5 == 32 +; copies of the loop when cost multiplier is disabled: ; ; RUN: opt < %s -enable-unswitch-cost-multiplier=false \ -; RUN: -passes='loop(simple-loop-unswitch),print' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP6 +; RUN: -passes='loop(simple-loop-unswitch),print' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP32 ; ; RUN: opt < %s -enable-unswitch-cost-multiplier=false \ -; RUN: -passes='loop-mssa(simple-loop-unswitch),print' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP6 +; RUN: 
-passes='loop-mssa(simple-loop-unswitch),print' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP32 ; ; Single loop, not unswitched ; LOOP1: Loop at depth 1 containing: ; LOOP1-NOT: Loop at depth 1 containing: -; 4 loops, unswitched 4 times -; LOOP4-COUNT-4: Loop at depth 1 containing: -; LOOP4-NOT: Loop at depth 1 containing: +; 5 loops, unswitched 4 times +; LOOP5-COUNT-5: Loop at depth 1 containing: +; LOOP5-NOT: Loop at depth 1 containing: -; 6 loops, fully unswitched -; LOOP6-COUNT-6: Loop at depth 1 containing: -; LOOP6-NOT: Loop at depth 1 containing: +; 32 loops, fully unswitched +; LOOP32-COUNT-32: Loop at depth 1 containing: +; LOOP32-NOT: Loop at depth 1 containing: define void @loop_simple5(ptr %addr, i1 %c1, i1 %c2, i1 %c3, i1 %c4, i1 %c5) { entry: diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/exponential-switch-unswitch.ll b/llvm/test/Transforms/SimpleLoopUnswitch/exponential-switch-unswitch.ll index 846a7793b6c3..96fe899d69c3 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/exponential-switch-unswitch.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/exponential-switch-unswitch.ll @@ -61,19 +61,19 @@ ; Somewhat relaxed restrictions on candidates: ; LOOP-RELAX-COUNT-5: Loop at depth 1 containing: ; LOOP-RELAX-NOT: Loop at depth 1 containing: -; LOOP-RELAX-COUNT-5: Loop at depth 2 containing: +; LOOP-RELAX-COUNT-32: Loop at depth 2 containing: ; LOOP-RELAX-NOT: Loop at depth 2 containing: ; ; Even more relaxed restrictions on candidates and siblings. -; LOOP-RELAX2-COUNT-5: Loop at depth 1 containing: +; LOOP-RELAX2-COUNT-11: Loop at depth 1 containing: ; LOOP-RELAX2-NOT: Loop at depth 1 containing: -; LOOP-RELAX2-COUNT-5: Loop at depth 2 containing: +; LOOP-RELAX2-COUNT-40: Loop at depth 2 containing: ; LOOP-RELAX-NOT: Loop at depth 2 containing: ; ; Unswitched as much as it could (with multiplier disabled). 
-; LOOP-MAX-COUNT-6: Loop at depth 1 containing: +; LOOP-MAX-COUNT-56: Loop at depth 1 containing: ; LOOP-MAX-NOT: Loop at depth 1 containing: -; LOOP-MAX-COUNT-11: Loop at depth 2 containing: +; LOOP-MAX-COUNT-111: Loop at depth 2 containing: ; LOOP-MAX-NOT: Loop at depth 2 containing: define i32 @loop_switch(ptr %addr, i32 %c1, i32 %c2) { diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/guards.ll b/llvm/test/Transforms/SimpleLoopUnswitch/guards.ll index c77e7cce77a9..533b1f691f5a 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/guards.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/guards.ll @@ -38,25 +38,25 @@ exit: } define void @test_two_guards(i1 %cond1, i1 %cond2, i32 %N) { -; CHECK-LABEL: define void @test_two_guards(i1 %cond1, i1 %cond2, i32 %N) { +; CHECK-LABEL: @test_two_guards( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 %cond1, label %entry.split.us, label %entry.split +; CHECK-NEXT: br i1 [[COND1:%.*]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]] ; CHECK: entry.split.us: -; CHECK-NEXT: br label %loop.us -; CHECK: loop.us: -; CHECK-NEXT: %iv.us = phi i32 [ 0, %entry.split.us ], [ %iv.next.us, %guarded.us ] -; CHECK-NEXT: br label %guarded.us -; CHECK: guarded.us: -; CHECK-NEXT: call void (i1, ...) 
@llvm.experimental.guard(i1 %cond2) [ "deopt"() ] -; CHECK-NEXT: %iv.next.us = add i32 %iv.us, 1 -; CHECK-NEXT: %loop.cond.us = icmp slt i32 %iv.next.us, %N -; CHECK-NEXT: br i1 %loop.cond.us, label %loop.us, label %exit.split.us, !llvm.loop !2 -; CHECK: exit.split.us: -; CHECK-NEXT: br label %exit -; CHECK: entry.split: -; CHECK-NEXT: br label %loop -; CHECK: loop: -; CHECK-NEXT: br label %deopt +; CHECK-NEXT: br i1 [[COND2:%.*]], label [[ENTRY_SPLIT_US_SPLIT_US:%.*]], label [[ENTRY_SPLIT_US_SPLIT:%.*]] +; CHECK: entry.split.us.split.us: +; CHECK-NEXT: br label [[LOOP_US_US:%.*]] +; CHECK: loop.us.us: +; CHECK-NEXT: [[IV_US_US:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_US_SPLIT_US]] ], [ [[IV_NEXT_US_US:%.*]], [[GUARDED_US2:%.*]] ] +; CHECK-NEXT: br label [[GUARDED_US_US:%.*]] +; CHECK: guarded.us.us: +; CHECK-NEXT: br label [[GUARDED_US2]] +; CHECK: guarded.us2: +; CHECK-NEXT: [[IV_NEXT_US_US]] = add i32 [[IV_US_US]], 1 +; CHECK-NEXT: [[LOOP_COND_US_US:%.*]] = icmp slt i32 [[IV_NEXT_US_US]], [[N:%.*]] +; CHECK-NEXT: br i1 [[LOOP_COND_US_US]], label [[LOOP_US_US]], label [[EXIT_SPLIT_US_SPLIT_US:%.*]] +; CHECK: deopt1: +; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ] +; CHECK-NEXT: unreachable ; CHECK: deopt: ; CHECK-NEXT: call void (i1, ...) 
@llvm.experimental.guard(i1 false) [ "deopt"() ] ; CHECK-NEXT: unreachable diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll b/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll index 3dc83203f149..536e0c6a0e74 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/inject-invariant-conditions.ll @@ -5,7 +5,7 @@ define i32 @test_01(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_01( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META0:![0-9]+]] +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 ; CHECK-NEXT: [[INJECTED_COND:%.*]] = icmp ule i32 [[LIMIT:%.*]], [[X]] ; CHECK-NEXT: br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label [[LOOP:%.*]] ; CHECK: loop.us: @@ -20,7 +20,7 @@ define i32 @test_01(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noun ; CHECK-NEXT: store i32 [[IV_US]], ptr [[ARR_PTR_US]], align 4 ; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1 ; CHECK-NEXT: [[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N:%.*]] -; CHECK-NEXT: br i1 [[LOOP_COND_US]], label [[LOOP_US]], label [[COMMON_RET]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: br i1 [[LOOP_COND_US]], label [[LOOP_US]], label [[COMMON_RET]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ], [ 0, [[ENTRY]] ] ; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]] @@ -35,7 +35,7 @@ define i32 @test_01(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noun ; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[LOOP_COND]], 
label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP2:![0-9]+]] ; CHECK: common.ret: ; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ 0, [[GUARDED_US]] ], [ -1, [[LOOP]] ], [ -1, [[LOOP_US]] ], [ -2, [[GUARDED]] ] ; CHECK-NEXT: ret i32 [[COMMON_RET_OP]] @@ -76,7 +76,7 @@ range_check_failed: ; preds = %guarded define i32 @test_01_neg_void_profile(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_01_neg_void_profile( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META0]] +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] @@ -133,7 +133,7 @@ range_check_failed: ; preds = %guarded define i32 @test_01_constants(ptr noundef %p, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_01_constants( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META0]] +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 ; CHECK-NEXT: [[INJECTED_COND:%.*]] = icmp ule i32 200, 300 ; CHECK-NEXT: br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label [[LOOP:%.*]] ; CHECK: loop.us: @@ -148,7 +148,7 @@ define i32 @test_01_constants(ptr noundef %p, ptr noundef %arr, ptr noundef %x_p ; CHECK-NEXT: store i32 [[IV_US]], ptr [[ARR_PTR_US]], align 4 ; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1 ; CHECK-NEXT: [[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], 1000 -; CHECK-NEXT: br i1 [[LOOP_COND_US]], label [[LOOP_US]], label [[COMMON_RET]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[LOOP_COND_US]], label [[LOOP_US]], label [[COMMON_RET]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ], [ 0, [[ENTRY]] ] ; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]] @@ 
-160,7 +160,7 @@ define i32 @test_01_constants(ptr noundef %p, ptr noundef %arr, ptr noundef %x_p ; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], 1000 -; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: common.ret: ; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ 0, [[GUARDED_US]] ], [ -1, [[LOOP]] ], [ -1, [[LOOP_US]] ] ; CHECK-NEXT: ret i32 [[COMMON_RET_OP]] @@ -200,7 +200,7 @@ range_check_failed: ; preds = %guarded define i32 @test_01_neg_degenerate_profile(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_01_neg_degenerate_profile( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META0]] +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] @@ -210,7 +210,7 @@ define i32 @test_01_neg_degenerate_profile(ptr noundef %p, i32 noundef %n, i32 n ; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET:%.*]], !prof [[PROF1]] ; CHECK: guarded: ; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL]], [[X]] -; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF8:![0-9]+]] +; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF5:![0-9]+]] ; CHECK: backedge: ; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL]] ; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 @@ -257,7 +257,7 @@ range_check_failed: ; preds = %guarded define i32 @test_01_neg_cold(ptr noundef %p, i32 noundef %n, i32 
noundef %limit, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_01_neg_cold( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META0]] +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] @@ -267,7 +267,7 @@ define i32 @test_01_neg_cold(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET:%.*]], !prof [[PROF1]] ; CHECK: guarded: ; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL]], [[X]] -; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF9:![0-9]+]] +; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF6:![0-9]+]] ; CHECK: backedge: ; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL]] ; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 @@ -314,17 +314,17 @@ range_check_failed: ; preds = %guarded define i32 @test_01_neg_overflowing_metadata(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_01_neg_overflowing_metadata( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META0]] +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] ; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV]] ; CHECK-NEXT: [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4 ; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp ult i32 [[EL]], [[LIMIT:%.*]] -; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[GUARDED:%.*]], label [[COMMON_RET:%.*]], !prof [[PROF10:![0-9]+]] +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label 
[[GUARDED:%.*]], label [[COMMON_RET:%.*]], !prof [[PROF7:![0-9]+]] ; CHECK: guarded: ; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL]], [[X]] -; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF10]] +; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]], !prof [[PROF7]] ; CHECK: backedge: ; CHECK-NEXT: [[ARR_PTR:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL]] ; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 @@ -371,7 +371,7 @@ range_check_failed: ; preds = %guarded define i32 @test_02(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_02( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META0]] +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 ; CHECK-NEXT: [[INJECTED_COND:%.*]] = icmp ule i32 -2147483648, [[X]] ; CHECK-NEXT: br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label [[LOOP:%.*]] ; CHECK: loop.us: @@ -386,7 +386,7 @@ define i32 @test_02(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noun ; CHECK-NEXT: store i32 [[IV_US]], ptr [[ARR_PTR_US]], align 4 ; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1 ; CHECK-NEXT: [[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N:%.*]] -; CHECK-NEXT: br i1 [[LOOP_COND_US]], label [[LOOP_US]], label [[COMMON_RET]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: br i1 [[LOOP_COND_US]], label [[LOOP_US]], label [[COMMON_RET]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ], [ 0, [[ENTRY]] ] ; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]] @@ -401,7 +401,7 @@ define i32 @test_02(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noun ; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 
[[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: common.ret: ; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ 0, [[GUARDED_US]] ], [ -1, [[LOOP]] ], [ -1, [[LOOP_US]] ], [ -2, [[GUARDED]] ] ; CHECK-NEXT: ret i32 [[COMMON_RET_OP]] @@ -441,7 +441,7 @@ range_check_failed: ; preds = %guarded define i32 @test_02_inverse(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_02_inverse( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META0]] +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 ; CHECK-NEXT: [[INJECTED_COND:%.*]] = icmp ule i32 -2147483648, [[X]] ; CHECK-NEXT: br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label [[LOOP:%.*]] ; CHECK: loop.us: @@ -456,7 +456,7 @@ define i32 @test_02_inverse(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ; CHECK-NEXT: store i32 [[IV_US]], ptr [[ARR_PTR_US]], align 4 ; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1 ; CHECK-NEXT: [[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N:%.*]] -; CHECK-NEXT: br i1 [[LOOP_COND_US]], label [[LOOP_US]], label [[COMMON_RET]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-NEXT: br i1 [[LOOP_COND_US]], label [[LOOP_US]], label [[COMMON_RET]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ], [ 0, [[ENTRY]] ] ; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]] @@ -471,7 +471,7 @@ define i32 @test_02_inverse(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEXT: 
br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: common.ret: ; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ 0, [[GUARDED_US]] ], [ -1, [[LOOP]] ], [ -1, [[LOOP_US]] ], [ -2, [[GUARDED]] ] ; CHECK-NEXT: ret i32 [[COMMON_RET_OP]] @@ -511,7 +511,7 @@ range_check_failed: ; preds = %guarded define i32 @test_03(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_03( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META0]] +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 ; CHECK-NEXT: [[INJECTED_COND:%.*]] = icmp ule i32 -2147483648, [[X]] ; CHECK-NEXT: br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label [[LOOP:%.*]] ; CHECK: loop.us: @@ -519,20 +519,20 @@ define i32 @test_03(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noun ; CHECK-NEXT: [[EL_PTR_US:%.*]] = getelementptr i32, ptr [[P:%.*]], i32 [[IV_US]] ; CHECK-NEXT: [[EL_US:%.*]] = load i32, ptr [[EL_PTR_US]], align 4 ; CHECK-NEXT: [[BOUND_CHECK_US:%.*]] = icmp slt i32 [[EL_US]], 0 -; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[COMMON_RET:%.*]], label [[GUARDED_US]], !prof [[PROF15:![0-9]+]] +; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[COMMON_RET:%.*]], label [[GUARDED_US]], !prof [[PROF10:![0-9]+]] ; CHECK: guarded.us: ; CHECK-NEXT: [[RANGE_CHECK_US:%.*]] = icmp ult i32 [[EL_US]], [[X]] ; CHECK-NEXT: [[ARR_PTR_US:%.*]] = getelementptr i32, ptr [[ARR:%.*]], i32 [[EL_US]] ; CHECK-NEXT: store i32 [[IV_US]], ptr [[ARR_PTR_US]], align 4 ; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1 ; CHECK-NEXT: [[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N:%.*]] -; CHECK-NEXT: br i1 [[LOOP_COND_US]], label [[LOOP_US]], label [[COMMON_RET]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-NEXT: br i1 [[LOOP_COND_US]], label [[LOOP_US]], label [[COMMON_RET]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 
[[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ], [ 0, [[ENTRY]] ] ; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]] ; CHECK-NEXT: [[EL:%.*]] = load i32, ptr [[EL_PTR]], align 4 ; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp slt i32 [[EL]], 0 -; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[COMMON_RET]], label [[GUARDED:%.*]], !prof [[PROF15]] +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[COMMON_RET]], label [[GUARDED:%.*]], !prof [[PROF10]] ; CHECK: guarded: ; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL]], [[X]] ; CHECK-NEXT: br i1 [[RANGE_CHECK]], label [[BACKEDGE]], label [[COMMON_RET]] @@ -541,7 +541,7 @@ define i32 @test_03(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noun ; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: common.ret: ; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ 0, [[GUARDED_US]] ], [ -1, [[LOOP]] ], [ -1, [[LOOP_US]] ], [ -2, [[GUARDED]] ] ; CHECK-NEXT: ret i32 [[COMMON_RET_OP]] @@ -581,7 +581,7 @@ range_check_failed: ; preds = %guarded define i32 @test_04(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noundef %arr, ptr noundef %x_p) { ; CHECK-LABEL: @test_04( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef [[META0]] +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[X_P:%.*]], align 4, !noundef !0 ; CHECK-NEXT: [[INJECTED_COND:%.*]] = icmp ule i32 128, [[X]] ; CHECK-NEXT: br i1 [[INJECTED_COND]], label [[LOOP_US:%.*]], label [[LOOP:%.*]] ; CHECK: loop.us: @@ -589,7 +589,7 @@ define i32 @test_04(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noun ; CHECK-NEXT: [[EL_PTR_US:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 
[[IV_US]] ; CHECK-NEXT: [[EL_US:%.*]] = load i8, ptr [[EL_PTR_US]], align 4 ; CHECK-NEXT: [[BOUND_CHECK_US:%.*]] = icmp slt i8 [[EL_US]], 0 -; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[COMMON_RET:%.*]], label [[GUARDED_US]], !prof [[PROF15]] +; CHECK-NEXT: br i1 [[BOUND_CHECK_US]], label [[COMMON_RET:%.*]], label [[GUARDED_US]], !prof [[PROF10]] ; CHECK: guarded.us: ; CHECK-NEXT: [[EL_WIDE_US:%.*]] = zext i8 [[EL_US]] to i32 ; CHECK-NEXT: [[RANGE_CHECK_US:%.*]] = icmp ult i32 [[EL_WIDE_US]], [[X]] @@ -597,13 +597,13 @@ define i32 @test_04(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noun ; CHECK-NEXT: store i32 [[IV_US]], ptr [[ARR_PTR_US]], align 4 ; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1 ; CHECK-NEXT: [[LOOP_COND_US:%.*]] = icmp slt i32 [[IV_NEXT_US]], [[N:%.*]] -; CHECK-NEXT: br i1 [[LOOP_COND_US]], label [[LOOP_US]], label [[COMMON_RET]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-NEXT: br i1 [[LOOP_COND_US]], label [[LOOP_US]], label [[COMMON_RET]] ; CHECK: loop: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ], [ 0, [[ENTRY]] ] ; CHECK-NEXT: [[EL_PTR:%.*]] = getelementptr i8, ptr [[P]], i32 [[IV]] ; CHECK-NEXT: [[EL:%.*]] = load i8, ptr [[EL_PTR]], align 4 ; CHECK-NEXT: [[BOUND_CHECK:%.*]] = icmp slt i8 [[EL]], 0 -; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[COMMON_RET]], label [[GUARDED:%.*]], !prof [[PROF15]] +; CHECK-NEXT: br i1 [[BOUND_CHECK]], label [[COMMON_RET]], label [[GUARDED:%.*]], !prof [[PROF10]] ; CHECK: guarded: ; CHECK-NEXT: [[EL_WIDE:%.*]] = zext i8 [[EL]] to i32 ; CHECK-NEXT: [[RANGE_CHECK:%.*]] = icmp ult i32 [[EL_WIDE]], [[X]] @@ -613,7 +613,7 @@ define i32 @test_04(ptr noundef %p, i32 noundef %n, i32 noundef %limit, ptr noun ; CHECK-NEXT: store i32 [[IV]], ptr [[ARR_PTR]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 ; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp slt i32 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP19:![0-9]+]] 
+; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[COMMON_RET]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: common.ret: ; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ 0, [[BACKEDGE]] ], [ 0, [[GUARDED_US]] ], [ -1, [[LOOP]] ], [ -1, [[LOOP_US]] ], [ -2, [[GUARDED]] ] ; CHECK-NEXT: ret i32 [[COMMON_RET_OP]] @@ -651,24 +651,17 @@ range_check_failed: ; preds = %guarded ret i32 -2 } ;. -; CHECK: [[META0]] = !{} +; CHECK: [[META0:![0-9]+]] = !{} ; CHECK: [[PROF1]] = !{!"branch_weights", i32 100, i32 1} -; CHECK: [[LOOP2]] = distinct !{[[LOOP2]], [[META3:![0-9]+]]} -; CHECK: [[META3]] = !{!"llvm.loop.unswitch.nontrivial.disable"} -; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META5:![0-9]+]]} -; CHECK: [[META5]] = !{!"llvm.loop.unswitch.injection.disable"} -; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META3]]} -; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META5]]} -; CHECK: [[PROF8]] = !{!"branch_weights", i32 0, i32 0} -; CHECK: [[PROF9]] = !{!"branch_weights", i32 2, i32 3} -; CHECK: [[PROF10]] = !{!"branch_weights", i32 -1, i32 -1000} -; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META3]]} -; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META5]]} -; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META3]]} -; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META5]]} -; CHECK: [[PROF15]] = !{!"branch_weights", i32 1, i32 100} -; CHECK: [[LOOP16]] = distinct !{[[LOOP16]], [[META3]]} -; CHECK: [[LOOP17]] = distinct !{[[LOOP17]], [[META5]]} -; CHECK: [[LOOP18]] = distinct !{[[LOOP18]], [[META3]]} -; CHECK: [[LOOP19]] = distinct !{[[LOOP19]], [[META5]]} +; CHECK: [[LOOP2]] = distinct !{!2, !3} +; CHECK: [[META3:![0-9]+]] = !{!"llvm.loop.unswitch.injection.disable"} +; CHECK: [[LOOP4]] = distinct !{!4, !3} +; CHECK: [[PROF5]] = !{!"branch_weights", i32 0, i32 0} +; CHECK: [[PROF6]] = !{!"branch_weights", i32 2, i32 3} +; CHECK: [[PROF7]] = !{!"branch_weights", i32 -1, i32 -1000} +; CHECK: [[LOOP8]] = distinct !{!8, !3} +; CHECK: [[LOOP9]] = distinct !{!9, !3} +; CHECK: 
[[PROF10]] = !{!"branch_weights", i32 1, i32 100} +; CHECK: [[LOOP11]] = distinct !{!11, !3} +; CHECK: [[LOOP12]] = distinct !{!12, !3} ;. diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/invalidate-block-and-loop-dispositions.ll b/llvm/test/Transforms/SimpleLoopUnswitch/invalidate-block-and-loop-dispositions.ll index 5f713fae9e96..fcef88667449 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/invalidate-block-and-loop-dispositions.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/invalidate-block-and-loop-dispositions.ll @@ -14,17 +14,27 @@ define void @test_pr58136(i1 %c.1, i1 %c.2) { ; CHECK-NEXT: [[C_1_FR:%.*]] = freeze i1 [[C_1:%.*]] ; CHECK-NEXT: br i1 [[C_1_FR]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]] ; CHECK: entry.split.us: +; CHECK-NEXT: [[C_2_FR:%.*]] = freeze i1 [[C_2:%.*]] +; CHECK-NEXT: br i1 [[C_2_FR]], label [[ENTRY_SPLIT_US_SPLIT_US:%.*]], label [[ENTRY_SPLIT_US_SPLIT:%.*]] +; CHECK: entry.split.us.split.us: ; CHECK-NEXT: br label [[LOOP_HEADER_US_US:%.*]] -; CHECK: loop.header.us: -; CHECK-NEXT: [[MUL1_US_US:%.*]] = phi i16 [ [[MUL_US_US:%.*]], [[LOOP_LATCH_US:%.*]] ], [ [[GLOB_PROMOTED]], [[ENTRY_SPLIT_US]] ] +; CHECK: loop.header.us.us: +; CHECK-NEXT: [[MUL1_US_US:%.*]] = phi i16 [ [[MUL_US_US:%.*]], [[LOOP_LATCH_US_US:%.*]] ], [ [[GLOB_PROMOTED]], [[ENTRY_SPLIT_US_SPLIT_US]] ] ; CHECK-NEXT: [[CALL2_US_US:%.*]] = call i16 @foo() -; CHECK-NEXT: br label [[LOOP_LATCH_US_US:%.*]] -; CHECK: then.bb.us: -; CHECK-NEXT: br i1 [[C_2:%.*]], label [[LOOP_LATCH_US]], label [[EXIT_SPLIT_US:%.*]] -; CHECK: loop.latch.us: +; CHECK-NEXT: br label [[THEN_BB_US_US:%.*]] +; CHECK: then.bb.us.us: +; CHECK-NEXT: br label [[LOOP_LATCH_US_US]] +; CHECK: loop.latch.us.us: ; CHECK-NEXT: [[MUL_US_US]] = mul nsw i16 [[MUL1_US_US]], [[L_3]] ; CHECK-NEXT: store i16 [[MUL_US_US]], ptr @glob, align 2 -; CHECK-NEXT: br label [[LOOP_HEADER_US_US]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: br label [[LOOP_HEADER_US_US]] +; CHECK: entry.split.us.split: 
+; CHECK-NEXT: br label [[LOOP_HEADER_US:%.*]] +; CHECK: loop.header.us: +; CHECK-NEXT: [[CALL2_US:%.*]] = call i16 @foo() +; CHECK-NEXT: br label [[THEN_BB_US:%.*]] +; CHECK: then.bb.us: +; CHECK-NEXT: br label [[EXIT_SPLIT_US:%.*]] ; CHECK: exit.split.us: ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: entry.split: @@ -79,7 +89,7 @@ define void @test_pr58158(i1 %c.1) { ; CHECK: outer.loopexit.us: ; CHECK-NEXT: br label [[OUTER_BACKEDGE_US:%.*]] ; CHECK: outer.backedge.us: -; CHECK-NEXT: br label [[OUTER_US]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: br label [[OUTER_US]] ; CHECK: entry.split: ; CHECK-NEXT: br label [[OUTER:%.*]] ; CHECK: outer: diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch-freeze.ll b/llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch-freeze.ll index d07c2fa4afd5..8e97cb5cb42f 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch-freeze.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch-freeze.ll @@ -32,7 +32,7 @@ define i32 @test1_freeze(ptr %ptr0, ptr %ptr1, ptr %ptr2) { ; CHECK-NEXT: br label [[LATCH_US:%.*]] ; CHECK: latch.us: ; CHECK-NEXT: [[V_US:%.*]] = load i1, ptr [[PTR0:%.*]], align 1 -; CHECK-NEXT: br i1 [[V_US]], label [[LOOP_BEGIN_US]], label [[LOOP_EXIT_SPLIT_US:%.*]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: br i1 [[V_US]], label [[LOOP_BEGIN_US]], label [[LOOP_EXIT_SPLIT_US:%.*]] ; CHECK: loop_exit.split.us: ; CHECK-NEXT: br label [[LOOP_EXIT:%.*]] ; CHECK: entry.split: @@ -50,7 +50,7 @@ define i32 @test1_freeze(ptr %ptr0, ptr %ptr1, ptr %ptr2) { ; CHECK-NEXT: br label [[LATCH_US2:%.*]] ; CHECK: latch.us2: ; CHECK-NEXT: [[V_US3:%.*]] = load i1, ptr [[PTR0]], align 1 -; CHECK-NEXT: br i1 [[V_US3]], label [[LOOP_BEGIN_US1]], label [[LOOP_EXIT_SPLIT_SPLIT_US:%.*]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: br i1 [[V_US3]], label [[LOOP_BEGIN_US1]], label [[LOOP_EXIT_SPLIT_SPLIT_US:%.*]] ; CHECK: loop_exit.split.split.us: ; CHECK-NEXT: br label 
[[LOOP_EXIT_SPLIT:%.*]] ; CHECK: entry.split.split: @@ -276,7 +276,7 @@ define i32 @test7b(ptr %ptr, ptr %cond.ptr, ptr %a.ptr, ptr %b.ptr) { ; CHECK-NEXT: [[V4_US:%.*]] = load i1, ptr [[PTR]], align 1 ; CHECK-NEXT: br i1 [[V4_US]], label [[INNER_LOOP_EXIT_LOOPEXIT_SPLIT_US:%.*]], label [[INNER_INNER_LOOP_D_US:%.*]] ; CHECK: inner_inner_loop_d.us: -; CHECK-NEXT: br label [[INNER_INNER_LOOP_BEGIN_US]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br label [[INNER_INNER_LOOP_BEGIN_US]] ; CHECK: inner_inner_loop_exit.split.us: ; CHECK-NEXT: br label [[INNER_INNER_LOOP_EXIT]] ; CHECK: loop_exit.split.us: @@ -512,7 +512,7 @@ define i32 @test8b(ptr %ptr, ptr %cond.ptr, ptr %a.ptr, ptr %b.ptr) { ; CHECK-NEXT: [[V2_US:%.*]] = load i1, ptr [[PTR]], align 1 ; CHECK-NEXT: br i1 [[V2_US]], label [[INNER_INNER_LOOP_LATCH_US:%.*]], label [[INNER_LOOP_EXIT_LOOPEXIT_SPLIT_US:%.*]] ; CHECK: inner_inner_loop_latch.us: -; CHECK-NEXT: br label [[INNER_INNER_LOOP_BEGIN_US]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br label [[INNER_INNER_LOOP_BEGIN_US]] ; CHECK: inner_inner_loop_exit.split.us: ; CHECK-NEXT: br label [[INNER_INNER_LOOP_EXIT]] ; CHECK: inner_loop_exit.loopexit.split.us: @@ -614,7 +614,7 @@ define i32 @test10a(ptr %ptr, i1 %cond, ptr %a.ptr) { ; CHECK-NEXT: [[V2_US:%.*]] = load i1, ptr [[PTR]], align 1 ; CHECK-NEXT: br i1 [[V2_US]], label [[LOOP_EXIT_SPLIT_US_LOOPEXIT:%.*]], label [[LOOP_BEGIN_BACKEDGE_US:%.*]] ; CHECK: loop_begin.backedge.us: -; CHECK-NEXT: br label [[LOOP_BEGIN_US]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br label [[LOOP_BEGIN_US]] ; CHECK: loop_exit.split.us.loopexit: ; CHECK-NEXT: [[A_LCSSA_US_PH:%.*]] = phi i32 [ [[A_US]], [[LOOP_A_US]] ] ; CHECK-NEXT: br label [[LOOP_EXIT_SPLIT_US]] @@ -682,7 +682,7 @@ define i32 @test10b(ptr %ptr, i1 %cond, ptr %a.ptr) { ; CHECK-NEXT: [[V2_US:%.*]] = load i1, ptr [[PTR]], align 1 ; CHECK-NEXT: br i1 [[V2_US]], label [[LOOP_BEGIN_BACKEDGE_US]], label [[LOOP_EXIT_SPLIT_US:%.*]] ; CHECK: loop_begin.backedge.us: 
-; CHECK-NEXT: br label [[LOOP_BEGIN_US]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br label [[LOOP_BEGIN_US]] ; CHECK: loop_exit.split.us: ; CHECK-NEXT: [[A_LCSSA_US:%.*]] = phi i32 [ [[A_US]], [[LOOP_A_US]] ] ; CHECK-NEXT: br label [[LOOP_EXIT:%.*]] @@ -844,7 +844,7 @@ define i32 @test11b(ptr %ptr, ptr %cond.ptr, ptr %a.ptr, ptr %b.ptr) { ; CHECK-NEXT: br label [[INNER_LOOP_A_US:%.*]] ; CHECK: inner_loop_a.us: ; CHECK-NEXT: [[V2_US:%.*]] = load i1, ptr [[PTR]], align 1 -; CHECK-NEXT: br i1 [[V2_US]], label [[INNER_LOOP_EXIT_SPLIT_US:%.*]], label [[INNER_LOOP_BEGIN_US]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[V2_US]], label [[INNER_LOOP_EXIT_SPLIT_US:%.*]], label [[INNER_LOOP_BEGIN_US]] ; CHECK: inner_loop_exit.split.us: ; CHECK-NEXT: [[A_INNER_LCSSA_US:%.*]] = phi i32 [ [[A_US]], [[INNER_LOOP_A_US]] ] ; CHECK-NEXT: br label [[INNER_LOOP_EXIT:%.*]] @@ -1033,7 +1033,7 @@ define i32 @test12b(ptr %ptr, ptr %cond.ptr, ptr %a.ptr, ptr %b.ptr) { ; CHECK-NEXT: br label [[INNER_INNER_LOOP_A_US:%.*]] ; CHECK: inner_inner_loop_a.us: ; CHECK-NEXT: [[V2_US:%.*]] = load i1, ptr [[PTR]], align 1 -; CHECK-NEXT: br i1 [[V2_US]], label [[INNER_INNER_LOOP_EXIT_SPLIT_US:%.*]], label [[INNER_INNER_LOOP_BEGIN_US]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[V2_US]], label [[INNER_INNER_LOOP_EXIT_SPLIT_US:%.*]], label [[INNER_INNER_LOOP_BEGIN_US]] ; CHECK: inner_inner_loop_exit.split.us: ; CHECK-NEXT: [[A_INNER_INNER_LCSSA_US:%.*]] = phi i32 [ [[A_US]], [[INNER_INNER_LOOP_A_US]] ] ; CHECK-NEXT: br label [[INNER_INNER_LOOP_EXIT:%.*]] @@ -1142,7 +1142,7 @@ define i32 @test13a(ptr %ptr, i1 %cond, ptr %a.ptr, ptr %b.ptr) { ; CHECK-NEXT: [[V2_US:%.*]] = load i1, ptr [[PTR]], align 1 ; CHECK-NEXT: br i1 [[V2_US]], label [[LOOP_EXIT_SPLIT_US:%.*]], label [[LOOP_LATCH_US]] ; CHECK: loop_latch.us: -; CHECK-NEXT: br label [[LOOP_BEGIN_US]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br label [[LOOP_BEGIN_US]] ; CHECK: loop_exit.split.us: ; CHECK-NEXT: [[LCSSA_US:%.*]] = 
phi i32 [ [[A_US]], [[LOOP_A_US]] ] ; CHECK-NEXT: br label [[LOOP_EXIT:%.*]] @@ -1237,7 +1237,7 @@ define i32 @test13b(ptr %ptr, i1 %cond, ptr %a.ptr, ptr %b.ptr) { ; CHECK-NEXT: [[V2_US:%.*]] = load i1, ptr [[PTR]], align 1 ; CHECK-NEXT: br i1 [[V2_US]], label [[LOOP_EXIT_SPLIT_US_LOOPEXIT:%.*]], label [[LOOP_LATCH_US:%.*]] ; CHECK: loop_latch.us: -; CHECK-NEXT: br label [[LOOP_BEGIN_US]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: br label [[LOOP_BEGIN_US]] ; CHECK: loop_exit.split.us.loopexit: ; CHECK-NEXT: [[LCSSA_US_PH:%.*]] = phi i32 [ [[A_US]], [[LOOP_A_US]] ] ; CHECK-NEXT: br label [[LOOP_EXIT_SPLIT_US]] @@ -1356,7 +1356,7 @@ define void @test23(i1 %arg, ptr %ptr) { ; CHECK-NEXT: br label [[OUTER_LATCH_US:%.*]] ; CHECK: outer.latch.us: ; CHECK-NEXT: [[OUTER_COND_US:%.*]] = load i1, ptr [[PTR]], align 1 -; CHECK-NEXT: br i1 [[OUTER_COND_US]], label [[OUTER_HEADER_US]], label [[EXIT_SPLIT_US:%.*]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: br i1 [[OUTER_COND_US]], label [[OUTER_HEADER_US]], label [[EXIT_SPLIT_US:%.*]] ; CHECK: exit.split.us: ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: entry.split: @@ -1426,10 +1426,10 @@ define i32 @test29(i32 %arg) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARG_FR:%.*]] = freeze i32 [[ARG:%.*]] ; CHECK-NEXT: switch i32 [[ARG_FR]], label [[ENTRY_SPLIT:%.*]] [ -; CHECK-NEXT: i32 0, label [[ENTRY_SPLIT_US:%.*]] -; CHECK-NEXT: i32 1, label [[ENTRY_SPLIT_US]] -; CHECK-NEXT: i32 2, label [[ENTRY_SPLIT_US1:%.*]] -; CHECK-NEXT: i32 3, label [[ENTRY_SPLIT]] +; CHECK-NEXT: i32 0, label [[ENTRY_SPLIT_US:%.*]] +; CHECK-NEXT: i32 1, label [[ENTRY_SPLIT_US]] +; CHECK-NEXT: i32 2, label [[ENTRY_SPLIT_US1:%.*]] +; CHECK-NEXT: i32 3, label [[ENTRY_SPLIT]] ; CHECK-NEXT: ] ; CHECK: entry.split.us: ; CHECK-NEXT: br label [[HEADER_US:%.*]] @@ -1456,7 +1456,7 @@ define i32 @test29(i32 %arg) { ; CHECK-NEXT: br label [[LATCH_US:%.*]] ; CHECK: latch.us: ; CHECK-NEXT: [[CMP2_US:%.*]] = icmp slt i32 [[TMP_C_SUM_US]], 42 -; CHECK-NEXT: br i1 
[[CMP2_US]], label [[HEADER_US]], label [[EXIT_SPLIT_US:%.*]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP2_US]], label [[HEADER_US]], label [[EXIT_SPLIT_US:%.*]] ; CHECK: exit.split.us: ; CHECK-NEXT: [[LCSSA_PHI_US:%.*]] = phi i32 [ [[TMP_C_SUM_US]], [[LATCH_US]] ] ; CHECK-NEXT: br label [[EXIT:%.*]] @@ -1485,7 +1485,7 @@ define i32 @test29(i32 %arg) { ; CHECK-NEXT: br label [[LATCH_US18:%.*]] ; CHECK: latch.us18: ; CHECK-NEXT: [[CMP2_US19:%.*]] = icmp slt i32 [[TMP_C_SUM_US17]], 42 -; CHECK-NEXT: br i1 [[CMP2_US19]], label [[HEADER_US2]], label [[EXIT_SPLIT_SPLIT_US:%.*]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP2_US19]], label [[HEADER_US2]], label [[EXIT_SPLIT_SPLIT_US:%.*]] ; CHECK: exit.split.split.us: ; CHECK-NEXT: [[LCSSA_PHI_US20:%.*]] = phi i32 [ [[TMP_C_SUM_US17]], [[LATCH_US18]] ] ; CHECK-NEXT: br label [[EXIT_SPLIT:%.*]] @@ -1587,10 +1587,10 @@ define i32 @test30(i32 %arg) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARG_FR:%.*]] = freeze i32 [[ARG:%.*]] ; CHECK-NEXT: switch i32 [[ARG_FR]], label [[ENTRY_SPLIT:%.*]] [ -; CHECK-NEXT: i32 -1, label [[ENTRY_SPLIT]] -; CHECK-NEXT: i32 0, label [[ENTRY_SPLIT_US:%.*]] -; CHECK-NEXT: i32 1, label [[ENTRY_SPLIT_US1:%.*]] -; CHECK-NEXT: i32 2, label [[ENTRY_SPLIT_US1]] +; CHECK-NEXT: i32 -1, label [[ENTRY_SPLIT]] +; CHECK-NEXT: i32 0, label [[ENTRY_SPLIT_US:%.*]] +; CHECK-NEXT: i32 1, label [[ENTRY_SPLIT_US1:%.*]] +; CHECK-NEXT: i32 2, label [[ENTRY_SPLIT_US1]] ; CHECK-NEXT: ] ; CHECK: entry.split.us: ; CHECK-NEXT: br label [[HEADER_US:%.*]] @@ -1612,7 +1612,7 @@ define i32 @test30(i32 %arg) { ; CHECK-NEXT: br label [[LATCH_US:%.*]] ; CHECK: latch.us: ; CHECK-NEXT: [[CMP2_US:%.*]] = icmp slt i32 [[TMP_B_SUM_US]], 42 -; CHECK-NEXT: br i1 [[CMP2_US]], label [[HEADER_US]], label [[LOOP_EXIT2_SPLIT_US:%.*]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP2_US]], label [[HEADER_US]], label [[LOOP_EXIT2_SPLIT_US:%.*]] ; CHECK: loop.exit2.split.us: ; CHECK-NEXT: [[L2_PHI_US:%.*]] = phi 
i32 [ [[TMP_B_SUM_US]], [[LATCH_US]] ] ; CHECK-NEXT: br label [[LOOP_EXIT2:%.*]] @@ -1636,7 +1636,7 @@ define i32 @test30(i32 %arg) { ; CHECK-NEXT: br label [[LATCH_US14:%.*]] ; CHECK: latch.us14: ; CHECK-NEXT: [[CMP2_US15:%.*]] = icmp slt i32 [[TMP_B_SUM_US13]], 42 -; CHECK-NEXT: br i1 [[CMP2_US15]], label [[HEADER_US2]], label [[LOOP_EXIT2_SPLIT_SPLIT_US:%.*]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP2_US15]], label [[HEADER_US2]], label [[LOOP_EXIT2_SPLIT_SPLIT_US:%.*]] ; CHECK: loop.exit2.split.split.us: ; CHECK-NEXT: [[L2_PHI_US16:%.*]] = phi i32 [ [[TMP_B_SUM_US13]], [[LATCH_US14]] ] ; CHECK-NEXT: br label [[LOOP_EXIT2_SPLIT:%.*]] @@ -2259,9 +2259,9 @@ define void @hoist_inner_loop_switch(ptr %ptr) { ; CHECK-NEXT: [[V1:%.*]] = call i32 @cond.i32() ; CHECK-NEXT: [[V1_FR:%.*]] = freeze i32 [[V1]] ; CHECK-NEXT: switch i32 [[V1_FR]], label [[B_HEADER_SPLIT:%.*]] [ -; CHECK-NEXT: i32 1, label [[B_HEADER_SPLIT_US:%.*]] -; CHECK-NEXT: i32 2, label [[B_HEADER_SPLIT_US]] -; CHECK-NEXT: i32 3, label [[B_HEADER_SPLIT_US]] +; CHECK-NEXT: i32 1, label [[B_HEADER_SPLIT_US:%.*]] +; CHECK-NEXT: i32 2, label [[B_HEADER_SPLIT_US]] +; CHECK-NEXT: i32 3, label [[B_HEADER_SPLIT_US]] ; CHECK-NEXT: ] ; CHECK: b.header.split.us: ; CHECK-NEXT: br label [[C_HEADER_US:%.*]] diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch-select.ll b/llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch-select.ll index 64b18291b22d..c86fa349200c 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch-select.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch-select.ll @@ -28,7 +28,7 @@ define i32 @basic(i32 %N, i1 %cond, i32 %select_input) { ; CHECK-NEXT: [[UNSWITCHED_SELECT_US:%.*]] = phi i32 [ [[SELECT_INPUT]], [[TMP0]] ] ; CHECK-NEXT: [[ADD_US]] = add nuw nsw i32 [[UNSWITCHED_SELECT_US]], [[RES_US]] ; CHECK-NEXT: [[INC_US]] = add nuw nsw i32 [[I_US]], 1 -; CHECK-NEXT: br label [[FOR_COND_US]], !llvm.loop 
[[LOOP0:![0-9]+]] +; CHECK-NEXT: br label [[FOR_COND_US]] ; CHECK: for.cond.cleanup.split.us: ; CHECK-NEXT: [[RES_LCSSA_US:%.*]] = phi i32 [ [[RES_US]], [[FOR_COND_US]] ] ; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]] @@ -132,7 +132,7 @@ define i32 @select_phi_input(i32 %N, i1 %cond) { ; CHECK-NEXT: [[UNSWITCHED_SELECT_US:%.*]] = phi i32 [ [[I_US]], [[TMP0]] ] ; CHECK-NEXT: [[ADD_US]] = add nuw nsw i32 [[UNSWITCHED_SELECT_US]], [[RES_US]] ; CHECK-NEXT: [[INC_US]] = add nuw nsw i32 [[I_US]], 1 -; CHECK-NEXT: br label [[FOR_COND_US]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: br label [[FOR_COND_US]] ; CHECK: for.cond.cleanup.split.us: ; CHECK-NEXT: [[RES_LCSSA_US:%.*]] = phi i32 [ [[RES_US]], [[FOR_COND_US]] ] ; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]] @@ -195,7 +195,7 @@ define i32 @basic_cond_noundef(i32 %N, i1 noundef %cond) { ; CHECK-NEXT: [[UNSWITCHED_SELECT_US:%.*]] = phi i32 [ [[I_US]], [[TMP0]] ] ; CHECK-NEXT: [[ADD_US]] = add nuw nsw i32 [[UNSWITCHED_SELECT_US]], [[RES_US]] ; CHECK-NEXT: [[INC_US]] = add nuw nsw i32 [[I_US]], 1 -; CHECK-NEXT: br label [[FOR_COND_US]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br label [[FOR_COND_US]] ; CHECK: for.cond.cleanup.split.us: ; CHECK-NEXT: [[RES_LCSSA_US:%.*]] = phi i32 [ [[RES_US]], [[FOR_COND_US]] ] ; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]] @@ -285,24 +285,55 @@ define i32 @chained_select(i32 %N, i1 %cond, i1 %cond2) { ; CHECK-NEXT: [[COND_FR:%.*]] = freeze i1 [[COND]] ; CHECK-NEXT: br i1 [[COND_FR]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]] ; CHECK: entry.split.us: -; CHECK-NEXT: br label [[FOR_COND_US:%.*]] -; CHECK: for.cond.us: -; CHECK-NEXT: [[RES_US:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_US]] ], [ [[ADD_US:%.*]], [[TMP1:%.*]] ] -; CHECK-NEXT: [[I_US:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_US]] ], [ [[INC_US:%.*]], [[TMP1]] ] -; CHECK-NEXT: [[CMP_US:%.*]] = icmp slt i32 [[I_US]], [[N]] -; CHECK-NEXT: br i1 [[CMP_US]], label [[FOR_BODY_US:%.*]], label [[FOR_COND_CLEANUP_SPLIT_US:%.*]] 
-; CHECK: for.body.us: +; CHECK-NEXT: [[COND2_FR13:%.*]] = freeze i1 [[COND2]] +; CHECK-NEXT: br i1 [[COND2_FR13]], label [[ENTRY_SPLIT_US_SPLIT_US:%.*]], label [[ENTRY_SPLIT_US_SPLIT:%.*]] +; CHECK: entry.split.us.split.us: +; CHECK-NEXT: br label [[FOR_COND_US_US:%.*]] +; CHECK: for.cond.us.us: +; CHECK-NEXT: [[RES_US_US:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_US_SPLIT_US]] ], [ [[ADD_US_US:%.*]], [[TMP3:%.*]] ] +; CHECK-NEXT: [[I_US_US:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_US_SPLIT_US]] ], [ [[INC_US_US:%.*]], [[TMP3]] ] +; CHECK-NEXT: [[CMP_US_US:%.*]] = icmp slt i32 [[I_US_US]], [[N]] +; CHECK-NEXT: br i1 [[CMP_US_US]], label [[FOR_BODY_US_US:%.*]], label [[FOR_COND_CLEANUP_SPLIT_US_SPLIT_US:%.*]] +; CHECK: for.body.us.us: ; CHECK-NEXT: br label [[TMP0:%.*]] ; CHECK: 0: -; CHECK-NEXT: br label [[TMP1]] +; CHECK-NEXT: br label [[TMP1:%.*]] ; CHECK: 1: -; CHECK-NEXT: [[UNSWITCHED_SELECT_US:%.*]] = phi i32 [ [[I_US]], [[TMP0]] ] -; CHECK-NEXT: [[SELECT2_US:%.*]] = select i1 [[COND2]], i32 [[UNSWITCHED_SELECT_US]], i32 24 -; CHECK-NEXT: [[ADD_US]] = add nuw nsw i32 [[SELECT2_US]], [[RES_US]] +; CHECK-NEXT: [[UNSWITCHED_SELECT_US_US:%.*]] = phi i32 [ [[I_US_US]], [[TMP0]] ] +; CHECK-NEXT: br label [[TMP2:%.*]] +; CHECK: 2: +; CHECK-NEXT: br label [[TMP3]] +; CHECK: 3: +; CHECK-NEXT: [[UNSWITCHED_SELECT_US11:%.*]] = phi i32 [ [[UNSWITCHED_SELECT_US_US]], [[TMP2]] ] +; CHECK-NEXT: [[ADD_US_US]] = add nuw nsw i32 [[UNSWITCHED_SELECT_US11]], [[RES_US_US]] +; CHECK-NEXT: [[INC_US_US]] = add nuw nsw i32 [[I_US_US]], 1 +; CHECK-NEXT: br label [[FOR_COND_US_US]] +; CHECK: for.cond.cleanup.split.us.split.us: +; CHECK-NEXT: [[RES_LCSSA_US_US:%.*]] = phi i32 [ [[RES_US_US]], [[FOR_COND_US_US]] ] +; CHECK-NEXT: br label [[FOR_COND_CLEANUP_SPLIT_US:%.*]] +; CHECK: entry.split.us.split: +; CHECK-NEXT: br label [[FOR_COND_US:%.*]] +; CHECK: for.cond.us: +; CHECK-NEXT: [[RES_US:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_US_SPLIT]] ], [ [[ADD_US:%.*]], [[TMP6:%.*]] ] +; CHECK-NEXT: [[I_US:%.*]] = 
phi i32 [ 0, [[ENTRY_SPLIT_US_SPLIT]] ], [ [[INC_US:%.*]], [[TMP6]] ] +; CHECK-NEXT: [[CMP_US:%.*]] = icmp slt i32 [[I_US]], [[N]] +; CHECK-NEXT: br i1 [[CMP_US]], label [[FOR_BODY_US:%.*]], label [[FOR_COND_CLEANUP_SPLIT_US_SPLIT:%.*]] +; CHECK: for.body.us: +; CHECK-NEXT: br label [[TMP4:%.*]] +; CHECK: 4: +; CHECK-NEXT: br label [[TMP5:%.*]] +; CHECK: 5: +; CHECK-NEXT: [[UNSWITCHED_SELECT_US:%.*]] = phi i32 [ [[I_US]], [[TMP4]] ] +; CHECK-NEXT: br label [[TMP6]] +; CHECK: 6: +; CHECK-NEXT: [[ADD_US]] = add nuw nsw i32 24, [[RES_US]] ; CHECK-NEXT: [[INC_US]] = add nuw nsw i32 [[I_US]], 1 -; CHECK-NEXT: br label [[FOR_COND_US]], !llvm.loop [[LOOP4:![0-9]+]] -; CHECK: for.cond.cleanup.split.us: +; CHECK-NEXT: br label [[FOR_COND_US]] +; CHECK: for.cond.cleanup.split.us.split: ; CHECK-NEXT: [[RES_LCSSA_US:%.*]] = phi i32 [ [[RES_US]], [[FOR_COND_US]] ] +; CHECK-NEXT: br label [[FOR_COND_CLEANUP_SPLIT_US]] +; CHECK: for.cond.cleanup.split.us: +; CHECK-NEXT: [[DOTUS_PHI12:%.*]] = phi i32 [ [[RES_LCSSA_US]], [[FOR_COND_CLEANUP_SPLIT_US_SPLIT]] ], [ [[RES_LCSSA_US_US]], [[FOR_COND_CLEANUP_SPLIT_US_SPLIT_US]] ] ; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]] ; CHECK: entry.split: ; CHECK-NEXT: [[COND2_FR:%.*]] = freeze i1 [[COND2]] @@ -310,36 +341,36 @@ define i32 @chained_select(i32 %N, i1 %cond, i1 %cond2) { ; CHECK: entry.split.split.us: ; CHECK-NEXT: br label [[FOR_COND_US1:%.*]] ; CHECK: for.cond.us1: -; CHECK-NEXT: [[RES_US2:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_SPLIT_US]] ], [ [[ADD_US7:%.*]], [[TMP4:%.*]] ] -; CHECK-NEXT: [[I_US3:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_SPLIT_US]] ], [ [[INC_US8:%.*]], [[TMP4]] ] +; CHECK-NEXT: [[RES_US2:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_SPLIT_US]] ], [ [[ADD_US7:%.*]], [[TMP9:%.*]] ] +; CHECK-NEXT: [[I_US3:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_SPLIT_US]] ], [ [[INC_US8:%.*]], [[TMP9]] ] ; CHECK-NEXT: [[CMP_US4:%.*]] = icmp slt i32 [[I_US3]], [[N]] ; CHECK-NEXT: br i1 [[CMP_US4]], label [[FOR_BODY_US5:%.*]], label 
[[FOR_COND_CLEANUP_SPLIT_SPLIT_US:%.*]] ; CHECK: for.body.us5: -; CHECK-NEXT: br label [[TMP2:%.*]] -; CHECK: 2: -; CHECK-NEXT: br label [[TMP3:%.*]] -; CHECK: 3: -; CHECK-NEXT: br label [[TMP4]] -; CHECK: 4: -; CHECK-NEXT: [[UNSWITCHED_SELECT_US6:%.*]] = phi i32 [ 42, [[TMP3]] ] +; CHECK-NEXT: br label [[TMP7:%.*]] +; CHECK: 7: +; CHECK-NEXT: br label [[TMP8:%.*]] +; CHECK: 8: +; CHECK-NEXT: br label [[TMP9]] +; CHECK: 9: +; CHECK-NEXT: [[UNSWITCHED_SELECT_US6:%.*]] = phi i32 [ 42, [[TMP8]] ] ; CHECK-NEXT: [[ADD_US7]] = add nuw nsw i32 [[UNSWITCHED_SELECT_US6]], [[RES_US2]] ; CHECK-NEXT: [[INC_US8]] = add nuw nsw i32 [[I_US3]], 1 -; CHECK-NEXT: br label [[FOR_COND_US1]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br label [[FOR_COND_US1]] ; CHECK: for.cond.cleanup.split.split.us: ; CHECK-NEXT: [[RES_LCSSA_US9:%.*]] = phi i32 [ [[RES_US2]], [[FOR_COND_US1]] ] ; CHECK-NEXT: br label [[FOR_COND_CLEANUP_SPLIT:%.*]] ; CHECK: entry.split.split: ; CHECK-NEXT: br label [[FOR_COND:%.*]] ; CHECK: for.cond: -; CHECK-NEXT: [[RES:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_SPLIT]] ], [ [[ADD:%.*]], [[TMP6:%.*]] ] -; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_SPLIT]] ], [ [[INC:%.*]], [[TMP6]] ] +; CHECK-NEXT: [[RES:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_SPLIT]] ], [ [[ADD:%.*]], [[TMP11:%.*]] ] +; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY_SPLIT_SPLIT]] ], [ [[INC:%.*]], [[TMP11]] ] ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[I]], [[N]] ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP_SPLIT_SPLIT:%.*]] ; CHECK: for.body: -; CHECK-NEXT: br label [[TMP5:%.*]] -; CHECK: 5: -; CHECK-NEXT: br label [[TMP6]] -; CHECK: 6: +; CHECK-NEXT: br label [[TMP10:%.*]] +; CHECK: 10: +; CHECK-NEXT: br label [[TMP11]] +; CHECK: 11: ; CHECK-NEXT: [[ADD]] = add nuw nsw i32 24, [[RES]] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I]], 1 ; CHECK-NEXT: br label [[FOR_COND]] @@ -350,7 +381,7 @@ define i32 @chained_select(i32 %N, i1 %cond, i1 %cond2) { ; CHECK-NEXT: 
[[DOTUS_PHI10:%.*]] = phi i32 [ [[RES_LCSSA]], [[FOR_COND_CLEANUP_SPLIT_SPLIT]] ], [ [[RES_LCSSA_US9]], [[FOR_COND_CLEANUP_SPLIT_SPLIT_US]] ] ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[DOTUS_PHI:%.*]] = phi i32 [ [[DOTUS_PHI10]], [[FOR_COND_CLEANUP_SPLIT]] ], [ [[RES_LCSSA_US]], [[FOR_COND_CLEANUP_SPLIT_US]] ] +; CHECK-NEXT: [[DOTUS_PHI:%.*]] = phi i32 [ [[DOTUS_PHI10]], [[FOR_COND_CLEANUP_SPLIT]] ], [ [[DOTUS_PHI12]], [[FOR_COND_CLEANUP_SPLIT_US]] ] ; CHECK-NEXT: ret i32 [[DOTUS_PHI]] ; entry: @@ -396,7 +427,7 @@ define i32 @select_in_if(i32 %N, i1 %cond) { ; CHECK-NEXT: [[P_US:%.*]] = phi i32 [ [[UNSWITCHED_SELECT_US:%.*]], [[TMP1:%.*]] ], [ 24, [[FOR_BODY_US]] ] ; CHECK-NEXT: [[ADD_US]] = add nuw nsw i32 [[P_US]], [[RES_US]] ; CHECK-NEXT: [[INC_US]] = add nuw nsw i32 [[I_US]], 1 -; CHECK-NEXT: br label [[FOR_COND_US]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br label [[FOR_COND_US]] ; CHECK: 0: ; CHECK-NEXT: br label [[TMP1]] ; CHECK: 1: @@ -486,7 +517,7 @@ define i32 @select_in_if_else(i32 %N, i1 %cond) { ; CHECK-NEXT: [[P_US:%.*]] = phi i32 [ [[COND1A_US]], [[FOR_BODY_IF_US]] ], [ [[UNSWITCHED_SELECT_US:%.*]], [[TMP1:%.*]] ] ; CHECK-NEXT: [[ADD_US]] = add nuw nsw i32 [[P_US]], [[RES_US]] ; CHECK-NEXT: [[INC_US]] = add nuw nsw i32 [[I_US]], 1 -; CHECK-NEXT: br label [[FOR_COND_US]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br label [[FOR_COND_US]] ; CHECK: 0: ; CHECK-NEXT: br label [[TMP1]] ; CHECK: 1: @@ -575,7 +606,7 @@ define dso_local void @select_nested_loop(i1 noundef zeroext %cond, i32 noundef ; CHECK: for.cond1.for.cond.cleanup3_crit_edge.us.us: ; CHECK-NEXT: [[INC7_US_US]] = add nuw i32 [[I_018_US_US]], 1 ; CHECK-NEXT: [[EXITCOND21_NOT_US:%.*]] = icmp eq i32 [[INC7_US_US]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND21_NOT_US]], label [[FOR_COND_CLEANUP_LOOPEXIT_SPLIT_US:%.*]], label [[FOR_COND1_PREHEADER_US_US]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND21_NOT_US]], label 
[[FOR_COND_CLEANUP_LOOPEXIT_SPLIT_US:%.*]], label [[FOR_COND1_PREHEADER_US_US]] ; CHECK: for.cond1.preheader.us.split.us.us: ; CHECK-NEXT: br label [[FOR_BODY4_US_US_US:%.*]] ; CHECK: for.body4.us.us.us: @@ -588,7 +619,7 @@ define dso_local void @select_nested_loop(i1 noundef zeroext %cond, i32 noundef ; CHECK-NEXT: tail call void @bar(i32 noundef [[UNSWITCHED_SELECT_US_US]]) ; CHECK-NEXT: [[INC_US_US_US]] = add nuw i32 [[J_016_US_US_US]], 1 ; CHECK-NEXT: [[EXITCOND_NOT_US_US:%.*]] = icmp eq i32 [[INC_US_US_US]], [[M]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT_US_US]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_SPLIT_US_US:%.*]], label [[FOR_BODY4_US_US_US]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT_US_US]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_SPLIT_US_US:%.*]], label [[FOR_BODY4_US_US_US]] ; CHECK: for.cond1.for.cond.cleanup3_crit_edge.us.split.us.us: ; CHECK-NEXT: br label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_US]] ; CHECK: for.cond.cleanup.loopexit.split.us: @@ -676,7 +707,7 @@ define dso_local void @select_invariant_outer_loop(i1 noundef zeroext %cond, i32 ; CHECK-NEXT: tail call void @bar(i32 noundef [[UNSWITCHED_SELECT_US]]) ; CHECK-NEXT: [[INC_US_US]] = add nuw i32 [[J_019_US_US]], 1 ; CHECK-NEXT: [[EXITCOND_NOT_US:%.*]] = icmp eq i32 [[INC_US_US]], [[M]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT_US]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_SPLIT_US:%.*]], label [[FOR_BODY4_US_US]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT_US]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_SPLIT_US:%.*]], label [[FOR_BODY4_US_US]] ; CHECK: for.cond1.for.cond.cleanup3_crit_edge.us.split.us: ; CHECK-NEXT: br label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]] ; CHECK: for.cond1.preheader.us.split: @@ -751,7 +782,7 @@ define dso_local i32 @trivial_select_cond(i32 noundef %n, i32 noundef %a, i32 no ; CHECK-NEXT: tail call void @bar(i32 noundef [[UNSWITCHED_SELECT_US]]) ; CHECK-NEXT: [[INC_US]] = add nuw 
nsw i32 [[I_03_US]], 1 ; CHECK-NEXT: [[EXITCOND_NOT_US:%.*]] = icmp eq i32 [[INC_US]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT_US]], label [[FOR_COND_CLEANUP_LOOPEXIT_SPLIT_US:%.*]], label [[FOR_BODY_US]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT_US]], label [[FOR_COND_CLEANUP_LOOPEXIT_SPLIT_US:%.*]], label [[FOR_BODY_US]] ; CHECK: for.cond.cleanup.loopexit.split.us: ; CHECK-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ; CHECK: for.body.preheader.split: @@ -808,7 +839,7 @@ define i32 @and_lhs_invariant(i32 %num, i1 %cond) { ; CHECK-NEXT: tail call void @bar(i32 noundef [[UNSWITCHED_SELECT_US]]) ; CHECK-NEXT: [[INC_US]] = add nuw nsw i32 [[I_07_US]], 1 ; CHECK-NEXT: [[EXITCOND_NOT_US:%.*]] = icmp eq i32 [[INC_US]], [[NUM]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT_US]], label [[FOR_COND_CLEANUP_LOOPEXIT_SPLIT_US:%.*]], label [[FOR_BODY_US]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT_US]], label [[FOR_COND_CLEANUP_LOOPEXIT_SPLIT_US:%.*]], label [[FOR_BODY_US]] ; CHECK: for.cond.cleanup.loopexit.split.us: ; CHECK-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ; CHECK: for.body.preheader.split: @@ -873,7 +904,7 @@ define i32 @and_rhs_invariant(i32 %num, i1 %cond) { ; CHECK-NEXT: tail call void @bar(i32 noundef [[UNSWITCHED_SELECT_US]]) ; CHECK-NEXT: [[INC_US]] = add nuw nsw i32 [[I_07_US]], 1 ; CHECK-NEXT: [[EXITCOND_NOT_US:%.*]] = icmp eq i32 [[INC_US]], [[NUM]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT_US]], label [[FOR_COND_CLEANUP_LOOPEXIT_SPLIT_US:%.*]], label [[FOR_BODY_US]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT_US]], label [[FOR_COND_CLEANUP_LOOPEXIT_SPLIT_US:%.*]], label [[FOR_BODY_US]] ; CHECK: for.cond.cleanup.loopexit.split.us: ; CHECK-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ; CHECK: for.body.preheader.split: @@ -940,7 +971,7 @@ define i32 @or_lhs_invariant(i32 %num, i1 %cond) { ; CHECK-NEXT: tail call void @bar(i32 noundef [[UNSWITCHED_SELECT_US]]) ; CHECK-NEXT: [[INC_US]] = 
add nuw nsw i32 [[I_07_US]], 1 ; CHECK-NEXT: [[EXITCOND_NOT_US:%.*]] = icmp eq i32 [[INC_US]], [[NUM]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT_US]], label [[FOR_COND_CLEANUP_LOOPEXIT_SPLIT_US:%.*]], label [[FOR_BODY_US]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT_US]], label [[FOR_COND_CLEANUP_LOOPEXIT_SPLIT_US:%.*]], label [[FOR_BODY_US]] ; CHECK: for.cond.cleanup.loopexit.split.us: ; CHECK-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ; CHECK: for.body.preheader.split: @@ -1007,7 +1038,7 @@ define i32 @or_rhs_invariant(i32 %num, i1 %cond) { ; CHECK-NEXT: tail call void @bar(i32 noundef [[UNSWITCHED_SELECT_US]]) ; CHECK-NEXT: [[INC_US]] = add nuw nsw i32 [[I_07_US]], 1 ; CHECK-NEXT: [[EXITCOND_NOT_US:%.*]] = icmp eq i32 [[INC_US]], [[NUM]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT_US]], label [[FOR_COND_CLEANUP_LOOPEXIT_SPLIT_US:%.*]], label [[FOR_BODY_US]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT_US]], label [[FOR_COND_CLEANUP_LOOPEXIT_SPLIT_US:%.*]], label [[FOR_BODY_US]] ; CHECK: for.cond.cleanup.loopexit.split.us: ; CHECK-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ; CHECK: for.body.preheader.split: diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch.ll b/llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch.ll index 36f7a9e8cd65..9567b6b79323 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/nontrivial-unswitch.ll @@ -2626,45 +2626,66 @@ loop_a: ; The second unswitched condition. ; ; CHECK: entry.split.us: -; CHECK-NEXT: br label %loop_begin.us +; CHECK-NEXT: br i1 %cond2, label %entry.split.us.split.us, label %entry.split.us.split loop_a_a: call i32 @a() br label %latch ; The 'loop_a_a' unswitched loop. 
; -; CHECK: loop_begin.us: -; CHECK-NEXT: br label %loop_a.us +; CHECK: entry.split.us.split.us: +; CHECK-NEXT: br label %loop_begin.us.us ; -; CHECK: loop_a.us: -; CHECK-NEXT: br i1 %cond2, label %loop_a_a.us, label %loop_a_c.us +; CHECK: loop_begin.us.us: +; CHECK-NEXT: br label %loop_a.us.us ; -; The 'loop_a_c' unswitched loop. +; CHECK: loop_a.us.us: +; CHECK-NEXT: br label %loop_a_a.us.us ; -; CHECK: loop_a_c.us: -; CHECK-NEXT: call i32 @c() -; CHECK-NEXT: br label %latch.us -; -; CHECK: loop_a_a.us: +; CHECK: loop_a_a.us.us: ; CHECK-NEXT: call i32 @a() -; CHECK-NEXT: br label %latch.us +; CHECK-NEXT: br label %latch.us.us ; -; CHECK: latch.us: +; CHECK: latch.us.us: ; CHECK-NEXT: %[[V:.*]] = load i1, ptr %ptr -; CHECK-NEXT: br i1 %[[V]], label %loop_begin.us, label %loop_exit.split.us, !llvm.loop !22 +; CHECK-NEXT: br i1 %[[V]], label %loop_begin.us.us, label %loop_exit.split.us.split.us ; -; CHECK: loop_exit.split.us -; CHECK-NEXT: br label %loop_exit +; CHECK: loop_exit.split.us.split.us: +; CHECK-NEXT: br label %loop_exit.split loop_a_c: call i32 @c() br label %latch +; The 'loop_a_c' unswitched loop. +; +; CHECK: entry.split.us.split: +; CHECK-NEXT: br label %loop_begin.us +; +; CHECK: loop_begin.us: +; CHECK-NEXT: br label %loop_a.us +; +; CHECK: loop_a.us: +; CHECK-NEXT: br label %loop_a_c.us +; +; CHECK: loop_a_c.us: +; CHECK-NEXT: call i32 @c() +; CHECK-NEXT: br label %latch +; +; CHECK: latch.us: +; CHECK-NEXT: %[[V:.*]] = load i1, ptr %ptr +; CHECK-NEXT: br i1 %[[V]], label %loop_begin.us, label %loop_exit.split.us.split +; +; CHECK: loop_exit.split.us.split: +; CHECK-NEXT: br label %loop_exit.split loop_b: call i32 @b() br label %latch ; The 'loop_b' unswitched loop. 
; +; CHECK: entry.split: +; CHECK-NEXT: br label %loop_begin +; ; CHECK: loop_begin: ; CHECK-NEXT: br label %loop_b ; @@ -2964,9 +2985,9 @@ loop_a: ; ; CHECK: [[LOOP_LATCH_A]]: ; CHECK-NEXT: %[[V_A:.*]] = load i1, ptr %ptr -; CHECK: br i1 %[[V_A]], label %loop_begin.us, label %loop_exit.split.us, !llvm.loop !26 +; CHECK: br i1 %[[V_A]], label %[[LOOP_BEGIN_A]], label %[[LOOP_EXIT_A:.*]] ; -; CHECK: loop_exit.split.us: +; CHECK: [[LOOP_EXIT_A]]: ; CHECK-NEXT: br label %loop_exit loop_b: @@ -2986,10 +3007,10 @@ loop_b: ; ; CHECK: [[LOOP_LATCH_B]]: ; CHECK-NEXT: %[[V_B:.*]] = load i1, ptr %ptr -; CHECK: br i1 %[[V_B]], label %loop_begin.us2, label %loop_exit.split.split.us, !llvm.loop !27 +; CHECK: br i1 %[[V_B]], label %[[LOOP_BEGIN_B]], label %[[LOOP_EXIT_B:.*]] ; -; CHECK: loop_exit.split.split.us: -; CHECK-NEXT: br label %loop_exit.split +; CHECK: [[LOOP_EXIT_B]]: +; CHECK-NEXT: br label %loop_exit loop_c: call i32 @c() @@ -3008,10 +3029,10 @@ loop_c: ; ; CHECK: [[LOOP_LATCH_C]]: ; CHECK-NEXT: %[[V_C:.*]] = load i1, ptr %ptr -; CHECK: br i1 %[[V_C]], label %loop_begin.us6, label %loop_exit.split.split.split.us, !llvm.loop !28 +; CHECK: br i1 %[[V_C]], label %[[LOOP_BEGIN_C]], label %[[LOOP_EXIT_C:.*]] ; -; CHECK: loop_exit.split.split.split.us: -; CHECK-NEXT: br label %loop_exit.split.split +; CHECK: [[LOOP_EXIT_C]]: +; CHECK-NEXT: br label %loop_exit latch: %v = load i1, ptr %ptr @@ -3111,9 +3132,9 @@ body.a: ; ; CHECK: [[LATCH_A]]: ; CHECK-NEXT: %[[CMP2_A:.*]] = icmp slt i32 %[[TMP_C_SUM_A]], 42 -; CHECK: br i1 %[[CMP2_A]], label %header.us, label %exit.split.us, !llvm.loop !29 +; CHECK: br i1 %[[CMP2_A]], label %[[HEADER_A]], label %[[LOOP_EXIT_A:.*]] ; -; CHECK: exit.split.us: +; CHECK: [[LOOP_EXIT_A]]: ; CHECK-NEXT: %[[LCSSA_A:.*]] = phi i32 [ %[[TMP_C_SUM_A]], %[[LATCH_A]] ] ; CHECK-NEXT: br label %exit @@ -3155,9 +3176,9 @@ body.b: ; ; CHECK: [[LATCH_B]]: ; CHECK-NEXT: %[[CMP2_B:.*]] = icmp slt i32 %[[TMP_C_SUM_B]], 42 -; CHECK: br i1 %[[CMP2_B]], label 
%header.us2, label %exit.split.split.us, !llvm.loop !30 +; CHECK: br i1 %[[CMP2_B]], label %[[HEADER_B]], label %[[LOOP_EXIT_B:.*]] ; -; CHECK: exit.split.split.us: +; CHECK: [[LOOP_EXIT_B]]: ; CHECK-NEXT: %[[LCSSA_B:.*]] = phi i32 [ %[[TMP_C_SUM_B]], %[[LATCH_B]] ] ; CHECK-NEXT: br label %[[EXIT_SPLIT:.*]] @@ -3213,11 +3234,11 @@ exit: %lcssa.phi = phi i32 [ %tmp.c.sum, %latch ] ret i32 %lcssa.phi ; CHECK: [[EXIT_SPLIT]]: -; CHECK-NEXT: %[[EXIT_PHI1:.*]] = phi i32 [ %[[LCSSA_C]], %[[LOOP_EXIT_C]] ], [ %[[LCSSA_B]], %exit.split.split.us ] +; CHECK-NEXT: %[[EXIT_PHI1:.*]] = phi i32 [ %[[LCSSA_C]], %[[LOOP_EXIT_C]] ], [ %[[LCSSA_B]], %[[LOOP_EXIT_B]] ] ; CHECK-NEXT: br label %exit ; CHECK: exit: -; CHECK-NEXT: %[[EXIT_PHI2:.*]] = phi i32 [ %[[EXIT_PHI1]], %[[EXIT_SPLIT]] ], [ %[[LCSSA_A]], %exit.split.us ] +; CHECK-NEXT: %[[EXIT_PHI2:.*]] = phi i32 [ %[[EXIT_PHI1]], %[[EXIT_SPLIT]] ], [ %[[LCSSA_A]], %[[LOOP_EXIT_A]] ] ; CHECK-NEXT: ret i32 %[[EXIT_PHI2]] } @@ -3283,9 +3304,9 @@ body.a: ; ; CHECK: [[LATCH_A]]: ; CHECK-NEXT: %[[CMP2_A:.*]] = icmp slt i32 %[[TMP_B_SUM_A]], 42 -; CHECK: br i1 %[[CMP2_A]], label %header.us, label %loop.exit2.split.us, !llvm.loop !31 +; CHECK: br i1 %[[CMP2_A]], label %[[HEADER_A]], label %[[LOOP_EXIT_A:.*]] ; -; CHECK: loop.exit2.split.us: +; CHECK: [[LOOP_EXIT_A]]: ; CHECK-NEXT: %[[LCSSA_A:.*]] = phi i32 [ %[[TMP_B_SUM_A]], %[[LATCH_A]] ] ; CHECK-NEXT: br label %loop.exit2 @@ -3321,9 +3342,9 @@ body.b: ; ; CHECK: [[LATCH_B]]: ; CHECK-NEXT: %[[CMP2_B:.*]] = icmp slt i32 %[[TMP_B_SUM_B]], 42 -; CHECK: br i1 %[[CMP2_B]], label %header.us2, label %loop.exit2.split.split.us, !llvm.loop !32 +; CHECK: br i1 %[[CMP2_B]], label %[[HEADER_B]], label %[[LOOP_EXIT_B:.*]] ; -; CHECK: loop.exit2.split.split.us: +; CHECK: [[LOOP_EXIT_B]]: ; CHECK-NEXT: %[[LCSSA_B:.*]] = phi i32 [ %[[TMP_B_SUM_B]], %[[LATCH_B]] ] ; CHECK-NEXT: br label %[[LOOP_EXIT2_SPLIT:.*]] @@ -3376,11 +3397,11 @@ loop.exit2: %l2.phi = phi i32 [ %tmp.b.sum, %latch ] br label %exit ; 
CHECK: [[LOOP_EXIT2_SPLIT]]: -; CHECK-NEXT: %[[LOOP_EXIT_PHI1:.*]] = phi i32 [ %[[L2_PHI]], %[[LOOP_EXIT_EXIT]] ], [ %[[LCSSA_B]], %loop.exit2.split.split.us ] +; CHECK-NEXT: %[[LOOP_EXIT_PHI1:.*]] = phi i32 [ %[[L2_PHI]], %[[LOOP_EXIT_EXIT]] ], [ %[[LCSSA_B]], %[[LOOP_EXIT_B]] ] ; CHECK-NEXT: br label %loop.exit2 ; ; CHECK: loop.exit2: -; CHECK-NEXT: %[[LOOP_EXIT_PHI2:.*]] = phi i32 [ %[[LOOP_EXIT_PHI1]], %[[LOOP_EXIT2_SPLIT]] ], [ %[[LCSSA_A]], %loop.exit2.split.us ] +; CHECK-NEXT: %[[LOOP_EXIT_PHI2:.*]] = phi i32 [ %[[LOOP_EXIT_PHI1]], %[[LOOP_EXIT2_SPLIT]] ], [ %[[LCSSA_A]], %[[LOOP_EXIT_A]] ] ; CHECK-NEXT: br label %exit exit: @@ -4037,7 +4058,9 @@ entry: ; CHECK-NEXT: ] ; ; CHECK: [[ENTRY_SPLIT_US]]: -; CHECK-NEXT: br label %outer.header.us +; CHECK-NEXT: switch i32 %arg, label %[[ENTRY_SPLIT_US_SPLIT:.*]] [ +; CHECK-NEXT: i32 1, label %[[ENTRY_SPLIT_US_SPLIT_US:.*]] +; CHECK-NEXT: ] outer.header: br label %inner.header @@ -4051,13 +4074,66 @@ inner.header: inner.body1: %a = call i32 @a() br label %inner.latch +; The (super convoluted) fully unswitched loop around `@a`. 
+; +; CHECK: [[ENTRY_SPLIT_US_SPLIT_US]]: +; CHECK-NEXT: br label %[[OUTER_HEADER_US_US:.*]] +; +; CHECK: [[OUTER_HEADER_US_US]]: +; CHECK-NEXT: br label %[[OUTER_HEADER_SPLIT_US_US:.*]] +; +; CHECK: [[OUTER_LATCH_US_US:.*]]: +; CHECK-NEXT: %[[OUTER_COND_US_US:.*]] = call i1 @cond() +; CHECK-NEXT: br i1 %[[OUTER_COND_US_US]], label %[[OUTER_HEADER_US_US]], label %[[EXIT_SPLIT_US_SPLIT_US:.*]] +; +; CHECK: [[OUTER_HEADER_SPLIT_US_US]]: +; CHECK-NEXT: br label %[[OUTER_HEADER_SPLIT_SPLIT_US_US_US:.*]] +; +; CHECK: [[INNER_LOOPEXIT2_US_US:.*]]: +; CHECK-NEXT: br label %[[OUTER_LATCH_US_US]] +; +; CHECK: [[OUTER_HEADER_SPLIT_SPLIT_US_US_US]]: +; CHECK-NEXT: br label %[[INNER_HEADER_US_US_US:.*]] +; +; CHECK: [[INNER_HEADER_US_US_US]]: +; CHECK-NEXT: br label %[[INNER_BODY1_US_US_US:.*]] +; +; CHECK: [[INNER_BODY1_US_US_US]]: +; CHECK-NEXT: %[[A:.*]] = call i32 @a() +; CHECK-NEXT: br label %[[INNER_LATCH_US_US_US:.*]] +; +; CHECK: [[INNER_LATCH_US_US_US]]: +; CHECK-NEXT: %[[PHI_A:.*]] = phi i32 [ %[[A]], %[[INNER_BODY1_US_US_US]] ] +; CHECK-NEXT: call void @sink1(i32 0) +; CHECK-NEXT: call void @sink1(i32 0) +; CHECK-NEXT: call void @sink1(i32 0) +; CHECK-NEXT: call void @sink1(i32 0) +; CHECK-NEXT: call void @sink1(i32 0) +; CHECK-NEXT: call void @sink1(i32 0) +; CHECK-NEXT: call void @sink1(i32 0) +; CHECK-NEXT: call void @sink1(i32 0) +; CHECK-NEXT: call void @sink1(i32 0) +; CHECK-NEXT: call void @sink1(i32 0) +; CHECK-NEXT: call void @sink1(i32 %[[PHI_A]]) +; CHECK-NEXT: %[[INNER_COND_US_US_US:.*]] = call i1 @cond() +; CHECK-NEXT: br i1 %[[INNER_COND_US_US_US]], label %[[INNER_HEADER_US_US_US]], label %[[INNER_LOOPEXIT2_SPLIT_US_US_US:.*]] +; +; CHECK: [[INNER_LOOPEXIT2_SPLIT_US_US_US]]: +; CHECK-NEXT: br label %[[INNER_LOOPEXIT2_US_US]] +; +; CHECK: [[EXIT_SPLIT_US_SPLIT_US]]: +; CHECK-NEXT: br label %[[EXIT_SPLIT_US:.*]] + inner.body2: %b = call i32 @b() br label %inner.latch ; The fully unswitched loop around `@b`. 
; -; CHECK: outer.header.us: +; CHECK: [[ENTRY_SPLIT_US_SPLIT]]: +; CHECK-NEXT: br label %[[OUTER_HEADER_US:.*]] +; +; CHECK: [[OUTER_HEADER_US]]: ; CHECK-NEXT: br label %[[OUTER_HEADER_SPLIT_US:.*]] ; ; CHECK: [[INNER_HEADER_US:.*]]: @@ -4087,51 +4163,18 @@ inner.body2: ; ; CHECK: [[OUTER_LATCH_US:.*]]: ; CHECK-NEXT: %[[OUTER_COND_US:.*]] = call i1 @cond() -; CHECK-NEXT: br i1 %[[OUTER_COND_US]], label %outer.header.us, label %exit.split.us, !llvm.loop !33 +; CHECK-NEXT: br i1 %[[OUTER_COND_US]], label %[[OUTER_HEADER_US]], label %[[EXIT_SPLIT_US_SPLIT:.*]] ; ; CHECK: [[OUTER_HEADER_SPLIT_US]]: -; CHECK-NEXT: switch i32 %arg, label %outer.header.split.split.us5 [ -; CHECK-NEXT: i32 1, label %outer.header.split.split.us.us -; CHECK-NEXT: ] +; CHECK-NEXT: br label %[[OUTER_HEADER_SPLIT_SPLIT_US:.*]] ; -; CHECK: outer.header.split.split.us5: +; CHECK: [[OUTER_HEADER_SPLIT_SPLIT_US]]: ; CHECK-NEXT: br label %[[INNER_HEADER_US]] ; ; CHECK: [[INNER_LOOPEXIT2_US]]: ; CHECK-NEXT: br label %[[OUTER_LATCH_US]] - -; The (super convoluted) fully unswitched loop around `@a`. 
; -; CHECK: outer.header.split.split.us.us: -; CHECK-NEXT: br label %[[INNER_HEADER_US_US:.*]] -; -; CHECK: [[INNER_HEADER_US_US]]: -; CHECK-NEXT: br label %[[INNER_BODY1_US_US:.*]] -; -; CHECK: [[INNER_BODY1_US_US]]: -; CHECK-NEXT: %[[A:.*]] = call i32 @a() -; CHECK-NEXT: br label %[[INNER_LATCH_US_US:.*]] -; -; CHECK: [[INNER_LATCH_US_US]]: -; CHECK-NEXT: %[[PHI_A:.*]] = phi i32 [ %[[A]], %[[INNER_BODY1_US_US]] ] -; CHECK-NEXT: call void @sink1(i32 0) -; CHECK-NEXT: call void @sink1(i32 0) -; CHECK-NEXT: call void @sink1(i32 0) -; CHECK-NEXT: call void @sink1(i32 0) -; CHECK-NEXT: call void @sink1(i32 0) -; CHECK-NEXT: call void @sink1(i32 0) -; CHECK-NEXT: call void @sink1(i32 0) -; CHECK-NEXT: call void @sink1(i32 0) -; CHECK-NEXT: call void @sink1(i32 0) -; CHECK-NEXT: call void @sink1(i32 0) -; CHECK-NEXT: call void @sink1(i32 %[[PHI_A]]) -; CHECK-NEXT: %[[INNER_COND_US_US:.*]] = call i1 @cond() -; CHECK-NEXT: br i1 %[[INNER_COND_US_US]], label %[[INNER_HEADER_US_US]], label %[[INNER_LOOPEXIT2_SPLIT_US_US:.*]], !llvm.loop !34 -; -; CHECK: [[INNER_LOOPEXIT2_SPLIT_US_US]]: -; CHECK-NEXT: br label %[[INNER_LOOPEXIT2_US]] -; -; CHECK: exit.split.us: +; CHECK: [[EXIT_SPLIT_US]]: ; CHECK-NEXT: br label %exit inner.latch: diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/partial-unswitch-loop-and-block-dispositions.ll b/llvm/test/Transforms/SimpleLoopUnswitch/partial-unswitch-loop-and-block-dispositions.ll index e821dfcd0124..a169aa47ea7d 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/partial-unswitch-loop-and-block-dispositions.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/partial-unswitch-loop-and-block-dispositions.ll @@ -11,43 +11,59 @@ define void @test_pr58564(i16 %a, i1 %c.1, ptr %dst) { ; CHECK-NEXT: [[TMP0:%.*]] = icmp ult i16 [[A:%.*]], -6 ; CHECK-NEXT: br i1 [[TMP0]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]] ; CHECK: entry.split.us: +; CHECK-NEXT: br i1 [[C_1:%.*]], label [[ENTRY_SPLIT_US_SPLIT_US:%.*]], label 
[[ENTRY_SPLIT_US_SPLIT:%.*]] +; CHECK: entry.split.us.split.us: +; CHECK-NEXT: br label [[LOOP_1_HEADER_US_US:%.*]] +; CHECK: loop.1.header.us.us: +; CHECK-NEXT: br label [[LOOP_1_HEADER_SPLIT_US_US_US:%.*]] +; CHECK: loop.1.header.split.us.us.us: +; CHECK-NEXT: br label [[LOOP_1_HEADER_SPLIT_US_SPLIT_US_SPLIT_US_SPLIT_US:%.*]] +; CHECK: loop.1.header.split.us.split.us.split.us.split.us: +; CHECK-NEXT: br label [[LOOP_1_HEADER_SPLIT_US_SPLIT_US_SPLIT_US:%.*]] +; CHECK: entry.split.us.split: ; CHECK-NEXT: br label [[LOOP_1_HEADER_US:%.*]] ; CHECK: loop.1.header.us: ; CHECK-NEXT: br label [[LOOP_1_HEADER_SPLIT_US_US:%.*]] -; CHECK: loop.4.header.us2: +; CHECK: loop.4.header.us5: ; CHECK-NEXT: br label [[LOOP_5_US6:%.*]] -; CHECK: loop.5.us3: +; CHECK: loop.5.us6: ; CHECK-NEXT: [[IV_US7:%.*]] = phi i16 [ 0, [[LOOP_4_HEADER_US5:%.*]] ], [ [[IV_NEXT_US9:%.*]], [[LOOP_5_US6]] ] ; CHECK-NEXT: [[GEP_US8:%.*]] = getelementptr inbounds ptr, ptr [[DST:%.*]], i16 [[IV_US7]] ; CHECK-NEXT: store ptr null, ptr [[GEP_US8]], align 8 ; CHECK-NEXT: [[IV_NEXT_US9]] = add nuw nsw i16 [[IV_US7]], 1 ; CHECK-NEXT: [[EC_US10:%.*]] = icmp ne i16 [[IV_US7]], 10000 -; CHECK-NEXT: br i1 [[EC_US10]], label [[LOOP_5_US6]], label [[LOOP_4_LATCH_US8:%.*]], !llvm.loop [[LOOP0:![0-9]+]] -; CHECK: loop.4.latch.us8: +; CHECK-NEXT: br i1 [[EC_US10]], label [[LOOP_5_US6]], label [[LOOP_4_LATCH_US11:%.*]] +; CHECK: loop.4.latch.us11: ; CHECK-NEXT: br label [[LOOP_1_LATCH_US:%.*]] ; CHECK: loop.1.latch.us: -; CHECK-NEXT: br label [[LOOP_1_HEADER_US]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: br label [[LOOP_1_HEADER_US]] ; CHECK: loop.4.header.preheader.us: -; CHECK-NEXT: br i1 [[C_1:%.*]], label [[LOOP_4_HEADER_PREHEADER_SPLIT1_US_SPLIT_US:%.*]], label [[LOOP_4_HEADER_PREHEADER_SPLIT1_US9:%.*]] +; CHECK-NEXT: br i1 false, label [[LOOP_4_HEADER_PREHEADER_SPLIT4_US_SPLIT_US:%.*]], label [[LOOP_4_HEADER_PREHEADER_SPLIT4_US15:%.*]] ; CHECK: loop.1.header.split.us.us: ; CHECK-NEXT: br label 
[[LOOP_1_HEADER_SPLIT_US_SPLIT_US14:%.*]] -; CHECK: loop.2.header.us.us: +; CHECK: loop.2.header.us.us12: ; CHECK-NEXT: br label [[LOOP_2_HEADER_SPLIT_US_US_US13:%.*]] ; CHECK: loop.2.latch.us.us: -; CHECK-NEXT: br i1 [[C_1]], label [[LOOP_1_HEADER_SPLIT_US_SPLIT_US14]], label [[LOOP_4_HEADER_PREHEADER_SPLIT_US_US:%.*]], !llvm.loop [[LOOP3:![0-9]+]] -; CHECK: loop.2.header.split.us.us.us: +; CHECK-NEXT: br i1 false, label [[LOOP_2_HEADER_US_US12:%.*]], label [[LOOP_4_HEADER_PREHEADER_SPLIT_US_US:%.*]] +; CHECK: loop.2.header.split.us.us.us13: +; CHECK-NEXT: br label [[LOOP_2_HEADER_SPLIT_US_SPLIT_US3_US:%.*]] +; CHECK: loop.3.header.us.us1.us: ; CHECK-NEXT: br label [[LOOP_3_LATCH_US_US2_US:%.*]] -; CHECK: loop.3.header.us.us.us: +; CHECK: loop.3.latch.us.us2.us: ; CHECK-NEXT: br label [[LOOP_2_LATCH_SPLIT_US_US_US:%.*]] -; CHECK: loop.3.latch.us.us.us: -; CHECK-NEXT: br i1 [[C_1]], label [[LOOP_3_LATCH_US_US2_US]], label [[LOOP_2_LATCH_SPLIT_US_US_US1:%.*]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: loop.2.latch.split.us.us.us: +; CHECK-NEXT: br label [[LOOP_2_LATCH_US_US:%.*]] +; CHECK: loop.2.header.split.us.split.us3.us: ; CHECK-NEXT: br label [[LOOP_3_HEADER_US_US1_US:%.*]] ; CHECK: loop.4.header.preheader.split.us.us: -; CHECK-NEXT: br label [[LOOP_2_HEADER_US_US12:%.*]] -; CHECK: loop.4.header.preheader.split1.us9: +; CHECK-NEXT: br label [[LOOP_4_HEADER_PREHEADER_US:%.*]] +; CHECK: loop.1.header.split.us.split.us14: +; CHECK-NEXT: br label [[LOOP_2_HEADER_US_US12]] +; CHECK: loop.4.header.preheader.split4.us15: ; CHECK-NEXT: br label [[LOOP_4_HEADER_US5]] -; CHECK: loop.4.header.preheader.split1.us.split.us: +; CHECK: loop.4.header.preheader.split4.us.split.us: +; CHECK-NEXT: br label [[LOOP_4_HEADER_PREHEADER_SPLIT4_US:%.*]] +; CHECK: loop.1.header.split.us.split.us.split.us: ; CHECK-NEXT: br label [[LOOP_1_HEADER_SPLIT_US_SPLIT_US:%.*]] ; CHECK: entry.split: ; CHECK-NEXT: br label [[LOOP_1_HEADER:%.*]] @@ -55,20 +71,36 @@ define void @test_pr58564(i16 %a, i1 
%c.1, ptr %dst) { ; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i16 [[A]], -6 ; CHECK-NEXT: br i1 [[TMP1]], label [[LOOP_1_HEADER_SPLIT_US:%.*]], label [[LOOP_1_HEADER_SPLIT:%.*]] ; CHECK: loop.1.header.split.us: +; CHECK-NEXT: br i1 [[C_1]], label [[LOOP_1_HEADER_SPLIT_US_SPLIT_US_SPLIT:%.*]], label [[LOOP_1_HEADER_SPLIT_US_SPLIT:%.*]] +; CHECK: loop.1.header.split.us.split.us.split: +; CHECK-NEXT: br label [[LOOP_1_HEADER_SPLIT_US_SPLIT_US]] +; CHECK: loop.1.header.split.us.split.us: +; CHECK-NEXT: br label [[LOOP_2_HEADER_US_US:%.*]] +; CHECK: loop.2.header.us.us: +; CHECK-NEXT: br label [[LOOP_2_HEADER_SPLIT_US_US_US:%.*]] +; CHECK: loop.2.header.split.us.us.us: +; CHECK-NEXT: br label [[LOOP_2_HEADER_SPLIT_US_SPLIT_US_SPLIT_US_SPLIT_US:%.*]] +; CHECK: loop.2.header.split.us.split.us.split.us.split.us: +; CHECK-NEXT: br label [[LOOP_2_HEADER_SPLIT_US_SPLIT_US_SPLIT_US:%.*]] +; CHECK: loop.1.header.split.us.split: ; CHECK-NEXT: br label [[LOOP_2_HEADER_US:%.*]] ; CHECK: loop.2.header.us: ; CHECK-NEXT: br label [[LOOP_2_HEADER_SPLIT_US_US:%.*]] ; CHECK: loop.2.latch.us: -; CHECK-NEXT: br i1 [[C_1]], label [[LOOP_2_HEADER_US]], label [[LOOP_4_HEADER_PREHEADER_SPLIT_US:%.*]], !llvm.loop [[LOOP3]] +; CHECK-NEXT: br i1 false, label [[LOOP_2_HEADER_US]], label [[LOOP_4_HEADER_PREHEADER_SPLIT_US:%.*]] ; CHECK: loop.2.header.split.us.us: +; CHECK-NEXT: br label [[LOOP_2_HEADER_SPLIT_US_SPLIT_US3:%.*]] +; CHECK: loop.3.header.us.us1: ; CHECK-NEXT: br label [[LOOP_3_LATCH_US_US2:%.*]] -; CHECK: loop.3.header.us.us: +; CHECK: loop.3.latch.us.us2: ; CHECK-NEXT: br label [[LOOP_2_LATCH_SPLIT_US_US:%.*]] -; CHECK: loop.3.latch.us.us: -; CHECK-NEXT: br i1 [[C_1]], label [[LOOP_3_LATCH_US_US2]], label [[LOOP_2_LATCH_SPLIT_US_US1:%.*]], !llvm.loop [[LOOP4]] ; CHECK: loop.2.latch.split.us.us: +; CHECK-NEXT: br label [[LOOP_2_LATCH_US:%.*]] +; CHECK: loop.2.header.split.us.split.us3: ; CHECK-NEXT: br label [[LOOP_3_HEADER_US_US1:%.*]] ; CHECK: loop.4.header.preheader.split.us: +; 
CHECK-NEXT: br label [[LOOP_4_HEADER_PREHEADER:%.*]] +; CHECK: loop.2.header.split.us.split.us.split.us: ; CHECK-NEXT: br label [[LOOP_2_HEADER_SPLIT_US_SPLIT_US:%.*]] ; CHECK: loop.1.header.split: ; CHECK-NEXT: br label [[LOOP_2_HEADER:%.*]] @@ -76,11 +108,21 @@ define void @test_pr58564(i16 %a, i1 %c.1, ptr %dst) { ; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i16 [[A]], -6 ; CHECK-NEXT: br i1 [[TMP2]], label [[LOOP_2_HEADER_SPLIT_US:%.*]], label [[LOOP_2_HEADER_SPLIT:%.*]] ; CHECK: loop.2.header.split.us: +; CHECK-NEXT: br i1 [[C_1]], label [[LOOP_2_HEADER_SPLIT_US_SPLIT_US_SPLIT:%.*]], label [[LOOP_2_HEADER_SPLIT_US_SPLIT:%.*]] +; CHECK: loop.2.header.split.us.split.us.split: +; CHECK-NEXT: br label [[LOOP_2_HEADER_SPLIT_US_SPLIT_US]] +; CHECK: loop.2.header.split.us.split.us: +; CHECK-NEXT: br label [[LOOP_3_HEADER_US_US:%.*]] +; CHECK: loop.3.header.us.us: +; CHECK-NEXT: br label [[LOOP_3_LATCH_US_US:%.*]] +; CHECK: loop.3.latch.us.us: +; CHECK-NEXT: br label [[LOOP_3_HEADER_US_US]] +; CHECK: loop.2.header.split.us.split: ; CHECK-NEXT: br label [[LOOP_3_HEADER_US:%.*]] ; CHECK: loop.3.header.us: ; CHECK-NEXT: br label [[LOOP_3_LATCH_US:%.*]] ; CHECK: loop.3.latch.us: -; CHECK-NEXT: br i1 [[C_1]], label [[LOOP_3_HEADER_US]], label [[LOOP_2_LATCH_SPLIT_US:%.*]], !llvm.loop [[LOOP4]] +; CHECK-NEXT: br label [[LOOP_2_LATCH_SPLIT_US:%.*]] ; CHECK: loop.2.latch.split.us: ; CHECK-NEXT: br label [[LOOP_2_LATCH:%.*]] ; CHECK: loop.2.header.split: @@ -92,18 +134,18 @@ define void @test_pr58564(i16 %a, i1 %c.1, ptr %dst) { ; CHECK-NEXT: call void @clobber() ; CHECK-NEXT: br label [[LOOP_3_LATCH]] ; CHECK: loop.3.latch: -; CHECK-NEXT: br i1 [[C_1]], label [[LOOP_3_HEADER]], label [[LOOP_2_LATCH_SPLIT:%.*]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[C_1]], label [[LOOP_3_HEADER]], label [[LOOP_2_LATCH_SPLIT:%.*]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: loop.2.latch.split: ; CHECK-NEXT: br label [[LOOP_2_LATCH]] ; CHECK: loop.2.latch: -; CHECK-NEXT: br i1 [[C_1]], label 
[[LOOP_2_HEADER]], label [[LOOP_4_HEADER_PREHEADER_SPLIT:%.*]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[C_1]], label [[LOOP_2_HEADER]], label [[LOOP_4_HEADER_PREHEADER_SPLIT:%.*]], !llvm.loop [[LOOP2:![0-9]+]] ; CHECK: loop.4.header.preheader.split: -; CHECK-NEXT: br label [[LOOP_2_HEADER_SPLIT_US_SPLIT_US]] +; CHECK-NEXT: br label [[LOOP_4_HEADER_PREHEADER]] ; CHECK: loop.4.header.preheader: ; CHECK-NEXT: br i1 [[C_1]], label [[LOOP_4_HEADER_PREHEADER_SPLIT4_US_SPLIT:%.*]], label [[LOOP_4_HEADER_PREHEADER_SPLIT4:%.*]] -; CHECK: loop.4.header.preheader.split1.us.split: -; CHECK-NEXT: br label [[LOOP_1_HEADER_SPLIT_US_SPLIT_US]] -; CHECK: loop.4.header.preheader.split1.us: +; CHECK: loop.4.header.preheader.split4.us.split: +; CHECK-NEXT: br label [[LOOP_4_HEADER_PREHEADER_SPLIT4_US]] +; CHECK: loop.4.header.preheader.split4.us: ; CHECK-NEXT: br label [[LOOP_4_HEADER_US:%.*]] ; CHECK: loop.4.header.us: ; CHECK-NEXT: br label [[LOOP_5_US:%.*]] @@ -116,7 +158,7 @@ define void @test_pr58564(i16 %a, i1 %c.1, ptr %dst) { ; CHECK-NEXT: br i1 [[EC_US]], label [[LOOP_5_US]], label [[LOOP_4_LATCH_US:%.*]] ; CHECK: loop.4.latch.us: ; CHECK-NEXT: br label [[LOOP_4_HEADER_US]] -; CHECK: loop.4.header.preheader.split1: +; CHECK: loop.4.header.preheader.split4: ; CHECK-NEXT: br label [[LOOP_4_HEADER:%.*]] ; CHECK: loop.4.header: ; CHECK-NEXT: br label [[LOOP_5:%.*]] @@ -126,11 +168,11 @@ define void @test_pr58564(i16 %a, i1 %c.1, ptr %dst) { ; CHECK-NEXT: store ptr null, ptr [[GEP]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i16 [[IV]], 1 ; CHECK-NEXT: [[EC:%.*]] = icmp ne i16 [[IV]], 10000 -; CHECK-NEXT: br i1 [[EC]], label [[LOOP_5]], label [[LOOP_4_LATCH:%.*]], !llvm.loop [[LOOP0]] +; CHECK-NEXT: br i1 [[EC]], label [[LOOP_5]], label [[LOOP_4_LATCH:%.*]] ; CHECK: loop.4.latch: ; CHECK-NEXT: br label [[LOOP_1_LATCH:%.*]] ; CHECK: loop.1.latch: -; CHECK-NEXT: br label [[LOOP_1_HEADER]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br label [[LOOP_1_HEADER]], 
!llvm.loop [[LOOP3:![0-9]+]] ; entry: br label %loop.1.header diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/partial-unswitch.ll b/llvm/test/Transforms/SimpleLoopUnswitch/partial-unswitch.ll index 108b2406920f..1d8942079ffd 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/partial-unswitch.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/partial-unswitch.ll @@ -19,7 +19,7 @@ define i32 @partial_unswitch_true_successor(ptr %ptr, i32 %N) { ; CHECK: loop.latch.us: ; CHECK-NEXT: [[C_US:%.*]] = icmp ult i32 [[IV_US]], [[N:%.*]] ; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1 -; CHECK-NEXT: br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_SPLIT_US:%.*]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_SPLIT_US:%.*]] ; CHECK: exit.split.us: ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: entry.split: @@ -37,7 +37,7 @@ define i32 @partial_unswitch_true_successor(ptr %ptr, i32 %N) { ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -84,7 +84,7 @@ define i32 @partial_unswitch_false_successor(ptr %ptr, i32 %N) { ; CHECK: loop.latch.us: ; CHECK-NEXT: [[C_US:%.*]] = icmp ult i32 [[IV_US]], [[N:%.*]] ; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1 -; CHECK-NEXT: br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_SPLIT_US:%.*]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_SPLIT_US:%.*]] ; CHECK: exit.split.us: ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: entry.split: @@ -102,7 +102,7 @@ define i32 @partial_unswitch_false_successor(ptr %ptr, i32 %N) { ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult 
i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP2:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -151,7 +151,7 @@ define i32 @partial_unswtich_gep_load_icmp(ptr %ptr, i32 %N) { ; CHECK: loop.latch.us: ; CHECK-NEXT: [[C_US:%.*]] = icmp ult i32 [[IV_US]], [[N:%.*]] ; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1 -; CHECK-NEXT: br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_SPLIT_US:%.*]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_SPLIT_US:%.*]] ; CHECK: exit.split.us: ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: entry.split: @@ -171,7 +171,7 @@ define i32 @partial_unswtich_gep_load_icmp(ptr %ptr, i32 %N) { ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -223,7 +223,7 @@ define i32 @partial_unswitch_reduction_phi(ptr %ptr, i32 %N) { ; CHECK-NEXT: [[RED_NEXT_US]] = phi i32 [ [[ADD_10_US]], [[NOCLOBBER_US]] ] ; CHECK-NEXT: [[C_US:%.*]] = icmp ult i32 [[IV_US]], [[N:%.*]] ; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1 -; CHECK-NEXT: br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_SPLIT_US:%.*]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_SPLIT_US:%.*]] ; CHECK: exit.split.us: ; CHECK-NEXT: [[RED_NEXT_LCSSA_US:%.*]] = phi i32 [ [[RED_NEXT_US]], [[LOOP_LATCH_US]] ] ; CHECK-NEXT: br label [[EXIT:%.*]] @@ -246,7 +246,7 @@ define i32 
@partial_unswitch_reduction_phi(ptr %ptr, i32 %N) { ; CHECK-NEXT: [[RED_NEXT]] = phi i32 [ [[ADD_5]], [[CLOBBER]] ], [ [[ADD_10]], [[NOCLOBBER]] ] ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_NEXT]], [[LOOP_LATCH]] ] ; CHECK-NEXT: br label [[EXIT]] @@ -305,7 +305,7 @@ define i32 @partial_unswitch_true_successor_noclobber(ptr noalias %ptr.1, ptr no ; CHECK: loop.latch.us: ; CHECK-NEXT: [[C_US:%.*]] = icmp ult i32 [[IV_US]], [[N:%.*]] ; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1 -; CHECK-NEXT: br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_SPLIT_US:%.*]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_SPLIT_US:%.*]] ; CHECK: exit.split.us: ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: entry.split: @@ -325,7 +325,7 @@ define i32 @partial_unswitch_true_successor_noclobber(ptr noalias %ptr.1, ptr no ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -619,7 +619,7 @@ define i32 @partial_unswitch_true_successor_preheader_insertion(ptr %ptr, i32 %N ; CHECK: loop.latch.us: ; CHECK-NEXT: [[C_US:%.*]] = icmp ult i32 [[IV_US]], [[N:%.*]] ; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1 -; CHECK-NEXT: br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_LOOPEXIT_SPLIT_US:%.*]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: br 
i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_LOOPEXIT_SPLIT_US:%.*]] ; CHECK: exit.loopexit.split.us: ; CHECK-NEXT: br label [[EXIT_LOOPEXIT:%.*]] ; CHECK: loop.ph.split: @@ -637,7 +637,7 @@ define i32 @partial_unswitch_true_successor_preheader_insertion(ptr %ptr, i32 %N ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_LOOPEXIT_SPLIT:%.*]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_LOOPEXIT_SPLIT:%.*]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: exit.loopexit.split: ; CHECK-NEXT: br label [[EXIT_LOOPEXIT]] ; CHECK: exit.loopexit: @@ -695,7 +695,7 @@ define i32 @partial_unswitch_true_successor_insert_point(ptr %ptr, i32 %N) { ; CHECK: loop.latch.us: ; CHECK-NEXT: [[C_US:%.*]] = icmp ult i32 [[IV_US]], [[N:%.*]] ; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1 -; CHECK-NEXT: br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_SPLIT_US:%.*]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEXT: br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_SPLIT_US:%.*]] ; CHECK: exit.split.us: ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: entry.split: @@ -713,7 +713,7 @@ define i32 @partial_unswitch_true_successor_insert_point(ptr %ptr, i32 %N) { ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -765,7 +765,7 @@ define i32 @partial_unswitch_true_successor_hoist_invariant(ptr %ptr, i32 %N) { ; CHECK: loop.latch.us: ; CHECK-NEXT: [[C_US:%.*]] = icmp ult i32 [[IV_US]], [[N:%.*]] ; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1 -; CHECK-NEXT: br 
i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_SPLIT_US:%.*]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-NEXT: br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_SPLIT_US:%.*]] ; CHECK: exit.split.us: ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: entry.split: @@ -784,7 +784,7 @@ define i32 @partial_unswitch_true_successor_hoist_invariant(ptr %ptr, i32 %N) { ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -1057,7 +1057,7 @@ define i32 @partial_unswitch_true_to_latch(ptr %ptr, i32 %N) { ; CHECK: loop.latch.us: ; CHECK-NEXT: [[C_US:%.*]] = icmp ult i32 [[IV_US]], [[N:%.*]] ; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1 -; CHECK-NEXT: br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_SPLIT_US:%.*]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-NEXT: br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_SPLIT_US:%.*]] ; CHECK: exit.split.us: ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: entry.split: @@ -1073,7 +1073,7 @@ define i32 @partial_unswitch_true_to_latch(ptr %ptr, i32 %N) { ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP19:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -1112,11 +1112,19 @@ define i32 @partial_unswitch_exiting_block_with_multiple_unswitch_candidates(i32 ; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i32 [[TMP2]], 41 ; CHECK-NEXT: br i1 [[TMP3]], label [[ENTRY_SPLIT:%.*]], label 
[[ENTRY_SPLIT_US:%.*]] ; CHECK: entry.split.us: +; CHECK-NEXT: br i1 [[EXIT_COND]], label [[ENTRY_SPLIT_US_SPLIT_US:%.*]], label [[ENTRY_SPLIT_US_SPLIT:%.*]] +; CHECK: entry.split.us.split.us: +; CHECK-NEXT: br label [[LOOP_US_US:%.*]] +; CHECK: loop.us.us: +; CHECK-NEXT: br label [[EXITING_US_US:%.*]] +; CHECK: exiting.us.us: +; CHECK-NEXT: br label [[LOOP_US_US]] +; CHECK: entry.split.us.split: ; CHECK-NEXT: br label [[LOOP_US:%.*]] ; CHECK: loop.us: ; CHECK-NEXT: br label [[EXITING_US:%.*]] ; CHECK: exiting.us: -; CHECK-NEXT: br i1 [[EXIT_COND]], label [[LOOP_US]], label [[EXIT_SPLIT_US:%.*]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-NEXT: br label [[EXIT_SPLIT_US:%.*]] ; CHECK: exit.split.us: ; CHECK-NEXT: [[RET_VAL_US:%.*]] = phi i32 [ 1, [[EXITING_US]] ] ; CHECK-NEXT: br label [[EXIT:%.*]] @@ -1130,7 +1138,7 @@ define i32 @partial_unswitch_exiting_block_with_multiple_unswitch_candidates(i32 ; CHECK-NEXT: store i32 [[TMP1:%.*]], ptr [[PTR]], align 16 ; CHECK-NEXT: br label [[EXITING]] ; CHECK: exiting: -; CHECK-NEXT: br i1 [[EXIT_COND]], label [[LOOP]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP21:![0-9]+]] +; CHECK-NEXT: br i1 [[EXIT_COND]], label [[LOOP]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: [[RET_VAL:%.*]] = phi i32 [ 1, [[EXITING]] ] ; CHECK-NEXT: br label [[EXIT]] @@ -1177,7 +1185,7 @@ define i32 @partial_unswitch_true_successor_for_cost_calculation(ptr %ptr, i32 % ; CHECK: loop.latch.us: ; CHECK-NEXT: [[C_US:%.*]] = icmp ult i32 [[IV_US]], [[N:%.*]] ; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1 -; CHECK-NEXT: br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_SPLIT_US:%.*]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-NEXT: br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_SPLIT_US:%.*]] ; CHECK: exit.split.us: ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: entry.split: @@ -1241,7 +1249,7 @@ define i32 @partial_unswitch_true_successor_for_cost_calculation(ptr %ptr, i32 % ; CHECK: loop.latch: ; 
CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP23:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -1334,7 +1342,7 @@ define i32 @partial_unswitch_true_successor_trunc(ptr %ptr, i32 %N) { ; CHECK: loop.latch.us: ; CHECK-NEXT: [[C_US:%.*]] = icmp ult i32 [[IV_US]], [[N:%.*]] ; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1 -; CHECK-NEXT: br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_SPLIT_US:%.*]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK-NEXT: br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_SPLIT_US:%.*]] ; CHECK: exit.split.us: ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: entry.split: @@ -1352,7 +1360,7 @@ define i32 @partial_unswitch_true_successor_trunc(ptr %ptr, i32 %N) { ; CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP25:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -1399,7 +1407,7 @@ define i32 @partial_unswitch_false_successor_trunc(ptr %ptr, i32 %N) { ; CHECK: loop.latch.us: ; CHECK-NEXT: [[C_US:%.*]] = icmp ult i32 [[IV_US]], [[N:%.*]] ; CHECK-NEXT: [[IV_NEXT_US]] = add i32 [[IV_US]], 1 -; CHECK-NEXT: br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_SPLIT_US:%.*]], !llvm.loop [[LOOP26:![0-9]+]] +; CHECK-NEXT: br i1 [[C_US]], label [[LOOP_HEADER_US]], label [[EXIT_SPLIT_US:%.*]] ; CHECK: exit.split.us: ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: entry.split: @@ -1417,7 +1425,7 @@ define i32 @partial_unswitch_false_successor_trunc(ptr %ptr, i32 %N) { ; 
CHECK: loop.latch: ; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[IV]], [[N]] ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP27:![0-9]+]] +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER]], label [[EXIT_SPLIT:%.*]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: exit.split: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -1448,15 +1456,15 @@ exit: ret i32 10 } -; CHECK: [[LOOP2]] = distinct !{[[LOOP2]], [[UNSWITCH_PARTIAL_DISABLE:![0-9]+]]} +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[UNSWITCH_PARTIAL_DISABLE:![0-9]+]]} ; CHECK: [[UNSWITCH_PARTIAL_DISABLE]] = !{!"llvm.loop.unswitch.partial.disable"} +; CHECK: [[LOOP2]] = distinct !{[[LOOP2]], [[UNSWITCH_PARTIAL_DISABLE]]} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[UNSWITCH_PARTIAL_DISABLE]]} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[UNSWITCH_PARTIAL_DISABLE]]} ; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[UNSWITCH_PARTIAL_DISABLE]]} +; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[UNSWITCH_PARTIAL_DISABLE]]} ; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[UNSWITCH_PARTIAL_DISABLE]]} +; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[UNSWITCH_PARTIAL_DISABLE]]} ; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[UNSWITCH_PARTIAL_DISABLE]]} +; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[UNSWITCH_PARTIAL_DISABLE]]} ; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[UNSWITCH_PARTIAL_DISABLE]]} -; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[UNSWITCH_PARTIAL_DISABLE]]} -; CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[UNSWITCH_PARTIAL_DISABLE]]} -; CHECK: [[LOOP17]] = distinct !{[[LOOP17]], [[UNSWITCH_PARTIAL_DISABLE]]} -; CHECK: [[LOOP19]] = distinct !{[[LOOP19]], [[UNSWITCH_PARTIAL_DISABLE]]} -; CHECK: [[LOOP21]] = distinct !{[[LOOP21]], [[UNSWITCH_PARTIAL_DISABLE]]} -; CHECK: [[LOOP23]] = distinct !{[[LOOP23]], [[UNSWITCH_PARTIAL_DISABLE]]} diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/pr138509.ll 
b/llvm/test/Transforms/SimpleLoopUnswitch/pr138509.ll deleted file mode 100644 index e24d17f08842..000000000000 --- a/llvm/test/Transforms/SimpleLoopUnswitch/pr138509.ll +++ /dev/null @@ -1,49 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -S -passes="loop-mssa(loop-simplifycfg,licm,loop-rotate,simple-loop-unswitch)" < %s | FileCheck %s - -@a = global i32 0, align 4 -@b = global i32 0, align 4 -@c = global i32 0, align 4 -@d = global i32 0, align 4 - -define i32 @main() { -entry: - br label %outer.loop.header - -outer.loop.header: ; preds = %outer.loop.latch, %entry - br i1 false, label %exit, label %outer.loop.body - -outer.loop.body: ; preds = %inner.loop.header, %outer.loop.header - store i32 1, ptr @c, align 4 - %cmp = icmp sgt i32 0, -1 - br i1 %cmp, label %outer.loop.latch, label %exit - -inner.loop.header: ; preds = %outer.loop.latch, %inner.loop.body - %a_val = load i32, ptr @a, align 4 - %c_val = load i32, ptr @c, align 4 - %mul = mul nsw i32 %c_val, %a_val - store i32 %mul, ptr @b, align 4 - %cmp2 = icmp sgt i32 %mul, -1 - br i1 %cmp2, label %inner.loop.body, label %outer.loop.body - -inner.loop.body: ; preds = %inner.loop.header - %mul2 = mul nsw i32 %c_val, 3 - store i32 %mul2, ptr @c, align 4 - store i32 %c_val, ptr @d, align 4 - %mul3 = mul nsw i32 %c_val, %a_val - %cmp3 = icmp sgt i32 %mul3, -1 - br i1 %cmp3, label %inner.loop.header, label %exit - -outer.loop.latch: ; preds = %outer.loop.body - %d_val = load i32, ptr @d, align 4 - store i32 %d_val, ptr @b, align 4 - %cmp4 = icmp eq i32 %d_val, 0 - br i1 %cmp4, label %inner.loop.header, label %outer.loop.header - -exit: ; preds = %inner.loop.body, %outer.loop.body, %outer.loop.header - ret i32 0 -} - -; CHECK: [[LOOP0:.*]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]} -; CHECK: [[META1]] = !{!"llvm.loop.unswitch.nontrivial.disable"} -; CHECK: [[LOOP2:.*]] = distinct !{[[LOOP2]], [[META1]]} diff --git 
a/llvm/test/Transforms/SimpleLoopUnswitch/update-scev-3.ll b/llvm/test/Transforms/SimpleLoopUnswitch/update-scev-3.ll index 4e428cbc30bb..ef00d7ea8f2b 100644 --- a/llvm/test/Transforms/SimpleLoopUnswitch/update-scev-3.ll +++ b/llvm/test/Transforms/SimpleLoopUnswitch/update-scev-3.ll @@ -19,42 +19,56 @@ define i32 @foo(i1 %not) { ; CHECK-NEXT: [[FALSE:%.*]] = and i1 true, false ; CHECK-NEXT: br i1 [[NOT]], label [[ENTRY_SPLIT_US:%.*]], label [[ENTRY_SPLIT:%.*]] ; CHECK: entry.split.us: +; CHECK-NEXT: br i1 [[FALSE]], label [[ENTRY_SPLIT_US_SPLIT_US:%.*]], label [[ENTRY_SPLIT_US_SPLIT:%.*]] +; CHECK: entry.split.us.split.us: +; CHECK-NEXT: br label [[FOR_COND_US_US:%.*]] +; CHECK: for.cond.us.us: +; CHECK-NEXT: br label [[FOR_COND_SPLIT_US_US_US:%.*]] +; CHECK: for.cond.split.us.us.us: +; CHECK-NEXT: br label [[FOR_COND_SPLIT_US_SPLIT_US_SPLIT_US_SPLIT_US:%.*]] +; CHECK: for.cond.split.us.split.us.split.us.split.us: +; CHECK-NEXT: br label [[FOR_COND_SPLIT_US_SPLIT_US_SPLIT_US:%.*]] +; CHECK: entry.split.us.split: ; CHECK-NEXT: br label [[FOR_COND_US:%.*]] ; CHECK: for.cond.us: ; CHECK-NEXT: br label [[FOR_COND_SPLIT_US_US:%.*]] ; CHECK: for.inc11.us: -; CHECK-NEXT: br label [[FOR_COND_US]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: br label [[FOR_COND_US]] ; CHECK: for.cond.split.us.us: -; CHECK-NEXT: br label [[FOR_COND5_PREHEADER_US_US:%.*]] -; CHECK: for.cond5.preheader.us.us: -; CHECK-NEXT: br label [[FOR_COND5_PREHEADER_SPLIT_US_US_US:%.*]] +; CHECK-NEXT: br label [[FOR_COND_SPLIT_US_SPLIT_US11:%.*]] +; CHECK: for.cond5.preheader.us.us9: +; CHECK-NEXT: br label [[FOR_COND5_PREHEADER_SPLIT_US_US_US10:%.*]] ; CHECK: for.inc8.us.us: -; CHECK-NEXT: br i1 [[FALSE]], label [[FOR_INC8_FOR_COND5_PREHEADER_CRIT_EDGE_US_US:%.*]], label [[FOR_INC11_SPLIT_US_US:%.*]] +; CHECK-NEXT: br i1 false, label [[FOR_INC8_FOR_COND5_PREHEADER_CRIT_EDGE_US_US:%.*]], label [[FOR_INC11_SPLIT_US_US:%.*]] ; CHECK: for.inc8.for.cond5.preheader_crit_edge.us.us: -; CHECK-NEXT: br label 
[[FOR_COND5_PREHEADER_US_US]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: br label [[FOR_COND5_PREHEADER_US_US9:%.*]] ; CHECK: for.end.us.us: -; CHECK-NEXT: br i1 [[FALSE]], label [[FOR_INC8_US_US:%.*]], label [[CLEANUP15_SPLIT_US_SPLIT_US:%.*]] -; CHECK: for.cond5.preheader.split.us.us.us: -; CHECK-NEXT: br label [[FOR_BODY7_US_US_US:%.*]] -; CHECK: for.body7.us.us.us: -; CHECK-NEXT: br label [[HANDLER_POINTER_OVERFLOW_US_US_US:%.*]] -; CHECK: handler.pointer_overflow.us.us.us: -; CHECK-NEXT: br label [[CONT_US_US_US:%.*]] -; CHECK: cont.us.us.us: -; CHECK-NEXT: br i1 [[FALSE]], label [[CONT_FOR_BODY7_CRIT_EDGE_US_US_US:%.*]], label [[FOR_END_SPLIT_US_US_US:%.*]] -; CHECK: cont.for.body7_crit_edge.us.us.us: -; CHECK-NEXT: br label [[FOR_BODY7_US_US_US]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 false, label [[FOR_INC8_US_US:%.*]], label [[CLEANUP15_SPLIT_US_SPLIT_US:%.*]] +; CHECK: for.cond5.preheader.split.us.us.us10: +; CHECK-NEXT: br label [[FOR_COND5_PREHEADER_SPLIT_US_SPLIT_US7_US:%.*]] +; CHECK: for.body7.us.us4.us: +; CHECK-NEXT: br label [[HANDLER_POINTER_OVERFLOW_US_US5_US:%.*]] +; CHECK: handler.pointer_overflow.us.us5.us: +; CHECK-NEXT: br label [[CONT_US_US6_US:%.*]] +; CHECK: cont.us.us6.us: +; CHECK-NEXT: br label [[FOR_END_SPLIT_US_US_US:%.*]] ; CHECK: for.end.split.us.us.us: ; CHECK-NEXT: br label [[FOR_END_US_US:%.*]] +; CHECK: for.cond5.preheader.split.us.split.us7.us: +; CHECK-NEXT: br label [[FOR_BODY7_US_US4_US:%.*]] ; CHECK: for.inc11.split.us.us: ; CHECK-NEXT: br label [[FOR_INC11_US:%.*]] +; CHECK: for.cond.split.us.split.us11: +; CHECK-NEXT: br label [[FOR_COND5_PREHEADER_US_US9]] +; CHECK: for.cond.split.us.split.us.split.us: +; CHECK-NEXT: br label [[FOR_COND_SPLIT_US_SPLIT_US:%.*]] ; CHECK: cleanup15.split.us.split.us: ; CHECK-NEXT: br label [[CLEANUP15_SPLIT_US:%.*]] ; CHECK: entry.split: ; CHECK-NEXT: br i1 [[FALSE]], label [[ENTRY_SPLIT_SPLIT_US:%.*]], label [[ENTRY_SPLIT_SPLIT:%.*]] ; CHECK: entry.split.split.us: -; 
CHECK-NEXT: br label [[FOR_COND_US5:%.*]] -; CHECK: for.cond.us5: +; CHECK-NEXT: br label [[FOR_COND_US12:%.*]] +; CHECK: for.cond.us12: ; CHECK-NEXT: br label [[FOR_COND_SPLIT_US:%.*]] ; CHECK: for.cond.split.us: ; CHECK-NEXT: br label [[FOR_COND_SPLIT_SPLIT_US_SPLIT_US:%.*]] @@ -64,13 +78,23 @@ define i32 @foo(i1 %not) { ; CHECK-NEXT: br label [[FOR_COND:%.*]] ; CHECK: for.cond: ; CHECK-NEXT: br label [[FOR_COND_SPLIT:%.*]] +; CHECK: for.cond.split.us.split.us: +; CHECK-NEXT: br label [[FOR_COND5_PREHEADER_US_US:%.*]] +; CHECK: for.cond5.preheader.us.us: +; CHECK-NEXT: br label [[FOR_COND5_PREHEADER_SPLIT_US_US_US:%.*]] +; CHECK: for.cond5.preheader.split.us.us.us: +; CHECK-NEXT: br label [[FOR_COND5_PREHEADER_SPLIT_US_SPLIT_US_SPLIT_US_SPLIT_US:%.*]] +; CHECK: for.cond5.preheader.split.us.split.us.split.us.split.us: +; CHECK-NEXT: br label [[FOR_COND5_PREHEADER_SPLIT_US_SPLIT_US_SPLIT_US:%.*]] ; CHECK: cleanup15.split.us: ; CHECK-NEXT: br label [[CLEANUP15:%.*]] +; CHECK: for.cond5.preheader.split.us.split.us.split.us: +; CHECK-NEXT: br label [[FOR_COND5_PREHEADER_SPLIT_US_SPLIT_US:%.*]] ; CHECK: for.cond.split: ; CHECK-NEXT: br label [[FOR_COND_SPLIT_SPLIT:%.*]] ; CHECK: for.cond.split.split.us: -; CHECK-NEXT: br label [[FOR_COND5_PREHEADER_US4:%.*]] -; CHECK: for.cond5.preheader.us4: +; CHECK-NEXT: br label [[FOR_COND5_PREHEADER_US8:%.*]] +; CHECK: for.cond5.preheader.us8: ; CHECK-NEXT: br label [[FOR_COND5_PREHEADER_SPLIT_US:%.*]] ; CHECK: for.cond5.preheader.split.us: ; CHECK-NEXT: br label [[FOR_COND5_PREHEADER_SPLIT_SPLIT_US_SPLIT_US:%.*]] @@ -80,6 +104,16 @@ define i32 @foo(i1 %not) { ; CHECK-NEXT: br label [[FOR_COND5_PREHEADER:%.*]] ; CHECK: for.cond5.preheader: ; CHECK-NEXT: br label [[FOR_COND5_PREHEADER_SPLIT:%.*]] +; CHECK: for.cond5.preheader.split.us.split.us: +; CHECK-NEXT: br label [[FOR_BODY7_US_US:%.*]] +; CHECK: for.body7.us.us: +; CHECK-NEXT: br label [[HANDLER_POINTER_OVERFLOW_US_US:%.*]] +; CHECK: handler.pointer_overflow.us.us: +; 
CHECK-NEXT: br label [[CONT_US_US:%.*]] +; CHECK: cont.us.us: +; CHECK-NEXT: br label [[CONT_FOR_BODY7_CRIT_EDGE_US_US:%.*]] +; CHECK: cont.for.body7_crit_edge.us.us: +; CHECK-NEXT: br label [[FOR_BODY7_US_US]] ; CHECK: for.cond5.preheader.split: ; CHECK-NEXT: br label [[FOR_COND5_PREHEADER_SPLIT_SPLIT:%.*]] ; CHECK: for.cond5.preheader.split.split.us: diff --git a/llvm/test/Transforms/SimplifyCFG/2008-07-13-InfLoopMiscompile.ll b/llvm/test/Transforms/SimplifyCFG/2008-07-13-InfLoopMiscompile.ll index 2e9e7b19c73e..44d92e1a1c21 100644 --- a/llvm/test/Transforms/SimplifyCFG/2008-07-13-InfLoopMiscompile.ll +++ b/llvm/test/Transforms/SimplifyCFG/2008-07-13-InfLoopMiscompile.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=simplifycfg -simplifycfg-require-and-preserve-domtree=1 -S | FileCheck %s +; RUN: opt < %s -passes=simplifycfg -simplifycfg-require-and-preserve-domtree=1 -keep-loops="false" -S | FileCheck %s ; PR2540 ; Outval should end up with a select from 0/2, not all constants. @@ -52,4 +52,3 @@ func_1.exit: ; preds = %cowblock, %entry } declare i32 @printf(ptr, ...) nounwind - diff --git a/llvm/test/Transforms/SimplifyCFG/2025-07-29-non-canoncial-loop.ll b/llvm/test/Transforms/SimplifyCFG/2025-07-29-non-canoncial-loop.ll new file mode 100644 index 000000000000..322dd98f48df --- /dev/null +++ b/llvm/test/Transforms/SimplifyCFG/2025-07-29-non-canoncial-loop.ll @@ -0,0 +1,62 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -passes=simplifycfg -simplifycfg-require-and-preserve-domtree=1 --keep-loops="true" -S | FileCheck --check-prefix=NO-THREADING %s +; Checks that we do not thread the control flow through the loop header loop_header as +; that will introduce a non-canonical loop. 
+ +; RUN: opt < %s -passes=simplifycfg -simplifycfg-require-and-preserve-domtree=1 --keep-loops="false" -S | FileCheck --check-prefix=THREADING %s +; Checks that we thread the control flow through the loop header loop_header since we +; do not request --keep-loops. + +define void @__start(i1 %cond) { +; NO-THREADING-LABEL: define void @__start( +; NO-THREADING-SAME: i1 [[COND:%.*]]) { +; NO-THREADING-NEXT: [[ENTRY:.*:]] +; NO-THREADING-NEXT: br label %[[LOOP_HEADER:.*]] +; NO-THREADING: [[LOOP_HEADER]]: +; NO-THREADING-NEXT: br i1 [[COND]], label %[[LOOP_BODY_1:.*]], label %[[LOOP_BODY_0:.*]] +; NO-THREADING: [[LOOP_BODY_0]]: +; NO-THREADING-NEXT: [[_0_:%.*]] = add i16 0, 0 +; NO-THREADING-NEXT: br label %[[LOOP_EXIT:.*]] +; NO-THREADING: [[LOOP_BODY_1]]: +; NO-THREADING-NEXT: [[_1_:%.*]] = add i32 0, 1 +; NO-THREADING-NEXT: br label %[[LOOP_EXIT]] +; NO-THREADING: [[LOOP_EXIT]]: +; NO-THREADING-NEXT: br i1 [[COND]], label %[[LOOP_HEADER]], label %[[EXIT:.*]] +; NO-THREADING: [[EXIT]]: +; NO-THREADING-NEXT: ret void +; +; THREADING-LABEL: define void @__start( +; THREADING-SAME: i1 [[COND:%.*]]) { +; THREADING-NEXT: [[ENTRY:.*:]] +; THREADING-NEXT: br i1 [[COND]], label %[[LOOP_BODY_1:.*]], label %[[LOOP_BODY_0:.*]] +; THREADING: [[LOOP_BODY_0]]: +; THREADING-NEXT: [[_0_:%.*]] = add i16 0, 0 +; THREADING-NEXT: br label %[[LOOP_EXIT:.*]] +; THREADING: [[LOOP_BODY_1]]: +; THREADING-NEXT: [[_1_:%.*]] = add i32 0, 1 +; THREADING-NEXT: br label %[[LOOP_EXIT]] +; THREADING: [[LOOP_EXIT]]: +; THREADING-NEXT: br i1 [[COND]], label %[[LOOP_BODY_1]], label %[[EXIT:.*]] +; THREADING: [[EXIT]]: +; THREADING-NEXT: ret void +; +entry: + br label %loop_header + +loop_header: ; preds = %loop_exit, %entry + br i1 %cond, label %loop_body_1, label %loop_body_0 + +loop_body_0: ; preds = %loop_header + %_0_ = add i16 0, 0 + br label %loop_exit + +loop_body_1: ; preds = %loop_header + %_1_ = add i32 0, 1 + br label %loop_exit + +loop_exit: ; preds = %loop_body_1, %loop_body_0 + br i1 
%cond, label %loop_header, label %exit + +exit: ; preds = %loop_exit + ret void +} diff --git a/llvm/test/Transforms/SimplifyCFG/branch-phi-thread.ll b/llvm/test/Transforms/SimplifyCFG/branch-phi-thread.ll index 0afec05ecbd6..ec9423bd8167 100644 --- a/llvm/test/Transforms/SimplifyCFG/branch-phi-thread.ll +++ b/llvm/test/Transforms/SimplifyCFG/branch-phi-thread.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=simplifycfg,adce -simplifycfg-require-and-preserve-domtree=1 -S | FileCheck %s +; RUN: opt < %s -passes=simplifycfg,adce -simplifycfg-require-and-preserve-domtree=1 -keep-loops="false" -S | FileCheck %s declare void @f1() diff --git a/llvm/test/Transforms/SimplifyCFG/jump-threading.ll b/llvm/test/Transforms/SimplifyCFG/jump-threading.ll index 50a32413a055..a4073ae6eb0b 100644 --- a/llvm/test/Transforms/SimplifyCFG/jump-threading.ll +++ b/llvm/test/Transforms/SimplifyCFG/jump-threading.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -passes=simplifycfg < %s | FileCheck %s +; RUN: opt -S -passes=simplifycfg -keep-loops="false" < %s | FileCheck %s declare void @foo() declare void @bar() diff --git a/llvm/test/Transforms/SimplifyCFG/switch_create.ll b/llvm/test/Transforms/SimplifyCFG/switch_create.ll index f446d718f820..a1533bdcffb4 100644 --- a/llvm/test/Transforms/SimplifyCFG/switch_create.ll +++ b/llvm/test/Transforms/SimplifyCFG/switch_create.ll @@ -1068,3 +1068,60 @@ if: else: ret void } + +define void @trunc_nuw_i1_condition(i32 %V) { +; CHECK-LABEL: @trunc_nuw_i1_condition( +; CHECK-NEXT: switch i32 [[V:%.*]], label [[F:%.*]] [ +; CHECK-NEXT: i32 2, label [[T:%.*]] +; CHECK-NEXT: i32 0, label [[T]] +; CHECK-NEXT: ] +; CHECK: common.ret: +; CHECK-NEXT: ret void +; CHECK: T: +; CHECK-NEXT: call void @foo1() +; CHECK-NEXT: br label [[COMMON_RET:%.*]] +; CHECK: F: +; CHECK-NEXT: call void @foo2() +; CHECK-NEXT: br label [[COMMON_RET]] 
+; + %C1 = icmp eq i32 %V, 2 + br i1 %C1, label %T, label %N +N: + %C2 = trunc nuw i32 %V to i1 + br i1 %C2, label %F, label %T +T: + call void @foo1( ) + ret void +F: + call void @foo2( ) + ret void +} + +define void @neg_trunc_i1_condition(i32 %V) { +; CHECK-LABEL: @neg_trunc_i1_condition( +; CHECK-NEXT: [[C1:%.*]] = icmp ne i32 [[V:%.*]], 2 +; CHECK-NEXT: [[C2:%.*]] = trunc i32 [[V]] to i1 +; CHECK-NEXT: [[OR_COND:%.*]] = and i1 [[C1]], [[C2]] +; CHECK-NEXT: br i1 [[OR_COND]], label [[F:%.*]], label [[T:%.*]] +; CHECK: common.ret: +; CHECK-NEXT: ret void +; CHECK: T: +; CHECK-NEXT: call void @foo1() +; CHECK-NEXT: br label [[COMMON_RET:%.*]] +; CHECK: F: +; CHECK-NEXT: call void @foo2() +; CHECK-NEXT: br label [[COMMON_RET]] +; + %C1 = icmp eq i32 %V, 2 + br i1 %C1, label %T, label %N +N: + %C2 = trunc i32 %V to i1 + br i1 %C2, label %F, label %T +T: + call void @foo1( ) + ret void +F: + call void @foo2( ) + ret void +} + diff --git a/llvm/test/Transforms/SimplifyCFG/two-entry-phi-return.ll b/llvm/test/Transforms/SimplifyCFG/two-entry-phi-return.ll index 57930c91b979..f6d71ddda74f 100644 --- a/llvm/test/Transforms/SimplifyCFG/two-entry-phi-return.ll +++ b/llvm/test/Transforms/SimplifyCFG/two-entry-phi-return.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=simplifycfg -simplifycfg-require-and-preserve-domtree=1 -S | FileCheck %s +; RUN: opt < %s -passes=simplifycfg -simplifycfg-require-and-preserve-domtree=1 -keep-loops="false" -S | FileCheck %s define i1 @qux(ptr %m, ptr %n, ptr %o, ptr %p) nounwind { ; CHECK-LABEL: @qux( diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll index d369279c15db..41d77e89476b 100644 --- a/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll +++ b/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll @@ -268,7 +268,7 @@ define i8 
@ext5_ext0_add(<16 x i8> %x, <16 x i8> %y) { ; CHECK-LABEL: @ext5_ext0_add( ; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <16 x i8> [[X:%.*]], <16 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = sub nsw <16 x i8> [[SHIFT]], [[Y:%.*]] -; CHECK-NEXT: [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i64 0 +; CHECK-NEXT: [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i32 0 ; CHECK-NEXT: ret i8 [[R]] ; %e0 = extractelement <16 x i8> %x, i32 5 @@ -294,7 +294,7 @@ define float @ext1_ext0_fmul(<4 x float> %x) { ; CHECK-LABEL: @ext1_ext0_fmul( ; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[X:%.*]], <4 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[SHIFT]], [[X]] -; CHECK-NEXT: [[R:%.*]] = extractelement <4 x float> [[TMP1]], i64 0 +; CHECK-NEXT: [[R:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 ; CHECK-NEXT: ret float [[R]] ; %e0 = extractelement <4 x float> %x, i32 1 @@ -363,7 +363,7 @@ define float @ext7_ext4_fmul_v8f32(<8 x float> %x) { ; AVX-LABEL: @ext7_ext4_fmul_v8f32( ; AVX-NEXT: [[SHIFT:%.*]] = shufflevector <8 x float> [[X:%.*]], <8 x float> poison, <8 x i32> ; AVX-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[SHIFT]], [[X]] -; AVX-NEXT: [[R:%.*]] = extractelement <8 x float> [[TMP1]], i64 4 +; AVX-NEXT: [[R:%.*]] = extractelement <8 x float> [[TMP1]], i32 4 ; AVX-NEXT: ret float [[R]] ; %e0 = extractelement <8 x float> %x, i32 7 @@ -484,7 +484,7 @@ define i32 @ext_ext_or_reduction_v4i32(<4 x i32> %x, <4 x i32> %y) { ; CHECK-NEXT: [[TMP2:%.*]] = or <4 x i32> [[TMP1]], [[SHIFT1]] ; CHECK-NEXT: [[SHIFT2:%.*]] = shufflevector <4 x i32> [[Z]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[SHIFT2]], [[TMP2]] -; CHECK-NEXT: [[Z0123:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 +; CHECK-NEXT: [[Z0123:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 ; CHECK-NEXT: ret i32 [[Z0123]] ; %z = and <4 x i32> %x, %y @@ -504,7 +504,7 @@ define i32 @ext_ext_partial_add_reduction_v4i32(<4 x i32> %x) { ; CHECK-NEXT: 
[[TMP1:%.*]] = add <4 x i32> [[SHIFT]], [[X]] ; CHECK-NEXT: [[SHIFT1:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[SHIFT1]], [[TMP1]] -; CHECK-NEXT: [[X210:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[X210:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0 ; CHECK-NEXT: ret i32 [[X210]] ; %x0 = extractelement <4 x i32> %x, i32 0 @@ -523,7 +523,7 @@ define i32 @ext_ext_partial_add_reduction_and_extra_add_v4i32(<4 x i32> %x, <4 x ; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[SHIFT1]], [[TMP1]] ; CHECK-NEXT: [[SHIFT2:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i32> [[SHIFT2]], [[TMP2]] -; CHECK-NEXT: [[X2Y210:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 +; CHECK-NEXT: [[X2Y210:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 ; CHECK-NEXT: ret i32 [[X2Y210]] ; %y0 = extractelement <4 x i32> %y, i32 0 diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll b/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll index d11fb1426c94..4c1ca82b2bd0 100644 --- a/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll +++ b/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll @@ -268,7 +268,7 @@ define i8 @ext5_ext0_add(<16 x i8> %x, <16 x i8> %y) { ; CHECK-LABEL: @ext5_ext0_add( ; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <16 x i8> [[X:%.*]], <16 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = sub nsw <16 x i8> [[SHIFT]], [[Y:%.*]] -; CHECK-NEXT: [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i64 0 +; CHECK-NEXT: [[R:%.*]] = extractelement <16 x i8> [[TMP1]], i32 0 ; CHECK-NEXT: ret i8 [[R]] ; %e0 = extractelement <16 x i8> %x, i32 5 @@ -294,7 +294,7 @@ define float @ext1_ext0_fmul(<4 x float> %x) { ; CHECK-LABEL: @ext1_ext0_fmul( ; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[X:%.*]], <4 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[SHIFT]], [[X]] -; CHECK-NEXT: 
[[R:%.*]] = extractelement <4 x float> [[TMP1]], i64 0 +; CHECK-NEXT: [[R:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 ; CHECK-NEXT: ret float [[R]] ; %e0 = extractelement <4 x float> %x, i32 1 @@ -363,7 +363,7 @@ define float @ext7_ext4_fmul_v8f32(<8 x float> %x) { ; AVX-LABEL: @ext7_ext4_fmul_v8f32( ; AVX-NEXT: [[SHIFT:%.*]] = shufflevector <8 x float> [[X:%.*]], <8 x float> poison, <8 x i32> ; AVX-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[SHIFT]], [[X]] -; AVX-NEXT: [[R:%.*]] = extractelement <8 x float> [[TMP1]], i64 4 +; AVX-NEXT: [[R:%.*]] = extractelement <8 x float> [[TMP1]], i32 4 ; AVX-NEXT: ret float [[R]] ; %e0 = extractelement <8 x float> %x, i32 7 @@ -490,7 +490,7 @@ define i32 @ext_ext_or_reduction_v4i32(<4 x i32> %x, <4 x i32> %y) { ; CHECK-NEXT: [[TMP2:%.*]] = or <4 x i32> [[TMP1]], [[SHIFT1]] ; CHECK-NEXT: [[SHIFT2:%.*]] = shufflevector <4 x i32> [[Z]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[SHIFT2]], [[TMP2]] -; CHECK-NEXT: [[Z0123:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 +; CHECK-NEXT: [[Z0123:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 ; CHECK-NEXT: ret i32 [[Z0123]] ; %z = and <4 x i32> %x, %y @@ -510,7 +510,7 @@ define i32 @ext_ext_partial_add_reduction_v4i32(<4 x i32> %x) { ; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[SHIFT]], [[X]] ; CHECK-NEXT: [[SHIFT1:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[SHIFT1]], [[TMP1]] -; CHECK-NEXT: [[X210:%.*]] = extractelement <4 x i32> [[TMP2]], i64 0 +; CHECK-NEXT: [[X210:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0 ; CHECK-NEXT: ret i32 [[X210]] ; %x0 = extractelement <4 x i32> %x, i32 0 @@ -529,7 +529,7 @@ define i32 @ext_ext_partial_add_reduction_and_extra_add_v4i32(<4 x i32> %x, <4 x ; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[SHIFT1]], [[TMP1]] ; CHECK-NEXT: [[SHIFT2:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i32> 
[[SHIFT2]], [[TMP2]] -; CHECK-NEXT: [[X2Y210:%.*]] = extractelement <4 x i32> [[TMP3]], i64 0 +; CHECK-NEXT: [[X2Y210:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 ; CHECK-NEXT: ret i32 [[X2Y210]] ; %y0 = extractelement <4 x i32> %y, i32 0 @@ -573,10 +573,8 @@ define i64 @instsimplify_folder_crash(<4 x i64> %in) { ; CHECK-LABEL: @instsimplify_folder_crash( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[SHUFFLE_1:%.*]] = shufflevector <4 x i64> [[IN:%.*]], <4 x i64> zeroinitializer, <4 x i32> -; CHECK-NEXT: [[E_0:%.*]] = extractelement <4 x i64> zeroinitializer, i64 0 -; CHECK-NEXT: [[E_1:%.*]] = extractelement <4 x i64> [[SHUFFLE_1]], i64 1 -; CHECK-NEXT: [[OR:%.*]] = or i64 [[E_1]], [[E_0]] -; CHECK-NEXT: ret i64 [[OR]] +; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x i64> [[SHUFFLE_1]], <4 x i64> poison, <4 x i32> +; CHECK-NEXT: ret i64 0 ; entry: %shuffle.1 = shufflevector <4 x i64> %in, <4 x i64> zeroinitializer, <4 x i32> diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-cmp.ll b/llvm/test/Transforms/VectorCombine/X86/extract-cmp.ll index 3dae93665b1e..795832f22b09 100644 --- a/llvm/test/Transforms/VectorCombine/X86/extract-cmp.ll +++ b/llvm/test/Transforms/VectorCombine/X86/extract-cmp.ll @@ -130,7 +130,7 @@ define i1 @cmp10_v2f64(<2 x double> %x, <2 x double> %y) { ; AVX-LABEL: @cmp10_v2f64( ; AVX-NEXT: [[SHIFT:%.*]] = shufflevector <2 x double> [[X:%.*]], <2 x double> poison, <2 x i32> ; AVX-NEXT: [[TMP1:%.*]] = fcmp ule <2 x double> [[SHIFT]], [[Y:%.*]] -; AVX-NEXT: [[CMP:%.*]] = extractelement <2 x i1> [[TMP1]], i64 0 +; AVX-NEXT: [[CMP:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0 ; AVX-NEXT: ret i1 [[CMP]] ; %x1 = extractelement <2 x double> %x, i32 1 diff --git a/llvm/test/Transforms/VectorCombine/X86/load-extractelement-scalarization.ll b/llvm/test/Transforms/VectorCombine/X86/load-extractelement-scalarization.ll index b26e5ec2698a..50e32b79a91c 100644 --- a/llvm/test/Transforms/VectorCombine/X86/load-extractelement-scalarization.ll +++ 
b/llvm/test/Transforms/VectorCombine/X86/load-extractelement-scalarization.ll @@ -27,6 +27,8 @@ define void @multiple_extract(ptr %p) { ; infinite loop if we fold an extract that is waiting to be erased define void @unused_extract(ptr %p) { ; CHECK-LABEL: @unused_extract( +; CHECK-NEXT: [[LOAD:%.*]] = load <4 x float>, ptr [[P:%.*]], align 8 +; CHECK-NEXT: [[EXTRACT:%.*]] = extractelement <4 x float> [[LOAD]], i64 1 ; CHECK-NEXT: ret void ; %load = load <4 x float>, ptr %p, align 8 diff --git a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll index 0c2346e616e3..e8381d1b206e 100644 --- a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll +++ b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll @@ -253,7 +253,8 @@ define <8 x i16> @gep01_load_i16_insert_v8i16(ptr align 16 dereferenceable(18) % ; CHECK-LABEL: @gep01_load_i16_insert_v8i16( ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1 ; CHECK-NEXT: [[R:%.*]] = load <8 x i16>, ptr [[GEP]], align 2 -; CHECK-NEXT: ret <8 x i16> [[R]] +; CHECK-NEXT: [[R1:%.*]] = shufflevector <8 x i16> [[R]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: ret <8 x i16> [[R1]] ; %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1 %s = load i16, ptr %gep, align 2 @@ -341,7 +342,8 @@ define <8 x i16> @gep10_load_i16_insert_v8i16(ptr align 16 dereferenceable(32) % ; CHECK-LABEL: @gep10_load_i16_insert_v8i16( ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 1, i64 0 ; CHECK-NEXT: [[R:%.*]] = load <8 x i16>, ptr [[GEP]], align 16 -; CHECK-NEXT: ret <8 x i16> [[R]] +; CHECK-NEXT: [[R1:%.*]] = shufflevector <8 x i16> [[R]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: ret <8 x i16> [[R1]] ; %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 1, i64 0 %s = load i16, ptr %gep, align 16 diff --git a/llvm/test/tools/llvm-mca/X86/stack-engine-pop.s 
b/llvm/test/tools/llvm-mca/X86/stack-engine-pop.s new file mode 100644 index 000000000000..2ffb52ae61fc --- /dev/null +++ b/llvm/test/tools/llvm-mca/X86/stack-engine-pop.s @@ -0,0 +1,92 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=skylake -timeline -iterations=2 < %s | FileCheck %s + +movq $0x80, %rsp +popq %rax +popq %rcx +popq %rdx +popq %rbx +popq %r12 + +# CHECK: Iterations: 2 +# CHECK-NEXT: Instructions: 12 +# CHECK-NEXT: Total Cycles: 14 +# CHECK-NEXT: Total uOps: 22 + +# CHECK: Dispatch Width: 6 +# CHECK-NEXT: uOps Per Cycle: 1.57 +# CHECK-NEXT: IPC: 0.86 +# CHECK-NEXT: Block RThroughput: 2.5 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 1 0.25 movq $128, %rsp +# CHECK-NEXT: 2 6 0.50 * popq %rax +# CHECK-NEXT: 2 6 0.50 * popq %rcx +# CHECK-NEXT: 2 6 0.50 * popq %rdx +# CHECK-NEXT: 2 6 0.50 * popq %rbx +# CHECK-NEXT: 2 6 0.50 * popq %r12 + +# CHECK: Resources: +# CHECK-NEXT: [0] - SKLDivider +# CHECK-NEXT: [1] - SKLFPDivider +# CHECK-NEXT: [2] - SKLPort0 +# CHECK-NEXT: [3] - SKLPort1 +# CHECK-NEXT: [4] - SKLPort2 +# CHECK-NEXT: [5] - SKLPort3 +# CHECK-NEXT: [6] - SKLPort4 +# CHECK-NEXT: [7] - SKLPort5 +# CHECK-NEXT: [8] - SKLPort6 +# CHECK-NEXT: [9] - SKLPort7 + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] +# CHECK-NEXT: - - 1.50 1.50 2.50 2.50 - 1.50 1.50 - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions: +# CHECK-NEXT: - - - - - - - 0.50 0.50 - movq $128, %rsp +# CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - popq %rax +# CHECK-NEXT: - - - 0.50 0.50 0.50 - - 0.50 - popq %rcx +# CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - 
popq %rdx +# CHECK-NEXT: - - - 0.50 0.50 0.50 - - 0.50 - popq %rbx +# CHECK-NEXT: - - 0.50 0.50 0.50 0.50 - - - - popq %r12 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123 +# CHECK-NEXT: Index 0123456789 + +# CHECK: [0,0] DeER . . . movq $128, %rsp +# CHECK-NEXT: [0,1] D=eeeeeeER. . popq %rax +# CHECK-NEXT: [0,2] D=eeeeeeER. . popq %rcx +# CHECK-NEXT: [0,3] .D=eeeeeeER . popq %rdx +# CHECK-NEXT: [0,4] .D=eeeeeeER . popq %rbx +# CHECK-NEXT: [0,5] .D==eeeeeeER . popq %r12 +# CHECK-NEXT: [1,0] . DeE------R . movq $128, %rsp +# CHECK-NEXT: [1,1] . D=eeeeeeER . popq %rax +# CHECK-NEXT: [1,2] . D==eeeeeeER. popq %rcx +# CHECK-NEXT: [1,3] . D=eeeeeeER. popq %rdx +# CHECK-NEXT: [1,4] . D==eeeeeeER popq %rbx +# CHECK-NEXT: [1,5] . D==eeeeeeER popq %r12 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 1.0 1.0 3.0 movq $128, %rsp +# CHECK-NEXT: 1. 2 2.0 0.0 0.0 popq %rax +# CHECK-NEXT: 2. 2 2.5 0.5 0.0 popq %rcx +# CHECK-NEXT: 3. 2 2.0 1.0 0.0 popq %rdx +# CHECK-NEXT: 4. 2 2.5 1.5 0.0 popq %rbx +# CHECK-NEXT: 5. 
2 3.0 2.0 0.0 popq %r12 +# CHECK-NEXT: 2 2.2 1.0 0.5 diff --git a/llvm/test/tools/llvm-mca/X86/stack-engine-push.s b/llvm/test/tools/llvm-mca/X86/stack-engine-push.s new file mode 100644 index 000000000000..fc394d4c1e7d --- /dev/null +++ b/llvm/test/tools/llvm-mca/X86/stack-engine-push.s @@ -0,0 +1,92 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=skylake -timeline -iterations=2 < %s | FileCheck %s + +movq $0x80, %rsp +pushq %rax +pushq %rcx +pushq %rdx +pushq %rbx +pushq %r12 + +# CHECK: Iterations: 2 +# CHECK-NEXT: Instructions: 12 +# CHECK-NEXT: Total Cycles: 15 +# CHECK-NEXT: Total uOps: 32 + +# CHECK: Dispatch Width: 6 +# CHECK-NEXT: uOps Per Cycle: 2.13 +# CHECK-NEXT: IPC: 0.80 +# CHECK-NEXT: Block RThroughput: 5.0 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 1 0.25 movq $128, %rsp +# CHECK-NEXT: 3 2 1.00 * pushq %rax +# CHECK-NEXT: 3 2 1.00 * pushq %rcx +# CHECK-NEXT: 3 2 1.00 * pushq %rdx +# CHECK-NEXT: 3 2 1.00 * pushq %rbx +# CHECK-NEXT: 3 2 1.00 * pushq %r12 + +# CHECK: Resources: +# CHECK-NEXT: [0] - SKLDivider +# CHECK-NEXT: [1] - SKLFPDivider +# CHECK-NEXT: [2] - SKLPort0 +# CHECK-NEXT: [3] - SKLPort1 +# CHECK-NEXT: [4] - SKLPort2 +# CHECK-NEXT: [5] - SKLPort3 +# CHECK-NEXT: [6] - SKLPort4 +# CHECK-NEXT: [7] - SKLPort5 +# CHECK-NEXT: [8] - SKLPort6 +# CHECK-NEXT: [9] - SKLPort7 + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] +# CHECK-NEXT: - - 1.50 1.50 1.50 1.50 5.00 1.50 1.50 2.00 + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions: +# CHECK-NEXT: - - - - - - - - 1.00 - movq $128, %rsp +# CHECK-NEXT: - - 0.50 - 0.50 - 
1.00 0.50 - 0.50 pushq %rax +# CHECK-NEXT: - - - 0.50 - 0.50 1.00 - 0.50 0.50 pushq %rcx +# CHECK-NEXT: - - 0.50 - 0.50 0.50 1.00 0.50 - - pushq %rdx +# CHECK-NEXT: - - - 0.50 0.50 - 1.00 0.50 - 0.50 pushq %rbx +# CHECK-NEXT: - - 0.50 0.50 - 0.50 1.00 - - 0.50 pushq %r12 + +# CHECK: Timeline view: +# CHECK-NEXT: 01234 +# CHECK-NEXT: Index 0123456789 + +# CHECK: [0,0] DeER . . . movq $128, %rsp +# CHECK-NEXT: [0,1] D=eeER . . pushq %rax +# CHECK-NEXT: [0,2] .D=eeER . . pushq %rcx +# CHECK-NEXT: [0,3] .D==eeER . . pushq %rdx +# CHECK-NEXT: [0,4] . D==eeER . . pushq %rbx +# CHECK-NEXT: [0,5] . D===eeER. . pushq %r12 +# CHECK-NEXT: [1,0] . DeE---R. . movq $128, %rsp +# CHECK-NEXT: [1,1] . D===eeER . pushq %rax +# CHECK-NEXT: [1,2] . D===eeER . pushq %rcx +# CHECK-NEXT: [1,3] . D====eeER . pushq %rdx +# CHECK-NEXT: [1,4] . D====eeER. pushq %rbx +# CHECK-NEXT: [1,5] . D=====eeER pushq %r12 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 1.0 1.0 1.5 movq $128, %rsp +# CHECK-NEXT: 1. 2 3.0 0.5 0.0 pushq %rax +# CHECK-NEXT: 2. 2 3.0 1.0 0.0 pushq %rcx +# CHECK-NEXT: 3. 2 4.0 1.0 0.0 pushq %rdx +# CHECK-NEXT: 4. 2 4.0 1.0 0.0 pushq %rbx +# CHECK-NEXT: 5. 
2 5.0 1.0 0.0 pushq %r12 +# CHECK-NEXT: 2 3.3 0.9 0.3 diff --git a/llvm/test/tools/llvm-objdump/DXContainer/input-output-signatures.yaml b/llvm/test/tools/llvm-objdump/DXContainer/input-output-signatures.yaml new file mode 100644 index 000000000000..ad979d2dcb7e --- /dev/null +++ b/llvm/test/tools/llvm-objdump/DXContainer/input-output-signatures.yaml @@ -0,0 +1,167 @@ +# RUN: yaml2obj %s -o %t +# RUN: llvm-objdump -p %t | FileCheck %s --match-full-lines --strict-whitespace + +## This test covers llvm-objdump printing private headers for the ISG1, OSG1, +## and PSG1 "parts" of the DX container file format. The test uses a few +## absurdly large values and long string names to ensure that the columns in the +## printed table widen correctly. + +--- !dxcontainer +Header: + Hash: [ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 ] + Version: + Major: 1 + Minor: 0 + FileSize: 630 + PartCount: 3 + PartOffsets: [ 64, 124, 184 ] +Parts: + - Name: ISG1 + Size: 52 + Signature: + Parameters: + - Stream: 0 + Name: AAA_HSFoo + Index: 4391238 # This value forces the index column to widen + SystemValue: Undefined + CompType: Float32 + Register: 0 + Mask: 7 + ExclusiveMask: 2 + MinPrecision: Default + - Name: OSG1 + Size: 52 + Signature: + Parameters: + - Stream: 0 + Name: SV_Position + Index: 0 + SystemValue: Position + CompType: Float32 + Register: 2147483647 # This value forces the register column to widen + Mask: 15 + ExclusiveMask: 0 + MinPrecision: Default + - Name: PSG1 + Size: 402 + Signature: + Parameters: + - Stream: 0 + Name: SV_TessFactor + Index: 0 + SystemValue: FinalQuadEdgeTessfactor # The tessfactor forces the SysVal column to widen + CompType: Float32 + Register: 0 + Mask: 8 + ExclusiveMask: 8 + MinPrecision: Default + - Stream: 0 + Name: BBB + Index: 0 + SystemValue: Undefined + CompType: Float32 + Register: 0 + Mask: 7 + ExclusiveMask: 0 + MinPrecision: Default + - Stream: 0 + Name: SV_TessFactor + Index: 1 + SystemValue: 
FinalQuadEdgeTessfactor + CompType: Float32 + Register: 1 + Mask: 8 + ExclusiveMask: 8 + MinPrecision: Default + - Stream: 0 + Name: BBB + Index: 1 + SystemValue: Undefined + CompType: Float32 + Register: 1 + Mask: 7 + ExclusiveMask: 0 + MinPrecision: Default + - Stream: 0 + Name: SV_TessFactor + Index: 2 + SystemValue: FinalQuadEdgeTessfactor + CompType: Float32 + Register: 2 + Mask: 8 + ExclusiveMask: 8 + MinPrecision: Default + - Stream: 0 + Name: BBB + Index: 2 + SystemValue: Undefined + CompType: Float32 + Register: 2 + Mask: 7 + ExclusiveMask: 0 + MinPrecision: Default + - Stream: 0 + Name: SV_TessFactor + Index: 3 + SystemValue: FinalQuadEdgeTessfactor + CompType: Float32 + Register: 3 + Mask: 8 + ExclusiveMask: 8 + MinPrecision: Default + - Stream: 0 + Name: SV_InsideTessFactor + Index: 0 + SystemValue: FinalQuadInsideTessfactor + CompType: Float32 + Register: 4 + Mask: 8 + ExclusiveMask: 0 + MinPrecision: Default + - Stream: 0 + Name: SV_InsideTessFactor + Index: 1 + SystemValue: FinalQuadInsideTessfactor + CompType: Float32 + Register: 5 + Mask: 8 + ExclusiveMask: 0 + MinPrecision: Default + - Stream: 0 + Name: AVeryLongStringThatWillForceWidening # This value forces name column to widen + Index: 0 + SystemValue: Undefined + CompType: Float32 + Register: 6 + Mask: 15 + ExclusiveMask: 4 + MinPrecision: Default +... 
+ +# CHECK:; Input signature: +# CHECK-NEXT:; +# CHECK-NEXT:; Name Index Mask Register SysValue Format Used +# CHECK-NEXT:; ------------------------ ------- ----- -------- ---------- ------- ----- +# CHECK-NEXT:; AAA_HSFoo 4391238 xyz 0 Undefined Float32 y + +# CHECK:; Output signature: +# CHECK-NEXT:; +# CHECK-NEXT:; Name Index Mask Register SysValue Format Used +# CHECK-NEXT:; ------------------------ ----- ----- ---------- ---------- ------- ----- +# CHECK-NEXT:; SV_Position 0 xyzw 2147483647 Position Float32 + +# CHECK:; Patch Constant signature: +# CHECK-NEXT:; +# CHECK-NEXT:; Name Index Mask Register SysValue Format Used +# CHECK-NEXT:; ------------------------------------ ----- ----- -------- ------------------------- ------- ----- +# CHECK-NEXT:; SV_TessFactor 0 w 0 FinalQuadEdgeTessfactor Float32 w +# CHECK-NEXT:; BBB 0 xyz 0 Undefined Float32 +# CHECK-NEXT:; SV_TessFactor 1 w 1 FinalQuadEdgeTessfactor Float32 w +# CHECK-NEXT:; BBB 1 xyz 1 Undefined Float32 +# CHECK-NEXT:; SV_TessFactor 2 w 2 FinalQuadEdgeTessfactor Float32 w +# CHECK-NEXT:; BBB 2 xyz 2 Undefined Float32 +# CHECK-NEXT:; SV_TessFactor 3 w 3 FinalQuadEdgeTessfactor Float32 w +# CHECK-NEXT:; SV_InsideTessFactor 0 w 4 FinalQuadInsideTessfactor Float32 +# CHECK-NEXT:; SV_InsideTessFactor 1 w 5 FinalQuadInsideTessfactor Float32 +# CHECK-NEXT:; AVeryLongStringThatWillForceWidening 0 xyzw 6 Undefined Float32 z diff --git a/llvm/tools/llvm-objdump/DXContainerDump.cpp b/llvm/tools/llvm-objdump/DXContainerDump.cpp index 2fb073473de5..52963e0f7d1b 100644 --- a/llvm/tools/llvm-objdump/DXContainerDump.cpp +++ b/llvm/tools/llvm-objdump/DXContainerDump.cpp @@ -12,16 +12,152 @@ //===----------------------------------------------------------------------===// #include "llvm-objdump.h" +#include "llvm/BinaryFormat/DXContainer.h" #include "llvm/Object/DXContainer.h" +#include "llvm/Support/ScopedPrinter.h" using namespace llvm; +using namespace llvm::object; + +static llvm::SmallString<4> maskToString(uint8_t 
Mask, + bool StripTrailing = false) { + llvm::SmallString<4> Result(" "); + if (Mask & 1) + Result[0] = 'x'; + if (Mask & 2) + Result[1] = 'y'; + if (Mask & 4) + Result[2] = 'z'; + if (Mask & 8) + Result[3] = 'w'; + if (!StripTrailing) + return Result; + int Size = 8 - countl_zero(Mask); + return Result.slice(0, Size); +} + +static void printColumnHeader(raw_ostream &OS, size_t Length) { + for (size_t I = 0; I < Length; ++I) + OS << "-"; +} + +static void printColumnHeaders(raw_ostream &OS, ArrayRef Lengths) { + // Generate the header in a temporary to avoid trailing whitespace. + SmallString<256> Str; + raw_svector_ostream Tmp(Str); + for (auto L : Lengths) { + printColumnHeader(Tmp, L); + Tmp << " "; + } + Str.back() = '\n'; + OS << Str; +} + +static size_t digitsForNumber(size_t N) { + if (N == 0) + return 1; + return static_cast(log10(static_cast(N))) + 1; +} namespace { class DXContainerDumper : public objdump::Dumper { + const DXContainerObjectFile &Obj; + public: - DXContainerDumper(const object::DXContainerObjectFile &Obj) - : objdump::Dumper(Obj) {} + DXContainerDumper(const DXContainerObjectFile &O) + : objdump::Dumper(O), Obj(O) {} + + void printPrivateHeaders() override; + void printSignature(const DirectX::Signature &S); }; + +void DXContainerDumper::printSignature(const DirectX::Signature &S) { + // DXC prints a table like this as part of the shader disassembly: + //; Name Index Mask Register SysValue Format Used + //; -------------------- ----- ------ -------- -------- ------- ------ + //; NORMAL 0 xyz 0 NONE float xyz + //; TEXCOORD 0 xy 1 NONE float xy + + // DXC's implementation doesn't scale columns entirely completely for the + // provided input, so this implementation is a bit more complicated in + // formatting logic to scale with the size of the printed text. + + // DXC gives names 21 characters for some unknown reason, I arbitrarily chose + // to start at 24 so that we're not going shorter but are using a round + // number. 
+ size_t LongestName = 24; + size_t LongestSV = 10; + size_t LongestIndex = strlen("Index"); + size_t LongestRegister = strlen("Register"); + size_t LongestFormat = strlen("Format"); + const size_t MaskWidth = 5; + // Compute the column widths. Skip calculating the "Mask" and "Used" columns + // since they both have widths of 4. + for (auto El : S) { + LongestName = std::max(LongestName, S.getName(El.NameOffset).size()); + LongestSV = std::max( + LongestSV, + enumToStringRef(El.SystemValue, dxbc::getD3DSystemValues()).size()); + LongestIndex = std::max(LongestIndex, digitsForNumber(El.Index)); + LongestRegister = std::max(LongestRegister, digitsForNumber(El.Register)); + LongestFormat = std::max( + LongestFormat, + enumToStringRef(El.CompType, dxbc::getSigComponentTypes()).size()); + } + + // Print Column headers. + OS << "; "; + OS << left_justify("Name", LongestName) << " "; + OS << right_justify("Index", LongestIndex) << " "; + OS << right_justify("Mask", MaskWidth) << " "; + OS << right_justify("Register", LongestRegister) << " "; + OS << right_justify("SysValue", LongestSV) << " "; + OS << right_justify("Format", LongestFormat) << " "; + OS << right_justify("Used", MaskWidth) << "\n"; + OS << "; "; + printColumnHeaders(OS, {LongestName, LongestIndex, MaskWidth, LongestRegister, + LongestSV, LongestFormat, MaskWidth}); + + for (auto El : S) { + OS << "; " << left_justify(S.getName(El.NameOffset), LongestName) << " "; + OS << right_justify(std::to_string(El.Index), LongestIndex) << " "; + OS << right_justify(maskToString(El.Mask), MaskWidth) << " "; + OS << right_justify(std::to_string(El.Register), LongestRegister) << " "; + OS << right_justify( + enumToStringRef(El.SystemValue, dxbc::getD3DSystemValues()), + LongestSV) + << " "; + OS << right_justify( + enumToStringRef(El.CompType, dxbc::getSigComponentTypes()), + LongestFormat); + if (El.ExclusiveMask) + OS << " " << maskToString(El.ExclusiveMask, true); + OS << "\n"; + } +} + +void 
DXContainerDumper::printPrivateHeaders() { + const DXContainer &C = + cast(Obj).getDXContainer(); + + if (!C.getInputSignature().isEmpty()) { + OS << "; Input signature:\n;\n"; + printSignature(C.getInputSignature()); + OS << ";\n"; + } + + if (!C.getOutputSignature().isEmpty()) { + OS << "; Output signature:\n;\n"; + printSignature(C.getOutputSignature()); + OS << ";\n"; + } + + if (!C.getPatchConstantSignature().isEmpty()) { + OS << "; Patch Constant signature:\n;\n"; + printSignature(C.getPatchConstantSignature()); + OS << ";\n"; + } +} } // namespace std::unique_ptr llvm::objdump::createDXContainerDumper( diff --git a/llvm/tools/llvm-readobj/ELFDumper.cpp b/llvm/tools/llvm-readobj/ELFDumper.cpp index c1a2c38ea9b7..ade025fd3d89 100644 --- a/llvm/tools/llvm-readobj/ELFDumper.cpp +++ b/llvm/tools/llvm-readobj/ELFDumper.cpp @@ -6512,12 +6512,13 @@ void ELFDumper::printSFrameFDEs( { DictScope InfoScope(W, "Info"); - W.printEnum("FRE Type", It->getFREType(), sframe::getFRETypes()); - W.printEnum("FDE Type", It->getFDEType(), sframe::getFDETypes()); + W.printEnum("FRE Type", It->Info.getFREType(), sframe::getFRETypes()); + W.printEnum("FDE Type", It->Info.getFDEType(), sframe::getFDETypes()); switch (Parser.getHeader().ABIArch) { case sframe::ABI::AArch64EndianBig: case sframe::ABI::AArch64EndianLittle: - W.printEnum("PAuth Key", sframe::AArch64PAuthKey(It->getPAuthKey()), + W.printEnum("PAuth Key", + sframe::AArch64PAuthKey(It->Info.getPAuthKey()), sframe::getAArch64PAuthKeys()); break; case sframe::ABI::AMD64EndianLittle: @@ -6525,12 +6526,13 @@ void ELFDumper::printSFrameFDEs( break; } - W.printHex("Raw", It->Info); + W.printHex("Raw", It->Info.Info); } W.printHex( ("Repetitive block size" + - Twine(It->getFDEType() == sframe::FDEType::PCMask ? "" : " (unused)")) + Twine(It->Info.getFDEType() == sframe::FDEType::PCMask ? 
"" + : " (unused)")) .str(), It->RepSize); @@ -6541,10 +6543,11 @@ void ELFDumper::printSFrameFDEs( for (const typename SFrameParser::FrameRowEntry &FRE : Parser.fres(*It, Err)) { DictScope FREScope(W, "Frame Row Entry"); - W.printHex( - "Start Address", - (It->getFDEType() == sframe::FDEType::PCInc ? FDEStartAddress : 0) + - FRE.StartAddress); + W.printHex("Start Address", + (It->Info.getFDEType() == sframe::FDEType::PCInc + ? FDEStartAddress + : 0) + + FRE.StartAddress); W.printBoolean("Return Address Signed", FRE.Info.isReturnAddressSigned()); W.printEnum("Offset Size", FRE.Info.getOffsetSize(), sframe::getFREOffsets()); diff --git a/llvm/unittests/BinaryFormat/MsgPackDocumentTest.cpp b/llvm/unittests/BinaryFormat/MsgPackDocumentTest.cpp index a8db0f1ad0cc..6a6ad7010f62 100644 --- a/llvm/unittests/BinaryFormat/MsgPackDocumentTest.cpp +++ b/llvm/unittests/BinaryFormat/MsgPackDocumentTest.cpp @@ -22,12 +22,58 @@ TEST(MsgPackDocument, DocNodeTest) { ASSERT_TRUE(Str1 == Str2); } -TEST(MsgPackDocument, TestReadInt) { - Document Doc; - bool Ok = Doc.readFromBlob(StringRef("\xd0\x00", 2), /*Multi=*/false); +TEST(MsgPackDocument, TestReadBoolean) { + Document Doc1; + bool Ok = Doc1.readFromBlob(StringRef("\xC2", 1), /*Multi=*/false); ASSERT_TRUE(Ok); - ASSERT_EQ(Doc.getRoot().getKind(), Type::Int); - ASSERT_EQ(Doc.getRoot().getInt(), 0); + ASSERT_EQ(Doc1.getRoot().getKind(), Type::Boolean); + ASSERT_EQ(Doc1.getRoot().getBool(), false); + Document Doc2; + Ok = Doc2.readFromBlob(StringRef("\xC3", 1), /*Multi=*/false); + ASSERT_TRUE(Ok); + ASSERT_EQ(Doc2.getRoot().getKind(), Type::Boolean); + ASSERT_EQ(Doc2.getRoot().getBool(), true); +} + +TEST(MsgPackDocument, TestReadInt) { + Document Doc1; + bool Ok = Doc1.readFromBlob(StringRef("\xD0\x00", 2), /*Multi=*/false); + ASSERT_TRUE(Ok); + ASSERT_EQ(Doc1.getRoot().getKind(), Type::Int); + ASSERT_EQ(Doc1.getRoot().getInt(), 0); + Document Doc2; + Ok = Doc2.readFromBlob(StringRef("\xFF", 1), /*Multi=*/false); + ASSERT_TRUE(Ok); 
+ ASSERT_EQ(Doc2.getRoot().getKind(), Type::Int); + ASSERT_EQ(Doc2.getRoot().getInt(), -1); +} + +TEST(MsgPackDocument, TestReadUInt) { + Document Doc1; + bool Ok = Doc1.readFromBlob(StringRef("\xCC\x00", 2), /*Multi=*/false); + ASSERT_TRUE(Ok); + ASSERT_EQ(Doc1.getRoot().getKind(), Type::UInt); + ASSERT_EQ(Doc1.getRoot().getUInt(), 0U); + Document Doc2; + Ok = Doc2.readFromBlob(StringRef("\x01", 1), /*Multi=*/false); + ASSERT_TRUE(Ok); + ASSERT_EQ(Doc2.getRoot().getKind(), Type::UInt); + ASSERT_EQ(Doc2.getRoot().getUInt(), 1U); +} + +TEST(MsgPackDocument, TestReadFloat) { + Document Doc1; + bool Ok = + Doc1.readFromBlob(StringRef("\xCA\x3F\x80\x00\x00", 5), /*Multi=*/false); + ASSERT_TRUE(Ok); + ASSERT_EQ(Doc1.getRoot().getKind(), Type::Float); + ASSERT_EQ(Doc1.getRoot().getFloat(), 1.0); + Document Doc2; + Ok = Doc2.readFromBlob(StringRef("\xCB\x48\x3D\x63\x29\xF1\xC3\x5C\xA5", 9), + /*Multi=*/false); + ASSERT_TRUE(Ok); + ASSERT_EQ(Doc2.getRoot().getKind(), Type::Float); + ASSERT_EQ(Doc2.getRoot().getFloat(), 1e40); } TEST(MsgPackDocument, TestReadBinary) { @@ -192,12 +238,54 @@ TEST(MsgPackDocument, TestReadMergeMap) { ASSERT_EQ(BayS.getInt(), 8); } +TEST(MsgPackDocument, TestWriteBoolean) { + Document Doc; + Doc.getRoot() = true; + std::string Buffer; + Doc.writeToBlob(Buffer); + ASSERT_EQ(Buffer, "\xc3"); + Doc.getRoot() = false; + Doc.writeToBlob(Buffer); + ASSERT_EQ(Buffer, "\xc2"); +} + TEST(MsgPackDocument, TestWriteInt) { Document Doc; Doc.getRoot() = 1; std::string Buffer; Doc.writeToBlob(Buffer); ASSERT_EQ(Buffer, "\x01"); + Doc.getRoot() = -1; + Doc.writeToBlob(Buffer); + ASSERT_EQ(Buffer, "\xFF"); + Doc.getRoot() = -4096; + Doc.writeToBlob(Buffer); + ASSERT_EQ(Buffer, StringRef("\xD1\xF0\x00", 3)); +} + +TEST(MsgPackDocument, TestWriteUInt) { + Document Doc; + Doc.getRoot() = 1U; + std::string Buffer; + Doc.writeToBlob(Buffer); + ASSERT_EQ(Buffer, "\x01"); + Doc.getRoot() = 4096U; + Doc.writeToBlob(Buffer); + ASSERT_EQ(Buffer, 
StringRef("\xCD\x10\x00", 3)); +} + +TEST(MsgPackDocument, TestWriteFloat) { + Document Doc; + Doc.getRoot() = 1.0; + std::string Buffer; + Doc.writeToBlob(Buffer); + ASSERT_EQ(Buffer, StringRef("\xCA\x3F\x80\x00\x00", 5)); + Doc.getRoot() = 1.0f; + Doc.writeToBlob(Buffer); + ASSERT_EQ(Buffer, StringRef("\xCA\x3F\x80\x00\x00", 5)); + Doc.getRoot() = 1e40; + Doc.writeToBlob(Buffer); + ASSERT_EQ(Buffer, "\xCB\x48\x3D\x63\x29\xF1\xC3\x5C\xA5"); } TEST(MsgPackDocument, TestWriteBinary) { diff --git a/llvm/unittests/BinaryFormat/SFrameTest.cpp b/llvm/unittests/BinaryFormat/SFrameTest.cpp index 394e382e041e..ab7b0fe20b75 100644 --- a/llvm/unittests/BinaryFormat/SFrameTest.cpp +++ b/llvm/unittests/BinaryFormat/SFrameTest.cpp @@ -54,28 +54,28 @@ TYPED_TEST_SUITE(SFrameTest, Types, NameGenerator); TYPED_TEST(SFrameTest, FDEFlags) { FuncDescEntry FDE = {}; - EXPECT_EQ(FDE.Info, 0u); - EXPECT_EQ(FDE.getPAuthKey(), 0); - EXPECT_EQ(FDE.getFDEType(), FDEType::PCInc); - EXPECT_EQ(FDE.getFREType(), FREType::Addr1); + EXPECT_EQ(FDE.Info.Info, 0u); + EXPECT_EQ(FDE.Info.getPAuthKey(), 0); + EXPECT_EQ(FDE.Info.getFDEType(), FDEType::PCInc); + EXPECT_EQ(FDE.Info.getFREType(), FREType::Addr1); - FDE.setPAuthKey(1); - EXPECT_EQ(FDE.Info, 0x20u); - EXPECT_EQ(FDE.getPAuthKey(), 1); - EXPECT_EQ(FDE.getFDEType(), FDEType::PCInc); - EXPECT_EQ(FDE.getFREType(), FREType::Addr1); + FDE.Info.setPAuthKey(1); + EXPECT_EQ(FDE.Info.Info, 0x20u); + EXPECT_EQ(FDE.Info.getPAuthKey(), 1); + EXPECT_EQ(FDE.Info.getFDEType(), FDEType::PCInc); + EXPECT_EQ(FDE.Info.getFREType(), FREType::Addr1); - FDE.setFDEType(FDEType::PCMask); - EXPECT_EQ(FDE.Info, 0x30u); - EXPECT_EQ(FDE.getPAuthKey(), 1); - EXPECT_EQ(FDE.getFDEType(), FDEType::PCMask); - EXPECT_EQ(FDE.getFREType(), FREType::Addr1); + FDE.Info.setFDEType(FDEType::PCMask); + EXPECT_EQ(FDE.Info.Info, 0x30u); + EXPECT_EQ(FDE.Info.getPAuthKey(), 1); + EXPECT_EQ(FDE.Info.getFDEType(), FDEType::PCMask); + EXPECT_EQ(FDE.Info.getFREType(), FREType::Addr1); - 
FDE.setFREType(FREType::Addr4); - EXPECT_EQ(FDE.Info, 0x32u); - EXPECT_EQ(FDE.getPAuthKey(), 1); - EXPECT_EQ(FDE.getFDEType(), FDEType::PCMask); - EXPECT_EQ(FDE.getFREType(), FREType::Addr4); + FDE.Info.setFREType(FREType::Addr4); + EXPECT_EQ(FDE.Info.Info, 0x32u); + EXPECT_EQ(FDE.Info.getPAuthKey(), 1); + EXPECT_EQ(FDE.Info.getFDEType(), FDEType::PCMask); + EXPECT_EQ(FDE.Info.getFREType(), FREType::Addr4); } TYPED_TEST(SFrameTest, FREFlags) { diff --git a/llvm/unittests/CAS/ObjectStoreTest.cpp b/llvm/unittests/CAS/ObjectStoreTest.cpp index b3c408758a00..e84e30374c9a 100644 --- a/llvm/unittests/CAS/ObjectStoreTest.cpp +++ b/llvm/unittests/CAS/ObjectStoreTest.cpp @@ -269,7 +269,8 @@ TEST_P(CASTest, NodesBig) { ASSERT_THAT_ERROR(CAS->validate(CAS->getID(ID)), Succeeded()); } -#if LLVM_ENABLE_THREADS +// FIXME: Re-enable the test. +#if 0 /// Common test functionality for creating blobs in parallel. You can vary which /// cas instances are the same or different, and the size of the created blobs. static void testBlobsParallel(ObjectStore &Read1, ObjectStore &Read2, diff --git a/llvm/unittests/CAS/ProgramTest.cpp b/llvm/unittests/CAS/ProgramTest.cpp index efb3b98d970a..578ccf85e1df 100644 --- a/llvm/unittests/CAS/ProgramTest.cpp +++ b/llvm/unittests/CAS/ProgramTest.cpp @@ -1,4 +1,4 @@ -//===- MappedFileRegionBumpPtrTest.cpp ------------------------------------===// +//===----------------------------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
diff --git a/llvm/unittests/Frontend/HLSLRootSignatureDumpTest.cpp b/llvm/unittests/Frontend/HLSLRootSignatureDumpTest.cpp index 98b33fdfb8c1..1eb03f16527e 100644 --- a/llvm/unittests/Frontend/HLSLRootSignatureDumpTest.cpp +++ b/llvm/unittests/Frontend/HLSLRootSignatureDumpTest.cpp @@ -10,12 +10,13 @@ #include "gtest/gtest.h" using namespace llvm::hlsl::rootsig; +using llvm::dxil::ResourceClass; namespace { TEST(HLSLRootSignatureTest, DescriptorCBVClauseDump) { DescriptorTableClause Clause; - Clause.Type = ClauseType::CBuffer; + Clause.Type = ResourceClass::CBuffer; Clause.Reg = {RegisterType::BReg, 0}; Clause.setDefaultFlags(llvm::dxbc::RootSignatureVersion::V1_1); @@ -32,7 +33,7 @@ TEST(HLSLRootSignatureTest, DescriptorCBVClauseDump) { TEST(HLSLRootSignatureTest, DescriptorSRVClauseDump) { DescriptorTableClause Clause; - Clause.Type = ClauseType::SRV; + Clause.Type = ResourceClass::SRV; Clause.Reg = {RegisterType::TReg, 0}; Clause.NumDescriptors = NumDescriptorsUnbounded; Clause.Space = 42; @@ -52,7 +53,7 @@ TEST(HLSLRootSignatureTest, DescriptorSRVClauseDump) { TEST(HLSLRootSignatureTest, DescriptorUAVClauseDump) { using llvm::dxbc::DescriptorRangeFlags; DescriptorTableClause Clause; - Clause.Type = ClauseType::UAV; + Clause.Type = ResourceClass::UAV; Clause.Reg = {RegisterType::UReg, 92374}; Clause.NumDescriptors = 3298; Clause.Space = 932847; @@ -82,7 +83,7 @@ TEST(HLSLRootSignatureTest, DescriptorUAVClauseDump) { TEST(HLSLRootSignatureTest, DescriptorSamplerClauseDump) { DescriptorTableClause Clause; - Clause.Type = ClauseType::Sampler; + Clause.Type = ResourceClass::Sampler; Clause.Reg = {RegisterType::SReg, 0}; Clause.NumDescriptors = 2; Clause.Space = 42; @@ -102,7 +103,7 @@ TEST(HLSLRootSignatureTest, DescriptorSamplerClauseDump) { TEST(HLSLRootSignatureTest, DescriptorCBVV10ClauseDump) { DescriptorTableClause Clause; - Clause.Type = ClauseType::CBuffer; + Clause.Type = ResourceClass::CBuffer; Clause.Reg = {RegisterType::BReg, 0}; 
Clause.setDefaultFlags(llvm::dxbc::RootSignatureVersion::V1_0); @@ -119,7 +120,7 @@ TEST(HLSLRootSignatureTest, DescriptorCBVV10ClauseDump) { TEST(HLSLRootSignatureTest, DescriptorSamplerV10ClauseDump) { DescriptorTableClause Clause; - Clause.Type = ClauseType::Sampler; + Clause.Type = ResourceClass::Sampler; Clause.Reg = {RegisterType::SReg, 0}; Clause.setDefaultFlags(llvm::dxbc::RootSignatureVersion::V1_0); @@ -151,7 +152,7 @@ TEST(HLSLRootSignatureTest, DescriptorTableDump) { TEST(HLSLRootSignatureTest, RootCBVDump) { RootDescriptor Descriptor; - Descriptor.Type = DescriptorType::CBuffer; + Descriptor.Type = ResourceClass::CBuffer; Descriptor.Reg = {RegisterType::BReg, 0}; Descriptor.setDefaultFlags(llvm::dxbc::RootSignatureVersion::V1_1); @@ -168,7 +169,7 @@ TEST(HLSLRootSignatureTest, RootCBVDump) { TEST(HLSLRootSignatureTest, RootSRV10Dump) { RootDescriptor Descriptor; - Descriptor.Type = DescriptorType::SRV; + Descriptor.Type = ResourceClass::SRV; Descriptor.Reg = {RegisterType::TReg, 0}; Descriptor.setDefaultFlags(llvm::dxbc::RootSignatureVersion::V1_0); @@ -185,7 +186,7 @@ TEST(HLSLRootSignatureTest, RootSRV10Dump) { TEST(HLSLRootSignatureTest, RootUAVV10Dump) { RootDescriptor Descriptor; - Descriptor.Type = DescriptorType::UAV; + Descriptor.Type = ResourceClass::UAV; Descriptor.Reg = {RegisterType::UReg, 0}; Descriptor.setDefaultFlags(llvm::dxbc::RootSignatureVersion::V1_0); @@ -202,7 +203,7 @@ TEST(HLSLRootSignatureTest, RootUAVV10Dump) { TEST(HLSLRootSignatureTest, RootSRVDump) { RootDescriptor Descriptor; - Descriptor.Type = DescriptorType::SRV; + Descriptor.Type = ResourceClass::SRV; Descriptor.Reg = {RegisterType::TReg, 0}; Descriptor.Space = 42; Descriptor.Visibility = llvm::dxbc::ShaderVisibility::Geometry; @@ -221,7 +222,7 @@ TEST(HLSLRootSignatureTest, RootSRVDump) { TEST(HLSLRootSignatureTest, RootUAVDump) { using llvm::dxbc::RootDescriptorFlags; RootDescriptor Descriptor; - Descriptor.Type = DescriptorType::UAV; + Descriptor.Type = 
ResourceClass::UAV; Descriptor.Reg = {RegisterType::UReg, 92374}; Descriptor.Space = 932847; Descriptor.Visibility = llvm::dxbc::ShaderVisibility::Hull; diff --git a/llvm/unittests/Target/AArch64/SMEAttributesTest.cpp b/llvm/unittests/Target/AArch64/SMEAttributesTest.cpp index f13252f3a4c2..e90f733d79fc 100644 --- a/llvm/unittests/Target/AArch64/SMEAttributesTest.cpp +++ b/llvm/unittests/Target/AArch64/SMEAttributesTest.cpp @@ -78,7 +78,7 @@ TEST(SMEAttributes, Constructors) { "ret void\n}"); CallBase &Call = cast((CallModule->getFunction("foo")->begin()->front())); - ASSERT_TRUE(SMECallAttrs(Call).callsite().hasUndefZT0()); + ASSERT_TRUE(SMECallAttrs(Call, nullptr).callsite().hasUndefZT0()); // Invalid combinations. EXPECT_DEBUG_DEATH(SA(SA::SM_Enabled | SA::SM_Compatible), diff --git a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp index 319538eaea13..ff0af566b05b 100644 --- a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp +++ b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp @@ -1099,6 +1099,7 @@ R"(All available -march extensions for RISC-V smcdeleg 1.0 smcntrpmf 1.0 smcsrind 1.0 + smctr 1.0 smdbltrp 1.0 smepmp 1.0 smmpm 1.0 @@ -1111,6 +1112,7 @@ R"(All available -march extensions for RISC-V sscofpmf 1.0 sscounterenw 1.0 sscsrind 1.0 + ssctr 1.0 ssdbltrp 1.0 ssnpm 1.0 sspm 1.0 @@ -1163,6 +1165,7 @@ R"(All available -march extensions for RISC-V xsfvqmaccqoq 1.0 xsifivecdiscarddlone 1.0 xsifivecflushdlone 1.0 + xsmtvdot 1.0 xtheadba 1.0 xtheadbb 1.0 xtheadbs 1.0 @@ -1185,8 +1188,6 @@ Experimental extensions zvbc32e 0.7 zvkgs 0.7 zvqdotq 0.0 - smctr 1.0 - ssctr 1.0 svukte 0.3 xqccmp 0.3 xqcia 0.7 diff --git a/llvm/utils/TableGen/DecoderEmitter.cpp b/llvm/utils/TableGen/DecoderEmitter.cpp index b22c60a00081..238c87a196ea 100644 --- a/llvm/utils/TableGen/DecoderEmitter.cpp +++ b/llvm/utils/TableGen/DecoderEmitter.cpp @@ -192,6 +192,17 @@ struct DecoderTableInfo { DecoderSet Decoders; bool isOutermostScope() 
const { return FixupStack.size() == 1; } + + void pushScope() { FixupStack.emplace_back(); } + + void popScope() { + // Resolve any remaining fixups in the current scope before popping it. + // All fixups resolve to the current location. + uint32_t DestIdx = Table.size(); + for (uint32_t FixupIdx : FixupStack.back()) + Table.patchNumToSkip(FixupIdx, DestIdx); + FixupStack.pop_back(); + } }; struct EncodingAndInst { @@ -204,16 +215,6 @@ struct EncodingAndInst { : EncodingDef(EncodingDef), Inst(Inst), HwModeName(HwModeName) {} }; -struct EncodingIDAndOpcode { - unsigned EncodingID; - unsigned Opcode; - - EncodingIDAndOpcode() : EncodingID(0), Opcode(0) {} - EncodingIDAndOpcode(unsigned EncodingID, unsigned Opcode) - : EncodingID(EncodingID), Opcode(Opcode) {} -}; - -using EncodingIDsVec = std::vector; using NamespacesHwModesMap = std::map>; class DecoderEmitter { @@ -224,11 +225,13 @@ public: DecoderEmitter(const RecordKeeper &R, StringRef PredicateNamespace) : RK(R), Target(R), PredicateNamespace(PredicateNamespace) {} + const CodeGenTarget &getTarget() const { return Target; } + // Emit the decoder state machine table. Returns a mask of MCD decoder ops // that were emitted. unsigned emitTable(formatted_raw_ostream &OS, DecoderTable &Table, unsigned BitWidth, StringRef Namespace, - const EncodingIDsVec &EncodingIDs) const; + ArrayRef EncodingIDs) const; void emitInstrLenTable(formatted_raw_ostream &OS, ArrayRef InstrLen) const; void emitPredicateFunction(formatted_raw_ostream &OS, @@ -308,8 +311,8 @@ static raw_ostream &operator<<(raw_ostream &OS, const EncodingAndInst &Value) { } // Prints the bit value for each position. 
-static void dumpBits(raw_ostream &OS, const BitsInit &Bits) { - for (const Init *Bit : reverse(Bits.getBits())) +static void dumpBits(raw_ostream &OS, const BitsInit &Bits, unsigned BitWidth) { + for (const Init *Bit : reverse(Bits.getBits().take_front(BitWidth))) OS << BitValue(Bit); } @@ -337,10 +340,25 @@ static const BitsInit &getBitsField(const Record &Def, StringRef FieldName) { // Representation of the instruction to work on. typedef std::vector insn_t; -namespace { +/// Extracts a NumBits long field from Insn, starting from StartBit. +/// Returns the value of the field if all bits are well-known, +/// otherwise std::nullopt. +static std::optional +fieldFromInsn(const insn_t &Insn, unsigned StartBit, unsigned NumBits) { + uint64_t Field = 0; -static constexpr uint64_t NO_FIXED_SEGMENTS_SENTINEL = - std::numeric_limits::max(); + for (unsigned BitIndex = 0; BitIndex < NumBits; ++BitIndex) { + if (Insn[StartBit + BitIndex] == BitValue::BIT_UNSET) + return std::nullopt; + + if (Insn[StartBit + BitIndex] == BitValue::BIT_TRUE) + Field = Field | (1ULL << BitIndex); + } + + return Field; +} + +namespace { class FilterChooser; @@ -360,16 +378,16 @@ class FilterChooser; /// /// An example of a conflict is /// -/// Conflict: -/// 111101000.00........00010000.... -/// 111101000.00........0001........ -/// 1111010...00........0001........ -/// 1111010...00.................... -/// 1111010......................... -/// 1111............................ -/// ................................ -/// VST4q8a 111101000_00________00010000____ -/// VST4q8b 111101000_00________00010000____ +/// Decoding Conflict: +/// ................................ +/// 1111............................ +/// 1111010......................... +/// 1111010...00.................... +/// 1111010...00........0001........ +/// 111101000.00........0001........ +/// 111101000.00........00010000.... 
+/// 111101000_00________00010000____ VST4q8a +/// 111101000_00________00010000____ VST4q8b /// /// The Debug output shows the path that the decoding tree follows to reach the /// the conclusion that there is a conflict. VST4q8a is a vst4 to double-spaced @@ -387,20 +405,20 @@ protected: unsigned NumBits; // number of bits to filter // Map of well-known segment value to the set of uid's with that value. - std::map> FilteredInstructions; + std::map> FilteredIDs; // Set of uid's with non-constant segment values. - std::vector VariableInstructions; + std::vector VariableIDs; // Map of well-known segment value to its delegate. std::map> FilterChooserMap; + // A filter chooser for encodings that contain some '?' in the filtered range. + std::unique_ptr VariableFC; + // Number of instructions which fall under FilteredInstructions category. unsigned NumFiltered; - // Keeps track of the last opcode in the filtered bucket. - EncodingIDAndOpcode LastOpcFiltered; - public: Filter(Filter &&f); Filter(const FilterChooser &owner, unsigned startBit, unsigned numBits); @@ -409,16 +427,16 @@ public: unsigned getNumFiltered() const { return NumFiltered; } - EncodingIDAndOpcode getSingletonOpc() const { + unsigned getSingletonEncodingID() const { assert(NumFiltered == 1); - return LastOpcFiltered; + return FilteredIDs.begin()->second.front(); } // Return the filter chooser for the group of instructions without constant // segment values. const FilterChooser &getVariableFC() const { - assert(NumFiltered == 1 && FilterChooserMap.size() == 1); - return *(FilterChooserMap.find(NO_FIXED_SEGMENTS_SENTINEL)->second); + assert(NumFiltered == 1 && FilterChooserMap.empty()); + return *VariableFC; } // Divides the decoding task into sub tasks and delegates them to the @@ -472,15 +490,13 @@ protected: ArrayRef AllInstructions; // Vector of uid's for this filter chooser to work on. 
- // The first member of the pair is the opcode id being decoded, the second is - // the opcode id that should be emitted. - ArrayRef Opcodes; + ArrayRef EncodingIDs; // Lookup table for the operand decoding of instructions. const std::map> &Operands; - // Vector of candidate filters. - std::vector Filters; + // The selected filter, if any. + std::unique_ptr BestFilter; // Array of bit values passed down from our parent. // Set to all BIT_UNFILTERED's for Parent == NULL. @@ -489,9 +505,6 @@ protected: // Links to the FilterChooser above us in the decoding tree. const FilterChooser *Parent; - // Index of the best filter from Filters. - int BestIndex; - // Width of instructions unsigned BitWidth; @@ -505,23 +518,21 @@ protected: }; public: - FilterChooser(ArrayRef Insts, - ArrayRef IDs, + FilterChooser(ArrayRef Insts, ArrayRef EncodingIDs, const std::map> &Ops, unsigned BW, const DecoderEmitter *E) - : AllInstructions(Insts), Opcodes(IDs), Operands(Ops), + : AllInstructions(Insts), EncodingIDs(EncodingIDs), Operands(Ops), FilterBitValues(BW, BitValue::BIT_UNFILTERED), Parent(nullptr), - BestIndex(-1), BitWidth(BW), Emitter(E) { + BitWidth(BW), Emitter(E) { doFilter(); } - FilterChooser(ArrayRef Insts, - ArrayRef IDs, + FilterChooser(ArrayRef Insts, ArrayRef EncodingIDs, const std::map> &Ops, const std::vector &ParentFilterBitValues, const FilterChooser &parent) - : AllInstructions(Insts), Opcodes(IDs), Operands(Ops), - FilterBitValues(ParentFilterBitValues), Parent(&parent), BestIndex(-1), + : AllInstructions(Insts), EncodingIDs(EncodingIDs), Operands(Ops), + FilterBitValues(ParentFilterBitValues), Parent(&parent), BitWidth(parent.BitWidth), Emitter(parent.Emitter) { doFilter(); } @@ -553,27 +564,13 @@ protected: return Insn; } - // Populates the field of the insn given the start position and the number of - // consecutive bits to scan for. 
- // - // Returns a pair of values (indicator, field), where the indicator is false - // if there exists any uninitialized bit value in the range and true if all - // bits are well-known. The second value is the potentially populated field. - std::pair fieldFromInsn(const insn_t &Insn, unsigned StartBit, - unsigned NumBits) const; - /// dumpFilterArray - dumpFilterArray prints out debugging info for the given /// filter array as a series of chars. void dumpFilterArray(raw_ostream &OS, ArrayRef Filter) const; /// dumpStack - dumpStack traverses the filter chooser chain and calls /// dumpFilterArray on each filter chooser up to the top level one. - void dumpStack(raw_ostream &OS, const char *prefix) const; - - Filter &bestFilter() { - assert(BestIndex != -1 && "BestIndex not set"); - return Filters[BestIndex]; - } + void dumpStack(raw_ostream &OS, indent Indent) const; bool PositionFiltered(unsigned Idx) const { return FilterBitValues[Idx].isSet(); @@ -599,7 +596,7 @@ protected: // Emits table entries to decode the singleton. void emitSingletonTableEntry(DecoderTableInfo &TableInfo, - EncodingIDAndOpcode Opc) const; + unsigned EncodingID) const; // Emits code to decode the singleton, and then to decode the rest. void emitSingletonTableEntry(DecoderTableInfo &TableInfo, @@ -617,8 +614,8 @@ protected: // reportRegion is a helper function for filterProcessor to mark a region as // eligible for use as a filter region. - void reportRegion(bitAttr_t RA, unsigned StartBit, unsigned BitIndex, - bool AllowMixed); + void reportRegion(std::vector> &Filters, bitAttr_t RA, + unsigned StartBit, unsigned BitIndex, bool AllowMixed); // FilterProcessor scans the well-known encoding bits of the instructions and // builds up a list of candidate filters. 
It chooses the best filter and @@ -647,39 +644,37 @@ public: Filter::Filter(Filter &&f) : Owner(f.Owner), StartBit(f.StartBit), NumBits(f.NumBits), - FilteredInstructions(std::move(f.FilteredInstructions)), - VariableInstructions(std::move(f.VariableInstructions)), + FilteredIDs(std::move(f.FilteredIDs)), + VariableIDs(std::move(f.VariableIDs)), FilterChooserMap(std::move(f.FilterChooserMap)), - NumFiltered(f.NumFiltered), LastOpcFiltered(f.LastOpcFiltered) {} + VariableFC(std::move(f.VariableFC)), NumFiltered(f.NumFiltered) {} Filter::Filter(const FilterChooser &owner, unsigned startBit, unsigned numBits) : Owner(owner), StartBit(startBit), NumBits(numBits) { assert(StartBit + NumBits - 1 < Owner.BitWidth); NumFiltered = 0; - LastOpcFiltered = {0, 0}; - for (const auto &OpcPair : Owner.Opcodes) { + for (unsigned EncodingID : Owner.EncodingIDs) { // Populates the insn given the uid. - insn_t Insn = Owner.insnWithID(OpcPair.EncodingID); + insn_t Insn = Owner.insnWithID(EncodingID); // Scans the segment for possibly well-specified encoding bits. - auto [Ok, Field] = Owner.fieldFromInsn(Insn, StartBit, NumBits); + std::optional Field = fieldFromInsn(Insn, StartBit, NumBits); - if (Ok) { + if (Field) { // The encoding bits are well-known. Lets add the uid of the // instruction into the bucket keyed off the constant field value. - LastOpcFiltered = OpcPair; - FilteredInstructions[Field].push_back(LastOpcFiltered); + FilteredIDs[*Field].push_back(EncodingID); ++NumFiltered; } else { // Some of the encoding bit(s) are unspecified. This contributes to // one additional member of "Variable" instructions. 
- VariableInstructions.push_back(OpcPair); + VariableIDs.push_back(EncodingID); } } - assert((FilteredInstructions.size() + VariableInstructions.size() > 0) && + assert((FilteredIDs.size() + VariableIDs.size() > 0) && "Filter returns no instruction categories"); } @@ -693,52 +688,41 @@ void Filter::recurse() { // Starts by inheriting our parent filter chooser's filter bit values. std::vector BitValueArray(Owner.FilterBitValues); - if (!VariableInstructions.empty()) { - // Conservatively marks each segment position as BIT_UNSET. + if (!VariableIDs.empty()) { for (unsigned bitIndex = 0; bitIndex < NumBits; ++bitIndex) - BitValueArray[StartBit + bitIndex] = BitValue::BIT_UNSET; + BitValueArray[StartBit + bitIndex] = BitValue::BIT_UNFILTERED; // Delegates to an inferior filter chooser for further processing on this // group of instructions whose segment values are variable. - FilterChooserMap.try_emplace( - NO_FIXED_SEGMENTS_SENTINEL, - std::make_unique(Owner.AllInstructions, - VariableInstructions, Owner.Operands, - BitValueArray, Owner)); + VariableFC = + std::make_unique(Owner.AllInstructions, VariableIDs, + Owner.Operands, BitValueArray, Owner); } // No need to recurse for a singleton filtered instruction. // See also Filter::emit*(). if (getNumFiltered() == 1) { - assert(FilterChooserMap.size() == 1); + assert(VariableFC && "Shouldn't have created a filter for one encoding!"); return; } // Otherwise, create sub choosers. - for (const auto &Inst : FilteredInstructions) { + for (const auto &[FilterVal, EncodingIDs] : FilteredIDs) { // Marks all the segment positions with either BIT_TRUE or BIT_FALSE. for (unsigned bitIndex = 0; bitIndex < NumBits; ++bitIndex) - BitValueArray[StartBit + bitIndex] = Inst.first & (1ULL << bitIndex) + BitValueArray[StartBit + bitIndex] = FilterVal & (1ULL << bitIndex) ? BitValue::BIT_TRUE : BitValue::BIT_FALSE; // Delegates to an inferior filter chooser for further processing on this // category of instructions. 
FilterChooserMap.try_emplace( - Inst.first, - std::make_unique(Owner.AllInstructions, Inst.second, + FilterVal, + std::make_unique(Owner.AllInstructions, EncodingIDs, Owner.Operands, BitValueArray, Owner)); } } -static void resolveTableFixups(DecoderTable &Table, const FixupList &Fixups, - uint32_t DestIdx) { - // Any NumToSkip fixups in the current scope can resolve to the - // current location. - for (uint32_t FixupIdx : Fixups) - Table.patchNumToSkip(FixupIdx, DestIdx); -} - // Emit table entries to decode instructions given a segment or segments // of bits. void Filter::emitTableEntry(DecoderTableInfo &TableInfo) const { @@ -748,52 +732,30 @@ void Filter::emitTableEntry(DecoderTableInfo &TableInfo) const { TableInfo.Table.insertULEB128(StartBit); TableInfo.Table.push_back(NumBits); - // If the NO_FIXED_SEGMENTS_SENTINEL is present, we need to add a new scope - // for this filter. Otherwise, we can skip adding a new scope and any - // patching added will automatically be added to the enclosing scope. - - // If NO_FIXED_SEGMENTS_SENTINEL is present, it will be last entry in - // FilterChooserMap. - + // If VariableFC is present, we need to add a new scope for this filter. + // Otherwise, we can skip adding a new scope and any patching added will + // automatically be added to the enclosing scope. const uint64_t LastFilter = FilterChooserMap.rbegin()->first; - bool HasFallthrough = LastFilter == NO_FIXED_SEGMENTS_SENTINEL; - if (HasFallthrough) + if (VariableFC) TableInfo.FixupStack.emplace_back(); DecoderTable &Table = TableInfo.Table; size_t PrevFilter = 0; for (const auto &[FilterVal, Delegate] : FilterChooserMap) { - // Field value NO_FIXED_SEGMENTS_SENTINEL implies a non-empty set of - // variable instructions. See also recurse(). - if (FilterVal == NO_FIXED_SEGMENTS_SENTINEL) { - // Each scope should always have at least one filter value to check - // for. 
- assert(PrevFilter != 0 && "empty filter set!"); - FixupList &CurScope = TableInfo.FixupStack.back(); - // Resolve any NumToSkip fixups in the current scope. - resolveTableFixups(Table, CurScope, Table.size()); - - // Delete the scope we have added here. - TableInfo.FixupStack.pop_back(); - - PrevFilter = 0; // Don't re-process the filter's fallthrough. + // The last filtervalue emitted can be OPC_FilterValue if we are at + // outermost scope. + const uint8_t DecoderOp = + FilterVal == LastFilter && TableInfo.isOutermostScope() + ? MCD::OPC_FilterValueOrFail + : MCD::OPC_FilterValue; + Table.push_back(DecoderOp); + Table.insertULEB128(FilterVal); + if (DecoderOp == MCD::OPC_FilterValue) { + // Reserve space for the NumToSkip entry. We'll backpatch the value later. + PrevFilter = Table.insertNumToSkip(); } else { - // The last filtervalue emitted can be OPC_FilterValue if we are at - // outermost scope. - const uint8_t DecoderOp = - FilterVal == LastFilter && TableInfo.isOutermostScope() - ? MCD::OPC_FilterValueOrFail - : MCD::OPC_FilterValue; - Table.push_back(DecoderOp); - Table.insertULEB128(FilterVal); - if (DecoderOp == MCD::OPC_FilterValue) { - // Reserve space for the NumToSkip entry. We'll backpatch the value - // later. - PrevFilter = Table.insertNumToSkip(); - } else { - PrevFilter = 0; - } + PrevFilter = 0; } // We arrive at a category of instructions with the same segment value. @@ -808,6 +770,16 @@ void Filter::emitTableEntry(DecoderTableInfo &TableInfo) const { Table.patchNumToSkip(PrevFilter, Table.size()); } + if (VariableFC) { + // Each scope should always have at least one filter value to check for. + assert(PrevFilter != 0 && "empty filter set!"); + TableInfo.popScope(); + PrevFilter = 0; // Don't re-process the filter's fallthrough. + + // Delegate to the sub filter chooser for further decoding. 
+ VariableFC->emitTableEntries(TableInfo); + } + // If there is no fallthrough and the final filter was not in the outermost // scope, then it must be fixed up according to the enclosing scope rather // than the current position. @@ -818,7 +790,7 @@ void Filter::emitTableEntry(DecoderTableInfo &TableInfo) const { // Returns the number of fanout produced by the filter. More fanout implies // the filter distinguishes more categories of instructions. unsigned Filter::usefulness() const { - return FilteredInstructions.size() + VariableInstructions.empty(); + return FilteredIDs.size() + VariableIDs.empty(); } ////////////////////////////////// @@ -832,14 +804,16 @@ unsigned Filter::usefulness() const { unsigned DecoderEmitter::emitTable(formatted_raw_ostream &OS, DecoderTable &Table, unsigned BitWidth, StringRef Namespace, - const EncodingIDsVec &EncodingIDs) const { + ArrayRef EncodingIDs) const { // We'll need to be able to map from a decoded opcode into the corresponding // EncodingID for this specific combination of BitWidth and Namespace. This // is used below to index into NumberedEncodings. DenseMap OpcodeToEncodingID; OpcodeToEncodingID.reserve(EncodingIDs.size()); - for (const auto &EI : EncodingIDs) - OpcodeToEncodingID[EI.Opcode] = EI.EncodingID; + for (unsigned EncodingID : EncodingIDs) { + const Record *InstDef = NumberedEncodings[EncodingID].Inst->TheDef; + OpcodeToEncodingID[Target.getInstrIntValue(InstDef)] = EncodingID; + } OS << "static const uint8_t DecoderTable" << Namespace << BitWidth << "[] = {\n"; @@ -1115,28 +1089,6 @@ void DecoderEmitter::emitDecoderFunction(formatted_raw_ostream &OS, OS << "}\n"; } -// Populates the field of the insn given the start position and the number of -// consecutive bits to scan for. -// -// Returns a pair of values (indicator, field), where the indicator is false -// if there exists any uninitialized bit value in the range and true if all -// bits are well-known. The second value is the potentially populated field. 
-std::pair FilterChooser::fieldFromInsn(const insn_t &Insn, - unsigned StartBit, - unsigned NumBits) const { - uint64_t Field = 0; - - for (unsigned i = 0; i < NumBits; ++i) { - if (Insn[StartBit + i] == BitValue::BIT_UNSET) - return {false, Field}; - - if (Insn[StartBit + i] == BitValue::BIT_TRUE) - Field = Field | (1ULL << i); - } - - return {true, Field}; -} - /// dumpFilterArray - dumpFilterArray prints out debugging info for the given /// filter array as a series of chars. void FilterChooser::dumpFilterArray(raw_ostream &OS, @@ -1147,15 +1099,12 @@ void FilterChooser::dumpFilterArray(raw_ostream &OS, /// dumpStack - dumpStack traverses the filter chooser chain and calls /// dumpFilterArray on each filter chooser up to the top level one. -void FilterChooser::dumpStack(raw_ostream &OS, const char *prefix) const { - const FilterChooser *current = this; - - while (current) { - OS << prefix; - dumpFilterArray(OS, current->FilterBitValues); - OS << '\n'; - current = current->Parent; - } +void FilterChooser::dumpStack(raw_ostream &OS, indent Indent) const { + if (Parent) + Parent->dumpStack(OS, Indent); + OS << Indent; + dumpFilterArray(OS, FilterBitValues); + OS << '\n'; } // Calculates the island(s) needed to decode the instruction. @@ -1452,14 +1401,14 @@ void FilterChooser::emitSoftFailTableEntry(DecoderTableInfo &TableInfo, // Emits table entries to decode the singleton. void FilterChooser::emitSingletonTableEntry(DecoderTableInfo &TableInfo, - EncodingIDAndOpcode Opc) const { - insn_t Insn = insnWithID(Opc.EncodingID); + unsigned EncodingID) const { + insn_t Insn = insnWithID(EncodingID); // Look for islands of undecoded bits of the singleton. std::vector Islands = getIslands(Insn); // Emit the predicate table entry if one is needed. - emitPredicateTableEntry(TableInfo, Opc.EncodingID); + emitPredicateTableEntry(TableInfo, EncodingID); // Check any additional encoding fields needed. 
for (const Island &Ilnd : reverse(Islands)) { @@ -1484,10 +1433,10 @@ void FilterChooser::emitSingletonTableEntry(DecoderTableInfo &TableInfo, } // Check for soft failure of the match. - emitSoftFailTableEntry(TableInfo, Opc.EncodingID); + emitSoftFailTableEntry(TableInfo, EncodingID); auto [DIdx, HasCompleteDecoder] = - getDecoderIndex(TableInfo.Decoders, Opc.EncodingID); + getDecoderIndex(TableInfo.Decoders, EncodingID); // Produce OPC_Decode or OPC_TryDecode opcode based on the information // whether the instruction decoder is complete or not. If it is complete @@ -1504,7 +1453,8 @@ void FilterChooser::emitSingletonTableEntry(DecoderTableInfo &TableInfo, : MCD::OPC_TryDecode); TableInfo.Table.push_back(DecoderOp); NumEncodingsSupported++; - TableInfo.Table.insertULEB128(Opc.Opcode); + const Record *InstDef = AllInstructions[EncodingID].Inst->TheDef; + TableInfo.Table.insertULEB128(Emitter->getTarget().getInstrIntValue(InstDef)); TableInfo.Table.insertULEB128(DIdx); if (DecoderOp == MCD::OPC_TryDecode) { @@ -1516,17 +1466,11 @@ void FilterChooser::emitSingletonTableEntry(DecoderTableInfo &TableInfo, // Emits table entries to decode the singleton, and then to decode the rest. void FilterChooser::emitSingletonTableEntry(DecoderTableInfo &TableInfo, const Filter &Best) const { - EncodingIDAndOpcode Opc = Best.getSingletonOpc(); - // complex singletons need predicate checks from the first singleton // to refer forward to the variable filterchooser that follows. 
- TableInfo.FixupStack.emplace_back(); - - emitSingletonTableEntry(TableInfo, Opc); - - resolveTableFixups(TableInfo.Table, TableInfo.FixupStack.back(), - TableInfo.Table.size()); - TableInfo.FixupStack.pop_back(); + TableInfo.pushScope(); + emitSingletonTableEntry(TableInfo, Best.getSingletonEncodingID()); + TableInfo.popScope(); Best.getVariableFC().emitTableEntries(TableInfo); } @@ -1534,18 +1478,18 @@ void FilterChooser::emitSingletonTableEntry(DecoderTableInfo &TableInfo, // Assign a single filter and run with it. Top level API client can initialize // with a single filter to start the filtering process. void FilterChooser::runSingleFilter(unsigned startBit, unsigned numBit) { - Filters.clear(); - Filters.emplace_back(*this, startBit, numBit); - BestIndex = 0; // Sole Filter instance to choose from. - bestFilter().recurse(); + BestFilter = std::make_unique(*this, startBit, numBit); + BestFilter->recurse(); } // reportRegion is a helper function for filterProcessor to mark a region as // eligible for use as a filter region. -void FilterChooser::reportRegion(bitAttr_t RA, unsigned StartBit, +void FilterChooser::reportRegion(std::vector> &Filters, + bitAttr_t RA, unsigned StartBit, unsigned BitIndex, bool AllowMixed) { if (AllowMixed ? RA == ATTR_MIXED : RA == ATTR_ALL_SET) - Filters.emplace_back(*this, StartBit, BitIndex - StartBit); + Filters.push_back( + std::make_unique(*this, StartBit, BitIndex - StartBit)); } // FilterProcessor scans the well-known encoding bits of the instructions and @@ -1553,18 +1497,15 @@ void FilterChooser::reportRegion(bitAttr_t RA, unsigned StartBit, // recursively descends down the decoding tree. bool FilterChooser::filterProcessor(ArrayRef BitAttrs, bool AllowMixed, bool Greedy) { - Filters.clear(); - BestIndex = -1; - - assert(Opcodes.size() >= 2 && "Nothing to filter"); + assert(EncodingIDs.size() >= 2 && "Nothing to filter"); // Heuristics. See also doFilter()'s "Heuristics" comment when num of // instructions is 3. 
if (AllowMixed && !Greedy) { - assert(Opcodes.size() == 3); + assert(EncodingIDs.size() == 3); - for (const auto &Opcode : Opcodes) { - insn_t Insn = insnWithID(Opcode.EncodingID); + for (unsigned EncodingID : EncodingIDs) { + insn_t Insn = insnWithID(EncodingID); // Look for islands of undecoded bits of any instruction. std::vector Islands = getIslands(Insn); @@ -1599,6 +1540,7 @@ bool FilterChooser::filterProcessor(ArrayRef BitAttrs, bitAttr_t RA = ATTR_NONE; unsigned StartBit = 0; + std::vector> Filters; for (unsigned BitIndex = 0; BitIndex < BitWidth; ++BitIndex) { bitAttr_t bitAttr = BitAttrs[BitIndex]; @@ -1626,17 +1568,17 @@ bool FilterChooser::filterProcessor(ArrayRef BitAttrs, case ATTR_ALL_SET: switch (bitAttr) { case ATTR_FILTERED: - reportRegion(RA, StartBit, BitIndex, AllowMixed); + reportRegion(Filters, RA, StartBit, BitIndex, AllowMixed); RA = ATTR_NONE; break; case ATTR_ALL_SET: break; case ATTR_ALL_UNSET: - reportRegion(RA, StartBit, BitIndex, AllowMixed); + reportRegion(Filters, RA, StartBit, BitIndex, AllowMixed); RA = ATTR_NONE; break; case ATTR_MIXED: - reportRegion(RA, StartBit, BitIndex, AllowMixed); + reportRegion(Filters, RA, StartBit, BitIndex, AllowMixed); StartBit = BitIndex; RA = ATTR_MIXED; break; @@ -1647,17 +1589,17 @@ bool FilterChooser::filterProcessor(ArrayRef BitAttrs, case ATTR_MIXED: switch (bitAttr) { case ATTR_FILTERED: - reportRegion(RA, StartBit, BitIndex, AllowMixed); + reportRegion(Filters, RA, StartBit, BitIndex, AllowMixed); StartBit = BitIndex; RA = ATTR_NONE; break; case ATTR_ALL_SET: - reportRegion(RA, StartBit, BitIndex, AllowMixed); + reportRegion(Filters, RA, StartBit, BitIndex, AllowMixed); StartBit = BitIndex; RA = ATTR_ALL_SET; break; case ATTR_ALL_UNSET: - reportRegion(RA, StartBit, BitIndex, AllowMixed); + reportRegion(Filters, RA, StartBit, BitIndex, AllowMixed); RA = ATTR_NONE; break; case ATTR_MIXED: @@ -1680,23 +1622,23 @@ bool FilterChooser::filterProcessor(ArrayRef BitAttrs, case ATTR_FILTERED: break; 
case ATTR_ALL_SET: - reportRegion(RA, StartBit, BitWidth, AllowMixed); + reportRegion(Filters, RA, StartBit, BitWidth, AllowMixed); break; case ATTR_ALL_UNSET: break; case ATTR_MIXED: - reportRegion(RA, StartBit, BitWidth, AllowMixed); + reportRegion(Filters, RA, StartBit, BitWidth, AllowMixed); break; } // We have finished with the filter processings. Now it's time to choose // the best performing filter. - BestIndex = 0; + unsigned BestIndex = 0; bool AllUseless = true; unsigned BestScore = 0; for (const auto &[Idx, Filter] : enumerate(Filters)) { - unsigned Usefulness = Filter.usefulness(); + unsigned Usefulness = Filter->usefulness(); if (Usefulness) AllUseless = false; @@ -1707,20 +1649,23 @@ bool FilterChooser::filterProcessor(ArrayRef BitAttrs, } } - if (!AllUseless) - bestFilter().recurse(); + if (AllUseless) + return false; + + BestFilter = std::move(Filters[BestIndex]); + BestFilter->recurse(); + return true; - return !AllUseless; } // end of FilterChooser::filterProcessor(bool) // Decides on the best configuration of filter(s) to use in order to decode // the instructions. A conflict of instructions may occur, in which case we // dump the conflict set to the standard error. void FilterChooser::doFilter() { - assert(!Opcodes.empty() && "FilterChooser created with no instructions"); + assert(!EncodingIDs.empty() && "FilterChooser created with no instructions"); // No filter needed. - if (Opcodes.size() < 2) + if (EncodingIDs.size() < 2) return; // We maintain BIT_WIDTH copies of the bitAttrs automaton. 
@@ -1748,8 +1693,8 @@ void FilterChooser::doFilter() { if (FilterBitValues[BitIndex].isSet()) BitAttrs[BitIndex] = ATTR_FILTERED; - for (const EncodingIDAndOpcode &OpcPair : Opcodes) { - insn_t EncodingBits = insnWithID(OpcPair.EncodingID); + for (unsigned EncodingID : EncodingIDs) { + insn_t EncodingBits = insnWithID(EncodingID); for (unsigned BitIndex = 0; BitIndex < BitWidth; ++BitIndex) { switch (BitAttrs[BitIndex]) { @@ -1786,51 +1731,47 @@ void FilterChooser::doFilter() { // no single instruction for the maximum ATTR_MIXED region Inst{14-4} has a // well-known encoding pattern. In such case, we backtrack and scan for the // the very first consecutive ATTR_ALL_SET region and assign a filter to it. - if (Opcodes.size() == 3 && + if (EncodingIDs.size() == 3 && filterProcessor(BitAttrs, /*AllowMixed=*/true, /*Greedy=*/false)) return; - // If we come to here, the instruction decoding has failed. - // Set the BestIndex to -1 to indicate so. - BestIndex = -1; + // We don't know how to decode these instructions! Dump the + // conflict set and bail. + assert(!BestFilter); + + // Print out useful conflict information for postmortem analysis. + errs() << "Decoding Conflict:\n"; + + // Dump filters. + indent Indent(4); + dumpStack(errs(), Indent); + + // Dump encodings. + for (unsigned EncodingID : EncodingIDs) { + const EncodingAndInst &Enc = AllInstructions[EncodingID]; + errs() << Indent; + dumpBits(errs(), getBitsField(*Enc.EncodingDef, "Inst"), BitWidth); + errs() << " " << Enc << '\n'; + } + PrintFatalError("Decoding conflict encountered"); } // emitTableEntries - Emit state machine entries to decode our share of // instructions. void FilterChooser::emitTableEntries(DecoderTableInfo &TableInfo) const { - if (Opcodes.size() == 1) { + if (EncodingIDs.size() == 1) { // There is only one instruction in the set, which is great! // Call emitSingletonDecoder() to see whether there are any remaining // encodings bits. 
- emitSingletonTableEntry(TableInfo, Opcodes[0]); + emitSingletonTableEntry(TableInfo, EncodingIDs[0]); return; } - // Choose the best filter to do the decodings! - if (BestIndex != -1) { - const Filter &Best = Filters[BestIndex]; - if (Best.getNumFiltered() == 1) - emitSingletonTableEntry(TableInfo, Best); - else - Best.emitTableEntry(TableInfo); - return; - } - - // We don't know how to decode these instructions! Dump the - // conflict set and bail. - - // Print out useful conflict information for postmortem analysis. - errs() << "Decoding Conflict:\n"; - - dumpStack(errs(), "\t\t"); - - for (auto Opcode : Opcodes) { - const EncodingAndInst &Enc = AllInstructions[Opcode.EncodingID]; - errs() << '\t' << Enc << ' '; - dumpBits(errs(), getBitsField(*Enc.EncodingDef, "Inst")); - errs() << '\n'; - } - PrintFatalError("Decoding conflict encountered"); + // Use the best filter to do the decoding! + if (BestFilter->getNumFiltered() == 1) + emitSingletonTableEntry(TableInfo, *BestFilter); + else + BestFilter->emitTableEntry(TableInfo); } static std::string findOperandDecoderMethod(const Record *Record) { @@ -2452,17 +2393,17 @@ static void collectHwModesReferencedForEncodings( NamespacesHwModesMap &NamespacesWithHwModes) { SmallBitVector BV(HWM.getNumModeIds()); for (const auto &MS : HWM.getHwModeSelects()) { - for (const HwModeSelect::PairType &P : MS.second.Items) { - if (P.second->isSubClassOf("InstructionEncoding")) { + for (auto [HwModeID, EncodingDef] : MS.second.Items) { + if (EncodingDef->isSubClassOf("InstructionEncoding")) { std::string DecoderNamespace = - P.second->getValueAsString("DecoderNamespace").str(); - if (P.first == DefaultMode) { + EncodingDef->getValueAsString("DecoderNamespace").str(); + if (HwModeID == DefaultMode) { NamespacesWithHwModes[DecoderNamespace].insert(""); } else { NamespacesWithHwModes[DecoderNamespace].insert( - HWM.getMode(P.first).Name); + HWM.getMode(HwModeID).Name); } - BV.set(P.first); + BV.set(HwModeID); } } } @@ -2546,13 
+2487,13 @@ namespace { const Record *InstDef = NumberedInstruction->TheDef; if (const Record *RV = InstDef->getValueAsOptionalDef("EncodingInfos")) { EncodingInfoByHwMode EBM(RV, HWM); - for (auto &[ModeId, Encoding] : EBM) { + for (auto [HwModeID, EncodingDef] : EBM) { // DecoderTables with DefaultMode should not have any suffix. - if (ModeId == DefaultMode) { - NumberedEncodings.emplace_back(Encoding, NumberedInstruction, ""); + if (HwModeID == DefaultMode) { + NumberedEncodings.emplace_back(EncodingDef, NumberedInstruction, ""); } else { - NumberedEncodings.emplace_back(Encoding, NumberedInstruction, - HWM.getMode(ModeId).Name); + NumberedEncodings.emplace_back(EncodingDef, NumberedInstruction, + HWM.getMode(HwModeID).Name); } } continue; @@ -2568,8 +2509,8 @@ namespace { NumberedAlias, &Target.getInstruction(NumberedAlias->getValueAsDef("AliasOf"))); - std::map, std::vector> - OpcMap; + // Map of (namespace, size) tuple to encoding IDs. + std::map, std::vector> EncMap; std::map> Operands; std::vector InstrLen; bool IsVarLenInst = Target.hasVariableLengthEncodings(); @@ -2608,8 +2549,7 @@ namespace { EncodingDef->getValueAsString("DecoderNamespace").str(); if (!NumberedEncoding.HwModeName.empty()) DecoderNamespace += "_" + NumberedEncoding.HwModeName.str(); - OpcMap[{DecoderNamespace, Size}].emplace_back( - NEI, Target.getInstrIntValue(Def)); + EncMap[{DecoderNamespace, Size}].push_back(NEI); } else { NumEncodingsOmitted++; } @@ -2617,7 +2557,7 @@ namespace { DecoderTableInfo TableInfo; unsigned OpcodeMask = 0; - for (const auto &[NSAndByteSize, EncodingIDs] : OpcMap) { + for (const auto &[NSAndByteSize, EncodingIDs] : EncMap) { const std::string &DecoderNamespace = NSAndByteSize.first; const unsigned BitWidth = 8 * NSAndByteSize.second; // Emit the decoder for this namespace+width combination. @@ -2628,16 +2568,12 @@ namespace { // predicates and decoders themselves, however, are shared across all // decoders to give more opportunities for uniqueing. 
TableInfo.Table.clear(); - TableInfo.FixupStack.clear(); - TableInfo.FixupStack.emplace_back(); + TableInfo.pushScope(); FC.emitTableEntries(TableInfo); // Any NumToSkip fixups in the top level scope can resolve to the // OPC_Fail at the end of the table. - assert(TableInfo.FixupStack.size() == 1 && "fixup stack phasing error!"); - // Resolve any NumToSkip fixups in the current scope. - resolveTableFixups(TableInfo.Table, TableInfo.FixupStack.back(), - TableInfo.Table.size()); - TableInfo.FixupStack.clear(); + assert(TableInfo.isOutermostScope() && "fixup stack phasing error!"); + TableInfo.popScope(); TableInfo.Table.push_back(MCD::OPC_Fail); diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/misc/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/misc/BUILD.gn index 0dc5efc981c8..a6848b3c9f24 100644 --- a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/misc/BUILD.gn +++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/misc/BUILD.gn @@ -46,6 +46,7 @@ static_library("misc") { "NoRecursionCheck.cpp", "NonCopyableObjects.cpp", "NonPrivateMemberVariablesInClassesCheck.cpp", + "OverrideWithDifferentVisibilityCheck.cpp", "RedundantExpressionCheck.cpp", "StaticAssertCheck.cpp", "ThrowByValueCatchByReferenceCheck.cpp", diff --git a/llvm/utils/gn/secondary/clang/lib/Analysis/FlowSensitive/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Analysis/FlowSensitive/BUILD.gn index 0b6fa7cc5f5c..74b2fe204537 100644 --- a/llvm/utils/gn/secondary/clang/lib/Analysis/FlowSensitive/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Analysis/FlowSensitive/BUILD.gn @@ -31,6 +31,7 @@ static_library("FlowSensitive") { "DataflowEnvironment.cpp", "DebugSupport.cpp", "Formula.cpp", + "FormulaSerialization.cpp", "HTMLLogger.cpp", "Logger.cpp", "RecordOps.cpp", diff --git a/llvm/utils/gn/secondary/clang/unittests/Analysis/FlowSensitive/BUILD.gn b/llvm/utils/gn/secondary/clang/unittests/Analysis/FlowSensitive/BUILD.gn index e4727d5a3298..1afd342f67ce 100644 --- 
a/llvm/utils/gn/secondary/clang/unittests/Analysis/FlowSensitive/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/unittests/Analysis/FlowSensitive/BUILD.gn @@ -27,6 +27,7 @@ unittest("ClangAnalysisFlowSensitiveTests") { "DataflowEnvironmentTest.cpp", "DebugSupportTest.cpp", "DeterminismTest.cpp", + "FormulaTest.cpp", "LoggerTest.cpp", "MapLatticeTest.cpp", "MatchSwitchTest.cpp", diff --git a/llvm/utils/gn/secondary/clang/unittests/Lex/BUILD.gn b/llvm/utils/gn/secondary/clang/unittests/Lex/BUILD.gn index 16abe7a6e95e..a0f72494a2bd 100644 --- a/llvm/utils/gn/secondary/clang/unittests/Lex/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/unittests/Lex/BUILD.gn @@ -20,6 +20,7 @@ unittest("LexTests") { "LexHLSLRootSignatureTest.cpp", "LexerTest.cpp", "ModuleDeclStateTest.cpp", + "NoTrivialPPDirectiveTracerTest.cpp", "PPCallbacksTest.cpp", "PPConditionalDirectiveRecordTest.cpp", "PPDependencyDirectivesTest.cpp", diff --git a/mlir/Maintainers.md b/mlir/Maintainers.md index 667443fe3899..02e93eb65827 100644 --- a/mlir/Maintainers.md +++ b/mlir/Maintainers.md @@ -80,6 +80,40 @@ MLIR components pertaining to egress flows from MLIR, in particular to LLVM IR. [@gysit](https://github.com/gysit) (GitHub), gysit (Discourse) +### Dialects + +The `egress` maintainer refers to the people working in the Egress category, +with the point-of-contact being the maintainers above. Named maintainers, if +available, should be contacted first, as they're more active in those areas. 
+ +#### Lowering Dialects +* ‘llvm’ Dialect (egress) +* ‘SPIR-V’ Dialect ([@kuhar](https://github.com/kuhar), [@antiagainst](https://github.com/antiagainst)) +* ‘emitc’ Dialect ([@aniragil](https://github.com/aniragil), [@marbre](https://github.com/marbre)) + +#### GPU Dialects +* ‘gpu’ Dialect ([@fabianmcg](https://github.com/fabianmcg)) +* ‘amdgpu’ Dialect ([@krzysz00](https://github.com/krzysz00)) +* ‘rocdl’ Dialect ([@krzysz00](https://github.com/krzysz00)) +* ‘nvgpu’ Dialect ([@grypp](https://github.com/grypp)) +* ‘nvvm’ Dialect ([@grypp](https://github.com/grypp)) +* ‘xegpu’ Dialect ([@chencha3](https://github.com/chencha3), [@Jianhui-Li](https://github.com/Jianhui-Li)) +* 'xevm' Dialect ([@silee2](https://github.com/silee2)) + +#### CPU Dialects +* ‘arm_neon’ Dialect ([@banach-space](https://github.com/banach-space)) +* ‘arm_sve’ Dialect ([@banach-space](https://github.com/banach-space)) +* ‘ArmSME’ Dialect ([@banach-space](https://github.com/banach-space)) +* ‘amx’ Dialect ([@adam-smnk](https://github.com/adam-smnk)) +* ‘x86vector’ Dialect ([@adam-smnk](https://github.com/adam-smnk)) +* ‘vcix’ Dialect ([@mshockwave](https://github.com/mshockwave)) + +#### Paradigm Dialects +* ‘omp’ Dialect ([@tblah](https://github.com/tblah), [@skatrak](https://github.com/skatrak)) +* ‘acc’ Dialect ([@clementval](https://github.com/clementval), [@razvanlupusoru](https://github.com/razvanlupusoru)) +* ‘mpi’ Dialect ([@fschlimb](https://github.com/fschlimb)) +* ‘shard’ Dialect ([@fschlimb](https://github.com/fschlimb)) + ## Tensor Compiler MLIR components specific to construction of compilers for tensor algebra, in diff --git a/mlir/cmake/modules/AddMLIRPython.cmake b/mlir/cmake/modules/AddMLIRPython.cmake index c14e614ed7d9..2b883558d33c 100644 --- a/mlir/cmake/modules/AddMLIRPython.cmake +++ b/mlir/cmake/modules/AddMLIRPython.cmake @@ -704,7 +704,12 @@ function(add_mlir_python_extension libname extname) # NanobindAdaptors.h uses PyClassMethod_New to build `pure_subclass`es 
but nanobind # doesn't declare this API as undefined in its linker flags. So we need to declare it as such # for downstream users that do not do something like `-undefined dynamic_lookup`. - target_link_options(${libname} PUBLIC "LINKER:-U,_PyClassMethod_New") + # Same for the rest. + target_link_options(${libname} PUBLIC + "LINKER:-U,_PyClassMethod_New" + "LINKER:-U,_PyCode_Addr2Location" + "LINKER:-U,_PyFrame_GetLasti" + ) endif() endif() diff --git a/mlir/docs/Dialects/GPU.md b/mlir/docs/Dialects/GPU.md index 94b053daa161..8d4d2ca3e574 100644 --- a/mlir/docs/Dialects/GPU.md +++ b/mlir/docs/Dialects/GPU.md @@ -193,10 +193,25 @@ llvm.func @foo() { // mlir-translate --mlir-to-llvmir: @binary_bin_cst = internal constant [6 x i8] c"AMDGPU", align 8 @binary_func_kernel_name = private unnamed_addr constant [7 x i8] c"func\00", align 1 +@binary_module = internal global ptr null +@llvm.global_ctors = appending global [1 x {i32, ptr, ptr}] [{i32 123, ptr @binary_load, ptr null}] +@llvm.global_dtors = appending global [1 x {i32, ptr, ptr}] [{i32 123, ptr @binary_unload, ptr null}] +define internal void @binary_load() section ".text.startup" { +entry: + %0 = call ptr @mgpuModuleLoad(ptr @binary_bin_cst) + store ptr %0, ptr @binary_module + ... +} +define internal void @binary_unload() section ".text.startup" { +entry: + %0 = load ptr, ptr @binary_module, align 8 + call void @mgpuModuleUnload(ptr %0) + ... +} ... define void @foo() { ... - %module = call ptr @mgpuModuleLoad(ptr @binary_bin_cst) + %module = load ptr, ptr @binary_module, align 8 %kernel = call ptr @mgpuModuleGetFunction(ptr %module, ptr @binary_func_kernel_name) call void @mgpuLaunchKernel(ptr %kernel, ...) ; Launch the kernel ... 
diff --git a/mlir/docs/Tutorials/Toy/Ch-4.md b/mlir/docs/Tutorials/Toy/Ch-4.md index e9abe36afc4d..621f6a684285 100644 --- a/mlir/docs/Tutorials/Toy/Ch-4.md +++ b/mlir/docs/Tutorials/Toy/Ch-4.md @@ -170,7 +170,7 @@ let arguments = (ins OptionalAttr:$arg_attrs, OptionalAttr:$res_attrs ); - +``` We have already provided the definition in the `extraClassDeclaration` field of the `FuncOp` class: diff --git a/mlir/examples/standalone/CMakeLists.txt b/mlir/examples/standalone/CMakeLists.txt index 038242ba1437..42b487fe2d40 100644 --- a/mlir/examples/standalone/CMakeLists.txt +++ b/mlir/examples/standalone/CMakeLists.txt @@ -8,6 +8,10 @@ set(CMAKE_CXX_STANDARD 17 CACHE STRING "C++ standard to conform to") if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) find_package(MLIR REQUIRED CONFIG) + + # Define the default argument to use by `lit` when testing. + set(LLVM_LIT_ARGS "-sv" CACHE STRING "Default options for lit") + message(STATUS "Using MLIRConfig.cmake in: ${MLIR_DIR}") message(STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}") diff --git a/mlir/include/mlir-c/ExecutionEngine.h b/mlir/include/mlir-c/ExecutionEngine.h index 99cddc5c2598..1a58d68533f2 100644 --- a/mlir/include/mlir-c/ExecutionEngine.h +++ b/mlir/include/mlir-c/ExecutionEngine.h @@ -46,6 +46,13 @@ MLIR_CAPI_EXPORTED MlirExecutionEngine mlirExecutionEngineCreate( MlirModule op, int optLevel, int numPaths, const MlirStringRef *sharedLibPaths, bool enableObjectDump); +/// Initialize the ExecutionEngine. Global constructors specified by +/// `llvm.mlir.global_ctors` will be run. One common scenario is that kernel +/// binary compiled from `gpu.module` gets loaded during initialization. Make +/// sure all symbols are resolvable before initialization by calling +/// `mlirExecutionEngineRegisterSymbol` or including shared libraries. +MLIR_CAPI_EXPORTED void mlirExecutionEngineInitialize(MlirExecutionEngine jit); + /// Destroy an ExecutionEngine instance. 
MLIR_CAPI_EXPORTED void mlirExecutionEngineDestroy(MlirExecutionEngine jit); diff --git a/mlir/include/mlir/Analysis/Presburger/IntegerRelation.h b/mlir/include/mlir/Analysis/Presburger/IntegerRelation.h index 335a2dddc756..e6d2f8dcca7d 100644 --- a/mlir/include/mlir/Analysis/Presburger/IntegerRelation.h +++ b/mlir/include/mlir/Analysis/Presburger/IntegerRelation.h @@ -479,10 +479,12 @@ public: /// respect to a positive constant `divisor`. Two constraints are added to the /// system to capture equivalence with the floordiv: /// q = dividend floordiv c <=> c*q <= dividend <= c*q + c - 1. - void addLocalFloorDiv(ArrayRef dividend, - const DynamicAPInt &divisor); - void addLocalFloorDiv(ArrayRef dividend, int64_t divisor) { - addLocalFloorDiv(getDynamicAPIntVec(dividend), DynamicAPInt(divisor)); + /// Returns the column position of the new local variable. + unsigned addLocalFloorDiv(ArrayRef dividend, + const DynamicAPInt &divisor); + unsigned addLocalFloorDiv(ArrayRef dividend, int64_t divisor) { + return addLocalFloorDiv(getDynamicAPIntVec(dividend), + DynamicAPInt(divisor)); } /// Adds a new local variable as the modulus of an affine function of other diff --git a/mlir/include/mlir/Conversion/LLVMCommon/TypeConverter.h b/mlir/include/mlir/Conversion/LLVMCommon/TypeConverter.h index 38b5e492a8ed..2096bcb9896a 100644 --- a/mlir/include/mlir/Conversion/LLVMCommon/TypeConverter.h +++ b/mlir/include/mlir/Conversion/LLVMCommon/TypeConverter.h @@ -74,8 +74,14 @@ public: /// LLVM-compatible type. In particular, if more than one value is returned, /// create an LLVM dialect structure type with elements that correspond to /// each of the types converted with `convertCallingConventionType`. - Type packFunctionResults(TypeRange types, - bool useBarePointerCallConv = false) const; + /// + /// Populate the converted (unpacked) types into `groupedTypes`, if provided. + /// `groupedTypes` contains one nested vector per input type.
In case of a 1:N + /// conversion, a nested vector may contain 0 or more than 1 converted type. + Type + packFunctionResults(TypeRange types, bool useBarePointerCallConv = false, + SmallVector> *groupedTypes = nullptr, + int64_t *numConvertedTypes = nullptr) const; /// Convert a non-empty list of types of values produced by an operation into /// an LLVM-compatible type. In particular, if more than one value is @@ -88,15 +94,9 @@ public: /// UnrankedMemRefType, are converted following the specific rules for the /// calling convention. Calling convention independent types are converted /// following the default LLVM type conversions. - Type convertCallingConventionType(Type type, - bool useBarePointerCallConv = false) const; - - /// Promote the bare pointers in 'values' that resulted from memrefs to - /// descriptors. 'stdTypes' holds the types of 'values' before the conversion - /// to the LLVM-IR dialect (i.e., MemRefType, or any other builtin type). - void promoteBarePtrsToDescriptors(ConversionPatternRewriter &rewriter, - Location loc, ArrayRef stdTypes, - SmallVectorImpl &values) const; + LogicalResult + convertCallingConventionType(Type type, SmallVectorImpl &result, + bool useBarePointerCallConv = false) const; /// Returns the MLIR context. MLIRContext &getContext() const; @@ -109,9 +109,14 @@ public: /// Promote the LLVM representation of all operands including promoting MemRef /// descriptors to stack and use pointers to struct to avoid the complexity /// of the platform-specific C/C++ ABI lowering related to struct argument - /// passing. + /// passing. (The ArrayRef variant is for 1:N.)
SmallVector promoteOperands(Location loc, ValueRange opOperands, - ValueRange operands, OpBuilder &builder, + ArrayRef adaptorOperands, + OpBuilder &builder, + bool useBarePtrCallConv = false) const; + SmallVector promoteOperands(Location loc, ValueRange opOperands, + ValueRange adaptorOperands, + OpBuilder &builder, bool useBarePtrCallConv = false) const; /// Promote the LLVM struct representation of one MemRef descriptor to stack diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td index be114ea4fb63..c956d69781b3 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td @@ -2115,4 +2115,98 @@ def AllocateDirOp : OpenMP_Op<"allocate_dir", clauses = [ let hasVerifier = 1; } +//===----------------------------------------------------------------------===// +// TargetAllocMemOp +//===----------------------------------------------------------------------===// + +def TargetAllocMemOp : OpenMP_Op<"target_allocmem", + [MemoryEffects<[MemAlloc]>, AttrSizedOperandSegments]> { + let summary = "allocate storage on an openmp device for an object of a given type"; + + let description = [{ + Allocates memory on the specified OpenMP device for an object of the given type. + Returns an integer value representing the device pointer to the allocated memory. + The memory is uninitialized after allocation. Operations must be paired with + `omp.target_freemem` to avoid memory leaks. + + * `$device`: The integer ID of the OpenMP device where the memory will be allocated. + * `$in_type`: The type of the object for which memory is being allocated. + For arrays, this can be a static or dynamic array type. + * `$uniq_name`: An optional unique name for the allocated memory. + * `$bindc_name`: An optional name used for C interoperability. + * `$typeparams`: Runtime type parameters for polymorphic or parameterized types. 
+ These are typically integer values that define aspects of a type not fixed at compile time. + * `$shape`: Runtime shape operands for dynamic arrays. + Each operand is an integer value representing the extent of a specific dimension. + + ```mlir + // Allocate a static 3x3 integer vector on device 0 + %device_0 = arith.constant 0 : i32 + %ptr_static = omp.target_allocmem %device_0 : i32, vector<3x3xi32> + // ... use %ptr_static ... + omp.target_freemem %device_0, %ptr_static : i32, i64 + + // Allocate a dynamic 2D Fortran array (fir.array) on device 1 + %device_1 = arith.constant 1 : i32 + %rows = arith.constant 10 : index + %cols = arith.constant 20 : index + %ptr_dynamic = omp.target_allocmem %device_1 : i32, !fir.array, %rows, %cols : index, index + // ... use %ptr_dynamic ... + omp.target_freemem %device_1, %ptr_dynamic : i32, i64 + ``` + }]; + + let arguments = (ins + Arg:$device, + TypeAttr:$in_type, + OptionalAttr:$uniq_name, + OptionalAttr:$bindc_name, + Variadic:$typeparams, + Variadic:$shape + ); + let results = (outs I64); + + let hasCustomAssemblyFormat = 1; + let hasVerifier = 1; + + let extraClassDeclaration = [{ + mlir::Type getAllocatedType(); + }]; +} + +//===----------------------------------------------------------------------===// +// TargetFreeMemOp +//===----------------------------------------------------------------------===// + +def TargetFreeMemOp : OpenMP_Op<"target_freemem", + [MemoryEffects<[MemFree]>]> { + let summary = "free memory on an openmp device"; + + let description = [{ + Deallocates memory on the specified OpenMP device that was previously + allocated by an `omp.target_allocmem` operation. After this operation, the + deallocated memory is in an undefined state and should not be accessed. + It is crucial to ensure that all accesses to the memory region are completed + before `omp.target_freemem` is called to avoid undefined behavior. + + * `$device`: The integer ID of the OpenMP device from which the memory will be freed. 
+ * `$heapref`: The integer value representing the device pointer to the memory + to be deallocated, which was previously returned by `omp.target_allocmem`. + + ```mlir + // Example of allocating and freeing memory on an OpenMP device + %device_id = arith.constant 0 : i32 + %allocated_ptr = omp.target_allocmem %device_id : i32, vector<3x3xi32> + // ... operations using %allocated_ptr on the device ... + omp.target_freemem %device_id, %allocated_ptr : i32, i64 + ``` + }]; + + let arguments = (ins + Arg:$device, + Arg:$heapref + ); + let assemblyFormat = "$device `,` $heapref attr-dict `:` type($device) `,` qualified(type($heapref))"; +} + #endif // OPENMP_OPS diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td index 1f420c13ebae..a94987885c9e 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td @@ -527,4 +527,34 @@ def XeGPU_RangeAttr : XeGPUAttr<"Range", "range"> { let genVerifyDecl = 1; } +def XeGPU_MemLayoutAttr : XeGPUAttr<"MemLayout", "mem_layout"> { + let summary = [{Specifies memory layouts with named attributes.}]; + + let description = [{ + This attribute stores a collection of named attributes that describe + memory layout properties such as stride, block, etc. 
+ }]; + + let parameters = (ins "DictionaryAttr": $attrs); + let hasCustomAssemblyFormat = 1; + + let extraClassDeclaration = [{ + /// Get a specific attribute by name + Attribute getAttr(StringRef name) const { + return getAttrs().get(name); + } + + /// Check if a specific attribute exists + bool hasAttr(StringRef name) const { + return getAttrs().contains(name); + } + + ArrayAttr getStrides() { + return getAttrs().getAs("stride"); + } + + }]; + +} + #endif // MLIR_DIALECT_XEGPU_IR_XEGPUATTRS_TD diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td index 480b43e74073..abc291c81a76 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td @@ -1097,4 +1097,152 @@ def XeGPU_ConvertLayoutOp: XeGPU_Op<"convert_layout", [Pure, AllTypesMatch<["sou let hasCanonicalizer = 1; } +def isSharedPred : CPred<"isSharedMemory(llvm::cast($_self))">; +class StaticShared1DMemRefOf allowedTypes> : + ConfinedType, [HasStaticShapePred, isSharedPred], + "statically shaped " # MemRefOf.summary # " for shared memory", + "mlir::MemRefType">; + +class SizeInBits : + StrFunc<"llvm::cast($" # name # ".getType()).getNumElements()" + "*llvm::cast($" # name # ".getType()).getElementTypeBitWidth()">; +class AllMemSizesMatch names> : + AllMatchSameOperatorTrait.result, + "size in bits">; + +def XeGPU_CreateMemDescOp: XeGPU_Op<"create_mem_desc", [Pure, + AllMemSizesMatch<["source", "mem_desc"]>]> { + let summary = "Create a memory descriptor."; + let description = [{ + Creates a memory descriptor from a shared local memory (SLM) buffer, and xegpu + specific memory layout. The resulting memory descriptor has to have the same size + as the underlying shared local memory. + + Arguments: + - `source` : a 1D statically shaped memref with element type i8, representing the raw SLM buffer. + Results: + - `mem_desc` : the memory descriptor. 
+ }]; + let arguments = (ins StaticShared1DMemRefOf<[I8]>:$source); + let results = (outs XeGPU_MemDesc:$mem_desc); + let assemblyFormat = "$source prop-dict attr-dict `` `:` type($source) `->` qualified(type($mem_desc))"; +} + +def XeGPU_LoadMatrixOp: XeGPU_Op<"load_matrix", [MemoryEffects<[MemRead]>, + AllElementTypesMatch<["mem_desc", "res"]>, + AllRanksMatch<["mem_desc", "res"]>]> { + let arguments = (ins XeGPU_MemDesc:$mem_desc, + Variadic: $offsets, + DenseI64ArrayAttr: $const_offsets, + OptionalAttr:$layout + ); + let results = (outs XeGPU_ValueType:$res); + let assemblyFormat = [{ + $mem_desc `` custom($offsets, $const_offsets) + prop-dict attr-dict `` `:` type(operands) `->` type(results) + }]; + + let description = [{ + This operation loads a 2D block of data from shared local memory (SLM) as specified + by the provided 2D `mem_desc`. Only 2D memory descriptors are supported; use the + subview operation to obtain a compatible 2D `mem_desc` from a higher-rank descriptor if needed. + + Arguments: + - `mem_desc`: the memory descriptor identifying the SLM region. + - `offsets`: the coordinates within the matrix to read from. + - `layout`: [optional] An attribute for guiding distributions among + subgroups and/or work-items. It currently can accept either + LayoutAttr or SliceAttr. + Results: + - `res`: the matrix elements loaded from SLM. 
+ }]; + + let builders = [ + OpBuilder<(ins "Type":$res, "TypedValue": $mem_desc, + "llvm::ArrayRef": $offsets, "LayoutTrait": $layout)>, + ]; + let extraClassDeclaration = [{ + SmallVector getMixedOffsets() { + return getMixedValues(getConstOffsets(), getOffsets(), getContext()); + } + }]; + + let hasVerifier = 1; +} + +def XeGPU_StoreMatrixOp: XeGPU_Op<"store_matrix", [MemoryEffects<[MemWrite]>, + AllElementTypesMatch<["mem_desc", "data"]>, + AllRanksMatch<["mem_desc", "data"]>]> { + let arguments = (ins + XeGPU_ValueType:$data, + XeGPU_MemDesc:$mem_desc, + Variadic: $offsets, + DenseI64ArrayAttr: $const_offsets, + OptionalAttr:$layout + ); + let assemblyFormat = [{ $data `,` $mem_desc `` custom($offsets, $const_offsets) + prop-dict attr-dict `` `:` type(operands)}]; + let description = [{ + This operation stores a 2D `data` fragment into the shared local memory region + specified by a 2D `mem_desc`. Only 2D memory descriptors are supported; use the + subview operation to obtain a 2D `mem_desc` from a higher-rank descriptor if needed. + + Arguments: + - `mem_desc`: the memory descriptor specifying the SLM region. + - `offsets`: the coordinates within the matrix where the data will be written. + - `data`: the values to be stored in the matrix. + - `layout`: [optional] An attribute for guiding distributions among + subgroups and/or work-items. It currently can accept either + LayoutAttr or SliceAttr. + }]; + let builders = [ + OpBuilder<(ins "Value" : $data, "TypedValue": $mem_desc, + "llvm::ArrayRef": $offsets, "LayoutTrait": $layout)>, + ]; + let extraClassDeclaration = [{ + SmallVector getMixedOffsets() { + return getMixedValues(getConstOffsets(), getOffsets(), getContext()); + } + }]; + + let hasVerifier = 1; +} + +def XeGPU_MemDescSubviewOp: XeGPU_Op<"mem_desc_subview", + [Pure, ViewLikeOpInterface, AllElementTypesMatch<["src", "res"]>]> { + let description = [{ + Creates a subview of a memory descriptor. 
The resulting memory descriptor can have + a lower rank than the source; in this case, the result dimensions correspond to the + higher-order dimensions of the source memory descriptor. + + Arguments: + - `src` : a memory descriptor. + - `offsets` : the coordinates within the matrix the subview will be created from. + + Results: + - `res` : a memory descriptor with smaller size. + + }]; + let arguments = (ins XeGPU_MemDesc:$src, + Variadic:$offsets, + DenseI64ArrayAttr:$const_offsets); + let results = (outs XeGPU_MemDesc:$res); + let assemblyFormat = [{$src `` custom($offsets, $const_offsets) prop-dict + attr-dict `` `:` qualified(type($src)) `->` qualified(type($res))}]; + let builders = [ + OpBuilder<(ins "Type": $res, "Value":$src, "llvm::ArrayRef": $offsets)> + ]; + + let extraClassDeclaration = [{ + mlir::Value getViewSource() { return getSrc(); } + + SmallVector getMixedOffsets() { + return getMixedValues(getConstOffsets(), getOffsets(), getContext()); + } + }]; + + let hasVerifier = 1; +} + + #endif // MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td index b268cabb5d26..f8b371db498e 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td @@ -201,4 +201,53 @@ def XeGPU_Nbarrier: XeGPUTypeDef<"Nbarrier", "nbarrier", [], "mlir::Type"> { }]; } +def XeGPU_MemDesc: XeGPUTypeDef<"MemDesc", "mem_desc", [ShapedTypeInterface], "mlir::Type"> { + let summary = "MemDesc describing the data in SLM"; + let description = [{ + MemDesc represents a block of data stored in shared local memory. + By default, unless a layout attribute is provided, the data is stored + contiguously in row-major order within the region. + + Examples: + ```mlir + // A multi-dimensional array stored in column-major order. + !xegpu.mem_desc<128x128xf16, #xegpu.mem_layout> + + // A multi-dimensional array stored in a blocked layout. 
Elements within the same block + // are stored contiguously in memory. Blocks are stored in row-major order. + !xegpu.mem_desc<128x128xf16, #xegpu.mem_layout> + + // A multi-dimensional array stored in column-major order with blocked layout. + !xegpu.mem_desc<128x128xf16, #xegpu.mem_layout> + ``` + }]; + let parameters = (ins ArrayRefParameter<"int64_t">: $shape, + "mlir::Type": $elementType, + OptionalParameter<"MemLayoutAttr">: $mem_layout); + + let extraClassDeclaration = [{ + bool hasRank() const { return true; } + + MemDescType cloneWith(std::optional> shape, Type elementType) const { + return MemDescType::get(getContext(), shape.value_or(getShape()), elementType, getMemLayout()); + } + + ArrayAttr getStrides() { + auto layout = getMemLayout(); + if (layout && layout.hasAttr("stride")) { + return layout.getStrides(); + } + + // derive and return default strides + SmallVector defaultStrides; + llvm::append_range(defaultStrides, getShape().drop_front()); + llvm::append_values(defaultStrides, 1); + Builder builder(getContext()); + return builder.getI64ArrayAttr(defaultStrides); + } + }]; + + let hasCustomAssemblyFormat = true; +} + #endif // MLIR_DIALECT_XEGPU_IR_XEGPUTYPES_TD diff --git a/mlir/include/mlir/ExecutionEngine/ExecutionEngine.h b/mlir/include/mlir/ExecutionEngine/ExecutionEngine.h index 96ccebcd5685..5bd71d68d253 100644 --- a/mlir/include/mlir/ExecutionEngine/ExecutionEngine.h +++ b/mlir/include/mlir/ExecutionEngine/ExecutionEngine.h @@ -227,6 +227,13 @@ public: llvm::function_ref symbolMap); + /// Initialize the ExecutionEngine. Global constructors specified by + /// `llvm.mlir.global_ctors` will be run. One common scenario is that kernel + /// binary compiled from `gpu.module` gets loaded during initialization. Make + /// sure all symbols are resolvable before initialization by calling + /// `registerSymbols` or including shared libraries. 
+ void initialize(); + private: /// Ordering of llvmContext and jit is important for destruction purposes: the /// jit must be destroyed before the context. @@ -250,6 +257,8 @@ private: /// Destroy functions in the libraries loaded by the ExecutionEngine that are /// called when this ExecutionEngine is destructed. SmallVector destroyFns; + + bool isInitialized = false; }; } // namespace mlir diff --git a/mlir/include/mlir/Transforms/DialectConversion.h b/mlir/include/mlir/Transforms/DialectConversion.h index 220431e6ee2f..536b23f5c33c 100644 --- a/mlir/include/mlir/Transforms/DialectConversion.h +++ b/mlir/include/mlir/Transforms/DialectConversion.h @@ -1300,6 +1300,12 @@ struct ConversionConfig { /// The folding mode to use during conversion. DialectConversionFoldingMode foldingMode = DialectConversionFoldingMode::BeforePatterns; + + /// If set to "true", the materialization kind ("source" or "target") will be + /// attached to "builtin.unrealized_conversion_cast" ops. This flag is useful + /// for debugging, to find out what kind of materialization rule may be + /// missing. + bool attachDebugMaterializationKind = false; }; //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Analysis/FlatLinearValueConstraints.cpp b/mlir/lib/Analysis/FlatLinearValueConstraints.cpp index f4b02b496a5c..30ce1fb32001 100644 --- a/mlir/lib/Analysis/FlatLinearValueConstraints.cpp +++ b/mlir/lib/Analysis/FlatLinearValueConstraints.cpp @@ -60,7 +60,7 @@ private: AffineExpr localExpr) override { SimpleAffineExprFlattener::addLocalFloorDivId(dividend, divisor, localExpr); // Update localVarCst. 
- localVarCst.addLocalFloorDiv(dividend, divisor); + (void)localVarCst.addLocalFloorDiv(dividend, divisor); } LogicalResult addLocalIdSemiAffine(ArrayRef lhs, diff --git a/mlir/lib/Analysis/Presburger/IntegerRelation.cpp b/mlir/lib/Analysis/Presburger/IntegerRelation.cpp index 1d1e4ded19db..0dcdd5bb97bc 100644 --- a/mlir/lib/Analysis/Presburger/IntegerRelation.cpp +++ b/mlir/lib/Analysis/Presburger/IntegerRelation.cpp @@ -1500,12 +1500,13 @@ void IntegerRelation::addBound(BoundType type, ArrayRef expr, /// respect to a positive constant 'divisor'. Two constraints are added to the /// system to capture equivalence with the floordiv. /// q = expr floordiv c <=> c*q <= expr <= c*q + c - 1. -void IntegerRelation::addLocalFloorDiv(ArrayRef dividend, - const DynamicAPInt &divisor) { +/// Returns the column position of the new local variable. +unsigned IntegerRelation::addLocalFloorDiv(ArrayRef dividend, + const DynamicAPInt &divisor) { assert(dividend.size() == getNumCols() && "incorrect dividend size"); assert(divisor > 0 && "positive divisor expected"); - appendVar(VarKind::Local); + unsigned newVar = appendVar(VarKind::Local); SmallVector dividendCopy(dividend); dividendCopy.insert(dividendCopy.end() - 1, DynamicAPInt(0)); @@ -1513,6 +1514,7 @@ void IntegerRelation::addLocalFloorDiv(ArrayRef dividend, getDivLowerBound(dividendCopy, divisor, dividendCopy.size() - 2)); addInequality( getDivUpperBound(dividendCopy, divisor, dividendCopy.size() - 2)); + return newVar; } unsigned IntegerRelation::addLocalModulo(ArrayRef exprs, diff --git a/mlir/lib/Analysis/Presburger/Simplex.cpp b/mlir/lib/Analysis/Presburger/Simplex.cpp index 08290db55f2c..51e2007db45e 100644 --- a/mlir/lib/Analysis/Presburger/Simplex.cpp +++ b/mlir/lib/Analysis/Presburger/Simplex.cpp @@ -433,7 +433,7 @@ LogicalResult SymbolicLexSimplex::addSymbolicCut(unsigned row) { normalizeDiv(divCoeffs, divDenom); domainSimplex.addDivisionVariable(divCoeffs, divDenom); - domainPoly.addLocalFloorDiv(divCoeffs, 
divDenom); + (void)domainPoly.addLocalFloorDiv(divCoeffs, divDenom); // Update `this` to account for the additional symbol we just added. appendSymbol(); diff --git a/mlir/lib/Bindings/Python/ExecutionEngineModule.cpp b/mlir/lib/Bindings/Python/ExecutionEngineModule.cpp index 81dada355362..4885d62c56e6 100644 --- a/mlir/lib/Bindings/Python/ExecutionEngineModule.cpp +++ b/mlir/lib/Bindings/Python/ExecutionEngineModule.cpp @@ -7,8 +7,8 @@ //===----------------------------------------------------------------------===// #include "mlir-c/ExecutionEngine.h" -#include "mlir/Bindings/Python/NanobindAdaptors.h" #include "mlir/Bindings/Python/Nanobind.h" +#include "mlir/Bindings/Python/NanobindAdaptors.h" namespace nb = nanobind; using namespace mlir; @@ -124,6 +124,17 @@ NB_MODULE(_mlirExecutionEngine, m) { }, nb::arg("name"), nb::arg("callback"), "Register `callback` as the runtime symbol `name`.") + .def( + "initialize", + [](PyExecutionEngine &executionEngine) { + mlirExecutionEngineInitialize(executionEngine.get()); + }, + "Initialize the ExecutionEngine. Global constructors specified by " + "`llvm.mlir.global_ctors` will be run. One common scenario is that " + "kernel binary compiled from `gpu.module` gets loaded during " + "initialization. 
Make sure all symbols are resolvable before " + "initialization by calling `register_runtime` or including " + "shared libraries.") .def( "dump_to_object_file", [](PyExecutionEngine &executionEngine, const std::string &fileName) { diff --git a/mlir/lib/Bindings/Python/IRCore.cpp b/mlir/lib/Bindings/Python/IRCore.cpp index 390cdc5429be..4b3a06cbce85 100644 --- a/mlir/lib/Bindings/Python/IRCore.cpp +++ b/mlir/lib/Bindings/Python/IRCore.cpp @@ -2810,7 +2810,7 @@ private: // bpo-42262 added Py_XNewRef() #if !defined(Py_XNewRef) -PyObject *_Py_XNewRef(PyObject *obj) { +[[maybe_unused]] PyObject *_Py_XNewRef(PyObject *obj) { Py_XINCREF(obj); return obj; } @@ -2819,7 +2819,7 @@ PyObject *_Py_XNewRef(PyObject *obj) { // bpo-42262 added Py_NewRef() #if !defined(Py_NewRef) -PyObject *_Py_NewRef(PyObject *obj) { +[[maybe_unused]] PyObject *_Py_NewRef(PyObject *obj) { Py_INCREF(obj); return obj; } diff --git a/mlir/lib/CAPI/ExecutionEngine/ExecutionEngine.cpp b/mlir/lib/CAPI/ExecutionEngine/ExecutionEngine.cpp index 306cebd236be..2dbb993b1640 100644 --- a/mlir/lib/CAPI/ExecutionEngine/ExecutionEngine.cpp +++ b/mlir/lib/CAPI/ExecutionEngine/ExecutionEngine.cpp @@ -68,6 +68,10 @@ mlirExecutionEngineCreate(MlirModule op, int optLevel, int numPaths, return wrap(jitOrError->release()); } +extern "C" void mlirExecutionEngineInitialize(MlirExecutionEngine jit) { + unwrap(jit)->initialize(); +} + extern "C" void mlirExecutionEngineDestroy(MlirExecutionEngine jit) { delete (unwrap(jit)); } @@ -106,9 +110,8 @@ extern "C" void mlirExecutionEngineRegisterSymbol(MlirExecutionEngine jit, void *sym) { unwrap(jit)->registerSymbols([&](llvm::orc::MangleAndInterner interner) { llvm::orc::SymbolMap symbolMap; - symbolMap[interner(unwrap(name))] = - { llvm::orc::ExecutorAddr::fromPtr(sym), - llvm::JITSymbolFlags::Exported }; + symbolMap[interner(unwrap(name))] = {llvm::orc::ExecutorAddr::fromPtr(sym), + llvm::JITSymbolFlags::Exported}; return symbolMap; }); } diff --git 
a/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp b/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp index 18e857c81af8..cb0c82971956 100644 --- a/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp +++ b/mlir/lib/Conversion/ArithToLLVM/ArithToLLVM.cpp @@ -238,6 +238,16 @@ struct CmpFOpLowering : public ConvertOpToLLVMPattern { ConversionPatternRewriter &rewriter) const override; }; +struct SelectOpOneToNLowering : public ConvertOpToLLVMPattern { + using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; + using Adaptor = + typename ConvertOpToLLVMPattern::OneToNOpAdaptor; + + LogicalResult + matchAndRewrite(arith::SelectOp op, Adaptor adaptor, + ConversionPatternRewriter &rewriter) const override; +}; + } // namespace //===----------------------------------------------------------------------===// @@ -479,6 +489,32 @@ CmpFOpLowering::matchAndRewrite(arith::CmpFOp op, OpAdaptor adaptor, rewriter); } +//===----------------------------------------------------------------------===// +// SelectOpOneToNLowering +//===----------------------------------------------------------------------===// + +/// Pattern for arith.select where the true/false values lower to multiple +/// SSA values (1:N conversion). This pattern generates multiple arith.select +/// than can be lowered by the 1:1 arith.select pattern. +LogicalResult SelectOpOneToNLowering::matchAndRewrite( + arith::SelectOp op, Adaptor adaptor, + ConversionPatternRewriter &rewriter) const { + // In case of a 1:1 conversion, the 1:1 pattern will match. 
+ if (llvm::hasSingleElement(adaptor.getTrueValue())) + return rewriter.notifyMatchFailure( + op, "not a 1:N conversion, 1:1 pattern will match"); + if (!op.getCondition().getType().isInteger(1)) + return rewriter.notifyMatchFailure(op, + "non-i1 conditions are not supported"); + SmallVector results; + for (auto [trueValue, falseValue] : + llvm::zip_equal(adaptor.getTrueValue(), adaptor.getFalseValue())) + results.push_back(arith::SelectOp::create( + rewriter, op.getLoc(), op.getCondition(), trueValue, falseValue)); + rewriter.replaceOpWithMultiple(op, {results}); + return success(); +} + //===----------------------------------------------------------------------===// // Pass Definition //===----------------------------------------------------------------------===// @@ -587,6 +623,7 @@ void mlir::arith::populateArithToLLVMConversionPatterns( RemSIOpLowering, RemUIOpLowering, SelectOpLowering, + SelectOpOneToNLowering, ShLIOpLowering, ShRSIOpLowering, ShRUIOpLowering, diff --git a/mlir/lib/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.cpp b/mlir/lib/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.cpp index ff6d36917639..798d8b04eed7 100644 --- a/mlir/lib/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.cpp +++ b/mlir/lib/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.cpp @@ -125,22 +125,33 @@ static FailureOr getConvertedBlock(ConversionPatternRewriter &rewriter, return rewriter.applySignatureConversion(block, *conversion, converter); } +/// Flatten the given value ranges into a single vector of values. +static SmallVector flattenValues(ArrayRef values) { + SmallVector result; + for (const ValueRange &vals : values) + llvm::append_range(result, vals); + return result; +} + /// Convert the destination block signature (if necessary) and lower the branch /// op to llvm.br. 
struct BranchOpLowering : public ConvertOpToLLVMPattern { using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; + using Adaptor = + typename ConvertOpToLLVMPattern::OneToNOpAdaptor; LogicalResult - matchAndRewrite(cf::BranchOp op, typename cf::BranchOp::Adaptor adaptor, + matchAndRewrite(cf::BranchOp op, Adaptor adaptor, ConversionPatternRewriter &rewriter) const override { + SmallVector flattenedAdaptor = flattenValues(adaptor.getOperands()); FailureOr convertedBlock = getConvertedBlock(rewriter, getTypeConverter(), op, op.getSuccessor(), - TypeRange(adaptor.getOperands())); + TypeRange(ValueRange(flattenedAdaptor))); if (failed(convertedBlock)) return failure(); DictionaryAttr attrs = op->getAttrDictionary(); Operation *newOp = rewriter.replaceOpWithNewOp( - op, adaptor.getOperands(), *convertedBlock); + op, flattenedAdaptor, *convertedBlock); // TODO: We should not just forward all attributes like that. But there are // existing Flang tests that depend on this behavior. newOp->setAttrs(attrs); @@ -152,29 +163,37 @@ struct BranchOpLowering : public ConvertOpToLLVMPattern { /// branch op to llvm.cond_br. 
struct CondBranchOpLowering : public ConvertOpToLLVMPattern { using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; + using Adaptor = + typename ConvertOpToLLVMPattern::OneToNOpAdaptor; LogicalResult - matchAndRewrite(cf::CondBranchOp op, - typename cf::CondBranchOp::Adaptor adaptor, + matchAndRewrite(cf::CondBranchOp op, Adaptor adaptor, ConversionPatternRewriter &rewriter) const override { + SmallVector flattenedAdaptorTrue = + flattenValues(adaptor.getTrueDestOperands()); + SmallVector flattenedAdaptorFalse = + flattenValues(adaptor.getFalseDestOperands()); + if (!llvm::hasSingleElement(adaptor.getCondition())) + return rewriter.notifyMatchFailure(op, + "expected single element condition"); FailureOr convertedTrueBlock = getConvertedBlock(rewriter, getTypeConverter(), op, op.getTrueDest(), - TypeRange(adaptor.getTrueDestOperands())); + TypeRange(ValueRange(flattenedAdaptorTrue))); if (failed(convertedTrueBlock)) return failure(); FailureOr convertedFalseBlock = getConvertedBlock(rewriter, getTypeConverter(), op, op.getFalseDest(), - TypeRange(adaptor.getFalseDestOperands())); + TypeRange(ValueRange(flattenedAdaptorFalse))); if (failed(convertedFalseBlock)) return failure(); - DictionaryAttr attrs = op->getAttrDictionary(); + DictionaryAttr attrs = op->getDiscardableAttrDictionary(); auto newOp = rewriter.replaceOpWithNewOp( - op, adaptor.getCondition(), adaptor.getTrueDestOperands(), - adaptor.getFalseDestOperands(), op.getBranchWeightsAttr(), + op, llvm::getSingleElement(adaptor.getCondition()), + flattenedAdaptorTrue, flattenedAdaptorFalse, op.getBranchWeightsAttr(), *convertedTrueBlock, *convertedFalseBlock); // TODO: We should not just forward all attributes like that. But there are // existing Flang tests that depend on this behavior. 
- newOp->setAttrs(attrs); + newOp->setDiscardableAttrs(attrs); return success(); } }; diff --git a/mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp b/mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp index a4a6ae250640..42c76ed475b4 100644 --- a/mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp +++ b/mlir/lib/Conversion/FuncToLLVM/FuncToLLVM.cpp @@ -527,19 +527,21 @@ struct CallOpInterfaceLowering : public ConvertOpToLLVMPattern { using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; using Super = CallOpInterfaceLowering; using Base = ConvertOpToLLVMPattern; + using Adaptor = typename ConvertOpToLLVMPattern::OneToNOpAdaptor; - LogicalResult matchAndRewriteImpl(CallOpType callOp, - typename CallOpType::Adaptor adaptor, + LogicalResult matchAndRewriteImpl(CallOpType callOp, Adaptor adaptor, ConversionPatternRewriter &rewriter, bool useBarePtrCallConv = false) const { // Pack the result types into a struct. Type packedResult = nullptr; + SmallVector> groupedResultTypes; unsigned numResults = callOp.getNumResults(); auto resultTypes = llvm::to_vector<4>(callOp.getResultTypes()); - + int64_t numConvertedTypes = 0; if (numResults != 0) { if (!(packedResult = this->getTypeConverter()->packFunctionResults( - resultTypes, useBarePtrCallConv))) + resultTypes, useBarePtrCallConv, &groupedResultTypes, + &numConvertedTypes))) return failure(); } @@ -565,34 +567,64 @@ struct CallOpInterfaceLowering : public ConvertOpToLLVMPattern { static_cast(promoted.size()), 0}; newOp.getProperties().op_bundle_sizes = rewriter.getDenseI32ArrayAttr({}); - SmallVector results; - if (numResults < 2) { - // If < 2 results, packing did not do anything and we can just return. - results.append(newOp.result_begin(), newOp.result_end()); - } else { - // Otherwise, it had been converted to an operation producing a structure. - // Extract individual results from the structure and return them as list. 
- results.reserve(numResults); - for (unsigned i = 0; i < numResults; ++i) { - results.push_back(LLVM::ExtractValueOp::create( - rewriter, callOp.getLoc(), newOp->getResult(0), i)); + // Helper function that extracts an individual result from the return value + // of the new call op. llvm.call ops support only 0 or 1 result. In case of + // 2 or more results, the results are packed into a structure. + // + // The new call op may have more than 2 results because: + // a. The original call op has more than 2 results. + // b. An original op result type-converted to more than 1 result. + auto getUnpackedResult = [&](unsigned i) -> Value { + assert(numConvertedTypes > 0 && "convert op has no results"); + if (numConvertedTypes == 1) { + assert(i == 0 && "out of bounds: converted op has only one result"); + return newOp->getResult(0); + } + // Results have been converted to a structure. Extract individual results + // from the structure. + return LLVM::ExtractValueOp::create(rewriter, callOp.getLoc(), + newOp->getResult(0), i); + }; + + // Group the results into a vector of vectors, such that it is clear which + // original op result is replaced with which range of values. (In case of a + // 1:N conversion, there can be multiple replacements for a single result.) + SmallVector> results; + results.reserve(numResults); + unsigned counter = 0; + for (unsigned i = 0; i < numResults; ++i) { + SmallVector &group = results.emplace_back(); + for (unsigned j = 0, e = groupedResultTypes[i].size(); j < e; ++j) + group.push_back(getUnpackedResult(counter++)); + } + + // Special handling for MemRef types. + for (unsigned i = 0; i < numResults; ++i) { + Type origType = resultTypes[i]; + auto memrefType = dyn_cast(origType); + auto unrankedMemrefType = dyn_cast(origType); + if (useBarePtrCallConv && memrefType) { + // For the bare-ptr calling convention, promote memref results to + // descriptors. 
+ assert(results[i].size() == 1 && "expected one converted result"); + results[i].front() = MemRefDescriptor::fromStaticShape( + rewriter, callOp.getLoc(), *this->getTypeConverter(), memrefType, + results[i].front()); + } + if (unrankedMemrefType) { + assert(!useBarePtrCallConv && "unranked memref is not supported in the " + "bare-ptr calling convention"); + assert(results[i].size() == 1 && "expected one converted result"); + Value desc = this->copyUnrankedDescriptor( + rewriter, callOp.getLoc(), unrankedMemrefType, results[i].front(), + /*toDynamic=*/false); + if (!desc) + return failure(); + results[i].front() = desc; } } - if (useBarePtrCallConv) { - // For the bare-ptr calling convention, promote memref results to - // descriptors. - assert(results.size() == resultTypes.size() && - "The number of arguments and types doesn't match"); - this->getTypeConverter()->promoteBarePtrsToDescriptors( - rewriter, callOp.getLoc(), resultTypes, results); - } else if (failed(this->copyUnrankedDescriptors(rewriter, callOp.getLoc(), - resultTypes, results, - /*toDynamic=*/false))) { - return failure(); - } - - rewriter.replaceOp(callOp, results); + rewriter.replaceOpWithMultiple(callOp, results); return success(); } }; @@ -606,7 +638,7 @@ public: symbolTables(symbolTables) {} LogicalResult - matchAndRewrite(func::CallOp callOp, OpAdaptor adaptor, + matchAndRewrite(func::CallOp callOp, OneToNOpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { bool useBarePtrCallConv = false; if (getTypeConverter()->getOptions().useBarePtrCallConv) { @@ -636,7 +668,7 @@ struct CallIndirectOpLowering using Super::Super; LogicalResult - matchAndRewrite(func::CallIndirectOp callIndirectOp, OpAdaptor adaptor, + matchAndRewrite(func::CallIndirectOp callIndirectOp, OneToNOpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { return matchAndRewriteImpl(callIndirectOp, adaptor, rewriter); } @@ -679,47 +711,50 @@ struct ReturnOpLowering : public 
ConvertOpToLLVMPattern { using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; LogicalResult - matchAndRewrite(func::ReturnOp op, OpAdaptor adaptor, + matchAndRewrite(func::ReturnOp op, OneToNOpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { Location loc = op.getLoc(); - unsigned numArguments = op.getNumOperands(); SmallVector updatedOperands; auto funcOp = op->getParentOfType(); bool useBarePtrCallConv = shouldUseBarePtrCallConv(funcOp, this->getTypeConverter()); - for (auto [oldOperand, newOperand] : + for (auto [oldOperand, newOperands] : llvm::zip_equal(op->getOperands(), adaptor.getOperands())) { Type oldTy = oldOperand.getType(); if (auto memRefType = dyn_cast(oldTy)) { + assert(newOperands.size() == 1 && "expected one converted result"); if (useBarePtrCallConv && getTypeConverter()->canConvertToBarePtr(memRefType)) { // For the bare-ptr calling convention, extract the aligned pointer to // be returned from the memref descriptor. - MemRefDescriptor memrefDesc(newOperand); + MemRefDescriptor memrefDesc(newOperands.front()); updatedOperands.push_back(memrefDesc.allocatedPtr(rewriter, loc)); continue; } } else if (auto unrankedMemRefType = dyn_cast(oldTy)) { + assert(newOperands.size() == 1 && "expected one converted result"); if (useBarePtrCallConv) { // Unranked memref is not supported in the bare pointer calling // convention. return failure(); } - Value updatedDesc = copyUnrankedDescriptor( - rewriter, loc, unrankedMemRefType, newOperand, /*toDynamic=*/true); + Value updatedDesc = + copyUnrankedDescriptor(rewriter, loc, unrankedMemRefType, + newOperands.front(), /*toDynamic=*/true); if (!updatedDesc) return failure(); updatedOperands.push_back(updatedDesc); continue; } - updatedOperands.push_back(newOperand); + + llvm::append_range(updatedOperands, newOperands); } // If ReturnOp has 0 or 1 operand, create it and return immediately. 
- if (numArguments <= 1) { + if (updatedOperands.size() <= 1) { rewriter.replaceOpWithNewOp( op, TypeRange(), updatedOperands, op->getAttrs()); return success(); diff --git a/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp b/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp index 1a9bf569086d..cb9dea108cc4 100644 --- a/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp +++ b/mlir/lib/Conversion/LLVMCommon/TypeConverter.cpp @@ -365,6 +365,7 @@ Type LLVMTypeConverter::convertFunctionSignatureImpl( useBarePtrCallConv = useBarePtrCallConv || options.useBarePtrCallConv; auto funcArgConverter = useBarePtrCallConv ? barePtrFuncArgTypeConverter : structFuncArgTypeConverter; + // Convert argument types one by one and check for errors. for (auto [idx, type] : llvm::enumerate(funcTy.getInputs())) { SmallVector converted; @@ -658,27 +659,19 @@ FailureOr LLVMTypeConverter::convertVectorType(VectorType type) const { /// UnrankedMemRefType, are converted following the specific rules for the /// calling convention. Calling convention independent types are converted /// following the default LLVM type conversions. -Type LLVMTypeConverter::convertCallingConventionType( - Type type, bool useBarePtrCallConv) const { - if (useBarePtrCallConv) - if (auto memrefTy = dyn_cast(type)) - return convertMemRefToBarePtr(memrefTy); +LogicalResult LLVMTypeConverter::convertCallingConventionType( + Type type, SmallVectorImpl &result, bool useBarePtrCallConv) const { + if (useBarePtrCallConv) { + if (auto memrefTy = dyn_cast(type)) { + Type converted = convertMemRefToBarePtr(memrefTy); + if (!converted) + return failure(); + result.push_back(converted); + return success(); + } + } - return convertType(type); -} - -/// Promote the bare pointers in 'values' that resulted from memrefs to -/// descriptors. 'stdTypes' holds they types of 'values' before the conversion -/// to the LLVM-IR dialect (i.e., MemRefType, or any other builtin type). 
-void LLVMTypeConverter::promoteBarePtrsToDescriptors( - ConversionPatternRewriter &rewriter, Location loc, ArrayRef stdTypes, - SmallVectorImpl &values) const { - assert(stdTypes.size() == values.size() && - "The number of types and values doesn't match"); - for (unsigned i = 0, end = values.size(); i < end; ++i) - if (auto memrefTy = dyn_cast(stdTypes[i])) - values[i] = MemRefDescriptor::fromStaticShape(rewriter, loc, *this, - memrefTy, values[i]); + return convertType(type, result); } /// Convert a non-empty list of types of values produced by an operation into an @@ -706,23 +699,35 @@ Type LLVMTypeConverter::packOperationResults(TypeRange types) const { /// LLVM-compatible type. In particular, if more than one value is returned, /// create an LLVM dialect structure type with elements that correspond to each /// of the types converted with `convertCallingConventionType`. -Type LLVMTypeConverter::packFunctionResults(TypeRange types, - bool useBarePtrCallConv) const { +Type LLVMTypeConverter::packFunctionResults( + TypeRange types, bool useBarePtrCallConv, + SmallVector> *groupedTypes, + int64_t *numConvertedTypes) const { assert(!types.empty() && "expected non-empty list of type"); + assert((!groupedTypes || groupedTypes->empty()) && + "expected groupedTypes to be empty"); useBarePtrCallConv |= options.useBarePtrCallConv; - if (types.size() == 1) - return convertCallingConventionType(types.front(), useBarePtrCallConv); - SmallVector resultTypes; resultTypes.reserve(types.size()); + size_t sizeBefore = 0; for (auto t : types) { - auto converted = convertCallingConventionType(t, useBarePtrCallConv); - if (!converted || !LLVM::isCompatibleType(converted)) + if (failed( + convertCallingConventionType(t, resultTypes, useBarePtrCallConv))) return {}; - resultTypes.push_back(converted); + if (groupedTypes) { + SmallVector &group = groupedTypes->emplace_back(); + llvm::append_range(group, ArrayRef(resultTypes).drop_front(sizeBefore)); + } + sizeBefore = 
resultTypes.size(); } + if (numConvertedTypes) + *numConvertedTypes = resultTypes.size(); + if (resultTypes.size() == 1) + return resultTypes.front(); + if (resultTypes.empty()) + return {}; return LLVM::LLVMStructType::getLiteral(&getContext(), resultTypes); } @@ -740,40 +745,50 @@ Value LLVMTypeConverter::promoteOneMemRefDescriptor(Location loc, Value operand, return allocated; } -SmallVector -LLVMTypeConverter::promoteOperands(Location loc, ValueRange opOperands, - ValueRange operands, OpBuilder &builder, - bool useBarePtrCallConv) const { - SmallVector promotedOperands; - promotedOperands.reserve(operands.size()); - useBarePtrCallConv |= options.useBarePtrCallConv; - for (auto it : llvm::zip(opOperands, operands)) { - auto operand = std::get<0>(it); - auto llvmOperand = std::get<1>(it); +SmallVector LLVMTypeConverter::promoteOperands( + Location loc, ValueRange opOperands, ValueRange adaptorOperands, + OpBuilder &builder, bool useBarePtrCallConv) const { + SmallVector ranges; + for (size_t i = 0, e = adaptorOperands.size(); i < e; i++) + ranges.push_back(adaptorOperands.slice(i, 1)); + return promoteOperands(loc, opOperands, ranges, builder, useBarePtrCallConv); +} +SmallVector LLVMTypeConverter::promoteOperands( + Location loc, ValueRange opOperands, ArrayRef adaptorOperands, + OpBuilder &builder, bool useBarePtrCallConv) const { + SmallVector promotedOperands; + promotedOperands.reserve(adaptorOperands.size()); + useBarePtrCallConv |= options.useBarePtrCallConv; + for (auto [operand, llvmOperand] : + llvm::zip_equal(opOperands, adaptorOperands)) { if (useBarePtrCallConv) { // For the bare-ptr calling convention, we only have to extract the // aligned pointer of a memref. 
if (isa(operand.getType())) { - MemRefDescriptor desc(llvmOperand); - llvmOperand = desc.alignedPtr(builder, loc); + assert(llvmOperand.size() == 1 && "Expected a single operand"); + MemRefDescriptor desc(llvmOperand.front()); + promotedOperands.push_back(desc.alignedPtr(builder, loc)); + continue; } else if (isa(operand.getType())) { llvm_unreachable("Unranked memrefs are not supported"); } } else { if (isa(operand.getType())) { - UnrankedMemRefDescriptor::unpack(builder, loc, llvmOperand, + assert(llvmOperand.size() == 1 && "Expected a single operand"); + UnrankedMemRefDescriptor::unpack(builder, loc, llvmOperand.front(), promotedOperands); continue; } if (auto memrefType = dyn_cast(operand.getType())) { - MemRefDescriptor::unpack(builder, loc, llvmOperand, memrefType, + assert(llvmOperand.size() == 1 && "Expected a single operand"); + MemRefDescriptor::unpack(builder, loc, llvmOperand.front(), memrefType, promotedOperands); continue; } } - promotedOperands.push_back(llvmOperand); + llvm::append_range(promotedOperands, llvmOperand); } return promotedOperands; } @@ -802,11 +817,7 @@ mlir::structFuncArgTypeConverter(const LLVMTypeConverter &converter, Type type, result.append(converted.begin(), converted.end()); return success(); } - auto converted = converter.convertType(type); - if (!converted) - return failure(); - result.push_back(converted); - return success(); + return converter.convertType(type, result); } /// Callback to convert function argument types. 
It converts MemRef function @@ -814,11 +825,7 @@ mlir::structFuncArgTypeConverter(const LLVMTypeConverter &converter, Type type, LogicalResult mlir::barePtrFuncArgTypeConverter(const LLVMTypeConverter &converter, Type type, SmallVectorImpl &result) { - auto llvmTy = converter.convertCallingConventionType( - type, /*useBarePointerCallConv=*/true); - if (!llvmTy) - return failure(); - - result.push_back(llvmTy); - return success(); + return converter.convertCallingConventionType( + type, result, + /*useBarePointerCallConv=*/true); } diff --git a/mlir/lib/Conversion/MemRefToEmitC/MemRefToEmitC.cpp b/mlir/lib/Conversion/MemRefToEmitC/MemRefToEmitC.cpp index a1f38c95935a..2b7bdc9a7b7f 100644 --- a/mlir/lib/Conversion/MemRefToEmitC/MemRefToEmitC.cpp +++ b/mlir/lib/Conversion/MemRefToEmitC/MemRefToEmitC.cpp @@ -156,19 +156,21 @@ struct ConvertAlloc final : public OpConversionPattern { Type sizeTType = emitc::SizeTType::get(rewriter.getContext()); Type elementType = memrefType.getElementType(); IndexType indexType = rewriter.getIndexType(); - emitc::CallOpaqueOp sizeofElementOp = rewriter.create( - loc, sizeTType, rewriter.getStringAttr("sizeof"), ValueRange{}, + emitc::CallOpaqueOp sizeofElementOp = emitc::CallOpaqueOp::create( + rewriter, loc, sizeTType, rewriter.getStringAttr("sizeof"), + ValueRange{}, ArrayAttr::get(rewriter.getContext(), {TypeAttr::get(elementType)})); int64_t numElements = 1; for (int64_t dimSize : memrefType.getShape()) { numElements *= dimSize; } - Value numElementsValue = rewriter.create( - loc, indexType, rewriter.getIndexAttr(numElements)); + Value numElementsValue = emitc::ConstantOp::create( + rewriter, loc, indexType, rewriter.getIndexAttr(numElements)); - Value totalSizeBytes = rewriter.create( - loc, sizeTType, sizeofElementOp.getResult(0), numElementsValue); + Value totalSizeBytes = + emitc::MulOp::create(rewriter, loc, sizeTType, + sizeofElementOp.getResult(0), numElementsValue); emitc::CallOpaqueOp allocCall; StringAttr 
allocFunctionName; @@ -176,8 +178,8 @@ struct ConvertAlloc final : public OpConversionPattern { SmallVector argsVec; if (allocOp.getAlignment()) { allocFunctionName = rewriter.getStringAttr(alignedAllocFunctionName); - alignmentValue = rewriter.create( - loc, sizeTType, + alignmentValue = emitc::ConstantOp::create( + rewriter, loc, sizeTType, rewriter.getIntegerAttr(indexType, allocOp.getAlignment().value_or(0))); argsVec.push_back(alignmentValue); @@ -188,15 +190,15 @@ struct ConvertAlloc final : public OpConversionPattern { argsVec.push_back(totalSizeBytes); ValueRange args(argsVec); - allocCall = rewriter.create( - loc, + allocCall = emitc::CallOpaqueOp::create( + rewriter, loc, emitc::PointerType::get( emitc::OpaqueType::get(rewriter.getContext(), "void")), allocFunctionName, args); emitc::PointerType targetPointerType = emitc::PointerType::get(elementType); - emitc::CastOp castOp = rewriter.create( - loc, targetPointerType, allocCall.getResult(0)); + emitc::CastOp castOp = emitc::CastOp::create( + rewriter, loc, targetPointerType, allocCall.getResult(0)); rewriter.replaceOp(allocOp, castOp); return success(); diff --git a/mlir/lib/Conversion/MemRefToEmitC/MemRefToEmitCPass.cpp b/mlir/lib/Conversion/MemRefToEmitC/MemRefToEmitCPass.cpp index a51890248271..a073a9acf752 100644 --- a/mlir/lib/Conversion/MemRefToEmitC/MemRefToEmitCPass.cpp +++ b/mlir/lib/Conversion/MemRefToEmitC/MemRefToEmitCPass.cpp @@ -33,8 +33,8 @@ namespace { emitc::IncludeOp addStandardHeader(OpBuilder &builder, ModuleOp module, StringRef headerName) { StringAttr includeAttr = builder.getStringAttr(headerName); - return builder.create( - module.getLoc(), includeAttr, + return emitc::IncludeOp::create( + builder, module.getLoc(), includeAttr, /*is_standard_include=*/builder.getUnitAttr()); } diff --git a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp index f7f538179952..c6c5ab356f25 100644 --- a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp 
+++ b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp @@ -1106,12 +1106,10 @@ struct NVGPUGenerateWarpgroupDescriptorLowering // // [0,14) start_address dsc = insertBit(dsc, basePtr14bit, startBaseAddrBit); - LDBG() << "Generating warpgroup.descriptor: " - << "leading_off:" << leadDimVal << "\t" - << "stride_off :" << strideDimVal << "\t" - << "base_offset:" << offsetVal << "\t" - << "layout_type:" << swizzle << " (" - << nvgpu::stringifyTensorMapSwizzleKind(swizzleKind) + LDBG() << "Generating warpgroup.descriptor: " << "leading_off:" + << leadDimVal << "\t" << "stride_off :" << strideDimVal << "\t" + << "base_offset:" << offsetVal << "\t" << "layout_type:" << swizzle + << " (" << nvgpu::stringifyTensorMapSwizzleKind(swizzleKind) << ")\n start_addr : " << baseAddr; rewriter.replaceOp(op, dsc); @@ -1401,14 +1399,12 @@ struct NVGPUWarpgroupMmaOpLowering /// This function generates a WgmmaMmaAsyncOp using provided GMMA matrix /// descriptors and arranges them based on induction variables: i, j, and k. Value generateWgmma(int i, int j, int k, Value matrixC) { - LDBG() << "\t wgmma." - << "m" << wgmmaM << "n" << wgmmaN << "k" << wgmmaK << "(A[" - << (iterationM * wgmmaM) << ":" << (iterationM * wgmmaM) + wgmmaM - << "][" << (iterationK * wgmmaK) << ":" - << (iterationK * wgmmaK + wgmmaK) << "] * " - << " B[" << (iterationK * wgmmaK) << ":" - << (iterationK * wgmmaK + wgmmaK) << "][" << 0 << ":" << wgmmaN - << "])"; + LDBG() << "\t wgmma." 
<< "m" << wgmmaM << "n" << wgmmaN << "k" << wgmmaK + << "(A[" << (iterationM * wgmmaM) << ":" + << (iterationM * wgmmaM) + wgmmaM << "][" << (iterationK * wgmmaK) + << ":" << (iterationK * wgmmaK + wgmmaK) << "] * " << " B[" + << (iterationK * wgmmaK) << ":" << (iterationK * wgmmaK + wgmmaK) + << "][" << 0 << ":" << wgmmaN << "])"; Value descriptorA = iterateDescriptorA(adaptor.getDescriptorA(), i, j, k); Value descriptorB = iterateDescriptorB(adaptor.getDescriptorB(), i, j, k); diff --git a/mlir/lib/Dialect/Affine/Analysis/AffineStructures.cpp b/mlir/lib/Dialect/Affine/Analysis/AffineStructures.cpp index 86edc2bcc276..b405ec2201bf 100644 --- a/mlir/lib/Dialect/Affine/Analysis/AffineStructures.cpp +++ b/mlir/lib/Dialect/Affine/Analysis/AffineStructures.cpp @@ -93,13 +93,13 @@ FlatAffineValueConstraints::addAffineForOpDomain(AffineForOp forOp) { int64_t lb = forOp.getConstantLowerBound(); dividend[pos] = 1; dividend.back() -= lb; - addLocalFloorDiv(dividend, step); + unsigned qPos = addLocalFloorDiv(dividend, step); // Second constraint: (iv - lb) - step * q = 0. SmallVector eq(getNumCols(), 0); eq[pos] = 1; eq.back() -= lb; // For the local var just added above. 
- eq[getNumCols() - 2] = -step; + eq[qPos] = -step; addEquality(eq); } } diff --git a/mlir/lib/Dialect/EmitC/Transforms/WrapFuncInClass.cpp b/mlir/lib/Dialect/EmitC/Transforms/WrapFuncInClass.cpp index c55e26e722f3..06d7e07005f8 100644 --- a/mlir/lib/Dialect/EmitC/Transforms/WrapFuncInClass.cpp +++ b/mlir/lib/Dialect/EmitC/Transforms/WrapFuncInClass.cpp @@ -64,8 +64,8 @@ public: TypeAttr typeAttr = TypeAttr::get(val.getType()); fields.push_back({fieldName, typeAttr}); - FieldOp fieldop = rewriter.create( - funcOp->getLoc(), fieldName, typeAttr, nullptr); + FieldOp fieldop = emitc::FieldOp::create(rewriter, funcOp->getLoc(), + fieldName, typeAttr, nullptr); if (argAttrs && idx < argAttrs->size()) { fieldop->setDiscardableAttrs(funcOp.getArgAttrDict(idx)); diff --git a/mlir/lib/Dialect/LLVMIR/IR/BasicPtxBuilderInterface.cpp b/mlir/lib/Dialect/LLVMIR/IR/BasicPtxBuilderInterface.cpp index 894de4408c37..e004d5f64733 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/BasicPtxBuilderInterface.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/BasicPtxBuilderInterface.cpp @@ -107,11 +107,32 @@ void PtxBuilder::insertValue(Value v, PTXRegisterMod itype) { ss << getModifier() << getRegisterType(v) << ","; } +/// Check if the operation needs to pack and unpack results. +static bool needsPackUnpack(BasicPtxBuilderInterface interfaceOp) { + return interfaceOp->getNumResults() > 1; +} + +/// Pack the result types of the interface operation. +/// If the operation has multiple results, it packs them into a struct +/// type. Otherwise, it returns the original result types. 
+static SmallVector packResultTypes(MLIRContext *ctx, + BasicPtxBuilderInterface interfaceOp) { + TypeRange results = interfaceOp->getResultTypes(); + + if (!needsPackUnpack(interfaceOp)) + return llvm::to_vector<1>(results); + + SmallVector elems(results.begin(), results.end()); + auto sTy = LLVM::LLVMStructType::getLiteral(ctx, elems, /*isPacked=*/false); + return {sTy}; +} + LLVM::InlineAsmOp PtxBuilder::build() { + MLIRContext *ctx = interfaceOp->getContext(); auto asmDialectAttr = LLVM::AsmDialectAttr::get(interfaceOp->getContext(), LLVM::AsmDialect::AD_ATT); - auto resultTypes = interfaceOp->getResultTypes(); + SmallVector resultTypes = packResultTypes(ctx, interfaceOp); // Remove the last comma from the constraints string. if (!registerConstraints.empty() && @@ -136,7 +157,7 @@ LLVM::InlineAsmOp PtxBuilder::build() { rewriter, interfaceOp->getLoc(), /*result types=*/resultTypes, /*operands=*/ptxOperands, - /*asm_string=*/llvm::StringRef(ptxInstruction), + /*asm_string=*/ptxInstruction, /*constraints=*/registerConstraints.data(), /*has_side_effects=*/interfaceOp.hasSideEffect(), /*is_align_stack=*/false, LLVM::TailCallKind::None, @@ -147,9 +168,34 @@ LLVM::InlineAsmOp PtxBuilder::build() { void PtxBuilder::buildAndReplaceOp() { LLVM::InlineAsmOp inlineAsmOp = build(); LLVM_DEBUG(DBGS() << "\n Generated PTX \n\t" << inlineAsmOp << "\n"); - if (inlineAsmOp->getNumResults() == interfaceOp->getNumResults()) { - rewriter.replaceOp(interfaceOp, inlineAsmOp); - } else { + + // Case 1: no result + if (inlineAsmOp->getNumResults() == 0) { rewriter.eraseOp(interfaceOp); + return; } + + // Case 2: single result, forward it directly + if (!needsPackUnpack(interfaceOp)) { + rewriter.replaceOp(interfaceOp, inlineAsmOp->getResults()); + return; + } + + // Case 3: multiple results were packed; unpack the struct. 
+ assert(mlir::LLVM::LLVMStructType::classof( + inlineAsmOp.getResultTypes().front()) && + "Expected result type to be LLVMStructType when unpacking multiple " + "results"); + auto structTy = llvm::cast( + inlineAsmOp.getResultTypes().front()); + + SmallVector unpacked; + Value structVal = inlineAsmOp.getResult(0); + for (auto [idx, elemTy] : llvm::enumerate(structTy.getBody())) { + Value unpackedValue = LLVM::ExtractValueOp::create( + rewriter, interfaceOp->getLoc(), structVal, idx); + unpacked.push_back(unpackedValue); + } + + rewriter.replaceOp(interfaceOp, unpacked); } diff --git a/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp b/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp index d56506969662..22690daa4f9e 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp @@ -691,9 +691,9 @@ struct DropPadUnitDims : public OpRewritePattern { auto newResultType = RankedTensorType::get( newResultShape, padOp.getResultType().getElementType()); - auto newPadOp = rewriter.create( - padOp.getLoc(), /*result=*/newResultType, collapsedSource, newLowPad, - newHighPad, paddingVal, padOp.getNofold()); + auto newPadOp = tensor::PadOp::create( + rewriter, padOp.getLoc(), /*result=*/newResultType, collapsedSource, + newLowPad, newHighPad, paddingVal, padOp.getNofold()); Value dest = padOp.getResult(); if (options.rankReductionStrategy == diff --git a/mlir/lib/Dialect/Linalg/Transforms/TransposeMatmul.cpp b/mlir/lib/Dialect/Linalg/Transforms/TransposeMatmul.cpp index 9ec4af6d4581..2650488c1799 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/TransposeMatmul.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/TransposeMatmul.cpp @@ -52,11 +52,11 @@ FailureOr mlir::linalg::transposeMatmul(RewriterBase &rewriter, dynamicDims.push_back(tensor::DimOp::create(rewriter, loc, input, 0)); ArrayRef shape = type.getShape(); - Value empty = rewriter.create( - loc, ArrayRef{shape[1], shape[0]}, type.getElementType(), - 
dynamicDims); - auto transposeOp = rewriter.create( - loc, input, empty, ArrayRef{1, 0}); + Value empty = tensor::EmptyOp::create(rewriter, loc, + ArrayRef{shape[1], shape[0]}, + type.getElementType(), dynamicDims); + auto transposeOp = linalg::TransposeOp::create(rewriter, loc, input, empty, + ArrayRef{1, 0}); Operation *newMatmulOp; if (transposeLHS) { newMatmulOp = MatmulTransposeAOp::create( @@ -112,8 +112,8 @@ mlir::linalg::transposeBatchMatmul(RewriterBase &rewriter, Value empty = tensor::EmptyOp::create( rewriter, loc, ArrayRef{shape[0], shape[2], shape[1]}, type.getElementType(), dynamicDims); - auto transposeOp = rewriter.create( - loc, input, empty, ArrayRef{0, 2, 1}); + auto transposeOp = linalg::TransposeOp::create(rewriter, loc, input, empty, + ArrayRef{0, 2, 1}); Operation *newMatmulOp; if (transposeLHS) { newMatmulOp = BatchMatmulTransposeAOp::create( diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp index c1c1767ef90b..fa94219016c1 100644 --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -3874,6 +3874,107 @@ LogicalResult AllocateDirOp::verify() { return success(); } +//===----------------------------------------------------------------------===// +// TargetAllocMemOp +//===----------------------------------------------------------------------===// + +mlir::Type omp::TargetAllocMemOp::getAllocatedType() { + return getInTypeAttr().getValue(); +} + +/// operation ::= %res = (`omp.target_alloc_mem`) $device : devicetype, +/// $in_type ( `(` $typeparams `)` )? ( `,` $shape )? 
+/// attr-dict-without-keyword +static mlir::ParseResult parseTargetAllocMemOp(mlir::OpAsmParser &parser, + mlir::OperationState &result) { + auto &builder = parser.getBuilder(); + bool hasOperands = false; + std::int32_t typeparamsSize = 0; + + // Parse device number as a new operand + mlir::OpAsmParser::UnresolvedOperand deviceOperand; + mlir::Type deviceType; + if (parser.parseOperand(deviceOperand) || parser.parseColonType(deviceType)) + return mlir::failure(); + if (parser.resolveOperand(deviceOperand, deviceType, result.operands)) + return mlir::failure(); + if (parser.parseComma()) + return mlir::failure(); + + mlir::Type intype; + if (parser.parseType(intype)) + return mlir::failure(); + result.addAttribute("in_type", mlir::TypeAttr::get(intype)); + llvm::SmallVector operands; + llvm::SmallVector typeVec; + if (!parser.parseOptionalLParen()) { + // parse the LEN params of the derived type. ( : ) + if (parser.parseOperandList(operands, mlir::OpAsmParser::Delimiter::None) || + parser.parseColonTypeList(typeVec) || parser.parseRParen()) + return mlir::failure(); + typeparamsSize = operands.size(); + hasOperands = true; + } + std::int32_t shapeSize = 0; + if (!parser.parseOptionalComma()) { + // parse size to scale by, vector of n dimensions of type index + if (parser.parseOperandList(operands, mlir::OpAsmParser::Delimiter::None)) + return mlir::failure(); + shapeSize = operands.size() - typeparamsSize; + auto idxTy = builder.getIndexType(); + for (std::int32_t i = typeparamsSize, end = operands.size(); i != end; ++i) + typeVec.push_back(idxTy); + hasOperands = true; + } + if (hasOperands && + parser.resolveOperands(operands, typeVec, parser.getNameLoc(), + result.operands)) + return mlir::failure(); + + mlir::Type restype = builder.getIntegerType(64); + if (!restype) { + parser.emitError(parser.getNameLoc(), "invalid allocate type: ") << intype; + return mlir::failure(); + } + llvm::SmallVector segmentSizes{1, typeparamsSize, shapeSize}; + 
result.addAttribute("operandSegmentSizes", + builder.getDenseI32ArrayAttr(segmentSizes)); + if (parser.parseOptionalAttrDict(result.attributes) || + parser.addTypeToList(restype, result.types)) + return mlir::failure(); + return mlir::success(); +} + +mlir::ParseResult omp::TargetAllocMemOp::parse(mlir::OpAsmParser &parser, + mlir::OperationState &result) { + return parseTargetAllocMemOp(parser, result); +} + +void omp::TargetAllocMemOp::print(mlir::OpAsmPrinter &p) { + p << " "; + p.printOperand(getDevice()); + p << " : "; + p << getDevice().getType(); + p << ", "; + p << getInType(); + if (!getTypeparams().empty()) { + p << '(' << getTypeparams() << " : " << getTypeparams().getTypes() << ')'; + } + for (auto sh : getShape()) { + p << ", "; + p.printOperand(sh); + } + p.printOptionalAttrDict((*this)->getAttrs(), + {"in_type", "operandSegmentSizes"}); +} + +llvm::LogicalResult omp::TargetAllocMemOp::verify() { + mlir::Type outType = getType(); + if (!mlir::dyn_cast(outType)) + return emitOpError("must be a integer type"); + return mlir::success(); +} + #define GET_ATTRDEF_CLASSES #include "mlir/Dialect/OpenMP/OpenMPOpsAttributes.cpp.inc" diff --git a/mlir/lib/Dialect/SCF/IR/SCF.cpp b/mlir/lib/Dialect/SCF/IR/SCF.cpp index 89731de1df05..0dbc041d231a 100644 --- a/mlir/lib/Dialect/SCF/IR/SCF.cpp +++ b/mlir/lib/Dialect/SCF/IR/SCF.cpp @@ -4236,14 +4236,15 @@ LogicalResult scf::IndexSwitchOp::verify() { << "see yield operation here"; } for (auto [idx, result, operand] : - llvm::zip(llvm::seq(0, getNumResults()), getResultTypes(), - yield.getOperandTypes())) { - if (result == operand) + llvm::enumerate(getResultTypes(), yield.getOperands())) { + if (!operand) + return yield.emitOpError() << "operand " << idx << " is null\n"; + if (result == operand.getType()) continue; return (emitOpError("expected result #") << idx << " of each region to be " << result) .attachNote(yield.getLoc()) - << name << " returns " << operand << " here"; + << name << " returns " << 
operand.getType() << " here"; } return success(); }; diff --git a/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp b/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp index 3b97786e5815..dabbea1bdec6 100644 --- a/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp +++ b/mlir/lib/Dialect/SparseTensor/Pipelines/SparseTensorPipelines.cpp @@ -71,7 +71,6 @@ void mlir::sparse_tensor::buildSparsifier(OpPassManager &pm, pm.addPass(createLowerAffinePass()); pm.addPass( createConvertVectorToLLVMPass(options.convertVectorToLLVMOptions())); - pm.addPass(createFinalizeMemRefToLLVMConversionPass()); pm.addNestedPass(createConvertComplexToStandardPass()); pm.addNestedPass(arith::createArithExpandOpsPass()); pm.addNestedPass(createConvertMathToLLVMPass()); @@ -79,12 +78,6 @@ void mlir::sparse_tensor::buildSparsifier(OpPassManager &pm, pm.addPass(createConvertComplexToLibm()); pm.addPass( createConvertVectorToLLVMPass(options.convertVectorToLLVMOptions())); - pm.addPass(createConvertComplexToLLVMPass()); - pm.addPass( - createConvertVectorToLLVMPass(options.convertVectorToLLVMOptions())); - pm.addPass(createConvertFuncToLLVMPass()); - pm.addPass(createArithToLLVMConversionPass()); - pm.addPass(createConvertControlFlowToLLVMPass()); // Finalize GPU code generation. if (gpuCodegen) { @@ -99,8 +92,8 @@ void mlir::sparse_tensor::buildSparsifier(OpPassManager &pm, pm.addPass(createGpuModuleToBinaryPass(gpuModuleToBinaryPassOptions)); } - // Convert poison values. - pm.addPass(createUBToLLVMConversionPass()); + // Convert to LLVM. + pm.addPass(createConvertToLLVMPass()); // Ensure all casts are realized. 
pm.addPass(createReconcileUnrealizedCastsPass()); diff --git a/mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt b/mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt index 7c6a4f37db9a..7869a28dfed5 100644 --- a/mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt +++ b/mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt @@ -17,6 +17,8 @@ add_mlir_dialect_library(MLIRXeGPUDialect MLIRAffineUtils MLIRArithUtils MLIRDialectUtils + MLIRGPUDialect + MLIRXeVMDialect MLIRIR MLIRViewLikeInterface MLIRVectorDialect diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp index d997296a22c2..8ea8cb1f4597 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp @@ -67,7 +67,7 @@ genOffsetsComputingInsts(OpBuilder &builder, Location loc, StaticTileOffsetRange(sizePerWg, distUnit)) { SmallVector base = llvm::map_to_vector(unitOffs, [&](int64_t d) -> Value { - return builder.create(loc, d); + return arith::ConstantIndexOp::create(builder, loc, d); }); SmallVector adds = llvm::map_to_vector( @@ -80,7 +80,7 @@ genOffsetsComputingInsts(OpBuilder &builder, Location loc, llvm::zip_equal(adds, sizePerWg), [&](const auto &t) -> Value { return builder.createOrFold( loc, std::get<0>(t), - builder.create(loc, std::get<1>(t))); + arith::ConstantIndexOp::create(builder, loc, std::get<1>(t))); }); offsets.push_back(mods); @@ -427,7 +427,7 @@ RangeAttr::verify(llvm::function_ref emitError, // XeGPU_TensorDescType //===----------------------------------------------------------------------===// -mlir::Type TensorDescType::parse(::mlir::AsmParser &parser) { +mlir::Type TensorDescType::parse(AsmParser &parser) { llvm::SmallVector shape; mlir::Type elementType; mlir::FailureOr encoding; @@ -477,7 +477,7 @@ mlir::Type TensorDescType::parse(::mlir::AsmParser &parser) { layout.value_or(mlir::Attribute())); } -void TensorDescType::print(::mlir::AsmPrinter &printer) const { +void TensorDescType::print(AsmPrinter &printer) const { printer << "<"; auto 
shape = getShape(); @@ -522,10 +522,10 @@ TensorDescType TensorDescType::get(llvm::ArrayRef shape, return Base::get(context, shape, elementType, attr, layout); } -LogicalResult TensorDescType::verify( - llvm::function_ref<::mlir::InFlightDiagnostic()> emitError, - llvm::ArrayRef shape, mlir::Type elementType, - mlir::Attribute encoding, mlir::Attribute layout) { +LogicalResult +TensorDescType::verify(llvm::function_ref emitError, + llvm::ArrayRef shape, mlir::Type elementType, + mlir::Attribute encoding, mlir::Attribute layout) { size_t rank = shape.size(); if (rank == 0) @@ -591,6 +591,119 @@ LogicalResult TensorDescType::verify( return success(); } +//===----------------------------------------------------------------------===// +// XeGPU_MemDescType +//===----------------------------------------------------------------------===// +mlir::Type MemDescType::parse(AsmParser &parser) { + llvm::SmallVector shape; + mlir::Type elementType; + mlir::FailureOr layout; + + // Parse literal '<' + if (parser.parseLess()) + return {}; + + auto shapeLoc = parser.getCurrentLocation(); + if (mlir::failed(parser.parseDimensionList(shape, false, true))) { + parser.emitError(shapeLoc, "failed to parse parameter 'shape'"); + return {}; + } + + auto elemTypeLoc = parser.getCurrentLocation(); + if (mlir::failed(parser.parseType(elementType))) { + parser.emitError(elemTypeLoc, "failed to parse parameter 'elementType'"); + return {}; + } + + // parse optional attributes + if (mlir::succeeded(parser.parseOptionalComma())) { + MemLayoutAttr attr; + ParseResult res = parser.parseAttribute(attr); + if (mlir::failed(res)) + return {}; + layout = attr; + } + + // Parse literal '>' + if (parser.parseGreater()) + return {}; + + MLIRContext *ctxt = parser.getContext(); + return MemDescType::getChecked( + [&]() { return parser.emitError(parser.getNameLoc()); }, ctxt, shape, + elementType, layout.value_or(MemLayoutAttr())); +} + +void MemDescType::print(AsmPrinter &printer) const { + printer << 
"<"; + + printer.printDimensionList(getShape()); + printer << 'x'; + printer << getElementType(); + + if (auto layout = getMemLayout()) + printer << ", " << layout; + + printer << ">"; +} + +//===----------------------------------------------------------------------===// +// XeGPU_MemDescType +//===----------------------------------------------------------------------===// + +Attribute MemLayoutAttr::parse(AsmParser &parser, Type type) { + + auto context = parser.getContext(); + llvm::SMLoc loc = parser.getCurrentLocation(); + + llvm::SmallDenseSet seenKeys; + SmallVector attributes; + + auto parseElt = [&]() -> ParseResult { + StringRef nameId; + if (failed(parser.parseKeyword(&nameId))) + return parser.emitError(loc, "expected valid attribute name"); + + if (!seenKeys.insert(nameId).second) + return parser.emitError(loc, "duplicate key '") + << nameId << " in mem layout attribute"; + + if (failed(parser.parseEqual())) + return failure(); + + Attribute attr; + if (failed(parser.parseAttribute(attr))) + return failure(); + attributes.emplace_back(nameId, attr); + return success(); + }; + + // Parse literal '<' + if (parser.parseLess()) + return {}; + + if (failed(parser.parseCommaSeparatedList(parseElt))) + return {}; + + // Parse literal '>' + if (parser.parseGreater()) + return {}; + + return parser.getChecked( + loc, context, DictionaryAttr::get(context, attributes)); +} + +void MemLayoutAttr::print(AsmPrinter &printer) const { + printer << "<"; + ArrayRef attrs = getAttrs().getValue(); + for (size_t i = 0; i < attrs.size(); i++) { + printer << attrs[i].getName().str() << " = " << attrs[i].getValue(); + if (i < attrs.size() - 1) + printer << ", "; + } + printer << ">"; +} + } // namespace xegpu } // namespace mlir diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index 7b7ce19e6937..eee0fdc7160d 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -7,6 +7,8 @@ 
//===----------------------------------------------------------------------===// #include "mlir/Dialect/Arith/Utils/Utils.h" +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/LLVMIR/XeVMDialect.h" #include "mlir/Dialect/Utils/IndexingUtils.h" #include "mlir/Dialect/Utils/StaticValueUtils.h" #include "mlir/Dialect/XeGPU/IR/XeGPU.h" @@ -21,6 +23,17 @@ namespace mlir { namespace xegpu { +bool isSharedMemory(const MemRefType &memrefTy) { + Attribute attr = memrefTy.getMemorySpace(); + if (auto intAttr = llvm::dyn_cast(attr)) + return intAttr.getInt() == 3; + if (auto memrefSpace = llvm::dyn_cast(attr)) + return memrefSpace.getValue() == MemorySpace::SLM; + if (auto xevmSpace = llvm::dyn_cast(attr)) + return xevmSpace.getValue() == xevm::AddrSpace::SHARED; + return gpu::GPUDialect::isWorkgroupMemoryAddressSpace(attr); +} + template static std::string makeString(T array, bool breakline = false) { std::string buf; @@ -919,6 +932,101 @@ void ConvertLayoutOp::getCanonicalizationPatterns(RewritePatternSet &patterns, patterns.add(context); } +//===----------------------------------------------------------------------===// +// XeGPU_LoadMatrixOp +//===----------------------------------------------------------------------===// +void LoadMatrixOp::build(OpBuilder &builder, OperationState &state, Type res, + TypedValue memDesc, + llvm::ArrayRef offsets, + LayoutTrait layout) { + llvm::SmallVector dynamicOffsets; + llvm::SmallVector staticOffsets; + dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets); + auto staticOffsetsAttr = builder.getDenseI64ArrayAttr(staticOffsets); + build(builder, state, res, memDesc, dynamicOffsets, staticOffsetsAttr, + layout); +} + +LogicalResult LoadMatrixOp::verify() { + VectorType resTy = getRes().getType(); + MemDescType mdescTy = getMemDesc().getType(); + + if (mdescTy.getRank() != 2) + return emitOpError("mem_desc must be 2D."); + + ArrayRef valueShape = resTy.getShape(); + ArrayRef mdescShape = 
mdescTy.getShape(); + if (llvm::any_of(llvm::zip_equal(valueShape, mdescShape), + [](auto p) { return std::get<0>(p) > std::get<1>(p); })) + return emitOpError("result shape must not exceed mem_desc shape."); + return success(); +} + +//===----------------------------------------------------------------------===// +// XeGPU_StoreMatrixOp +//===----------------------------------------------------------------------===// +void StoreMatrixOp::build(OpBuilder &builder, OperationState &state, Value data, + TypedValue memDesc, + llvm::ArrayRef offsets, + LayoutTrait layout) { + llvm::SmallVector dynamicOffsets; + llvm::SmallVector staticOffsets; + dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets); + auto staticOffsetsAttr = builder.getDenseI64ArrayAttr(staticOffsets); + build(builder, state, data, memDesc, dynamicOffsets, staticOffsetsAttr, + layout); +} + +LogicalResult StoreMatrixOp::verify() { + VectorType dataTy = getData().getType(); + MemDescType mdescTy = getMemDesc().getType(); + + if (mdescTy.getRank() != 2) + return emitOpError("mem_desc must be 2D."); + + ArrayRef dataShape = dataTy.getShape(); + ArrayRef mdescShape = mdescTy.getShape(); + if (llvm::any_of(llvm::zip_equal(dataShape, mdescShape), + [](auto p) { return std::get<0>(p) > std::get<1>(p); })) + return emitOpError("data shape must not exceed mem_desc shape."); + + return success(); +} + +//===----------------------------------------------------------------------===// +// XeGPU_MemDescSubviewOp +//===----------------------------------------------------------------------===// + +void MemDescSubviewOp::build(OpBuilder &builder, OperationState &state, + Type resTy, Value src, + llvm::ArrayRef offsets) { + llvm::SmallVector dynamicOffsets; + llvm::SmallVector staticOffsets; + dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets); + auto staticOffsetsAttr = builder.getDenseI64ArrayAttr(staticOffsets); + build(builder, state, resTy, src, dynamicOffsets, staticOffsetsAttr); +} + 
+LogicalResult MemDescSubviewOp::verify() { + MemDescType srcTy = getSrc().getType(); + MemDescType resTy = getRes().getType(); + ArrayRef srcShape = srcTy.getShape(); + ArrayRef resShape = resTy.getShape(); + + if (srcTy.getRank() < resTy.getRank()) + return emitOpError("result rank must not exceed source rank."); + + if (llvm::any_of( + llvm::zip_equal(resShape, srcShape.take_back(resShape.size())), + [](auto p) { return std::get<0>(p) > std::get<1>(p); })) + return emitOpError("result shape must not exceed source shape."); + + if (srcTy.getStrides() != resTy.getStrides()) + return emitOpError("result must inherit the source strides."); + + return success(); +} + } // namespace xegpu } // namespace mlir diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp index 270d71aaa727..ecec186fe3fc 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp @@ -166,7 +166,7 @@ struct WgToSgCreateNdOp : public OpConversionPattern { // Subtract startOfRange from the original subgroup id to get // the adjusted sg id Value startOfRangeVal = - rewriter.create(loc, startOfRange); + arith::ConstantIndexOp::create(rewriter, loc, startOfRange); linearSgId = rewriter.createOrFold(loc, linearSgId, startOfRangeVal); } @@ -524,8 +524,8 @@ struct WgToSgElementwiseOp : public ConversionPattern { // is lowered to: // #a = #xegpu.layout // #b = #xegpu.layout -// store_matrix %1, %slm <{layout_input_0 = #a}> : vector<32x16>, matrix_desc<32x64xf32> -// %d = load_matrix %slm <{layout_result_0 = #a}> : matrix_desc<32x64xf32> -> vector<16x32xf32> +// store_matrix %1, %slm <{layout_input_0 = #a}> : vector<32x16>, mem_desc<32x64xf32> +// %d = load_matrix %slm <{layout_result_0 = #a}> : mem_desc<32x64xf32> -> vector<16x32xf32> // xegpu.convert_layout %d <{input_layout = #a, target_layout = #b}> : vector<16x32xf32> // clang-format on struct 
WgToSgConvertLayoutOp @@ -675,7 +675,7 @@ struct WgToSgArithConstantOp : public OpConversionPattern { auto newType = VectorType::get(sgShape, vecType.getElementType()); auto sgAttr = DenseElementsAttr::get(newType, singleVal); auto cstOp = - rewriter.create(op.getLoc(), newType, sgAttr); + arith::ConstantOp::create(rewriter, op.getLoc(), newType, sgAttr); if (auto newLayout = layout.dropSgLayoutAndData()) xegpu::setLayoutAttr(cstOp->getResult(0), newLayout); SmallVector newConsts(count, cstOp); diff --git a/mlir/lib/ExecutionEngine/ExecutionEngine.cpp b/mlir/lib/ExecutionEngine/ExecutionEngine.cpp index f704fbfbe8ff..52162a43aeae 100644 --- a/mlir/lib/ExecutionEngine/ExecutionEngine.cpp +++ b/mlir/lib/ExecutionEngine/ExecutionEngine.cpp @@ -106,7 +106,7 @@ void ExecutionEngine::dumpToObjectFile(StringRef filename) { } // Compilation is lazy and it doesn't populate object cache unless requested. // In case object dump is requested before cache is populated, we need to - // force compilation manually. + // force compilation manually. if (cache->isEmpty()) { for (std::string &functionName : functionNames) { auto result = lookupPacked(functionName); @@ -400,13 +400,6 @@ ExecutionEngine::create(Operation *m, const ExecutionEngineOptions &options, return symbolMap; }; engine->registerSymbols(runtimeSymbolMap); - - // Execute the global constructors from the module being processed. - // TODO: Allow JIT initialize for AArch64. Currently there's a bug causing a - // crash for AArch64 see related issue #71963. 
- if (!engine->jit->getTargetTriple().isAArch64()) - cantFail(engine->jit->initialize(engine->jit->getMainJITDylib())); - return std::move(engine); } @@ -442,6 +435,7 @@ Expected ExecutionEngine::lookup(StringRef name) const { Error ExecutionEngine::invokePacked(StringRef name, MutableArrayRef args) { + initialize(); auto expectedFPtr = lookupPacked(name); if (!expectedFPtr) return expectedFPtr.takeError(); @@ -451,3 +445,13 @@ Error ExecutionEngine::invokePacked(StringRef name, return Error::success(); } + +void ExecutionEngine::initialize() { + if (isInitialized) + return; + // TODO: Allow JIT initialize for AArch64. Currently there's a bug causing a + // crash for AArch64 see related issue #71963. + if (!jit->getTargetTriple().isAArch64()) + cantFail(jit->initialize(jit->getMainJITDylib())); + isInitialized = true; +} diff --git a/mlir/lib/ExecutionEngine/JitRunner.cpp b/mlir/lib/ExecutionEngine/JitRunner.cpp index 2107df37d199..0ada4cc96570 100644 --- a/mlir/lib/ExecutionEngine/JitRunner.cpp +++ b/mlir/lib/ExecutionEngine/JitRunner.cpp @@ -202,6 +202,8 @@ compileAndExecute(Options &options, Operation *module, StringRef entryPoint, auto engine = std::move(*expectedEngine); + engine->initialize(); + auto expectedFPtr = engine->lookupPacked(entryPoint); if (!expectedFPtr) return expectedFPtr.takeError(); diff --git a/mlir/lib/Interfaces/Utils/InferIntRangeCommon.cpp b/mlir/lib/Interfaces/Utils/InferIntRangeCommon.cpp index 2f47939df5a0..af4ea5ac1cec 100644 --- a/mlir/lib/Interfaces/Utils/InferIntRangeCommon.cpp +++ b/mlir/lib/Interfaces/Utils/InferIntRangeCommon.cpp @@ -290,8 +290,7 @@ static ConstantIntRanges inferDivURange(const ConstantIntRanges &lhs, DivisionFixupFn fixup) { const APInt &lhsMin = lhs.umin(), &lhsMax = lhs.umax(), &rhsMin = rhs.umin(), &rhsMax = rhs.umax(); - - if (!rhsMin.isZero()) { + if (!rhsMin.isZero() && !rhsMax.isZero()) { auto udiv = [&fixup](const APInt &a, const APInt &b) -> std::optional { return fixup(a, b, a.udiv(b)); diff --git 
a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index eb96cb211fdd..6694de838353 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -5867,6 +5867,10 @@ static bool isTargetDeviceOp(Operation *op) { if (mlir::isa(op)) return true; + if (mlir::isa(op) || + mlir::isa(op)) + return true; + if (auto parentFn = op->getParentOfType()) if (auto declareTargetIface = llvm::dyn_cast( @@ -5879,6 +5883,85 @@ static bool isTargetDeviceOp(Operation *op) { return false; } +static llvm::Function *getOmpTargetAlloc(llvm::IRBuilderBase &builder, + llvm::Module *llvmModule) { + llvm::Type *i64Ty = builder.getInt64Ty(); + llvm::Type *i32Ty = builder.getInt32Ty(); + llvm::Type *returnType = builder.getPtrTy(0); + llvm::FunctionType *fnType = + llvm::FunctionType::get(returnType, {i64Ty, i32Ty}, false); + llvm::Function *func = cast( + llvmModule->getOrInsertFunction("omp_target_alloc", fnType).getCallee()); + return func; +} + +static LogicalResult +convertTargetAllocMemOp(Operation &opInst, llvm::IRBuilderBase &builder, + LLVM::ModuleTranslation &moduleTranslation) { + auto allocMemOp = cast(opInst); + if (!allocMemOp) + return failure(); + + // Get "omp_target_alloc" function + llvm::Module *llvmModule = moduleTranslation.getLLVMModule(); + llvm::Function *ompTargetAllocFunc = getOmpTargetAlloc(builder, llvmModule); + // Get the corresponding device value in llvm + mlir::Value deviceNum = allocMemOp.getDevice(); + llvm::Value *llvmDeviceNum = moduleTranslation.lookupValue(deviceNum); + // Get the allocation size. 
+ llvm::DataLayout dataLayout = llvmModule->getDataLayout(); + mlir::Type heapTy = allocMemOp.getAllocatedType(); + llvm::Type *llvmHeapTy = moduleTranslation.convertType(heapTy); + llvm::TypeSize typeSize = dataLayout.getTypeStoreSize(llvmHeapTy); + llvm::Value *allocSize = builder.getInt64(typeSize.getFixedValue()); + for (auto typeParam : allocMemOp.getTypeparams()) + allocSize = + builder.CreateMul(allocSize, moduleTranslation.lookupValue(typeParam)); + // Create call to "omp_target_alloc" with the args as translated llvm values. + llvm::CallInst *call = + builder.CreateCall(ompTargetAllocFunc, {allocSize, llvmDeviceNum}); + llvm::Value *resultI64 = builder.CreatePtrToInt(call, builder.getInt64Ty()); + + // Map the result + moduleTranslation.mapValue(allocMemOp.getResult(), resultI64); + return success(); +} + +static llvm::Function *getOmpTargetFree(llvm::IRBuilderBase &builder, + llvm::Module *llvmModule) { + llvm::Type *ptrTy = builder.getPtrTy(0); + llvm::Type *i32Ty = builder.getInt32Ty(); + llvm::Type *voidTy = builder.getVoidTy(); + llvm::FunctionType *fnType = + llvm::FunctionType::get(voidTy, {ptrTy, i32Ty}, false); + llvm::Function *func = dyn_cast( + llvmModule->getOrInsertFunction("omp_target_free", fnType).getCallee()); + return func; +} + +static LogicalResult +convertTargetFreeMemOp(Operation &opInst, llvm::IRBuilderBase &builder, + LLVM::ModuleTranslation &moduleTranslation) { + auto freeMemOp = cast(opInst); + if (!freeMemOp) + return failure(); + + // Get "omp_target_free" function + llvm::Module *llvmModule = moduleTranslation.getLLVMModule(); + llvm::Function *ompTragetFreeFunc = getOmpTargetFree(builder, llvmModule); + // Get the corresponding device value in llvm + mlir::Value deviceNum = freeMemOp.getDevice(); + llvm::Value *llvmDeviceNum = moduleTranslation.lookupValue(deviceNum); + // Get the corresponding heapref value in llvm + mlir::Value heapref = freeMemOp.getHeapref(); + llvm::Value *llvmHeapref = 
moduleTranslation.lookupValue(heapref); + // Convert heapref int to ptr and call "omp_target_free" + llvm::Value *intToPtr = + builder.CreateIntToPtr(llvmHeapref, builder.getPtrTy(0)); + builder.CreateCall(ompTragetFreeFunc, {intToPtr, llvmDeviceNum}); + return success(); +} + /// Given an OpenMP MLIR operation, create the corresponding LLVM IR (including /// OpenMP runtime calls). static LogicalResult @@ -6053,6 +6136,12 @@ convertHostOrTargetOperation(Operation *op, llvm::IRBuilderBase &builder, // the omp.canonical_loop. return applyUnrollHeuristic(op, builder, moduleTranslation); }) + .Case([&](omp::TargetAllocMemOp) { + return convertTargetAllocMemOp(*op, builder, moduleTranslation); + }) + .Case([&](omp::TargetFreeMemOp) { + return convertTargetFreeMemOp(*op, builder, moduleTranslation); + }) .Default([&](Operation *inst) { return inst->emitError() << "not yet implemented: " << inst->getName(); diff --git a/mlir/lib/Target/Wasm/TranslateFromWasm.cpp b/mlir/lib/Target/Wasm/TranslateFromWasm.cpp index da811ba0954c..8d450520629e 100644 --- a/mlir/lib/Target/Wasm/TranslateFromWasm.cpp +++ b/mlir/lib/Target/Wasm/TranslateFromWasm.cpp @@ -780,8 +780,9 @@ parsed_inst_t ExpressionParser::parseConstInst( auto parsedConstant = parser.parseLiteral(); if (failed(parsedConstant)) return failure(); - auto constOp = builder.create( - *currentOpLoc, buildLiteralAttr(builder, *parsedConstant)); + auto constOp = + ConstOp::create(builder, *currentOpLoc, + buildLiteralAttr(builder, *parsedConstant)); return {{constOp.getResult()}}; } @@ -929,8 +930,8 @@ private: << " type registration."; FunctionType type = symbols.moduleFuncTypes[tid.id]; std::string symbol = symbols.getNewFuncSymbolName(); - auto funcOp = - builder.create(loc, symbol, moduleName, importName, type); + auto funcOp = FuncImportOp::create(builder, loc, symbol, moduleName, + importName, type); symbols.funcSymbols.push_back({{FlatSymbolRefAttr::get(funcOp)}, type}); return funcOp.verify(); } @@ -939,8 +940,8 @@ 
private: LogicalResult visitImport(Location loc, StringRef moduleName, StringRef importName, LimitType limitType) { std::string symbol = symbols.getNewMemorySymbolName(); - auto memOp = builder.create(loc, symbol, moduleName, - importName, limitType); + auto memOp = MemImportOp::create(builder, loc, symbol, moduleName, + importName, limitType); symbols.memSymbols.push_back({FlatSymbolRefAttr::get(memOp)}); return memOp.verify(); } @@ -949,8 +950,8 @@ private: LogicalResult visitImport(Location loc, StringRef moduleName, StringRef importName, TableType tableType) { std::string symbol = symbols.getNewTableSymbolName(); - auto tableOp = builder.create(loc, symbol, moduleName, - importName, tableType); + auto tableOp = TableImportOp::create(builder, loc, symbol, moduleName, + importName, tableType); symbols.tableSymbols.push_back({FlatSymbolRefAttr::get(tableOp)}); return tableOp.verify(); } @@ -960,8 +961,8 @@ private: StringRef importName, GlobalTypeRecord globalType) { std::string symbol = symbols.getNewGlobalSymbolName(); auto giOp = - builder.create(loc, symbol, moduleName, importName, - globalType.type, globalType.isMutable); + GlobalImportOp::create(builder, loc, symbol, moduleName, importName, + globalType.type, globalType.isMutable); symbols.globalSymbols.push_back( {{FlatSymbolRefAttr::get(giOp)}, giOp.getType()}); return giOp.verify(); @@ -1012,7 +1013,7 @@ public: if (failed(fillRegistry)) return; - mOp = builder.create(getLocation()); + mOp = ModuleOp::create(builder, getLocation()); builder.setInsertionPointToStart(&mOp.getBodyRegion().front()); LogicalResult parsingTypes = parseSection(); if (failed(parsingTypes)) @@ -1172,7 +1173,7 @@ WasmBinaryParser::parseSectionItem(ParserHead &ph, LDBG() << " Parsed table description: " << *tableType; StringAttr symbol = builder.getStringAttr(symbols.getNewTableSymbolName()); auto tableOp = - builder.create(opLocation, symbol.strref(), *tableType); + TableOp::create(builder, opLocation, symbol.strref(), *tableType); 
symbols.tableSymbols.push_back({SymbolRefAttr::get(tableOp)}); return success(); } @@ -1190,11 +1191,11 @@ WasmBinaryParser::parseSectionItem(ParserHead &ph, return emitError(getLocation(), "invalid type index: ") << typeIdx; std::string symbol = symbols.getNewFuncSymbolName(); auto funcOp = - builder.create(opLoc, symbol, symbols.moduleFuncTypes[typeIdx]); + FuncOp::create(builder, opLoc, symbol, symbols.moduleFuncTypes[typeIdx]); Block *block = funcOp.addEntryBlock(); auto ip = builder.saveInsertionPoint(); builder.setInsertionPointToEnd(block); - builder.create(opLoc); + ReturnOp::create(builder, opLoc); builder.restoreInsertionPoint(ip); symbols.funcSymbols.push_back( {{FlatSymbolRefAttr::get(funcOp.getSymNameAttr())}, @@ -1225,7 +1226,7 @@ WasmBinaryParser::parseSectionItem(ParserHead &ph, LDBG() << " Registering memory " << *memory; std::string symbol = symbols.getNewMemorySymbolName(); - auto memOp = builder.create(opLocation, symbol, *memory); + auto memOp = MemOp::create(builder, opLocation, symbol, *memory); symbols.memSymbols.push_back({SymbolRefAttr::get(memOp)}); return success(); } diff --git a/mlir/lib/Transforms/Utils/DialectConversion.cpp b/mlir/lib/Transforms/Utils/DialectConversion.cpp index ff34a5896576..e48cfca48680 100644 --- a/mlir/lib/Transforms/Utils/DialectConversion.cpp +++ b/mlir/lib/Transforms/Utils/DialectConversion.cpp @@ -1637,6 +1637,11 @@ ValueRange ConversionPatternRewriterImpl::buildUnresolvedMaterialization( builder.setInsertionPoint(ip.getBlock(), ip.getPoint()); UnrealizedConversionCastOp convertOp = UnrealizedConversionCastOp::create(builder, loc, outputTypes, inputs); + if (config.attachDebugMaterializationKind) { + StringRef kindStr = + kind == MaterializationKind::Source ? 
"source" : "target"; + convertOp->setAttr("__kind__", builder.getStringAttr(kindStr)); + } if (isPureTypeConversion) convertOp->setAttr(kPureTypeConversionMarker, builder.getUnitAttr()); diff --git a/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp b/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp index 607b86cb8631..0a2a0cc1d5c7 100644 --- a/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp +++ b/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp @@ -871,7 +871,18 @@ LogicalResult RegionPatternRewriteDriver::simplify(bool *changed) && { ctx->executeAction( [&] { - continueRewrites = processWorklist(); + continueRewrites = false; + + // Erase unreachable blocks + // Operations like: + // %add = arith.addi %add, %add : i64 + // are legal in unreachable code. Unfortunately many patterns would be + // unsafe to apply on such IR and can lead to crashes or infinite + // loops. + continueRewrites |= + succeeded(eraseUnreachableBlocks(rewriter, region)); + + continueRewrites |= processWorklist(); // After applying patterns, make sure that the CFG of each of the // regions is kept up to date. diff --git a/mlir/lib/Transforms/Utils/InliningUtils.cpp b/mlir/lib/Transforms/Utils/InliningUtils.cpp index eeb40529cc2f..5ea31054051a 100644 --- a/mlir/lib/Transforms/Utils/InliningUtils.cpp +++ b/mlir/lib/Transforms/Utils/InliningUtils.cpp @@ -13,6 +13,7 @@ #include "mlir/Transforms/InliningUtils.h" #include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinOps.h" #include "mlir/IR/IRMapping.h" #include "mlir/IR/Operation.h" #include "mlir/Interfaces/CallInterfaces.h" @@ -182,6 +183,11 @@ static bool isLegalToInline(InlinerInterface &interface, Region *src, IRMapping &valueMapping) { for (auto &block : *src) { for (auto &op : block) { + // UnrealizedConversionCastOp is inlineable but cannot implement the + // inliner interface due to layering constraints. + if (isa(op)) + continue; + // Check this operation. 
if (!interface.isLegalToInline(&op, insertRegion, shouldCloneInlinedRegion, valueMapping)) { diff --git a/mlir/lib/Transforms/Utils/WalkPatternRewriteDriver.cpp b/mlir/lib/Transforms/Utils/WalkPatternRewriteDriver.cpp index ee5c642c943c..2111e2912056 100644 --- a/mlir/lib/Transforms/Utils/WalkPatternRewriteDriver.cpp +++ b/mlir/lib/Transforms/Utils/WalkPatternRewriteDriver.cpp @@ -13,12 +13,14 @@ #include "mlir/Transforms/WalkPatternRewriteDriver.h" #include "mlir/IR/MLIRContext.h" +#include "mlir/IR/Operation.h" #include "mlir/IR/OperationSupport.h" #include "mlir/IR/PatternMatch.h" #include "mlir/IR/Verifier.h" #include "mlir/IR/Visitors.h" #include "mlir/Rewrite/PatternApplicator.h" -#include "llvm/Support/Debug.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Support/DebugLog.h" #include "llvm/Support/ErrorHandling.h" #define DEBUG_TYPE "walk-rewriter" @@ -88,20 +90,97 @@ void walkAndApplyPatterns(Operation *op, PatternApplicator applicator(patterns); applicator.applyDefaultCostModel(); + // Iterator on all reachable operations in the region. + // Also keep track if we visited the nested regions of the current op + // already to drive the post-order traversal. + struct RegionReachableOpIterator { + RegionReachableOpIterator(Region *region) : region(region) { + regionIt = region->begin(); + if (regionIt != region->end()) + blockIt = regionIt->begin(); + } + // Advance the iterator to the next reachable operation. + void advance() { + assert(regionIt != region->end()); + hasVisitedRegions = false; + if (blockIt == regionIt->end()) { + ++regionIt; + if (regionIt != region->end()) + blockIt = regionIt->begin(); + return; + } + ++blockIt; + if (blockIt != regionIt->end()) { + LDBG() << "Incrementing block iterator, next op: " + << OpWithFlags(&*blockIt, OpPrintingFlags().skipRegions()); + } + } + // The region we're iterating over. + Region *region; + // The Block currently being iterated over. 
+ Region::iterator regionIt; + // The Operation currently being iterated over. + Block::iterator blockIt; + // Whether we've visited the nested regions of the current op already. + bool hasVisitedRegions = false; + }; + + // Worklist of regions to visit to drive the post-order traversal. + SmallVector worklist; + + LDBG() << "Starting walk-based pattern rewrite driver"; ctx->executeAction( [&] { + // Perform a post-order traversal of the regions, visiting each + // reachable operation. for (Region ®ion : op->getRegions()) { - region.walk([&](Operation *visitedOp) { - LLVM_DEBUG(llvm::dbgs() << "Visiting op: "; visitedOp->print( - llvm::dbgs(), OpPrintingFlags().skipRegions()); - llvm::dbgs() << "\n";); -#if MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS - erasedListener.visitedOp = visitedOp; -#endif // MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS - if (succeeded(applicator.matchAndRewrite(visitedOp, rewriter))) { - LLVM_DEBUG(llvm::dbgs() << "\tOp matched and rewritten\n";); + assert(worklist.empty()); + if (region.empty()) + continue; + + // Prime the worklist with the entry block of this region. + worklist.push_back({®ion}); + while (!worklist.empty()) { + RegionReachableOpIterator &it = worklist.back(); + if (it.regionIt == it.region->end()) { + // We're done with this region. + worklist.pop_back(); + continue; } - }); + if (it.blockIt == it.regionIt->end()) { + // We're done with this block. + it.advance(); + continue; + } + Operation *op = &*it.blockIt; + // If we haven't visited the nested regions of this op yet, + // enqueue them. + if (!it.hasVisitedRegions) { + it.hasVisitedRegions = true; + for (Region &nestedRegion : llvm::reverse(op->getRegions())) { + if (nestedRegion.empty()) + continue; + worklist.push_back({&nestedRegion}); + } + } + // If we're not at the back of the worklist, we've enqueued some + // nested region for processing. 
We'll come back to this op later + // (post-order) + if (&it != &worklist.back()) + continue; + + // Preemptively increment the iterator, in case the current op + // would be erased. + it.advance(); + + LDBG() << "Visiting op: " + << OpWithFlags(op, OpPrintingFlags().skipRegions()); +#if MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS + erasedListener.visitedOp = op; +#endif // MLIR_ENABLE_EXPENSIVE_PATTERN_API_CHECKS + if (succeeded(applicator.matchAndRewrite(op, rewriter))) + LDBG() << "\tOp matched and rewritten"; + } } }, {op}); diff --git a/mlir/python/mlir/_mlir_libs/_mlirExecutionEngine.pyi b/mlir/python/mlir/_mlir_libs/_mlirExecutionEngine.pyi index 58d453d2b2d3..4b82c7848929 100644 --- a/mlir/python/mlir/_mlir_libs/_mlirExecutionEngine.pyi +++ b/mlir/python/mlir/_mlir_libs/_mlirExecutionEngine.pyi @@ -19,5 +19,6 @@ class ExecutionEngine: def dump_to_object_file(self, file_name: str) -> None: ... def raw_lookup(self, func_name: str) -> int: ... def raw_register_runtime(self, name: str, callback: object) -> None: ... + def init(self) -> None: ... @property def _CAPIPtr(self) -> object: ... diff --git a/mlir/test/CAPI/CMakeLists.txt b/mlir/test/CAPI/CMakeLists.txt index a7f9eb9b4efe..d45142510a49 100644 --- a/mlir/test/CAPI/CMakeLists.txt +++ b/mlir/test/CAPI/CMakeLists.txt @@ -30,6 +30,13 @@ if(MLIR_ENABLE_EXECUTION_ENGINE) MLIRCAPIConversion MLIRCAPIExecutionEngine MLIRCAPIRegisterEverything +) + _add_capi_test_executable(mlir-capi-global-constructors-test + global_constructors.c + LINK_LIBS PRIVATE + MLIRCAPIConversion + MLIRCAPIExecutionEngine + MLIRCAPIRegisterEverything ) endif() diff --git a/mlir/test/CAPI/global_constructors.c b/mlir/test/CAPI/global_constructors.c new file mode 100644 index 000000000000..bd2fe1416f0d --- /dev/null +++ b/mlir/test/CAPI/global_constructors.c @@ -0,0 +1,113 @@ +//===- global_constructors.c - Test JIT with the global constructors ------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM +// Exceptions.
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: target=aarch64{{.*}}, target=arm64{{.*}} +/* RUN: mlir-capi-global-constructors-test 2>&1 | FileCheck %s + */ +/* REQUIRES: host-supports-jit + */ + +#include "mlir-c/Conversion.h" +#include "mlir-c/ExecutionEngine.h" +#include "mlir-c/IR.h" +#include "mlir-c/RegisterEverything.h" + +#include +#include +#include +#include +#include + +static void registerAllUpstreamDialects(MlirContext ctx) { + MlirDialectRegistry registry = mlirDialectRegistryCreate(); + mlirRegisterAllDialects(registry); + mlirContextAppendDialectRegistry(ctx, registry); + mlirDialectRegistryDestroy(registry); +} + +void lowerModuleToLLVM(MlirContext ctx, MlirModule module) { + MlirPassManager pm = mlirPassManagerCreate(ctx); + MlirOpPassManager opm = mlirPassManagerGetNestedUnder( + pm, mlirStringRefCreateFromCString("func.func")); + mlirPassManagerAddOwnedPass(pm, mlirCreateConversionConvertFuncToLLVMPass()); + mlirOpPassManagerAddOwnedPass( + opm, mlirCreateConversionArithToLLVMConversionPass()); + MlirLogicalResult status = + mlirPassManagerRunOnOp(pm, mlirModuleGetOperation(module)); + if (mlirLogicalResultIsFailure(status)) { + fprintf(stderr, "Unexpected failure running pass pipeline\n"); + exit(2); + } + mlirPassManagerDestroy(pm); +} + +// Helper variable to track callback invocations +static int initCnt = 0; + +// Callback function that will be called during JIT initialization +static void initCallback(void) { initCnt += 1; } + +// CHECK-LABEL: Running test 'testGlobalCtorJitCallback' +void testGlobalCtorJitCallback(void) { + MlirContext ctx = mlirContextCreate(); + registerAllUpstreamDialects(ctx); + + // Create module with global constructor that calls our callback + MlirModule module = mlirModuleCreateParse( + ctx, mlirStringRefCreateFromCString( + // 
clang-format off +"module { \n" +" llvm.mlir.global_ctors ctors = [@ctor], priorities = [0 : i32], data = [#llvm.zero] \n" +" llvm.func @ctor() { \n" +" func.call @init_callback() : () -> () \n" +" llvm.return \n" +" } \n" +" func.func private @init_callback() attributes { llvm.emit_c_interface } \n" +"} \n" + // clang-format on + )); + + lowerModuleToLLVM(ctx, module); + mlirRegisterAllLLVMTranslations(ctx); + + // Create execution engine with initialization disabled + MlirExecutionEngine jit = mlirExecutionEngineCreate( + module, /*optLevel=*/2, /*numPaths=*/0, /*sharedLibPaths=*/NULL, + /*enableObjectDump=*/false); + + if (mlirExecutionEngineIsNull(jit)) { + fprintf(stderr, "Execution engine creation failed\n"); + exit(2); + } + + // Register callback symbol before initialization + mlirExecutionEngineRegisterSymbol( + jit, mlirStringRefCreateFromCString("_mlir_ciface_init_callback"), + (void *)(uintptr_t)initCallback); + + mlirExecutionEngineInitialize(jit); + + // CHECK: Init count: 1 + printf("Init count: %d\n", initCnt); + + mlirExecutionEngineDestroy(jit); + mlirModuleDestroy(module); + mlirContextDestroy(ctx); +} + +int main(void) { + +#define STRINGIFY_IMPL(x) #x +#define STRINGIFY(x) STRINGIFY_IMPL(x) +#define TEST(test) \ + printf("Running test '" STRINGIFY(test) "'\n"); \ + test(); + TEST(testGlobalCtorJitCallback); + return 0; +} diff --git a/mlir/test/CMakeLists.txt b/mlir/test/CMakeLists.txt index 016d5e6f6e91..7736723ce2ae 100644 --- a/mlir/test/CMakeLists.txt +++ b/mlir/test/CMakeLists.txt @@ -141,6 +141,7 @@ if(LLVM_ENABLE_PIC AND TARGET ${LLVM_NATIVE_ARCH}) llc mlir_async_runtime mlir-capi-execution-engine-test + mlir-capi-global-constructors-test mlir_c_runner_utils mlir_runner_utils mlir_float16_utils diff --git a/mlir/test/Conversion/ArithToLLVM/type-conversion.mlir b/mlir/test/Conversion/ArithToLLVM/type-conversion.mlir new file mode 100644 index 000000000000..e3a0c82a628b --- /dev/null +++ b/mlir/test/Conversion/ArithToLLVM/type-conversion.mlir @@
-0,0 +1,15 @@ +// RUN: mlir-opt %s -test-llvm-legalize-patterns -split-input-file | FileCheck %s +// RUN: mlir-opt %s -test-llvm-legalize-patterns="allow-pattern-rollback=0" -split-input-file | FileCheck %s + +// CHECK-LABEL: llvm.func @arith_select( +// CHECK-SAME: %[[arg0:.*]]: i1, %[[arg1:.*]]: i18, %[[arg2:.*]]: i18, %[[arg3:.*]]: i18, %[[arg4:.*]]: i18) -> !llvm.struct<(i18, i18)> +// CHECK: %[[select0:.*]] = llvm.select %[[arg0]], %[[arg1]], %[[arg3]] : i1, i18 +// CHECK: %[[select1:.*]] = llvm.select %[[arg0]], %[[arg2]], %[[arg4]] : i1, i18 +// CHECK: %[[i0:.*]] = llvm.mlir.poison : !llvm.struct<(i18, i18)> +// CHECK: %[[i1:.*]] = llvm.insertvalue %[[select0]], %[[i0]][0] : !llvm.struct<(i18, i18)> +// CHECK: %[[i2:.*]] = llvm.insertvalue %[[select1]], %[[i1]][1] : !llvm.struct<(i18, i18)> +// CHECK: llvm.return %[[i2]] +func.func @arith_select(%arg0: i1, %arg1: i17, %arg2: i17) -> (i17) { + %0 = arith.select %arg0, %arg1, %arg2 : i17 + return %0 : i17 +} diff --git a/mlir/test/Conversion/MemRefToLLVM/type-conversion.mlir b/mlir/test/Conversion/MemRefToLLVM/type-conversion.mlir index 0288aa11313c..6c6756f5097b 100644 --- a/mlir/test/Conversion/MemRefToLLVM/type-conversion.mlir +++ b/mlir/test/Conversion/MemRefToLLVM/type-conversion.mlir @@ -1,12 +1,13 @@ -// RUN: mlir-opt %s -test-llvm-legalize-patterns -split-input-file +// RUN: mlir-opt %s -test-llvm-legalize-patterns -split-input-file | FileCheck %s +// RUN: mlir-opt %s -test-llvm-legalize-patterns="allow-pattern-rollback=0" -split-input-file | FileCheck %s // Test the argument materializer for ranked MemRef types. 
// CHECK-LABEL: func @construct_ranked_memref_descriptor( -// CHECK: llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> +// CHECK: llvm.mlir.poison : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> // CHECK-COUNT-7: llvm.insertvalue // CHECK: builtin.unrealized_conversion_cast %{{.*}} : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> to memref<5x4xf32> -func.func @construct_ranked_memref_descriptor(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64) { +func.func @construct_ranked_memref_descriptor(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64) attributes {is_legal} { %0 = "test.direct_replacement"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6) : (!llvm.ptr, !llvm.ptr, i64, i64, i64, i64, i64) -> (memref<5x4xf32>) "test.legal_op"(%0) : (memref<5x4xf32>) -> () return @@ -21,7 +22,7 @@ func.func @construct_ranked_memref_descriptor(%arg0: !llvm.ptr, %arg1: !llvm.ptr // CHECK-LABEL: func @invalid_ranked_memref_descriptor( // CHECK: %[[cast:.*]] = builtin.unrealized_conversion_cast %{{.*}} : i1 to memref<5x4xf32> // CHECK: "test.legal_op"(%[[cast]]) -func.func @invalid_ranked_memref_descriptor(%arg0: i1) { +func.func @invalid_ranked_memref_descriptor(%arg0: i1) attributes {is_legal} { %0 = "test.direct_replacement"(%arg0) : (i1) -> (memref<5x4xf32>) "test.legal_op"(%0) : (memref<5x4xf32>) -> () return @@ -32,10 +33,10 @@ func.func @invalid_ranked_memref_descriptor(%arg0: i1) { // Test the argument materializer for unranked MemRef types. 
// CHECK-LABEL: func @construct_unranked_memref_descriptor( -// CHECK: llvm.mlir.undef : !llvm.struct<(i64, ptr)> +// CHECK: llvm.mlir.poison : !llvm.struct<(i64, ptr)> // CHECK-COUNT-2: llvm.insertvalue // CHECK: builtin.unrealized_conversion_cast %{{.*}} : !llvm.struct<(i64, ptr)> to memref<*xf32> -func.func @construct_unranked_memref_descriptor(%arg0: i64, %arg1: !llvm.ptr) { +func.func @construct_unranked_memref_descriptor(%arg0: i64, %arg1: !llvm.ptr) attributes {is_legal} { %0 = "test.direct_replacement"(%arg0, %arg1) : (i64, !llvm.ptr) -> (memref<*xf32>) "test.legal_op"(%0) : (memref<*xf32>) -> () return @@ -50,8 +51,107 @@ func.func @construct_unranked_memref_descriptor(%arg0: i64, %arg1: !llvm.ptr) { // CHECK-LABEL: func @invalid_unranked_memref_descriptor( // CHECK: %[[cast:.*]] = builtin.unrealized_conversion_cast %{{.*}} : i1 to memref<*xf32> // CHECK: "test.legal_op"(%[[cast]]) -func.func @invalid_unranked_memref_descriptor(%arg0: i1) { +func.func @invalid_unranked_memref_descriptor(%arg0: i1) attributes {is_legal} { %0 = "test.direct_replacement"(%arg0) : (i1) -> (memref<*xf32>) "test.legal_op"(%0) : (memref<*xf32>) -> () return } + +// ----- + +// CHECK-LABEL: llvm.func @simple_func_conversion( +// CHECK-SAME: %[[arg0:.*]]: i64) -> i64 +// CHECK: llvm.return %[[arg0]] : i64 +func.func @simple_func_conversion(%arg0: i64) -> i64 { + return %arg0 : i64 +} + +// ----- + +// CHECK-LABEL: llvm.func @one_to_n_argument_conversion( +// CHECK-SAME: %[[arg0:.*]]: i18, %[[arg1:.*]]: i18) +// CHECK: %[[cast:.*]] = builtin.unrealized_conversion_cast %[[arg0]], %[[arg1]] : i18, i18 to i17 +// CHECK: "test.legal_op"(%[[cast]]) : (i17) -> () +func.func @one_to_n_argument_conversion(%arg0: i17) { + "test.legal_op"(%arg0) : (i17) -> () + return +} + +// CHECK: llvm.func @caller(%[[arg0:.*]]: i18, %[[arg1:.*]]: i18) +// CHECK: llvm.call @one_to_n_argument_conversion(%[[arg0]], %[[arg1]]) : (i18, i18) -> () +func.func @caller(%arg0: i17) { + func.call 
@one_to_n_argument_conversion(%arg0) : (i17) -> () + return +} + +// ----- + +// CHECK-LABEL: llvm.func @one_to_n_return_conversion( +// CHECK-SAME: %[[arg0:.*]]: i18, %[[arg1:.*]]: i18) -> !llvm.struct<(i18, i18)> +// CHECK: %[[p1:.*]] = llvm.mlir.poison : !llvm.struct<(i18, i18)> +// CHECK: %[[p2:.*]] = llvm.insertvalue %[[arg0]], %[[p1]][0] : !llvm.struct<(i18, i18)> +// CHECK: %[[p3:.*]] = llvm.insertvalue %[[arg1]], %[[p2]][1] : !llvm.struct<(i18, i18)> +// CHECK: llvm.return %[[p3]] +func.func @one_to_n_return_conversion(%arg0: i17) -> i17 { + return %arg0 : i17 +} + +// CHECK: llvm.func @caller(%[[arg0:.*]]: i18, %[[arg1:.*]]: i18) +// CHECK: %[[res:.*]] = llvm.call @one_to_n_return_conversion(%[[arg0]], %[[arg1]]) : (i18, i18) -> !llvm.struct<(i18, i18)> +// CHECK: %[[e0:.*]] = llvm.extractvalue %[[res]][0] : !llvm.struct<(i18, i18)> +// CHECK: %[[e1:.*]] = llvm.extractvalue %[[res]][1] : !llvm.struct<(i18, i18)> +// CHECK: %[[i0:.*]] = llvm.mlir.poison : !llvm.struct<(i18, i18)> +// CHECK: %[[i1:.*]] = llvm.insertvalue %[[e0]], %[[i0]][0] : !llvm.struct<(i18, i18)> +// CHECK: %[[i2:.*]] = llvm.insertvalue %[[e1]], %[[i1]][1] : !llvm.struct<(i18, i18)> +// CHECK: llvm.return %[[i2]] +func.func @caller(%arg0: i17) -> (i17) { + %res = func.call @one_to_n_return_conversion(%arg0) : (i17) -> (i17) + return %res : i17 +} + +// ----- + +// CHECK-LABEL: llvm.func @multi_return( +// CHECK-SAME: %[[arg0:.*]]: i18, %[[arg1:.*]]: i18, %[[arg2:.*]]: i1) -> !llvm.struct<(i18, i18, i1)> +// CHECK: %[[p1:.*]] = llvm.mlir.poison : !llvm.struct<(i18, i18, i1)> +// CHECK: %[[p2:.*]] = llvm.insertvalue %[[arg0]], %[[p1]][0] : !llvm.struct<(i18, i18, i1)> +// CHECK: %[[p3:.*]] = llvm.insertvalue %[[arg1]], %[[p2]][1] : !llvm.struct<(i18, i18, i1)> +// CHECK: %[[p4:.*]] = llvm.insertvalue %[[arg2]], %[[p3]][2] : !llvm.struct<(i18, i18, i1)> +// CHECK: llvm.return %[[p4]] +func.func @multi_return(%arg0: i17, %arg1: i1) -> (i17, i1) { + return %arg0, %arg1 : i17, i1 +} + +// 
CHECK: llvm.func @caller(%[[arg0:.*]]: i1, %[[arg1:.*]]: i18, %[[arg2:.*]]: i18) +// CHECK: %[[res:.*]] = llvm.call @multi_return(%[[arg1]], %[[arg2]], %[[arg0]]) : (i18, i18, i1) -> !llvm.struct<(i18, i18, i1)> +// CHECK: %[[e0:.*]] = llvm.extractvalue %[[res]][0] : !llvm.struct<(i18, i18, i1)> +// CHECK: %[[e1:.*]] = llvm.extractvalue %[[res]][1] : !llvm.struct<(i18, i18, i1)> +// CHECK: %[[e2:.*]] = llvm.extractvalue %[[res]][2] : !llvm.struct<(i18, i18, i1)> +// CHECK: %[[i0:.*]] = llvm.mlir.poison : !llvm.struct<(i18, i18, i1, i18, i18)> +// CHECK: %[[i1:.*]] = llvm.insertvalue %[[e0]], %[[i0]][0] +// CHECK: %[[i2:.*]] = llvm.insertvalue %[[e1]], %[[i1]][1] +// CHECK: %[[i3:.*]] = llvm.insertvalue %[[e2]], %[[i2]][2] +// CHECK: %[[i4:.*]] = llvm.insertvalue %[[e0]], %[[i3]][3] +// CHECK: %[[i5:.*]] = llvm.insertvalue %[[e1]], %[[i4]][4] +// CHECK: llvm.return %[[i5]] +func.func @caller(%arg0: i1, %arg1: i17) -> (i17, i1, i17) { + %res:2 = func.call @multi_return(%arg1, %arg0) : (i17, i1) -> (i17, i1) + return %res#0, %res#1, %res#0 : i17, i1, i17 +} + +// ----- + +// CHECK-LABEL: llvm.func @branch( +// CHECK-SAME: %[[arg0:.*]]: i1, %[[arg1:.*]]: i18, %[[arg2:.*]]: i18) +// CHECK: llvm.br ^[[bb1:.*]](%[[arg1]], %[[arg2]], %[[arg0]] : i18, i18, i1) +// CHECK: ^[[bb1]](%[[arg3:.*]]: i18, %[[arg4:.*]]: i18, %[[arg5:.*]]: i1): +// CHECK: llvm.cond_br %[[arg5]], ^[[bb1]](%[[arg1]], %[[arg2]], %[[arg5]] : i18, i18, i1), ^[[bb2:.*]](%[[arg3]], %[[arg4]] : i18, i18) +// CHECK: ^bb2(%{{.*}}: i18, %{{.*}}: i18): +// CHECK: llvm.return +func.func @branch(%arg0: i1, %arg1: i17) { + cf.br ^bb1(%arg1, %arg0: i17, i1) +^bb1(%arg2: i17, %arg3: i1): + cf.cond_br %arg3, ^bb1(%arg1, %arg3 : i17, i1), ^bb2(%arg2 : i17) +^bb2(%arg4: i17): + return +} diff --git a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir index 24873340d712..b38347c7cd1b 100644 --- a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir +++ 
b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir @@ -683,6 +683,18 @@ llvm.func @ex2(%input : f32, %pred : i1) { llvm.return } +// CHECK-LABEL: @multi_return( +// CHECK-SAME: %[[arg0:[a-zA-Z0-9_]+]]: i32, %[[arg1:[a-zA-Z0-9_]+]]: i32) +llvm.func @multi_return(%a : i32, %b : i32) -> i32 { + // CHECK: %[[S1:.+]] = llvm.inline_asm has_side_effects asm_dialect = att "{\0A\09 .reg .pred p;\0A\09 setp.ge.s32 p, $2, $3;\0A\09 selp.s32 $0, $2, $3, p;\0A\09 selp.s32 $1, $2, $3, !p;\0A\09}\0A", "=r,=r,r,r" %[[arg0]], %[[arg1]] : (i32, i32) -> !llvm.struct<(i32, i32)> + // CHECK: %[[S2:.+]] = llvm.extractvalue %[[S1]][0] : !llvm.struct<(i32, i32)> + // CHECK: %[[S3:.+]] = llvm.extractvalue %[[S1]][1] : !llvm.struct<(i32, i32)> + // CHECK: %[[S4:.+]] = llvm.add %[[S2]], %[[S3]] : i32 + // CHECK: llvm.return %[[S4]] : i32 + %r1, %r2 = nvvm.inline_ptx "{\n\t .reg .pred p;\n\t setp.ge.s32 p, $2, $3;\n\t selp.s32 $0, $2, $3, p;\n\t selp.s32 $1, $2, $3, !p;\n\t}\n" (%a, %b) : i32,i32 -> i32,i32 + %r3 = llvm.add %r1, %r2 : i32 + llvm.return %r3 : i32 +} // ----- // CHECK-LABEL: @nvvm_pmevent diff --git a/mlir/test/Dialect/Arith/canonicalize.mlir b/mlir/test/Dialect/Arith/canonicalize.mlir index 78f67821da13..ca3de3a2d770 100644 --- a/mlir/test/Dialect/Arith/canonicalize.mlir +++ b/mlir/test/Dialect/Arith/canonicalize.mlir @@ -3363,3 +3363,18 @@ func.func @bf16_fma(%arg0: vector<32x32x32xbf16>, %arg1: vector<32x32x32xbf16>, } } #-} + +// CHECK-LABEL: func @unreachable() +// CHECK-NEXT: return +// CHECK-NOT: arith +func.func @unreachable() { + return +^unreachable: + %c1_i64 = arith.constant 1 : i64 + // This self referencing operation is legal in an unreachable block. + // Many patterns are unsafe with respect to this kind of situation, + // check that we don't infinite loop here. 
+ %add = arith.addi %add, %c1_i64 : i64 + cf.br ^unreachable +} + diff --git a/mlir/test/Dialect/Arith/int-range-interface.mlir b/mlir/test/Dialect/Arith/int-range-interface.mlir index 2128d36f1a28..130782ba9f52 100644 --- a/mlir/test/Dialect/Arith/int-range-interface.mlir +++ b/mlir/test/Dialect/Arith/int-range-interface.mlir @@ -224,6 +224,15 @@ func.func @ceil_divui(%arg0 : index) -> i1 { func.return %7 : i1 } +// CHECK-LABEL: func @ceil_divui_by_zero_issue_131273 +// CHECK-NEXT: return +func.func @ceil_divui_by_zero_issue_131273() { + %0 = test.with_bounds {smax = 0 : i32, smin = -1 : i32, umax = 0 : i32, umin = -1 : i32} : i32 + %c7_i32 = arith.constant 7 : i32 + %1 = arith.ceildivui %c7_i32, %0 : i32 + return +} + // CHECK-LABEL: func @ceil_divsi // CHECK: %[[ret:.*]] = arith.cmpi eq // CHECK: return %[[ret]] diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir index 5b251517d2ef..93a5a055b08c 100644 --- a/mlir/test/Dialect/XeGPU/invalid.mlir +++ b/mlir/test/Dialect/XeGPU/invalid.mlir @@ -762,3 +762,89 @@ func.func @slice_attr_repeat_dim() { return } +// ----- +func.func @create_mem_desc_non_slm() { + %m = memref.alloca() {alignment = 1024} : memref<2048xi8, 1> + // expected-error@+1 {{operand #0 must be statically shaped memref of 8-bit signless integer values for shared memory}} + %mem_desc = xegpu.create_mem_desc %m : memref<2048xi8, 1> -> !xegpu.mem_desc<16x64xf16> + return +} + +// ----- +func.func @create_mem_desc_mismatch_sizes() { + %m = memref.alloca() {alignment = 1024} : memref<2048xi8, 3> + // expected-error@+1 {{failed to verify that all of {source, mem_desc} have same size in bits}} + %mem_desc = xegpu.create_mem_desc %m : memref<2048xi8, 3> -> !xegpu.mem_desc<16x32xf16> + return +} + +// ----- +func.func @load_mem_desc_mismatch_element_type(%arg0: !xegpu.mem_desc<16x64xf16>) { + // expected-error@+1 {{failed to verify that all of {mem_desc, res} have same element type}} + %data = xegpu.load_matrix %arg0[8, 
8]: !xegpu.mem_desc<16x64xf16> -> vector<8x16xf32> + return +} + +// ----- +func.func @load_mem_desc_invalid_result_size(%arg0: !xegpu.mem_desc<16x64xf16>) { + // expected-error@+1 {{result shape must not exceed mem_desc shape}} + %data = xegpu.load_matrix %arg0[8, 8]: !xegpu.mem_desc<16x64xf16> -> vector<32x16xf16> + return +} + +// ----- +func.func @load_mem_desc_invalid_rank(%arg0: !xegpu.mem_desc<64xf16>) { + // expected-error@+1 {{mem_desc must be 2D}} + %data = xegpu.load_matrix %arg0[16]: !xegpu.mem_desc<64xf16> -> vector<16xf16> + return +} + +// ----- +func.func @store_mem_desc_mismatch_element_type(%arg0: !xegpu.mem_desc<16x64xf16>, %arg1: vector<16x16xf32>) { + // expected-error@+1 {{failed to verify that all of {mem_desc, data} have same element type}} + xegpu.store_matrix %arg1, %arg0[8, 8] : vector<16x16xf32>, !xegpu.mem_desc<16x64xf16> + return +} + +// ----- +func.func @store_mem_desc_invalid_data_size(%arg0: !xegpu.mem_desc<16x64xf16>, %arg1: vector<32x32xf16>) { + // expected-error@+1 {{data shape must not exceed mem_desc shape}} + xegpu.store_matrix %arg1, %arg0[8, 8] : vector<32x32xf16>, !xegpu.mem_desc<16x64xf16> + return +} + +// ----- +func.func @store_mem_desc_invalid_rank(%arg0: !xegpu.mem_desc<64xf16>, %arg1: vector<32xf16>) { + // expected-error@+1 {{mem_desc must be 2D.}} + xegpu.store_matrix %arg1, %arg0[32] : vector<32xf16>, !xegpu.mem_desc<64xf16> + return +} + +// ----- +func.func @mem_desc_subview_size_mismatch(%arg0: !xegpu.mem_desc<16x64xf16>) { + // expected-error@+1 {{result shape must not exceed source shape}} + %data = xegpu.mem_desc_subview %arg0[8, 8]: !xegpu.mem_desc<16x64xf16> -> !xegpu.mem_desc<32x16xf16> + return +} + +// ----- +func.func @mem_desc_subview_layout_mismatch(%arg0: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout>) { + // expected-error@+1 {{result must inherit the source strides}} + %data = xegpu.mem_desc_subview %arg0[8, 8]: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout> -> !xegpu.mem_desc<8x16xf16> + return 
+} + +// ----- +func.func @mem_desc_subview_element_type_mismatch(%arg0: !xegpu.mem_desc<16x64xf16>) { + // expected-error@+1 {{failed to verify that all of {src, res} have same element type}} + %data = xegpu.mem_desc_subview %arg0[8, 8]: !xegpu.mem_desc<16x64xf16> -> !xegpu.mem_desc<8x16xf32, #xegpu.mem_layout> + return +} + +// ----- +func.func @mem_desc_subview_rank_mismatch(%arg0: !xegpu.mem_desc<16x64xf16>) { + // expected-error@+1 {{result rank must not exceed source rank}} + %data = xegpu.mem_desc_subview %arg0[8, 8]: !xegpu.mem_desc<16x64xf16> -> !xegpu.mem_desc<4x8x16xf16> + return +} + diff --git a/mlir/test/Dialect/XeGPU/ops.mlir b/mlir/test/Dialect/XeGPU/ops.mlir index 67c00f5a9cc2..35342eca1354 100644 --- a/mlir/test/Dialect/XeGPU/ops.mlir +++ b/mlir/test/Dialect/XeGPU/ops.mlir @@ -751,4 +751,72 @@ gpu.func @fence() { gpu.return } +// CHECK-LABEL: gpu.func @create_mem_desc({{.*}}) { +gpu.func @create_mem_desc() { + //CHECK: [[alloc:%.+]] = memref.alloca() {alignment = 1024 : i64} : memref<2048xi8, 3> + //CHECK: [[mdesc:%.+]] = xegpu.create_mem_desc [[alloc]] : memref<2048xi8, 3> -> !xegpu.mem_desc<16x64xf16> + %m = memref.alloca() {alignment = 1024} : memref<2048xi8, 3> + %mem_desc = xegpu.create_mem_desc %m : memref<2048xi8, 3> -> !xegpu.mem_desc<16x64xf16> + gpu.return +} + +// CHECK-LABEL: gpu.func @create_mem_desc_with_stride({{.*}}) { +gpu.func @create_mem_desc_with_stride() { + //CHECK: [[alloc:%.+]] = memref.alloca() {alignment = 1024 : i64} : memref<2048xi8, 3> + //CHECK: [[mdesc:%.+]] = xegpu.create_mem_desc [[alloc]] : memref<2048xi8, 3> -> !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout> + %m = memref.alloca() {alignment = 1024} : memref<2048xi8, 3> + %mem_desc = xegpu.create_mem_desc %m : memref<2048xi8, 3> -> !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout> + gpu.return +} + +// CHECK: gpu.func @load_mem_desc([[ARG0:%.+]]: !xegpu.mem_desc<16x64xf16>) +gpu.func @load_mem_desc(%arg0: !xegpu.mem_desc<16x64xf16>) { + // CHECK: xegpu.load_matrix 
[[ARG0]][8, 8] : !xegpu.mem_desc<16x64xf16> -> vector<8x16xf16> + %data = xegpu.load_matrix %arg0[8, 8]: !xegpu.mem_desc<16x64xf16> -> vector<8x16xf16> + gpu.return +} + +// CHECK: gpu.func @load_mem_desc_with_stride(%arg0: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout>) +gpu.func @load_mem_desc_with_stride(%arg0: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout>) { + // CHECK: xegpu.load_matrix [[ARG0]][8, 8] : !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout> -> vector<8x16xf16> + %data = xegpu.load_matrix %arg0[8, 8]: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout> -> vector<8x16xf16> + gpu.return +} + + +// CHECK: gpu.func @store_mem_desc([[ARG0:%.+]]: !xegpu.mem_desc<16x64xf16>, [[ARG1:%.+]]: vector<16x16xf16>) +gpu.func @store_mem_desc(%arg0: !xegpu.mem_desc<16x64xf16>, %arg1: vector<16x16xf16>) { + // CHECK: xegpu.store_matrix [[ARG1]], [[ARG0]][8, 8] : vector<16x16xf16>, !xegpu.mem_desc<16x64xf16> + xegpu.store_matrix %arg1, %arg0[8, 8]: vector<16x16xf16>, !xegpu.mem_desc<16x64xf16> + gpu.return +} + +// CHECK: gpu.func @store_mem_desc_with_stride([[ARG0:%.+]]: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout>, [[ARG1:%.+]]: vector<16x16xf16>) +gpu.func @store_mem_desc_with_stride(%arg0: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout>, %arg1: vector<16x16xf16>) { + // CHECK: xegpu.store_matrix [[ARG1]], [[ARG0]][0, 8] : vector<16x16xf16>, !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout> + xegpu.store_matrix %arg1, %arg0[0, 8]: vector<16x16xf16>, !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout> + gpu.return +} + +// CHECK: gpu.func @mem_desc_subview([[ARG0:%.+]]: !xegpu.mem_desc<16x64xf16>) +gpu.func @mem_desc_subview(%arg0: !xegpu.mem_desc<16x64xf16>) { + //CHECK: xegpu.mem_desc_subview [[ARG0]][8, 8] : !xegpu.mem_desc<16x64xf16> -> !xegpu.mem_desc<8x16xf16, #xegpu.mem_layout> + %data = xegpu.mem_desc_subview %arg0[8, 8]: !xegpu.mem_desc<16x64xf16> -> !xegpu.mem_desc<8x16xf16, #xegpu.mem_layout> + gpu.return +} + +// CHECK: gpu.func @mem_desc_subview_lower_rank([[ARG0:%.+]]: 
!xegpu.mem_desc<16x64xf16>) +gpu.func @mem_desc_subview_lower_rank(%arg0: !xegpu.mem_desc<16x64xf16>) { + //CHECK: xegpu.mem_desc_subview [[ARG0]][8, 8] : !xegpu.mem_desc<16x64xf16> -> !xegpu.mem_desc<16xf16, #xegpu.mem_layout> + %data = xegpu.mem_desc_subview %arg0[8, 8]: !xegpu.mem_desc<16x64xf16> -> !xegpu.mem_desc<16xf16, #xegpu.mem_layout> + gpu.return +} + +// CHECK: gpu.func @mem_desc_subview_with_stride([[ARG0:%.+]]: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout>) +gpu.func @mem_desc_subview_with_stride(%arg0: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout>) { + //CHECK: xegpu.mem_desc_subview [[ARG0]][8, 8] : !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout> -> !xegpu.mem_desc<8x16xf16, #xegpu.mem_layout> + %data = xegpu.mem_desc_subview %arg0[8, 8]: !xegpu.mem_desc<16x64xf16, #xegpu.mem_layout> -> !xegpu.mem_desc<8x16xf16, #xegpu.mem_layout> + gpu.return +} + } diff --git a/mlir/test/IR/test-walk-pattern-rewrite-driver.mlir b/mlir/test/IR/test-walk-pattern-rewrite-driver.mlir index 02f7e60671c9..c75c478ec373 100644 --- a/mlir/test/IR/test-walk-pattern-rewrite-driver.mlir +++ b/mlir/test/IR/test-walk-pattern-rewrite-driver.mlir @@ -40,12 +40,12 @@ func.func @move_before(%cond : i1) { } // Check that the driver handles rewriter.moveAfter. In this case, we expect -// the moved op to be visited only once since walk uses `make_early_inc_range`. +// the moved op to be visited twice. 
// CHECK-LABEL: func.func @move_after( // CHECK: scf.if // CHECK: } // CHECK: "test.move_after_parent_op" -// CHECK: "test.any_attr_of_i32_str"() <{attr = 1 : i32}> : () -> () +// CHECK: "test.any_attr_of_i32_str"() <{attr = 2 : i32}> : () -> () // CHECK: return func.func @move_after(%cond : i1) { scf.if %cond { diff --git a/mlir/test/Target/LLVMIR/ompenmp-target-allocmem-freemem.mlir b/mlir/test/Target/LLVMIR/ompenmp-target-allocmem-freemem.mlir new file mode 100644 index 000000000000..1bc97609ccff --- /dev/null +++ b/mlir/test/Target/LLVMIR/ompenmp-target-allocmem-freemem.mlir @@ -0,0 +1,42 @@ +// RUN: mlir-opt %s -convert-openmp-to-llvm | mlir-translate -mlir-to-llvmir | FileCheck %s + +// This file contains MLIR test cases for omp.target_allocmem and omp.target_freemem + +// CHECK-LABEL: test_alloc_free_i64 +// CHECK: %[[ALLOC:.*]] = call ptr @omp_target_alloc(i64 8, i32 0) +// CHECK: %[[PTRTOINT:.*]] = ptrtoint ptr %[[ALLOC]] to i64 +// CHECK: %[[INTTOPTR:.*]] = inttoptr i64 %[[PTRTOINT]] to ptr +// CHECK: call void @omp_target_free(ptr %[[INTTOPTR]], i32 0) +// CHECK: ret void +llvm.func @test_alloc_free_i64() -> () { + %device = llvm.mlir.constant(0 : i32) : i32 + %1 = omp.target_allocmem %device : i32, i64 + omp.target_freemem %device, %1 : i32, i64 + llvm.return +} + +// CHECK-LABEL: test_alloc_free_vector_1d_f32 +// CHECK: %[[ALLOC:.*]] = call ptr @omp_target_alloc(i64 64, i32 0) +// CHECK: %[[PTRTOINT:.*]] = ptrtoint ptr %[[ALLOC]] to i64 +// CHECK: %[[INTTOPTR:.*]] = inttoptr i64 %[[PTRTOINT]] to ptr +// CHECK: call void @omp_target_free(ptr %[[INTTOPTR]], i32 0) +// CHECK: ret void +llvm.func @test_alloc_free_vector_1d_f32() -> () { + %device = llvm.mlir.constant(0 : i32) : i32 + %1 = omp.target_allocmem %device : i32, vector<16xf32> + omp.target_freemem %device, %1 : i32, i64 + llvm.return +} + +// CHECK-LABEL: test_alloc_free_vector_2d_f32 +// CHECK: %[[ALLOC:.*]] = call ptr @omp_target_alloc(i64 1024, i32 0) +// CHECK: %[[PTRTOINT:.*]] = ptrtoint 
ptr %[[ALLOC]] to i64 +// CHECK: %[[INTTOPTR:.*]] = inttoptr i64 %[[PTRTOINT]] to ptr +// CHECK: call void @omp_target_free(ptr %[[INTTOPTR]], i32 0) +// CHECK: ret void +llvm.func @test_alloc_free_vector_2d_f32() -> () { + %device = llvm.mlir.constant(0 : i32) : i32 + %1 = omp.target_allocmem %device : i32, vector<16x16xf32> + omp.target_freemem %device, %1 : i32, i64 + llvm.return +} diff --git a/mlir/test/Transforms/inlining.mlir b/mlir/test/Transforms/inlining.mlir index 1ed08878430b..d8e10aa4212b 100644 --- a/mlir/test/Transforms/inlining.mlir +++ b/mlir/test/Transforms/inlining.mlir @@ -5,14 +5,18 @@ // RUN: mlir-opt %s -inline='op-pipelines=func.func(canonicalize,cse)' | FileCheck %s --check-prefix INLINE_SIMPLIFY // Inline a function that takes an argument. -func.func @func_with_arg(%c : i32) -> i32 { - %b = arith.addi %c, %c : i32 - return %b : i32 +func.func @func_with_arg(%arg0 : i32) -> i32 { + %b = arith.addi %arg0, %arg0 : i32 + %c = builtin.unrealized_conversion_cast %b : i32 to i64 + %d = builtin.unrealized_conversion_cast %c : i64 to i32 + return %d : i32 } // CHECK-LABEL: func @inline_with_arg func.func @inline_with_arg(%arg0 : i32) -> i32 { // CHECK-NEXT: arith.addi + // CHECK-NEXT: unrealized_conversion_cast + // CHECK-NEXT: unrealized_conversion_cast // CHECK-NEXT: return %0 = call @func_with_arg(%arg0) : (i32) -> i32 diff --git a/mlir/test/Transforms/test-canonicalize.mlir b/mlir/test/Transforms/test-canonicalize.mlir index 0fc822b0a23a..8cad6b98441d 100644 --- a/mlir/test/Transforms/test-canonicalize.mlir +++ b/mlir/test/Transforms/test-canonicalize.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -pass-pipeline='builtin.module(func.func(canonicalize))' | FileCheck %s +// RUN: mlir-opt %s -pass-pipeline='builtin.module(func.func(canonicalize))' | FileCheck %s --check-prefixes=CHECK,RS // RUN: mlir-opt %s -pass-pipeline='builtin.module(func.func(canonicalize{region-simplify=disabled}))' | FileCheck %s --check-prefixes=CHECK,NO-RS // CHECK-LABEL: func 
@remove_op_with_inner_ops_pattern @@ -80,12 +80,10 @@ func.func @test_dialect_canonicalizer() -> (i32) { // Check that the option to control region simplification actually works // CHECK-LABEL: test_region_simplify -func.func @test_region_simplify() { - // CHECK-NEXT: return - // NO-RS-NEXT: ^bb1 - // NO-RS-NEXT: return - // CHECK-NEXT: } - return -^bb1: - return +func.func @test_region_simplify(%input1 : i32, %cond : i1) -> i32 { + // RS-NEXT: "test.br"(%arg0)[^bb1] : (i32) -> () + // NO-RS-NEXT: "test.br"(%arg0, %arg0)[^bb1] : (i32, i32) -> () + "test.br"(%input1, %input1)[^bb1] : (i32, i32) -> () +^bb1(%used_arg : i32, %unused_arg : i32): + return %used_arg : i32 } diff --git a/mlir/test/Transforms/test-legalizer.mlir b/mlir/test/Transforms/test-legalizer.mlir index 55d153db7f4b..3fa42ff6b275 100644 --- a/mlir/test/Transforms/test-legalizer.mlir +++ b/mlir/test/Transforms/test-legalizer.mlir @@ -1,6 +1,7 @@ // RUN: mlir-opt -allow-unregistered-dialect -split-input-file -test-legalize-patterns="allow-pattern-rollback=1" -verify-diagnostics %s | FileCheck %s // RUN: mlir-opt -allow-unregistered-dialect -split-input-file -test-legalize-patterns="allow-pattern-rollback=1" -verify-diagnostics -profile-actions-to=- %s | FileCheck %s --check-prefix=CHECK-PROFILER // RUN: mlir-opt -allow-unregistered-dialect -split-input-file -test-legalize-patterns="allow-pattern-rollback=0" -verify-diagnostics %s | FileCheck %s +// RUN: mlir-opt -allow-unregistered-dialect -split-input-file -test-legalize-patterns="allow-pattern-rollback=0 build-materializations=0 attach-debug-materialization-kind=1" -verify-diagnostics %s | FileCheck %s --check-prefix=CHECK-KIND // CHECK-PROFILER: "name": "pass-execution", "cat": "PERF", "ph": "B" // CHECK-PROFILER: "name": "apply-conversion", "cat": "PERF", "ph": "B" @@ -190,9 +191,12 @@ func.func @remap_drop_region() { // ----- // CHECK-LABEL: func @dropped_input_in_use +// CHECK-KIND-LABEL: func @dropped_input_in_use func.func 
@dropped_input_in_use(%arg: i16, %arg2: i64) { - // CHECK-NEXT: "test.cast"{{.*}} : () -> i16 - // CHECK-NEXT: "work"{{.*}} : (i16) + // CHECK-NEXT: %[[cast:.*]] = "test.cast"() : () -> i16 + // CHECK-NEXT: "work"(%[[cast]]) : (i16) + // CHECK-KIND-NEXT: %[[cast:.*]] = builtin.unrealized_conversion_cast to i16 {__kind__ = "source"} + // CHECK-KIND-NEXT: "work"(%[[cast]]) : (i16) // expected-remark@+1 {{op 'work' is not legalizable}} "work"(%arg) : (i16) -> () } @@ -430,6 +434,11 @@ func.func @test_multiple_1_to_n_replacement() { // CHECK: %[[cast:.*]] = "test.cast"(%[[producer]]) : (i16) -> f64 // CHECK: "test.valid_consumer"(%[[cast]]) : (f64) -> () // CHECK: "test.valid_consumer"(%[[producer]]) : (i16) -> () +// CHECK-KIND-LABEL: func @test_lookup_without_converter +// CHECK-KIND: %[[producer:.*]] = "test.valid_producer"() : () -> i16 +// CHECK-KIND: %[[cast:.*]] = builtin.unrealized_conversion_cast %[[producer]] : i16 to f64 {__kind__ = "target"} +// CHECK-KIND: "test.valid_consumer"(%[[cast]]) : (f64) -> () +// CHECK-KIND: "test.valid_consumer"(%[[producer]]) : (i16) -> () func.func @test_lookup_without_converter() { %0 = "test.replace_with_valid_producer"() {type = i16} : () -> (i64) "test.replace_with_valid_consumer"(%0) {with_converter} : (i64) -> () diff --git a/mlir/test/lib/Dialect/LLVM/TestPatterns.cpp b/mlir/test/lib/Dialect/LLVM/TestPatterns.cpp index ab02866970b1..69a3d98bc09e 100644 --- a/mlir/test/lib/Dialect/LLVM/TestPatterns.cpp +++ b/mlir/test/lib/Dialect/LLVM/TestPatterns.cpp @@ -6,7 +6,11 @@ // //===----------------------------------------------------------------------===// +#include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h" +#include "mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h" +#include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVM.h" #include "mlir/Conversion/LLVMCommon/TypeConverter.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/LLVMIR/LLVMTypes.h" #include 
"mlir/Pass/Pass.h" @@ -34,6 +38,10 @@ struct TestLLVMLegalizePatternsPass : public PassWrapper> { MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestLLVMLegalizePatternsPass) + TestLLVMLegalizePatternsPass() = default; + TestLLVMLegalizePatternsPass(const TestLLVMLegalizePatternsPass &other) + : PassWrapper(other) {} + StringRef getArgument() const final { return "test-llvm-legalize-patterns"; } StringRef getDescription() const final { return "Run LLVM dialect legalization patterns"; @@ -45,22 +53,48 @@ struct TestLLVMLegalizePatternsPass void runOnOperation() override { MLIRContext *ctx = &getContext(); + + // Set up type converter. LLVMTypeConverter converter(ctx); + converter.addConversion( + [&](IntegerType type, SmallVectorImpl &result) { + if (type.isInteger(17)) { + // Convert i17 -> (i18, i18). + result.append(2, Builder(ctx).getIntegerType(18)); + return success(); + } + + result.push_back(type); + return success(); + }); + + // Populate patterns. mlir::RewritePatternSet patterns(ctx); patterns.add(ctx, converter); + arith::populateArithToLLVMConversionPatterns(converter, patterns); + populateFuncToLLVMConversionPatterns(converter, patterns); + cf::populateControlFlowToLLVMConversionPatterns(converter, patterns); // Define the conversion target used for the test. ConversionTarget target(*ctx); target.addLegalOp(OperationName("test.legal_op", ctx)); + target.addLegalDialect(); + target.addDynamicallyLegalOp( + [&](func::FuncOp funcOp) { return funcOp->hasAttr("is_legal"); }); // Handle a partial conversion. 
DenseSet unlegalizedOps; ConversionConfig config; config.unlegalizedOps = &unlegalizedOps; + config.allowPatternRollback = allowPatternRollback; if (failed(applyPartialConversion(getOperation(), target, std::move(patterns), config))) getOperation()->emitError() << "applyPartialConversion failed"; } + + Option allowPatternRollback{*this, "allow-pattern-rollback", + llvm::cl::desc("Allow pattern rollback"), + llvm::cl::init(true)}; }; } // namespace diff --git a/mlir/test/lib/Dialect/Test/TestPatterns.cpp b/mlir/test/lib/Dialect/Test/TestPatterns.cpp index 6300c5b0ca21..b6f16ac1b5c4 100644 --- a/mlir/test/lib/Dialect/Test/TestPatterns.cpp +++ b/mlir/test/lib/Dialect/Test/TestPatterns.cpp @@ -1574,15 +1574,19 @@ struct TestLegalizePatternDriver target.addDynamicallyLegalOp( [](ConvertBlockArgsOp op) { return op.getIsLegal(); }); + // Set up configuration. + ConversionConfig config; + config.allowPatternRollback = allowPatternRollback; + config.foldingMode = foldingMode; + config.buildMaterializations = buildMaterializations; + config.attachDebugMaterializationKind = attachDebugMaterializationKind; + DumpNotifications dumpNotifications; + config.listener = &dumpNotifications; + // Handle a partial conversion. 
if (mode == ConversionMode::Partial) { DenseSet unlegalizedOps; - ConversionConfig config; - config.allowPatternRollback = allowPatternRollback; - DumpNotifications dumpNotifications; - config.listener = &dumpNotifications; config.unlegalizedOps = &unlegalizedOps; - config.foldingMode = foldingMode; if (failed(applyPartialConversion(getOperation(), target, std::move(patterns), config))) { getOperation()->emitRemark() << "applyPartialConversion failed"; @@ -1600,11 +1604,6 @@ struct TestLegalizePatternDriver return (bool)op->getAttrOfType("test.dynamically_legal"); }); - ConversionConfig config; - config.allowPatternRollback = allowPatternRollback; - DumpNotifications dumpNotifications; - config.foldingMode = foldingMode; - config.listener = &dumpNotifications; if (failed(applyFullConversion(getOperation(), target, std::move(patterns), config))) { getOperation()->emitRemark() << "applyFullConversion failed"; @@ -1617,9 +1616,6 @@ struct TestLegalizePatternDriver // Analyze the convertible operations. 
DenseSet legalizedOps; - ConversionConfig config; - config.foldingMode = foldingMode; - config.allowPatternRollback = allowPatternRollback; config.legalizableOps = &legalizedOps; if (failed(applyAnalysisConversion(getOperation(), target, std::move(patterns), config))) @@ -1658,6 +1654,16 @@ struct TestLegalizePatternDriver Option allowPatternRollback{*this, "allow-pattern-rollback", llvm::cl::desc("Allow pattern rollback"), llvm::cl::init(true)}; + Option attachDebugMaterializationKind{ + *this, "attach-debug-materialization-kind", + llvm::cl::desc( + "Attach materialization kind to unrealized_conversion_cast ops"), + llvm::cl::init(false)}; + Option buildMaterializations{ + *this, "build-materializations", + llvm::cl::desc( + "If set to 'false', leave unrealized_conversion_cast ops in place"), + llvm::cl::init(true)}; }; } // namespace diff --git a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp index 3bea8efcdb0a..58962714b786 100644 --- a/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp +++ b/mlir/test/lib/Dialect/XeGPU/TestXeGPUTransforms.cpp @@ -228,7 +228,7 @@ struct TestXeGPULayoutInterface auto materializeCast = [&](mlir::OpBuilder &builder, mlir::Type type, mlir::ValueRange inputs, mlir::Location loc) -> mlir::Value { - return builder.create(loc, type, inputs) + return UnrealizedConversionCastOp::create(builder, loc, type, inputs) .getResult(0); }; typeConverter.addSourceMaterialization(materializeCast); diff --git a/mlir/test/lit.cfg.py b/mlir/test/lit.cfg.py index ba7eeeed8ef3..5e9347d784b3 100644 --- a/mlir/test/lit.cfg.py +++ b/mlir/test/lit.cfg.py @@ -190,6 +190,7 @@ tools = [ "mlir-translate", "mlir-lsp-server", "mlir-capi-execution-engine-test", + "mlir-capi-global-constructors-test", "mlir-capi-ir-test", "mlir-capi-irdl-test", "mlir-capi-llvm-test", diff --git a/mlir/test/python/global_constructors.py b/mlir/test/python/global_constructors.py new file mode 100644 index 
000000000000..5020c00344a3 --- /dev/null +++ b/mlir/test/python/global_constructors.py @@ -0,0 +1,72 @@ +# UNSUPPORTED: target=aarch64{{.*}}, target=arm64{{.*}} +# RUN: %PYTHON %s 2>&1 | FileCheck %s +# REQUIRES: host-supports-jit +import gc, sys, os, tempfile +from mlir.ir import * +from mlir.passmanager import * +from mlir.execution_engine import * +from mlir.runtime import * + + +# Log everything to stderr and flush so that we have a unified stream to match +# errors/info emitted by MLIR to stderr. +def log(*args): + print(*args, file=sys.stderr) + sys.stderr.flush() + + +def run(f): + log("\nTEST:", f.__name__) + f() + gc.collect() + assert Context._get_live_count() == 0 + + +def lowerToLLVM(module): + pm = PassManager.parse( + "builtin.module(convert-func-to-llvm,reconcile-unrealized-casts)" + ) + pm.run(module.operation) + return module + + +# Test JIT callback in global constructor +# CHECK-LABEL: TEST: testJITCallbackInGlobalCtor +def testJITCallbackInGlobalCtor(): + init_cnt = 0 + + @ctypes.CFUNCTYPE(None) + def initCallback(): + nonlocal init_cnt + init_cnt += 1 + + with Context(): + module = Module.parse( + r""" +llvm.mlir.global_ctors ctors = [@ctor], priorities = [0 : i32], data = [#llvm.zero] +llvm.func @ctor() { + func.call @init_callback() : () -> () + llvm.return +} +func.func private @init_callback() attributes { llvm.emit_c_interface } + """ + ) + + # Setup execution engine + execution_engine = ExecutionEngine(lowerToLLVM(module)) + + # Validate initialization hasn't run yet + assert init_cnt == 0 + + # # Register callback + execution_engine.register_runtime("init_callback", initCallback) + + # # Initialize and verify + execution_engine.initialize() + assert init_cnt == 1 + # # Second initialization should be no-op + execution_engine.initialize() + assert init_cnt == 1 + + +run(testJITCallbackInGlobalCtor) diff --git a/mlir/unittests/ExecutionEngine/Invoke.cpp b/mlir/unittests/ExecutionEngine/Invoke.cpp index 312b10f28143..b9a46c5ce942 100644 --- 
a/mlir/unittests/ExecutionEngine/Invoke.cpp +++ b/mlir/unittests/ExecutionEngine/Invoke.cpp @@ -322,4 +322,55 @@ TEST(NativeMemRefJit, MAYBE_JITCallback) { ASSERT_EQ(elt, coefficient * count++); } +static int initCnt = 0; +// A helper function that will be called during the JIT's initialization. +static void initCallback() { initCnt += 1; } + +TEST(MLIRExecutionEngine, MAYBE_JITCallbackInGlobalCtor) { + auto tmBuilderOrError = llvm::orc::JITTargetMachineBuilder::detectHost(); + ASSERT_TRUE(!!tmBuilderOrError); + if (tmBuilderOrError->getTargetTriple().isAArch64()) { + GTEST_SKIP() << "Skipping global ctor initialization test on Aarch64 " + "because of bug #71963"; + return; + } + std::string moduleStr = R"mlir( + llvm.mlir.global_ctors ctors = [@ctor], priorities = [0 : i32], data = [#llvm.zero] + llvm.func @ctor() { + func.call @init_callback() : () -> () + llvm.return + } + func.func private @init_callback() attributes { llvm.emit_c_interface } + )mlir"; + + DialectRegistry registry; + registerAllDialects(registry); + registerBuiltinDialectTranslation(registry); + registerLLVMDialectTranslation(registry); + MLIRContext context(registry); + auto module = parseSourceString(moduleStr, &context); + ASSERT_TRUE(!!module); + ASSERT_TRUE(succeeded(lowerToLLVMDialect(*module))); + ExecutionEngineOptions jitOptions; + auto jitOrError = ExecutionEngine::create(*module, jitOptions); + ASSERT_TRUE(!!jitOrError); + // validate initialization is not run on construction + ASSERT_EQ(initCnt, 0); + auto jit = std::move(jitOrError.get()); + // Define any extra symbols so they're available at initialization. 
+ jit->registerSymbols([&](llvm::orc::MangleAndInterner interner) { + llvm::orc::SymbolMap symbolMap; + symbolMap[interner("_mlir_ciface_init_callback")] = { + llvm::orc::ExecutorAddr::fromPtr(initCallback), + llvm::JITSymbolFlags::Exported}; + return symbolMap; + }); + jit->initialize(); + // validate the side effect of initialization + ASSERT_EQ(initCnt, 1); + // next initialization should be noop + jit->initialize(); + ASSERT_EQ(initCnt, 1); +} + #endif // _WIN32 diff --git a/offload/include/OpenMP/Mapping.h b/offload/include/OpenMP/Mapping.h index b9f5c1658293..93c1e56905ae 100644 --- a/offload/include/OpenMP/Mapping.h +++ b/offload/include/OpenMP/Mapping.h @@ -417,12 +417,42 @@ struct MapperComponentsTy { typedef void (*MapperFuncPtrTy)(void *, void *, void *, int64_t, int64_t, void *); +/// Structure to store information about a single ATTACH map entry. +struct AttachMapInfo { + void *PointerBase; + void *PointeeBegin; + int64_t PointerSize; + int64_t MapType; + map_var_info_t Pointername; + + AttachMapInfo(void *PointerBase, void *PointeeBegin, int64_t Size, + int64_t Type, map_var_info_t Name) + : PointerBase(PointerBase), PointeeBegin(PointeeBegin), PointerSize(Size), + MapType(Type), Pointername(Name) {} +}; + +/// Structure to track ATTACH entries and new allocations across recursive calls +/// (for handling mappers) to targetDataBegin for a given construct. +struct AttachInfoTy { + /// ATTACH map entries for deferred processing. + llvm::SmallVector AttachEntries; + + /// Key: host pointer, Value: allocation size. + llvm::DenseMap NewAllocations; + + AttachInfoTy() = default; + + // Delete copy constructor and copy assignment operator to prevent copying + AttachInfoTy(const AttachInfoTy &) = delete; + AttachInfoTy &operator=(const AttachInfoTy &) = delete; +}; + // Function pointer type for targetData* functions (targetDataBegin, // targetDataEnd and targetDataUpdate). 
typedef int (*TargetDataFuncPtrTy)(ident_t *, DeviceTy &, int32_t, void **, void **, int64_t *, int64_t *, map_var_info_t *, void **, AsyncInfoTy &, - bool); + AttachInfoTy *, bool); void dumpTargetPointerMappings(const ident_t *Loc, DeviceTy &Device, bool toStdOut = false); @@ -431,20 +461,26 @@ int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, void **ArgsBase, void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames, void **ArgMappers, AsyncInfoTy &AsyncInfo, + AttachInfoTy *AttachInfo = nullptr, bool FromMapper = false); int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, void **ArgBases, void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames, void **ArgMappers, AsyncInfoTy &AsyncInfo, - bool FromMapper = false); + AttachInfoTy *AttachInfo = nullptr, bool FromMapper = false); int targetDataUpdate(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, void **ArgsBase, void **Args, int64_t *ArgSizes, int64_t *ArgTypes, map_var_info_t *ArgNames, void **ArgMappers, AsyncInfoTy &AsyncInfo, + AttachInfoTy *AttachInfo = nullptr, bool FromMapper = false); +// Process deferred ATTACH map entries collected during targetDataBegin. +int processAttachEntries(DeviceTy &Device, AttachInfoTy &AttachInfo, + AsyncInfoTy &AsyncInfo); + struct MappingInfoTy { MappingInfoTy(DeviceTy &Device) : Device(Device) {} diff --git a/offload/include/device.h b/offload/include/device.h index f4b10abbaa3f..1e85bb1876c8 100644 --- a/offload/include/device.h +++ b/offload/include/device.h @@ -98,6 +98,10 @@ struct DeviceTy { int32_t dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr, int64_t Size, AsyncInfoTy &AsyncInfo); + // Insert a data fence between previous data operations and the following + // operations if necessary for the device. + int32_t dataFence(AsyncInfoTy &AsyncInfo); + /// Notify the plugin about a new mapping starting at the host address /// \p HstPtr and \p Size bytes. 
int32_t notifyDataMapped(void *HstPtr, int64_t Size); diff --git a/offload/include/omptarget.h b/offload/include/omptarget.h index 625bbaa0db85..8fd722bb1502 100644 --- a/offload/include/omptarget.h +++ b/offload/include/omptarget.h @@ -77,6 +77,9 @@ enum tgt_map_type { // the structured region // This is an OpenMP extension for the sake of OpenACC support. OMP_TGT_MAPTYPE_OMPX_HOLD = 0x2000, + // Attach pointer and pointee, after processing all other maps. + // Applicable to map-entering directives. Does not change ref-count. + OMP_TGT_MAPTYPE_ATTACH = 0x4000, // descriptor for non-contiguous target-update OMP_TGT_MAPTYPE_NON_CONTIG = 0x100000000000, // member of struct, member given by [16 MSBs] - 1 diff --git a/offload/libomptarget/device.cpp b/offload/libomptarget/device.cpp index f88e30ae9e76..6585286bf428 100644 --- a/offload/libomptarget/device.cpp +++ b/offload/libomptarget/device.cpp @@ -191,6 +191,10 @@ int32_t DeviceTy::dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr, DstPtr, Size, AsyncInfo); } +int32_t DeviceTy::dataFence(AsyncInfoTy &AsyncInfo) { + return RTL->data_fence(RTLDeviceID, AsyncInfo); +} + int32_t DeviceTy::notifyDataMapped(void *HstPtr, int64_t Size) { DP("Notifying about new mapping: HstPtr=" DPxMOD ", Size=%" PRId64 "\n", DPxPTR(HstPtr), Size); diff --git a/offload/libomptarget/interface.cpp b/offload/libomptarget/interface.cpp index e9b148d8a260..fe1828976590 100644 --- a/offload/libomptarget/interface.cpp +++ b/offload/libomptarget/interface.cpp @@ -30,6 +30,7 @@ #include #include #include +#include #ifdef OMPT_SUPPORT using namespace llvm::omp::target::ompt; @@ -165,12 +166,24 @@ targetData(ident_t *Loc, int64_t DeviceId, int32_t ArgNum, void **ArgsBase, OMPT_GET_RETURN_ADDRESS);) int Rc = OFFLOAD_SUCCESS; + + // Only allocate AttachInfo for targetDataBegin + std::unique_ptr AttachInfo; + if (TargetDataFunction == targetDataBegin) + AttachInfo = std::make_unique(); + Rc = TargetDataFunction(Loc, *DeviceOrErr, ArgNum, 
ArgsBase, Args, ArgSizes, ArgTypes, ArgNames, ArgMappers, AsyncInfo, - false /*FromMapper=*/); + AttachInfo.get(), /*FromMapper=*/false); - if (Rc == OFFLOAD_SUCCESS) - Rc = AsyncInfo.synchronize(); + if (Rc == OFFLOAD_SUCCESS) { + // Process deferred ATTACH entries BEFORE synchronization + if (AttachInfo && !AttachInfo->AttachEntries.empty()) + Rc = processAttachEntries(*DeviceOrErr, *AttachInfo, AsyncInfo); + + if (Rc == OFFLOAD_SUCCESS) + Rc = AsyncInfo.synchronize(); + } handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc); } diff --git a/offload/libomptarget/omptarget.cpp b/offload/libomptarget/omptarget.cpp index 5b25d955dd32..32e89cc75efc 100644 --- a/offload/libomptarget/omptarget.cpp +++ b/offload/libomptarget/omptarget.cpp @@ -293,7 +293,8 @@ void targetUnlockExplicit(void *HostPtr, int DeviceNum, const char *Name) { int targetDataMapper(ident_t *Loc, DeviceTy &Device, void *ArgBase, void *Arg, int64_t ArgSize, int64_t ArgType, map_var_info_t ArgNames, void *ArgMapper, AsyncInfoTy &AsyncInfo, - TargetDataFuncPtrTy TargetDataFunction) { + TargetDataFuncPtrTy TargetDataFunction, + AttachInfoTy *AttachInfo = nullptr) { DP("Calling the mapper function " DPxMOD "\n", DPxPTR(ArgMapper)); // The mapper function fills up Components. @@ -324,17 +325,178 @@ int targetDataMapper(ident_t *Loc, DeviceTy &Device, void *ArgBase, void *Arg, MapperArgsBase.data(), MapperArgs.data(), MapperArgSizes.data(), MapperArgTypes.data(), MapperArgNames.data(), /*arg_mappers*/ nullptr, - AsyncInfo, /*FromMapper=*/true); + AsyncInfo, AttachInfo, /*FromMapper=*/true); return Rc; } +/// Utility function to perform a pointer attachment operation. +/// +/// For something like: +/// ```cpp +/// int *p; +/// ... 
+/// #pragma omp target enter data map(to:p[10:10]) +/// ``` +/// +/// for which the attachment operation gets represented using: +/// ``` +/// &p, &p[10], sizeof(p), ATTACH +/// ``` +/// +/// (Hst|Tgt)PtrAddr represents &p +/// (Hst|Tgt)PteeBase represents &p[0] +/// (Hst|Tgt)PteeBegin represents &p[10] +/// +/// This function first computes the expected TgtPteeBase using: +/// `